mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00

wip

parent 25f32000db
commit 904c904971

apps/api/src/__tests__/e2e_extract/index.test.ts (new file, 151 lines added)
@@ -0,0 +1,151 @@
import request from "supertest";
import dotenv from "dotenv";
import {
  FirecrawlCrawlResponse,
  FirecrawlCrawlStatusResponse,
  FirecrawlScrapeResponse,
} from "../../types";

dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for Extract API Routes", () => {
  describe("POST /v1/extract", () => {
    it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev"],
          prompt: "Who are the authors of the blog posts?",
          schema: {
            type: "object",
            properties: { authors: { type: "array", items: { type: "string" } } },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("authors");

      let gotItRight = 0;
      for (const author of response.body.data?.authors) {
        if (author.includes("Caleb Peffer")) gotItRight++;
        if (author.includes("Gergő Móricz")) gotItRight++;
        if (author.includes("Eric Ciarla")) gotItRight++;
        if (author.includes("Nicolas Camara")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(3);
    }, 60000);

    it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["mendable.ai"],
          prompt: "Who are the founders of the company?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: { founders: { type: "array", items: { type: "string" } } },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("founders");

      let gotItRight = 0;
      for (const founder of response.body.data?.founders) {
        if (founder.includes("Caleb")) gotItRight++;
        if (founder.includes("Eric")) gotItRight++;
        if (founder.includes("Nicolas")) gotItRight++;
      }

      expect(gotItRight).toBe(3);
    }, 60000);

    it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev"],
          prompt: "What are they hiring for?",
          allowExternalLinks: true,
          schema: {
            type: "array",
            items: {
              type: "string"
            }
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      console.log(response.body.data);

      let gotItRight = 0;
      for (const hiring of response.body.data?.items) {
        if (hiring.includes("Developer Relations Specialist")) gotItRight++;
        if (hiring.includes("Web Automation Engineer")) gotItRight++;
        if (hiring.includes("Developer Experience Engineer")) gotItRight++;
        if (hiring.includes("Developer Support Engineer")) gotItRight++;
        if (hiring.includes("Dev Ops Engineer")) gotItRight++;
        if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(5);
    }, 60000);

    it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com"],
          prompt: "Does Fivetran have PCI DSS compliance?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: {
              pciDssCompliance: { type: "boolean" }
            }
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.pciDssCompliance).toBe(true);
    }, 60000);

    it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com"],
          prompt: "What are the Azure Data Connectors they offer?",
          schema: {
            type: "array",
            items: {
              type: "object",
              properties: {
                connector: { type: "string" },
                description: { type: "string" },
                supportsCaptureDelete: { type: "boolean" }
              }
            }
          }
        });

      console.log(response.body);
      // expect(response.statusCode).toBe(200);
      // expect(response.body).toHaveProperty("data");
      // expect(response.body.data?.pciDssCompliance).toBe(true);
    }, 60000);
  });
});
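Outside the Jest harness, the request shape these tests exercise can be reproduced with a plain HTTP call. A minimal sketch against a local instance (the port, header names, and body fields come from the test setup above; the API key is whatever TEST_API_KEY holds):

// Sketch only: mirrors the first test's request via fetch instead of supertest.
const res = await fetch("http://127.0.0.1:3002/v1/extract", {
  method: "POST",
  headers: {
    "Authorization": `Bearer ${process.env.TEST_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    urls: ["https://firecrawl.dev"],
    prompt: "Who are the authors of the blog posts?",
    schema: {
      type: "object",
      properties: { authors: { type: "array", items: { type: "string" } } },
    },
  }),
});
const body = await res.json(); // expected shape: { success: true, data: { authors: [...] }, scrape_id: "..." }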
@@ -26,22 +26,24 @@ import { waitForJob } from "../../services/queue-jobs";
 import { addScrapeJob } from "../../services/queue-jobs";
 import { PlanType } from "../../types";
 import { getJobPriority } from "../../lib/job-priority";
-import { generateFinalExtraction } from "../../lib/extract/completions";
+import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";

 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);

 const MAX_EXTRACT_LIMIT = 100;
-const MAX_RANKING_LIMIT = 3;
+const MAX_RANKING_LIMIT = 5;
+const SCORE_THRESHOLD = 0.75;

 export async function extractController(
   req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
-  res: Response<any> //ExtractResponse>
+  res: Response<ExtractResponse>
 ) {
   req.body = extractRequestSchema.parse(req.body);

   const id = crypto.randomUUID();
-  let links: string[] = req.body.urls;
+  let links: string[]; //= req.body.urls;

   const sc: StoredCrawl = {
     originUrl: req.body.urls[0],
@@ -59,10 +61,14 @@ export async function extractController(
   const crawler = crawlToCrawler(id, sc);

   let urlWithoutWww = req.body.urls[0].replace("www.", "");
+  console.log("urlWithoutWww", urlWithoutWww);

-  let mapUrl = req.body.prompt
-    ? `"${req.body.prompt}" site:${urlWithoutWww}`
-    : `site:${req.body.urls[0]}`;
+  const allowExternalLinks = req.body.allowExternalLinks ?? false;
+
+  let mapUrl = req.body.prompt && allowExternalLinks
+    ? `${req.body.prompt} ${urlWithoutWww}`
+    : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
+    : `site:${urlWithoutWww}`;

   const resultsPerPage = 100;
   const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage);
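The new allowExternalLinks flag changes how the map query is built. A small sketch of the three cases, using made-up prompt and domain values (the ternary itself is the one added above):

// Illustrative inputs only.
const prompt = "Who are the founders of the company?";
const urlWithoutWww = "mendable.ai";
const allowExternalLinks = true;

const mapUrl = prompt && allowExternalLinks
  ? `${prompt} ${urlWithoutWww}`               // "Who are the founders of the company? mendable.ai"
  : prompt ? `${prompt} site:${urlWithoutWww}` // prompt given, but restricted to the site
  : `site:${urlWithoutWww}`;                   // no prompt: plain site query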
@@ -84,82 +90,103 @@ export async function extractController(
     };

     pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
-    allResults = await Promise.all(pagePromises);
+    allResults = (await Promise.all(pagePromises)).flat();
+    // console.log("allResults", allResults);
+    // if allResults is empty, return an error
+    if (allResults.length === 0) {
+      return res.status(400).json({
+        success: false,
+        error: "No results found",
+      });
+    }

     await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
   }

   // console.log("allResults", allResults);
   // Parallelize sitemap fetch with serper search
-  const [sitemap, ...searchResults] = await Promise.all([
-    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
-    ...(cachedResult ? [] : pagePromises),
-  ]);
+  // const [sitemap, ...searchResults] = await Promise.all([
+  //   req.body.ignoreSitemap ? null : null, // crawler.tryGetSitemap(),
+  //   ...(cachedResult ? [] : pagePromises),
+  // ]);

-  if (!cachedResult) {
-    allResults = searchResults;
-  }
+  // if (!cachedResult) {
+  //   allResults = searchResults;
+  // }

-  if (sitemap !== null) {
-    sitemap.forEach((x) => {
-      links.push(x.url);
-    });
-  }
+  links = allResults.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
+  console.log("links", links);
+  // if (sitemap !== null) {
+  //   sitemap.forEach((x) => {
+  //     links.push(x.url);
+  //   });
+  // }

-  let mapResults = allResults
-    .flat()
-    .filter((result) => result !== null && result !== undefined);
+  // let mapResults = allResults
+  //   .flat()
+  //   .filter((result) => result !== null && result !== undefined);

-  const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
-  if (mapResults.length > minumumCutoff) {
-    mapResults = mapResults.slice(0, minumumCutoff);
-  }
+  // const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
+  // if (mapResults.length > minumumCutoff) {
+  //   mapResults = mapResults.slice(0, minumumCutoff);
+  // }

-  if (mapResults.length > 0) {
-    if (req.body.prompt) {
-      // Ensure all map results are first, maintaining their order
-      links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
-        ...links,
-      ];
-    } else {
-      mapResults.map((x) => {
-        links.push(x.url);
-      });
-    }
-  }
+  // if (mapResults.length > 0) {
+  //   if (req.body.prompt) {
+  //     // Ensure all map results are first, maintaining their order
+  //     links = [
+  //       mapResults[0].url,
+  //       ...mapResults.slice(1).map((x) => x.url),
+  //       ...links,
+  //     ];
+  //   } else {
+  //     mapResults.map((x) => {
+  //       links.push(x.url);
+  //     });
+  //   }
+  // }

+  // console.log("mapResults", mapResults);

   // console.log("links", links);
   let linksAndScores: { link: string; score: number }[] = [];
   // Perform cosine similarity between the search query and the list of links
   if (req.body.prompt) {
-    const searchQuery = req.body.prompt.toLowerCase();
+    const searchQuery = mapUrl; //req.body.prompt.toLowerCase();
     linksAndScores = await performRanking(links, searchQuery);
   }
+  console.log("linksAndScores", linksAndScores);
+  links = linksAndScores
+    .filter(x => x.score > SCORE_THRESHOLD)
+    .map(x => x.link.split("url: ")[1].split(",")[0])
+    .filter(x => !isUrlBlocked(x))
+
+  console.log("links:", links.length);
+
+  // should we use some sort of llm to determine the best links?

   // console.log("linksAndScores", linksAndScores);

-  links = links
-    .map((x) => {
-      try {
-        return checkAndUpdateURLForMap(x).url.trim();
-      } catch (_) {
-        return null;
-      }
-    })
-    .filter((x) => x !== null) as string[];
+  // links = links
+  //   .map((x) => {
+  //     try {
+  //       return checkAndUpdateURLForMap(x).url.trim();
+  //     } catch (_) {
+  //       return null;
+  //     }
+  //   })
+  //   .filter((x) => x !== null) as string[];

   // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, req.body.urls[0]));
+  // links = links.filter((x) => isSameDomain(x, req.body.urls[0]));

   // if includeSubdomains is false, filter out subdomains
-  if (!req.body.includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
-  }
+  // if (!req.body.includeSubdomains) {
+  //   links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
+  // z}

   // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
+  // links = removeDuplicateUrls(links);

   // get top N links
   links = links.slice(0, MAX_RANKING_LIMIT);
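Because links now holds "url: ..., title: ..., description: ..." strings, the ranked results are filtered by SCORE_THRESHOLD and the bare URL is recovered by splitting. A small sketch with made-up entries (performRanking is assumed to return { link, score } pairs, matching the type annotation above):

// Hypothetical ranked entries in the string format built above.
const linksAndScores = [
  { link: "url: https://firecrawl.dev/blog, title: Blog, description: Posts by the team", score: 0.82 },
  { link: "url: https://firecrawl.dev/pricing, title: Pricing, description: Plans", score: 0.41 },
];

const filtered = linksAndScores
  .filter(x => x.score > 0.75)                        // SCORE_THRESHOLD: only the blog entry survives
  .map(x => x.link.split("url: ")[1].split(",")[0]);  // -> ["https://firecrawl.dev/blog"]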
@@ -170,7 +197,7 @@ export async function extractController(

   for (const url of links) {
     const origin = req.body.origin || "api";
-    const timeout = req.body.timeout;
+    const timeout = req.body.timeout ?? 30000;
     const jobId = crypto.randomUUID();

     const startTime = new Date().getTime();
@@ -196,7 +223,7 @@ export async function extractController(
       jobPriority
     );

-    const totalWait = 60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
+    const totalWait = 0 //60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);

     let doc: Document;
     try {
@@ -234,18 +261,20 @@ export async function extractController(
     docs.push(doc);
   }

-  // console.log("docs", docs);
+  console.log(docs)
+
+  const completions = await generateOpenAICompletions(
+    logger.child({ method: "extractController/generateOpenAICompletions" }),
+    {
+      mode: "llm",
+      systemPrompt: "Only use the provided content to answer the question.",
+      prompt: mapUrl,
+      schema: req.body.schema,
+    },
+    docs.map(x => x.markdown).join('\n')
+  );

-  // {"message":"Missing required parameter: 'response_format.json_schema.schema'.","type":"invalid_request_error","param":"response_format.json_schema.schema","code":"missing_required_parameter"},"code":"missing_required_parameter","param":"response_format.json_schema.schema","type":"invalid_request_error"}
-  const completions = await generateFinalExtraction({
-    pagesContent: docs.map(x => x.markdown).join('\n'),
-    systemPrompt: '',
-    prompt: req.body.prompt,
-    schema: req.body.schema,
-  });
+  console.log("completions", completions);

-  // console.log("completions", completions);

   // if(req.body.extract && req.body.formats.includes("extract")) {
   //   creditsToBeBilled = 5;
@@ -315,9 +344,18 @@ export async function extractController(
   //   scrape_id: result.scrape_id
   // };

+  console.log("completions.extract", completions.extract);
+
+  let data: any;
+  try {
+    data = JSON.parse(completions.extract);
+  } catch (e) {
+    data = completions.extract;
+  }

   return res.status(200).json({
     success: true,
-    data: completions.content, // includeMetadata ? mapResults : linksToReturn,
+    data: data, // includeMetadata ? mapResults : linksToReturn,
     scrape_id: id, //origin?.includes("website") ? id : undefined,
   });
 }
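The handler now returns whatever the completion produced, JSON.parsed when possible and passed through as-is otherwise. A sketch of the resulting success payload (values are illustrative; the field names come from the res.status(200).json call above):

// Illustrative only: shape of the 200 response assembled by the controller.
const exampleResponse = {
  success: true,
  data: { founders: ["Caleb", "Eric", "Nicolas"] }, // parsed completions.extract
  scrape_id: "9b1deb4d-...",                        // the id generated with crypto.randomUUID()
};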
@@ -161,6 +161,7 @@ export const extractV1Options = z.object({
   limit: z.number().int().positive().finite().safe().optional(),
   ignoreSitemap: z.boolean().default(false),
   includeSubdomains: z.boolean().default(true),
+  allowExternalLinks: z.boolean().default(false),
   origin: z.string().optional().default("api"),
   timeout: z.number().int().positive().finite().safe().default(60000),
 }).strict(strictMessage)

@@ -353,7 +354,7 @@ export type ExtractResponse =
   | {
       success: true;
       warning?: string;
-      data: Document;
+      data: z.infer<typeof extractRequestSchema>;
       scrape_id?: string;
     };
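allowExternalLinks defaults to false, so requests that omit it behave as before. A quick sketch of the defaulting, assuming extractRequestSchema is the request schema derived from the extractV1Options object shown here:

// Sketch: Zod fills in the default when the field is omitted.
const parsed = extractRequestSchema.parse({
  urls: ["https://firecrawl.dev"],
  prompt: "Who are the authors of the blog posts?",
});
console.log(parsed.allowExternalLinks); // false unless the client sends it explicitly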
@@ -1,121 +1,124 @@
+// use llmExtract.ts instead
+
-import OpenAI from "openai";
+// import OpenAI from "openai";
-import { encoding_for_model } from "@dqbd/tiktoken";
+// import { encoding_for_model } from "@dqbd/tiktoken";
-import { TiktokenModel } from "@dqbd/tiktoken";
+// import { TiktokenModel } from "@dqbd/tiktoken";
-import { ExtractOptions } from "../../controllers/v1/types";
+// import { ExtractOptions } from "../../controllers/v1/types";
-import { Document } from "../entities";
+// import { Document } from "../entities";
-import { z } from "zod";
+// import { z } from "zod";

-const maxTokens = 32000;
+// const maxTokens = 32000;
-const modifier = 4;
+// const modifier = 4;

-export class LLMRefusalError extends Error {
+// export class LLMRefusalError extends Error {
-  constructor(refusal: string) {
+//   constructor(refusal: string) {
-    super("LLM refused to extract the website's content");
+//     super("LLM refused to extract the website's content");
-    this.name = "LLMRefusalError";
+//     this.name = "LLMRefusalError";
-  }
+//   }
-}
+// }

-interface GenerateCompletionsParams {
+// interface GenerateCompletionsParams {
-  systemPrompt?: string;
+//   systemPrompt?: string;
-  prompt?: string;
+//   prompt?: string;
-  schema?: any;
+//   schema?: any;
-  pagesContent: string;
+//   pagesContent: string;
-}
+// }

-export async function generateBasicCompletion(prompt: string) {
+// export async function generateBasicCompletion(prompt: string) {
-  const openai = new OpenAI();
+//   const openai = new OpenAI();
-  const model: TiktokenModel =
+//   const model: TiktokenModel =
-    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
+//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

-  const completion = await openai.chat.completions.create({
+//   const completion = await openai.chat.completions.create({
-    model,
+//     model,
-    messages: [{ role: "user", content: prompt }],
+//     messages: [{ role: "user", content: prompt }],
-  });
+//   });

-  return completion.choices[0].message.content;
+//   return completion.choices[0].message.content;
-}
+// }

-export async function generateFinalExtraction({
+// export async function generateFinalExtraction({
-  pagesContent,
+//   pagesContent,
-  systemPrompt,
+//   systemPrompt,
-  prompt,
+//   prompt,
-  schema,
+//   schema,
-}: GenerateCompletionsParams): Promise<{
+// }: GenerateCompletionsParams): Promise<{
-  content: string;
+//   content: string;
-  metadata: { numTokens: number; warning: string };
+//   metadata: { numTokens: number; warning: string };
-}> {
+// }> {
-  const openai = new OpenAI();
+//   const openai = new OpenAI();
-  const model: TiktokenModel =
+//   const model: TiktokenModel =
-    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
+//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

-  let extractionContent = pagesContent;
+//   let extractionContent = pagesContent;
-  let numTokens = 0;
+//   let numTokens = 0;
-  let warning = "";
+//   let warning = "";

-  const encoder = encoding_for_model(model);
+//   const encoder = encoding_for_model(model);
-  try {
+//   try {
-    const tokens = encoder.encode(extractionContent);
+//     const tokens = encoder.encode(extractionContent);
-    numTokens = tokens.length;
+//     numTokens = tokens.length;
-  } catch (error) {
+//   } catch (error) {
-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
+//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
-    warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
+//     warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
-  } finally {
+//   } finally {
-    encoder.free();
+//     encoder.free();
-  }
+//   }

-  if (numTokens > maxTokens) {
+//   if (numTokens > maxTokens) {
-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
+//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
-    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
+//     warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
-  }
+//   }

-  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
+//   if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
-    schema = {
+//     schema = {
-      type: "object",
+//       type: "object",
-      properties: {
+//       properties: {
-        items: schema,
+//         items: schema,
-      },
+//       },
-      required: ["items"],
+//       required: ["items"],
-      additionalProperties: false,
+//       additionalProperties: false,
-    };
+//     };
-  } else if (schema) {
+//   } else if (schema) {
-    schema.additionalProperties = false;
+//     schema.additionalProperties = false;
-    schema.required = Object.keys(schema.properties);
+//     schema.required = Object.keys(schema.properties);
-  }
+//   }

-  const jsonCompletion = await openai.beta.chat.completions.parse({
+//   const jsonCompletion = await openai.beta.chat.completions.parse({
+//     temperature: 0,
-    model,
+//     model,
-    messages: [
+//     messages: [
-      { role: "system", content: systemPrompt ?? "" },
+//       { role: "system", content: systemPrompt ?? "" },
-      { role: "user", content: [{ type: "text", text: extractionContent }] },
+//       { role: "user", content: [{ type: "text", text: extractionContent }] },
-      {
+//       {
-        role: "user",
+//         role: "user",
-        content: prompt
+//         content: prompt
-          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
+//           ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
-          : "Transform the above content into structured JSON output.",
+//           : "Transform the above content into structured JSON output.",
-      },
+//       },
-    ],
+//     ],
-    response_format: schema
+//     response_format: schema
-      ? {
+//       ? {
-          type: "json_schema",
+//           type: "json_schema",
-          json_schema: {
+//           json_schema: {
-            name: "websiteContent",
+//             name: "websiteContent",
-            schema: schema,
+//             schema: schema,
-            strict: true,
+//             strict: true,
-          },
+//           },
-        }
+//         }
-      : { type: "json_object" },
+//       : { type: "json_object" },
-  });
+//   });

-  if (jsonCompletion.choices[0].message.refusal !== null) {
+//   if (jsonCompletion.choices[0].message.refusal !== null) {
-    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
+//     throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
-  }
+//   }

-  const extraction = jsonCompletion.choices[0].message.parsed;
+//   const extraction = jsonCompletion.choices[0].message.parsed;
-  return {
+//   return {
-    content: extraction ?? "",
+//     content: extraction ?? "",
-    metadata: {
+//     metadata: {
-      numTokens,
+//       numTokens,
-      warning,
+//       warning,
-    },
+//     },
-  };
+//   };
-}
+// }
@@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
   }
 }

-async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> {
+export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, warning: string | undefined }> {
+  let extract: any;
+  let warning: string | undefined;

   const openai = new OpenAI();
   const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

-  if (document.markdown === undefined) {
+  if (markdown === undefined) {
     throw new Error("document.markdown is undefined -- this is unexpected");
   }

-  let extractionContent = document.markdown;

   // count number of tokens
   let numTokens = 0;
   const encoder = encoding_for_model(model as TiktokenModel);
   try {
     // Encode the message into tokens
-    const tokens = encoder.encode(extractionContent);
+    const tokens = encoder.encode(markdown);

     // Return the number of tokens
     numTokens = tokens.length;
   } catch (error) {
-    logger.warn("Calculating num tokens of string failed", { error, extractionContent });
+    logger.warn("Calculating num tokens of string failed", { error, markdown });

-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
+    markdown = markdown.slice(0, maxTokens * modifier);

-    const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
-    document.warning = document.warning === undefined ? warning : " " + warning;
+    let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
+    warning = previousWarning === undefined ? w : w + " " + previousWarning;
   } finally {
     // Free the encoder resources after use
     encoder.free();

@@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt

   if (numTokens > maxTokens) {
     // trim the document to the maximum number of tokens, tokens != characters
-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
+    markdown = markdown.slice(0, maxTokens * modifier);

-    const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
-    document.warning = document.warning === undefined ? warning : " " + warning;
+    const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
+    warning = previousWarning === undefined ? w : w + " " + previousWarning;
   }

   let schema = options.schema;

@@ -120,7 +121,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
     },
     {
       role: "user",
-      content: [{ type: "text", text: extractionContent }],
+      content: [{ type: "text", text: markdown }],
     },
     {
       role: "user",

@@ -143,11 +144,11 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
     throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
   }

-  document.extract = jsonCompletion.choices[0].message.parsed;
+  extract = jsonCompletion.choices[0].message.parsed;

-  if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
+  if (extract === null && jsonCompletion.choices[0].message.content !== null) {
     try {
-      document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
+      extract = JSON.parse(jsonCompletion.choices[0].message.content);
     } catch (e) {
       logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
       throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");

@@ -155,14 +156,21 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
   }

   if (options.schema && options.schema.type === "array") {
-    document.extract = document.extract?.items;
+    extract = extract?.items;
   }
-  return document;
+  return { extract, warning };
 }

 export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
   if (meta.options.formats.includes("extract")) {
-    document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
+    const { extract, warning } = await generateOpenAICompletions(
+      meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
+      meta.options.extract!,
+      document.markdown,
+      document.warning,
+    );
+    document.extract = extract;
+    document.warning = warning;
   }

   return document;
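Callers now pass the markdown and any previous warning directly instead of a whole Document, and read the result from the returned object, as performLLMExtract does above. A minimal standalone sketch (the logger and option values are illustrative; the option fields mirror the extractController call):

// Sketch only: direct use of the reworked helper under assumed inputs.
const { extract, warning } = await generateOpenAICompletions(
  logger.child({ method: "example/generateOpenAICompletions" }),
  {
    mode: "llm",
    systemPrompt: "Only use the provided content to answer the question.",
    prompt: "Who are the founders of the company?",
    schema: { type: "object", properties: { founders: { type: "array", items: { type: "string" } } } },
  },
  "# Example page\nSome markdown content...",
  undefined, // no previous warning to chain
);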
@@ -37,7 +37,6 @@ export async function fireEngineMap(
     );
     return [];
   }
-  console.log("process.env.FIRE_ENGINE_BETA_URL", process.env.FIRE_ENGINE_BETA_URL);

   const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
     method: "POST",