This commit is contained in:
rafaelmmiller 2024-11-13 18:06:20 -03:00
parent 25f32000db
commit 904c904971
6 changed files with 397 additions and 197 deletions

View File

@ -0,0 +1,151 @@
import request from "supertest";
import dotenv from "dotenv";
import {
FirecrawlCrawlResponse,
FirecrawlCrawlStatusResponse,
FirecrawlScrapeResponse,
} from "../../types";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for Extract API Routes", () => {
describe("POST /v1/extract", () => {
// E2E: /v1/extract with a prompt + object schema should pull blog-post
// authors from firecrawl.dev. Requires the API at TEST_URL and TEST_API_KEY.
it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
  const response = await request(TEST_URL)
    .post("/v1/extract")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      urls: ["https://firecrawl.dev"],
      prompt: "Who are the authors of the blog posts?",
      schema: {
        type: "object",
        properties: { authors: { type: "array", items: { type: "string" } } },
      },
    });

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("data");
  // Fix: the schema requests "authors" (and the loop below reads .authors);
  // asserting "founders" was a copy-paste from the founders test and could
  // never hold here.
  expect(response.body.data).toHaveProperty("authors");

  // Count how many of the known author names appear in the extraction.
  let gotItRight = 0;
  for (const author of response.body.data?.authors ?? []) {
    if (author.includes("Caleb Peffer")) gotItRight++;
    if (author.includes("Gergő Móricz")) gotItRight++;
    if (author.includes("Eric Ciarla")) gotItRight++;
    if (author.includes("Nicolas Camara")) gotItRight++;
  }

  // NOTE(review): > 3 demands all four names match — confirm this strictness
  // is intended (>= 3 would tolerate one miss).
  expect(gotItRight).toBeGreaterThan(3);
}, 60000);
// E2E: extracts the company founders from mendable.ai with external links
// allowed; all three expected first names must be found.
it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
  const requestBody = {
    urls: ["mendable.ai"],
    prompt: "Who are the founders of the company?",
    allowExternalLinks: true,
    schema: {
      type: "object",
      properties: { founders: { type: "array", items: { type: "string" } } },
    },
  };

  const response = await request(TEST_URL)
    .post("/v1/extract")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(requestBody);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("data");
  expect(response.body.data).toHaveProperty("founders");

  // Sum, over every returned founder string, the number of expected first
  // names it contains (identical to the original per-name counting loop).
  const expectedNames = ["Caleb", "Eric", "Nicolas"];
  const gotItRight = response.body.data?.founders.reduce(
    (count: number, founder: string) =>
      count + expectedNames.filter((name) => founder.includes(name)).length,
    0
  );
  expect(gotItRight).toBe(3);
}, 60000);
// E2E: extracts open roles from firecrawl.dev using a top-level array schema.
it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
  const response = await request(TEST_URL)
    .post("/v1/extract")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      urls: ["https://firecrawl.dev"],
      prompt: "What are they hiring for?",
      allowExternalLinks: true,
      schema: {
        type: "array",
        items: {
          type: "string",
        },
      },
    });

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("data");

  // Fix: with a top-level array schema the extractor unwraps the internal
  // "items" wrapper before responding, so `data` IS the string array —
  // `data?.items` was always undefined and the for..of would throw.
  // (Also dropped the leftover debug console.log.)
  let gotItRight = 0;
  for (const hiring of response.body.data ?? []) {
    if (hiring.includes("Developer Relations Specialist")) gotItRight++;
    if (hiring.includes("Web Automation Engineer")) gotItRight++;
    if (hiring.includes("Developer Experience Engineer")) gotItRight++;
    if (hiring.includes("Developer Support Engineer")) gotItRight++;
    if (hiring.includes("Dev Ops Engineer")) gotItRight++;
    if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
  }

  // NOTE(review): > 5 requires all six roles to match — confirm intended.
  expect(gotItRight).toBeGreaterThan(5);
}, 60000);
// E2E: boolean extraction — asks whether Fivetran is PCI DSS compliant and
// expects the extracted flag to be true.
it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
  const payload = {
    urls: ["fivetran.com"],
    prompt: "Does Fivetran have PCI DSS compliance?",
    allowExternalLinks: true,
    schema: {
      type: "object",
      properties: {
        pciDssCompliance: { type: "boolean" },
      },
    },
  };

  const response = await request(TEST_URL)
    .post("/v1/extract")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("data");
  expect(response.body.data?.pciDssCompliance).toBe(true);
}, 60000);
it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["fivetran.com"],
prompt: "What are the Azure Data Connectors they offer?",
schema: {
type: "array",
items: {
type: "object",
properties: {
connector: { type: "string" },
description: { type: "string" },
supportsCaptureDelete: { type: "boolean" }
}
}
}
})
console.log(response.body);
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("data");
// expect(response.body.data?.pciDssCompliance).toBe(true);
}, 60000);
});
});

View File

@ -26,22 +26,24 @@ import { waitForJob } from "../../services/queue-jobs";
import { addScrapeJob } from "../../services/queue-jobs"; import { addScrapeJob } from "../../services/queue-jobs";
import { PlanType } from "../../types"; import { PlanType } from "../../types";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { generateFinalExtraction } from "../../lib/extract/completions"; import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
configDotenv(); configDotenv();
const redis = new Redis(process.env.REDIS_URL!); const redis = new Redis(process.env.REDIS_URL!);
const MAX_EXTRACT_LIMIT = 100; const MAX_EXTRACT_LIMIT = 100;
const MAX_RANKING_LIMIT = 3; const MAX_RANKING_LIMIT = 5;
const SCORE_THRESHOLD = 0.75;
export async function extractController( export async function extractController(
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
res: Response<any> //ExtractResponse> res: Response<ExtractResponse>
) { ) {
req.body = extractRequestSchema.parse(req.body); req.body = extractRequestSchema.parse(req.body);
const id = crypto.randomUUID(); const id = crypto.randomUUID();
let links: string[] = req.body.urls; let links: string[]; //= req.body.urls;
const sc: StoredCrawl = { const sc: StoredCrawl = {
originUrl: req.body.urls[0], originUrl: req.body.urls[0],
@ -59,10 +61,14 @@ export async function extractController(
const crawler = crawlToCrawler(id, sc); const crawler = crawlToCrawler(id, sc);
let urlWithoutWww = req.body.urls[0].replace("www.", ""); let urlWithoutWww = req.body.urls[0].replace("www.", "");
console.log("urlWithoutWww", urlWithoutWww);
let mapUrl = req.body.prompt const allowExternalLinks = req.body.allowExternalLinks ?? false;
? `"${req.body.prompt}" site:${urlWithoutWww}`
: `site:${req.body.urls[0]}`; let mapUrl = req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
const resultsPerPage = 100; const resultsPerPage = 100;
const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage); const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage);
@ -84,82 +90,103 @@ export async function extractController(
}; };
pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
allResults = await Promise.all(pagePromises); allResults = (await Promise.all(pagePromises)).flat();
// console.log("allResults", allResults);
// if allResults is empty, return an error
if (allResults.length === 0) {
return res.status(400).json({
success: false,
error: "No results found",
});
}
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
} }
// console.log("allResults", allResults); // console.log("allResults", allResults);
// Parallelize sitemap fetch with serper search // Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([ // const [sitemap, ...searchResults] = await Promise.all([
req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), // req.body.ignoreSitemap ? null : null, // crawler.tryGetSitemap(),
...(cachedResult ? [] : pagePromises), // ...(cachedResult ? [] : pagePromises),
]); // ]);
if (!cachedResult) { // if (!cachedResult) {
allResults = searchResults; // allResults = searchResults;
} // }
if (sitemap !== null) { links = allResults.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
sitemap.forEach((x) => { console.log("links", links);
links.push(x.url); // if (sitemap !== null) {
}); // sitemap.forEach((x) => {
} // links.push(x.url);
// });
// }
let mapResults = allResults // let mapResults = allResults
.flat() // .flat()
.filter((result) => result !== null && result !== undefined); // .filter((result) => result !== null && result !== undefined);
const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT); // const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
if (mapResults.length > minumumCutoff) { // if (mapResults.length > minumumCutoff) {
mapResults = mapResults.slice(0, minumumCutoff); // mapResults = mapResults.slice(0, minumumCutoff);
} // }
if (mapResults.length > 0) { // if (mapResults.length > 0) {
if (req.body.prompt) { // if (req.body.prompt) {
// Ensure all map results are first, maintaining their order // // Ensure all map results are first, maintaining their order
links = [ // links = [
mapResults[0].url, // mapResults[0].url,
...mapResults.slice(1).map((x) => x.url), // ...mapResults.slice(1).map((x) => x.url),
...links, // ...links,
]; // ];
} else { // } else {
mapResults.map((x) => { // mapResults.map((x) => {
links.push(x.url); // links.push(x.url);
}); // });
} // }
} // }
// console.log("mapResults", mapResults);
// console.log("links", links); // console.log("links", links);
let linksAndScores: { link: string; score: number }[] = []; let linksAndScores: { link: string; score: number }[] = [];
// Perform cosine similarity between the search query and the list of links // Perform cosine similarity between the search query and the list of links
if (req.body.prompt) { if (req.body.prompt) {
const searchQuery = req.body.prompt.toLowerCase(); const searchQuery = mapUrl; //req.body.prompt.toLowerCase();
linksAndScores = await performRanking(links, searchQuery); linksAndScores = await performRanking(links, searchQuery);
} }
console.log("linksAndScores", linksAndScores);
links = linksAndScores
.filter(x => x.score > SCORE_THRESHOLD)
.map(x => x.link.split("url: ")[1].split(",")[0])
.filter(x => !isUrlBlocked(x))
console.log("links:", links.length);
// should we use some sort of llm to determine the best links?
// console.log("linksAndScores", linksAndScores); // console.log("linksAndScores", linksAndScores);
links = links // links = links
.map((x) => { // .map((x) => {
try { // try {
return checkAndUpdateURLForMap(x).url.trim(); // return checkAndUpdateURLForMap(x).url.trim();
} catch (_) { // } catch (_) {
return null; // return null;
} // }
}) // })
.filter((x) => x !== null) as string[]; // .filter((x) => x !== null) as string[];
// allows for subdomains to be included // allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.urls[0])); // links = links.filter((x) => isSameDomain(x, req.body.urls[0]));
// if includeSubdomains is false, filter out subdomains // if includeSubdomains is false, filter out subdomains
if (!req.body.includeSubdomains) { // if (!req.body.includeSubdomains) {
links = links.filter((x) => isSameSubdomain(x, req.body.urls[0])); // links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
} // z}
// remove duplicates that could be due to http/https or www // remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links); // links = removeDuplicateUrls(links);
// get top N links // get top N links
links = links.slice(0, MAX_RANKING_LIMIT); links = links.slice(0, MAX_RANKING_LIMIT);
@ -170,7 +197,7 @@ export async function extractController(
for (const url of links) { for (const url of links) {
const origin = req.body.origin || "api"; const origin = req.body.origin || "api";
const timeout = req.body.timeout; const timeout = req.body.timeout ?? 30000;
const jobId = crypto.randomUUID(); const jobId = crypto.randomUUID();
const startTime = new Date().getTime(); const startTime = new Date().getTime();
@ -196,7 +223,7 @@ export async function extractController(
jobPriority jobPriority
); );
const totalWait = 60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0); const totalWait = 0 //60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
let doc: Document; let doc: Document;
try { try {
@ -234,18 +261,20 @@ export async function extractController(
docs.push(doc); docs.push(doc);
} }
console.log(docs)
// console.log("docs", docs); const completions = await generateOpenAICompletions(
logger.child({ method: "extractController/generateOpenAICompletions" }),
{
mode: "llm",
systemPrompt: "Only use the provided content to answer the question.",
prompt: mapUrl,
schema: req.body.schema,
},
docs.map(x => x.markdown).join('\n')
);
// {"message":"Missing required parameter: 'response_format.json_schema.schema'.","type":"invalid_request_error","param":"response_format.json_schema.schema","code":"missing_required_parameter"},"code":"missing_required_parameter","param":"response_format.json_schema.schema","type":"invalid_request_error"} console.log("completions", completions);
const completions = await generateFinalExtraction({
pagesContent: docs.map(x => x.markdown).join('\n'),
systemPrompt: '',
prompt: req.body.prompt,
schema: req.body.schema,
});
// console.log("completions", completions);
// if(req.body.extract && req.body.formats.includes("extract")) { // if(req.body.extract && req.body.formats.includes("extract")) {
// creditsToBeBilled = 5; // creditsToBeBilled = 5;
@ -315,9 +344,18 @@ export async function extractController(
// scrape_id: result.scrape_id // scrape_id: result.scrape_id
// }; // };
console.log("completions.extract", completions.extract);
let data: any;
try {
data = JSON.parse(completions.extract);
} catch (e) {
data = completions.extract;
}
return res.status(200).json({ return res.status(200).json({
success: true, success: true,
data: completions.content, // includeMetadata ? mapResults : linksToReturn, data: data, // includeMetadata ? mapResults : linksToReturn,
scrape_id: id, //origin?.includes("website") ? id : undefined, scrape_id: id, //origin?.includes("website") ? id : undefined,
}); });
} }

View File

@ -161,6 +161,7 @@ export const extractV1Options = z.object({
limit: z.number().int().positive().finite().safe().optional(), limit: z.number().int().positive().finite().safe().optional(),
ignoreSitemap: z.boolean().default(false), ignoreSitemap: z.boolean().default(false),
includeSubdomains: z.boolean().default(true), includeSubdomains: z.boolean().default(true),
allowExternalLinks: z.boolean().default(false),
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000), timeout: z.number().int().positive().finite().safe().default(60000),
}).strict(strictMessage) }).strict(strictMessage)
@ -353,7 +354,7 @@ export type ExtractResponse =
| { | {
success: true; success: true;
warning?: string; warning?: string;
data: Document; data: z.infer<typeof extractRequestSchema>;
scrape_id?: string; scrape_id?: string;
}; };

View File

@ -1,121 +1,124 @@
import OpenAI from "openai"; // use llmExtract.ts instead
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { ExtractOptions } from "../../controllers/v1/types";
import { Document } from "../entities";
import { z } from "zod";
const maxTokens = 32000; // import OpenAI from "openai";
const modifier = 4; // import { encoding_for_model } from "@dqbd/tiktoken";
// import { TiktokenModel } from "@dqbd/tiktoken";
// import { ExtractOptions } from "../../controllers/v1/types";
// import { Document } from "../entities";
// import { z } from "zod";
export class LLMRefusalError extends Error { // const maxTokens = 32000;
constructor(refusal: string) { // const modifier = 4;
super("LLM refused to extract the website's content");
this.name = "LLMRefusalError";
}
}
interface GenerateCompletionsParams { // export class LLMRefusalError extends Error {
systemPrompt?: string; // constructor(refusal: string) {
prompt?: string; // super("LLM refused to extract the website's content");
schema?: any; // this.name = "LLMRefusalError";
pagesContent: string; // }
} // }
export async function generateBasicCompletion(prompt: string) { // interface GenerateCompletionsParams {
const openai = new OpenAI(); // systemPrompt?: string;
const model: TiktokenModel = // prompt?: string;
(process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini"; // schema?: any;
// pagesContent: string;
// }
const completion = await openai.chat.completions.create({ // export async function generateBasicCompletion(prompt: string) {
model, // const openai = new OpenAI();
messages: [{ role: "user", content: prompt }], // const model: TiktokenModel =
}); // (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
return completion.choices[0].message.content; // const completion = await openai.chat.completions.create({
} // model,
// messages: [{ role: "user", content: prompt }],
// });
export async function generateFinalExtraction({ // return completion.choices[0].message.content;
pagesContent, // }
systemPrompt,
prompt,
schema,
}: GenerateCompletionsParams): Promise<{
content: string;
metadata: { numTokens: number; warning: string };
}> {
const openai = new OpenAI();
const model: TiktokenModel =
(process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
let extractionContent = pagesContent; // export async function generateFinalExtraction({
let numTokens = 0; // pagesContent,
let warning = ""; // systemPrompt,
// prompt,
// schema,
// }: GenerateCompletionsParams): Promise<{
// content: string;
// metadata: { numTokens: number; warning: string };
// }> {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
const encoder = encoding_for_model(model); // let extractionContent = pagesContent;
try { // let numTokens = 0;
const tokens = encoder.encode(extractionContent); // let warning = "";
numTokens = tokens.length;
} catch (error) {
extractionContent = extractionContent.slice(0, maxTokens * modifier);
warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
} finally {
encoder.free();
}
if (numTokens > maxTokens) { // const encoder = encoding_for_model(model);
extractionContent = extractionContent.slice(0, maxTokens * modifier); // try {
warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`; // const tokens = encoder.encode(extractionContent);
} // numTokens = tokens.length;
// } catch (error) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
// } finally {
// encoder.free();
// }
if (schema && (schema.type === "array" || schema._type === "ZodArray")) { // if (numTokens > maxTokens) {
schema = { // extractionContent = extractionContent.slice(0, maxTokens * modifier);
type: "object", // warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
properties: { // }
items: schema,
},
required: ["items"],
additionalProperties: false,
};
} else if (schema) {
schema.additionalProperties = false;
schema.required = Object.keys(schema.properties);
}
const jsonCompletion = await openai.beta.chat.completions.parse({ // if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
model, // schema = {
messages: [ // type: "object",
{ role: "system", content: systemPrompt ?? "" }, // properties: {
{ role: "user", content: [{ type: "text", text: extractionContent }] }, // items: schema,
{ // },
role: "user", // required: ["items"],
content: prompt // additionalProperties: false,
? `Transform the above content into structured JSON output based on the following user request: ${prompt}` // };
: "Transform the above content into structured JSON output.", // } else if (schema) {
}, // schema.additionalProperties = false;
], // schema.required = Object.keys(schema.properties);
response_format: schema // }
? {
type: "json_schema",
json_schema: {
name: "websiteContent",
schema: schema,
strict: true,
},
}
: { type: "json_object" },
});
if (jsonCompletion.choices[0].message.refusal !== null) { // const jsonCompletion = await openai.beta.chat.completions.parse({
throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal); // temperature: 0,
} // model,
// messages: [
// { role: "system", content: systemPrompt ?? "" },
// { role: "user", content: [{ type: "text", text: extractionContent }] },
// {
// role: "user",
// content: prompt
// ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
// : "Transform the above content into structured JSON output.",
// },
// ],
// response_format: schema
// ? {
// type: "json_schema",
// json_schema: {
// name: "websiteContent",
// schema: schema,
// strict: true,
// },
// }
// : { type: "json_object" },
// });
const extraction = jsonCompletion.choices[0].message.parsed; // if (jsonCompletion.choices[0].message.refusal !== null) {
return { // throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
content: extraction ?? "", // }
metadata: {
numTokens, // const extraction = jsonCompletion.choices[0].message.parsed;
warning, // return {
}, // content: extraction ?? "",
}; // metadata: {
} // numTokens,
// warning,
// },
// };
// }

View File

@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
} }
} }
async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> { export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, warning: string | undefined }> {
let extract: any;
let warning: string | undefined;
const openai = new OpenAI(); const openai = new OpenAI();
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini"; const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
if (document.markdown === undefined) { if (markdown === undefined) {
throw new Error("document.markdown is undefined -- this is unexpected"); throw new Error("document.markdown is undefined -- this is unexpected");
} }
let extractionContent = document.markdown;
// count number of tokens // count number of tokens
let numTokens = 0; let numTokens = 0;
const encoder = encoding_for_model(model as TiktokenModel); const encoder = encoding_for_model(model as TiktokenModel);
try { try {
// Encode the message into tokens // Encode the message into tokens
const tokens = encoder.encode(extractionContent); const tokens = encoder.encode(markdown);
// Return the number of tokens // Return the number of tokens
numTokens = tokens.length; numTokens = tokens.length;
} catch (error) { } catch (error) {
logger.warn("Calculating num tokens of string failed", { error, extractionContent }); logger.warn("Calculating num tokens of string failed", { error, markdown });
extractionContent = extractionContent.slice(0, maxTokens * modifier); markdown = markdown.slice(0, maxTokens * modifier);
const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support."; let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
document.warning = document.warning === undefined ? warning : " " + warning; warning = previousWarning === undefined ? w : w + " " + previousWarning;
} finally { } finally {
// Free the encoder resources after use // Free the encoder resources after use
encoder.free(); encoder.free();
@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
if (numTokens > maxTokens) { if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters // trim the document to the maximum number of tokens, tokens != characters
extractionContent = extractionContent.slice(0, maxTokens * modifier); markdown = markdown.slice(0, maxTokens * modifier);
const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed."; const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
document.warning = document.warning === undefined ? warning : " " + warning; warning = previousWarning === undefined ? w : w + " " + previousWarning;
} }
let schema = options.schema; let schema = options.schema;
@ -120,7 +121,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
}, },
{ {
role: "user", role: "user",
content: [{ type: "text", text: extractionContent }], content: [{ type: "text", text: markdown }],
}, },
{ {
role: "user", role: "user",
@ -143,11 +144,11 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal); throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
} }
document.extract = jsonCompletion.choices[0].message.parsed; extract = jsonCompletion.choices[0].message.parsed;
if (document.extract === null && jsonCompletion.choices[0].message.content !== null) { if (extract === null && jsonCompletion.choices[0].message.content !== null) {
try { try {
document.extract = JSON.parse(jsonCompletion.choices[0].message.content); extract = JSON.parse(jsonCompletion.choices[0].message.content);
} catch (e) { } catch (e) {
logger.error("Failed to parse returned JSON, no schema specified.", { error: e }); logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object."); throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
@ -155,14 +156,21 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
} }
if (options.schema && options.schema.type === "array") { if (options.schema && options.schema.type === "array") {
document.extract = document.extract?.items; extract = extract?.items;
} }
return document; return { extract, warning };
} }
export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> { export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
if (meta.options.formats.includes("extract")) { if (meta.options.formats.includes("extract")) {
document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!); const { extract, warning } = await generateOpenAICompletions(
meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
meta.options.extract!,
document.markdown,
document.warning,
);
document.extract = extract;
document.warning = warning;
} }
return document; return document;

View File

@ -37,7 +37,6 @@ export async function fireEngineMap(
); );
return []; return [];
} }
console.log("process.env.FIRE_ENGINE_BETA_URL", process.env.FIRE_ENGINE_BETA_URL);
const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, { const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
method: "POST", method: "POST",