mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)

Commit: 904c904971 ("wip")
Parent: 25f32000db
151 apps/api/src/__tests__/e2e_extract/index.test.ts Normal file (new)
@@ -0,0 +1,151 @@
import request from "supertest";
import dotenv from "dotenv";
import {
  FirecrawlCrawlResponse,
  FirecrawlCrawlStatusResponse,
  FirecrawlScrapeResponse,
} from "../../types";

dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for Extract API Routes", () => {
  describe("POST /v1/extract", () => {
    it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev"],
          prompt: "Who are the authors of the blog posts?",
          schema: {
            type: "object",
            properties: { authors: { type: "array", items: { type: "string" } } },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("founders");

      let gotItRight = 0;
      for (const author of response.body.data?.authors) {
        if (author.includes("Caleb Peffer")) gotItRight++;
        if (author.includes("Gergő Móricz")) gotItRight++;
        if (author.includes("Eric Ciarla")) gotItRight++;
        if (author.includes("Nicolas Camara")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(3);
    }, 60000);

    it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["mendable.ai"],
          prompt: "Who are the founders of the company?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: { founders: { type: "array", items: { type: "string" } } },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("founders");

      let gotItRight = 0;
      for (const founder of response.body.data?.founders) {
        if (founder.includes("Caleb")) gotItRight++;
        if (founder.includes("Eric")) gotItRight++;
        if (founder.includes("Nicolas")) gotItRight++;
      }

      expect(gotItRight).toBe(3);
    }, 60000);

    it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev"],
          prompt: "What are they hiring for?",
          allowExternalLinks: true,
          schema: {
            type: "array",
            items: {
              type: "string"
            }
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      console.log(response.body.data);

      let gotItRight = 0;
      for (const hiring of response.body.data?.items) {
        if (hiring.includes("Developer Relations Specialist")) gotItRight++;
        if (hiring.includes("Web Automation Engineer")) gotItRight++;
        if (hiring.includes("Developer Experience Engineer")) gotItRight++;
        if (hiring.includes("Developer Support Engineer")) gotItRight++;
        if (hiring.includes("Dev Ops Engineer")) gotItRight++;
        if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(5);
    }, 60000);

    it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com"],
          prompt: "Does Fivetran have PCI DSS compliance?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: {
              pciDssCompliance: { type: "boolean" }
            }
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.pciDssCompliance).toBe(true);
    }, 60000);

    it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com"],
          prompt: "What are the Azure Data Connectors they offer?",
          schema: {
            type: "array",
            items: {
              type: "object",
              properties: {
                connector: { type: "string" },
                description: { type: "string" },
                supportsCaptureDelete: { type: "boolean" }
              }
            }
          }
        })

      console.log(response.body);
      // expect(response.statusCode).toBe(200);
      // expect(response.body).toHaveProperty("data");
      // expect(response.body.data?.pciDssCompliance).toBe(true);
    }, 60000);
  });
});
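
The assertions above rely on the extract endpoint's success payload. As a rough sketch (based only on the tests above and the controller's 200 response later in this commit; the interface name and the exact metadata fields are assumptions, not the project's declared types), the body looks like this:

// Sketch of a successful POST /v1/extract response body as assumed by these tests.
// data carries the properties named in the request schema (authors, founders, pciDssCompliance, ...);
// scrape_id mirrors the controller's scrape_id: id further down in this commit.
interface ExtractSuccessBody {
  success: true;
  data: Record<string, unknown>; // e.g. { authors: string[] }
  scrape_id?: string;
}

const exampleBody: ExtractSuccessBody = {
  success: true,
  data: { authors: ["Caleb Peffer", "Gergő Móricz", "Eric Ciarla", "Nicolas Camara"] },
  scrape_id: "<uuid generated per request>",
};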

@@ -26,22 +26,24 @@ import { waitForJob } from "../../services/queue-jobs";
import { addScrapeJob } from "../../services/queue-jobs";
import { PlanType } from "../../types";
import { getJobPriority } from "../../lib/job-priority";
import { generateFinalExtraction } from "../../lib/extract/completions";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";

configDotenv();
const redis = new Redis(process.env.REDIS_URL!);

const MAX_EXTRACT_LIMIT = 100;
const MAX_RANKING_LIMIT = 3;
const MAX_RANKING_LIMIT = 5;
const SCORE_THRESHOLD = 0.75;

export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<any> //ExtractResponse>
  res: Response<ExtractResponse>
) {
  req.body = extractRequestSchema.parse(req.body);

  const id = crypto.randomUUID();
  let links: string[] = req.body.urls;
  let links: string[]; //= req.body.urls;

  const sc: StoredCrawl = {
    originUrl: req.body.urls[0],

@@ -59,10 +61,14 @@ export async function extractController(
  const crawler = crawlToCrawler(id, sc);

  let urlWithoutWww = req.body.urls[0].replace("www.", "");
  console.log("urlWithoutWww", urlWithoutWww);

  let mapUrl = req.body.prompt
    ? `"${req.body.prompt}" site:${urlWithoutWww}`
    : `site:${req.body.urls[0]}`;
  const allowExternalLinks = req.body.allowExternalLinks ?? false;

  let mapUrl = req.body.prompt && allowExternalLinks
    ? `${req.body.prompt} ${urlWithoutWww}`
    : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
    : `site:${urlWithoutWww}`;

  const resultsPerPage = 100;
  const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage);

@@ -84,82 +90,103 @@ export async function extractController(
    };

    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
    allResults = await Promise.all(pagePromises);
    allResults = (await Promise.all(pagePromises)).flat();
    // console.log("allResults", allResults);
    // if allResults is empty, return an error
    if (allResults.length === 0) {
      return res.status(400).json({
        success: false,
        error: "No results found",
      });
    }

    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
  }

  // console.log("allResults", allResults);
  // Parallelize sitemap fetch with serper search
  const [sitemap, ...searchResults] = await Promise.all([
    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
    ...(cachedResult ? [] : pagePromises),
  ]);
  // const [sitemap, ...searchResults] = await Promise.all([
  // req.body.ignoreSitemap ? null : null, // crawler.tryGetSitemap(),
  // ...(cachedResult ? [] : pagePromises),
  // ]);

  if (!cachedResult) {
    allResults = searchResults;
  }
  // if (!cachedResult) {
  // allResults = searchResults;
  // }

  if (sitemap !== null) {
    sitemap.forEach((x) => {
      links.push(x.url);
    });
  }
  links = allResults.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
  console.log("links", links);
  // if (sitemap !== null) {
  // sitemap.forEach((x) => {
  // links.push(x.url);
  // });
  // }

  let mapResults = allResults
    .flat()
    .filter((result) => result !== null && result !== undefined);
  // let mapResults = allResults
  // .flat()
  // .filter((result) => result !== null && result !== undefined);

  const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
  if (mapResults.length > minumumCutoff) {
    mapResults = mapResults.slice(0, minumumCutoff);
  }
  // const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
  // if (mapResults.length > minumumCutoff) {
  // mapResults = mapResults.slice(0, minumumCutoff);
  // }

  if (mapResults.length > 0) {
    if (req.body.prompt) {
      // Ensure all map results are first, maintaining their order
      links = [
        mapResults[0].url,
        ...mapResults.slice(1).map((x) => x.url),
        ...links,
      ];
    } else {
      mapResults.map((x) => {
        links.push(x.url);
      });
    }
  }
  // if (mapResults.length > 0) {
  // if (req.body.prompt) {
  // // Ensure all map results are first, maintaining their order
  // links = [
  // mapResults[0].url,
  // ...mapResults.slice(1).map((x) => x.url),
  // ...links,
  // ];
  // } else {
  // mapResults.map((x) => {
  // links.push(x.url);
  // });
  // }
  // }

  // console.log("mapResults", mapResults);

  // console.log("links", links);
  let linksAndScores: { link: string; score: number }[] = [];
  // Perform cosine similarity between the search query and the list of links
  if (req.body.prompt) {
    const searchQuery = req.body.prompt.toLowerCase();
    const searchQuery = mapUrl; //req.body.prompt.toLowerCase();
    linksAndScores = await performRanking(links, searchQuery);
  }
  console.log("linksAndScores", linksAndScores);
  links = linksAndScores
    .filter(x => x.score > SCORE_THRESHOLD)
    .map(x => x.link.split("url: ")[1].split(",")[0])
    .filter(x => !isUrlBlocked(x))

  console.log("links:", links.length);

  // should we use some sort of llm to determine the best links?

  // console.log("linksAndScores", linksAndScores);

  links = links
    .map((x) => {
      try {
        return checkAndUpdateURLForMap(x).url.trim();
      } catch (_) {
        return null;
      }
    })
    .filter((x) => x !== null) as string[];
  // links = links
  // .map((x) => {
  // try {
  // return checkAndUpdateURLForMap(x).url.trim();
  // } catch (_) {
  // return null;
  // }
  // })
  // .filter((x) => x !== null) as string[];

  // allows for subdomains to be included
  links = links.filter((x) => isSameDomain(x, req.body.urls[0]));
  // links = links.filter((x) => isSameDomain(x, req.body.urls[0]));

  // if includeSubdomains is false, filter out subdomains
  if (!req.body.includeSubdomains) {
    links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
  }
  // if (!req.body.includeSubdomains) {
  // links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
  // }

  // remove duplicates that could be due to http/https or www
  links = removeDuplicateUrls(links);
  // links = removeDuplicateUrls(links);

  // get top N links
  links = links.slice(0, MAX_RANKING_LIMIT);

@@ -170,7 +197,7 @@ export async function extractController(

  for (const url of links) {
    const origin = req.body.origin || "api";
    const timeout = req.body.timeout;
    const timeout = req.body.timeout ?? 30000;
    const jobId = crypto.randomUUID();

    const startTime = new Date().getTime();

@@ -196,7 +223,7 @@ export async function extractController(
      jobPriority
    );

    const totalWait = 60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
    const totalWait = 0 //60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);

    let doc: Document;
    try {

@@ -234,18 +261,20 @@ export async function extractController(
    docs.push(doc);
  }

  console.log(docs)

  // console.log("docs", docs);
  const completions = await generateOpenAICompletions(
    logger.child({ method: "extractController/generateOpenAICompletions" }),
    {
      mode: "llm",
      systemPrompt: "Only use the provided content to answer the question.",
      prompt: mapUrl,
      schema: req.body.schema,
    },
    docs.map(x => x.markdown).join('\n')
  );

  // {"message":"Missing required parameter: 'response_format.json_schema.schema'.","type":"invalid_request_error","param":"response_format.json_schema.schema","code":"missing_required_parameter"},"code":"missing_required_parameter","param":"response_format.json_schema.schema","type":"invalid_request_error"}
  const completions = await generateFinalExtraction({
    pagesContent: docs.map(x => x.markdown).join('\n'),
    systemPrompt: '',
    prompt: req.body.prompt,
    schema: req.body.schema,
  });

  // console.log("completions", completions);
  console.log("completions", completions);

  // if(req.body.extract && req.body.formats.includes("extract")) {
  // creditsToBeBilled = 5;

@@ -315,9 +344,18 @@ export async function extractController(
  // scrape_id: result.scrape_id
  // };

  console.log("completions.extract", completions.extract);

  let data: any;
  try {
    data = JSON.parse(completions.extract);
  } catch (e) {
    data = completions.extract;
  }

  return res.status(200).json({
    success: true,
    data: completions.content, // includeMetadata ? mapResults : linksToReturn,
    data: data, // includeMetadata ? mapResults : linksToReturn,
    scrape_id: id, //origin?.includes("website") ? id : undefined,
  });
}
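
One detail worth noting in the controller above: before ranking, each search result is serialized into a "url: ..., title: ..., description: ..." string, performRanking scores those strings against mapUrl, and the URL is then recovered by splitting the string back apart. A minimal sketch of that filter step, under the assumption that performRanking (not shown in this diff) returns link/score pairs for those serialized strings:

// Sketch of the post-ranking filter: keep candidates above the score threshold,
// then recover the raw URL from the serialized "url: ..., title: ..., description: ..." string.
function topRankedUrls(
  linksAndScores: { link: string; score: number }[],
  threshold = 0.75, // SCORE_THRESHOLD in the controller
): string[] {
  return linksAndScores
    .filter((x) => x.score > threshold)
    .map((x) => x.link.split("url: ")[1].split(",")[0]);
}

Note that the split on "," truncates any URL that itself contains a comma; the controller's inline version behaves the same way.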

@@ -161,6 +161,7 @@ export const extractV1Options = z.object({
  limit: z.number().int().positive().finite().safe().optional(),
  ignoreSitemap: z.boolean().default(false),
  includeSubdomains: z.boolean().default(true),
  allowExternalLinks: z.boolean().default(false),
  origin: z.string().optional().default("api"),
  timeout: z.number().int().positive().finite().safe().default(60000),
}).strict(strictMessage)
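
For context, a request body that the updated options would accept; this is a sketch only, with urls, prompt, and schema taken from the tests earlier in this commit and the remaining fields from the defaults above (values are illustrative):

// Illustrative /v1/extract request exercising the newly added allowExternalLinks flag.
const exampleRequest = {
  urls: ["https://firecrawl.dev"],
  prompt: "Who are the founders of the company?",
  schema: {
    type: "object",
    properties: { founders: { type: "array", items: { type: "string" } } },
  },
  allowExternalLinks: true, // new in this commit; defaults to false
  includeSubdomains: true,  // default
  ignoreSitemap: false,     // default
  timeout: 60000,           // default, in milliseconds
};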

@@ -353,7 +354,7 @@ export type ExtractResponse =
  | {
      success: true;
      warning?: string;
      data: Document;
      data: z.infer<typeof extractRequestSchema>;
      scrape_id?: string;
    };

@@ -1,121 +1,124 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { ExtractOptions } from "../../controllers/v1/types";
import { Document } from "../entities";
import { z } from "zod";
// use llmExtract.ts instead

const maxTokens = 32000;
const modifier = 4;
// import OpenAI from "openai";
// import { encoding_for_model } from "@dqbd/tiktoken";
// import { TiktokenModel } from "@dqbd/tiktoken";
// import { ExtractOptions } from "../../controllers/v1/types";
// import { Document } from "../entities";
// import { z } from "zod";

export class LLMRefusalError extends Error {
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    this.name = "LLMRefusalError";
  }
}
// const maxTokens = 32000;
// const modifier = 4;

interface GenerateCompletionsParams {
  systemPrompt?: string;
  prompt?: string;
  schema?: any;
  pagesContent: string;
}
// export class LLMRefusalError extends Error {
// constructor(refusal: string) {
// super("LLM refused to extract the website's content");
// this.name = "LLMRefusalError";
// }
// }

export async function generateBasicCompletion(prompt: string) {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// interface GenerateCompletionsParams {
// systemPrompt?: string;
// prompt?: string;
// schema?: any;
// pagesContent: string;
// }

  const completion = await openai.chat.completions.create({
    model,
    messages: [{ role: "user", content: prompt }],
  });
// export async function generateBasicCompletion(prompt: string) {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  return completion.choices[0].message.content;
}
// const completion = await openai.chat.completions.create({
// model,
// messages: [{ role: "user", content: prompt }],
// });

export async function generateFinalExtraction({
  pagesContent,
  systemPrompt,
  prompt,
  schema,
}: GenerateCompletionsParams): Promise<{
  content: string;
  metadata: { numTokens: number; warning: string };
}> {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// return completion.choices[0].message.content;
// }

  let extractionContent = pagesContent;
  let numTokens = 0;
  let warning = "";
// export async function generateFinalExtraction({
// pagesContent,
// systemPrompt,
// prompt,
// schema,
// }: GenerateCompletionsParams): Promise<{
// content: string;
// metadata: { numTokens: number; warning: string };
// }> {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  const encoder = encoding_for_model(model);
  try {
    const tokens = encoder.encode(extractionContent);
    numTokens = tokens.length;
  } catch (error) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
  } finally {
    encoder.free();
  }
// let extractionContent = pagesContent;
// let numTokens = 0;
// let warning = "";

  if (numTokens > maxTokens) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
  }
// const encoder = encoding_for_model(model);
// try {
// const tokens = encoder.encode(extractionContent);
// numTokens = tokens.length;
// } catch (error) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
// } finally {
// encoder.free();
// }

  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  } else if (schema) {
    schema.additionalProperties = false;
    schema.required = Object.keys(schema.properties);
  }
// if (numTokens > maxTokens) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
// }

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    messages: [
      { role: "system", content: systemPrompt ?? "" },
      { role: "user", content: [{ type: "text", text: extractionContent }] },
      {
        role: "user",
        content: prompt
          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
          : "Transform the above content into structured JSON output.",
      },
    ],
    response_format: schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "websiteContent",
            schema: schema,
            strict: true,
          },
        }
      : { type: "json_object" },
  });
// if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
// schema = {
// type: "object",
// properties: {
// items: schema,
// },
// required: ["items"],
// additionalProperties: false,
// };
// } else if (schema) {
// schema.additionalProperties = false;
// schema.required = Object.keys(schema.properties);
// }

  if (jsonCompletion.choices[0].message.refusal !== null) {
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }
// const jsonCompletion = await openai.beta.chat.completions.parse({
// temperature: 0,
// model,
// messages: [
// { role: "system", content: systemPrompt ?? "" },
// { role: "user", content: [{ type: "text", text: extractionContent }] },
// {
// role: "user",
// content: prompt
// ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
// : "Transform the above content into structured JSON output.",
// },
// ],
// response_format: schema
// ? {
// type: "json_schema",
// json_schema: {
// name: "websiteContent",
// schema: schema,
// strict: true,
// },
// }
// : { type: "json_object" },
// });

  const extraction = jsonCompletion.choices[0].message.parsed;
  return {
    content: extraction ?? "",
    metadata: {
      numTokens,
      warning,
    },
  };
}
// if (jsonCompletion.choices[0].message.refusal !== null) {
// throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
// }

// const extraction = jsonCompletion.choices[0].message.parsed;
// return {
// content: extraction ?? "",
// metadata: {
// numTokens,
// warning,
// },
// };
// }

@@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
  }
}

async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> {
export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, warning: string | undefined }> {
  let extract: any;
  let warning: string | undefined;

  const openai = new OpenAI();
  const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  if (document.markdown === undefined) {
  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  let extractionContent = document.markdown;

  // count number of tokens
  let numTokens = 0;
  const encoder = encoding_for_model(model as TiktokenModel);
  try {
    // Encode the message into tokens
    const tokens = encoder.encode(extractionContent);
    const tokens = encoder.encode(markdown);

    // Return the number of tokens
    numTokens = tokens.length;
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error, extractionContent });
    logger.warn("Calculating num tokens of string failed", { error, markdown });

    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    markdown = markdown.slice(0, maxTokens * modifier);

    const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
    document.warning = document.warning === undefined ? warning : " " + warning;
    let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  } finally {
    // Free the encoder resources after use
    encoder.free();

@@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt

  if (numTokens > maxTokens) {
    // trim the document to the maximum number of tokens, tokens != characters
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    markdown = markdown.slice(0, maxTokens * modifier);

    const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
    document.warning = document.warning === undefined ? warning : " " + warning;
    const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  }

  let schema = options.schema;

@@ -120,7 +121,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
      },
      {
        role: "user",
        content: [{ type: "text", text: extractionContent }],
        content: [{ type: "text", text: markdown }],
      },
      {
        role: "user",

@@ -143,11 +144,11 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }

  document.extract = jsonCompletion.choices[0].message.parsed;
  extract = jsonCompletion.choices[0].message.parsed;

  if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
  if (extract === null && jsonCompletion.choices[0].message.content !== null) {
    try {
      document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
      extract = JSON.parse(jsonCompletion.choices[0].message.content);
    } catch (e) {
      logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
      throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");

@@ -155,14 +156,21 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
  }

  if (options.schema && options.schema.type === "array") {
    document.extract = document.extract?.items;
    extract = extract?.items;
  }
  return document;
  return { extract, warning };
}

export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
  if (meta.options.formats.includes("extract")) {
    document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
    const { extract, warning } = await generateOpenAICompletions(
      meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
      meta.options.extract!,
      document.markdown,
      document.warning,
    );
    document.extract = extract;
    document.warning = warning;
  }

  return document;
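
With this refactor, callers pass the markdown and any prior warning in explicitly and get the extracted object plus an accumulated warning back, as performLLMExtract does above and as the extract controller does earlier in this commit. A sketch of the call shape (the wrapper function and its parameter names are placeholders, and the options literal mirrors the controller's rather than the full ExtractOptions type):

// Sketch of the new generateOpenAICompletions call shape: content in, { extract, warning } out.
async function runExtract(logger: Logger, markdown: string, schema: any, previousWarning?: string) {
  const { extract, warning } = await generateOpenAICompletions(
    logger.child({ method: "example/generateOpenAICompletions" }),
    { mode: "llm", systemPrompt: "Only use the provided content to answer the question.", prompt: "example", schema },
    markdown,        // the function throws if this is undefined
    previousWarning, // any earlier warning; the new warning is prepended to it
  );
  return { extract, warning };
}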

@@ -37,7 +37,6 @@ export async function fireEngineMap(
    );
    return [];
  }
  console.log("process.env.FIRE_ENGINE_BETA_URL", process.env.FIRE_ENGINE_BETA_URL);

  const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
    method: "POST",