mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00

wip

parent 25f32000db
commit 904c904971

apps/api/src/__tests__/e2e_extract/index.test.ts (new file, 151 lines added)
@@ -0,0 +1,151 @@
import request from "supertest";
import dotenv from "dotenv";
import {
  FirecrawlCrawlResponse,
  FirecrawlCrawlStatusResponse,
  FirecrawlScrapeResponse,
} from "../../types";

dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for Extract API Routes", () => {
  describe("POST /v1/extract", () => {
    it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev"],
          prompt: "Who are the authors of the blog posts?",
          schema: {
            type: "object",
            properties: { authors: { type: "array", items: { type: "string" } } },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("authors");

      let gotItRight = 0;
      for (const author of response.body.data?.authors) {
        if (author.includes("Caleb Peffer")) gotItRight++;
        if (author.includes("Gergő Móricz")) gotItRight++;
        if (author.includes("Eric Ciarla")) gotItRight++;
        if (author.includes("Nicolas Camara")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(3);
    }, 60000);

    it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["mendable.ai"],
          prompt: "Who are the founders of the company?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: { founders: { type: "array", items: { type: "string" } } },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("founders");

      let gotItRight = 0;
      for (const founder of response.body.data?.founders) {
        if (founder.includes("Caleb")) gotItRight++;
        if (founder.includes("Eric")) gotItRight++;
        if (founder.includes("Nicolas")) gotItRight++;
      }

      expect(gotItRight).toBe(3);
    }, 60000);

    it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev"],
          prompt: "What are they hiring for?",
          allowExternalLinks: true,
          schema: {
            type: "array",
            items: {
              type: "string"
            }
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      console.log(response.body.data);

      let gotItRight = 0;
      for (const hiring of response.body.data?.items) {
        if (hiring.includes("Developer Relations Specialist")) gotItRight++;
        if (hiring.includes("Web Automation Engineer")) gotItRight++;
        if (hiring.includes("Developer Experience Engineer")) gotItRight++;
        if (hiring.includes("Developer Support Engineer")) gotItRight++;
        if (hiring.includes("Dev Ops Engineer")) gotItRight++;
        if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(5);
    }, 60000);

    it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com"],
          prompt: "Does Fivetran have PCI DSS compliance?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: {
              pciDssCompliance: { type: "boolean" }
            }
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.pciDssCompliance).toBe(true);
    }, 60000);

    it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com"],
          prompt: "What are the Azure Data Connectors they offer?",
          schema: {
            type: "array",
            items: {
              type: "object",
              properties: {
                connector: { type: "string" },
                description: { type: "string" },
                supportsCaptureDelete: { type: "boolean" }
              }
            }
          }
        });

      console.log(response.body);
      // expect(response.statusCode).toBe(200);
      // expect(response.body).toHaveProperty("data");
      // expect(response.body.data?.pciDssCompliance).toBe(true);
    }, 60000);
  });
});
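Outside the Jest harness, the request shape these tests exercise can be reproduced with a plain HTTP call. A minimal sketch against a local instance (the port, header names, and body fields come from the test setup above; the API key is whatever TEST_API_KEY holds):

// Sketch only: mirrors the first test's request via fetch instead of supertest.
const res = await fetch("http://127.0.0.1:3002/v1/extract", {
  method: "POST",
  headers: {
    "Authorization": `Bearer ${process.env.TEST_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    urls: ["https://firecrawl.dev"],
    prompt: "Who are the authors of the blog posts?",
    schema: {
      type: "object",
      properties: { authors: { type: "array", items: { type: "string" } } },
    },
  }),
});
const body = await res.json(); // expected shape: { success: true, data: { authors: [...] }, scrape_id: "..." }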
@@ -26,22 +26,24 @@ import { waitForJob } from "../../services/queue-jobs";
 import { addScrapeJob } from "../../services/queue-jobs";
 import { PlanType } from "../../types";
 import { getJobPriority } from "../../lib/job-priority";
-import { generateFinalExtraction } from "../../lib/extract/completions";
+import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";

 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);

 const MAX_EXTRACT_LIMIT = 100;
-const MAX_RANKING_LIMIT = 3;
+const MAX_RANKING_LIMIT = 5;
+const SCORE_THRESHOLD = 0.75;

 export async function extractController(
   req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
-  res: Response<any> //ExtractResponse>
+  res: Response<ExtractResponse>
 ) {
   req.body = extractRequestSchema.parse(req.body);

   const id = crypto.randomUUID();
-  let links: string[] = req.body.urls;
+  let links: string[]; //= req.body.urls;

   const sc: StoredCrawl = {
     originUrl: req.body.urls[0],
@@ -59,10 +61,14 @@ export async function extractController(
   const crawler = crawlToCrawler(id, sc);

   let urlWithoutWww = req.body.urls[0].replace("www.", "");
+  console.log("urlWithoutWww", urlWithoutWww);

-  let mapUrl = req.body.prompt
-    ? `"${req.body.prompt}" site:${urlWithoutWww}`
-    : `site:${req.body.urls[0]}`;
+  const allowExternalLinks = req.body.allowExternalLinks ?? false;
+
+  let mapUrl = req.body.prompt && allowExternalLinks
+    ? `${req.body.prompt} ${urlWithoutWww}`
+    : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
+    : `site:${urlWithoutWww}`;

   const resultsPerPage = 100;
   const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage);
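The new allowExternalLinks flag changes how the map query is built. A small sketch of the three cases, using made-up prompt and domain values (the ternary itself is the one added above):

// Illustrative inputs only.
const prompt = "Who are the founders of the company?";
const urlWithoutWww = "mendable.ai";
const allowExternalLinks = true;

const mapUrl = prompt && allowExternalLinks
  ? `${prompt} ${urlWithoutWww}`               // "Who are the founders of the company? mendable.ai"
  : prompt ? `${prompt} site:${urlWithoutWww}` // prompt given, but restricted to the site
  : `site:${urlWithoutWww}`;                   // no prompt: plain site query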
@@ -84,82 +90,103 @@ export async function extractController(
     };

     pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
-    allResults = await Promise.all(pagePromises);
+    allResults = (await Promise.all(pagePromises)).flat();
+    // console.log("allResults", allResults);
+    // if allResults is empty, return an error
+    if (allResults.length === 0) {
+      return res.status(400).json({
+        success: false,
+        error: "No results found",
+      });
+    }

     await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
   }

   // console.log("allResults", allResults);
   // Parallelize sitemap fetch with serper search
-  const [sitemap, ...searchResults] = await Promise.all([
-    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
-    ...(cachedResult ? [] : pagePromises),
-  ]);
+  // const [sitemap, ...searchResults] = await Promise.all([
+  //   req.body.ignoreSitemap ? null : null, // crawler.tryGetSitemap(),
+  //   ...(cachedResult ? [] : pagePromises),
+  // ]);

-  if (!cachedResult) {
-    allResults = searchResults;
-  }
+  // if (!cachedResult) {
+  //   allResults = searchResults;
+  // }

-  if (sitemap !== null) {
-    sitemap.forEach((x) => {
-      links.push(x.url);
-    });
-  }
+  links = allResults.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
+  console.log("links", links);
+  // if (sitemap !== null) {
+  //   sitemap.forEach((x) => {
+  //     links.push(x.url);
+  //   });
+  // }

-  let mapResults = allResults
-    .flat()
-    .filter((result) => result !== null && result !== undefined);
+  // let mapResults = allResults
+  //   .flat()
+  //   .filter((result) => result !== null && result !== undefined);

-  const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
-  if (mapResults.length > minumumCutoff) {
-    mapResults = mapResults.slice(0, minumumCutoff);
-  }
+  // const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
+  // if (mapResults.length > minumumCutoff) {
+  //   mapResults = mapResults.slice(0, minumumCutoff);
+  // }

-  if (mapResults.length > 0) {
-    if (req.body.prompt) {
-      // Ensure all map results are first, maintaining their order
-      links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
-        ...links,
-      ];
-    } else {
-      mapResults.map((x) => {
-        links.push(x.url);
-      });
-    }
-  }
+  // if (mapResults.length > 0) {
+  //   if (req.body.prompt) {
+  //     // Ensure all map results are first, maintaining their order
+  //     links = [
+  //       mapResults[0].url,
+  //       ...mapResults.slice(1).map((x) => x.url),
+  //       ...links,
+  //     ];
+  //   } else {
+  //     mapResults.map((x) => {
+  //       links.push(x.url);
+  //     });
+  //   }
+  // }

+  // console.log("mapResults", mapResults);

   // console.log("links", links);
   let linksAndScores: { link: string; score: number }[] = [];
   // Perform cosine similarity between the search query and the list of links
   if (req.body.prompt) {
-    const searchQuery = req.body.prompt.toLowerCase();
+    const searchQuery = mapUrl; //req.body.prompt.toLowerCase();
     linksAndScores = await performRanking(links, searchQuery);
   }
+  console.log("linksAndScores", linksAndScores);
+  links = linksAndScores
+    .filter(x => x.score > SCORE_THRESHOLD)
+    .map(x => x.link.split("url: ")[1].split(",")[0])
+    .filter(x => !isUrlBlocked(x))
+
+  console.log("links:", links.length);
+
+  // should we use some sort of llm to determine the best links?

   // console.log("linksAndScores", linksAndScores);

-  links = links
-    .map((x) => {
-      try {
-        return checkAndUpdateURLForMap(x).url.trim();
-      } catch (_) {
-        return null;
-      }
-    })
-    .filter((x) => x !== null) as string[];
+  // links = links
+  //   .map((x) => {
+  //     try {
+  //       return checkAndUpdateURLForMap(x).url.trim();
+  //     } catch (_) {
+  //       return null;
+  //     }
+  //   })
+  //   .filter((x) => x !== null) as string[];

   // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, req.body.urls[0]));
+  // links = links.filter((x) => isSameDomain(x, req.body.urls[0]));

   // if includeSubdomains is false, filter out subdomains
-  if (!req.body.includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
-  }
+  // if (!req.body.includeSubdomains) {
+  //   links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
+  // z}

   // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
+  // links = removeDuplicateUrls(links);

   // get top N links
   links = links.slice(0, MAX_RANKING_LIMIT);
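Because links now holds "url: ..., title: ..., description: ..." strings, the ranked results are filtered by SCORE_THRESHOLD and the bare URL is recovered by splitting. A small sketch with made-up entries (performRanking is assumed to return { link, score } pairs, matching the type annotation above):

// Hypothetical ranked entries in the string format built above.
const linksAndScores = [
  { link: "url: https://firecrawl.dev/blog, title: Blog, description: Posts by the team", score: 0.82 },
  { link: "url: https://firecrawl.dev/pricing, title: Pricing, description: Plans", score: 0.41 },
];

const filtered = linksAndScores
  .filter(x => x.score > 0.75)                        // SCORE_THRESHOLD: only the blog entry survives
  .map(x => x.link.split("url: ")[1].split(",")[0]);  // -> ["https://firecrawl.dev/blog"]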
@@ -170,7 +197,7 @@ export async function extractController(

   for (const url of links) {
     const origin = req.body.origin || "api";
-    const timeout = req.body.timeout;
+    const timeout = req.body.timeout ?? 30000;
     const jobId = crypto.randomUUID();

     const startTime = new Date().getTime();
@@ -196,7 +223,7 @@ export async function extractController(
       jobPriority
     );

-    const totalWait = 60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
+    const totalWait = 0 //60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);

     let doc: Document;
     try {
@@ -234,18 +261,20 @@ export async function extractController(
     docs.push(doc);
   }

-  // console.log("docs", docs);
+  console.log(docs)
+
+  const completions = await generateOpenAICompletions(
+    logger.child({ method: "extractController/generateOpenAICompletions" }),
+    {
+      mode: "llm",
+      systemPrompt: "Only use the provided content to answer the question.",
+      prompt: mapUrl,
+      schema: req.body.schema,
+    },
+    docs.map(x => x.markdown).join('\n')
+  );

-  // {"message":"Missing required parameter: 'response_format.json_schema.schema'.","type":"invalid_request_error","param":"response_format.json_schema.schema","code":"missing_required_parameter"},"code":"missing_required_parameter","param":"response_format.json_schema.schema","type":"invalid_request_error"}
-  const completions = await generateFinalExtraction({
-    pagesContent: docs.map(x => x.markdown).join('\n'),
-    systemPrompt: '',
-    prompt: req.body.prompt,
-    schema: req.body.schema,
-  });
+  console.log("completions", completions);

-  // console.log("completions", completions);

   // if(req.body.extract && req.body.formats.includes("extract")) {
   //   creditsToBeBilled = 5;
@@ -315,9 +344,18 @@ export async function extractController(
   //   scrape_id: result.scrape_id
   // };

+  console.log("completions.extract", completions.extract);
+
+  let data: any;
+  try {
+    data = JSON.parse(completions.extract);
+  } catch (e) {
+    data = completions.extract;
+  }

   return res.status(200).json({
     success: true,
-    data: completions.content, // includeMetadata ? mapResults : linksToReturn,
+    data: data, // includeMetadata ? mapResults : linksToReturn,
     scrape_id: id, //origin?.includes("website") ? id : undefined,
   });
 }
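The handler now returns whatever the completion produced, JSON.parsed when possible and passed through as-is otherwise. A sketch of the resulting success payload (values are illustrative; the field names come from the res.status(200).json call above):

// Illustrative only: shape of the 200 response assembled by the controller.
const exampleResponse = {
  success: true,
  data: { founders: ["Caleb", "Eric", "Nicolas"] }, // parsed completions.extract
  scrape_id: "9b1deb4d-...",                        // the id generated with crypto.randomUUID()
};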
@@ -161,6 +161,7 @@ export const extractV1Options = z.object({
   limit: z.number().int().positive().finite().safe().optional(),
   ignoreSitemap: z.boolean().default(false),
   includeSubdomains: z.boolean().default(true),
+  allowExternalLinks: z.boolean().default(false),
   origin: z.string().optional().default("api"),
   timeout: z.number().int().positive().finite().safe().default(60000),
 }).strict(strictMessage)

@@ -353,7 +354,7 @@ export type ExtractResponse =
   | {
       success: true;
       warning?: string;
-      data: Document;
+      data: z.infer<typeof extractRequestSchema>;
       scrape_id?: string;
     };
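allowExternalLinks defaults to false, so requests that omit it behave as before. A quick sketch of the defaulting, assuming extractRequestSchema is the request schema derived from the extractV1Options object shown here:

// Sketch: Zod fills in the default when the field is omitted.
const parsed = extractRequestSchema.parse({
  urls: ["https://firecrawl.dev"],
  prompt: "Who are the authors of the blog posts?",
});
console.log(parsed.allowExternalLinks); // false unless the client sends it explicitly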
@@ -1,121 +1,124 @@
+// use llmExtract.ts instead
+
-import OpenAI from "openai";
+// import OpenAI from "openai";
-import { encoding_for_model } from "@dqbd/tiktoken";
+// import { encoding_for_model } from "@dqbd/tiktoken";
-import { TiktokenModel } from "@dqbd/tiktoken";
+// import { TiktokenModel } from "@dqbd/tiktoken";
-import { ExtractOptions } from "../../controllers/v1/types";
+// import { ExtractOptions } from "../../controllers/v1/types";
-import { Document } from "../entities";
+// import { Document } from "../entities";
-import { z } from "zod";
+// import { z } from "zod";

-const maxTokens = 32000;
+// const maxTokens = 32000;
-const modifier = 4;
+// const modifier = 4;

-export class LLMRefusalError extends Error {
+// export class LLMRefusalError extends Error {
-  constructor(refusal: string) {
+//   constructor(refusal: string) {
-    super("LLM refused to extract the website's content");
+//     super("LLM refused to extract the website's content");
-    this.name = "LLMRefusalError";
+//     this.name = "LLMRefusalError";
-  }
+//   }
-}
+// }

-interface GenerateCompletionsParams {
+// interface GenerateCompletionsParams {
-  systemPrompt?: string;
+//   systemPrompt?: string;
-  prompt?: string;
+//   prompt?: string;
-  schema?: any;
+//   schema?: any;
-  pagesContent: string;
+//   pagesContent: string;
-}
+// }

-export async function generateBasicCompletion(prompt: string) {
+// export async function generateBasicCompletion(prompt: string) {
-  const openai = new OpenAI();
+//   const openai = new OpenAI();
-  const model: TiktokenModel =
+//   const model: TiktokenModel =
-    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
+//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

-  const completion = await openai.chat.completions.create({
+//   const completion = await openai.chat.completions.create({
-    model,
+//     model,
-    messages: [{ role: "user", content: prompt }],
+//     messages: [{ role: "user", content: prompt }],
-  });
+//   });

-  return completion.choices[0].message.content;
+//   return completion.choices[0].message.content;
-}
+// }

-export async function generateFinalExtraction({
+// export async function generateFinalExtraction({
-  pagesContent,
+//   pagesContent,
-  systemPrompt,
+//   systemPrompt,
-  prompt,
+//   prompt,
-  schema,
+//   schema,
-}: GenerateCompletionsParams): Promise<{
+// }: GenerateCompletionsParams): Promise<{
-  content: string;
+//   content: string;
-  metadata: { numTokens: number; warning: string };
+//   metadata: { numTokens: number; warning: string };
-}> {
+// }> {
-  const openai = new OpenAI();
+//   const openai = new OpenAI();
-  const model: TiktokenModel =
+//   const model: TiktokenModel =
-    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
+//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

-  let extractionContent = pagesContent;
+//   let extractionContent = pagesContent;
-  let numTokens = 0;
+//   let numTokens = 0;
-  let warning = "";
+//   let warning = "";

-  const encoder = encoding_for_model(model);
+//   const encoder = encoding_for_model(model);
-  try {
+//   try {
-    const tokens = encoder.encode(extractionContent);
+//     const tokens = encoder.encode(extractionContent);
-    numTokens = tokens.length;
+//     numTokens = tokens.length;
-  } catch (error) {
+//   } catch (error) {
-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
+//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
-    warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
+//     warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
-  } finally {
+//   } finally {
-    encoder.free();
+//     encoder.free();
-  }
+//   }

-  if (numTokens > maxTokens) {
+//   if (numTokens > maxTokens) {
-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
+//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
-    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
+//     warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
-  }
+//   }

-  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
+//   if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
-    schema = {
+//     schema = {
-      type: "object",
+//       type: "object",
-      properties: {
+//       properties: {
-        items: schema,
+//         items: schema,
-      },
+//       },
-      required: ["items"],
+//       required: ["items"],
-      additionalProperties: false,
+//       additionalProperties: false,
-    };
+//     };
-  } else if (schema) {
+//   } else if (schema) {
-    schema.additionalProperties = false;
+//     schema.additionalProperties = false;
-    schema.required = Object.keys(schema.properties);
+//     schema.required = Object.keys(schema.properties);
-  }
+//   }

-  const jsonCompletion = await openai.beta.chat.completions.parse({
+//   const jsonCompletion = await openai.beta.chat.completions.parse({
+//     temperature: 0,
-    model,
+//     model,
-    messages: [
+//     messages: [
-      { role: "system", content: systemPrompt ?? "" },
+//       { role: "system", content: systemPrompt ?? "" },
-      { role: "user", content: [{ type: "text", text: extractionContent }] },
+//       { role: "user", content: [{ type: "text", text: extractionContent }] },
-      {
+//       {
-        role: "user",
+//         role: "user",
-        content: prompt
+//         content: prompt
-          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
+//           ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
-          : "Transform the above content into structured JSON output.",
+//           : "Transform the above content into structured JSON output.",
-      },
+//       },
-    ],
+//     ],
-    response_format: schema
+//     response_format: schema
-      ? {
+//       ? {
-          type: "json_schema",
+//           type: "json_schema",
-          json_schema: {
+//           json_schema: {
-            name: "websiteContent",
+//             name: "websiteContent",
-            schema: schema,
+//             schema: schema,
-            strict: true,
+//             strict: true,
-          },
+//           },
-        }
+//         }
-      : { type: "json_object" },
+//       : { type: "json_object" },
-  });
+//   });

-  if (jsonCompletion.choices[0].message.refusal !== null) {
+//   if (jsonCompletion.choices[0].message.refusal !== null) {
-    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
+//     throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
-  }
+//   }

-  const extraction = jsonCompletion.choices[0].message.parsed;
+//   const extraction = jsonCompletion.choices[0].message.parsed;
-  return {
+//   return {
-    content: extraction ?? "",
+//     content: extraction ?? "",
-    metadata: {
+//     metadata: {
-      numTokens,
+//       numTokens,
-      warning,
+//       warning,
-    },
+//     },
-  };
+//   };
-}
+// }
@@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
   }
 }

-async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> {
+export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, warning: string | undefined }> {
+  let extract: any;
+  let warning: string | undefined;

   const openai = new OpenAI();
   const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

-  if (document.markdown === undefined) {
+  if (markdown === undefined) {
     throw new Error("document.markdown is undefined -- this is unexpected");
   }

-  let extractionContent = document.markdown;

   // count number of tokens
   let numTokens = 0;
   const encoder = encoding_for_model(model as TiktokenModel);
   try {
     // Encode the message into tokens
-    const tokens = encoder.encode(extractionContent);
+    const tokens = encoder.encode(markdown);

     // Return the number of tokens
     numTokens = tokens.length;
   } catch (error) {
-    logger.warn("Calculating num tokens of string failed", { error, extractionContent });
+    logger.warn("Calculating num tokens of string failed", { error, markdown });

-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
+    markdown = markdown.slice(0, maxTokens * modifier);

-    const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
-    document.warning = document.warning === undefined ? warning : " " + warning;
+    let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
+    warning = previousWarning === undefined ? w : w + " " + previousWarning;
   } finally {
     // Free the encoder resources after use
     encoder.free();

@@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt

   if (numTokens > maxTokens) {
     // trim the document to the maximum number of tokens, tokens != characters
-    extractionContent = extractionContent.slice(0, maxTokens * modifier);
+    markdown = markdown.slice(0, maxTokens * modifier);

-    const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
-    document.warning = document.warning === undefined ? warning : " " + warning;
+    const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
+    warning = previousWarning === undefined ? w : w + " " + previousWarning;
   }

   let schema = options.schema;

@@ -120,7 +121,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
     },
     {
       role: "user",
-      content: [{ type: "text", text: extractionContent }],
+      content: [{ type: "text", text: markdown }],
     },
     {
       role: "user",

@@ -143,11 +144,11 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
     throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
   }

-  document.extract = jsonCompletion.choices[0].message.parsed;
+  extract = jsonCompletion.choices[0].message.parsed;

-  if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
+  if (extract === null && jsonCompletion.choices[0].message.content !== null) {
     try {
-      document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
+      extract = JSON.parse(jsonCompletion.choices[0].message.content);
     } catch (e) {
       logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
       throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");

@@ -155,14 +156,21 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
   }

   if (options.schema && options.schema.type === "array") {
-    document.extract = document.extract?.items;
+    extract = extract?.items;
   }
-  return document;
+  return { extract, warning };
 }

 export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
   if (meta.options.formats.includes("extract")) {
-    document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
+    const { extract, warning } = await generateOpenAICompletions(
+      meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
+      meta.options.extract!,
+      document.markdown,
+      document.warning,
+    );
+    document.extract = extract;
+    document.warning = warning;
   }

   return document;
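Callers now pass the markdown and any previous warning directly instead of a whole Document, and read the result from the returned object, as performLLMExtract does above. A minimal standalone sketch (the logger and option values are illustrative; the option fields mirror the extractController call):

// Sketch only: direct use of the reworked helper under assumed inputs.
const { extract, warning } = await generateOpenAICompletions(
  logger.child({ method: "example/generateOpenAICompletions" }),
  {
    mode: "llm",
    systemPrompt: "Only use the provided content to answer the question.",
    prompt: "Who are the founders of the company?",
    schema: { type: "object", properties: { founders: { type: "array", items: { type: "string" } } } },
  },
  "# Example page\nSome markdown content...",
  undefined, // no previous warning to chain
);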
@@ -37,7 +37,6 @@ export async function fireEngineMap(
     );
     return [];
   }
-  console.log("process.env.FIRE_ENGINE_BETA_URL", process.env.FIRE_ENGINE_BETA_URL);

   const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
     method: "POST",