mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)

Commit: 904c904971 ("wip")
Parent: 25f32000db
151 apps/api/src/__tests__/e2e_extract/index.test.ts Normal file (new)
@@ -0,0 +1,151 @@
import request from "supertest";
import dotenv from "dotenv";
import {
  FirecrawlCrawlResponse,
  FirecrawlCrawlStatusResponse,
  FirecrawlScrapeResponse,
} from "../../types";

dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for Extract API Routes", () => {
  describe("POST /v1/extract", () => {
    it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev"],
          prompt: "Who are the authors of the blog posts?",
          schema: {
            type: "object",
            properties: { authors: { type: "array", items: { type: "string" } } },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("founders");

      let gotItRight = 0;
      for (const author of response.body.data?.authors) {
        if (author.includes("Caleb Peffer")) gotItRight++;
        if (author.includes("Gergő Móricz")) gotItRight++;
        if (author.includes("Eric Ciarla")) gotItRight++;
        if (author.includes("Nicolas Camara")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(3);
    }, 60000);

    it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["mendable.ai"],
          prompt: "Who are the founders of the company?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: { founders: { type: "array", items: { type: "string" } } },
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("founders");

      let gotItRight = 0;
      for (const founder of response.body.data?.founders) {
        if (founder.includes("Caleb")) gotItRight++;
        if (founder.includes("Eric")) gotItRight++;
        if (founder.includes("Nicolas")) gotItRight++;
      }

      expect(gotItRight).toBe(3);
    }, 60000);

    it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["https://firecrawl.dev"],
          prompt: "What are they hiring for?",
          allowExternalLinks: true,
          schema: {
            type: "array",
            items: {
              type: "string"
            }
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      console.log(response.body.data);

      let gotItRight = 0;
      for (const hiring of response.body.data?.items) {
        if (hiring.includes("Developer Relations Specialist")) gotItRight++;
        if (hiring.includes("Web Automation Engineer")) gotItRight++;
        if (hiring.includes("Developer Experience Engineer")) gotItRight++;
        if (hiring.includes("Developer Support Engineer")) gotItRight++;
        if (hiring.includes("Dev Ops Engineer")) gotItRight++;
        if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
      }

      expect(gotItRight).toBeGreaterThan(5);
    }, 60000);

    it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com"],
          prompt: "Does Fivetran have PCI DSS compliance?",
          allowExternalLinks: true,
          schema: {
            type: "object",
            properties: {
              pciDssCompliance: { type: "boolean" }
            }
          },
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      expect(response.body.data?.pciDssCompliance).toBe(true);
    }, 60000);

    it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
      const response = await request(TEST_URL)
        .post("/v1/extract")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          urls: ["fivetran.com"],
          prompt: "What are the Azure Data Connectors they offer?",
          schema: {
            type: "array",
            items: {
              type: "object",
              properties: {
                connector: { type: "string" },
                description: { type: "string" },
                supportsCaptureDelete: { type: "boolean" }
              }
            }
          }
        })

      console.log(response.body);
      // expect(response.statusCode).toBe(200);
      // expect(response.body).toHaveProperty("data");
      // expect(response.body.data?.pciDssCompliance).toBe(true);
    }, 60000);
  });
});
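
The assertions above rely on the extract endpoint's success payload. As a rough sketch (based only on the tests above and the controller's 200 response later in this commit; the interface name and the exact metadata fields are assumptions, not the project's declared types), the body looks like this:

// Sketch of a successful POST /v1/extract response body as assumed by these tests.
// data carries the properties named in the request schema (authors, founders, pciDssCompliance, ...);
// scrape_id mirrors the controller's scrape_id: id further down in this commit.
interface ExtractSuccessBody {
  success: true;
  data: Record<string, unknown>; // e.g. { authors: string[] }
  scrape_id?: string;
}

const exampleBody: ExtractSuccessBody = {
  success: true,
  data: { authors: ["Caleb Peffer", "Gergő Móricz", "Eric Ciarla", "Nicolas Camara"] },
  scrape_id: "<uuid generated per request>",
};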

@@ -26,22 +26,24 @@ import { waitForJob } from "../../services/queue-jobs";
import { addScrapeJob } from "../../services/queue-jobs";
import { PlanType } from "../../types";
import { getJobPriority } from "../../lib/job-priority";
import { generateFinalExtraction } from "../../lib/extract/completions";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";

configDotenv();
const redis = new Redis(process.env.REDIS_URL!);

const MAX_EXTRACT_LIMIT = 100;
const MAX_RANKING_LIMIT = 3;
const MAX_RANKING_LIMIT = 5;
const SCORE_THRESHOLD = 0.75;

export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<any> //ExtractResponse>
  res: Response<ExtractResponse>
) {
  req.body = extractRequestSchema.parse(req.body);

  const id = crypto.randomUUID();
  let links: string[] = req.body.urls;
  let links: string[]; //= req.body.urls;

  const sc: StoredCrawl = {
    originUrl: req.body.urls[0],

@@ -59,10 +61,14 @@ export async function extractController(
  const crawler = crawlToCrawler(id, sc);

  let urlWithoutWww = req.body.urls[0].replace("www.", "");
  console.log("urlWithoutWww", urlWithoutWww);

  let mapUrl = req.body.prompt
    ? `"${req.body.prompt}" site:${urlWithoutWww}`
    : `site:${req.body.urls[0]}`;
  const allowExternalLinks = req.body.allowExternalLinks ?? false;

  let mapUrl = req.body.prompt && allowExternalLinks
    ? `${req.body.prompt} ${urlWithoutWww}`
    : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
    : `site:${urlWithoutWww}`;

  const resultsPerPage = 100;
  const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage);

@@ -84,82 +90,103 @@ export async function extractController(
    };

    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
    allResults = await Promise.all(pagePromises);
    allResults = (await Promise.all(pagePromises)).flat();
    // console.log("allResults", allResults);
    // if allResults is empty, return an error
    if (allResults.length === 0) {
      return res.status(400).json({
        success: false,
        error: "No results found",
      });
    }

    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
  }

  // console.log("allResults", allResults);
  // Parallelize sitemap fetch with serper search
  const [sitemap, ...searchResults] = await Promise.all([
    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
    ...(cachedResult ? [] : pagePromises),
  ]);
  // const [sitemap, ...searchResults] = await Promise.all([
  // req.body.ignoreSitemap ? null : null, // crawler.tryGetSitemap(),
  // ...(cachedResult ? [] : pagePromises),
  // ]);

  if (!cachedResult) {
    allResults = searchResults;
  }
  // if (!cachedResult) {
  // allResults = searchResults;
  // }

  if (sitemap !== null) {
    sitemap.forEach((x) => {
      links.push(x.url);
    });
  }
  links = allResults.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
  console.log("links", links);
  // if (sitemap !== null) {
  // sitemap.forEach((x) => {
  // links.push(x.url);
  // });
  // }

  let mapResults = allResults
    .flat()
    .filter((result) => result !== null && result !== undefined);
  // let mapResults = allResults
  // .flat()
  // .filter((result) => result !== null && result !== undefined);

  const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
  if (mapResults.length > minumumCutoff) {
    mapResults = mapResults.slice(0, minumumCutoff);
  }
  // const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
  // if (mapResults.length > minumumCutoff) {
  // mapResults = mapResults.slice(0, minumumCutoff);
  // }

  if (mapResults.length > 0) {
    if (req.body.prompt) {
      // Ensure all map results are first, maintaining their order
      links = [
        mapResults[0].url,
        ...mapResults.slice(1).map((x) => x.url),
        ...links,
      ];
    } else {
      mapResults.map((x) => {
        links.push(x.url);
      });
    }
  }
  // if (mapResults.length > 0) {
  // if (req.body.prompt) {
  // // Ensure all map results are first, maintaining their order
  // links = [
  // mapResults[0].url,
  // ...mapResults.slice(1).map((x) => x.url),
  // ...links,
  // ];
  // } else {
  // mapResults.map((x) => {
  // links.push(x.url);
  // });
  // }
  // }

  // console.log("mapResults", mapResults);

  // console.log("links", links);
  let linksAndScores: { link: string; score: number }[] = [];
  // Perform cosine similarity between the search query and the list of links
  if (req.body.prompt) {
    const searchQuery = req.body.prompt.toLowerCase();
    const searchQuery = mapUrl; //req.body.prompt.toLowerCase();
    linksAndScores = await performRanking(links, searchQuery);
  }
  console.log("linksAndScores", linksAndScores);
  links = linksAndScores
    .filter(x => x.score > SCORE_THRESHOLD)
    .map(x => x.link.split("url: ")[1].split(",")[0])
    .filter(x => !isUrlBlocked(x))

  console.log("links:", links.length);

  // should we use some sort of llm to determine the best links?

  // console.log("linksAndScores", linksAndScores);

  links = links
    .map((x) => {
      try {
        return checkAndUpdateURLForMap(x).url.trim();
      } catch (_) {
        return null;
      }
    })
    .filter((x) => x !== null) as string[];
  // links = links
  // .map((x) => {
  // try {
  // return checkAndUpdateURLForMap(x).url.trim();
  // } catch (_) {
  // return null;
  // }
  // })
  // .filter((x) => x !== null) as string[];

  // allows for subdomains to be included
  links = links.filter((x) => isSameDomain(x, req.body.urls[0]));
  // links = links.filter((x) => isSameDomain(x, req.body.urls[0]));

  // if includeSubdomains is false, filter out subdomains
  if (!req.body.includeSubdomains) {
    links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
  }
  // if (!req.body.includeSubdomains) {
  // links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
  // }

  // remove duplicates that could be due to http/https or www
  links = removeDuplicateUrls(links);
  // links = removeDuplicateUrls(links);

  // get top N links
  links = links.slice(0, MAX_RANKING_LIMIT);

@@ -170,7 +197,7 @@ export async function extractController(

  for (const url of links) {
    const origin = req.body.origin || "api";
    const timeout = req.body.timeout;
    const timeout = req.body.timeout ?? 30000;
    const jobId = crypto.randomUUID();

    const startTime = new Date().getTime();

@@ -196,7 +223,7 @@ export async function extractController(
      jobPriority
    );

    const totalWait = 60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
    const totalWait = 0 //60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);

    let doc: Document;
    try {

@@ -234,18 +261,20 @@ export async function extractController(
    docs.push(doc);
  }

  console.log(docs)

  // console.log("docs", docs);
  const completions = await generateOpenAICompletions(
    logger.child({ method: "extractController/generateOpenAICompletions" }),
    {
      mode: "llm",
      systemPrompt: "Only use the provided content to answer the question.",
      prompt: mapUrl,
      schema: req.body.schema,
    },
    docs.map(x => x.markdown).join('\n')
  );

  // {"message":"Missing required parameter: 'response_format.json_schema.schema'.","type":"invalid_request_error","param":"response_format.json_schema.schema","code":"missing_required_parameter"},"code":"missing_required_parameter","param":"response_format.json_schema.schema","type":"invalid_request_error"}
  const completions = await generateFinalExtraction({
    pagesContent: docs.map(x => x.markdown).join('\n'),
    systemPrompt: '',
    prompt: req.body.prompt,
    schema: req.body.schema,
  });

  // console.log("completions", completions);
  console.log("completions", completions);

  // if(req.body.extract && req.body.formats.includes("extract")) {
  // creditsToBeBilled = 5;

@@ -315,9 +344,18 @@ export async function extractController(
  // scrape_id: result.scrape_id
  // };

  console.log("completions.extract", completions.extract);

  let data: any;
  try {
    data = JSON.parse(completions.extract);
  } catch (e) {
    data = completions.extract;
  }

  return res.status(200).json({
    success: true,
    data: completions.content, // includeMetadata ? mapResults : linksToReturn,
    data: data, // includeMetadata ? mapResults : linksToReturn,
    scrape_id: id, //origin?.includes("website") ? id : undefined,
  });
}
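
One detail worth noting in the controller above: before ranking, each search result is serialized into a "url: ..., title: ..., description: ..." string, performRanking scores those strings against mapUrl, and the URL is then recovered by splitting the string back apart. A minimal sketch of that filter step, under the assumption that performRanking (not shown in this diff) returns link/score pairs for those serialized strings:

// Sketch of the post-ranking filter: keep candidates above the score threshold,
// then recover the raw URL from the serialized "url: ..., title: ..., description: ..." string.
function topRankedUrls(
  linksAndScores: { link: string; score: number }[],
  threshold = 0.75, // SCORE_THRESHOLD in the controller
): string[] {
  return linksAndScores
    .filter((x) => x.score > threshold)
    .map((x) => x.link.split("url: ")[1].split(",")[0]);
}

Note that the split on "," truncates any URL that itself contains a comma; the controller's inline version behaves the same way.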

@@ -161,6 +161,7 @@ export const extractV1Options = z.object({
  limit: z.number().int().positive().finite().safe().optional(),
  ignoreSitemap: z.boolean().default(false),
  includeSubdomains: z.boolean().default(true),
  allowExternalLinks: z.boolean().default(false),
  origin: z.string().optional().default("api"),
  timeout: z.number().int().positive().finite().safe().default(60000),
}).strict(strictMessage)
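
For context, a request body that the updated options would accept; this is a sketch only, with urls, prompt, and schema taken from the tests earlier in this commit and the remaining fields from the defaults above (values are illustrative):

// Illustrative /v1/extract request exercising the newly added allowExternalLinks flag.
const exampleRequest = {
  urls: ["https://firecrawl.dev"],
  prompt: "Who are the founders of the company?",
  schema: {
    type: "object",
    properties: { founders: { type: "array", items: { type: "string" } } },
  },
  allowExternalLinks: true, // new in this commit; defaults to false
  includeSubdomains: true,  // default
  ignoreSitemap: false,     // default
  timeout: 60000,           // default, in milliseconds
};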

@@ -353,7 +354,7 @@ export type ExtractResponse =
  | {
      success: true;
      warning?: string;
      data: Document;
      data: z.infer<typeof extractRequestSchema>;
      scrape_id?: string;
    };

@@ -1,121 +1,124 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { ExtractOptions } from "../../controllers/v1/types";
import { Document } from "../entities";
import { z } from "zod";
// use llmExtract.ts instead

const maxTokens = 32000;
const modifier = 4;
// import OpenAI from "openai";
// import { encoding_for_model } from "@dqbd/tiktoken";
// import { TiktokenModel } from "@dqbd/tiktoken";
// import { ExtractOptions } from "../../controllers/v1/types";
// import { Document } from "../entities";
// import { z } from "zod";

export class LLMRefusalError extends Error {
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    this.name = "LLMRefusalError";
  }
}
// const maxTokens = 32000;
// const modifier = 4;

interface GenerateCompletionsParams {
  systemPrompt?: string;
  prompt?: string;
  schema?: any;
  pagesContent: string;
}
// export class LLMRefusalError extends Error {
// constructor(refusal: string) {
// super("LLM refused to extract the website's content");
// this.name = "LLMRefusalError";
// }
// }

export async function generateBasicCompletion(prompt: string) {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// interface GenerateCompletionsParams {
// systemPrompt?: string;
// prompt?: string;
// schema?: any;
// pagesContent: string;
// }

  const completion = await openai.chat.completions.create({
    model,
    messages: [{ role: "user", content: prompt }],
  });
// export async function generateBasicCompletion(prompt: string) {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  return completion.choices[0].message.content;
}
// const completion = await openai.chat.completions.create({
// model,
// messages: [{ role: "user", content: prompt }],
// });

export async function generateFinalExtraction({
  pagesContent,
  systemPrompt,
  prompt,
  schema,
}: GenerateCompletionsParams): Promise<{
  content: string;
  metadata: { numTokens: number; warning: string };
}> {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// return completion.choices[0].message.content;
// }

  let extractionContent = pagesContent;
  let numTokens = 0;
  let warning = "";
// export async function generateFinalExtraction({
// pagesContent,
// systemPrompt,
// prompt,
// schema,
// }: GenerateCompletionsParams): Promise<{
// content: string;
// metadata: { numTokens: number; warning: string };
// }> {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  const encoder = encoding_for_model(model);
  try {
    const tokens = encoder.encode(extractionContent);
    numTokens = tokens.length;
  } catch (error) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
  } finally {
    encoder.free();
  }
// let extractionContent = pagesContent;
// let numTokens = 0;
// let warning = "";

  if (numTokens > maxTokens) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
  }
// const encoder = encoding_for_model(model);
// try {
// const tokens = encoder.encode(extractionContent);
// numTokens = tokens.length;
// } catch (error) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
// } finally {
// encoder.free();
// }

  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  } else if (schema) {
    schema.additionalProperties = false;
    schema.required = Object.keys(schema.properties);
  }
// if (numTokens > maxTokens) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
// }

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    messages: [
      { role: "system", content: systemPrompt ?? "" },
      { role: "user", content: [{ type: "text", text: extractionContent }] },
      {
        role: "user",
        content: prompt
          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
          : "Transform the above content into structured JSON output.",
      },
    ],
    response_format: schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "websiteContent",
            schema: schema,
            strict: true,
          },
        }
      : { type: "json_object" },
  });
// if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
// schema = {
// type: "object",
// properties: {
// items: schema,
// },
// required: ["items"],
// additionalProperties: false,
// };
// } else if (schema) {
// schema.additionalProperties = false;
// schema.required = Object.keys(schema.properties);
// }

  if (jsonCompletion.choices[0].message.refusal !== null) {
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }
// const jsonCompletion = await openai.beta.chat.completions.parse({
// temperature: 0,
// model,
// messages: [
// { role: "system", content: systemPrompt ?? "" },
// { role: "user", content: [{ type: "text", text: extractionContent }] },
// {
// role: "user",
// content: prompt
// ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
// : "Transform the above content into structured JSON output.",
// },
// ],
// response_format: schema
// ? {
// type: "json_schema",
// json_schema: {
// name: "websiteContent",
// schema: schema,
// strict: true,
// },
// }
// : { type: "json_object" },
// });

  const extraction = jsonCompletion.choices[0].message.parsed;
  return {
    content: extraction ?? "",
    metadata: {
      numTokens,
      warning,
    },
  };
}
// if (jsonCompletion.choices[0].message.refusal !== null) {
// throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
// }

// const extraction = jsonCompletion.choices[0].message.parsed;
// return {
// content: extraction ?? "",
// metadata: {
// numTokens,
// warning,
// },
// };
// }

@@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
  }
}

async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> {
export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, warning: string | undefined }> {
  let extract: any;
  let warning: string | undefined;

  const openai = new OpenAI();
  const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  if (document.markdown === undefined) {
  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  let extractionContent = document.markdown;

  // count number of tokens
  let numTokens = 0;
  const encoder = encoding_for_model(model as TiktokenModel);
  try {
    // Encode the message into tokens
    const tokens = encoder.encode(extractionContent);
    const tokens = encoder.encode(markdown);

    // Return the number of tokens
    numTokens = tokens.length;
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error, extractionContent });
    logger.warn("Calculating num tokens of string failed", { error, markdown });

    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    markdown = markdown.slice(0, maxTokens * modifier);

    const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
    document.warning = document.warning === undefined ? warning : " " + warning;
    let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  } finally {
    // Free the encoder resources after use
    encoder.free();

@@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt

  if (numTokens > maxTokens) {
    // trim the document to the maximum number of tokens, tokens != characters
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    markdown = markdown.slice(0, maxTokens * modifier);

    const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
    document.warning = document.warning === undefined ? warning : " " + warning;
    const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  }

  let schema = options.schema;

@@ -120,7 +121,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
      },
      {
        role: "user",
        content: [{ type: "text", text: extractionContent }],
        content: [{ type: "text", text: markdown }],
      },
      {
        role: "user",

@@ -143,11 +144,11 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }

  document.extract = jsonCompletion.choices[0].message.parsed;
  extract = jsonCompletion.choices[0].message.parsed;

  if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
  if (extract === null && jsonCompletion.choices[0].message.content !== null) {
    try {
      document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
      extract = JSON.parse(jsonCompletion.choices[0].message.content);
    } catch (e) {
      logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
      throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");

@@ -155,14 +156,21 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
  }

  if (options.schema && options.schema.type === "array") {
    document.extract = document.extract?.items;
    extract = extract?.items;
  }
  return document;
  return { extract, warning };
}

export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
  if (meta.options.formats.includes("extract")) {
    document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
    const { extract, warning } = await generateOpenAICompletions(
      meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
      meta.options.extract!,
      document.markdown,
      document.warning,
    );
    document.extract = extract;
    document.warning = warning;
  }

  return document;
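
With this refactor, callers pass the markdown and any prior warning in explicitly and get the extracted object plus an accumulated warning back, as performLLMExtract does above and as the extract controller does earlier in this commit. A sketch of the call shape (the wrapper function and its parameter names are placeholders, and the options literal mirrors the controller's rather than the full ExtractOptions type):

// Sketch of the new generateOpenAICompletions call shape: content in, { extract, warning } out.
async function runExtract(logger: Logger, markdown: string, schema: any, previousWarning?: string) {
  const { extract, warning } = await generateOpenAICompletions(
    logger.child({ method: "example/generateOpenAICompletions" }),
    { mode: "llm", systemPrompt: "Only use the provided content to answer the question.", prompt: "example", schema },
    markdown,        // the function throws if this is undefined
    previousWarning, // any earlier warning; the new warning is prepended to it
  );
  return { extract, warning };
}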

@@ -37,7 +37,6 @@ export async function fireEngineMap(
    );
    return [];
  }
  console.log("process.env.FIRE_ENGINE_BETA_URL", process.env.FIRE_ENGINE_BETA_URL);

  const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
    method: "POST",