From 65d89afba9081b526fb1ee03a4540f6284fe4be4 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 13 May 2024 13:01:43 -0700
Subject: [PATCH] Nick:

---
 .../src/__tests__/e2e_withAuth/index.test.ts | 10 ++++++++
 apps/api/src/controllers/scrape.ts           | 25 ++++++++++++++-----
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 5e3777b3..0e2caeb7 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -176,6 +176,16 @@ describe("E2E Tests for API Routes", () => {
     //   expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
     // });
 
+    it("should return a timeout error when scraping takes longer than the specified timeout", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev", timeout: 1000 });
+
+      expect(response.statusCode).toBe(408);
+    }, 3000);
+
     it("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
         .post("/v0/crawlWebsitePreview")

diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 021a9d05..449a50f5 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -15,6 +15,7 @@ export async function scrapeHelper(
   crawlerOptions: any,
   pageOptions: PageOptions,
   extractorOptions: ExtractorOptions,
+  timeout: number
 ): Promise<{
   success: boolean;
   error?: string;
@@ -30,7 +31,6 @@ export async function scrapeHelper(
     return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
   }
 
-
   const a = new WebScraperDataProvider();
   await a.setOptions({
     mode: "single_urls",
@@ -42,7 +42,19 @@ export async function scrapeHelper(
     extractorOptions: extractorOptions,
   });
 
-  const docs = await a.getDocuments(false);
+  const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
+    setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
+  );
+
+  const docsPromise = a.getDocuments(false);
+
+  let docs;
+  try {
+    docs = await Promise.race([docsPromise, timeoutPromise]);
+  } catch (error) {
+    return error;
+  }
+
   // make sure doc.content is not empty
   const filteredDocs = docs.filter(
     (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
@@ -51,12 +63,11 @@ export async function scrapeHelper(
   if (filteredDocs.length === 0) {
     return { success: true, error: "No page found", returnCode: 200 };
   }
 
-
-  let creditsToBeBilled = filteredDocs.length;
+  let creditsToBeBilled = filteredDocs.length;
   const creditsPerLLMExtract = 5;
-  if (extractorOptions.mode === "llm-extraction"){
-    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length)
+  if (extractorOptions.mode === "llm-extraction") {
+    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
   }
 
   const billingResult = await billTeam(
@@ -96,6 +107,7 @@ export async function scrapeController(req: Request, res: Response) {
     mode: "markdown"
   }
   const origin = req.body.origin ?? "api";
+  const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
 
   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -114,6 +126,7 @@ export async function scrapeController(req: Request, res: Response) {
       crawlerOptions,
       pageOptions,
       extractorOptions,
+      timeout
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
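
A minimal usage sketch for the new `timeout` field, not part of the patch itself. It assumes a client-side fetch against the hosted API; the base URL and API_KEY environment variable are illustrative placeholders, not values defined by this change:

    // Hypothetical client call: ask /v0/scrape to give up after 5 seconds.
    // The patch makes the server respond with 408 when the scrape exceeds
    // the requested budget (default 30000 ms when `timeout` is omitted).
    const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${process.env.API_KEY}`, // placeholder credential
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ url: "https://firecrawl.dev", timeout: 5000 }),
    });
    if (res.status === 408) {
      // Timed out: retry with a larger `timeout`, or treat as a soft failure.
    }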