From 65d89afba9081b526fb1ee03a4540f6284fe4be4 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 13 May 2024 13:01:43 -0700
Subject: [PATCH] Nick:

---
 .../src/__tests__/e2e_withAuth/index.test.ts | 10 ++++++++
 apps/api/src/controllers/scrape.ts           | 25 ++++++++++++++-----
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 5e3777b3..0e2caeb7 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -176,6 +176,16 @@ describe("E2E Tests for API Routes", () => {
     //   expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
     // });
 
+    it("should return a timeout error when scraping takes longer than the specified timeout", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev", timeout: 1000 });
+
+      expect(response.statusCode).toBe(408);
+    }, 3000);
+
     it("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
         .post("/v0/crawlWebsitePreview")

diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 021a9d05..449a50f5 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -15,6 +15,7 @@ export async function scrapeHelper(
   crawlerOptions: any,
   pageOptions: PageOptions,
   extractorOptions: ExtractorOptions,
+  timeout: number
 ): Promise<{
   success: boolean;
   error?: string;
@@ -30,7 +31,6 @@ export async function scrapeHelper(
     return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
   }
 
-
   const a = new WebScraperDataProvider();
   await a.setOptions({
     mode: "single_urls",
@@ -42,7 +42,19 @@ export async function scrapeHelper(
     extractorOptions: extractorOptions,
   });
 
-  const docs = await a.getDocuments(false);
+  const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
+    setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
+  );
+
+  const docsPromise = a.getDocuments(false);
+
+  let docs;
+  try {
+    docs = await Promise.race([docsPromise, timeoutPromise]);
+  } catch (error) {
+    return error;
+  }
+
   // make sure doc.content is not empty
   const filteredDocs = docs.filter(
     (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
@@ -51,12 +63,11 @@ export async function scrapeHelper(
   if (filteredDocs.length === 0) {
     return { success: true, error: "No page found", returnCode: 200 };
   }
 
-
-  let creditsToBeBilled = filteredDocs.length;
+  let creditsToBeBilled = filteredDocs.length;
   const creditsPerLLMExtract = 5;
-  if (extractorOptions.mode === "llm-extraction"){
-    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length)
+  if (extractorOptions.mode === "llm-extraction") {
+    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
   }
 
   const billingResult = await billTeam(
@@ -96,6 +107,7 @@ export async function scrapeController(req: Request, res: Response) {
     mode: "markdown"
   }
   const origin = req.body.origin ?? "api";
+  const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
 
   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -114,6 +126,7 @@ export async function scrapeController(req: Request, res: Response) {
       crawlerOptions,
       pageOptions,
       extractorOptions,
+      timeout
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
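
A minimal usage sketch for the new `timeout` field, not part of the patch itself. It assumes a client-side fetch against the hosted API; the base URL and API_KEY environment variable are illustrative placeholders, not values defined by this change:

    // Hypothetical client call: ask /v0/scrape to give up after 5 seconds.
    // The patch makes the server respond with 408 when the scrape exceeds
    // the requested budget (default 30000 ms when `timeout` is omitted).
    const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${process.env.API_KEY}`, // placeholder credential
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ url: "https://firecrawl.dev", timeout: 5000 }),
    });
    if (res.status === 408) {
      // Timed out: retry with a larger `timeout`, or treat as a soft failure.
    }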