From f2ecf0cc36467f97ab0a65dd2987d3f90210ae67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 19:46:00 +0100 Subject: [PATCH 01/12] fix(v0): crawl timeout errors --- apps/api/src/controllers/v0/crawl-status.ts | 2 +- apps/api/src/controllers/v0/crawl.ts | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index 7b6e610a..9c799eeb 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -75,7 +75,7 @@ export async function crawlStatusController(req: Request, res: Response) { const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active"; - const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit").map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); + const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null).map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); if ( jobs.length > 0 && diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index cb7a3ccc..fa7627da 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -138,6 +138,8 @@ export async function crawlController(req: Request, res: Response) { const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined); + delete (scrapeOptions as any).timeout; + const sc: StoredCrawl = { originUrl: url, crawlerOptions, From 3a5eee6e3fac7ec4057b439097258da318b5aab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 19:58:07 +0100 Subject: [PATCH 02/12] feat: improve requests.http using format features --- apps/api/requests.http | 85 +++++++++++++----------------------------- 1 file changed, 25 insertions(+), 60 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 809bae7b..eb3c0962 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,5 +1,8 @@ -### Crawl Website -POST http://localhost:3002/v0/scrape HTTP/1.1 +@baseUrl = https://api.firecrawl.dev + +### Scrape Website +# @name scrape +POST {{baseUrl}}/v1/scrape HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json @@ -7,17 +10,9 @@ content-type: application/json "url":"firecrawl.dev" } -### Check Job Status -GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} - - -### Check Job Status -GET http://localhost:3002/v0/jobs/active HTTP/1.1 - - -### Scrape Website -POST http://localhost:3002/v0/crawl HTTP/1.1 +### Crawl Website +# @name crawl +POST {{baseUrl}}/v1/crawl HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json @@ -25,58 +20,28 @@ content-type: application/json "url": "firecrawl.dev" } -## "reoveTags": [], - # "mode": "crawl", - # "crawlerOptions": { - # "allowBackwardCrawling": false - # }, - # "pageOptions": { - # "onlyMainContent": false, - # "includeHtml": false, - # "parsePDF": true - # } +### Check Crawl Status +@crawlId = {{crawl.response.body.$.id}} +# @name crawlStatus +GET {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} - - - - - - -### Scrape Website -POST http://localhost:3002/v0/scrape HTTP/1.1 +### Batch Scrape Websites +# @name batchScrape +POST {{baseUrl}}/v1/batch/scrape HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url":"https://mendable.ai" + "urls": [ + "firecrawl.dev", + "mendable.ai" + ] } - - -### Check Job Status -GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} - -### Get Job Result - -POST https://api.firecrawl.dev/v0/crawl HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} -content-type: application/json - -{ - "url":"https://mendable.ai" -} - -### Check Job Status -GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66 -Authorization: Bearer {{$dotenv TEST_API_KEY}} - -### Get Active Jobs Count -GET http://localhost:3002/serverHealthCheck -content-type: application/json - -### Notify Server Health Check -GET http://localhost:3002/serverHealthCheck/notify -content-type: application/json - +### Check Batch Scrape Status +@batchScrapeId = {{batchScrape.response.body.$.id}} +# @name batchScrapeStatus +GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} \ No newline at end of file From 687ea69621aca0cde920e8982101039eead0e4c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 19:59:09 +0100 Subject: [PATCH 03/12] fix(requests.http): default to localhost baseUrl --- apps/api/requests.http | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index eb3c0962..4ce40b2c 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,4 +1,6 @@ -@baseUrl = https://api.firecrawl.dev +# Pick your baseUrl here: +@baseUrl = http://localhost:3002 +# @baseUrl = https://api.firecrawl.dev ### Scrape Website # @name scrape From 9ace2ad071e902850d23bf388034a5d0f7bf5024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 20:55:14 +0100 Subject: [PATCH 04/12] fix(scrapeURL/pdf): fix llamaparse upload --- apps/api/package.json | 1 - apps/api/pnpm-lock.yaml | 11 +++----- .../scraper/scrapeURL/engines/pdf/index.ts | 25 +++++++++++++++---- apps/api/src/scraper/scrapeURL/lib/fetch.ts | 5 ++-- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index bb4ea268..aebd90a5 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -81,7 +81,6 @@ "escape-html": "^1.0.3", "express-rate-limit": "^7.3.1", "express-ws": "^5.0.2", - "form-data": "^4.0.0", "glob": "^10.4.2", "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.4.1", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 3350c74e..f98055fb 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -101,9 +101,6 @@ importers: express-ws: specifier: ^5.0.2 version: 5.0.2(express@4.19.2) - form-data: - specifier: ^4.0.0 - version: 4.0.0 glob: specifier: ^10.4.2 version: 10.4.2 @@ -3932,8 +3929,8 @@ packages: engines: {node: '>=14.17'} hasBin: true - typescript@5.6.2: - resolution: {integrity: sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==} + typescript@5.6.3: + resolution: {integrity: sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==} engines: {node: '>=14.17'} hasBin: true @@ -7742,7 +7739,7 @@ snapshots: csv-parse: 5.5.6 gpt3-tokenizer: 1.1.5 openai: 3.3.0 - typescript: 5.6.2 + typescript: 5.6.3 uuid: 9.0.1 zod: 3.23.8 transitivePeerDependencies: @@ -8320,7 +8317,7 @@ snapshots: typescript@5.4.5: {} - typescript@5.6.2: {} + typescript@5.6.3: {} typesense@1.8.2(@babel/runtime@7.24.6): dependencies: diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index ea44b051..8b42ee71 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -1,5 +1,4 @@ import { createReadStream, promises as fs } from "node:fs"; -import FormData from "form-data"; import { Meta } from "../.."; import { EngineScrapeResult } from ".."; import * as marked from "marked"; @@ -16,10 +15,26 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath }); const uploadForm = new FormData(); - uploadForm.append("file", createReadStream(tempFilePath), { - filename: tempFilePath, - contentType: "application/pdf", // NOTE: request.headers["Content-Type"]? - }); + + // This is utterly stupid but it works! - mogery + uploadForm.append("file", { + [Symbol.toStringTag]: "Blob", + name: tempFilePath, + stream() { + return createReadStream(tempFilePath) as unknown as ReadableStream + }, + arrayBuffer() { + throw Error("Unimplemented in mock Blob: arrayBuffer") + }, + size: (await fs.stat(tempFilePath)).size, + text() { + throw Error("Unimplemented in mock Blob: text") + }, + slice(start, end, contentType) { + throw Error("Unimplemented in mock Blob: slice") + }, + type: "application/pdf", + } as Blob); const upload = await robustFetch({ url: "https://api.cloud.llamaindex.ai/api/parsing/upload", diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index 03bbd80c..738e240e 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -2,7 +2,6 @@ import { Logger } from "winston"; import { z, ZodError } from "zod"; import { v4 as uuid } from "uuid"; import * as Sentry from "@sentry/node"; -import FormData from "form-data"; export type RobustFetchParams> = { url: string; @@ -38,14 +37,14 @@ export async function robustFetch, Output = z.infer method, headers: { ...(body instanceof FormData - ? body.getHeaders() + ? ({}) : body !== undefined ? ({ "Content-Type": "application/json", }) : {}), ...(headers !== undefined ? headers : {}), }, ...(body instanceof FormData ? ({ - body: body.getBuffer(), + body, }) : body !== undefined ? ({ body: JSON.stringify(body), }) : {}), From 7081beff1f96a55af1f81adc603ce00a50e60772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 22:26:36 +0100 Subject: [PATCH 05/12] fix(scrapeURL/pdf): retry --- apps/api/src/scraper/scrapeURL/engines/pdf/index.ts | 2 ++ apps/api/src/scraper/scrapeURL/lib/fetch.ts | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 8b42ee71..d0591b57 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -62,6 +62,8 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis schema: z.object({ markdown: z.string(), }), + tryCount: 16, + tryCooldown: 250, }); return { diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index 738e240e..09a280b8 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -15,6 +15,7 @@ export type RobustFetchParams> = { ignoreFailure?: boolean; requestId?: string; tryCount?: number; + tryCooldown?: number; }; export async function robustFetch, Output = z.infer>({ @@ -28,8 +29,9 @@ export async function robustFetch, Output = z.infer ignoreFailure = false, requestId = uuid(), tryCount = 1, + tryCooldown, }: RobustFetchParams): Promise { - const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount }; + const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount, tryCooldown }; let request: Response; try { @@ -86,6 +88,9 @@ export async function robustFetch, Output = z.infer if (request.status >= 300) { if (tryCount > 1) { logger.debug("Request sent failure status, trying " + (tryCount - 1) + " more times", { params, request, response, requestId }); + if (tryCooldown !== undefined) { + await new Promise((resolve) => setTimeout(() => resolve(null), tryCooldown)); + } return await robustFetch({ ...params, requestId, From 16e850288cd6dea1e69c769d613271306ed241c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 22:46:58 +0100 Subject: [PATCH 06/12] fix(scrapeURL/pdf,docx): ignore SSL when downloading PDF --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 9 +++++++++ .../api/src/scraper/scrapeURL/engines/docx/index.ts | 2 +- apps/api/src/scraper/scrapeURL/engines/pdf/index.ts | 2 +- .../scraper/scrapeURL/engines/utils/downloadFile.ts | 13 +++++++++++-- 5 files changed, 23 insertions(+), 4 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index aebd90a5..0da99459 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -113,6 +113,7 @@ "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", "typesense": "^1.5.4", + "undici": "^6.20.1", "unstructured-client": "^0.11.3", "uuid": "^10.0.0", "winston": "^3.14.2", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index f98055fb..c2a9c8a3 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -197,6 +197,9 @@ importers: typesense: specifier: ^1.5.4 version: 1.8.2(@babel/runtime@7.24.6) + undici: + specifier: ^6.20.1 + version: 6.20.1 unstructured-client: specifier: ^0.11.3 version: 0.11.3(zod@3.23.8) @@ -3957,6 +3960,10 @@ packages: undici-types@5.26.5: resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} + undici@6.20.1: + resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==} + engines: {node: '>=18.17'} + union@0.5.0: resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==} engines: {node: '>= 0.8.0'} @@ -8341,6 +8348,8 @@ snapshots: undici-types@5.26.5: {} + undici@6.20.1: {} + union@0.5.0: dependencies: qs: 6.12.2 diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts index f8196ccd..9881fae7 100644 --- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts @@ -1,6 +1,6 @@ import { Meta } from "../.."; import { EngineScrapeResult } from ".."; -import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; +import { downloadFile } from "../utils/downloadFile"; import mammoth from "mammoth"; export async function scrapeDOCX(meta: Meta): Promise { diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index d0591b57..bdc916e0 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -62,7 +62,7 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis schema: z.object({ markdown: z.string(), }), - tryCount: 16, + tryCount: 32, tryCooldown: 250, }); diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts index 8db8892b..736faba7 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts @@ -4,6 +4,7 @@ import { createWriteStream, promises as fs } from "node:fs"; import { EngineError } from "../../error"; import { Writable } from "stream"; import { v4 as uuid } from "uuid"; +import * as undici from "undici"; export async function fetchFileToBuffer(url: string): Promise<{ response: Response, @@ -17,13 +18,21 @@ export async function fetchFileToBuffer(url: string): Promise<{ } export async function downloadFile(id: string, url: string): Promise<{ - response: Response + response: undici.Response tempFilePath: string }> { const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`); const tempFileWrite = createWriteStream(tempFilePath); - const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying + // TODO: maybe we could use tlsclient for this? for proxying + // use undici to ignore SSL for now + const response = await undici.fetch(url, { + dispatcher: new undici.Agent({ + connect: { + rejectUnauthorized: false, + }, + }) + }); // This should never happen in the current state of JS (2024), but let's check anyways. if (response.body === null) { From 93ac20f930e7c4ffd1cba11f1cc315fc51f0f725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 22:53:29 +0100 Subject: [PATCH 07/12] fix(queue-worker): do not kill crawl on one-page error --- apps/api/src/services/queue-worker.ts | 36 +++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 5a0b28db..3ea976d6 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -568,24 +568,24 @@ async function processJob(job: Job & { id: string }, token: string) { crawl_id: job.data.crawl_id, }); - await logJob({ - job_id: job.data.crawl_id, - success: false, - message: - typeof error === "string" - ? error - : error.message ?? - "Something went wrong... Contact help@mendable.ai", - num_docs: 0, - docs: [], - time_taken: 0, - team_id: job.data.team_id, - mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", - url: sc ? sc.originUrl ?? job.data.url : job.data.url, - crawlerOptions: sc ? sc.crawlerOptions : undefined, - scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions, - origin: job.data.origin, - }); + // await logJob({ + // job_id: job.data.crawl_id, + // success: false, + // message: + // typeof error === "string" + // ? error + // : error.message ?? + // "Something went wrong... Contact help@mendable.ai", + // num_docs: 0, + // docs: [], + // time_taken: 0, + // team_id: job.data.team_id, + // mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", + // url: sc ? sc.originUrl ?? job.data.url : job.data.url, + // crawlerOptions: sc ? sc.crawlerOptions : undefined, + // scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions, + // origin: job.data.origin, + // }); } // done(null, data); return data; From 5ce4aaf0ec534d9b359a1a6e8e7c7229742212cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 23:35:07 +0100 Subject: [PATCH 08/12] fix(crawl): initialURL setting is unnecessary --- apps/api/src/lib/crawl-redis.ts | 4 ++-- apps/api/src/services/queue-worker.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index bd79a86d..b5936ad6 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -166,10 +166,10 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro return res; } -export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler { +export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler { const crawler = new WebCrawler({ jobId: id, - initialUrl: initialUrl ?? sc.originUrl!, + initialUrl: sc.originUrl!, includes: sc.crawlerOptions?.includes ?? [], excludes: sc.crawlerOptions?.excludes ?? [], maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 3ea976d6..831dec6b 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) { if (!job.data.sitemapped && job.data.crawlerOptions !== null) { if (!sc.cancelled) { - const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined); + const crawler = crawlToCrawler(job.data.crawl_id, sc); const links = crawler.filterLinks( crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string), From 1a636b4e59e3c12a9d360fa723ab775b906a6758 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 12 Nov 2024 20:09:01 -0500 Subject: [PATCH 09/12] Update email_notification.ts --- apps/api/src/services/notification/email_notification.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index 5aa95b30..e451e0c0 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -6,6 +6,7 @@ import { logger } from "../../../src/lib/logger"; import { sendSlackWebhook } from "../alerts/slack"; import { getNotificationString } from "./notification_string"; import { AuthCreditUsageChunk } from "../../controllers/v1/types"; +import { redlock } from "../redlock"; const emailTemplates: Record< NotificationType, @@ -88,6 +89,7 @@ export async function sendNotificationInternal( if (team_id === "preview") { return { success: true }; } + return await redlock.using([`notification-lock:${team_id}:${notificationType}`], 5000, async () => { if (!bypassRecentChecks) { const fifteenDaysAgo = new Date(); @@ -171,5 +173,6 @@ export async function sendNotificationInternal( return { success: false }; } - return { success: true }; + return { success: true }; + }); } From 32be2cf786294b8722f85eb45ba3202236fb885d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 13 Nov 2024 19:36:44 +0100 Subject: [PATCH 10/12] feat(v1/webhook): complex webhook object w/ headers (#899) * feat(v1/webhook): complex webhook object w/ headers * feat(js-sdk/crawl): add complex webhook support --- apps/api/src/controllers/v1/types.ts | 13 ++++++++++++- apps/api/src/services/webhook.ts | 14 +++++++++----- apps/api/src/types.ts | 5 +++-- apps/js-sdk/firecrawl/src/index.ts | 5 ++++- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index ec78509a..b2edd6e7 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -220,11 +220,22 @@ const crawlerOptions = z.object({ export type CrawlerOptions = z.infer; +export const webhookSchema = z.preprocess(x => { + if (typeof x === "string") { + return { url: x }; + } else { + return x; + } +}, z.object({ + url: z.string().url(), + headers: z.record(z.string(), z.string()).default({}), +}).strict(strictMessage)) + export const crawlRequestSchema = crawlerOptions.extend({ url, origin: z.string().optional().default("api"), scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}), - webhook: z.string().url().optional(), + webhook: webhookSchema.optional(), limit: z.number().default(10000), }).strict(strictMessage); diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 620b6832..1cc4db84 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,15 +1,17 @@ import axios from "axios"; -import { logger } from "../../src/lib/logger"; +import { logger } from "../lib/logger"; import { supabase_service } from "./supabase"; import { WebhookEventType } from "../types"; import { configDotenv } from "dotenv"; +import { z } from "zod"; +import { webhookSchema } from "../controllers/v1/types"; configDotenv(); export const callWebhook = async ( teamId: string, id: string, data: any | null, - specified?: string, + specified?: z.infer, v1 = false, eventType: WebhookEventType = "crawl.page", awaitWebhook: boolean = false @@ -20,7 +22,7 @@ export const callWebhook = async ( id ); const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; - let webhookUrl = specified ?? selfHostedUrl; + let webhookUrl = specified ?? (selfHostedUrl ? webhookSchema.parse({ url: selfHostedUrl }) : undefined); // Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set // and the USE_DB_AUTHENTICATION environment variable is set to true @@ -73,7 +75,7 @@ export const callWebhook = async ( if (awaitWebhook) { try { await axios.post( - webhookUrl, + webhookUrl.url, { success: !v1 ? data.success @@ -92,6 +94,7 @@ export const callWebhook = async ( { headers: { "Content-Type": "application/json", + ...webhookUrl.headers, }, timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) } @@ -104,7 +107,7 @@ export const callWebhook = async ( } else { axios .post( - webhookUrl, + webhookUrl.url, { success: !v1 ? data.success @@ -123,6 +126,7 @@ export const callWebhook = async ( { headers: { "Content-Type": "application/json", + ...webhookUrl.headers, }, timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 2da97bd1..d7821407 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -1,4 +1,5 @@ -import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document } from "./controllers/v1/types"; +import { z } from "zod"; +import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document, webhookSchema } from "./controllers/v1/types"; import { ExtractorOptions, Document } from "./lib/entities"; import { InternalOptions } from "./scraper/scrapeURL"; @@ -33,7 +34,7 @@ export interface WebScraperOptions { origin?: string; crawl_id?: string; sitemapped?: boolean; - webhook?: string; + webhook?: z.infer; v1?: boolean; is_scrape?: boolean; } diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 401b1c20..45e19197 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -153,7 +153,10 @@ export interface CrawlParams { allowExternalLinks?: boolean; ignoreSitemap?: boolean; scrapeOptions?: CrawlScrapeOptions; - webhook?: string; + webhook?: string | { + url: string; + headers?: Record; + }; deduplicateSimilarURLs?: boolean; ignoreQueryParameters?: boolean; } From 0d1c4e4e09fd5d33b657f86256bc1d450b04a48d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 13 Nov 2024 13:54:22 -0500 Subject: [PATCH 11/12] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index f7891b9e..5d0a7fc9 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.8.1", + "version": "1.8.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 0310cd2afae449a84c8f1814dcad9bf08e2cede4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 13 Nov 2024 21:38:44 +0100 Subject: [PATCH 12/12] fix(crawl): redirect rebase --- apps/api/src/lib/crawl-redis.ts | 3 ++- apps/api/src/scraper/WebScraper/crawler.ts | 4 +++- apps/api/src/services/queue-worker.ts | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index b5936ad6..2b255971 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -166,10 +166,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro return res; } -export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler { +export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler { const crawler = new WebCrawler({ jobId: id, initialUrl: sc.originUrl!, + baseUrl: newBase ? new URL(newBase).origin : undefined, includes: sc.crawlerOptions?.includes ?? [], excludes: sc.crawlerOptions?.excludes ?? [], maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index e5a25f37..7b4a97d9 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -27,6 +27,7 @@ export class WebCrawler { constructor({ jobId, initialUrl, + baseUrl, includes, excludes, maxCrawledLinks = 10000, @@ -38,6 +39,7 @@ export class WebCrawler { }: { jobId: string; initialUrl: string; + baseUrl?: string; includes?: string[]; excludes?: string[]; maxCrawledLinks?: number; @@ -49,7 +51,7 @@ export class WebCrawler { }) { this.jobId = jobId; this.initialUrl = initialUrl; - this.baseUrl = new URL(initialUrl).origin; + this.baseUrl = baseUrl ?? new URL(initialUrl).origin; this.includes = Array.isArray(includes) ? includes : []; this.excludes = Array.isArray(excludes) ? excludes : []; this.limit = limit; diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 831dec6b..33b2ca9a 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) { if (!job.data.sitemapped && job.data.crawlerOptions !== null) { if (!sc.cancelled) { - const crawler = crawlToCrawler(job.data.crawl_id, sc); + const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl); const links = crawler.filterLinks( crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),