From a8dc75f762faca41ba73c113830ec6ebc4b78448 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Mon, 11 Nov 2024 21:36:22 +0100
Subject: [PATCH] feat(crawl): add parameter to treat differing query
 parameters as different URLs (#892)

* add parameter to crawleroptions

* add code to make it work
---
 apps/api/src/controllers/v0/crawl.ts        |  1 +
 apps/api/src/controllers/v1/batch-scrape.ts |  1 +
 apps/api/src/controllers/v1/crawl.ts        |  1 +
 apps/api/src/controllers/v1/types.ts        |  3 +++
 apps/api/src/lib/crawl-redis.ts             | 26 +++++++++++---------------
 apps/api/src/services/queue-worker.ts       |  2 +-
 apps/js-sdk/firecrawl/src/index.ts          |  1 +
 7 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts
index d502d142..cb7a3ccc 100644
--- a/apps/api/src/controllers/v0/crawl.ts
+++ b/apps/api/src/controllers/v0/crawl.ts
@@ -195,6 +195,7 @@ export async function crawlController(req: Request, res: Response) {

     await lockURLs(
       id,
+      sc,
       jobs.map((x) => x.data.url)
     );
     await addCrawlJobs(
diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts
index 9c6a288c..b018dc99 100644
--- a/apps/api/src/controllers/v1/batch-scrape.ts
+++ b/apps/api/src/controllers/v1/batch-scrape.ts
@@ -76,6 +76,7 @@ export async function batchScrapeController(

     await lockURLs(
       id,
+      sc,
       jobs.map((x) => x.data.url)
     );
     await addCrawlJobs(
diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts
index f9f60e71..aaf33f29 100644
--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@@ -133,6 +133,7 @@ export async function crawlController(

     await lockURLs(
       id,
+      sc,
       jobs.map((x) => x.data.url)
     );
     await addCrawlJobs(
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 28dbb48f..ec78509a 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -205,6 +205,7 @@ const crawlerOptions = z.object({
   allowExternalLinks: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
   deduplicateSimilarURLs: z.boolean().default(true),
+  ignoreQueryParameters: z.boolean().default(false),
 }).strict(strictMessage);

 // export type CrawlerOptions = {
@@ -460,6 +461,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     allowExternalContentLinks: x.allowExternalLinks,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
+    ignoreQueryParameters: x.ignoreQueryParameters,
   };
 }

@@ -474,6 +476,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
       allowExternalLinks: x.allowExternalContentLinks,
       ignoreSitemap: x.ignoreSitemap,
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
+      ignoreQueryParameters: x.ignoreQueryParameters,
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 34b164d2..ddc542f9 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -90,9 +90,11 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
   return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
 }

-export function normalizeURL(url: string): string {
+export function normalizeURL(url: string, sc: StoredCrawl): string {
   const urlO = new URL(url);
-  urlO.search = "";
+  if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {
+    urlO.search = "";
+  }
   urlO.hash = "";
   return urlO.href;
 }
@@ -130,12 +132,15 @@ export function generateURLPermutations(url: string | URL): URL[] {

 export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   if (typeof sc.crawlerOptions?.limit === "number") {
-    if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
+    if (await redisConnection.scard("crawl:" + id + ":visited_unique") >= sc.crawlerOptions.limit) {
       return false;
     }
   }

-  url = normalizeURL(url);
+  url = normalizeURL(url, sc);
+
+  await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
+  await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");

   let res: boolean;
   if (!sc.crawlerOptions.deduplicateSimilarURLs) {
@@ -150,18 +155,9 @@
 }

 /// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
-export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
+export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
   urls = urls.map(url => {
-    try {
-      const urlO = new URL(url);
-      urlO.search = "";
-      urlO.hash = "";
-      return urlO.href;
-    } catch (error) {
-      logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
-    }
-
-    return url;
+    return normalizeURL(url, sc);
   });

   const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index e29187c2..21dd472e 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -320,7 +320,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     if (job.data.crawl_id) {
       const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;

-      if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url) !== normalizeURL(doc.metadata.sourceURL)) {
+      if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) {
         logger.debug("Was redirected, locking new URL...");
         await lockURL(job.data.crawl_id, sc, doc.metadata.url);
       }
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 0bd99457..401b1c20 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -155,6 +155,7 @@ export interface CrawlParams {
   scrapeOptions?: CrawlScrapeOptions;
   webhook?: string;
   deduplicateSimilarURLs?: boolean;
+  ignoreQueryParameters?: boolean;
 }

 /**