From a0fdc6f7c6ec646f9a1627baf1afff314628b487 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 14 May 2024 12:12:40 -0700 Subject: [PATCH] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 8 +++----- apps/api/src/scraper/WebScraper/index.ts | 3 +-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3dc6dc43..521b1e1d 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -15,7 +15,7 @@ export class WebCrawler { private maxCrawledLinks: number; private maxCrawledDepth: number; private visited: Set = new Set(); - private crawledUrls: Set<{ url: string, html: string }> = new Set(); + private crawledUrls: Map = new Map(); private limit: number; private robotsTxtUrl: string; private robots: any; @@ -143,7 +143,7 @@ export class WebCrawler { return; } const newUrls = await this.crawl(task); - newUrls.forEach((page) => this.crawledUrls.add(page)); + newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html)); if (inProgress && newUrls.length > 0) { inProgress({ current: this.crawledUrls.size, @@ -175,7 +175,7 @@ export class WebCrawler { } ); await queue.drain(); - return Array.from(this.crawledUrls); + return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } async crawl(url: string): Promise<{url: string, html: string}[]> { @@ -310,5 +310,3 @@ export class WebCrawler { return []; } } - - diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1f5a785a..13f39c2b 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -176,9 +176,8 @@ export class WebScraperDataProvider { if (this.crawlerMode === "fast" && links.some((link) => link.html)) { console.log("Fast mode enabled"); documents = await this.processLinks(allLinks, inProgress, allHtmls); - }else{ - documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls); + documents = await this.processLinks(allLinks, inProgress); } return this.cacheAndFinalizeDocuments(documents, allLinks);