This commit is contained in:
Nicolas 2024-05-14 12:12:40 -07:00
parent 7f31959be7
commit a0fdc6f7c6
2 changed files with 4 additions and 7 deletions

View File

@@ -15,7 +15,7 @@ export class WebCrawler {
private maxCrawledLinks: number; private maxCrawledLinks: number;
private maxCrawledDepth: number; private maxCrawledDepth: number;
private visited: Set<string> = new Set(); private visited: Set<string> = new Set();
private crawledUrls: Set<{ url: string, html: string }> = new Set(); private crawledUrls: Map<string, string> = new Map();
private limit: number; private limit: number;
private robotsTxtUrl: string; private robotsTxtUrl: string;
private robots: any; private robots: any;
@@ -143,7 +143,7 @@ export class WebCrawler {
return; return;
} }
const newUrls = await this.crawl(task); const newUrls = await this.crawl(task);
newUrls.forEach((page) => this.crawledUrls.add(page)); newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
if (inProgress && newUrls.length > 0) { if (inProgress && newUrls.length > 0) {
inProgress({ inProgress({
current: this.crawledUrls.size, current: this.crawledUrls.size,
@@ -175,7 +175,7 @@ export class WebCrawler {
} }
); );
await queue.drain(); await queue.drain();
return Array.from(this.crawledUrls); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
} }
async crawl(url: string): Promise<{url: string, html: string}[]> { async crawl(url: string): Promise<{url: string, html: string}[]> {
@@ -310,5 +310,3 @@ export class WebCrawler {
return []; return [];
} }
} }

View File

@@ -176,9 +176,8 @@ export class WebScraperDataProvider {
if (this.crawlerMode === "fast" && links.some((link) => link.html)) { if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
console.log("Fast mode enabled"); console.log("Fast mode enabled");
documents = await this.processLinks(allLinks, inProgress, allHtmls); documents = await this.processLinks(allLinks, inProgress, allHtmls);
}else{ }else{
documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls); documents = await this.processLinks(allLinks, inProgress);
} }
return this.cacheAndFinalizeDocuments(documents, allLinks); return this.cacheAndFinalizeDocuments(documents, allLinks);