mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Nick:
This commit is contained in:
parent
7f31959be7
commit
a0fdc6f7c6
|
@ -15,7 +15,7 @@ export class WebCrawler {
|
|||
private maxCrawledLinks: number;
|
||||
private maxCrawledDepth: number;
|
||||
private visited: Set<string> = new Set();
|
||||
private crawledUrls: Set<{ url: string, html: string }> = new Set();
|
||||
private crawledUrls: Map<string, string> = new Map();
|
||||
private limit: number;
|
||||
private robotsTxtUrl: string;
|
||||
private robots: any;
|
||||
|
@ -143,7 +143,7 @@ export class WebCrawler {
|
|||
return;
|
||||
}
|
||||
const newUrls = await this.crawl(task);
|
||||
newUrls.forEach((page) => this.crawledUrls.add(page));
|
||||
newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
|
||||
if (inProgress && newUrls.length > 0) {
|
||||
inProgress({
|
||||
current: this.crawledUrls.size,
|
||||
|
@ -175,7 +175,7 @@ export class WebCrawler {
|
|||
}
|
||||
);
|
||||
await queue.drain();
|
||||
return Array.from(this.crawledUrls);
|
||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||
}
|
||||
|
||||
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
||||
|
@ -310,5 +310,3 @@ export class WebCrawler {
|
|||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -176,9 +176,8 @@ export class WebScraperDataProvider {
|
|||
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
|
||||
console.log("Fast mode enabled");
|
||||
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
||||
|
||||
}else{
|
||||
documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls);
|
||||
documents = await this.processLinks(allLinks, inProgress);
|
||||
}
|
||||
|
||||
return this.cacheAndFinalizeDocuments(documents, allLinks);
|
||||
|
|
Loading…
Reference in New Issue
Block a user