mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 19:58:08 +08:00
Nick:
This commit is contained in:
parent
7f31959be7
commit
a0fdc6f7c6
|
@ -15,7 +15,7 @@ export class WebCrawler {
|
||||||
private maxCrawledLinks: number;
|
private maxCrawledLinks: number;
|
||||||
private maxCrawledDepth: number;
|
private maxCrawledDepth: number;
|
||||||
private visited: Set<string> = new Set();
|
private visited: Set<string> = new Set();
|
||||||
private crawledUrls: Set<{ url: string, html: string }> = new Set();
|
private crawledUrls: Map<string, string> = new Map();
|
||||||
private limit: number;
|
private limit: number;
|
||||||
private robotsTxtUrl: string;
|
private robotsTxtUrl: string;
|
||||||
private robots: any;
|
private robots: any;
|
||||||
|
@ -143,7 +143,7 @@ export class WebCrawler {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const newUrls = await this.crawl(task);
|
const newUrls = await this.crawl(task);
|
||||||
newUrls.forEach((page) => this.crawledUrls.add(page));
|
newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
|
||||||
if (inProgress && newUrls.length > 0) {
|
if (inProgress && newUrls.length > 0) {
|
||||||
inProgress({
|
inProgress({
|
||||||
current: this.crawledUrls.size,
|
current: this.crawledUrls.size,
|
||||||
|
@ -175,7 +175,7 @@ export class WebCrawler {
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
await queue.drain();
|
await queue.drain();
|
||||||
return Array.from(this.crawledUrls);
|
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
||||||
|
@ -310,5 +310,3 @@ export class WebCrawler {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -176,9 +176,8 @@ export class WebScraperDataProvider {
|
||||||
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
|
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
|
||||||
console.log("Fast mode enabled");
|
console.log("Fast mode enabled");
|
||||||
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
documents = await this.convertUrlsToDocuments(allLinks, inProgress, allHtmls);
|
documents = await this.processLinks(allLinks, inProgress);
|
||||||
}
|
}
|
||||||
|
|
||||||
return this.cacheAndFinalizeDocuments(documents, allLinks);
|
return this.cacheAndFinalizeDocuments(documents, allLinks);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user