mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Fixing child links only bug
This commit is contained in:
parent
da8d94105d
commit
fa014defc7
|
@ -88,6 +88,10 @@ export class WebCrawler {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!this.initialUrl.includes(link)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
})
|
})
|
||||||
.slice(0, limit);
|
.slice(0, limit);
|
||||||
|
@ -109,7 +113,7 @@ export class WebCrawler {
|
||||||
|
|
||||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||||
if (sitemapLinks.length > 0) {
|
if (sitemapLinks.length > 0) {
|
||||||
const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
||||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
return filteredLinks.map(link => ({ url: link, html: "" }));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -145,12 +145,18 @@ export class WebScraperDataProvider {
|
||||||
|
|
||||||
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
|
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
|
||||||
|
|
||||||
const allLinks = links.map((e) => e.url);
|
let allLinks = links.map((e) => e.url);
|
||||||
const allHtmls = links.map((e)=> e.html);
|
const allHtmls = links.map((e)=> e.html);
|
||||||
|
|
||||||
if (this.returnOnlyUrls) {
|
if (this.returnOnlyUrls) {
|
||||||
return this.returnOnlyUrlsResponse(allLinks , inProgress);
|
return this.returnOnlyUrlsResponse(allLinks , inProgress);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
allLinks = allLinks.filter(link => {
|
||||||
|
const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
|
||||||
|
const normalizedLink = link.endsWith('/') ? link : `${link}/`;
|
||||||
|
return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
|
||||||
|
});
|
||||||
|
|
||||||
let documents = [];
|
let documents = [];
|
||||||
// check if fast mode is enabled and there is html inside the links
|
// check if fast mode is enabled and there is html inside the links
|
||||||
|
@ -175,6 +181,12 @@ export class WebScraperDataProvider {
|
||||||
inProgress?: (progress: Progress) => void
|
inProgress?: (progress: Progress) => void
|
||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
let links = await getLinksFromSitemap(this.urls[0]);
|
let links = await getLinksFromSitemap(this.urls[0]);
|
||||||
|
links = links.filter(link => {
|
||||||
|
const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
|
||||||
|
const normalizedLink = link.endsWith('/') ? link : `${link}/`;
|
||||||
|
return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
|
||||||
|
});
|
||||||
|
|
||||||
if (this.returnOnlyUrls) {
|
if (this.returnOnlyUrls) {
|
||||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
{
|
{
|
||||||
"website": "https://mendable.ai/pricing",
|
"website": "https://mendable.ai/pricing",
|
||||||
"expected_min_num_of_pages": 29,
|
"expected_min_num_of_pages": 29,
|
||||||
"expected_crawled_pages": [
|
"expected_not_crawled_pages": [
|
||||||
"https://mendable.ai/",
|
"https://mendable.ai/",
|
||||||
"https://mendable.ai/blog",
|
"https://mendable.ai/blog",
|
||||||
"https://mendable.ai/signin",
|
"https://mendable.ai/signin",
|
||||||
|
@ -34,7 +34,9 @@
|
||||||
"https://www.agentops.ai/blog/streamlining-hr-with-saas",
|
"https://www.agentops.ai/blog/streamlining-hr-with-saas",
|
||||||
"https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
|
"https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
|
||||||
"https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
|
"https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
|
||||||
"https://www.agentops.ai/blog/hr-made-simple-with-saas",
|
"https://www.agentops.ai/blog/hr-made-simple-with-saas"
|
||||||
|
],
|
||||||
|
"expected_not_crawled_pages": [
|
||||||
"https://www.agentops.ai/about-us",
|
"https://www.agentops.ai/about-us",
|
||||||
"https://www.agentops.ai/contact-us"
|
"https://www.agentops.ai/contact-us"
|
||||||
]
|
]
|
||||||
|
@ -69,7 +71,7 @@
|
||||||
{
|
{
|
||||||
"website": "https://en.wikipedia.org/wiki/T._N._Seshan",
|
"website": "https://en.wikipedia.org/wiki/T._N._Seshan",
|
||||||
"expected_min_num_of_pages": 100,
|
"expected_min_num_of_pages": 100,
|
||||||
"expected_crawled_pages": [
|
"expected_not_crawled_pages": [
|
||||||
"https://en.wikipedia.org/wiki/Wikipedia:Contents",
|
"https://en.wikipedia.org/wiki/Wikipedia:Contents",
|
||||||
"https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
|
"https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
|
||||||
"https://en.wikipedia.org/wiki/V._S._Ramadevi",
|
"https://en.wikipedia.org/wiki/V._S._Ramadevi",
|
||||||
|
@ -79,15 +81,10 @@
|
||||||
"https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
|
"https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"website": "https://mendable.ai/blog",
|
|
||||||
"expected_min_num_of_pages": 0,
|
|
||||||
"expected_crawled_pages": [""]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"website": "https://www.framer.com/pricing",
|
"website": "https://www.framer.com/pricing",
|
||||||
"expected_min_num_of_pages": 58,
|
"expected_min_num_of_pages": 58,
|
||||||
"expected_crawled_pages": [
|
"expected_not_crawled_pages": [
|
||||||
"https://www.framer.com/features/navigation/",
|
"https://www.framer.com/features/navigation/",
|
||||||
"https://www.framer.com/contact/",
|
"https://www.framer.com/contact/",
|
||||||
"https://www.framer.com/add-ons/",
|
"https://www.framer.com/add-ons/",
|
||||||
|
@ -101,7 +98,7 @@
|
||||||
{
|
{
|
||||||
"website": "https://fly.io/docs/gpus/gpu-quickstart",
|
"website": "https://fly.io/docs/gpus/gpu-quickstart",
|
||||||
"expected_min_num_of_pages": 39,
|
"expected_min_num_of_pages": 39,
|
||||||
"expected_crawled_pages": [
|
"expected_not_crawled_pages": [
|
||||||
"https://fly.io/docs/getting-started/",
|
"https://fly.io/docs/getting-started/",
|
||||||
"https://fly.io/docs/hands-on/",
|
"https://fly.io/docs/hands-on/",
|
||||||
"https://fly.io/docs/about/support/",
|
"https://fly.io/docs/about/support/",
|
||||||
|
@ -118,8 +115,8 @@
|
||||||
"expected_crawled_pages": [""]
|
"expected_crawled_pages": [""]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"website": "https://www.instructables.com",
|
"website": "https://www.instructables.com/circuits",
|
||||||
"expected_min_num_of_pages": 78,
|
"expected_min_num_of_pages": 12,
|
||||||
"expected_crawled_pages": [
|
"expected_crawled_pages": [
|
||||||
"https://www.instructables.com/circuits/",
|
"https://www.instructables.com/circuits/",
|
||||||
"https://www.instructables.com/circuits/apple/projects/",
|
"https://www.instructables.com/circuits/apple/projects/",
|
||||||
|
|
|
@ -62,6 +62,7 @@ describe("Crawling Checkup (E2E)", () => {
|
||||||
// fail the test
|
// fail the test
|
||||||
console.log('No response');
|
console.log('No response');
|
||||||
continue;
|
continue;
|
||||||
|
// continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!completedResponse.body || completedResponse.body.status !== "completed") {
|
if (!completedResponse.body || completedResponse.body.status !== "completed") {
|
||||||
|
@ -72,7 +73,7 @@ describe("Crawling Checkup (E2E)", () => {
|
||||||
actual_output: 'FAILURE',
|
actual_output: 'FAILURE',
|
||||||
error: `Crawl job did not complete successfully.`
|
error: `Crawl job did not complete successfully.`
|
||||||
});
|
});
|
||||||
return null;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check how many webpages were crawled successfully
|
// check how many webpages were crawled successfully
|
||||||
|
@ -85,11 +86,11 @@ describe("Crawling Checkup (E2E)", () => {
|
||||||
actual_output: `FAILURE: ${completedResponse.body.data.length}`,
|
actual_output: `FAILURE: ${completedResponse.body.data.length}`,
|
||||||
error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}`
|
error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}`
|
||||||
});
|
});
|
||||||
return null;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// checks if crawled pages contain expected_crawled_pages
|
// checks if crawled pages contain expected_crawled_pages
|
||||||
if (websiteData.expected_crawled_pages.some(page => !completedResponse.body.data.some((d: { url: string }) => d.url === page))) {
|
if (websiteData.expected_crawled_pages && websiteData.expected_crawled_pages.length > 0 && websiteData.expected_crawled_pages.some(page => !completedResponse.body.data?.some((d: { url: string }) => d.url === page))) {
|
||||||
errorLog.push({
|
errorLog.push({
|
||||||
website: websiteData.website,
|
website: websiteData.website,
|
||||||
prompt: 'CRAWL',
|
prompt: 'CRAWL',
|
||||||
|
@ -97,7 +98,19 @@ describe("Crawling Checkup (E2E)", () => {
|
||||||
actual_output: `FAILURE: ${completedResponse.body.data}`,
|
actual_output: `FAILURE: ${completedResponse.body.data}`,
|
||||||
error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}`
|
error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}`
|
||||||
});
|
});
|
||||||
return null;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// checks if crawled pages not contain expected_not_crawled_pages
|
||||||
|
if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
|
||||||
|
errorLog.push({
|
||||||
|
website: websiteData.website,
|
||||||
|
prompt: 'CRAWL',
|
||||||
|
expected_output: `SUCCESS: ${websiteData.expected_not_crawled_pages}`,
|
||||||
|
actual_output: `FAILURE: ${completedResponse.body.data}`,
|
||||||
|
error: `Expected crawled pages to not contain ${websiteData.expected_not_crawled_pages}, but got ${completedResponse.body.data}`
|
||||||
|
});
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
passedTests++;
|
passedTests++;
|
||||||
|
@ -110,6 +123,7 @@ describe("Crawling Checkup (E2E)", () => {
|
||||||
actual_output: 'FAILURE',
|
actual_output: 'FAILURE',
|
||||||
error: `Error processing ${websiteData.website}: ${error}`
|
error: `Error processing ${websiteData.website}: ${error}`
|
||||||
});
|
});
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user