Fixing child-links-only bug

This commit is contained in:
rafaelsideguide 2024-05-15 18:35:09 -03:00
parent da8d94105d
commit fa014defc7
4 changed files with 45 additions and 18 deletions

View File

@ -88,6 +88,10 @@ export class WebCrawler {
return false; return false;
} }
if (!this.initialUrl.includes(link)) {
return false;
}
return true; return true;
}) })
.slice(0, limit); .slice(0, limit);
@ -109,7 +113,7 @@ export class WebCrawler {
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks.map(link => ({ url: link, html: "" })); return filteredLinks.map(link => ({ url: link, html: "" }));
} }

View File

@ -145,12 +145,18 @@ export class WebScraperDataProvider {
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
const allLinks = links.map((e) => e.url); let allLinks = links.map((e) => e.url);
const allHtmls = links.map((e)=> e.html); const allHtmls = links.map((e)=> e.html);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
return this.returnOnlyUrlsResponse(allLinks , inProgress); return this.returnOnlyUrlsResponse(allLinks , inProgress);
} }
allLinks = allLinks.filter(link => {
const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
const normalizedLink = link.endsWith('/') ? link : `${link}/`;
return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
});
let documents = []; let documents = [];
// check if fast mode is enabled and there is html inside the links // check if fast mode is enabled and there is html inside the links
@ -175,6 +181,12 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
let links = await getLinksFromSitemap(this.urls[0]); let links = await getLinksFromSitemap(this.urls[0]);
links = links.filter(link => {
const normalizedInitialUrl = this.urls[0].endsWith('/') ? this.urls[0] : `${this.urls[0]}/`;
const normalizedLink = link.endsWith('/') ? link : `${link}/`;
return !normalizedLink.startsWith(normalizedInitialUrl) || normalizedLink !== normalizedInitialUrl;
});
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
return this.returnOnlyUrlsResponse(links, inProgress); return this.returnOnlyUrlsResponse(links, inProgress);
} }

View File

@ -2,7 +2,7 @@
{ {
"website": "https://mendable.ai/pricing", "website": "https://mendable.ai/pricing",
"expected_min_num_of_pages": 29, "expected_min_num_of_pages": 29,
"expected_crawled_pages": [ "expected_not_crawled_pages": [
"https://mendable.ai/", "https://mendable.ai/",
"https://mendable.ai/blog", "https://mendable.ai/blog",
"https://mendable.ai/signin", "https://mendable.ai/signin",
@ -34,7 +34,9 @@
"https://www.agentops.ai/blog/streamlining-hr-with-saas", "https://www.agentops.ai/blog/streamlining-hr-with-saas",
"https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions", "https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
"https://www.agentops.ai/blog/efficient-hr-operations-with-saas", "https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
"https://www.agentops.ai/blog/hr-made-simple-with-saas", "https://www.agentops.ai/blog/hr-made-simple-with-saas"
],
"expected_not_crawled_pages": [
"https://www.agentops.ai/about-us", "https://www.agentops.ai/about-us",
"https://www.agentops.ai/contact-us" "https://www.agentops.ai/contact-us"
] ]
@ -69,7 +71,7 @@
{ {
"website": "https://en.wikipedia.org/wiki/T._N._Seshan", "website": "https://en.wikipedia.org/wiki/T._N._Seshan",
"expected_min_num_of_pages": 100, "expected_min_num_of_pages": 100,
"expected_crawled_pages": [ "expected_not_crawled_pages": [
"https://en.wikipedia.org/wiki/Wikipedia:Contents", "https://en.wikipedia.org/wiki/Wikipedia:Contents",
"https://en.wikipedia.org/wiki/Wikipedia:Contact_us", "https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
"https://en.wikipedia.org/wiki/V._S._Ramadevi", "https://en.wikipedia.org/wiki/V._S._Ramadevi",
@ -79,15 +81,10 @@
"https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg" "https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
] ]
}, },
{
"website": "https://mendable.ai/blog",
"expected_min_num_of_pages": 0,
"expected_crawled_pages": [""]
},
{ {
"website": "https://www.framer.com/pricing", "website": "https://www.framer.com/pricing",
"expected_min_num_of_pages": 58, "expected_min_num_of_pages": 58,
"expected_crawled_pages": [ "expected_not_crawled_pages": [
"https://www.framer.com/features/navigation/", "https://www.framer.com/features/navigation/",
"https://www.framer.com/contact/", "https://www.framer.com/contact/",
"https://www.framer.com/add-ons/", "https://www.framer.com/add-ons/",
@ -101,7 +98,7 @@
{ {
"website": "https://fly.io/docs/gpus/gpu-quickstart", "website": "https://fly.io/docs/gpus/gpu-quickstart",
"expected_min_num_of_pages": 39, "expected_min_num_of_pages": 39,
"expected_crawled_pages": [ "expected_not_crawled_pages": [
"https://fly.io/docs/getting-started/", "https://fly.io/docs/getting-started/",
"https://fly.io/docs/hands-on/", "https://fly.io/docs/hands-on/",
"https://fly.io/docs/about/support/", "https://fly.io/docs/about/support/",
@ -118,8 +115,8 @@
"expected_crawled_pages": [""] "expected_crawled_pages": [""]
}, },
{ {
"website": "https://www.instructables.com", "website": "https://www.instructables.com/circuits",
"expected_min_num_of_pages": 78, "expected_min_num_of_pages": 12,
"expected_crawled_pages": [ "expected_crawled_pages": [
"https://www.instructables.com/circuits/", "https://www.instructables.com/circuits/",
"https://www.instructables.com/circuits/apple/projects/", "https://www.instructables.com/circuits/apple/projects/",

View File

@ -62,6 +62,7 @@ describe("Crawling Checkup (E2E)", () => {
// fail the test // fail the test
console.log('No response'); console.log('No response');
continue; continue;
// continue;
} }
if (!completedResponse.body || completedResponse.body.status !== "completed") { if (!completedResponse.body || completedResponse.body.status !== "completed") {
@ -72,7 +73,7 @@ describe("Crawling Checkup (E2E)", () => {
actual_output: 'FAILURE', actual_output: 'FAILURE',
error: `Crawl job did not complete successfully.` error: `Crawl job did not complete successfully.`
}); });
return null; continue;
} }
// check how many webpages were crawled successfully // check how many webpages were crawled successfully
@ -85,11 +86,11 @@ describe("Crawling Checkup (E2E)", () => {
actual_output: `FAILURE: ${completedResponse.body.data.length}`, actual_output: `FAILURE: ${completedResponse.body.data.length}`,
error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}` error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}`
}); });
return null; continue;
} }
// checks if crawled pages contain expected_crawled_pages // checks if crawled pages contain expected_crawled_pages
if (websiteData.expected_crawled_pages.some(page => !completedResponse.body.data.some((d: { url: string }) => d.url === page))) { if (websiteData.expected_crawled_pages && websiteData.expected_crawled_pages.length > 0 && websiteData.expected_crawled_pages.some(page => !completedResponse.body.data?.some((d: { url: string }) => d.url === page))) {
errorLog.push({ errorLog.push({
website: websiteData.website, website: websiteData.website,
prompt: 'CRAWL', prompt: 'CRAWL',
@ -97,7 +98,19 @@ describe("Crawling Checkup (E2E)", () => {
actual_output: `FAILURE: ${completedResponse.body.data}`, actual_output: `FAILURE: ${completedResponse.body.data}`,
error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}` error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data}`
}); });
return null; continue;
}
// checks that crawled pages do not contain any of expected_not_crawled_pages
if (websiteData.expected_not_crawled_pages && websiteData.expected_not_crawled_pages.length > 0 && websiteData.expected_not_crawled_pages.filter(page => completedResponse.body.data.some((d: { url: string }) => d.url === page)).length > 0) {
errorLog.push({
website: websiteData.website,
prompt: 'CRAWL',
expected_output: `SUCCESS: ${websiteData.expected_not_crawled_pages}`,
actual_output: `FAILURE: ${completedResponse.body.data}`,
error: `Expected crawled pages to not contain ${websiteData.expected_not_crawled_pages}, but got ${completedResponse.body.data}`
});
continue;
} }
passedTests++; passedTests++;
@ -110,6 +123,7 @@ describe("Crawling Checkup (E2E)", () => {
actual_output: 'FAILURE', actual_output: 'FAILURE',
error: `Error processing ${websiteData.website}: ${error}` error: `Error processing ${websiteData.website}: ${error}`
}); });
continue;
} }
} }