This commit is contained in:
Nicolas 2024-11-14 17:44:32 -05:00
parent 3fcdf57d2f
commit 7f084c6c43
2 changed files with 14 additions and 5 deletions

View File

@ -56,7 +56,7 @@ export async function mapController(
// If sitemapOnly is true, only get links from sitemap // If sitemapOnly is true, only get links from sitemap
if (req.body.sitemapOnly) { if (req.body.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(); const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) { if (sitemap !== null) {
sitemap.forEach((x) => { sitemap.forEach((x) => {
links.push(x.url); links.push(x.url);
@ -100,7 +100,7 @@ export async function mapController(
// Parallelize sitemap fetch with serper search // Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([ const [sitemap, ...searchResults] = await Promise.all([
req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true),
...(cachedResult ? [] : pagePromises), ...(cachedResult ? [] : pagePromises),
]); ]);

View File

@ -65,7 +65,12 @@ export class WebCrawler {
this.allowExternalContentLinks = allowExternalContentLinks ?? false; this.allowExternalContentLinks = allowExternalContentLinks ?? false;
} }
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
// If the initial URL is a sitemap.xml, skip filtering
if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
return sitemapLinks.slice(0, limit);
}
return sitemapLinks return sitemapLinks
.filter((link) => { .filter((link) => {
let url: URL; let url: URL;
@ -159,11 +164,14 @@ export class WebCrawler {
this.robots = robotsParser(this.robotsTxtUrl, txt); this.robots = robotsParser(this.robotsTxtUrl, txt);
} }
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> { public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
logger.debug(`Fetching sitemap links from ${this.initialUrl}`); logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if(fromMap && onlySitemap) {
return sitemapLinks.map(link => ({ url: link, html: "" }));
}
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
return filteredLinks.map(link => ({ url: link, html: "" })); return filteredLinks.map(link => ({ url: link, html: "" }));
} }
return null; return null;
@ -353,6 +361,7 @@ export class WebCrawler {
return url; return url;
}; };
const sitemapUrl = url.endsWith("/sitemap.xml") const sitemapUrl = url.endsWith("/sitemap.xml")
? url ? url
: `${url}/sitemap.xml`; : `${url}/sitemap.xml`;