mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Merge branch 'nsc/sitemap-only'
This commit is contained in:
commit
9519897102
|
@ -56,7 +56,7 @@ export async function mapController(
|
||||||
|
|
||||||
// If sitemapOnly is true, only get links from sitemap
|
// If sitemapOnly is true, only get links from sitemap
|
||||||
if (req.body.sitemapOnly) {
|
if (req.body.sitemapOnly) {
|
||||||
const sitemap = await crawler.tryGetSitemap();
|
const sitemap = await crawler.tryGetSitemap(true, true);
|
||||||
if (sitemap !== null) {
|
if (sitemap !== null) {
|
||||||
sitemap.forEach((x) => {
|
sitemap.forEach((x) => {
|
||||||
links.push(x.url);
|
links.push(x.url);
|
||||||
|
@ -100,7 +100,7 @@ export async function mapController(
|
||||||
|
|
||||||
// Parallelize sitemap fetch with serper search
|
// Parallelize sitemap fetch with serper search
|
||||||
const [sitemap, ...searchResults] = await Promise.all([
|
const [sitemap, ...searchResults] = await Promise.all([
|
||||||
req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
|
req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true),
|
||||||
...(cachedResult ? [] : pagePromises),
|
...(cachedResult ? [] : pagePromises),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
|
|
@ -65,7 +65,12 @@ export class WebCrawler {
|
||||||
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
|
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
|
||||||
|
// If the initial URL is a sitemap.xml, skip filtering
|
||||||
|
if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
|
||||||
|
return sitemapLinks.slice(0, limit);
|
||||||
|
}
|
||||||
|
|
||||||
return sitemapLinks
|
return sitemapLinks
|
||||||
.filter((link) => {
|
.filter((link) => {
|
||||||
let url: URL;
|
let url: URL;
|
||||||
|
@ -159,11 +164,14 @@ export class WebCrawler {
|
||||||
this.robots = robotsParser(this.robotsTxtUrl, txt);
|
this.robots = robotsParser(this.robotsTxtUrl, txt);
|
||||||
}
|
}
|
||||||
|
|
||||||
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
|
public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
|
||||||
logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||||
|
if(fromMap && onlySitemap) {
|
||||||
|
return sitemapLinks.map(link => ({ url: link, html: "" }));
|
||||||
|
}
|
||||||
if (sitemapLinks.length > 0) {
|
if (sitemapLinks.length > 0) {
|
||||||
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
|
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
|
||||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
return filteredLinks.map(link => ({ url: link, html: "" }));
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
|
@ -353,6 +361,7 @@ export class WebCrawler {
|
||||||
return url;
|
return url;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
const sitemapUrl = url.endsWith("/sitemap.xml")
|
const sitemapUrl = url.endsWith("/sitemap.xml")
|
||||||
? url
|
? url
|
||||||
: `${url}/sitemap.xml`;
|
: `${url}/sitemap.xml`;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user