mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
fix(crawler): relative URL handling on non-start pages (#893)
* fix(crawler): relative URL handling on non-start pages
* fix(crawl): further fixing
This commit is contained in:
parent
740a429790
commit
fbabc779f5
|
@@ -166,10 +166,10 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
|
|||
return res;
|
||||
}
|
||||
|
||||
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
||||
export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler {
|
||||
const crawler = new WebCrawler({
|
||||
jobId: id,
|
||||
initialUrl: sc.originUrl!,
|
||||
initialUrl: initialUrl ?? sc.originUrl!,
|
||||
includes: sc.crawlerOptions?.includes ?? [],
|
||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||
|
|
|
@@ -171,7 +171,7 @@ export class WebCrawler {
|
|||
let fullUrl = href;
|
||||
if (!href.startsWith("http")) {
|
||||
try {
|
||||
fullUrl = new URL(href, this.baseUrl).toString();
|
||||
fullUrl = new URL(href, url).toString();
|
||||
} catch (_) {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@@ -352,10 +352,10 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||
|
||||
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
|
||||
if (!sc.cancelled) {
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined);
|
||||
|
||||
const links = crawler.filterLinks(
|
||||
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl as string),
|
||||
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
|
||||
Infinity,
|
||||
sc.crawlerOptions?.maxDepth ?? 10
|
||||
);
|
||||
|
|
Loading…
Reference in New Issue
Block a user