mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
fixes the empty excludes.filter undefined bug
This commit is contained in:
parent
789c6cf5d7
commit
f32e8de156
|
@ -42,8 +42,8 @@ export type SearchOptions = {
|
||||||
|
|
||||||
export type CrawlerOptions = {
|
export type CrawlerOptions = {
|
||||||
returnOnlyUrls?: boolean;
|
returnOnlyUrls?: boolean;
|
||||||
includes?: string[];
|
includes?: string | string[];
|
||||||
excludes?: string[];
|
excludes?: string | string[];
|
||||||
maxCrawledLinks?: number;
|
maxCrawledLinks?: number;
|
||||||
maxDepth?: number;
|
maxDepth?: number;
|
||||||
limit?: number;
|
limit?: number;
|
||||||
|
|
|
@ -27,8 +27,8 @@ export class WebScraperDataProvider {
|
||||||
private bullJobId: string;
|
private bullJobId: string;
|
||||||
private urls: string[] = [""];
|
private urls: string[] = [""];
|
||||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||||
private includes: string[];
|
private includes: string | string[];
|
||||||
private excludes: string[];
|
private excludes: string | string[];
|
||||||
private maxCrawledLinks: number;
|
private maxCrawledLinks: number;
|
||||||
private maxCrawledDepth: number = 10;
|
private maxCrawledDepth: number = 10;
|
||||||
private returnOnlyUrls: boolean;
|
private returnOnlyUrls: boolean;
|
||||||
|
@ -171,8 +171,8 @@ export class WebScraperDataProvider {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
jobId: this.jobId,
|
jobId: this.jobId,
|
||||||
initialUrl: this.urls[0],
|
initialUrl: this.urls[0],
|
||||||
includes: this.includes,
|
includes: Array.isArray(this.includes) ? this.includes : this.includes.split(','),
|
||||||
excludes: this.excludes,
|
excludes: Array.isArray(this.excludes) ? this.excludes : this.excludes.split(','),
|
||||||
maxCrawledLinks: this.maxCrawledLinks,
|
maxCrawledLinks: this.maxCrawledLinks,
|
||||||
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
|
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
|
||||||
limit: this.limit,
|
limit: this.limit,
|
||||||
|
@ -445,6 +445,10 @@ export class WebScraperDataProvider {
|
||||||
const url = new URL(document.metadata.sourceURL);
|
const url = new URL(document.metadata.sourceURL);
|
||||||
const path = url.pathname;
|
const path = url.pathname;
|
||||||
|
|
||||||
|
if (!Array.isArray(this.excludes)) {
|
||||||
|
this.excludes = this.excludes.split(',');
|
||||||
|
}
|
||||||
|
|
||||||
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
||||||
// Check if the link should be excluded
|
// Check if the link should be excluded
|
||||||
if (
|
if (
|
||||||
|
@ -456,6 +460,10 @@ export class WebScraperDataProvider {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!Array.isArray(this.includes)) {
|
||||||
|
this.includes = this.includes.split(',');
|
||||||
|
}
|
||||||
|
|
||||||
if (this.includes.length > 0 && this.includes[0] !== "") {
|
if (this.includes.length > 0 && this.includes[0] !== "") {
|
||||||
// Check if the link matches the include patterns, if any are specified
|
// Check if the link matches the include patterns, if any are specified
|
||||||
if (this.includes.length > 0) {
|
if (this.includes.length > 0) {
|
||||||
|
@ -567,8 +575,15 @@ export class WebScraperDataProvider {
|
||||||
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
|
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||||
options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
|
options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||||
false;
|
false;
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
if (typeof options.crawlerOptions?.excludes === 'string') {
|
||||||
|
this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof options.crawlerOptions?.includes === 'string') {
|
||||||
|
this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
|
||||||
|
}
|
||||||
|
|
||||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||||
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
|
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
|
||||||
this.allowBackwardCrawling =
|
this.allowBackwardCrawling =
|
||||||
|
|
Loading…
Reference in New Issue
Block a user