Merge pull request #503 from mendableai/bugfix/empty-excludes

[Bug] Fixed the "excludes.filter is undefined" error triggered by empty or string-typed excludes
Nicolas authored 2024-08-05 20:42:21 -04:00, committed by GitHub
commit 72f2c3616f
2 changed files with 23 additions and 8 deletions
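In short: crawlerOptions.excludes and crawlerOptions.includes could reach the scraper as a comma-separated string (or carry an injected empty entry) rather than a string[], so array calls such as this.excludes.filter(...) failed at runtime. A minimal sketch of the failure and of the normalization pattern the patch applies at each call site; the toStringArray helper is illustrative, not part of the diff:

// A plain string has no .filter, so this threw at runtime:
//   TypeError: this.excludes.filter is not a function
const injected: any = "blog/*,admin/*";

// Illustrative helper mirroring the patch's Array.isArray / split(',') checks:
function toStringArray(value?: string | string[]): string[] {
  if (Array.isArray(value)) return value;
  return (value ?? "").split(",").filter((item) => item.trim() !== "");
}

toStringArray(injected);    // ["blog/*", "admin/*"]
toStringArray(["blog/*"]);  // ["blog/*"]
toStringArray(undefined);   // []
toStringArray("");          // [] -- the "empty excludes" case from the title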


@@ -42,8 +42,8 @@ export type SearchOptions = {
 export type CrawlerOptions = {
   returnOnlyUrls?: boolean;
-  includes?: string[];
-  excludes?: string[];
+  includes?: string | string[];
+  excludes?: string | string[];
   maxCrawledLinks?: number;
   maxDepth?: number;
   limit?: number;
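Widening the option fields to string | string[] makes the string shape legal at the type level, which forces every consumer to narrow before calling array methods. A reduced sketch of the narrowing the compiler now requires (type pared down for illustration):

type CrawlerOptionsSketch = {
  includes?: string | string[];
  excludes?: string | string[];
};

function excludesOf(options: CrawlerOptionsSketch): string[] {
  const { excludes } = options;
  if (excludes === undefined) return [];
  if (Array.isArray(excludes)) return excludes; // narrowed to string[]
  return excludes.split(",");                   // narrowed to string
}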


@@ -27,8 +27,8 @@ export class WebScraperDataProvider {
   private bullJobId: string;
   private urls: string[] = [""];
   private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
-  private includes: string[];
-  private excludes: string[];
+  private includes: string | string[];
+  private excludes: string | string[];
   private maxCrawledLinks: number;
   private maxCrawledDepth: number = 10;
   private returnOnlyUrls: boolean;
@@ -171,8 +171,8 @@ export class WebScraperDataProvider {
     const crawler = new WebCrawler({
       jobId: this.jobId,
       initialUrl: this.urls[0],
-      includes: this.includes,
-      excludes: this.excludes,
+      includes: Array.isArray(this.includes) ? this.includes : this.includes.split(','),
+      excludes: Array.isArray(this.excludes) ? this.excludes : this.excludes.split(','),
       maxCrawledLinks: this.maxCrawledLinks,
       maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
       limit: this.limit,
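The ternaries guarantee WebCrawler still receives a plain string[] whichever shape was stored. One edge case worth knowing: split(',') on an empty string yields [""], not [], which is why the downstream checks also guard with excludes[0] !== "". For example:

"blog/*,admin/*".split(","); // ["blog/*", "admin/*"]
"".split(",");               // [""] -- length 1, not an empty array

const ex = "".split(",");
ex.length > 0 && ex[0] !== ""; // false, so the pattern list is treated as empty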
@@ -445,6 +445,10 @@ export class WebScraperDataProvider {
       const url = new URL(document.metadata.sourceURL);
       const path = url.pathname;
+      if (!Array.isArray(this.excludes)) {
+        this.excludes = this.excludes.split(',');
+      }
       if (this.excludes.length > 0 && this.excludes[0] !== "") {
         // Check if the link should be excluded
         if (
@@ -456,6 +460,10 @@ export class WebScraperDataProvider {
         }
       }
+      if (!Array.isArray(this.includes)) {
+        this.includes = this.includes.split(',');
+      }
       if (this.includes.length > 0 && this.includes[0] !== "") {
         // Check if the link matches the include patterns, if any are specified
         if (this.includes.length > 0) {
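Both guards normalize lazily and in place: the first time the document filter sees a string-typed value, this.excludes / this.includes are rewritten as arrays, so later iterations skip the split. A condensed sketch of the pattern (class pared down for illustration; the regex test stands in for the real exclusion check):

class ProviderSketch {
  private excludes: string | string[] = "blog/*,admin/*";

  shouldExclude(path: string): boolean {
    if (!Array.isArray(this.excludes)) {
      this.excludes = this.excludes.split(","); // normalize once, in place
    }
    return this.excludes.some((pattern) => new RegExp(pattern).test(path));
  }
}

const provider = new ProviderSketch();
provider.shouldExclude("/admin/users"); // true -- the "admin/*" pattern matches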
@@ -567,8 +575,15 @@ export class WebScraperDataProvider {
       options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
       options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
       false;
-    //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
-    this.excludes = this.excludes.filter((item) => item !== "");
+    if (typeof options.crawlerOptions?.excludes === 'string') {
+      this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
+    }
+    if (typeof options.crawlerOptions?.includes === 'string') {
+      this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
+    }
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
     this.allowBackwardCrawling =
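Compared with the blanket filter it replaces, the new code only rewrites the fields when the raw option is actually a string, and the trim-based predicate also drops whitespace-only entries. Note that it filters but does not trim survivors, so an entry like " admin/*" keeps its leading space. A quick check of the behavior (values are illustrative):

const raw = "blog/*, ,admin/*";
const cleaned = raw.split(",").filter((item) => item.trim() !== "");
console.log(cleaned); // ["blog/*", "admin/*"] -- the whitespace-only entry is dropped

// Survivors are not trimmed, though:
console.log("blog/*, admin/*".split(",").filter((s) => s.trim() !== ""));
// ["blog/*", " admin/*"] -- note the leading space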