Merge pull request #503 from mendableai/bugfix/empty-excludes

[Bug] Fixed the "excludes.filter is undefined" error triggered by empty or string-typed excludes
Nicolas authored 2024-08-05 20:42:21 -04:00, committed by GitHub
commit 72f2c3616f
2 changed files with 23 additions and 8 deletions
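In short: crawlerOptions.excludes and crawlerOptions.includes could reach the scraper as a comma-separated string (or carry an injected empty entry) rather than a string[], so array calls such as this.excludes.filter(...) failed at runtime. A minimal sketch of the failure and of the normalization pattern the patch applies at each call site; the toStringArray helper is illustrative, not part of the diff:

// A plain string has no .filter, so this threw at runtime:
//   TypeError: this.excludes.filter is not a function
const injected: any = "blog/*,admin/*";

// Illustrative helper mirroring the patch's Array.isArray / split(',') checks:
function toStringArray(value?: string | string[]): string[] {
  if (Array.isArray(value)) return value;
  return (value ?? "").split(",").filter((item) => item.trim() !== "");
}

toStringArray(injected);    // ["blog/*", "admin/*"]
toStringArray(["blog/*"]);  // ["blog/*"]
toStringArray(undefined);   // []
toStringArray("");          // [] -- the "empty excludes" case from the title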


@@ -42,8 +42,8 @@ export type SearchOptions = {
 export type CrawlerOptions = {
   returnOnlyUrls?: boolean;
-  includes?: string[];
-  excludes?: string[];
+  includes?: string | string[];
+  excludes?: string | string[];
   maxCrawledLinks?: number;
   maxDepth?: number;
   limit?: number;
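Widening the option fields to string | string[] makes the string shape legal at the type level, which forces every consumer to narrow before calling array methods. A reduced sketch of the narrowing the compiler now requires (type pared down for illustration):

type CrawlerOptionsSketch = {
  includes?: string | string[];
  excludes?: string | string[];
};

function excludesOf(options: CrawlerOptionsSketch): string[] {
  const { excludes } = options;
  if (excludes === undefined) return [];
  if (Array.isArray(excludes)) return excludes; // narrowed to string[]
  return excludes.split(",");                   // narrowed to string
}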


@@ -27,8 +27,8 @@ export class WebScraperDataProvider {
   private bullJobId: string;
   private urls: string[] = [""];
   private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
-  private includes: string[];
-  private excludes: string[];
+  private includes: string | string[];
+  private excludes: string | string[];
   private maxCrawledLinks: number;
   private maxCrawledDepth: number = 10;
   private returnOnlyUrls: boolean;
@@ -171,8 +171,8 @@ export class WebScraperDataProvider {
     const crawler = new WebCrawler({
       jobId: this.jobId,
       initialUrl: this.urls[0],
-      includes: this.includes,
-      excludes: this.excludes,
+      includes: Array.isArray(this.includes) ? this.includes : this.includes.split(','),
+      excludes: Array.isArray(this.excludes) ? this.excludes : this.excludes.split(','),
       maxCrawledLinks: this.maxCrawledLinks,
       maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
       limit: this.limit,
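The ternaries guarantee WebCrawler still receives a plain string[] whichever shape was stored. One edge case worth knowing: split(',') on an empty string yields [""], not [], which is why the downstream checks also guard with excludes[0] !== "". For example:

"blog/*,admin/*".split(","); // ["blog/*", "admin/*"]
"".split(",");               // [""] -- length 1, not an empty array

const ex = "".split(",");
ex.length > 0 && ex[0] !== ""; // false, so the pattern list is treated as empty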
@@ -445,6 +445,10 @@ export class WebScraperDataProvider {
       const url = new URL(document.metadata.sourceURL);
       const path = url.pathname;
+      if (!Array.isArray(this.excludes)) {
+        this.excludes = this.excludes.split(',');
+      }
       if (this.excludes.length > 0 && this.excludes[0] !== "") {
         // Check if the link should be excluded
         if (
@@ -456,6 +460,10 @@ export class WebScraperDataProvider {
         }
       }
+      if (!Array.isArray(this.includes)) {
+        this.includes = this.includes.split(',');
+      }
       if (this.includes.length > 0 && this.includes[0] !== "") {
         // Check if the link matches the include patterns, if any are specified
         if (this.includes.length > 0) {
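Both guards normalize lazily and in place: the first time the document filter sees a string-typed value, this.excludes / this.includes are rewritten as arrays, so later iterations skip the split. A condensed sketch of the pattern (class pared down for illustration; the regex test stands in for the real exclusion check):

class ProviderSketch {
  private excludes: string | string[] = "blog/*,admin/*";

  shouldExclude(path: string): boolean {
    if (!Array.isArray(this.excludes)) {
      this.excludes = this.excludes.split(","); // normalize once, in place
    }
    return this.excludes.some((pattern) => new RegExp(pattern).test(path));
  }
}

const provider = new ProviderSketch();
provider.shouldExclude("/admin/users"); // true -- the "admin/*" pattern matches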
@@ -567,8 +575,15 @@ export class WebScraperDataProvider {
       options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
       options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
       false;
-    //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
-    this.excludes = this.excludes.filter((item) => item !== "");
+    if (typeof options.crawlerOptions?.excludes === 'string') {
+      this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
+    }
+    if (typeof options.crawlerOptions?.includes === 'string') {
+      this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
+    }
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
     this.allowBackwardCrawling =
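Compared with the blanket filter it replaces, the new code only rewrites the fields when the raw option is actually a string, and the trim-based predicate also drops whitespace-only entries. Note that it filters but does not trim survivors, so an entry like " admin/*" keeps its leading space. A quick check of the behavior (values are illustrative):

const raw = "blog/*, ,admin/*";
const cleaned = raw.split(",").filter((item) => item.trim() !== "");
console.log(cleaned); // ["blog/*", "admin/*"] -- the whitespace-only entry is dropped

// Survivors are not trimmed, though:
console.log("blog/*, admin/*".split(",").filter((s) => s.trim() !== ""));
// ["blog/*", " admin/*"] -- note the leading space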