From 6c726a02eb64df41f64011d7bd87e5b6ccb6c844 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 18 Jun 2024 09:46:42 -0300
Subject: [PATCH] Moved to utils/removeUnwantedElements, added unit tests

---
 apps/api/src/scraper/WebScraper/single_url.ts | 40 +-----------
 .../__tests__/removeUnwantedElements.test.ts  | 63 +++++++++++++++++++
 .../utils/removeUnwantedElements.ts           | 41 ++++++++++++
 3 files changed, 105 insertions(+), 39 deletions(-)
 create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
 create mode 100644 apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 354a5cb1..e112cd45 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -4,10 +4,10 @@ import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
 import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
-import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { handleCustomScraping } from "./custom/handleCustomScraping";
+import { removeUnwantedElements } from "./utils/removeUnwantedElements";
 import axios from "axios";
 
 dotenv.config();
@@ -313,44 +313,6 @@ export async function scrapSingleUrl(
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
 
-  const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
-    const soup = cheerio.load(html);
-    soup("script, style, iframe, noscript, meta, head").remove();
-    
-    if (pageOptions.removeTags) {
-      if (typeof pageOptions.removeTags === 'string') {
-        pageOptions.removeTags = [pageOptions.removeTags];
-      }
-    
-      if (Array.isArray(pageOptions.removeTags)) {
-        pageOptions.removeTags.forEach((tag) => {
-          let elementsToRemove;
-          if (tag.startsWith("*") && tag.endsWith("*")) {
-            const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`);
-            elementsToRemove = soup('*').filter((index, element) => {
-              const classNames = soup(element).attr('class');
-              return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className));
-            });
-          } else {
-            elementsToRemove = soup(tag);
-          }
-    
-          elementsToRemove.remove();
-        });
-      }
-    }
-    
-    if (pageOptions.onlyMainContent) {
-      // remove any other tags that are not in the main content
-      excludeNonMainTags.forEach((tag) => {
-        const elementsToRemove = soup(tag);
-        elementsToRemove.remove();
-      });
-    }
-    const cleanedHtml = soup.html();
-    return cleanedHtml;
-};
-
   const attemptScraping = async (
     url: string,
     method: (typeof baseScrapers)[number]
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
new file mode 100644
index 00000000..cfa49e7f
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
@@ -0,0 +1,63 @@
+import { removeUnwantedElements } from "../removeUnwantedElements";
+import { PageOptions } from "../../../../lib/entities";
+
+describe('removeUnwantedElements', () => { // unit tests for removeUnwantedElements, one case per supported pageOptions shape
+  it('should remove script, style, iframe, noscript, meta, and head tags', () => {
+    const html = `<html><head><title>Test</title></head><body><script>alert('test');</script><div>Content</div></body></html>`;
+    const options: PageOptions = {}; // no options set: only the unconditional removals apply
+    const result = removeUnwantedElements(html, options);
+    expect(result).not.toContain('<script>'); // only <script> and <head> are asserted; style/iframe/noscript/meta are not exercised by this fixture
+    expect(result).not.toContain('<head>');
+    expect(result).toContain('Content');
+  });
+
+  it('should remove specified tags passed as string', () => {
+    const html = `<div><span>Remove</span><p>Keep</p></div>`;
+    const options: PageOptions = { removeTags: 'span' }; // a single selector given as a string rather than an array
+    const result = removeUnwantedElements(html, options);
+    expect(result).not.toContain('<span>');
+    expect(result).toContain('<p>Keep</p>');
+  });
+
+  it('should remove specified tags passed as array', () => {
+    const html = `<div><span>Remove</span><p>Remove</p><a>Keep</a></div>`;
+    const options: PageOptions = { removeTags: ['span', 'p'] };
+    const result = removeUnwantedElements(html, options);
+    expect(result).not.toContain('<span>');
+    expect(result).not.toContain('<p>');
+    expect(result).toContain('<a>Keep</a>');
+  });
+
+  it('should handle class selectors', () => {
+    const html = `<div class="test">Remove</div><div class="keep">Keep</div>`;
+    const options: PageOptions = { removeTags: '.test' };
+    const result = removeUnwantedElements(html, options);
+    expect(result).not.toContain('class="test"');
+    expect(result).toContain('class="keep"');
+  });
+
+  it('should handle id selectors', () => {
+    const html = `<div id="test">Remove</div><div id="keep">Keep</div>`;
+    const options: PageOptions = { removeTags: '#test' };
+    const result = removeUnwantedElements(html, options);
+    expect(result).not.toContain('id="test"');
+    expect(result).toContain('id="keep"');
+  });
+
+  it('should handle regex patterns in class names', () => {
+    const html = `<div class="test-123">Remove</div><div class="test-abc">Remove</div><div class="keep">Keep</div>`;
+    const options: PageOptions = { removeTags: ['*test*'] }; // asterisk-wrapped entry: matched against class names instead of being used as a selector
+    const result = removeUnwantedElements(html, options);
+    expect(result).not.toContain('class="test-123"');
+    expect(result).not.toContain('class="test-abc"');
+    expect(result).toContain('class="keep"');
+  });
+
+  it('should remove non-main content if onlyMainContent is true', () => {
+    const html = `<div><main>Main Content</main><aside>Remove</aside></div>`;
+    const options: PageOptions = { onlyMainContent: true }; // NOTE(review): presumably 'aside' is matched by an entry of excludeNonMainTags — confirm against ./excludeTags
+    const result = removeUnwantedElements(html, options);
+    expect(result).toContain('Main Content');
+    expect(result).not.toContain('<aside>');
+  });
+});
diff --git a/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts b/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts
new file mode 100644
index 00000000..0fd28a57
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts
@@ -0,0 +1,41 @@
+import cheerio, { AnyNode, Cheerio } from "cheerio";
+import { PageOptions } from "../../../lib/entities";
+import { excludeNonMainTags } from "./excludeTags";
+
+export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { // Parses html, removes the elements selected by pageOptions, and returns the resulting markup string.
+  const soup = cheerio.load(html);
+  soup("script, style, iframe, noscript, meta, head").remove(); // removed unconditionally, whatever pageOptions contains
+  
+  if (pageOptions.removeTags) {
+    if (typeof pageOptions.removeTags === 'string') {
+      pageOptions.removeTags = [pageOptions.removeTags]; // NOTE: wraps a lone string in an array by writing back to the caller's pageOptions object
+    }
+  
+    if (Array.isArray(pageOptions.removeTags)) {
+      pageOptions.removeTags.forEach((tag) => {
+        let elementsToRemove: Cheerio<AnyNode>;
+        if (tag.startsWith("*") && tag.endsWith("*")) { // "*text*" entries are matched against class names instead of being used as selectors
+          const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`); // the text between the asterisks goes into the pattern unescaped
+          elementsToRemove = soup('*').filter((index, element) => {
+            const classNames = soup(element).attr('class');
+            return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className)); // select elements having at least one class token that matches
+          });
+        } else {
+          elementsToRemove = soup(tag); // any other entry is handed to cheerio as a selector
+        }
+  
+        elementsToRemove.remove();
+      });
+    }
+  }
+  
+  if (pageOptions.onlyMainContent) {
+    // remove any other tags that are not in the main content (each entry of excludeNonMainTags is used as a selector)
+    excludeNonMainTags.forEach((tag) => {
+      const elementsToRemove = soup(tag);
+      elementsToRemove.remove();
+    });
+  }
+  const cleanedHtml = soup.html(); // serialize the pruned document
+  return cleanedHtml;
+};
\ No newline at end of file