From 6c726a02eb64df41f64011d7bd87e5b6ccb6c844 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:46:42 -0300 Subject: [PATCH] Moved to utils/removeUnwantedElements, added unit tests --- apps/api/src/scraper/WebScraper/single_url.ts | 40 +----------- .../__tests__/removeUnwantedElements.test.ts | 63 +++++++++++++++++++ .../utils/removeUnwantedElements.ts | 41 ++++++++++++ 3 files changed, 105 insertions(+), 39 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts create mode 100644 apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 354a5cb1..e112cd45 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -4,10 +4,10 @@ import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; import { Document, PageOptions, FireEngineResponse } from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; -import { excludeNonMainTags } from "./utils/excludeTags"; import { urlSpecificParams } from "./utils/custom/website_params"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { handleCustomScraping } from "./custom/handleCustomScraping"; +import { removeUnwantedElements } from "./utils/removeUnwantedElements"; import axios from "axios"; dotenv.config(); @@ -313,44 +313,6 @@ export async function scrapSingleUrl( ): Promise { urlToScrap = urlToScrap.trim(); - const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { - const soup = cheerio.load(html); - soup("script, style, iframe, noscript, meta, head").remove(); - - if (pageOptions.removeTags) { - if (typeof pageOptions.removeTags === 'string') { - pageOptions.removeTags = [pageOptions.removeTags]; - } - - if (Array.isArray(pageOptions.removeTags)) { - pageOptions.removeTags.forEach((tag) => { - let elementsToRemove; - if (tag.startsWith("*") && tag.endsWith("*")) { - const regexPattern = new RegExp(`\\b${tag.slice(1, -1)}\\b`); - elementsToRemove = soup('*').filter((index, element) => { - const classNames = soup(element).attr('class'); - return classNames && classNames.split(/\s+/).some(className => regexPattern.test(className)); - }); - } else { - elementsToRemove = soup(tag); - } - - elementsToRemove.remove(); - }); - } - } - - if (pageOptions.onlyMainContent) { - // remove any other tags that are not in the main content - excludeNonMainTags.forEach((tag) => { - const elementsToRemove = soup(tag); - elementsToRemove.remove(); - }); - } - const cleanedHtml = soup.html(); - return cleanedHtml; -}; - const attemptScraping = async ( url: string, method: (typeof baseScrapers)[number] diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts new file mode 100644 index 00000000..cfa49e7f --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts @@ -0,0 +1,63 @@ +import { removeUnwantedElements } from "../removeUnwantedElements"; +import { PageOptions } from "../../../../lib/entities"; + +describe('removeUnwantedElements', () => { + it('should remove script, style, iframe, noscript, meta, and head tags', () => { + const html = `Test
Content
`; + const options: PageOptions = {}; + const result = removeUnwantedElements(html, options); + expect(result).not.toContain('