Merge pull request #297 from AndyMik90/feat/removeTags-regex

[Feat] Added support for RegEx in removeTags
This commit is contained in:
Nicolas 2024-06-18 14:03:41 -04:00 committed by GitHub
commit 1c5a1dd487
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 158 additions and 26 deletions

View File

@ -54,6 +54,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.metadata.pageError).toBeUndefined(); expect(response.body.data.metadata.pageError).toBeUndefined();
}, 30000); // 30 seconds timeout }, 30000); // 30 seconds timeout
it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => { it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL) const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post("/v0/scrape") .post("/v0/scrape")

View File

@ -4,10 +4,10 @@ import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities"; import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params"; import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { handleCustomScraping } from "./custom/handleCustomScraping"; import { handleCustomScraping } from "./custom/handleCustomScraping";
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
import axios from "axios"; import axios from "axios";
dotenv.config(); dotenv.config();
@ -313,31 +313,6 @@ export async function scrapSingleUrl(
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
// Strips unwanted elements from `html` and returns the remaining markup.
// Removal happens in three passes: a fixed list of non-content tags, then the
// selectors supplied in `pageOptions.removeTags`, then (when
// `pageOptions.onlyMainContent` is set) every selector in `excludeNonMainTags`.
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
const soup = cheerio.load(html);
// These tags are always dropped, regardless of the options.
soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.removeTags) {
if (typeof pageOptions.removeTags === 'string') {
// A string is split on commas; each piece is trimmed and used as a selector.
pageOptions.removeTags.split(',').forEach((tag) => {
soup(tag.trim()).remove();
});
} else if (Array.isArray(pageOptions.removeTags)) {
// An array contributes one selector per entry, used exactly as given (no trimming).
pageOptions.removeTags.forEach((tag) => {
soup(tag).remove();
});
}
}
if (pageOptions.onlyMainContent) {
// Remove everything matched by the selectors in excludeNonMainTags.
excludeNonMainTags.forEach((tag) => {
soup(tag).remove();
});
}
// Serialize the modified document back to an HTML string.
return soup.html();
};
const attemptScraping = async ( const attemptScraping = async (
url: string, url: string,
method: (typeof baseScrapers)[number] method: (typeof baseScrapers)[number]

View File

@ -0,0 +1,103 @@
import { removeUnwantedElements } from "../removeUnwantedElements";
import { PageOptions } from "../../../../lib/entities";
// Unit tests for removeUnwantedElements: default tag stripping, plain CSS
// selectors (tag / class / id), `*pattern*` regular-expression entries, and the
// onlyMainContent switch.
describe('removeUnwantedElements', () => {
  it('should remove script, style, iframe, noscript, meta, and head tags', () => {
    // The fixture contains every tag named in the title so each removal is actually exercised.
    const html = `<html><head><title>Test</title><meta charset="utf-8"></head><body><script>alert('test');</script><style>div { color: red; }</style><noscript>Enable JS</noscript><iframe src="about:blank"></iframe><div>Content</div></body></html>`;
    const options: PageOptions = {};
    const result = removeUnwantedElements(html, options);
    expect(result).not.toContain('<script>');
    expect(result).not.toContain('<head>');
    expect(result).not.toContain('<style');
    expect(result).not.toContain('<iframe');
    expect(result).not.toContain('<noscript');
    expect(result).not.toContain('<meta');
    expect(result).toContain('Content');
  });

  it('should remove specified tags passed as string', () => {
    const html = `<div><span>Remove</span><p>Keep</p></div>`;
    const options: PageOptions = { removeTags: 'span' };
    const result = removeUnwantedElements(html, options);
    expect(result).not.toContain('<span>');
    expect(result).toContain('<p>Keep</p>');
  });

  it('should remove specified tags passed as array', () => {
    const html = `<div><span>Remove</span><p>Remove</p><a>Keep</a></div>`;
    const options: PageOptions = { removeTags: ['span', 'p'] };
    const result = removeUnwantedElements(html, options);
    expect(result).not.toContain('<span>');
    expect(result).not.toContain('<p>');
    expect(result).toContain('<a>Keep</a>');
  });

  it('should handle class selectors', () => {
    const html = `<div class="test">Remove</div><div class="keep">Keep</div>`;
    const options: PageOptions = { removeTags: '.test' };
    const result = removeUnwantedElements(html, options);
    expect(result).not.toContain('class="test"');
    expect(result).toContain('class="keep"');
  });

  it('should handle id selectors', () => {
    const html = `<div id="test">Remove</div><div id="keep">Keep</div>`;
    const options: PageOptions = { removeTags: '#test' };
    const result = removeUnwantedElements(html, options);
    expect(result).not.toContain('id="test"');
    expect(result).toContain('id="keep"');
  });

  it('should handle regex patterns in class names', () => {
    const html = `<div class="test-123">Remove</div><div class="test-abc">Remove</div><div class="keep">Keep</div>`;
    const options: PageOptions = { removeTags: ['*test*'] };
    const result = removeUnwantedElements(html, options);
    expect(result).not.toContain('class="test-123"');
    expect(result).not.toContain('class="test-abc"');
    expect(result).toContain('class="keep"');
  });

  it('should remove non-main content if onlyMainContent is true', () => {
    const html = `<div><main>Main Content</main><aside>Remove</aside></div>`;
    const options: PageOptions = { onlyMainContent: true };
    const result = removeUnwantedElements(html, options);
    expect(result).toContain('Main Content');
    expect(result).not.toContain('<aside>');
  });

  it('should handle complex regex patterns for class names', () => {
    const html = `<div class="test-123">Remove</div><div class="test-abc">Remove</div><div class="keep">Keep</div><div class="test-xyz">Remove</div>`;
    const options: PageOptions = { removeTags: ['*.test-[a-z]+*'] };
    const result = removeUnwantedElements(html, options);
    // "test-123" survives: the pattern requires letters after "test-".
    expect(result).toContain('class="test-123"');
    expect(result).not.toContain('class="test-abc"');
    expect(result).not.toContain('class="test-xyz"');
    expect(result).toContain('class="keep"');
  });

  it('should handle complex regex patterns for attributes', () => {
    const html = `<div data-info="12345">Remove</div><div data-info="abcde">Keep</div><div data-info="67890">Remove</div>`;
    const options: PageOptions = { removeTags: ['*data-info="\\d+"*'] }; // Matches data-info whose value consists only of digits
    const result = removeUnwantedElements(html, options);
    expect(result).not.toContain('data-info="12345"');
    expect(result).not.toContain('data-info="67890"');
    expect(result).toContain('data-info="abcde"');
  });

  it('should handle mixed selectors with regex', () => {
    const html = `<div class="remove-this">Remove</div><div id="remove-this">Remove</div><div class="keep-this">Keep</div>`;
    const options: PageOptions = { removeTags: ['.remove-this', '#remove-this'] };
    const result = removeUnwantedElements(html, options);
    expect(result).not.toContain('class="remove-this"');
    expect(result).not.toContain('id="remove-this"');
    expect(result).toContain('class="keep-this"');
  });

  it('should handle multiple regex patterns', () => {
    const html = `<div attr="test-123">Remove</div><div class="class-remove">Remove</div><div class="keep">Keep</div><div class="remove-this">Remove</div><div id="remove-this">Remove</div>`;
    const options: PageOptions = { removeTags: ['*test*', '.class-remove', '*.remove-[a-z]+*', '#remove-this'] };
    const result = removeUnwantedElements(html, options);
    // Assert on strings that really occur in the fixture; checking for markup
    // that was never in the input would pass even if nothing were removed.
    expect(result).not.toContain('attr="test-123"');
    expect(result).not.toContain('class="class-remove"');
    expect(result).not.toContain('class="remove-this"');
    expect(result).not.toContain('id="remove-this"');
    expect(result).toContain('class="keep"');
  });
});

View File

@ -0,0 +1,53 @@
import cheerio, { AnyNode, Cheerio } from "cheerio";
import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags";
/**
 * Strips boilerplate and caller-selected elements from an HTML document.
 *
 * Removal happens in three passes:
 * 1. `script`, `style`, `iframe`, `noscript`, `meta` and `head` are always removed.
 * 2. Everything selected by `pageOptions.removeTags` is removed. A single
 *    string is treated as a one-entry list. Each entry is either
 *    - `*pattern*` (wrapped in asterisks): the text between the asterisks is
 *      compiled as a case-insensitive regular expression and tested against
 *      each element's tag name and against every attribute rendered as
 *      `name="value"`; or
 *    - anything else: handed to cheerio as a selector (`span`, `.cls`, `#id`, ...).
 * 3. When `pageOptions.onlyMainContent` is truthy, every selector in
 *    `excludeNonMainTags` is removed as well.
 *
 * @param html - Markup to clean.
 * @param pageOptions - Only `removeTags` and `onlyMainContent` are read. The
 *   object is not modified.
 * @returns The serialized HTML after all removals.
 * @throws SyntaxError when a `*pattern*` entry is not a valid regular expression.
 */
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
  const soup = cheerio.load(html);
  soup("script, style, iframe, noscript, meta, head").remove();

  const { removeTags } = pageOptions;
  if (removeTags) {
    // Normalize into a local list instead of writing the array back onto
    // pageOptions: the options object belongs to the caller and may be reused.
    const tags = typeof removeTags === 'string' ? [removeTags] : removeTags;

    if (Array.isArray(tags)) {
      tags.forEach((tag) => {
        let elementsToRemove: Cheerio<AnyNode>;

        if (tag.startsWith("*") && tag.endsWith("*")) {
          const regexPattern = new RegExp(tag.slice(1, -1), 'i');
          // Loop-invariant, so decide once per pattern rather than once per element.
          const alsoTestAsClass = tag.startsWith('*.');

          elementsToRemove = soup('*').filter((_index, element) => {
            if (element.type !== 'tag') {
              return false;
            }
            if (regexPattern.test(element.name)) {
              return true;
            }

            const attributes = element.attribs;
            const attributeNames = Object.keys(attributes);
            const attributesMatch = attributeNames.some(attr =>
              regexPattern.test(`${attr}="${attributes[attr]}"`)
            );
            if (attributesMatch) {
              return true;
            }

            // NOTE(review): for `*.`-prefixed patterns every attribute value is
            // additionally tested as if it were a class value (`class="value"`),
            // not only the real class attribute. Kept as-is to preserve matching
            // behavior; confirm whether this is intended.
            return alsoTestAsClass && attributeNames.some(attr =>
              regexPattern.test(`class="${attributes[attr]}"`)
            );
          });
        } else {
          elementsToRemove = soup(tag);
        }

        elementsToRemove.remove();
      });
    }
  }

  if (pageOptions.onlyMainContent) {
    excludeNonMainTags.forEach((tag) => {
      soup(tag).remove();
    });
  }

  return soup.html();
};