mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Merge pull request #685 from devflowinc/main
bugfix: using onlyIncludeTags and removeTags together
This commit is contained in:
commit
ff4b7a835b
File diff suppressed because one or more lines are too long
|
@ -1,30 +1,31 @@
|
|||
import cheerio, { AnyNode, Cheerio } from "cheerio";
|
||||
import { AnyNode, Cheerio, load } from "cheerio";
|
||||
import { PageOptions } from "../../../lib/entities";
|
||||
import { excludeNonMainTags } from "./excludeTags";
|
||||
|
||||
export const removeUnwantedElements = (
|
||||
html: string,
|
||||
pageOptions: PageOptions
|
||||
pageOptions: PageOptions,
|
||||
) => {
|
||||
const soup = cheerio.load(html);
|
||||
let soup = load(html);
|
||||
|
||||
if (
|
||||
pageOptions.onlyIncludeTags &&
|
||||
pageOptions.onlyIncludeTags.length > 0 &&
|
||||
pageOptions.onlyIncludeTags[0] !== ''
|
||||
pageOptions.onlyIncludeTags[0] !== ""
|
||||
) {
|
||||
if (typeof pageOptions.onlyIncludeTags === "string") {
|
||||
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
|
||||
}
|
||||
if (pageOptions.onlyIncludeTags.length !== 0) {
|
||||
// Create a new root element to hold the tags to keep
|
||||
const newRoot = cheerio.load("<div></div>")("div");
|
||||
const newRoot = load("<div></div>")("div");
|
||||
pageOptions.onlyIncludeTags.forEach((tag) => {
|
||||
soup(tag).each((index, element) => {
|
||||
newRoot.append(soup(element).clone());
|
||||
});
|
||||
});
|
||||
return newRoot.html();
|
||||
|
||||
soup = load(newRoot.html());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -33,7 +34,7 @@ export const removeUnwantedElements = (
|
|||
if (
|
||||
pageOptions.removeTags &&
|
||||
pageOptions.removeTags.length > 0 &&
|
||||
pageOptions.removeTags[0] !== ''
|
||||
pageOptions.removeTags[0] !== ""
|
||||
) {
|
||||
if (typeof pageOptions.removeTags === "string") {
|
||||
pageOptions.removeTags = [pageOptions.removeTags];
|
||||
|
@ -51,11 +52,11 @@ export const removeUnwantedElements = (
|
|||
const attributes = element.attribs;
|
||||
const tagNameMatches = regexPattern.test(element.name);
|
||||
const attributesMatch = Object.keys(attributes).some((attr) =>
|
||||
regexPattern.test(`${attr}="${attributes[attr]}"`)
|
||||
regexPattern.test(`${attr}="${attributes[attr]}"`),
|
||||
);
|
||||
if (tag.startsWith("*.")) {
|
||||
classMatch = Object.keys(attributes).some((attr) =>
|
||||
regexPattern.test(`class="${attributes[attr]}"`)
|
||||
regexPattern.test(`class="${attributes[attr]}"`),
|
||||
);
|
||||
}
|
||||
return tagNameMatches || attributesMatch || classMatch;
|
||||
|
|
Loading…
Reference in New Issue
Block a user