mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
feat(scrapeURL): add url-specific parameters
Some checks failed
STAGING Deploy Images to GHCR / push-app-image (push) Has been cancelled
Some checks failed
STAGING Deploy Images to GHCR / push-app-image (push) Has been cancelled
This commit is contained in:
parent
e5385e62ee
commit
5e2124c6f9
|
@ -8,6 +8,7 @@ import { parseMarkdown } from "../../lib/html-to-markdown";
|
|||
import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "./error";
|
||||
import { executeTransformers } from "./transformers";
|
||||
import { LLMRefusalError } from "./transformers/llmExtract";
|
||||
import { urlSpecificParams } from "./lib/urlSpecificParams";
|
||||
|
||||
export type ScrapeUrlResponse = ({
|
||||
success: true,
|
||||
|
@ -74,6 +75,12 @@ function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions:
|
|||
}
|
||||
|
||||
function buildMetaObject(id: string, url: string, options: ScrapeOptions, internalOptions: InternalOptions): Meta {
|
||||
const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
|
||||
if (specParams !== undefined) {
|
||||
options = Object.assign(options, specParams.scrapeOptions);
|
||||
internalOptions = Object.assign(internalOptions, specParams.internalOptions);
|
||||
}
|
||||
|
||||
const _logger = logger.child({ module: "ScrapeURL", scrapeId: id });
|
||||
const logs: any[] = [];
|
||||
_logger.add(new ArrayTransport({ array: logs, scrapeId: id }));
|
||||
|
|
78
apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts
Normal file
78
apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts
Normal file
|
@ -0,0 +1,78 @@
|
|||
import { InternalOptions } from "..";
|
||||
import { ScrapeOptions } from "../../../controllers/v1/types";
|
||||
|
||||
/**
 * Overrides registered for a single host.
 *
 * Both members are partial: an entry only lists the fields it wants to
 * override, and the consumer copies them over the request's own options.
 */
export type UrlSpecificParams = {
  /** Scrape-option fields to override for the host. */
  scrapeOptions: Partial<ScrapeOptions>;
  /** Internal-option fields to override for the host. */
  internalOptions: Partial<InternalOptions>;
};
|
||||
|
||||
/**
 * Override preset shared by several hosts in `urlSpecificParams`.
 *
 * It sets `waitFor` to 2000, sends a fixed set of browser-like request
 * headers, and forces the "fire-engine;chrome-cdp" engine.
 *
 * The same object instance is registered under multiple hostnames, so it
 * should be treated as read-only.
 */
const docsParam: UrlSpecificParams = {
  scrapeOptions: {
    waitFor: 2000,
    headers: {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      "sec-fetch-site": "same-origin",
      "sec-fetch-mode": "cors",
      "sec-fetch-dest": "empty",
      "referer": "https://www.google.com/",
      "accept-language": "en-US,en;q=0.9",
      "accept-encoding": "gzip, deflate, br",
      "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    },
  },
  internalOptions: {
    forceEngine: "fire-engine;chrome-cdp",
  },
};
|
||||
|
||||
/**
 * Per-host scrape overrides, keyed by hostname.
 *
 * The consumer (`buildMetaObject`) looks entries up by the URL's hostname
 * with a leading "www." removed, so keys must be written without "www." and
 * match the hostname exactly. A key such as "notion.com" does not cover
 * "developers.notion.com"; each subdomain needs its own entry.
 *
 * The consumer applies entries with a shallow `Object.assign`, so nested
 * objects such as `headers` are handed out by reference. Treat every entry
 * as read-only.
 */
export const urlSpecificParams: Record<string, UrlSpecificParams> = {
  "support.greenpay.me": docsParam,
  "docs.pdw.co": docsParam,
  "developers.notion.com": docsParam,
  "docs2.hubitat.com": docsParam,
  "rsseau.fr": docsParam,
  "help.salesforce.com": docsParam,
  "scrapethissite.com": {
    scrapeOptions: {
      // Same header set as `docsParam`, copied into a fresh object so the
      // two definitions cannot drift apart. No `waitFor` is set for this host.
      headers: { ...docsParam.scrapeOptions.headers },
    },
    internalOptions: { forceEngine: "fetch" },
  },
  // NOTE(review): the disabled entry below does not match the
  // `UrlSpecificParams` shape (it uses `defaultScraper` / `params`), so it
  // cannot be re-enabled as written. Presumably left over from an earlier
  // config format; confirm before porting or removing.
  // "eonhealth.com": {
  //   defaultScraper: "fire-engine",
  //   params: {
  //     fireEngineOptions: {
  //       mobileProxy: true,
  //       method: "get",
  //       engine: "request",
  //     },
  //   },
  // },
  "notion.com": {
    scrapeOptions: { waitFor: 2000 },
    internalOptions: { forceEngine: "fire-engine;playwright" },
  },
  "developer.apple.com": {
    scrapeOptions: { waitFor: 2000 },
    internalOptions: { forceEngine: "fire-engine;playwright" },
  },
  "digikey.com": {
    scrapeOptions: {},
    internalOptions: { forceEngine: "fire-engine;tlsclient" },
  },
  "lorealparis.hu": {
    scrapeOptions: {},
    internalOptions: { forceEngine: "fire-engine;tlsclient" },
  },
};
|
Loading…
Reference in New Issue
Block a user