mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
added variables to beta customers
This commit is contained in:
parent
5a44191344
commit
ecd472356b
|
@ -24,6 +24,9 @@ export type PageOptions = {
|
|||
parsePDF?: boolean;
|
||||
removeTags?: string | string[];
|
||||
onlyIncludeTags?: string | string[];
|
||||
useFastMode?: boolean; // beta
|
||||
disableJSDom?: boolean; // beta
|
||||
atsv?: boolean; // beta
|
||||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
|
@ -66,6 +69,7 @@ export type WebScraperOptions = {
|
|||
concurrentRequests?: number;
|
||||
bullJobId?: string;
|
||||
priority?: number;
|
||||
teamId?: string;
|
||||
};
|
||||
|
||||
export interface DocumentUrl {
|
||||
|
@ -142,4 +146,5 @@ export interface FireEngineOptions{
|
|||
blockMedia?: boolean;
|
||||
blockAds?: boolean;
|
||||
disableJsDom?: boolean;
|
||||
atsv?: boolean; // beta
|
||||
}
|
||||
|
|
|
@ -45,6 +45,7 @@ export class WebScraperDataProvider {
|
|||
private allowBackwardCrawling: boolean = false;
|
||||
private allowExternalContentLinks: boolean = false;
|
||||
private priority?: number;
|
||||
private teamId?: string;
|
||||
|
||||
authorize(): void {
|
||||
throw new Error("Method not implemented.");
|
||||
|
@ -596,6 +597,7 @@ export class WebScraperDataProvider {
|
|||
this.allowExternalContentLinks =
|
||||
options.crawlerOptions?.allowExternalContentLinks ?? false;
|
||||
this.priority = options.priority;
|
||||
this.teamId = options.teamId ?? null;
|
||||
|
||||
// make sure all urls start with https://
|
||||
this.urls = this.urls.map((url) => {
|
||||
|
|
|
@ -22,21 +22,23 @@ export async function scrapWithFireEngine({
|
|||
waitFor = 0,
|
||||
screenshot = false,
|
||||
fullPageScreenshot = false,
|
||||
pageOptions = { parsePDF: true },
|
||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
|
||||
fireEngineOptions = {},
|
||||
headers,
|
||||
options,
|
||||
priority,
|
||||
teamId,
|
||||
}: {
|
||||
url: string;
|
||||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
|
||||
fireEngineOptions?: FireEngineOptions;
|
||||
headers?: Record<string, string>;
|
||||
options?: any;
|
||||
priority?: number;
|
||||
teamId?: string;
|
||||
}): Promise<FireEngineResponse> {
|
||||
const logParams = {
|
||||
url,
|
||||
|
@ -51,11 +53,11 @@ export async function scrapWithFireEngine({
|
|||
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||
let waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||
|
||||
|
||||
let endpoint = "/scrape";
|
||||
|
@ -70,6 +72,20 @@ export async function scrapWithFireEngine({
|
|||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||
);
|
||||
|
||||
if (pageOptions?.useFastMode) {
|
||||
console.log('using tlsclient')
|
||||
fireEngineOptionsParam.engine = "tlsclient";
|
||||
engine = "tlsclient";
|
||||
}
|
||||
|
||||
// atsv is only available for beta customers
|
||||
const betaCustomersString = process.env.BETA_CUSTOMERS;
|
||||
const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
|
||||
if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
|
||||
fireEngineOptionsParam.atsv = true;
|
||||
} else {
|
||||
pageOptions.atsv = false;
|
||||
}
|
||||
|
||||
const response = await axios.post(
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
|
@ -80,7 +96,9 @@ export async function scrapWithFireEngine({
|
|||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
headers: headers,
|
||||
pageOptions: pageOptions,
|
||||
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||
priority,
|
||||
engine,
|
||||
...fireEngineOptionsParam,
|
||||
},
|
||||
{
|
||||
|
|
|
@ -136,6 +136,7 @@ export async function scrapSingleUrl(
|
|||
},
|
||||
existingHtml: string = "",
|
||||
priority?: number,
|
||||
teamId?: string
|
||||
): Promise<Document> {
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
|
@ -164,7 +165,7 @@ export async function scrapSingleUrl(
|
|||
case "fire-engine;chrome-cdp":
|
||||
|
||||
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
|
||||
if(method === "fire-engine;chrome-cdp"){
|
||||
if (method === "fire-engine;chrome-cdp") {
|
||||
engine = "chrome-cdp";
|
||||
}
|
||||
|
||||
|
@ -178,8 +179,10 @@ export async function scrapSingleUrl(
|
|||
headers: pageOptions.headers,
|
||||
fireEngineOptions: {
|
||||
engine: engine,
|
||||
atsv: pageOptions.atsv,
|
||||
},
|
||||
priority,
|
||||
teamId,
|
||||
});
|
||||
scraperResponse.text = response.html;
|
||||
scraperResponse.screenshot = response.screenshot;
|
||||
|
|
Loading…
Reference in New Issue
Block a user