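Add beta page options (useFastMode, disableJSDom, atsv) and pass teamId through so atsv is enabled only for teams listed in BETA_CUSTOMERS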

This commit is contained in:
rafaelsideguide 2024-08-19 16:41:54 -03:00
parent 5a44191344
commit ecd472356b
4 changed files with 36 additions and 8 deletions

View File

@ -24,6 +24,9 @@ export type PageOptions = {
parsePDF?: boolean;
removeTags?: string | string[];
onlyIncludeTags?: string | string[];
useFastMode?: boolean; // beta
disableJSDom?: boolean; // beta
atsv?: boolean; // beta
};
export type ExtractorOptions = {
@ -66,6 +69,7 @@ export type WebScraperOptions = {
concurrentRequests?: number;
bullJobId?: string;
priority?: number;
teamId?: string;
};
export interface DocumentUrl {
@ -142,4 +146,5 @@ export interface FireEngineOptions{
blockMedia?: boolean;
blockAds?: boolean;
disableJsDom?: boolean;
atsv?: boolean; // beta
}

View File

@ -45,6 +45,7 @@ export class WebScraperDataProvider {
private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false;
private priority?: number;
private teamId?: string;
authorize(): void {
throw new Error("Method not implemented.");
@ -596,6 +597,7 @@ export class WebScraperDataProvider {
this.allowExternalContentLinks =
options.crawlerOptions?.allowExternalContentLinks ?? false;
this.priority = options.priority;
this.teamId = options.teamId ?? null;
// make sure all urls start with https://
this.urls = this.urls.map((url) => {

View File

@ -22,21 +22,23 @@ export async function scrapWithFireEngine({
waitFor = 0,
screenshot = false,
fullPageScreenshot = false,
pageOptions = { parsePDF: true },
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
fireEngineOptions = {},
headers,
options,
priority,
teamId,
}: {
url: string;
waitFor?: number;
screenshot?: boolean;
fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
priority?: number;
teamId?: string;
}): Promise<FireEngineResponse> {
const logParams = {
url,
@ -51,11 +53,11 @@ export async function scrapWithFireEngine({
try {
const reqParams = await generateRequestParams(url);
const waitParam = reqParams["params"]?.wait ?? waitFor;
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
let waitParam = reqParams["params"]?.wait ?? waitFor;
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
let endpoint = "/scrape";
@ -70,6 +72,20 @@ export async function scrapWithFireEngine({
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
);
if (pageOptions?.useFastMode) {
console.log('using tlsclient')
fireEngineOptionsParam.engine = "tlsclient";
engine = "tlsclient";
}
// atsv is only available for beta customers
const betaCustomersString = process.env.BETA_CUSTOMERS;
const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
fireEngineOptionsParam.atsv = true;
} else {
pageOptions.atsv = false;
}
const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint,
@ -80,7 +96,9 @@ export async function scrapWithFireEngine({
fullPageScreenshot: fullPageScreenshotParam,
headers: headers,
pageOptions: pageOptions,
disableJsDom: pageOptions?.disableJsDom ?? false,
priority,
engine,
...fireEngineOptionsParam,
},
{

View File

@ -136,6 +136,7 @@ export async function scrapSingleUrl(
},
existingHtml: string = "",
priority?: number,
teamId?: string
): Promise<Document> {
urlToScrap = urlToScrap.trim();
@ -164,7 +165,7 @@ export async function scrapSingleUrl(
case "fire-engine;chrome-cdp":
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
if(method === "fire-engine;chrome-cdp"){
if (method === "fire-engine;chrome-cdp") {
engine = "chrome-cdp";
}
@ -178,8 +179,10 @@ export async function scrapSingleUrl(
headers: pageOptions.headers,
fireEngineOptions: {
engine: engine,
atsv: pageOptions.atsv,
},
priority,
teamId,
});
scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot;