diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 92dd4c7c..d833bda0 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -24,6 +24,9 @@ export type PageOptions = { parsePDF?: boolean; removeTags?: string | string[]; onlyIncludeTags?: string | string[]; + useFastMode?: boolean; // beta + disableJSDom?: boolean; // beta + atsv?: boolean; // beta }; export type ExtractorOptions = { @@ -66,6 +69,7 @@ export type WebScraperOptions = { concurrentRequests?: number; bullJobId?: string; priority?: number; + teamId?: string; }; export interface DocumentUrl { @@ -142,4 +146,5 @@ export interface FireEngineOptions{ blockMedia?: boolean; blockAds?: boolean; disableJsDom?: boolean; + atsv?: boolean; // beta } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 98f07ae5..e0bac57c 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -45,6 +45,7 @@ export class WebScraperDataProvider { private allowBackwardCrawling: boolean = false; private allowExternalContentLinks: boolean = false; private priority?: number; + private teamId?: string; authorize(): void { throw new Error("Method not implemented."); @@ -596,6 +597,7 @@ export class WebScraperDataProvider { this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false; this.priority = options.priority; + this.teamId = options.teamId ?? null; // make sure all urls start with https:// this.urls = this.urls.map((url) => { diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 77697411..e427f582 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -22,21 +22,23 @@ export async function scrapWithFireEngine({ waitFor = 0, screenshot = false, fullPageScreenshot = false, - pageOptions = { parsePDF: true }, + pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false }, fireEngineOptions = {}, headers, options, priority, + teamId, }: { url: string; waitFor?: number; screenshot?: boolean; fullPageScreenshot?: boolean; - pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; + pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean }; fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; priority?: number; + teamId?: string; }): Promise { const logParams = { url, @@ -51,11 +53,11 @@ export async function scrapWithFireEngine({ try { const reqParams = await generateRequestParams(url); - const waitParam = reqParams["params"]?.wait ?? waitFor; - const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; - const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; - const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; - const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; + let waitParam = reqParams["params"]?.wait ?? waitFor; + let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; + let screenshotParam = reqParams["params"]?.screenshot ?? screenshot; + let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; + let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; let endpoint = "/scrape"; @@ -70,6 +72,20 @@ export async function scrapWithFireEngine({ `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` ); + if (pageOptions?.useFastMode) { + console.log('using tlsclient') + fireEngineOptionsParam.engine = "tlsclient"; + engine = "tlsclient"; + } + + // atsv is only available for beta customers + const betaCustomersString = process.env.BETA_CUSTOMERS; + const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : []; + if (pageOptions?.atsv && betaCustomers.includes(teamId)) { + fireEngineOptionsParam.atsv = true; + } else { + pageOptions.atsv = false; + } const response = await axios.post( process.env.FIRE_ENGINE_BETA_URL + endpoint, @@ -80,7 +96,9 @@ export async function scrapWithFireEngine({ fullPageScreenshot: fullPageScreenshotParam, headers: headers, pageOptions: pageOptions, + disableJsDom: pageOptions?.disableJsDom ?? false, priority, + engine, ...fireEngineOptionsParam, }, { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index df9d04ab..1f2a62de 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -136,6 +136,7 @@ export async function scrapSingleUrl( }, existingHtml: string = "", priority?: number, + teamId?: string ): Promise { urlToScrap = urlToScrap.trim(); @@ -164,7 +165,7 @@ export async function scrapSingleUrl( case "fire-engine;chrome-cdp": let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright"; - if(method === "fire-engine;chrome-cdp"){ + if (method === "fire-engine;chrome-cdp") { engine = "chrome-cdp"; } @@ -178,8 +179,10 @@ export async function scrapSingleUrl( headers: pageOptions.headers, fireEngineOptions: { engine: engine, + atsv: pageOptions.atsv, }, priority, + teamId, }); scraperResponse.text = response.html; scraperResponse.screenshot = response.screenshot;