From c00cd21308d0d32fbec65f5383b7d22c73c78ea9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 29 Oct 2024 14:10:40 -0300 Subject: [PATCH] Nick: adds support for mobile web scraping --- apps/api/src/controllers/v1/types.ts | 4 +++- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/index.ts | 1 + apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 5 +++-- apps/api/src/scraper/WebScraper/single_url.ts | 1 + 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 633bbdf1..c8eec815 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -107,6 +107,7 @@ export const scrapeOptions = z.object({ timeout: z.number().int().positive().finite().safe().default(30000), waitFor: z.number().int().nonnegative().finite().safe().default(0), extract: extractOptions.optional(), + mobile: z.boolean().default(false), parsePDF: z.boolean().default(true), actions: actionsSchema.optional(), // New @@ -459,7 +460,8 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { parsePDF: x.parsePDF, actions: x.actions as Action[], // no strict null checking grrrr - mogery geolocation: x.location ?? 
x.geolocation, - skipTlsVerification: x.skipTlsVerification + skipTlsVerification: x.skipTlsVerification, + mobile: x.mobile, }; } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 8aa1d004..0b0792d4 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -55,6 +55,7 @@ export type PageOptions = { country?: string; }; skipTlsVerification?: boolean; + mobile?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1817a07b..4e39e426 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -595,6 +595,7 @@ export class WebScraperDataProvider { actions: options.pageOptions?.actions ?? undefined, geolocation: options.pageOptions?.geolocation ?? undefined, skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false, + mobile: options.pageOptions?.mobile ?? false, }; this.extractorOptions = options.extractorOptions ?? 
{ mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 3bbd74eb..5c143ec4 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -28,7 +28,7 @@ export async function scrapWithFireEngine({ waitFor = 0, screenshot = false, fullPageScreenshot = false, - pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false }, + pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, mobile: false }, fireEngineOptions = {}, headers, options, @@ -40,7 +40,7 @@ export async function scrapWithFireEngine({ waitFor?: number; screenshot?: boolean; fullPageScreenshot?: boolean; - pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean }; + pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, mobile?: boolean }; fireEngineOptions?: FireEngineOptions; headers?: Record<string, string>; options?: any; @@ -115,6 +115,7 @@ export async function scrapWithFireEngine({ priority, engine, instantReturn: true, + mobile: pageOptions?.mobile ?? false, ...fireEngineOptionsParam, atsv: pageOptions?.atsv ?? false, scrollXPaths: pageOptions?.scrollXPaths ?? 
[], diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index c7185b79..945d7c82 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -158,6 +158,7 @@ export async function scrapSingleUrl( actions: pageOptions.actions ?? undefined, geolocation: pageOptions.geolocation ?? undefined, skipTlsVerification: pageOptions.skipTlsVerification ?? false, + mobile: pageOptions.mobile ?? false, } if (extractorOptions) {