Nick: geolocation

This commit is contained in:
Nicolas 2024-10-15 21:12:33 -03:00
parent f49552e413
commit b4f6a0f919
6 changed files with 2279 additions and 2 deletions

View File

@ -4,6 +4,7 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities"; import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types"; import { PlanType } from "../../types";
import { countries } from "../../lib/validate-country";
export type Format = export type Format =
| "markdown" | "markdown"
@ -108,6 +109,14 @@ export const scrapeOptions = z.object({
extract: extractOptions.optional(), extract: extractOptions.optional(),
parsePDF: z.boolean().default(true), parsePDF: z.boolean().default(true),
actions: actionsSchema.optional(), actions: actionsSchema.optional(),
// Optional geolocation settings for the scrape request.
// `country` is matched case-insensitively against the keys of `countries`
// and stored upper-cased. When the geolocation object is supplied without a
// (non-empty) country, the value falls back to 'US'.
geolocation: z
  .object({
    country: z
      .string()
      .optional()
      .refine(
        // A missing/empty value is accepted here; the transform below
        // substitutes the default.
        (code) => (code ? Object.keys(countries).includes(code.toUpperCase()) : true),
        {
          message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
        }
      )
      .transform((code) => (code ? code.toUpperCase() : 'US')),
  })
  .optional(),
}).strict(strictMessage) }).strict(strictMessage)
@ -421,6 +430,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
fullPageScreenshot: x.formats.includes("screenshot@fullPage"), fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF, parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery actions: x.actions as Action[], // no strict null checking grrrr - mogery
geolocation: x.geolocation,
}; };
} }

View File

@ -51,6 +51,9 @@ export type PageOptions = {
disableJsDom?: boolean; // beta disableJsDom?: boolean; // beta
atsv?: boolean; // anti-bot solver, beta atsv?: boolean; // anti-bot solver, beta
actions?: Action[]; // beta actions?: Action[]; // beta
geolocation?: {
country?: string;
};
}; };
export type ExtractorOptions = { export type ExtractorOptions = {

File diff suppressed because it is too large Load Diff

View File

@ -593,6 +593,7 @@ export class WebScraperDataProvider {
disableJsDom: options.pageOptions?.disableJsDom ?? false, disableJsDom: options.pageOptions?.disableJsDom ?? false,
atsv: options.pageOptions?.atsv ?? false, atsv: options.pageOptions?.atsv ?? false,
actions: options.pageOptions?.actions ?? undefined, actions: options.pageOptions?.actions ?? undefined,
geolocation: options.pageOptions?.geolocation ?? undefined,
}; };
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths = this.replaceAllPathsWithAbsolutePaths =

View File

@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
waitFor = 0, waitFor = 0,
screenshot = false, screenshot = false,
fullPageScreenshot = false, fullPageScreenshot = false,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false }, pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "us" } },
fireEngineOptions = {}, fireEngineOptions = {},
headers, headers,
options, options,
@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
fullPageScreenshot?: boolean; fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean }; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } };
fireEngineOptions?: FireEngineOptions; fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>; headers?: Record<string, string>;
options?: any; options?: any;
@ -118,6 +118,7 @@ export async function scrapWithFireEngine({
...fireEngineOptionsParam, ...fireEngineOptionsParam,
atsv: pageOptions?.atsv ?? false, atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [], scrollXPaths: pageOptions?.scrollXPaths ?? [],
geolocation: pageOptions?.geolocation,
actions: actions, actions: actions,
}, },
{ {

View File

@ -156,6 +156,7 @@ export async function scrapSingleUrl(
disableJsDom: pageOptions.disableJsDom ?? false, disableJsDom: pageOptions.disableJsDom ?? false,
atsv: pageOptions.atsv ?? false, atsv: pageOptions.atsv ?? false,
actions: pageOptions.actions ?? undefined, actions: pageOptions.actions ?? undefined,
geolocation: pageOptions.geolocation ?? undefined,
} }
if (extractorOptions) { if (extractorOptions) {