Nick: wip (add "screenshot@mobile" format and mobileScreenshot page option)

This commit is contained in:
Nicolas 2024-10-29 14:05:59 -03:00
parent c96b36d045
commit ad3089f96b
7 changed files with 31 additions and 6 deletions

View File

@ -13,6 +13,7 @@ export type Format =
| "links"
| "screenshot"
| "screenshot@fullPage"
| "screenshot@mobile"
| "extract";
export const url = z.preprocess(
@ -94,12 +95,13 @@ export const scrapeOptions = z.object({
"links",
"screenshot",
"screenshot@fullPage",
"screenshot@mobile",
"extract"
])
.array()
.optional()
.default(["markdown"])
.refine(x => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), "You may only specify either screenshot or screenshot@fullPage"),
.refine(x => x.filter(format => format.startsWith("screenshot")).length <= 1, "You may only specify one screenshot format: screenshot, screenshot@fullPage, or screenshot@mobile"),
headers: z.record(z.string(), z.string()).optional(),
includeTags: z.string().array().optional(),
excludeTags: z.string().array().optional(),
@ -456,6 +458,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
includeLinks: x.formats.includes("links"),
screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
mobileScreenshot: x.formats.includes("screenshot@mobile"),
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
geolocation: x.location ?? x.geolocation,
@ -485,6 +488,11 @@ export function legacyDocumentConverter(doc: any): Document {
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
delete doc.metadata.fullPageScreenshot;
}
if (doc.metadata.mobileScreenshot) {
doc.mobileScreenshot = doc.metadata.mobileScreenshot;
delete doc.metadata.mobileScreenshot;
}
}
return {
@ -493,7 +501,7 @@ export function legacyDocumentConverter(doc: any): Document {
rawHtml: doc.rawHtml,
html: doc.html,
extract: doc.llm_extraction,
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
screenshot: doc.screenshot ?? doc.fullPageScreenshot ?? doc.mobileScreenshot,
actions: doc.actions ?? undefined,
warning: doc.warning ?? undefined,
metadata: {

View File

@ -8,6 +8,7 @@ export const defaultPageOptions = {
waitFor: 0,
screenshot: false,
fullPageScreenshot: false,
mobileScreenshot: false,
parsePDF: true
};

View File

@ -19,6 +19,7 @@ export type Action = {
} | {
type: "screenshot",
fullPage?: boolean,
mobile?: boolean,
} | {
type: "write",
text: string,
@ -41,6 +42,7 @@ export type PageOptions = {
waitFor?: number;
screenshot?: boolean;
fullPageScreenshot?: boolean;
mobileScreenshot?: boolean;
headers?: Record<string, string>;
replaceAllPathsWithAbsolutePaths?: boolean;
parsePDF?: boolean;

View File

@ -588,6 +588,7 @@ export class WebScraperDataProvider {
headers: options.pageOptions?.headers ?? undefined,
includeLinks: options.pageOptions?.includeLinks ?? true,
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
mobileScreenshot: options.pageOptions?.mobileScreenshot ?? false,
screenshot: options.pageOptions?.screenshot ?? false,
useFastMode: options.pageOptions?.useFastMode ?? false,
disableJsDom: options.pageOptions?.disableJsDom ?? false,

View File

@ -28,6 +28,7 @@ export async function scrapWithFireEngine({
waitFor = 0,
screenshot = false,
fullPageScreenshot = false,
mobileScreenshot = false,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
fireEngineOptions = {},
headers,
@ -40,6 +41,7 @@ export async function scrapWithFireEngine({
waitFor?: number;
screenshot?: boolean;
fullPageScreenshot?: boolean;
mobileScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
@ -64,9 +66,12 @@ export async function scrapWithFireEngine({
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
let mobileScreenshotParam = reqParams["params"]?.mobileScreenshot ?? mobileScreenshot;
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
console.log("fullPageScreenshotParam", fullPageScreenshotParam);
console.log("mobileScreenshotParam", mobileScreenshotParam);
let endpoint = "/scrape";
if(options?.endpoint === "request") {
@ -103,6 +108,8 @@ export async function scrapWithFireEngine({
name: "Call to fire-engine"
}, async span => {
console.log("mobileScreenshotParam", mobileScreenshotParam);
console.log("pageOptions", pageOptions);
return await axiosInstance.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint,
{
@ -111,6 +118,7 @@ export async function scrapWithFireEngine({
wait: waitParam,
screenshot: screenshotParam,
fullPageScreenshot: fullPageScreenshotParam,
mobile: mobileScreenshotParam,
disableJsDom: pageOptions?.disableJsDom ?? false,
priority,
engine,

View File

@ -146,6 +146,7 @@ export async function scrapSingleUrl(
waitFor: pageOptions.waitFor ?? undefined,
screenshot: pageOptions.screenshot ?? false,
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
mobileScreenshot: pageOptions.mobileScreenshot ?? false,
headers: pageOptions.headers ?? undefined,
includeLinks: pageOptions.includeLinks ?? true,
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
@ -223,6 +224,7 @@ export async function scrapSingleUrl(
return [action as Action];
}) ?? [] as Action[];
console.log("pageOptions", pageOptions);
const response = await scrapWithFireEngine({
url,
...(engine === "chrome-cdp" ? ({
@ -231,9 +233,10 @@ export async function scrapSingleUrl(
type: "wait" as const,
milliseconds: pageOptions.waitFor,
}] : []),
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
...((pageOptions.screenshot || pageOptions.fullPageScreenshot || pageOptions.mobileScreenshot) ? [{
type: "screenshot" as const,
fullPage: !!pageOptions.fullPageScreenshot,
mobile: !!pageOptions.mobileScreenshot,
}] : []),
...processedActions,
],
@ -241,7 +244,9 @@ export async function scrapSingleUrl(
waitFor: pageOptions.waitFor,
screenshot: pageOptions.screenshot,
fullPageScreenshot: pageOptions.fullPageScreenshot,
mobileScreenshot: pageOptions.mobileScreenshot,
})),
mobileScreenshot:pageOptions.mobileScreenshot,
pageOptions: pageOptions,
headers: pageOptions.headers,
fireEngineOptions: {
@ -253,7 +258,7 @@ export async function scrapSingleUrl(
teamId,
});
scraperResponse.text = response.html;
if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
if (pageOptions.screenshot || pageOptions.fullPageScreenshot || pageOptions.mobileScreenshot) {
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
}
if (pageOptions.actions) {
@ -389,7 +394,7 @@ export async function scrapSingleUrl(
const scrapersInOrder = getScrapingFallbackOrder(
defaultScraper,
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot || pageOptions.mobileScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true || pageOptions.mobileScreenshot === true),
pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
);

View File

@ -75,7 +75,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
* Defines the options and configurations available for scraping web content.
*/
export interface CrawlScrapeOptions {
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "screenshot@mobile" | "extract")[];
headers?: Record<string, string>;
includeTags?: string[];
excludeTags?: string[];