mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Nick: wip
This commit is contained in:
parent
c96b36d045
commit
ad3089f96b
|
@ -13,6 +13,7 @@ export type Format =
|
|||
| "links"
|
||||
| "screenshot"
|
||||
| "screenshot@fullPage"
|
||||
| "screenshot@mobile"
|
||||
| "extract";
|
||||
|
||||
export const url = z.preprocess(
|
||||
|
@ -94,12 +95,13 @@ export const scrapeOptions = z.object({
|
|||
"links",
|
||||
"screenshot",
|
||||
"screenshot@fullPage",
|
||||
"screenshot@mobile",
|
||||
"extract"
|
||||
])
|
||||
.array()
|
||||
.optional()
|
||||
.default(["markdown"])
|
||||
.refine(x => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), "You may only specify either screenshot or screenshot@fullPage"),
|
||||
.refine(x => x.filter(format => format.startsWith("screenshot")).length <= 1, "You may only specify one screenshot format: screenshot, screenshot@fullPage, or screenshot@mobile"),
|
||||
headers: z.record(z.string(), z.string()).optional(),
|
||||
includeTags: z.string().array().optional(),
|
||||
excludeTags: z.string().array().optional(),
|
||||
|
@ -456,6 +458,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
|||
includeLinks: x.formats.includes("links"),
|
||||
screenshot: x.formats.includes("screenshot"),
|
||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||
mobileScreenshot: x.formats.includes("screenshot@mobile"),
|
||||
parsePDF: x.parsePDF,
|
||||
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
||||
geolocation: x.location ?? x.geolocation,
|
||||
|
@ -485,6 +488,11 @@ export function legacyDocumentConverter(doc: any): Document {
|
|||
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
|
||||
delete doc.metadata.fullPageScreenshot;
|
||||
}
|
||||
|
||||
if (doc.metadata.mobileScreenshot) {
|
||||
doc.mobileScreenshot = doc.metadata.mobileScreenshot;
|
||||
delete doc.metadata.mobileScreenshot;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
|
@ -493,7 +501,7 @@ export function legacyDocumentConverter(doc: any): Document {
|
|||
rawHtml: doc.rawHtml,
|
||||
html: doc.html,
|
||||
extract: doc.llm_extraction,
|
||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot ?? doc.mobileScreenshot,
|
||||
actions: doc.actions ?? undefined,
|
||||
warning: doc.warning ?? undefined,
|
||||
metadata: {
|
||||
|
|
|
@ -8,6 +8,7 @@ export const defaultPageOptions = {
|
|||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
mobileScreenshot: false,
|
||||
parsePDF: true
|
||||
};
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ export type Action = {
|
|||
} | {
|
||||
type: "screenshot",
|
||||
fullPage?: boolean,
|
||||
mobile?: boolean,
|
||||
} | {
|
||||
type: "write",
|
||||
text: string,
|
||||
|
@ -41,6 +42,7 @@ export type PageOptions = {
|
|||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
mobileScreenshot?: boolean;
|
||||
headers?: Record<string, string>;
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
parsePDF?: boolean;
|
||||
|
|
|
@ -588,6 +588,7 @@ export class WebScraperDataProvider {
|
|||
headers: options.pageOptions?.headers ?? undefined,
|
||||
includeLinks: options.pageOptions?.includeLinks ?? true,
|
||||
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
|
||||
mobileScreenshot: options.pageOptions?.mobileScreenshot ?? false,
|
||||
screenshot: options.pageOptions?.screenshot ?? false,
|
||||
useFastMode: options.pageOptions?.useFastMode ?? false,
|
||||
disableJsDom: options.pageOptions?.disableJsDom ?? false,
|
||||
|
|
|
@ -28,6 +28,7 @@ export async function scrapWithFireEngine({
|
|||
waitFor = 0,
|
||||
screenshot = false,
|
||||
fullPageScreenshot = false,
|
||||
mobileScreenshot = false,
|
||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
|
||||
fireEngineOptions = {},
|
||||
headers,
|
||||
|
@ -40,6 +41,7 @@ export async function scrapWithFireEngine({
|
|||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
mobileScreenshot?: boolean;
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
|
||||
fireEngineOptions?: FireEngineOptions;
|
||||
headers?: Record<string, string>;
|
||||
|
@ -64,9 +66,12 @@ export async function scrapWithFireEngine({
|
|||
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
|
||||
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||
let mobileScreenshotParam = reqParams["params"]?.mobileScreenshot ?? mobileScreenshot;
|
||||
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||
|
||||
|
||||
console.log("fullPageScreenshotParam", fullPageScreenshotParam);
|
||||
console.log("mobileScreenshotParam", mobileScreenshotParam);
|
||||
let endpoint = "/scrape";
|
||||
|
||||
if(options?.endpoint === "request") {
|
||||
|
@ -103,6 +108,8 @@ export async function scrapWithFireEngine({
|
|||
name: "Call to fire-engine"
|
||||
}, async span => {
|
||||
|
||||
console.log("mobileScreenshotParam", mobileScreenshotParam);
|
||||
console.log("pageOptions", pageOptions);
|
||||
return await axiosInstance.post(
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
{
|
||||
|
@ -111,6 +118,7 @@ export async function scrapWithFireEngine({
|
|||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
mobile: mobileScreenshotParam,
|
||||
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||
priority,
|
||||
engine,
|
||||
|
|
|
@ -146,6 +146,7 @@ export async function scrapSingleUrl(
|
|||
waitFor: pageOptions.waitFor ?? undefined,
|
||||
screenshot: pageOptions.screenshot ?? false,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
|
||||
mobileScreenshot: pageOptions.mobileScreenshot ?? false,
|
||||
headers: pageOptions.headers ?? undefined,
|
||||
includeLinks: pageOptions.includeLinks ?? true,
|
||||
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
|
@ -223,6 +224,7 @@ export async function scrapSingleUrl(
|
|||
return [action as Action];
|
||||
}) ?? [] as Action[];
|
||||
|
||||
console.log("pageOptions", pageOptions);
|
||||
const response = await scrapWithFireEngine({
|
||||
url,
|
||||
...(engine === "chrome-cdp" ? ({
|
||||
|
@ -231,9 +233,10 @@ export async function scrapSingleUrl(
|
|||
type: "wait" as const,
|
||||
milliseconds: pageOptions.waitFor,
|
||||
}] : []),
|
||||
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
|
||||
...((pageOptions.screenshot || pageOptions.fullPageScreenshot || pageOptions.mobileScreenshot) ? [{
|
||||
type: "screenshot" as const,
|
||||
fullPage: !!pageOptions.fullPageScreenshot,
|
||||
mobile: !!pageOptions.mobileScreenshot,
|
||||
}] : []),
|
||||
...processedActions,
|
||||
],
|
||||
|
@ -241,7 +244,9 @@ export async function scrapSingleUrl(
|
|||
waitFor: pageOptions.waitFor,
|
||||
screenshot: pageOptions.screenshot,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
||||
mobileScreenshot: pageOptions.mobileScreenshot,
|
||||
})),
|
||||
mobileScreenshot:pageOptions.mobileScreenshot,
|
||||
pageOptions: pageOptions,
|
||||
headers: pageOptions.headers,
|
||||
fireEngineOptions: {
|
||||
|
@ -253,7 +258,7 @@ export async function scrapSingleUrl(
|
|||
teamId,
|
||||
});
|
||||
scraperResponse.text = response.html;
|
||||
if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
|
||||
if (pageOptions.screenshot || pageOptions.fullPageScreenshot || pageOptions.mobileScreenshot) {
|
||||
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
|
||||
}
|
||||
if (pageOptions.actions) {
|
||||
|
@ -389,7 +394,7 @@ export async function scrapSingleUrl(
|
|||
const scrapersInOrder = getScrapingFallbackOrder(
|
||||
defaultScraper,
|
||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot || pageOptions.mobileScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true || pageOptions.mobileScreenshot === true),
|
||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
|
||||
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
|
||||
);
|
||||
|
|
|
@ -75,7 +75,7 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
|
|||
* Defines the options and configurations available for scraping web content.
|
||||
*/
|
||||
export interface CrawlScrapeOptions {
|
||||
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "extract")[];
|
||||
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "screenshot@fullPage" | "screenshot@mobile" | "extract")[];
|
||||
headers?: Record<string, string>;
|
||||
includeTags?: string[];
|
||||
excludeTags?: string[];
|
||||
|
|
Loading…
Reference in New Issue
Block a user