Mirror of https://github.com/mendableai/firecrawl.git, synced 2024-11-16 03:32:22 +08:00

Compare commits: 3c1b1909f8 ... 63787bc504 (7 commits)

- 63787bc504
- 4cddcd5206
- 350d00d27a
- ca2e33db0a
- 7b02c45dd0
- c95a4a26c9
- 3a342bfbf0
```diff
@@ -119,7 +119,7 @@ export const scrapeOptions = z.object({
   includeTags: z.string().array().optional(),
   excludeTags: z.string().array().optional(),
   onlyMainContent: z.boolean().default(true),
-  timeout: z.number().int().positive().finite().safe().default(30000),
+  timeout: z.number().int().positive().finite().safe().optional(),
   waitFor: z.number().int().nonnegative().finite().safe().default(0),
   extract: extractOptions.optional(),
   mobile: z.boolean().default(false),
@@ -153,9 +153,10 @@ export const scrapeOptions = z.object({
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
 
-export const scrapeRequestSchema = scrapeOptions.extend({
+export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
   url,
   origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(30000),
 }).strict(strictMessage).refine(
   (obj) => {
     const hasExtractFormat = obj.formats?.includes("extract");
@@ -199,12 +200,7 @@ export const batchScrapeRequestSchema = scrapeOptions.extend({
   {
     message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
   }
-).transform((obj) => {
-  if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-    return { ...obj, timeout: 60000 };
-  }
-  return obj;
-});
+);
 
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
 
@@ -235,7 +231,7 @@ export type CrawlerOptions = z.infer<typeof crawlerOptions>;
 export const crawlRequestSchema = crawlerOptions.extend({
   url,
   origin: z.string().optional().default("api"),
-  scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
+  scrapeOptions: scrapeOptions.default({}),
   webhook: webhookSchema.optional(),
   limit: z.number().default(10000),
 }).strict(strictMessage);
```
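
Net effect of the schema changes above: `timeout` becomes optional on the shared `scrapeOptions` object, the 30 000 ms default moves onto `scrapeRequestSchema` itself, and `crawlRequestSchema` stops omitting the field, so nested `scrapeOptions` in a crawl carry no implicit timeout. A minimal sketch of that pattern, using simplified stand-in schemas rather than the real Firecrawl ones:

```ts
import { z } from "zod";

// Simplified stand-ins for the real schemas: the base options leave `timeout`
// optional, and only the direct scrape request re-attaches a 30 s default.
const baseOptions = z.object({
  onlyMainContent: z.boolean().default(true),
  timeout: z.number().int().positive().finite().safe().optional(),
});

const scrapeRequest = baseOptions.omit({ timeout: true }).extend({
  url: z.string().url(),
  timeout: z.number().int().positive().finite().safe().default(30000),
});

const crawlRequest = z.object({
  url: z.string().url(),
  scrapeOptions: baseOptions.default({}),
});

// Direct scrape requests still get the 30 000 ms default.
console.log(scrapeRequest.parse({ url: "https://example.com" }).timeout); // 30000

// Crawl-level scrapeOptions no longer carry a timeout unless the caller sets one.
console.log(crawlRequest.parse({ url: "https://example.com" }).scrapeOptions.timeout); // undefined
```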
```diff
@@ -362,7 +362,7 @@ export class WebCrawler {
     };
 
 
-    const sitemapUrl = url.endsWith("/sitemap.xml")
+    const sitemapUrl = url.endsWith(".xml")
       ? url
       : `${url}/sitemap.xml`;
 
```
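
The crawler change above widens the sitemap check: previously only URLs already ending in `/sitemap.xml` were used as-is; now any URL ending in `.xml` (for example a sitemap index) is passed through unchanged, and `/sitemap.xml` is appended otherwise. A hypothetical standalone helper showing the same derivation (the real logic lives inline in `WebCrawler`):

```ts
// Hypothetical helper mirroring the new inline check in WebCrawler:
// any ".xml" URL is treated as a sitemap, otherwise "/sitemap.xml" is appended.
function deriveSitemapUrl(url: string): string {
  return url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
}

console.log(deriveSitemapUrl("https://example.com/sitemap_index.xml")); // used as-is
console.log(deriveSitemapUrl("https://example.com")); // "https://example.com/sitemap.xml"
```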
```diff
@@ -87,6 +87,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<EngineScrapeResult> {
     priority: meta.internalOptions.priority,
     geolocation: meta.options.geolocation,
     mobile: meta.options.mobile,
+    timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
     // TODO: scrollXPaths
   };
 
@@ -95,7 +96,9 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<EngineScrapeResult> {
   let response = await performFireEngineScrape(
     meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
     request,
-    defaultTimeout + totalWait,
+    meta.options.timeout !== undefined
+      ? defaultTimeout + totalWait
+      : Infinity, // TODO: better timeout handling
   );
 
   specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);
@@ -140,12 +143,16 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<EngineScrapeResult> {
     fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
     wait: meta.options.waitFor,
     geolocation: meta.options.geolocation,
+
+    timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
   };
 
   let response = await performFireEngineScrape(
     meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
     request,
-    defaultTimeout + meta.options.waitFor
+    meta.options.timeout !== undefined
+      ? defaultTimeout + meta.options.waitFor
+      : Infinity, // TODO: better timeout handling
   );
 
   specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);
@@ -179,11 +186,16 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<EngineScrapeResult> {
     atsv: meta.internalOptions.atsv,
     geolocation: meta.options.geolocation,
     disableJsDom: meta.internalOptions.v0DisableJsDom,
+
+    timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
   };
 
   let response = await performFireEngineScrape(
     meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
     request,
+    meta.options.timeout !== undefined
+      ? defaultTimeout
+      : Infinity, // TODO: better timeout handling
   );
 
   specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);
```
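
All three fire-engine engines above now handle timeouts the same way: when the caller sets no `timeout`, a 300 000 ms timeout is forwarded to fire-engine and the local wait on `performFireEngineScrape` is left unbounded (`Infinity`); when the caller does set one, nothing extra is forwarded and the local wait is capped at `defaultTimeout` plus any configured page wait. The request type gains a matching optional `timeout` field in the next diff. A hypothetical helper condensing that selection (the option names come from the diff; `defaultTimeout` is defined elsewhere in the engine code and is taken as a parameter here):

```ts
// Hypothetical condensation of the timeout selection used by the engines above.
function fireEngineTimeouts(
  options: { timeout?: number; waitFor: number },
  defaultTimeout: number,
): { requestTimeout: number | undefined; pollTimeout: number } {
  return {
    // Forwarded to fire-engine: fall back to 300 s only when the caller set nothing.
    requestTimeout: options.timeout === undefined ? 300000 : undefined,
    // Local wait for performFireEngineScrape: bounded only when a caller timeout exists.
    pollTimeout: options.timeout !== undefined ? defaultTimeout + options.waitFor : Infinity,
  };
}
```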
```diff
@@ -25,6 +25,8 @@ export type FireEngineScrapeRequestCommon = {
   logRequest?: boolean; // default: true
   instantReturn?: boolean; // default: false
   geolocation?: { country?: string; languages?: string[]; };
+
+  timeout?: number;
 }
 
 export type FireEngineScrapeRequestChromeCDP = {
```
```diff
@@ -13,12 +13,12 @@ export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeResult> {
     headers: {
       "Content-Type": "application/json",
     },
-    body: JSON.stringify({
+    body: {
       url: meta.url,
       wait_after_load: meta.options.waitFor,
       timeout,
       headers: meta.options.headers,
-    }),
+    },
     method: "POST",
     logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
     schema: z.object({
```
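
The Playwright engine now hands `robustFetch` a plain object as the request body instead of a pre-serialized JSON string, which suggests the fetch helper owns serialization itself. A minimal illustration of that division of responsibility, using a generic wrapper rather than the real `robustFetch` (which also validates the response against the supplied zod schema):

```ts
// Hypothetical minimal wrapper showing why the call site can pass a plain object:
// the wrapper, not the caller, is responsible for JSON serialization.
async function postJson<T>(url: string, body: unknown): Promise<T> {
  const res = await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body), // serialization happens here, not at the call site
  });
  if (!res.ok) {
    throw new Error(`Request failed with status ${res.status}`);
  }
  return (await res.json()) as T;
}
```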
```diff
@@ -7,7 +7,7 @@ import { logger } from "../../lib/logger";
 import { configDotenv } from "dotenv";
 configDotenv();
 
-export async function logJob(job: FirecrawlJob) {
+export async function logJob(job: FirecrawlJob, force: boolean = false) {
   try {
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
     if (!useDbAuthentication) {
@@ -23,28 +23,52 @@ export async function logJob(job: FirecrawlJob) {
       job.scrapeOptions.headers["Authorization"] = "REDACTED";
       job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
     }
+    const jobColumn = {
+      job_id: job.job_id ? job.job_id : null,
+      success: job.success,
+      message: job.message,
+      num_docs: job.num_docs,
+      docs: job.docs,
+      time_taken: job.time_taken,
+      team_id: job.team_id === "preview" ? null : job.team_id,
+      mode: job.mode,
+      url: job.url,
+      crawler_options: job.crawlerOptions,
+      page_options: job.scrapeOptions,
+      origin: job.origin,
+      num_tokens: job.num_tokens,
+      retry: !!job.retry,
+      crawl_id: job.crawl_id,
+    };
 
-    const { data, error } = await supabase_service
-      .from("firecrawl_jobs")
-      .insert([
-        {
-          job_id: job.job_id ? job.job_id : null,
-          success: job.success,
-          message: job.message,
-          num_docs: job.num_docs,
-          docs: job.docs,
-          time_taken: job.time_taken,
-          team_id: job.team_id === "preview" ? null : job.team_id,
-          mode: job.mode,
-          url: job.url,
-          crawler_options: job.crawlerOptions,
-          page_options: job.scrapeOptions,
-          origin: job.origin,
-          num_tokens: job.num_tokens,
-          retry: !!job.retry,
-          crawl_id: job.crawl_id,
-        },
-      ]);
+    if (force) {
+      while (true) {
+        try {
+          const { error } = await supabase_service
+            .from("firecrawl_jobs")
+            .insert([jobColumn]);
+          if (error) {
+            logger.error("Failed to log job due to Supabase error -- trying again", { error, scrapeId: job.job_id });
+            await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+          } else {
+            break;
+          }
+        } catch (error) {
+          logger.error("Failed to log job due to thrown error -- trying again", { error, scrapeId: job.job_id });
+          await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+        }
+      }
+      logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+    } else {
+      const { error } = await supabase_service
+        .from("firecrawl_jobs")
+        .insert([jobColumn]);
+      if (error) {
+        logger.error(`Error logging job: ${error.message}`, { error, scrapeId: job.job_id });
+      } else {
+        logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+      }
+    }
 
     if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
       let phLog = {
@@ -72,9 +96,7 @@ export async function logJob(job: FirecrawlJob) {
       posthog.capture(phLog);
     }
   }
-    if (error) {
-      logger.error(`Error logging job: ${error.message}`);
-    }
 
   } catch (error) {
     logger.error(`Error logging job: ${error.message}`);
   }
```
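
`logJob` now builds the row once as `jobColumn` and accepts a `force` flag: with `force` set, the Supabase insert is retried in a loop, sleeping 75 ms after every failed or thrown attempt, until the row is written; without it, a single insert is attempted and a failure is only logged. A stripped-down sketch of that retry loop, with `insertRow` standing in for the Supabase call:

```ts
// Stripped-down sketch of the force-retry pattern above: keep retrying an insert
// that reports an error or throws, sleeping 75 ms between attempts.
async function insertWithRetry(
  insertRow: () => Promise<{ error: unknown | null }>,
  delayMs = 75,
): Promise<void> {
  while (true) {
    try {
      const { error } = await insertRow();
      if (!error) return;
      console.error("Insert failed -- trying again", error);
    } catch (error) {
      console.error("Insert threw -- trying again", error);
    }
    await new Promise<void>((resolve) => setTimeout(resolve, delayMs));
  }
}
```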
```diff
@@ -346,7 +346,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         scrapeOptions: job.data.scrapeOptions,
         origin: job.data.origin,
         crawl_id: job.data.crawl_id,
-      });
+      }, true);
 
       await addCrawlJobDone(job.data.crawl_id, job.id);
 
@@ -486,7 +486,7 @@ async function processJob(job: Job & { id: string }, token: string) {
           url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"),
           crawlerOptions: sc.crawlerOptions,
           origin: job.data.origin,
-        });
+        }, true);
       }
     }
   }
@@ -566,7 +566,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         scrapeOptions: job.data.scrapeOptions,
         origin: job.data.origin,
         crawl_id: job.data.crawl_id,
-      });
+      }, true);
 
       // await logJob({
       //   job_id: job.data.crawl_id,
```
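
The trailing `true` at these call sites supplies the new `force` flag to `logJob`, so the crawl-completion records written here are retried until they are persisted rather than being lost on a transient database error.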