Compare commits

7 Commits

Author SHA1 Message Date
Gergő Móricz  63787bc504  fix(scrapeURL/fire-engine): wait longer if timeout is not specified  2024-11-15 20:25:16 +01:00
Gergő Móricz  4cddcd5206  fix(scrapeURL/fire-engine): timeout-less scrape support (initial)  2024-11-15 20:15:25 +01:00
Gergő Móricz  350d00d27a  fix(crawler): treat XML files as sitemaps (temporarily)  2024-11-15 20:09:20 +01:00
Gergő Móricz  ca2e33db0a  fix(log_job): add force option to retry on supabase failure  2024-11-15 19:55:23 +01:00
Gergő Móricz  7b02c45dd0  fix(v1/types): better timeout primitives  2024-11-15 19:35:54 +01:00
Gergő Móricz  c95a4a26c9  fix(v1/batch/scrape): raise default timeout  2024-11-15 18:58:03 +01:00
Móricz Gergő  3a342bfbf0  fix(scrapeURL/playwright): JSON body fix  2024-11-15 15:18:40 +01:00
7 changed files with 74 additions and 42 deletions

View File

@@ -119,7 +119,7 @@ export const scrapeOptions = z.object({
   includeTags: z.string().array().optional(),
   excludeTags: z.string().array().optional(),
   onlyMainContent: z.boolean().default(true),
-  timeout: z.number().int().positive().finite().safe().default(30000),
+  timeout: z.number().int().positive().finite().safe().optional(),
   waitFor: z.number().int().nonnegative().finite().safe().default(0),
   extract: extractOptions.optional(),
   mobile: z.boolean().default(false),
@@ -153,9 +153,10 @@ export const scrapeOptions = z.object({
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
-export const scrapeRequestSchema = scrapeOptions.extend({
+export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
   url,
   origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(30000),
 }).strict(strictMessage).refine(
   (obj) => {
     const hasExtractFormat = obj.formats?.includes("extract");
@@ -199,12 +200,7 @@ export const batchScrapeRequestSchema = scrapeOptions.extend({
   {
     message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
   }
-).transform((obj) => {
-  if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-    return { ...obj, timeout: 60000 };
-  }
-  return obj;
-});
+);
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
@@ -235,7 +231,7 @@ export type CrawlerOptions = z.infer<typeof crawlerOptions>;
 export const crawlRequestSchema = crawlerOptions.extend({
   url,
   origin: z.string().optional().default("api"),
-  scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
+  scrapeOptions: scrapeOptions.default({}),
   webhook: webhookSchema.optional(),
   limit: z.number().default(10000),
 }).strict(strictMessage);
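
The net effect: the shared scrapeOptions schema no longer forces a 30 s default timeout, and only direct scrape requests re-add it, so schemas that reuse scrapeOptions (crawl, batch scrape) can represent "no timeout" at all. A minimal sketch of this omit-then-extend zod pattern, with illustrative schema names rather than the real Firecrawl ones:

import { z } from "zod";

// Base options: timeout is now merely optional, with no default.
const baseOptions = z.object({
  onlyMainContent: z.boolean().default(true),
  timeout: z.number().int().positive().optional(),
});

// Direct scrape requests drop the inherited field and re-add it with a default...
const scrapeRequest = baseOptions.omit({ timeout: true }).extend({
  timeout: z.number().int().positive().default(30000),
});

// ...while crawl-style consumers reuse the base schema, leaving timeout unset.
const crawlScrapeOptions = baseOptions.default({});

console.log(scrapeRequest.parse({}).timeout);              // 30000
console.log(crawlScrapeOptions.parse(undefined).timeout);  // undefined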

View File

@@ -362,7 +362,7 @@ export class WebCrawler {
   };
-  const sitemapUrl = url.endsWith("/sitemap.xml")
+  const sitemapUrl = url.endsWith(".xml")
     ? url
     : `${url}/sitemap.xml`;
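
The temporary fix widens sitemap detection: any URL already ending in .xml is fetched as a sitemap directly instead of getting /sitemap.xml appended. Illustrative inputs (example URLs are hypothetical):

// "https://example.com/sitemap.xml"       -> used as-is (old and new behavior)
// "https://example.com/sitemap_index.xml" -> used as-is (new behavior only)
// "https://example.com"                   -> "https://example.com/sitemap.xml"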

View File

@@ -87,6 +87,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<EngineScrapeResult> {
   priority: meta.internalOptions.priority,
   geolocation: meta.options.geolocation,
   mobile: meta.options.mobile,
+  timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
   // TODO: scrollXPaths
 };
@@ -95,7 +96,9 @@
 let response = await performFireEngineScrape(
   meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
   request,
-  defaultTimeout + totalWait,
+  meta.options.timeout !== undefined
+    ? defaultTimeout + totalWait
+    : Infinity, // TODO: better timeout handling
 );
 specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);
@@ -140,12 +143,16 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<EngineScrapeResult> {
   fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
   wait: meta.options.waitFor,
   geolocation: meta.options.geolocation,
+  timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
 };
 let response = await performFireEngineScrape(
   meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
   request,
-  defaultTimeout + meta.options.waitFor
+  meta.options.timeout !== undefined
+    ? defaultTimeout + meta.options.waitFor
+    : Infinity, // TODO: better timeout handling
 );
 specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);
@@ -179,11 +186,16 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<EngineScrapeResult> {
   atsv: meta.internalOptions.atsv,
   geolocation: meta.options.geolocation,
   disableJsDom: meta.internalOptions.v0DisableJsDom,
+  timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
 };
 let response = await performFireEngineScrape(
   meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
   request,
+  meta.options.timeout !== undefined
+    ? defaultTimeout
+    : Infinity, // TODO: better timeout handling
 );
 specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);
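
All three engines now apply the same two-sided rule: when the caller supplied no timeout, the request asks fire-engine for a generous 300 s server-side timeout and the local wait passed to performFireEngineScrape becomes unbounded; when a timeout is present, the old defaultTimeout-plus-wait bound applies. A hypothetical helper capturing the repeated logic (defaultTimeout stands in for the module's constant; its value here is only an assumption):

const defaultTimeout = 10000; // assumption: illustrative stand-in value

function fireEngineTimeouts(userTimeout: number | undefined, extraWait: number) {
  return {
    // Sent in the fire-engine request body: only pin a long server-side
    // timeout when the user gave none, otherwise defer to fire-engine.
    requestTimeout: userTimeout === undefined ? 300000 : undefined,
    // Bound on the local wait for the scrape to finish: finite when the
    // user set a timeout, unbounded (Infinity) when they did not.
    pollTimeout: userTimeout !== undefined ? defaultTimeout + extraWait : Infinity,
  };
}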

View File

@@ -25,6 +25,8 @@ export type FireEngineScrapeRequestCommon = {
   logRequest?: boolean; // default: true
   instantReturn?: boolean; // default: false
   geolocation?: { country?: string; languages?: string[]; };
+  timeout?: number;
 }
 export type FireEngineScrapeRequestChromeCDP = {

View File

@@ -13,12 +13,12 @@ export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeResult> {
   headers: {
     "Content-Type": "application/json",
   },
-  body: JSON.stringify({
+  body: {
     url: meta.url,
     wait_after_load: meta.options.waitFor,
     timeout,
     headers: meta.options.headers,
-  }),
+  },
   method: "POST",
   logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
   schema: z.object({
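
The fix addresses double serialization: robustFetch evidently encodes the body itself, so pre-stringifying it meant the Playwright service received a JSON string literal rather than an object. A quick sketch of the failure mode:

const payload = { url: "https://example.com", timeout: 30000 };

// What the service effectively got before the fix (encoded twice):
console.log(JSON.stringify(JSON.stringify(payload)));
// => "\"{\\\"url\\\":\\\"https://example.com\\\",\\\"timeout\\\":30000}\""

// What it gets now (encoded once, by the fetch helper):
console.log(JSON.stringify(payload));
// => {"url":"https://example.com","timeout":30000}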

View File

@@ -7,7 +7,7 @@ import { logger } from "../../lib/logger";
 import { configDotenv } from "dotenv";
 configDotenv();
-export async function logJob(job: FirecrawlJob) {
+export async function logJob(job: FirecrawlJob, force: boolean = false) {
   try {
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
     if (!useDbAuthentication) {
@@ -23,28 +23,52 @@ export async function logJob(job: FirecrawlJob) {
       job.scrapeOptions.headers["Authorization"] = "REDACTED";
       job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
     }
+    const jobColumn = {
+      job_id: job.job_id ? job.job_id : null,
+      success: job.success,
+      message: job.message,
+      num_docs: job.num_docs,
+      docs: job.docs,
+      time_taken: job.time_taken,
+      team_id: job.team_id === "preview" ? null : job.team_id,
+      mode: job.mode,
+      url: job.url,
+      crawler_options: job.crawlerOptions,
+      page_options: job.scrapeOptions,
+      origin: job.origin,
+      num_tokens: job.num_tokens,
+      retry: !!job.retry,
+      crawl_id: job.crawl_id,
+    };
-    const { data, error } = await supabase_service
-      .from("firecrawl_jobs")
-      .insert([
-        {
-          job_id: job.job_id ? job.job_id : null,
-          success: job.success,
-          message: job.message,
-          num_docs: job.num_docs,
-          docs: job.docs,
-          time_taken: job.time_taken,
-          team_id: job.team_id === "preview" ? null : job.team_id,
-          mode: job.mode,
-          url: job.url,
-          crawler_options: job.crawlerOptions,
-          page_options: job.scrapeOptions,
-          origin: job.origin,
-          num_tokens: job.num_tokens,
-          retry: !!job.retry,
-          crawl_id: job.crawl_id,
-        },
-      ]);
+    if (force) {
+      while (true) {
+        try {
+          const { error } = await supabase_service
+            .from("firecrawl_jobs")
+            .insert([jobColumn]);
+          if (error) {
+            logger.error("Failed to log job due to Supabase error -- trying again", { error, scrapeId: job.job_id });
+            await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+          } else {
+            break;
+          }
+        } catch (error) {
+          logger.error("Failed to log job due to thrown error -- trying again", { error, scrapeId: job.job_id });
+          await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+        }
+      }
+      logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+    } else {
+      const { error } = await supabase_service
+        .from("firecrawl_jobs")
+        .insert([jobColumn]);
+      if (error) {
+        logger.error(`Error logging job: ${error.message}`, { error, scrapeId: job.job_id });
+      } else {
+        logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+      }
+    }
     if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
       let phLog = {
@@ -72,9 +96,7 @@ export async function logJob(job: FirecrawlJob) {
       posthog.capture(phLog);
     }
   }
-    if (error) {
-      logger.error(`Error logging job: ${error.message}`);
-    }
   } catch (error) {
     logger.error(`Error logging job: ${error.message}`);
   }
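
With force set, logJob now retries the Supabase insert indefinitely, sleeping 75 ms between attempts, whether the failure surfaces as a returned error or a thrown one; the default path still logs a single failure and moves on. The pattern generalizes to something like the following (names are illustrative, not part of the diff):

// Retry an async operation forever with a fixed delay until it succeeds.
async function insistently<T>(op: () => Promise<T>, delayMs = 75): Promise<T> {
  while (true) {
    try {
      return await op();
    } catch (error) {
      console.error("operation failed -- trying again", error);
      await new Promise<void>((resolve) => setTimeout(resolve, delayMs));
    }
  }
}

There is no attempt cap or backoff, presumably acceptable because force is only passed for job records that must not be lost; a prolonged Supabase outage would, however, block the caller.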

View File

@@ -346,7 +346,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     scrapeOptions: job.data.scrapeOptions,
     origin: job.data.origin,
     crawl_id: job.data.crawl_id,
-  });
+  }, true);
   await addCrawlJobDone(job.data.crawl_id, job.id);
@@ -486,7 +486,7 @@
     url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"),
     crawlerOptions: sc.crawlerOptions,
     origin: job.data.origin,
-  });
+  }, true);
       }
     }
   }
@@ -566,7 +566,7 @@
     scrapeOptions: job.data.scrapeOptions,
     origin: job.data.origin,
     crawl_id: job.data.crawl_id,
-  });
+  }, true);
   // await logJob({
   //   job_id: job.data.crawl_id,