mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Nick: added max retries for fire-engine
This commit is contained in:
parent
a9773a24a3
commit
4298cb6cc4
|
@ -95,62 +95,77 @@ export async function scrapWithFireEngine({
|
|||
});
|
||||
|
||||
const startTime = Date.now();
|
||||
const _response = await Sentry.startSpan({
|
||||
name: "Call to fire-engine"
|
||||
}, async span => {
|
||||
|
||||
return await axiosInstance.post(
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
{
|
||||
url: url,
|
||||
headers: headers,
|
||||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||
priority,
|
||||
engine,
|
||||
instantReturn: true,
|
||||
...fireEngineOptionsParam,
|
||||
atsv: pageOptions?.atsv ?? false,
|
||||
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
||||
actions: actions,
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
...(Sentry.isInitialized() ? ({
|
||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||
"baggage": Sentry.spanToBaggageHeader(span),
|
||||
}) : {}),
|
||||
const maxRetries = 3;
|
||||
let retryCount = 0;
|
||||
let _response;
|
||||
let checkStatusResponse;
|
||||
|
||||
while (retryCount <= maxRetries) {
|
||||
_response = await Sentry.startSpan({
|
||||
name: "Call to fire-engine"
|
||||
}, async span => {
|
||||
return await axiosInstance.post(
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
{
|
||||
url: url,
|
||||
headers: headers,
|
||||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||
priority,
|
||||
engine,
|
||||
instantReturn: true,
|
||||
...fireEngineOptionsParam,
|
||||
atsv: pageOptions?.atsv ?? false,
|
||||
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
||||
actions: actions,
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
...(Sentry.isInitialized() ? ({
|
||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||
"baggage": Sentry.spanToBaggageHeader(span),
|
||||
}) : {}),
|
||||
}
|
||||
}
|
||||
}
|
||||
);
|
||||
});
|
||||
|
||||
const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);
|
||||
|
||||
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
|
||||
// added 5 seconds to the timeout to account for 'smart wait'
|
||||
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal + 5000) {
|
||||
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
|
||||
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
}
|
||||
|
||||
if (checkStatusResponse.data.processing) {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
|
||||
axiosInstance.delete(
|
||||
process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`, {
|
||||
validateStatus: (status) => true
|
||||
}
|
||||
).catch((error) => {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
|
||||
);
|
||||
});
|
||||
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
|
||||
logParams.error_message = "Request timed out";
|
||||
return { html: "", pageStatusCode: null, pageError: "" };
|
||||
|
||||
const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);
|
||||
|
||||
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
|
||||
// added 5 seconds to the timeout to account for 'smart wait'
|
||||
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal + 5000) {
|
||||
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
|
||||
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
}
|
||||
|
||||
if (checkStatusResponse.data.processing) {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
|
||||
axiosInstance.delete(
|
||||
process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`, {
|
||||
validateStatus: (status) => true
|
||||
}
|
||||
).catch((error) => {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
|
||||
});
|
||||
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
|
||||
logParams.error_message = "Request timed out";
|
||||
return { html: "", pageStatusCode: null, pageError: "" };
|
||||
}
|
||||
|
||||
if (!checkStatusResponse.data.processing && (checkStatusResponse.status === 200 && (checkStatusResponse.data.pageStatusCode >= 200 && checkStatusResponse.data.pageStatusCode < 300 || checkStatusResponse.data.pageStatusCode === 404))) {
|
||||
break;
|
||||
}
|
||||
|
||||
retryCount++;
|
||||
if (retryCount <= maxRetries) {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Retrying request for ${url}. Attempt ${retryCount} of ${maxRetries}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
|
||||
|
|
|
@ -410,12 +410,11 @@ export async function scrapSingleUrl(
|
|||
if (attempt.pageStatusCode) {
|
||||
pageStatusCode = attempt.pageStatusCode;
|
||||
}
|
||||
// force pageError if it's the last scraper and it failed too
|
||||
if (attempt.pageError && (attempt.pageStatusCode >= 400 || scrapersInOrder.indexOf(scraper) === scrapersInOrder.length - 1)) { // force pageError if it's the last scraper and it failed too
|
||||
pageError = attempt.pageError;
|
||||
|
||||
if (attempt.pageStatusCode < 400 || !attempt.pageStatusCode) {
|
||||
pageStatusCode = 500;
|
||||
}
|
||||
pageStatusCode = attempt.pageStatusCode ?? 500;
|
||||
|
||||
} else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
|
||||
pageError = undefined;
|
||||
}
|
||||
|
@ -424,8 +423,8 @@ export async function scrapSingleUrl(
|
|||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
|
||||
break;
|
||||
}
|
||||
if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
|
||||
if (pageStatusCode && (pageStatusCode === 404)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`);
|
||||
break;
|
||||
}
|
||||
// const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
|
|
Loading…
Reference in New Issue
Block a user