mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
expose engine results tracker for ScrapeEvents implementation
Some checks are pending
STAGING Deploy Images to GHCR / push-app-image (push) Waiting to run
Some checks are pending
STAGING Deploy Images to GHCR / push-app-image (push) Waiting to run
This commit is contained in:
parent
be40dcb217
commit
461eda8d33
|
@ -18,6 +18,7 @@ export type ScrapeUrlResponse = ({
|
|||
error: any,
|
||||
}) & {
|
||||
logs: any[],
|
||||
engines: EngineResultsTracker,
|
||||
}
|
||||
|
||||
export type Meta = {
|
||||
|
@ -116,7 +117,7 @@ export type InternalOptions = {
|
|||
v0DisableJsDom?: boolean;
|
||||
};
|
||||
|
||||
export type EngineResultsTracker = { [E in Engine]?: {
|
||||
export type EngineResultsTracker = { [E in Engine]?: ({
|
||||
state: "error",
|
||||
error: any,
|
||||
unexpected: boolean,
|
||||
|
@ -127,6 +128,9 @@ export type EngineResultsTracker = { [E in Engine]?: {
|
|||
unsupportedFeatures: Set<FeatureFlag>,
|
||||
} | {
|
||||
state: "timeout",
|
||||
}) & {
|
||||
startedAt: number,
|
||||
finishedAt: number,
|
||||
} };
|
||||
|
||||
export type EngineScrapeResultWithContext = {
|
||||
|
@ -135,6 +139,16 @@ export type EngineScrapeResultWithContext = {
|
|||
result: (EngineScrapeResult & { markdown: string }),
|
||||
};
|
||||
|
||||
function safeguardCircularError<T>(error: T): T {
|
||||
if (typeof error === "object" && error !== null && (error as any).results) {
|
||||
const newError = structuredClone(error);
|
||||
delete (newError as any).results;
|
||||
return newError;
|
||||
} else {
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
async function scrapeURLLoop(
|
||||
meta: Meta
|
||||
): Promise<ScrapeUrlResponse> {
|
||||
|
@ -149,6 +163,7 @@ async function scrapeURLLoop(
|
|||
let result: EngineScrapeResultWithContext | null = null;
|
||||
|
||||
for (const { engine, unsupportedFeatures } of fallbackList) {
|
||||
const startedAt = Date.now();
|
||||
try {
|
||||
meta.logger.info("Scraping via " + engine + "...");
|
||||
const _engineResult = await scrapeURLWithEngine(meta, engine);
|
||||
|
@ -167,6 +182,8 @@ async function scrapeURLLoop(
|
|||
result: engineResult,
|
||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
||||
unsupportedFeatures,
|
||||
startedAt,
|
||||
finishedAt: Date.now(),
|
||||
};
|
||||
|
||||
// NOTE: TODO: what to do when status code is bad is tough...
|
||||
|
@ -186,23 +203,40 @@ async function scrapeURLLoop(
|
|||
meta.logger.info("Engine " + engine + " could not scrape the page.", { error });
|
||||
results[engine] = {
|
||||
state: "error",
|
||||
error,
|
||||
error: safeguardCircularError(error),
|
||||
unexpected: false,
|
||||
startedAt,
|
||||
finishedAt: Date.now(),
|
||||
};
|
||||
} else if (error instanceof TimeoutError) {
|
||||
meta.logger.info("Engine " + engine + " timed out while scraping.", { error });
|
||||
results[engine] = {
|
||||
state: "timeout",
|
||||
startedAt,
|
||||
finishedAt: Date.now(),
|
||||
};
|
||||
} else if (error instanceof AddFeatureError) {
|
||||
throw error;
|
||||
} else if (error instanceof LLMRefusalError) {
|
||||
results[engine] = {
|
||||
state: "error",
|
||||
error: safeguardCircularError(error),
|
||||
unexpected: true,
|
||||
startedAt,
|
||||
finishedAt: Date.now(),
|
||||
}
|
||||
error.results = results;
|
||||
meta.logger.warn("LLM refusal encountered", { error });
|
||||
throw error;
|
||||
} else {
|
||||
Sentry.captureException(error);
|
||||
meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error });
|
||||
results[engine] = {
|
||||
state: "error",
|
||||
error,
|
||||
error: safeguardCircularError(error),
|
||||
unexpected: true,
|
||||
startedAt,
|
||||
finishedAt: Date.now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -237,6 +271,7 @@ async function scrapeURLLoop(
|
|||
success: true,
|
||||
document,
|
||||
logs: meta.logs,
|
||||
engines: results,
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -261,19 +296,25 @@ export async function scrapeURL(
|
|||
}
|
||||
}
|
||||
} catch (error) {
|
||||
let results: EngineResultsTracker = {};
|
||||
|
||||
if (error instanceof NoEnginesLeftError) {
|
||||
meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
|
||||
results = error.results;
|
||||
} else if (error instanceof LLMRefusalError) {
|
||||
meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
|
||||
results = error.results!;
|
||||
} else {
|
||||
Sentry.captureException(error);
|
||||
meta.logger.error("scrapeURL: Unexpected error happened", { error });
|
||||
// TODO: results?
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error,
|
||||
logs: meta.logs,
|
||||
engines: results,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,14 +3,18 @@ import { encoding_for_model } from "@dqbd/tiktoken";
|
|||
import { TiktokenModel } from "@dqbd/tiktoken";
|
||||
import { Document, ExtractOptions } from "../../../controllers/v1/types";
|
||||
import { Logger } from "winston";
|
||||
import { Meta } from "..";
|
||||
import { EngineResultsTracker, Meta } from "..";
|
||||
|
||||
const maxTokens = 32000;
|
||||
const modifier = 4;
|
||||
|
||||
export class LLMRefusalError extends Error {
|
||||
public refusal: string;
|
||||
public results: EngineResultsTracker | undefined;
|
||||
|
||||
constructor(refusal: string) {
|
||||
super("LLM refused to extract the website's content", { cause: { refusal } })
|
||||
super("LLM refused to extract the website's content")
|
||||
this.refusal = refusal;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user