mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
expose engine results tracker for ScrapeEvents implementation
Some checks are pending
STAGING Deploy Images to GHCR / push-app-image (push) Waiting to run
Some checks are pending
STAGING Deploy Images to GHCR / push-app-image (push) Waiting to run
This commit is contained in:
parent
be40dcb217
commit
461eda8d33
|
@ -18,6 +18,7 @@ export type ScrapeUrlResponse = ({
|
||||||
error: any,
|
error: any,
|
||||||
}) & {
|
}) & {
|
||||||
logs: any[],
|
logs: any[],
|
||||||
|
engines: EngineResultsTracker,
|
||||||
}
|
}
|
||||||
|
|
||||||
export type Meta = {
|
export type Meta = {
|
||||||
|
@ -116,7 +117,7 @@ export type InternalOptions = {
|
||||||
v0DisableJsDom?: boolean;
|
v0DisableJsDom?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type EngineResultsTracker = { [E in Engine]?: {
|
export type EngineResultsTracker = { [E in Engine]?: ({
|
||||||
state: "error",
|
state: "error",
|
||||||
error: any,
|
error: any,
|
||||||
unexpected: boolean,
|
unexpected: boolean,
|
||||||
|
@ -127,6 +128,9 @@ export type EngineResultsTracker = { [E in Engine]?: {
|
||||||
unsupportedFeatures: Set<FeatureFlag>,
|
unsupportedFeatures: Set<FeatureFlag>,
|
||||||
} | {
|
} | {
|
||||||
state: "timeout",
|
state: "timeout",
|
||||||
|
}) & {
|
||||||
|
startedAt: number,
|
||||||
|
finishedAt: number,
|
||||||
} };
|
} };
|
||||||
|
|
||||||
export type EngineScrapeResultWithContext = {
|
export type EngineScrapeResultWithContext = {
|
||||||
|
@ -135,6 +139,16 @@ export type EngineScrapeResultWithContext = {
|
||||||
result: (EngineScrapeResult & { markdown: string }),
|
result: (EngineScrapeResult & { markdown: string }),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Returns a version of `error` that is safe to store inside an engine results
 * tracker without creating an `error -> results -> error` reference cycle.
 *
 * If `error` is a non-null object carrying a truthy `results` property, a
 * shallow copy is returned that keeps the original prototype (so `instanceof`
 * checks keep working) and every own property except `results`. The input
 * itself is never mutated. Any other value is returned unchanged.
 *
 * A shallow, prototype-preserving copy is used instead of `structuredClone`
 * because `structuredClone` throws on non-cloneable values (e.g. functions)
 * and drops the subclass prototype and custom fields of Error instances; this
 * helper is called from `catch` handlers, where a secondary throw would mask
 * the original failure.
 *
 * @param error - Any caught value; typically an Error, but may be anything.
 * @returns `error` itself, or a copy of it without its own `results` property.
 */
function safeguardCircularError<T>(error: T): T {
  if (typeof error === "object" && error !== null && (error as any).results) {
    const source: object = error;
    const copy = Object.create(Object.getPrototypeOf(source));

    // Copy descriptors rather than values so non-enumerable own properties
    // and accessors are carried over exactly as they are defined.
    for (const key of Reflect.ownKeys(source)) {
      if (key === "results") {
        continue;
      }
      const descriptor = Object.getOwnPropertyDescriptor(source, key);
      if (descriptor !== undefined) {
        Object.defineProperty(copy, key, descriptor);
      }
    }

    return copy as T;
  } else {
    return error;
  }
}
|
||||||
|
|
||||||
async function scrapeURLLoop(
|
async function scrapeURLLoop(
|
||||||
meta: Meta
|
meta: Meta
|
||||||
): Promise<ScrapeUrlResponse> {
|
): Promise<ScrapeUrlResponse> {
|
||||||
|
@ -149,6 +163,7 @@ async function scrapeURLLoop(
|
||||||
let result: EngineScrapeResultWithContext | null = null;
|
let result: EngineScrapeResultWithContext | null = null;
|
||||||
|
|
||||||
for (const { engine, unsupportedFeatures } of fallbackList) {
|
for (const { engine, unsupportedFeatures } of fallbackList) {
|
||||||
|
const startedAt = Date.now();
|
||||||
try {
|
try {
|
||||||
meta.logger.info("Scraping via " + engine + "...");
|
meta.logger.info("Scraping via " + engine + "...");
|
||||||
const _engineResult = await scrapeURLWithEngine(meta, engine);
|
const _engineResult = await scrapeURLWithEngine(meta, engine);
|
||||||
|
@ -167,6 +182,8 @@ async function scrapeURLLoop(
|
||||||
result: engineResult,
|
result: engineResult,
|
||||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
||||||
unsupportedFeatures,
|
unsupportedFeatures,
|
||||||
|
startedAt,
|
||||||
|
finishedAt: Date.now(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// NOTE: TODO: what to do when status code is bad is tough...
|
// NOTE: TODO: what to do when status code is bad is tough...
|
||||||
|
@ -186,23 +203,40 @@ async function scrapeURLLoop(
|
||||||
meta.logger.info("Engine " + engine + " could not scrape the page.", { error });
|
meta.logger.info("Engine " + engine + " could not scrape the page.", { error });
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "error",
|
state: "error",
|
||||||
error,
|
error: safeguardCircularError(error),
|
||||||
unexpected: false,
|
unexpected: false,
|
||||||
|
startedAt,
|
||||||
|
finishedAt: Date.now(),
|
||||||
};
|
};
|
||||||
} else if (error instanceof TimeoutError) {
|
} else if (error instanceof TimeoutError) {
|
||||||
meta.logger.info("Engine " + engine + " timed out while scraping.", { error });
|
meta.logger.info("Engine " + engine + " timed out while scraping.", { error });
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "timeout",
|
state: "timeout",
|
||||||
|
startedAt,
|
||||||
|
finishedAt: Date.now(),
|
||||||
};
|
};
|
||||||
} else if (error instanceof AddFeatureError) {
|
} else if (error instanceof AddFeatureError) {
|
||||||
throw error;
|
throw error;
|
||||||
|
} else if (error instanceof LLMRefusalError) {
|
||||||
|
results[engine] = {
|
||||||
|
state: "error",
|
||||||
|
error: safeguardCircularError(error),
|
||||||
|
unexpected: true,
|
||||||
|
startedAt,
|
||||||
|
finishedAt: Date.now(),
|
||||||
|
}
|
||||||
|
error.results = results;
|
||||||
|
meta.logger.warn("LLM refusal encountered", { error });
|
||||||
|
throw error;
|
||||||
} else {
|
} else {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error });
|
meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error });
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "error",
|
state: "error",
|
||||||
error,
|
error: safeguardCircularError(error),
|
||||||
unexpected: true,
|
unexpected: true,
|
||||||
|
startedAt,
|
||||||
|
finishedAt: Date.now(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -237,6 +271,7 @@ async function scrapeURLLoop(
|
||||||
success: true,
|
success: true,
|
||||||
document,
|
document,
|
||||||
logs: meta.logs,
|
logs: meta.logs,
|
||||||
|
engines: results,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -261,19 +296,25 @@ export async function scrapeURL(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
let results: EngineResultsTracker = {};
|
||||||
|
|
||||||
if (error instanceof NoEnginesLeftError) {
|
if (error instanceof NoEnginesLeftError) {
|
||||||
meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
|
meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
|
||||||
|
results = error.results;
|
||||||
} else if (error instanceof LLMRefusalError) {
|
} else if (error instanceof LLMRefusalError) {
|
||||||
meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
|
meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
|
||||||
|
results = error.results!;
|
||||||
} else {
|
} else {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
meta.logger.error("scrapeURL: Unexpected error happened", { error });
|
meta.logger.error("scrapeURL: Unexpected error happened", { error });
|
||||||
|
// TODO: results?
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error,
|
error,
|
||||||
logs: meta.logs,
|
logs: meta.logs,
|
||||||
|
engines: results,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,14 +3,18 @@ import { encoding_for_model } from "@dqbd/tiktoken";
|
||||||
import { TiktokenModel } from "@dqbd/tiktoken";
|
import { TiktokenModel } from "@dqbd/tiktoken";
|
||||||
import { Document, ExtractOptions } from "../../../controllers/v1/types";
|
import { Document, ExtractOptions } from "../../../controllers/v1/types";
|
||||||
import { Logger } from "winston";
|
import { Logger } from "winston";
|
||||||
import { Meta } from "..";
|
import { EngineResultsTracker, Meta } from "..";
|
||||||
|
|
||||||
const maxTokens = 32000;
|
const maxTokens = 32000;
|
||||||
const modifier = 4;
|
const modifier = 4;
|
||||||
|
|
||||||
export class LLMRefusalError extends Error {
|
export class LLMRefusalError extends Error {
|
||||||
|
public refusal: string;
|
||||||
|
public results: EngineResultsTracker | undefined;
|
||||||
|
|
||||||
constructor(refusal: string) {
|
constructor(refusal: string) {
|
||||||
super("LLM refused to extract the website's content", { cause: { refusal } })
|
super("LLM refused to extract the website's content")
|
||||||
|
this.refusal = refusal;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user