expose engine results tracker for ScrapeEvents implementation
Some checks are pending
STAGING Deploy Images to GHCR / push-app-image (push) Waiting to run

This commit is contained in:
Móricz Gergő 2024-11-07 00:35:38 +01:00
parent be40dcb217
commit 461eda8d33
2 changed files with 50 additions and 5 deletions

View File

@ -18,6 +18,7 @@ export type ScrapeUrlResponse = ({
error: any, error: any,
}) & { }) & {
logs: any[], logs: any[],
engines: EngineResultsTracker,
} }
export type Meta = { export type Meta = {
@ -116,7 +117,7 @@ export type InternalOptions = {
v0DisableJsDom?: boolean; v0DisableJsDom?: boolean;
}; };
export type EngineResultsTracker = { [E in Engine]?: { export type EngineResultsTracker = { [E in Engine]?: ({
state: "error", state: "error",
error: any, error: any,
unexpected: boolean, unexpected: boolean,
@ -127,6 +128,9 @@ export type EngineResultsTracker = { [E in Engine]?: {
unsupportedFeatures: Set<FeatureFlag>, unsupportedFeatures: Set<FeatureFlag>,
} | { } | {
state: "timeout", state: "timeout",
}) & {
startedAt: number,
finishedAt: number,
} }; } };
export type EngineScrapeResultWithContext = { export type EngineScrapeResultWithContext = {
@ -135,6 +139,16 @@ export type EngineScrapeResultWithContext = {
result: (EngineScrapeResult & { markdown: string }), result: (EngineScrapeResult & { markdown: string }),
}; };
/**
 * Returns a version of `error` that is safe to store inside an engine results
 * tracker, i.e. one that does not carry a `results` property.
 *
 * Values that are not objects, `null`, and objects without a truthy `results`
 * property are returned unchanged (same reference). Otherwise a copy is
 * returned with `results` removed; the object passed in is never mutated.
 *
 * @param error - The caught value; may be anything that was thrown.
 * @returns `error` itself, or a copy of it without the `results` property.
 */
function safeguardCircularError<T>(error: T): T {
  if (typeof error !== "object" || error === null || !(error as any).results) {
    return error;
  }

  try {
    const clone = structuredClone(error);
    delete (clone as any).results;
    return clone;
  } catch {
    // structuredClone throws a DataCloneError when the object holds values it
    // cannot serialize (functions, for example). This helper is called from
    // catch handlers, so it must not throw itself: fall back to a shallow copy
    // that keeps the prototype and every own property except `results`.
    const descriptors: PropertyDescriptorMap = Object.getOwnPropertyDescriptors(
      error as object
    );
    delete descriptors["results"];
    return Object.create(Object.getPrototypeOf(error), descriptors);
  }
}
async function scrapeURLLoop( async function scrapeURLLoop(
meta: Meta meta: Meta
): Promise<ScrapeUrlResponse> { ): Promise<ScrapeUrlResponse> {
@ -149,6 +163,7 @@ async function scrapeURLLoop(
let result: EngineScrapeResultWithContext | null = null; let result: EngineScrapeResultWithContext | null = null;
for (const { engine, unsupportedFeatures } of fallbackList) { for (const { engine, unsupportedFeatures } of fallbackList) {
const startedAt = Date.now();
try { try {
meta.logger.info("Scraping via " + engine + "..."); meta.logger.info("Scraping via " + engine + "...");
const _engineResult = await scrapeURLWithEngine(meta, engine); const _engineResult = await scrapeURLWithEngine(meta, engine);
@ -167,6 +182,8 @@ async function scrapeURLLoop(
result: engineResult, result: engineResult,
factors: { isLongEnough, isGoodStatusCode, hasNoPageError }, factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
unsupportedFeatures, unsupportedFeatures,
startedAt,
finishedAt: Date.now(),
}; };
// NOTE: TODO: what to do when status code is bad is tough... // NOTE: TODO: what to do when status code is bad is tough...
@ -186,23 +203,40 @@ async function scrapeURLLoop(
meta.logger.info("Engine " + engine + " could not scrape the page.", { error }); meta.logger.info("Engine " + engine + " could not scrape the page.", { error });
results[engine] = { results[engine] = {
state: "error", state: "error",
error, error: safeguardCircularError(error),
unexpected: false, unexpected: false,
startedAt,
finishedAt: Date.now(),
}; };
} else if (error instanceof TimeoutError) { } else if (error instanceof TimeoutError) {
meta.logger.info("Engine " + engine + " timed out while scraping.", { error }); meta.logger.info("Engine " + engine + " timed out while scraping.", { error });
results[engine] = { results[engine] = {
state: "timeout", state: "timeout",
startedAt,
finishedAt: Date.now(),
}; };
} else if (error instanceof AddFeatureError) { } else if (error instanceof AddFeatureError) {
throw error; throw error;
} else if (error instanceof LLMRefusalError) {
results[engine] = {
state: "error",
error: safeguardCircularError(error),
unexpected: true,
startedAt,
finishedAt: Date.now(),
}
error.results = results;
meta.logger.warn("LLM refusal encountered", { error });
throw error;
} else { } else {
Sentry.captureException(error); Sentry.captureException(error);
meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error }); meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error });
results[engine] = { results[engine] = {
state: "error", state: "error",
error, error: safeguardCircularError(error),
unexpected: true, unexpected: true,
startedAt,
finishedAt: Date.now(),
} }
} }
} }
@ -237,6 +271,7 @@ async function scrapeURLLoop(
success: true, success: true,
document, document,
logs: meta.logs, logs: meta.logs,
engines: results,
}; };
} }
@ -261,19 +296,25 @@ export async function scrapeURL(
} }
} }
} catch (error) { } catch (error) {
let results: EngineResultsTracker = {};
if (error instanceof NoEnginesLeftError) { if (error instanceof NoEnginesLeftError) {
meta.logger.warn("scrapeURL: All scraping engines failed!", { error }); meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
results = error.results;
} else if (error instanceof LLMRefusalError) { } else if (error instanceof LLMRefusalError) {
meta.logger.warn("scrapeURL: LLM refused to extract content", { error }); meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
results = error.results!;
} else { } else {
Sentry.captureException(error); Sentry.captureException(error);
meta.logger.error("scrapeURL: Unexpected error happened", { error }); meta.logger.error("scrapeURL: Unexpected error happened", { error });
// TODO: results?
} }
return { return {
success: false, success: false,
error, error,
logs: meta.logs, logs: meta.logs,
engines: results,
} }
} }
} }

View File

@ -3,14 +3,18 @@ import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken"; import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions } from "../../../controllers/v1/types"; import { Document, ExtractOptions } from "../../../controllers/v1/types";
import { Logger } from "winston"; import { Logger } from "winston";
import { Meta } from ".."; import { EngineResultsTracker, Meta } from "..";
const maxTokens = 32000; const maxTokens = 32000;
const modifier = 4; const modifier = 4;
export class LLMRefusalError extends Error { export class LLMRefusalError extends Error {
public refusal: string;
public results: EngineResultsTracker | undefined;
constructor(refusal: string) { constructor(refusal: string) {
super("LLM refused to extract the website's content", { cause: { refusal } }) super("LLM refused to extract the website's content")
this.refusal = refusal;
} }
} }