ingest scrape events
Some checks are pending
STAGING Deploy Images to GHCR / push-app-image (push) Waiting to run

This commit is contained in:
Móricz Gergő 2024-11-07 12:33:03 +01:00
parent 461eda8d33
commit 49801ac779

View File

@ -10,7 +10,8 @@ import { supabase_service } from "../services/supabase";
import { logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
import { scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
import { Engine } from "../scraper/scrapeURL/engines";
configDotenv();
export async function startWebScraperPipeline({
@ -58,6 +59,7 @@ export async function runWebScraper({
is_scrape=false,
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
let response: ScrapeUrlResponse | undefined = undefined;
let engines: EngineResultsTracker = {};
try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
if (!response.success) {
@ -83,8 +85,11 @@ export async function runWebScraper({
// This is where the returnvalue from the job is set
// onSuccess(response.document, mode);
engines = response.engines;
return response;
} catch (error) {
engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));
if (response !== undefined) {
return {
...response,
@ -92,13 +97,31 @@ export async function runWebScraper({
error,
}
} else {
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"] };
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
}
// onError(error);
} finally {
const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];
for (const engine of engineOrder) {
const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
ScrapeEvents.insert(bull_job_id, {
type: "scrape",
url,
method: engine,
result: {
success: result.state === "success",
response_code: (result.state === "success" ? result.result.statusCode : undefined),
response_size: (result.state === "success" ? result.result.html.length : undefined),
error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
time_taken: result.finishedAt - result.startedAt,
},
});
}
}
}
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) {