From 49801ac779f1950c4f407cb2b0f6ac8c5c552a34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Thu, 7 Nov 2024 12:33:03 +0100 Subject: [PATCH] ingest scrape events --- apps/api/src/main/runWebScraper.ts | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 9e582a86..90d4a47f 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -10,7 +10,8 @@ import { supabase_service } from "../services/supabase"; import { logger } from "../lib/logger"; import { ScrapeEvents } from "../lib/scrape-events"; import { configDotenv } from "dotenv"; -import { scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL"; +import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL"; +import { Engine } from "../scraper/scrapeURL/engines"; configDotenv(); export async function startWebScraperPipeline({ @@ -58,6 +59,7 @@ export async function runWebScraper({ is_scrape=false, }: RunWebScraperParams): Promise { let response: ScrapeUrlResponse | undefined = undefined; + let engines: EngineResultsTracker = {}; try { response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions }); if (!response.success) { @@ -83,8 +85,11 @@ export async function runWebScraper({ // This is where the returnvalue from the job is set // onSuccess(response.document, mode); + engines = response.engines; return response; } catch (error) { + engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {})); + if (response !== undefined) { return { ...response, @@ -92,13 +97,31 @@ export async function runWebScraper({ error, } } else { - return { success: false, error, logs: ["no logs -- error coming from runWebScraper"] }; + return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines }; } // onError(error); + } finally { + const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[]; + + for (const engine of engineOrder) { + const result = engines[engine] as Exclude; + ScrapeEvents.insert(bull_job_id, { + type: "scrape", + url, + method: engine, + result: { + success: result.state === "success", + response_code: (result.state === "success" ? result.result.statusCode : undefined), + response_size: (result.state === "success" ? result.result.html.length : undefined), + error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined), + time_taken: result.finishedAt - result.startedAt, + }, + }); + } } } -const saveJob = async (job: Job, result: any, token: string, mode: string) => { +const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => { try { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; if (useDbAuthentication) {