Mirror of https://github.com/mendableai/firecrawl.git
Synced 2024-11-16 11:42:24 +08:00
ingest scrape events
Some checks are pending
STAGING Deploy Images to GHCR / push-app-image (push) Waiting to run
This commit is contained in:
parent 461eda8d33
commit 49801ac779
@@ -10,7 +10,8 @@ import { supabase_service } from "../services/supabase";
 import { logger } from "../lib/logger";
 import { ScrapeEvents } from "../lib/scrape-events";
 import { configDotenv } from "dotenv";
-import { scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
+import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
+import { Engine } from "../scraper/scrapeURL/engines";
 configDotenv();
 
 export async function startWebScraperPipeline({
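For orientation: EngineResultsTracker is imported from ../scraper/scrapeURL above, and the shape this commit relies on can be read off how the later hunks use it (startedAt/finishedAt timestamps, a state discriminator, and per-state payloads). A minimal TypeScript sketch of that inferred shape; anything beyond the fields this diff actually touches is an assumption:

// Inferred from usage in this diff only; the real type in
// ../scraper/scrapeURL may carry more states and fields.
type InferredEngineResult =
  | { state: "success"; startedAt: number; finishedAt: number;
      result: { statusCode: number; html: string } }
  | { state: "error"; startedAt: number; finishedAt: number; error: unknown }
  | { state: "timeout"; startedAt: number; finishedAt: number };

// Partial record keyed by engine name: engines that never ran have no
// entry, which is why the diff filters with Exclude<..., undefined>.
type InferredEngineResultsTracker =
  Partial<Record<string, InferredEngineResult>>;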
@@ -58,6 +59,7 @@ export async function runWebScraper({
   is_scrape=false,
 }: RunWebScraperParams): Promise<ScrapeUrlResponse> {
   let response: ScrapeUrlResponse | undefined = undefined;
+  let engines: EngineResultsTracker = {};
   try {
     response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
     if (!response.success) {
@@ -83,8 +85,11 @@ export async function runWebScraper({
     // This is where the returnvalue from the job is set
     // onSuccess(response.document, mode);
 
+    engines = response.engines;
     return response;
   } catch (error) {
+    engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));
+
     if (response !== undefined) {
       return {
         ...response,
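Note the error path above: when scrapeURL throws before producing a response, the commit recovers partial engine results from a results field on the thrown error ((error as any).results). A small standalone sketch of that guard, with a hypothetical error shape standing in for whatever scrapeURL actually throws:

// Hypothetical error shape; the cast in the diff suggests scrapeURL
// failures can carry partial per-engine results, but the real type
// is not shown in this commit.
interface ErrorWithResults extends Error {
  results?: InferredEngineResultsTracker;
}

function recoverEngines(error: unknown): InferredEngineResultsTracker {
  // Mirrors the diff's guard: only non-null objects can carry a payload.
  return typeof error === "object" && error !== null
    ? (error as ErrorWithResults).results ?? {}
    : {};
}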
@@ -92,13 +97,31 @@ export async function runWebScraper({
         error,
       }
     } else {
-      return { success: false, error, logs: ["no logs -- error coming from runWebScraper"] };
+      return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
     }
     // onError(error);
+  } finally {
+    const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];
+
+    for (const engine of engineOrder) {
+      const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
+      ScrapeEvents.insert(bull_job_id, {
+        type: "scrape",
+        url,
+        method: engine,
+        result: {
+          success: result.state === "success",
+          response_code: (result.state === "success" ? result.result.statusCode : undefined),
+          response_size: (result.state === "success" ? result.result.html.length : undefined),
+          error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
+          time_taken: result.finishedAt - result.startedAt,
+        },
+      });
+    }
   }
 }
 
-const saveJob = async (job: Job, result: any, token: string, mode: string) => {
+const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
   try {
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
     if (useDbAuthentication) {
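Putting the new finally block together: every engine attempt recorded in the tracker becomes one ScrapeEvents row, replayed in start order, whether the scrape returned or threw. A self-contained sketch of that ordering-and-mapping step under the inferred types above (the insert call itself is the repo's own ScrapeEvents.insert and is left out here):

// Standalone restatement of the finally-block logic in this commit,
// using the result shape inferred earlier in this page.
type EngineResult =
  | { state: "success"; startedAt: number; finishedAt: number;
      result: { statusCode: number; html: string } }
  | { state: "error"; startedAt: number; finishedAt: number; error: unknown }
  | { state: "timeout"; startedAt: number; finishedAt: number };

function buildScrapeEvents(url: string, engines: Record<string, EngineResult>) {
  // Sort engine names by when each attempt started, earliest first --
  // the same Object.entries(...).sort(...) step as the diff.
  const engineOrder = Object.entries(engines)
    .sort((a, b) => a[1].startedAt - b[1].startedAt)
    .map(([name]) => name);

  // One event payload per attempted engine, matching the fields the
  // diff passes to ScrapeEvents.insert.
  return engineOrder.map((engine) => {
    const result = engines[engine];
    return {
      type: "scrape" as const,
      url,
      method: engine,
      result: {
        success: result.state === "success",
        response_code: result.state === "success" ? result.result.statusCode : undefined,
        response_size: result.state === "success" ? result.result.html.length : undefined,
        error:
          result.state === "error" ? result.error
          : result.state === "timeout" ? "Timed out"
          : undefined,
        time_taken: result.finishedAt - result.startedAt,
      },
    };
  });
}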