From faf11acf82e323ce534729b41e0931f58a622f4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?=
Date: Thu, 14 Nov 2024 10:12:49 +0100
Subject: [PATCH] doctor first iteration

---
Reviewer notes (text in this section is ignored by `git am`):
- Line breaks and markup of this patch were reconstructed; the whitespace of
  context lines may differ from the tree (`git apply --ignore-whitespace`).
- doctor.ts was revised during review, so its `index` blob id below is stale.
- Each permutation now scrapes under its own id instead of "<id>:bare".
- All text written into the HTML report is escaped, including `&`.
- Results keep permutation order; a failed run no longer ends in an
  unhandled promise rejection.

 apps/api/.gitignore                     |   2 +
 apps/api/src/doctor.ts                  | 130 ++++++++++++++++++++++++
 apps/api/src/scraper/scrapeURL/index.ts |   2 +-
 3 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 apps/api/src/doctor.ts

diff --git a/apps/api/.gitignore b/apps/api/.gitignore
index d9639687..b467a672 100644
--- a/apps/api/.gitignore
+++ b/apps/api/.gitignore
@@ -9,3 +9,5 @@ dump.rdb
 
 .rdb
 .sentryclirc
+
+doctor-*.html
diff --git a/apps/api/src/doctor.ts b/apps/api/src/doctor.ts
new file mode 100644
index 00000000..b56cfebe
--- /dev/null
+++ b/apps/api/src/doctor.ts
@@ -0,0 +1,130 @@
+import { configDotenv } from "dotenv";
+configDotenv()
+import { z } from "zod";
+import { scrapeOptions, ScrapeOptions } from "./controllers/v1/types";
+import { InternalOptions, scrapeURL, ScrapeUrlResponse } from "./scraper/scrapeURL";
+import { logger as _logger } from "./lib/logger";
+import { Engine, engineOptions, engines } from "./scraper/scrapeURL/engines";
+import { randomUUID } from "crypto";
+import { writeFile } from "fs/promises";
+import path from "path";
+
+// inputs
+const url: string = "https://firecrawl.dev";
+const controlString: string | undefined = undefined;
+
+/**
+ * `JSON.stringify` replacer that keeps error details in the report.
+ * `JSON.stringify` skips an Error's name, message, stack and cause, so they
+ * are copied explicitly next to the error's enumerable own properties.
+ */
+const errorReplacer = (_key: string, value: unknown): unknown => {
+    if (value instanceof Error) {
+        return {
+            ...value,
+            name: value.name,
+            message: value.message,
+            stack: value.stack,
+            cause: value.cause,
+        };
+    }
+    return value;
+};
+
+/** Escapes text so it can be placed inside HTML element content or attribute values. */
+const escapeHtml = (text: string): string =>
+    text
+        .replaceAll("&", "&amp;")
+        .replaceAll("<", "&lt;")
+        .replaceAll(">", "&gt;")
+        .replaceAll("\"", "&quot;");
+
+const doctorId = randomUUID();
+const logger = _logger.child({ module: "doctor" });
+
+/** One scrape configuration that is tried against `url`. */
+type Permutation = {
+    options: z.input<typeof scrapeOptions>,
+    internal: InternalOptions,
+    name: string
+};
+
+// The default options ("bare") plus one run per usable engine with that engine forced.
+const permutations: Permutation[] = [
+    { options: {}, internal: {}, name: "bare" },
+    ...Object.entries(engineOptions).filter(([name, options]) => options.quality > 0 && engines.includes(name as Engine)).map(([name]) => ({
+        options: {}, internal: { forceEngine: name as Engine }, name,
+    })),
+];
+
+/** Outcome of one permutation: a successful scrape, a failed scrape, or a thrown error. */
+type PermutationResult = ({
+    state: "done",
+    result: ScrapeUrlResponse & {
+        success: true
+    },
+} | {
+    state: "thrownError",
+    error: string | Error,
+} | {
+    state: "error",
+    result: ScrapeUrlResponse & {
+        success: false
+    },
+}) & {
+    permutation: Permutation,
+};
+
+/**
+ * Scrapes `url` with one permutation. Errors thrown by the scrape are
+ * logged and returned as a `thrownError` result instead of rejecting.
+ */
+const runPermutation = async (perm: Permutation): Promise<PermutationResult> => {
+    logger.info("Trying permutation " + perm.name);
+    try {
+        // Each permutation gets its own scrape id instead of all of them sharing the ":bare" suffix.
+        const result = await scrapeURL(doctorId + ":" + perm.name, url, scrapeOptions.parse(perm.options), perm.internal);
+        if (result.success) {
+            return { state: "done", result, permutation: perm };
+        } else {
+            return { state: "error", result, permutation: perm };
+        }
+    } catch (error) {
+        console.error("Permutation " + perm.name + " failed with error", { error });
+        return {
+            state: "thrownError",
+            error: error instanceof Error ? error : String(error),
+            permutation: perm,
+        };
+    }
+};
+
+/** Renders one permutation's outcome as an HTML fragment for the report. */
+const renderResult = (x: PermutationResult): string => {
+    const details = x.state === "thrownError"
+        ? (x.error instanceof Error ? x.error.message + "\n" + (x.error.stack ?? "") : x.error)
+        : JSON.stringify(x.result, errorReplacer, 4);
+
+    return "<h2>" + (x.state === "done" ? "✅" : "❌") + " " + escapeHtml(x.permutation.name) + "</h2>"
+        + "<p>Scrape options: <code>" + escapeHtml(JSON.stringify(x.permutation.options)) + "</code></p>"
+        + "<p>Internal options: <code>" + escapeHtml(JSON.stringify(x.permutation.internal)) + "</code></p>"
+        + "<pre>" + escapeHtml(details) + "</pre>";
+};
+
+(async () => {
+    // Promise.all resolves in input order, so the report lists permutations in a stable order.
+    const results = await Promise.all(permutations.map(runPermutation));
+
+    const fileContent = "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><title>Doctor</title></head><body>"
+        + "<h1>Doctor</h1><p>URL: <code>" + escapeHtml(url) + "</code></p>"
+        + results.map(renderResult).join("")
+        + "</body></html>";
+
+    const fileName = path.join(process.cwd(), "doctor-" + doctorId + ".html");
+    await writeFile(fileName, fileContent);
+    logger.info("Wrote result to " + fileName);
+})().catch(error => {
+    // Report a failed run instead of leaving an unhandled promise rejection.
+    console.error("Doctor run failed", { error });
+    process.exitCode = 1;
+});
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 7be8b67a..68b6774b 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -229,7 +229,7 @@ async function scrapeURLLoop(
                 throw error;
             } else {
                 Sentry.captureException(error);
-                meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error });
+                meta.logger.warn("An unexpected error happened while scraping with " + engine + ".", { error });
                 results[engine] = {
                     state: "error",
                     error: safeguardCircularError(error),