From faf11acf82e323ce534729b41e0931f58a622f4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Thu, 14 Nov 2024 10:12:49 +0100 Subject: [PATCH 1/2] doctor first iteration --- apps/api/.gitignore | 2 + apps/api/src/doctor.ts | 104 ++++++++++++++++++++++++ apps/api/src/scraper/scrapeURL/index.ts | 2 +- 3 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/doctor.ts diff --git a/apps/api/.gitignore b/apps/api/.gitignore index d9639687..b467a672 100644 --- a/apps/api/.gitignore +++ b/apps/api/.gitignore @@ -9,3 +9,5 @@ dump.rdb .rdb .sentryclirc + +doctor-*.html diff --git a/apps/api/src/doctor.ts b/apps/api/src/doctor.ts new file mode 100644 index 00000000..b56cfebe --- /dev/null +++ b/apps/api/src/doctor.ts @@ -0,0 +1,104 @@ +import { configDotenv } from "dotenv"; +configDotenv() +import { z } from "zod"; +import { scrapeOptions, ScrapeOptions } from "./controllers/v1/types"; +import { InternalOptions, scrapeURL, ScrapeUrlResponse } from "./scraper/scrapeURL"; +import { logger as _logger } from "./lib/logger"; +import { Engine, engineOptions, engines } from "./scraper/scrapeURL/engines"; +import { writeFile } from "fs/promises"; +import path from "path"; + +// inputs +const url: string = "https://firecrawl.dev"; +const controlString: string | undefined = undefined; + +const errorReplacer = (_, value) => { + if (value instanceof Error) { + return { + ...value, + name: value.name, + message: value.message, + stack: value.stack, + cause: value.cause, + } + } else { + return value; + } + }; + +const doctorId = crypto.randomUUID(); +const logger = _logger.child({ module: "doctor" }); + +type Permutation = { + options: z.input, + internal: InternalOptions, + name: string +}; + +const permutations: Permutation[] = [ + { options: {}, internal: {}, name: "bare" }, + ...Object.entries(engineOptions).filter(([name, options]) => options.quality > 0 && engines.includes(name as Engine)).map(([name, options]) => ({ + options: {}, internal: { forceEngine: name as Engine }, name, + })), +]; + +type PermutationResult = ({ + state: "done", + result: ScrapeUrlResponse & { + success: true + }, +} | { + state: "thrownError", + error: string | Error, +} | { + state: "error", + result: ScrapeUrlResponse & { + success: false + }, +}) & { + permutation: Permutation, +}; + +const results: PermutationResult[] = []; + +(async () => { + await Promise.all(permutations.map(async perm => { + logger.info("Trying permutation " + perm.name); + try { + const result = await scrapeURL(doctorId + ":bare", url, scrapeOptions.parse(perm.options), perm.internal); + if (result.success) { + results.push({ + state: "done", + result, + permutation: perm, + }); + } else { + results.push({ + state: "error", + result, + permutation: perm, + }); + } + } catch (error) { + console.error("Permutation " + perm.name + " failed with error", { error }); + results.push({ + state: "thrownError", + error, + permutation: perm, + }); + } + })); + + const fileContent = "
<head><meta charset=\"utf8\"></head><body><h1>Doctor</h1><p>URL: <code>" + url + "</code></p>"
+        + results.map(x => "<h2>" + (x.state === "done" ? "✅" : "❌") + " " + x.permutation.name + "</h2><p>Scrape options: <code>" + JSON.stringify(x.permutation.options) + "</code></p>"
+            + "<p>Internal options: <code>" + JSON.stringify(x.permutation.internal) + "</code></p>"
+            + "<pre><code>" + ((x.state === "done" ? JSON.stringify(x.result, errorReplacer, 4)
+                : x.state === "thrownError" ? (x.error instanceof Error ? (x.error.message + "\n" + (x.error.stack ?? "")) : x.error)
+                : (JSON.stringify(x.result, errorReplacer, 4))))
+                .replaceAll("<", "&lt;").replaceAll(">", "&gt;") + "</code></pre>
").join("") + + "" + + const fileName = path.join(process.cwd(), "doctor-" + doctorId + ".html"); + await writeFile(fileName, fileContent); + logger.info("Wrote result to " + fileName); +})(); diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 7be8b67a..68b6774b 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -229,7 +229,7 @@ async function scrapeURLLoop( throw error; } else { Sentry.captureException(error); - meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error }); + meta.logger.warn("An unexpected error happened while scraping with " + engine + ".", { error }); results[engine] = { state: "error", error: safeguardCircularError(error), From 9298a05045e25f4d1d1a1336bd7645b4c255d716 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Thu, 14 Nov 2024 12:24:53 +0100 Subject: [PATCH 2/2] feat: turn into API --- apps/api/requests.http | 11 +- .../src/controllers/v1/admin/doctor-status.ts | 104 ++++++++++++++++++ apps/api/src/controllers/v1/admin/doctor.ts | 84 ++++++++++++++ apps/api/src/doctor.ts | 104 ------------------ apps/api/src/lib/logger.ts | 31 ++++++ apps/api/src/main/runWebScraper.ts | 2 +- apps/api/src/routes/admin.ts | 12 ++ apps/api/src/scraper/scrapeURL/index.ts | 7 +- apps/api/src/services/queue-worker.ts | 20 ++-- apps/api/src/types.ts | 6 +- 10 files changed, 264 insertions(+), 117 deletions(-) create mode 100644 apps/api/src/controllers/v1/admin/doctor-status.ts create mode 100644 apps/api/src/controllers/v1/admin/doctor.ts delete mode 100644 apps/api/src/doctor.ts diff --git a/apps/api/requests.http b/apps/api/requests.http index 4ce40b2c..1be849a1 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -46,4 +46,13 @@ content-type: application/json @batchScrapeId = {{batchScrape.response.body.$.id}} # @name batchScrapeStatus GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} \ No newline at end of file +Authorization: Bearer {{$dotenv TEST_API_KEY}} + +### URL Doctor +# @name urlDoctor +POST {{baseUrl}}/admin/{{$dotenv BULL_AUTH_KEY}}/doctor HTTP/1.1 +Content-Type: application/json + +{ + "url": "https://firecrawl.dev" +} \ No newline at end of file diff --git a/apps/api/src/controllers/v1/admin/doctor-status.ts b/apps/api/src/controllers/v1/admin/doctor-status.ts new file mode 100644 index 00000000..81aae731 --- /dev/null +++ b/apps/api/src/controllers/v1/admin/doctor-status.ts @@ -0,0 +1,104 @@ +import { Request, Response } from "express"; +import { logger as _logger } from "../../../lib/logger"; +import { ScrapeUrlResponse } from "../../../scraper/scrapeURL"; +import { getScrapeQueue, redisConnection } from "../../../services/queue-service"; +import type { Permutation } from "./doctor"; +import { Job } from "bullmq"; + +const logger = _logger.child({ module: "doctorStatusController" }); + +const errorReplacer = (_, value) => { + if (value instanceof Error) { + return { + ...value, + name: value.name, + message: value.message, + stack: value.stack, + cause: value.cause, + } + } else { + return value; + } +}; + +type PermutationResult = ({ + state: "done", + result: ScrapeUrlResponse & { + success: true + }, +} | { + state: "thrownError", + error: string | Error | null | undefined, +} | { + state: "error", + result: ScrapeUrlResponse & { + success: false + }, +} | { + state: "pending", +}) & { + permutation: Permutation, +}; + +export async function 
doctorStatusController(req: Request, res: Response) { + try { + const doctorId = req.params.id; + + const meta: { url: string } | null = JSON.parse(await redisConnection.get("doctor:" + doctorId) ?? "null"); + const permutations: Permutation[] | null = JSON.parse(await redisConnection.get("doctor:" + doctorId + ":permutations") ?? "null"); + if (permutations === null || meta === null) { + return res.status(404).json({ error: "Doctor entry not found" }); + } + + const jobs = (await Promise.all(permutations.map(x => getScrapeQueue().getJob(x.jobId)))).filter(x => x) as Job[]; + + const results: PermutationResult[] = await Promise.all(jobs.map(async job => { + const permutation = permutations.find(x => x.jobId === job.id)!; + const state = await job.getState(); + if (state === "completed" && job.data) { + if (job.returnvalue.success) { + return { + state: "done", + result: job.returnvalue, + permutation, + } + } else { + return { + state: "error", + result: job.returnvalue, + permutation, + } + } + } else if (state === "failed") { + return { + state: "thrownError", + error: job.failedReason, + permutation, + } + } else { + return { + state: "pending", + permutation, + } + } + })); + + const html = "
<head><meta charset=\"utf8\"></head><body><h1>Doctor</h1><p>URL: <code>" + meta.url + "</code></p>"
+            + results.map(x => "<h2>" + (x.state === "pending" ? "⏳" : x.state === "done" ? "✅" : "❌") + " " + x.permutation.name + "</h2><p>Scrape options: <code>" + JSON.stringify(x.permutation.options) + "</code></p>"
+                + "<p>Internal options: <code>" + JSON.stringify(x.permutation.internal) + "</code></p>"
+                + (x.state !== "pending" ? ("<pre><code>" + ((x.state === "done"
+                ? JSON.stringify(x.result, errorReplacer, 4)
+                : x.state === "thrownError"
+                    ? (x.error instanceof Error
+                        ? (x.error.message + "\n" + (x.error.stack ?? ""))
+                        : (x.error ?? ""))
+                    : (JSON.stringify(x.result, errorReplacer, 4))))
+                .replaceAll("<", "&lt;").replaceAll(">", "&gt;") + "</code></pre>
"): "")).join("") + + "" + + res.header("Content-Type", "text/html").send(html); + } catch (error) { + logger.error("Doctor status error", { error }); + res.status(500).json({ error: "Internal server error" }); + } +} diff --git a/apps/api/src/controllers/v1/admin/doctor.ts b/apps/api/src/controllers/v1/admin/doctor.ts new file mode 100644 index 00000000..02e35565 --- /dev/null +++ b/apps/api/src/controllers/v1/admin/doctor.ts @@ -0,0 +1,84 @@ +import { Request, Response } from "express"; +import { logger as _logger } from "../../../lib/logger"; +import { ScrapeUrlResponse, InternalOptions } from "../../../scraper/scrapeURL"; +import { z } from "zod"; +import { scrapeOptions } from "../types"; +import { Engine, engineOptions, engines } from "../../../scraper/scrapeURL/engines"; +import { addScrapeJob, addScrapeJobs } from "../../../services/queue-jobs"; +import { redisConnection } from "../../../services/queue-service"; + +const logger = _logger.child({ module: "doctorController" }); + +export type Permutation = { + options: z.input, + internal: InternalOptions, + name: string, + jobId: string, +}; + +export async function doctorController(req: Request, res: Response) { + try { + const doctorId = crypto.randomUUID(); + + const permutations: Permutation[] = [ + { options: {}, internal: { verbose: true }, name: "bare", jobId: crypto.randomUUID() }, + ...Object.entries(engineOptions).filter(([name, options]) => options.quality > 0 && engines.includes(name as Engine)).map(([name, _options]) => ({ + options: {}, internal: { forceEngine: name as Engine, verbose: true }, name, jobId: crypto.randomUUID(), + })), + ]; + + await addScrapeJobs(permutations.map(perm => ({ + data: { + url: req.body.url, + mode: "single_urls", + team_id: null, + scrapeOptions: scrapeOptions.parse(perm.options), + internalOptions: perm.internal, + plan: null, + origin: "doctor", + is_scrape: true, + doctor: true, + }, + opts: { + jobId: perm.jobId, + priority: 10, + }, + }))); + + await redisConnection.set("doctor:" + doctorId, JSON.stringify({ url: req.body.url }), "EX", 86400); + await redisConnection.set("doctor:" + doctorId + ":permutations", JSON.stringify(permutations), "EX", 86400); + + const protocol = process.env.ENV === "local" ? 
req.protocol : "https"; + + res.json({ ok: true, id: doctorId, url: `${protocol}://${req.get("host")}/admin/${process.env.BULL_AUTH_KEY}/doctor/${doctorId}` }); + + // await Promise.all(permutations.map(async perm => { + // try { + // const result = await scrapeURL(doctorId + ":bare", url, scrapeOptions.parse(perm.options), perm.internal); + // if (result.success) { + // results.push({ + // state: "done", + // result, + // permutation: perm, + // }); + // } else { + // results.push({ + // state: "error", + // result, + // permutation: perm, + // }); + // } + // } catch (error) { + // console.error("Permutation " + perm.name + " failed with error", { error }); + // results.push({ + // state: "thrownError", + // error, + // permutation: perm, + // }); + // } + // })); + } catch (error) { + logger.error("Doctor error", { error }); + res.status(500).json({ error: "Internal server error" }); + } +} diff --git a/apps/api/src/doctor.ts b/apps/api/src/doctor.ts deleted file mode 100644 index b56cfebe..00000000 --- a/apps/api/src/doctor.ts +++ /dev/null @@ -1,104 +0,0 @@ -import { configDotenv } from "dotenv"; -configDotenv() -import { z } from "zod"; -import { scrapeOptions, ScrapeOptions } from "./controllers/v1/types"; -import { InternalOptions, scrapeURL, ScrapeUrlResponse } from "./scraper/scrapeURL"; -import { logger as _logger } from "./lib/logger"; -import { Engine, engineOptions, engines } from "./scraper/scrapeURL/engines"; -import { writeFile } from "fs/promises"; -import path from "path"; - -// inputs -const url: string = "https://firecrawl.dev"; -const controlString: string | undefined = undefined; - -const errorReplacer = (_, value) => { - if (value instanceof Error) { - return { - ...value, - name: value.name, - message: value.message, - stack: value.stack, - cause: value.cause, - } - } else { - return value; - } - }; - -const doctorId = crypto.randomUUID(); -const logger = _logger.child({ module: "doctor" }); - -type Permutation = { - options: z.input, - internal: InternalOptions, - name: string -}; - -const permutations: Permutation[] = [ - { options: {}, internal: {}, name: "bare" }, - ...Object.entries(engineOptions).filter(([name, options]) => options.quality > 0 && engines.includes(name as Engine)).map(([name, options]) => ({ - options: {}, internal: { forceEngine: name as Engine }, name, - })), -]; - -type PermutationResult = ({ - state: "done", - result: ScrapeUrlResponse & { - success: true - }, -} | { - state: "thrownError", - error: string | Error, -} | { - state: "error", - result: ScrapeUrlResponse & { - success: false - }, -}) & { - permutation: Permutation, -}; - -const results: PermutationResult[] = []; - -(async () => { - await Promise.all(permutations.map(async perm => { - logger.info("Trying permutation " + perm.name); - try { - const result = await scrapeURL(doctorId + ":bare", url, scrapeOptions.parse(perm.options), perm.internal); - if (result.success) { - results.push({ - state: "done", - result, - permutation: perm, - }); - } else { - results.push({ - state: "error", - result, - permutation: perm, - }); - } - } catch (error) { - console.error("Permutation " + perm.name + " failed with error", { error }); - results.push({ - state: "thrownError", - error, - permutation: perm, - }); - } - })); - - const fileContent = "
<head><meta charset=\"utf8\"></head><body><h1>Doctor</h1><p>URL: <code>" + url + "</code></p>"
-        + results.map(x => "<h2>" + (x.state === "done" ? "✅" : "❌") + " " + x.permutation.name + "</h2><p>Scrape options: <code>" + JSON.stringify(x.permutation.options) + "</code></p>"
-            + "<p>Internal options: <code>" + JSON.stringify(x.permutation.internal) + "</code></p>"
-            + "<pre><code>" + ((x.state === "done" ? JSON.stringify(x.result, errorReplacer, 4)
-                : x.state === "thrownError" ? (x.error instanceof Error ? (x.error.message + "\n" + (x.error.stack ?? "")) : x.error)
-                : (JSON.stringify(x.result, errorReplacer, 4))))
-                .replaceAll("<", "&lt;").replaceAll(">", "&gt;") + "</code></pre>
").join("") - + "" - - const fileName = path.join(process.cwd(), "doctor-" + doctorId + ".html"); - await writeFile(fileName, fileContent); - logger.info("Wrote result to " + fileName); -})(); diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts index eb4f8aeb..e46cfb7f 100644 --- a/apps/api/src/lib/logger.ts +++ b/apps/api/src/lib/logger.ts @@ -1,4 +1,5 @@ import * as winston from "winston"; +import Transport from "winston-transport"; import { configDotenv } from "dotenv"; configDotenv(); @@ -49,3 +50,33 @@ export const logger = winston.createLogger({ }), ], }); + +export type ArrayTransportOptions = Transport.TransportStreamOptions & { + array: any[]; + scrapeId?: string; +}; + +export class ArrayTransport extends Transport { + private array: any[]; + private scrapeId?: string; + + constructor(opts: ArrayTransportOptions) { + super(opts); + this.array = opts.array; + this.scrapeId = opts.scrapeId; + } + + log(info, next) { + setImmediate(() => { + this.emit("logged", info); + }); + + if (this.scrapeId !== undefined && info.scrapeId !== this.scrapeId) { + return next(); + } + + this.array.push(info); + + next(); + } +} diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 90d4a47f..06150411 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -70,7 +70,7 @@ export async function runWebScraper({ } } - if(is_scrape === false) { + if(is_scrape === false && team_id) { let creditsToBeBilled = 1; // Assuming 1 credit per document if (scrapeOptions.extract) { creditsToBeBilled = 5; diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index ac61519a..721f2288 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -8,6 +8,8 @@ import { } from "../controllers/v0/admin/queue"; import { wrap } from "./v1"; import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear"; +import { doctorController } from "../controllers/v1/admin/doctor"; +import { doctorStatusController } from "../controllers/v1/admin/doctor-status"; export const adminRouter = express.Router(); @@ -40,3 +42,13 @@ adminRouter.post( `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`, wrap(acucCacheClearController), ); + +adminRouter.post( + `/admin/${process.env.BULL_AUTH_KEY}/doctor`, + wrap(doctorController), +); + +adminRouter.get( + `/admin/${process.env.BULL_AUTH_KEY}/doctor/:id`, + wrap(doctorStatusController), +); diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 68b6774b..8d80e64c 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -2,7 +2,7 @@ import { Logger } from "winston"; import * as Sentry from "@sentry/node"; import { Document, ScrapeOptions } from "../../controllers/v1/types"; -import { logger } from "../../lib/logger"; +import { ArrayTransport, logger } from "../../lib/logger"; import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "./error"; @@ -97,6 +97,9 @@ function buildMetaObject(id: string, url: string, options: ScrapeOptions, intern const _logger = logger.child({ module: "ScrapeURL", scrapeId: id, scrapeURL: url }); const logs: any[] = []; + if (internalOptions.verbose) { + _logger.add(new ArrayTransport({ array: logs, scrapeId: id })); + } return { id, url, 
options, internalOptions, @@ -114,6 +117,8 @@ export type InternalOptions = { v0CrawlOnlyUrls?: boolean; v0UseFastMode?: boolean; v0DisableJsDom?: boolean; + + verbose?: boolean; // stores logs. will cause high memory usage. use with caution }; export type EngineResultsTracker = { [E in Engine]?: ({ diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 54841061..76e7c473 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -38,6 +38,7 @@ import { configDotenv } from "dotenv"; import { scrapeOptions } from "../controllers/v1/types"; import { getRateLimiterPoints } from "./rate-limiter"; import { cleanOldConcurrencyLimitEntries, pushConcurrencyLimitActiveJob, removeConcurrencyLimitActiveJob, takeConcurrencyLimitedJob } from "../lib/concurrency-limit"; +import { ScrapeUrlResponse } from "../scraper/scrapeURL"; configDotenv(); const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); @@ -289,17 +290,12 @@ async function processJob(job: Job & { id: string }, token: string) { ] : []) ]); - if (!pipeline.success) { - // TODO: let's Not do this - throw pipeline.error; - } - const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; - const doc = pipeline.document; + const doc = (pipeline as ScrapeUrlResponse & { success: true }).document; - const rawHtml = doc.rawHtml ?? ""; + const rawHtml = doc?.rawHtml ?? ""; const data = { success: true, @@ -313,6 +309,16 @@ async function processJob(job: Job & { id: string }, token: string) { document: doc, }; + if (job.data.doctor) { + (data.document as any) = pipeline as unknown as Document; // force it in there + return data; + } + + if (!pipeline.success) { + // TODO: let's Not do this + throw pipeline.error; + } + if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) { await callWebhook( job.data.team_id, diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index d7821407..8238f56a 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -29,8 +29,8 @@ export interface WebScraperOptions { crawlerOptions?: any; scrapeOptions: ScrapeOptions; internalOptions?: InternalOptions; - team_id: string; - plan: string; + team_id: string | null; + plan: string | null; origin?: string; crawl_id?: string; sitemapped?: boolean; @@ -46,7 +46,7 @@ export interface RunWebScraperParams { internalOptions?: InternalOptions; // onSuccess: (result: V1Document, mode: string) => void; // onError: (error: Error) => void; - team_id: string; + team_id: string | null; bull_job_id: string; priority?: number; is_scrape?: boolean;