From 4381109dd825fce475a775ba5e7a4398b095e04f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 26 Jun 2024 09:00:54 -0300 Subject: [PATCH] added default values and fixed pdf bug --- apps/api/src/controllers/crawl.ts | 14 ++++---------- apps/api/src/controllers/scrape.ts | 17 +++++------------ apps/api/src/lib/default-values.ts | 26 ++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 22 deletions(-) create mode 100644 apps/api/src/lib/default-values.ts diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 8fd876d3..89358fcc 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -9,6 +9,7 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; import { logCrawl } from "../../src/services/logging/crawl_log"; import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; import { createIdempotencyKey } from "../../src/services/idempotency/create"; +import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values"; export async function crawlController(req: Request, res: Response) { try { @@ -56,15 +57,8 @@ export async function crawlController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; - const crawlerOptions = req.body.crawlerOptions ?? { - allowBackwardCrawling: false - }; - const pageOptions = req.body.pageOptions ?? { - onlyMainContent: false, - includeHtml: false, - removeTags: [], - parsePDF: true - }; + const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; + const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; if (mode === "single_urls" && !url.includes(",")) { try { @@ -100,7 +94,7 @@ export async function crawlController(req: Request, res: Response) { crawlerOptions: crawlerOptions, team_id: team_id, pageOptions: pageOptions, - origin: req.body.origin ?? "api", + origin: req.body.origin ?? defaultOrigin, }); await logCrawl(job.id.toString(), team_id); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 1537c071..d394efe8 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job"; import { Document } from "../lib/entities"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { numTokensFromString } from '../lib/LLM-extraction/helpers'; +import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; export async function scrapeHelper( req: Request, @@ -105,21 +106,13 @@ export async function scrapeController(req: Request, res: Response) { return res.status(status).json({ error }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { - onlyMainContent: false, - includeHtml: false, - waitFor: 0, - screenshot: false, - parsePDF: true - }; - const extractorOptions = req.body.extractorOptions ?? { - mode: "markdown" - } + const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; + const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions }; if (extractorOptions.mode === "llm-extraction") { pageOptions.onlyMainContent = true; } - const origin = req.body.origin ?? "api"; - const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds + const origin = req.body.origin ?? defaultOrigin; + const timeout = req.body.timeout ?? defaultTimeout; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts new file mode 100644 index 00000000..6ae5f99f --- /dev/null +++ b/apps/api/src/lib/default-values.ts @@ -0,0 +1,26 @@ +export const defaultOrigin = "api"; + +export const defaultTimeout = 30000; // 30 seconds + +export const defaultPageOptions = { + onlyMainContent: false, + includeHtml: false, + waitFor: 0, + screenshot: false, + parsePDF: true +}; + +export const defaultCrawlerOptions = { + allowBackwardCrawling: false +} + +export const defaultCrawlPageOptions = { + onlyMainContent: false, + includeHtml: false, + removeTags: [], + parsePDF: true +} + +export const defaultExtractorOptions = { + mode: "markdown" +} \ No newline at end of file