added default values and fixed pdf bug

This commit is contained in:
rafaelsideguide 2024-06-26 09:00:54 -03:00
parent 45f2765601
commit 4381109dd8
3 changed files with 35 additions and 22 deletions

View File

@ -9,6 +9,7 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../src/services/logging/crawl_log"; import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create"; import { createIdempotencyKey } from "../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
export async function crawlController(req: Request, res: Response) { export async function crawlController(req: Request, res: Response) {
try { try {
@ -56,15 +57,8 @@ export async function crawlController(req: Request, res: Response) {
const mode = req.body.mode ?? "crawl"; const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? { const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
allowBackwardCrawling: false const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
};
const pageOptions = req.body.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
removeTags: [],
parsePDF: true
};
if (mode === "single_urls" && !url.includes(",")) { if (mode === "single_urls" && !url.includes(",")) {
try { try {
@ -100,7 +94,7 @@ export async function crawlController(req: Request, res: Response) {
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
team_id: team_id, team_id: team_id,
pageOptions: pageOptions, pageOptions: pageOptions,
origin: req.body.origin ?? "api", origin: req.body.origin ?? defaultOrigin,
}); });
await logCrawl(job.id.toString(), team_id); await logCrawl(job.id.toString(), team_id);

View File

@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job";
import { Document } from "../lib/entities"; import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { numTokensFromString } from '../lib/LLM-extraction/helpers';
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
export async function scrapeHelper( export async function scrapeHelper(
req: Request, req: Request,
@ -105,21 +106,13 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(status).json({ error }); return res.status(status).json({ error });
} }
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
onlyMainContent: false, const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
includeHtml: false,
waitFor: 0,
screenshot: false,
parsePDF: true
};
const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown"
}
if (extractorOptions.mode === "llm-extraction") { if (extractorOptions.mode === "llm-extraction") {
pageOptions.onlyMainContent = true; pageOptions.onlyMainContent = true;
} }
const origin = req.body.origin ?? "api"; const origin = req.body.origin ?? defaultOrigin;
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds const timeout = req.body.timeout ?? defaultTimeout;
try { try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } = const { success: creditsCheckSuccess, message: creditsCheckMessage } =

View File

@ -0,0 +1,26 @@
/** Origin value used when a request body does not provide its own `origin`. */
export const defaultOrigin = "api";

/** Timeout, in milliseconds, used when a request body does not provide `timeout` (30 seconds). */
export const defaultTimeout = 30_000;
/**
 * Baseline `pageOptions` for the scrape controller.
 *
 * The controller builds its effective options as
 * `{ ...defaultPageOptions, ...req.body.pageOptions }`, so any field present
 * in the request overrides the value listed here and omitted fields fall back
 * to it.
 */
export const defaultPageOptions = {
  onlyMainContent: false,
  includeHtml: false,
  waitFor: 0,
  screenshot: false,
  parsePDF: true,
};
/**
 * Baseline `crawlerOptions` for the crawl controller.
 *
 * Merged as `{ ...defaultCrawlerOptions, ...req.body.crawlerOptions }`, so a
 * request only needs to send the fields it wants to change.
 */
export const defaultCrawlerOptions = {
  allowBackwardCrawling: false,
};
/**
 * Baseline `pageOptions` for the crawl controller.
 *
 * Merged as `{ ...defaultCrawlPageOptions, ...req.body.pageOptions }`; fields
 * sent in the request take precedence over the values below.
 *
 * NOTE: object spread copies only the top level, so every merged object that
 * does not override `removeTags` holds a reference to this same array
 * instance. Treat it as read-only; mutating it in place would alter the
 * default seen by later requests.
 */
export const defaultCrawlPageOptions = {
  onlyMainContent: false,
  includeHtml: false,
  removeTags: [],
  parsePDF: true,
};
/**
 * Baseline `extractorOptions` for the scrape controller.
 *
 * Merged as `{ ...defaultExtractorOptions, ...req.body.extractorOptions }`,
 * so `mode` is "markdown" unless the request specifies another mode.
 */
export const defaultExtractorOptions = {
  mode: "markdown",
};