mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
added default values and fixed pdf bug
This commit is contained in:
parent 45f2765601
commit 4381109dd8
@@ -9,6 +9,7 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
+import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
 
 export async function crawlController(req: Request, res: Response) {
   try {
@@ -56,15 +57,8 @@ export async function crawlController(req: Request, res: Response) {
 
   const mode = req.body.mode ?? "crawl";
 
-  const crawlerOptions = req.body.crawlerOptions ?? {
-    allowBackwardCrawling: false
-  };
-  const pageOptions = req.body.pageOptions ?? {
-    onlyMainContent: false,
-    includeHtml: false,
-    removeTags: [],
-    parsePDF: true
-  };
+  const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
+  const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
 
   if (mode === "single_urls" && !url.includes(",")) {
     try {
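The hunk above is the heart of the change: with req.body.pageOptions ?? { ... }, the inline defaults applied only when the caller sent no pageOptions at all, so a partial object (for example, one that set only onlyMainContent) silently dropped parsePDF: true. That is presumably the pdf bug named in the commit title, though the diff alone does not confirm it. The shallow spread merge keeps every default the caller does not explicitly override. A minimal sketch of the difference, using two of the new defaults:

    const defaults = { onlyMainContent: false, parsePDF: true };
    const body: Partial<typeof defaults> | undefined = { onlyMainContent: true };

    const before = body ?? defaults;        // { onlyMainContent: true } and parsePDF is lost
    const after = { ...defaults, ...body }; // { onlyMainContent: true, parsePDF: true }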
@@ -100,7 +94,7 @@ export async function crawlController(req: Request, res: Response) {
     crawlerOptions: crawlerOptions,
     team_id: team_id,
     pageOptions: pageOptions,
-    origin: req.body.origin ?? "api",
+    origin: req.body.origin ?? defaultOrigin,
   });
 
   await logCrawl(job.id.toString(), team_id);
@@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';
+import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
 
 export async function scrapeHelper(
   req: Request,
@@ -105,21 +106,13 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? {
-    onlyMainContent: false,
-    includeHtml: false,
-    waitFor: 0,
-    screenshot: false,
-    parsePDF: true
-  };
-  const extractorOptions = req.body.extractorOptions ?? {
-    mode: "markdown"
-  }
+  const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
+  const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
   if (extractorOptions.mode === "llm-extraction") {
     pageOptions.onlyMainContent = true;
   }
-  const origin = req.body.origin ?? "api";
-  const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
+  const origin = req.body.origin ?? defaultOrigin;
+  const timeout = req.body.timeout ?? defaultTimeout;
 
   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
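One ordering subtlety in the scrape path: the llm-extraction branch mutates pageOptions after the merge, so it wins over both the default and any caller-supplied onlyMainContent: false. A sketch with the defaults inlined:

    const pageOptions = { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false, parsePDF: true };
    const extractorOptions = { mode: "llm-extraction" }; // as if the caller requested LLM extraction

    if (extractorOptions.mode === "llm-extraction") {
      pageOptions.onlyMainContent = true; // forced on, regardless of what the request sent
    }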
apps/api/src/lib/default-values.ts (new file, 26 lines)
@@ -0,0 +1,26 @@
+export const defaultOrigin = "api";
+
+export const defaultTimeout = 30000; // 30 seconds
+
+export const defaultPageOptions = {
+  onlyMainContent: false,
+  includeHtml: false,
+  waitFor: 0,
+  screenshot: false,
+  parsePDF: true
+};
+
+export const defaultCrawlerOptions = {
+  allowBackwardCrawling: false
+}
+
+export const defaultCrawlPageOptions = {
+  onlyMainContent: false,
+  includeHtml: false,
+  removeTags: [],
+  parsePDF: true
+}
+
+export const defaultExtractorOptions = {
+  mode: "markdown"
+}
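For reference, a small self-contained sketch of how the new module is consumed (the request body is hypothetical and the import path is shortened for the example). Because every default here is a flat scalar or array and the merge is shallow, a partial override cannot clobber unrelated keys:

    import { defaultPageOptions, defaultTimeout } from "./default-values";

    // Hypothetical partial request body, for illustration only.
    const body: { pageOptions?: Partial<typeof defaultPageOptions>; timeout?: number } = {
      pageOptions: { screenshot: true },
    };

    const pageOptions = { ...defaultPageOptions, ...body.pageOptions };
    // -> { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: true, parsePDF: true }
    const timeout = body.timeout ?? defaultTimeout; // 30000 when the caller omits it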