mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)

commit 49e1cb7ca0
parent 63264644e1

Commit message: Nick:
@@ -1,6 +1,6 @@
 import { Request, Response } from "express";
 import { Logger } from '../../lib/logger';
-import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import { Document, legacyDocumentConverter, legacyExtractorOptions, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
 import { billTeam } from "../../services/billing/credit_billing";
 import { v4 as uuidv4 } from 'uuid';
 import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
@@ -16,6 +16,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
   const origin = req.body.origin;
   const timeout = req.body.timeout;
   const pageOptions = legacyScrapeOptions(req.body);
+  const extractorOptions = legacyExtractorOptions(req.body.extract);
   const jobId = uuidv4();

   const startTime = new Date().getTime();
@@ -27,7 +28,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
     crawlerOptions: {},
     team_id: req.auth.team_id,
     pageOptions,
-    extractorOptions: {},
+    extractorOptions,
     origin: req.body.origin,
     is_scrape: true,
   }, {}, jobId, jobPriority);
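The controller changes above thread the parsed extract options into the scrape job instead of the previous hard-coded `extractorOptions: {}`. For context, a request body exercising this path might look like the sketch below (the URL, schema, and prompt are illustrative placeholders; the field names come from the v1 schema in the next diff):

    // Hypothetical v1 scrape request body exercising the new "extract" format.
    // Values are placeholders, not taken from the commit.
    const exampleRequestBody = {
      url: "https://example.com",
      formats: ["markdown", "extract"],
      extract: {
        mode: "llm",
        schema: {
          type: "object",
          properties: { title: { type: "string" } },
        },
        prompt: "Extract the page title.",
      },
    };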
@@ -1,7 +1,7 @@
 import { Request, Response } from "express";
 import { z } from "zod";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
-import { PageOptions } from "../../lib/entities";
+import { ExtractorOptions, PageOptions } from "../../lib/entities";
 import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
 import { PlanType } from "../../types";

@@ -11,7 +11,8 @@ export type Format =
   | "rawHtml"
   | "links"
   | "screenshot"
-  | "screenshot@fullPage";
+  | "screenshot@fullPage"
+  | "extract";

 export const url = z.preprocess(
   (x) => {
@@ -40,6 +41,14 @@ export const url = z.preprocess(

 const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";

+export const extractOptions = z.object({
+  mode: z.enum(["llm"]).default("llm"),
+  schema: z.any().optional(),
+  prompt: z.string().default("Based on the information on the page, extract the information from the schema.")
+}).strict(strictMessage);
+
+export type ExtractOptions = z.infer<typeof extractOptions>;
+
 export const scrapeOptions = z.object({
   formats: z
     .enum([
@@ -49,6 +58,7 @@ export const scrapeOptions = z.object({
       "links",
       "screenshot",
       "screenshot@fullPage",
+      "extract"
     ])
     .array()
     .optional()
@@ -59,6 +69,7 @@ export const scrapeOptions = z.object({
   onlyMainContent: z.boolean().default(true),
   timeout: z.number().int().positive().finite().safe().default(30000), // default?
   waitFor: z.number().int().nonnegative().finite().safe().default(0),
+  extract: extractOptions.optional(),
   parsePDF: z.boolean().default(true),
 }).strict(strictMessage);

@@ -118,6 +129,13 @@ export const crawlRequestSchema = crawlerOptions.extend({
 //   scrapeOptions?: Exclude<ScrapeRequest, "url">;
 // };

+// export type ExtractorOptions = {
+//   mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
+//   extractionPrompt?: string;
+//   extractionSchema?: Record<string, any>;
+// }
+
+
 export type CrawlRequest = z.infer<typeof crawlRequestSchema>;

 export const mapRequestSchema = crawlerOptions.extend({
@@ -138,6 +156,7 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;

 export type Document = {
   markdown?: string;
+  extract?: string;
   html?: string;
   rawHtml?: string;
   links?: string[];
@@ -280,6 +299,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
     includeMarkdown: x.formats.includes("markdown"),
     includeHtml: x.formats.includes("html"),
     includeRawHtml: x.formats.includes("rawHtml"),
+    includeExtract: x.formats.includes("extract"),
     onlyIncludeTags: x.includeTags,
     removeTags: x.excludeTags,
     onlyMainContent: x.onlyMainContent,
@@ -291,6 +311,14 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
   };
 }

+export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
+  return {
+    mode: x.mode ? "llm-extraction" : "markdown",
+    extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
+    extractionSchema: x.schema,
+  };
+}
+
 export function legacyDocumentConverter(doc: any): Document {
   if (doc.metadata) {
     if (doc.metadata.screenshot) {
@@ -309,6 +337,7 @@ export function legacyDocumentConverter(doc: any): Document {
     links: doc.linksOnPage,
     rawHtml: doc.rawHtml,
     html: doc.html,
+    extract: doc.llm_extraction,
     screenshot: doc.screenshot ?? doc.fullPageScreenshot,
     metadata: {
       ...doc.metadata,
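`legacyExtractorOptions` bridges the new v1 `extract` options to the legacy extractor options the scraper pipeline understands. A minimal sketch of how the two exports above compose, assuming both are imported from this types module:

    // Parse user-supplied extract options; zod fills in the defaults.
    const parsed = extractOptions.parse({ schema: { type: "object" } });
    // parsed.mode === "llm", parsed.prompt === the default prompt

    // Convert to the legacy shape consumed by the scraper pipeline.
    const legacy = legacyExtractorOptions(parsed);
    // legacy.mode === "llm-extraction"; prompt and schema carry over

Note that after parsing, `mode` is always `"llm"` (the zod default), so the `"markdown"` fallback in `legacyExtractorOptions` only applies to input that bypassed the schema.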
@@ -25,6 +25,7 @@ export async function generateCompletions(
     case "openAI":
       const llm = new OpenAI();
       try{
+
         const completionResult = await generateOpenAICompletions({
           client: llm,
           document: document,
@@ -25,6 +25,7 @@ function prepareOpenAIDoc(
     extractionTarget = document.rawHtml;
   }
+

   // Check if the markdown content exists in the document
   if (!extractionTarget) {
     return null;
@@ -43,7 +44,6 @@ function prepareOpenAIDoc(
     // trim the document to the maximum number of tokens, tokens != characters
     extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
   }
-
   return [[{ type: "text", text: extractionTarget }], numTokens];
 }

@@ -73,7 +73,6 @@ export async function generateOpenAICompletions({
       warning: "LLM extraction was not performed since the document's content is empty or missing.",
     };
   }
-
   const [content, numTokens] = preparedDoc;

   const completion = await openai.chat.completions.create({
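The trimming in `prepareOpenAIDoc` approximates a token budget with a character cut, since tokens != characters. A standalone sketch of the same idea (the chars-per-token modifier value here is an assumption, not taken from the commit):

    // Rough token-budget trim: over-approximate characters per token with an
    // assumed modifier and slice the string, accepting some imprecision
    // rather than tokenizing exactly.
    function trimToTokenBudget(text: string, maxTokens: number, charsPerToken = 4): string {
      return text.slice(0, maxTokens * charsPerToken);
    }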
@@ -12,6 +12,7 @@ export interface Progress {

 export type PageOptions = {
   includeMarkdown?: boolean;
+  includeExtract?: boolean;
   onlyMainContent?: boolean;
   includeHtml?: boolean;
   includeRawHtml?: boolean;
@@ -31,7 +31,6 @@ it('should return a list of links on the firecrawl.ai page', async () => {

   // Check if the result contains a list of links
   expect(result.linksOnPage).toBeDefined();
-  console.log({result});
   expect(Array.isArray(result.linksOnPage)).toBe(true);
   expect(result.linksOnPage.length).toBeGreaterThan(0);
   expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
@@ -305,26 +305,21 @@ export class WebScraperDataProvider {
     }

     // documents = await this.applyImgAltText(documents);
-    if (
-      (this.extractorOptions.mode === "llm-extraction" ||
-        this.extractorOptions.mode === "llm-extraction-from-markdown") &&
-      this.mode === "single_urls"
-    ) {
-      documents = await generateCompletions(
-        documents,
-        this.extractorOptions,
-        "markdown"
-      );
-    }
-    if (
-      this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
-      this.mode === "single_urls"
-    ) {
-      documents = await generateCompletions(
-        documents,
-        this.extractorOptions,
-        "raw-html"
-      );
+    if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
+      const extractionMode = this.extractorOptions.mode;
+      const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
+
+      if (
+        extractionMode === "llm-extraction" ||
+        extractionMode === "llm-extraction-from-markdown" ||
+        extractionMode === "llm-extraction-from-raw-html"
+      ) {
+        documents = await generateCompletions(
+          documents,
+          this.extractorOptions,
+          completionMode
+        );
+      }
     }
     return documents.concat(pdfDocuments).concat(docxDocuments);
   }
@@ -588,6 +583,7 @@ export class WebScraperDataProvider {
       removeTags: options.pageOptions?.removeTags ?? [],
       includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
       includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
+      includeExtract: options.pageOptions?.includeExtract ?? options.extractorOptions.mode !== "markdown" ?? false,
       waitFor: options.pageOptions?.waitFor ?? undefined,
       headers: options.pageOptions?.headers ?? undefined,
       includeLinks: options.pageOptions?.includeLinks ?? true,
@@ -617,6 +613,8 @@ export class WebScraperDataProvider {
     this.priority = options.priority;
     this.teamId = options.teamId ?? null;

+
+
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
       if (!url.trim().startsWith("http")) {
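The consolidated gate above only runs LLM extraction for single-URL scrapes where the caller asked for `extract`, and derives the completion input from the extractor mode. That decision reduces to a small mapping; a standalone sketch for clarity (the names here are illustrative, not from the commit):

    // Illustrative decision table for the dispatch above: which input the
    // LLM completion pass receives for each extractor mode, or null for none.
    type ExtractorMode =
      | "markdown"
      | "llm-extraction"
      | "llm-extraction-from-markdown"
      | "llm-extraction-from-raw-html";

    function completionInputFor(mode: ExtractorMode): "markdown" | "raw-html" | null {
      if (mode === "markdown") return null; // plain scrape, no LLM pass
      return mode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
    }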
@@ -130,6 +130,7 @@ export async function scrapSingleUrl(
 ): Promise<Document> {
   pageOptions = {
     includeMarkdown: pageOptions.includeMarkdown ?? true,
+    includeExtract: pageOptions.includeExtract ?? false,
     onlyMainContent: pageOptions.onlyMainContent ?? false,
     includeHtml: pageOptions.includeHtml ?? false,
     includeRawHtml: pageOptions.includeRawHtml ?? false,
@@ -388,11 +389,11 @@ export async function scrapSingleUrl(
   if (screenshot && screenshot.length > 0) {
     document = {
       content: text,
-      markdown: pageOptions.includeMarkdown ? text : undefined,
+      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
       html: pageOptions.includeHtml ? html : undefined,
       rawHtml:
         pageOptions.includeRawHtml ||
-        extractorOptions?.mode === "llm-extraction-from-raw-html"
+        (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
           ? rawHtml
           : undefined,
       linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
@@ -407,11 +408,11 @@ export async function scrapSingleUrl(
   } else {
     document = {
       content: text,
-      markdown: pageOptions.includeMarkdown ? text : undefined,
+      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
       html: pageOptions.includeHtml ? html : undefined,
       rawHtml:
         pageOptions.includeRawHtml ||
-        extractorOptions?.mode === "llm-extraction-from-raw-html"
+        (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
           ? rawHtml
           : undefined,
       metadata: {
@@ -434,7 +435,7 @@ export async function scrapSingleUrl(
     });
     return {
       content: "",
-      markdown: pageOptions.includeMarkdown ? "" : undefined,
+      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
       html: "",
       linksOnPage: pageOptions.includeLinks ? [] : undefined,
       metadata: {
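In `scrapSingleUrl`, markdown is now also kept when it is only needed as LLM input, and raw HTML is only force-kept when extraction will actually run. The two gates, factored out as predicates for clarity (a sketch; the helper names are invented, not the commit's):

    // Sketch of the output gating in scrapSingleUrl (hypothetical helpers):
    // keep markdown when requested directly or needed as extraction input;
    // keep rawHtml when requested directly, or when raw-html extraction runs.
    const keepMarkdown = (p: { includeMarkdown?: boolean; includeExtract?: boolean }) =>
      !!(p.includeMarkdown || p.includeExtract);

    const keepRawHtml = (
      p: { includeRawHtml?: boolean; includeExtract?: boolean },
      extractorMode?: string,
    ) =>
      !!(p.includeRawHtml ||
         (extractorMode === "llm-extraction-from-raw-html" && p.includeExtract));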
@@ -203,6 +203,18 @@ async function processJob(job: Job, token: string) {
       }
     }

+    if(job.data.pageOptions && job.data.pageOptions.includeExtract) {
+      if(!job.data.pageOptions.includeMarkdown) {
+        delete docs[0].markdown;
+      }
+      // if(!job.data.pageOptions.includeRawHtml) {
+      //   delete docs[0].rawHtml;
+      // }
+      // if(!job.data.pageOptions.includeHtml) {
+      //   delete docs[0].html;
+      // }
+    }
+
     const data = {
       success,
       result: {
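This worker-side cleanup exists because markdown is force-included upstream whenever `extract` is requested, so the LLM has input to work from; it is then dropped from the response if the caller never asked for it. The same idea as a small helper (a sketch; the function name is mine, not in the commit):

    // Strip markdown that was only force-included upstream as LLM input:
    // remove it when the caller requested extract but not markdown.
    function stripUnrequestedMarkdown(
      doc: { markdown?: string },
      pageOptions: { includeExtract?: boolean; includeMarkdown?: boolean },
    ) {
      if (pageOptions.includeExtract && !pageOptions.includeMarkdown) {
        delete doc.markdown;
      }
      return doc;
    }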