Nicolas 2024-08-29 20:08:06 -03:00
parent 63264644e1
commit 49e1cb7ca0
9 changed files with 73 additions and 32 deletions

View File

@@ -1,6 +1,6 @@
import { Request, Response } from "express";
import { Logger } from '../../lib/logger';
import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
import { Document, legacyDocumentConverter, legacyExtractorOptions, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from 'uuid';
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
@@ -16,6 +16,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
const origin = req.body.origin;
const timeout = req.body.timeout;
const pageOptions = legacyScrapeOptions(req.body);
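// Translate the v1 "extract" options into the legacy ExtractorOptions shape expected by the job queue.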
const extractorOptions = legacyExtractorOptions(req.body.extract);
const jobId = uuidv4();
const startTime = new Date().getTime();
@@ -27,7 +28,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
crawlerOptions: {},
team_id: req.auth.team_id,
pageOptions,
extractorOptions: {},
extractorOptions,
origin: req.body.origin,
is_scrape: true,
}, {}, jobId, jobPriority);

View File

@@ -1,7 +1,7 @@
import { Request, Response } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities";
import { ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types";
@@ -11,7 +11,8 @@ export type Format =
| "rawHtml"
| "links"
| "screenshot"
| "screenshot@fullPage";
| "screenshot@fullPage"
| "extract";
export const url = z.preprocess(
(x) => {
@@ -40,6 +41,14 @@ export const url = z.preprocess(
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
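// Options for the new "extract" format: LLM-driven extraction guided by an optional schema and prompt.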
export const extractOptions = z.object({
mode: z.enum(["llm"]).default("llm"),
schema: z.any().optional(),
prompt: z.string().default("Based on the information on the page, extract the information from the schema.")
}).strict(strictMessage);
export type ExtractOptions = z.infer<typeof extractOptions>;
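// Illustrative example (not part of this commit): a v1 scrape request body that
// opts into the new "extract" format. The URL and schema are placeholders.
//
// {
//   "url": "https://example.com",
//   "formats": ["markdown", "extract"],
//   "extract": {
//     "schema": { "type": "object", "properties": { "title": { "type": "string" } } },
//     "prompt": "Extract the page title."
//   }
// }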
export const scrapeOptions = z.object({
formats: z
.enum([
@@ -49,6 +58,7 @@ export const scrapeOptions = z.object({
"links",
"screenshot",
"screenshot@fullPage",
"extract"
])
.array()
.optional()
@@ -59,6 +69,7 @@ export const scrapeOptions = z.object({
onlyMainContent: z.boolean().default(true),
timeout: z.number().int().positive().finite().safe().default(30000), // default?
waitFor: z.number().int().nonnegative().finite().safe().default(0),
extract: extractOptions.optional(),
parsePDF: z.boolean().default(true),
}).strict(strictMessage);
@@ -118,6 +129,13 @@ export const crawlRequestSchema = crawlerOptions.extend({
// scrapeOptions?: Exclude<ScrapeRequest, "url">;
// };
// export type ExtractorOptions = {
// mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
// extractionPrompt?: string;
// extractionSchema?: Record<string, any>;
// }
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions.extend({
@@ -138,6 +156,7 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = {
markdown?: string;
extract?: string;
html?: string;
rawHtml?: string;
links?: string[];
@@ -280,6 +299,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
includeMarkdown: x.formats.includes("markdown"),
includeHtml: x.formats.includes("html"),
includeRawHtml: x.formats.includes("rawHtml"),
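// Request LLM extraction output whenever "extract" appears in the requested formats.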
includeExtract: x.formats.includes("extract"),
onlyIncludeTags: x.includeTags,
removeTags: x.excludeTags,
onlyMainContent: x.onlyMainContent,
@@ -291,6 +311,14 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
};
}
export function legacyExtractorOptions(x?: ExtractOptions): ExtractorOptions {
  if (!x) {
    // No extract options were supplied (the field is optional on ScrapeOptions),
    // so fall back to plain markdown scraping.
    return { mode: "markdown" };
  }
  return {
    mode: x.mode === "llm" ? "llm-extraction" : "markdown",
    extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
    extractionSchema: x.schema,
  };
}
export function legacyDocumentConverter(doc: any): Document {
if (doc.metadata) {
if (doc.metadata.screenshot) {
@@ -309,6 +337,7 @@ export function legacyDocumentConverter(doc: any): Document {
links: doc.linksOnPage,
rawHtml: doc.rawHtml,
html: doc.html,
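// Surface the legacy llm_extraction result as the v1 "extract" field.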
extract: doc.llm_extraction,
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
metadata: {
...doc.metadata,

View File

@@ -25,6 +25,7 @@ export async function generateCompletions(
case "openAI":
const llm = new OpenAI();
try {
const completionResult = await generateOpenAICompletions({
client: llm,
document: document,

View File

@@ -25,6 +25,7 @@ function prepareOpenAIDoc(
extractionTarget = document.rawHtml;
}
// Check that the extraction target (markdown or raw HTML) exists in the document
if (!extractionTarget) {
return null;
@@ -43,7 +44,6 @@ function prepareOpenAIDoc(
// trim the document to the maximum number of tokens, tokens != characters
extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
}
return [[{ type: "text", text: extractionTarget }], numTokens];
}
@@ -73,7 +73,6 @@ export async function generateOpenAICompletions({
warning: "LLM extraction was not performed since the document's content is empty or missing.",
};
}
const [content, numTokens] = preparedDoc;
const completion = await openai.chat.completions.create({

View File

@@ -12,6 +12,7 @@ export interface Progress {
export type PageOptions = {
includeMarkdown?: boolean;
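// Set when the v1 "extract" format is requested; extraction reuses the markdown (or raw HTML) capture.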
includeExtract?: boolean;
onlyMainContent?: boolean;
includeHtml?: boolean;
includeRawHtml?: boolean;

View File

@@ -31,7 +31,6 @@ it('should return a list of links on the firecrawl.ai page', async () => {
// Check if the result contains a list of links
expect(result.linksOnPage).toBeDefined();
console.log({result});
expect(Array.isArray(result.linksOnPage)).toBe(true);
expect(result.linksOnPage.length).toBeGreaterThan(0);
expect(result.linksOnPage).toContain('https://flutterbricks.com/features')

View File

@@ -305,26 +305,21 @@ export class WebScraperDataProvider {
}
// documents = await this.applyImgAltText(documents);
if (
(this.extractorOptions.mode === "llm-extraction" ||
this.extractorOptions.mode === "llm-extraction-from-markdown") &&
this.mode === "single_urls"
) {
documents = await generateCompletions(
documents,
this.extractorOptions,
"markdown"
);
}
if (
this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
this.mode === "single_urls"
) {
documents = await generateCompletions(
documents,
this.extractorOptions,
"raw-html"
);
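// Run LLM extraction only for single-URL scrapes that explicitly requested the "extract" output.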
if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
const extractionMode = this.extractorOptions.mode;
const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
if (
extractionMode === "llm-extraction" ||
extractionMode === "llm-extraction-from-markdown" ||
extractionMode === "llm-extraction-from-raw-html"
) {
documents = await generateCompletions(
documents,
this.extractorOptions,
completionMode
);
}
}
return documents.concat(pdfDocuments).concat(docxDocuments);
}
@@ -588,6 +583,7 @@ export class WebScraperDataProvider {
removeTags: options.pageOptions?.removeTags ?? [],
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
includeExtract: options.pageOptions?.includeExtract ?? ((options.extractorOptions?.mode ?? "markdown") !== "markdown"),
waitFor: options.pageOptions?.waitFor ?? undefined,
headers: options.pageOptions?.headers ?? undefined,
includeLinks: options.pageOptions?.includeLinks ?? true,
@@ -617,6 +613,8 @@ export class WebScraperDataProvider {
this.priority = options.priority;
this.teamId = options.teamId ?? null;
// make sure all urls start with https://
this.urls = this.urls.map((url) => {
if (!url.trim().startsWith("http")) {

View File

@@ -130,6 +130,7 @@ export async function scrapSingleUrl(
): Promise<Document> {
pageOptions = {
includeMarkdown: pageOptions.includeMarkdown ?? true,
includeExtract: pageOptions.includeExtract ?? false,
onlyMainContent: pageOptions.onlyMainContent ?? false,
includeHtml: pageOptions.includeHtml ?? false,
includeRawHtml: pageOptions.includeRawHtml ?? false,
@@ -388,11 +389,11 @@ export async function scrapSingleUrl(
if (screenshot && screenshot.length > 0) {
document = {
content: text,
markdown: pageOptions.includeMarkdown ? text : undefined,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
extractorOptions?.mode === "llm-extraction-from-raw-html"
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml
: undefined,
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
@@ -407,11 +408,11 @@ export async function scrapSingleUrl(
} else {
document = {
content: text,
markdown: pageOptions.includeMarkdown ? text : undefined,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
extractorOptions?.mode === "llm-extraction-from-raw-html"
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml
: undefined,
metadata: {
@@ -434,7 +435,7 @@ export async function scrapSingleUrl(
});
return {
content: "",
markdown: pageOptions.includeMarkdown ? "" : undefined,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
html: "",
linksOnPage: pageOptions.includeLinks ? [] : undefined,
metadata: {

View File

@@ -203,6 +203,18 @@ async function processJob(job: Job, token: string) {
}
}
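// Markdown is always captured when "extract" is requested (it feeds the extractor),
// so drop it from the response when the caller did not ask for it.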
if (job.data.pageOptions && job.data.pageOptions.includeExtract) {
if (!job.data.pageOptions.includeMarkdown) {
delete docs[0].markdown;
}
// if(!job.data.pageOptions.includeRawHtml) {
// delete docs[0].rawHtml;
// }
// if(!job.data.pageOptions.includeHtml) {
// delete docs[0].html;
// }
}
const data = {
success,
result: {