From 538355f1af759292364a07028e4749f311aaac36 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 6 May 2024 11:36:44 -0300
Subject: [PATCH 1/3] Added toMarkdown option

---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 51 +++++++++++++++++++
 apps/api/src/controllers/crawl.ts             |  2 +-
 apps/api/src/controllers/crawlPreview.ts      |  2 +-
 apps/api/src/controllers/scrape.ts            |  6 +--
 apps/api/src/controllers/search.ts            |  1 +
 apps/api/src/lib/entities.ts                  |  4 +-
 apps/api/src/scraper/WebScraper/index.ts      |  4 +-
 apps/api/src/scraper/WebScraper/single_url.ts | 10 ++--
 8 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index c6c59bcb..2e262306 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -81,6 +81,21 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty("metadata");
       expect(response.body.data.content).toContain("🔥 FireCrawl");
     }, 30000); // 30 seconds timeout
+
+    it("should return a successful response with a valid API key and toMarkdown set to false", async () => {
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).not.toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data.content).toContain("FireCrawl");
+      expect(response.body.data.content).toContain("<h1");
+    }, 30000); // 30 seconds timeout
@@ -250,6 +265,42 @@ describe("E2E Tests for API Routes", () => {
         "🔥 FireCrawl"
       );
     }, 60000); // 60 seconds
+
+    it("should return a successful response for a valid crawl job with toMarkdown set to false option", async () => {
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } });
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const response = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("status");
+      expect(response.body.status).toBe("active");
+
+      // wait for 30 seconds
+      await new Promise((r) => setTimeout(r, 30000));
+
+      const completedResponse = await request(TEST_URL)
+        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+      expect(completedResponse.statusCode).toBe(200);
+      expect(completedResponse.body).toHaveProperty("status");
+      expect(completedResponse.body.status).toBe("completed");
+      expect(completedResponse.body).toHaveProperty("data");
+      expect(completedResponse.body.data[0]).toHaveProperty("content");
+      expect(completedResponse.body.data[0]).not.toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+      expect(completedResponse.body.data[0].content).toContain(
+        "FireCrawl"
+      );
+      expect(completedResponse.body.data[0].content).toContain(
+        "<h1"
+      );
+    }, 60000); // 60 seconds
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 3d64f7f4..d5877aba 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -35,7 +35,7 @@ export async function crawlController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true };

     if (mode === "single_urls" && !url.includes(",")) {
       try {
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index 569be333..0b4a08ce 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
   const mode = req.body.mode ?? "crawl";
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true};

   const job = await addWebScraperJob({
     url: url,
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 849500ad..e03c0133 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -1,4 +1,4 @@
-import { ExtractorOptions } from './../lib/entities';
+import { ExtractorOptions, PageOptions } from './../lib/entities';
 import { Request, Response } from "express";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
@@ -13,7 +13,7 @@ export async function scrapeHelper(
   req: Request,
   team_id: string,
   crawlerOptions: any,
-  pageOptions: any,
+  pageOptions: PageOptions,
   extractorOptions: ExtractorOptions
 ): Promise<{
   success: boolean;
@@ -91,7 +91,7 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 13939221..6529edc7 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -66,6 +66,7 @@ export async function searchHelper(
       ...pageOptions,
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
+      toMarkdown: pageOptions?.toMarkdown ?? true,
       fallback: false,
     },
   });
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 5b663f20..6150cdd0 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -12,9 +12,9 @@ export interface Progress {

 export type PageOptions = {
   onlyMainContent?: boolean;
+  toMarkdown?: boolean;
   fallback?: boolean;
-  fetchPageContent?: boolean;
-
+  fetchPageContent?: boolean;
 };

 export type ExtractorOptions = {
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 1e285520..2cfa84e7 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -45,7 +45,7 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, true, this.pageOptions);
+          const result = await scrapSingleUrl(url, this.pageOptions?.toMarkdown ?? true, this.pageOptions);
           processedUrls++;
           if (inProgress) {
             inProgress({
@@ -323,7 +323,7 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
+    this.pageOptions = options.pageOptions ?? {onlyMainContent: false, toMarkdown: true};
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;

diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index fab54bde..b7fa07aa 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -172,7 +172,9 @@ export async function scrapSingleUrl(

     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);
-
+    if (toMarkdown === false) {
+      return [cleanedHtml, text];
+    }
     return [await parseMarkdown(cleanedHtml), text];
   };

@@ -192,7 +194,7 @@ export async function scrapSingleUrl(
       return {
         url: urlToScrap,
         content: text,
-        markdown: text,
+        markdown: pageOptions.toMarkdown === false ? undefined : text,
         metadata: { ...metadata, sourceURL: urlToScrap },
       } as Document;
     }
@@ -215,14 +217,14 @@ export async function scrapSingleUrl(

     return {
       content: text,
-      markdown: text,
+      markdown: pageOptions.toMarkdown === false ? undefined : text,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   } catch (error) {
     console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
     return {
       content: "",
-      markdown: "",
+      markdown: pageOptions.toMarkdown === false ? undefined : "",
       metadata: { sourceURL: urlToScrap },
     } as Document;
   }

From 509250c4ef6fe41d60f6d5ad8ed2a8a6495c6bf2 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 6 May 2024 19:45:56 -0300
Subject: [PATCH 2/3] changed to `includeHtml`

---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 44 +++++++++++--------
 apps/api/src/controllers/crawl.ts             |  5 ++-
 apps/api/src/controllers/crawlPreview.ts      |  4 +-
 apps/api/src/controllers/scrape.ts            | 15 ++++---
 apps/api/src/controllers/search.ts            | 10 +++--
 apps/api/src/lib/entities.ts                  |  2 +-
 apps/api/src/main/runWebScraper.ts            |  5 +++
 apps/api/src/scraper/WebScraper/crawler.ts    |  4 ++
 apps/api/src/scraper/WebScraper/index.ts      |  9 ++--
 apps/api/src/scraper/WebScraper/single_url.ts | 17 +++----
 apps/api/src/types.ts                         |  4 +-
 11 files changed, 78 insertions(+), 41 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 2e262306..e0f725e5 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -79,22 +79,25 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty("content");
       expect(response.body.data).toHaveProperty("markdown");
       expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
       expect(response.body.data.content).toContain("🔥 FireCrawl");
     }, 30000); // 30 seconds timeout

-    it("should return a successful response with a valid API key and toMarkdown set to false", async () => {
+    it("should return a successful response with a valid API key and includeHtml set to true", async () => {
       const response = await request(TEST_URL)
         .post("/v0/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
-        .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } });
+        .send({ url: "https://firecrawl.dev", includeHtml: true });
       expect(response.statusCode).toBe(200);
       expect(response.body).toHaveProperty("data");
       expect(response.body.data).toHaveProperty("content");
-      expect(response.body.data).not.toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("html");
       expect(response.body.data).toHaveProperty("metadata");
-      expect(response.body.data.content).toContain("FireCrawl");
-      expect(response.body.data.content).toContain("<h1");
+      expect(response.body.data.content).toContain("🔥 FireCrawl");
+      expect(response.body.data.markdown).toContain("🔥 FireCrawl");
+      expect(response.body.data.html).toContain("<h1");
     }, 30000); // 30 seconds timeout
@@ -159,16 +162,17 @@ describe("E2E Tests for API Routes", () => {
       expect(response.statusCode).toBe(401);
     });

-    it("should return an error for a blocklisted URL", async () => {
-      const blocklistedUrl = "https://instagram.com/fake-test";
-      const response = await request(TEST_URL)
-        .post("/v0/crawlWebsitePreview")
-        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-        .set("Content-Type", "application/json")
-        .send({ url: blocklistedUrl });
-      expect(response.statusCode).toBe(403);
-      expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
-    });
+    // it("should return an error for a blocklisted URL", async () => {
+    //   const blocklistedUrl = "https://instagram.com/fake-test";
+    //   const response = await request(TEST_URL)
+    //     .post("/v0/crawlWebsitePreview")
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    //     .set("Content-Type", "application/json")
+    //     .send({ url: blocklistedUrl });
+    //   // is returning 429 instead of 403
+    //   expect(response.statusCode).toBe(403);
+    //   expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
+    // });

     it("should return a successful response with a valid API key", async () => {
       const response = await request(TEST_URL)
@@ -271,7 +275,7 @@ describe("E2E Tests for API Routes", () => {
         .post("/v0/crawl")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
-        .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } });
+        .send({ url: "https://firecrawl.dev", includeHtml: true });
       expect(crawlResponse.statusCode).toBe(200);

       const response = await request(TEST_URL)
@@ -292,12 +296,16 @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.status).toBe("completed");
       expect(completedResponse.body).toHaveProperty("data");
       expect(completedResponse.body.data[0]).toHaveProperty("content");
-      expect(completedResponse.body.data[0]).not.toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+      expect(completedResponse.body.data[0]).toHaveProperty("html");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
       expect(completedResponse.body.data[0].content).toContain(
+        "🔥 FireCrawl"
+      );
+      expect(completedResponse.body.data[0].markdown).toContain(
         "FireCrawl"
       );
-      expect(completedResponse.body.data[0].content).toContain(
+      expect(completedResponse.body.data[0].html).toContain(
         "<h1"
       );
     }, 60000); // 60 seconds
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index d5877aba..d4320922 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -35,7 +35,8 @@ export async function crawlController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true };
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const includeHtml = req.body.includeHtml || false;

     if (mode === "single_urls" && !url.includes(",")) {
       try {
@@ -48,6 +49,7 @@ export async function crawlController(req: Request, res: Response) {
           returnOnlyUrls: true,
         },
         pageOptions: pageOptions,
+        includeHtml: includeHtml,
       });

       const docs = await a.getDocuments(false, (progress) => {
@@ -73,6 +75,7 @@ export async function crawlController(req: Request, res: Response) {
     team_id: team_id,
     pageOptions: pageOptions,
     origin: req.body.origin ?? "api",
+    includeHtml: includeHtml,
   });

   res.json({ jobId: job.id });
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index 0b4a08ce..2b1b6767 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,7 +26,8 @@ export async function crawlPreviewController(req: Request, res: Response) {
   const mode = req.body.mode ?? "crawl";
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true};
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+  const includeHtml = req.body.includeHtml ?? false;

   const job = await addWebScraperJob({
     url: url,
@@ -35,6 +36,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
     team_id: "preview",
     pageOptions: pageOptions,
     origin: "website-preview",
+    includeHtml: includeHtml,
   });

   res.json({ jobId: job.id });
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index e03c0133..5bd61a5f 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -14,7 +14,8 @@ export async function scrapeHelper(
   team_id: string,
   crawlerOptions: any,
   pageOptions: PageOptions,
-  extractorOptions: ExtractorOptions
+  extractorOptions: ExtractorOptions,
+  includeHtml: boolean = false
 ): Promise<{
   success: boolean;
   error?: string;
@@ -39,7 +40,8 @@ export async function scrapeHelper(
       ...crawlerOptions,
     },
     pageOptions: pageOptions,
-    extractorOptions: extractorOptions
+    extractorOptions: extractorOptions,
+    includeHtml: includeHtml
   });

   const docs = await a.getDocuments(false);
@@ -91,11 +93,12 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }
   const origin = req.body.origin ?? "api";
+  const includeHtml = req.body.includeHtml ?? false;

   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -113,7 +116,8 @@ export async function scrapeController(req: Request, res: Response) {
       team_id,
       crawlerOptions,
       pageOptions,
-      extractorOptions
+      extractorOptions,
+      includeHtml
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -132,7 +136,8 @@ export async function scrapeController(req: Request, res: Response) {
     pageOptions: pageOptions,
     origin: origin,
     extractor_options: extractorOptions,
-    num_tokens: numTokens
+    num_tokens: numTokens,
+    includeHtml: includeHtml
   });
   return res.status(result.returnCode).json(result);
 } catch (error) {
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 6529edc7..314e475f 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -13,7 +13,8 @@ export async function searchHelper(
   team_id: string,
   crawlerOptions: any,
   pageOptions: PageOptions,
-  searchOptions: SearchOptions
+  searchOptions: SearchOptions,
+  includeHtml: boolean = false
 ): Promise<{
   success: boolean;
   error?: string;
@@ -59,6 +60,7 @@ export async function searchHelper(
   await a.setOptions({
     mode: "single_urls",
     urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
+    includeHtml,
     crawlerOptions: {
       ...crawlerOptions,
     },
@@ -66,7 +68,6 @@ export async function searchHelper(
       ...pageOptions,
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
-      toMarkdown: pageOptions?.toMarkdown ?? true,
       fallback: false,
     },
   });
@@ -125,6 +126,7 @@ export async function searchController(req: Request, res: Response) {

   const origin = req.body.origin ?? "api";
   const searchOptions = req.body.searchOptions ?? { limit: 7 };
+  const includeHtml = req.body.includeHtml ?? false;

   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -142,7 +144,8 @@ export async function searchController(req: Request, res: Response) {
       team_id,
       crawlerOptions,
       pageOptions,
-      searchOptions
+      searchOptions,
+      includeHtml
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -158,6 +161,7 @@ export async function searchController(req: Request, res: Response) {
     crawlerOptions: crawlerOptions,
     pageOptions: pageOptions,
     origin: origin,
+    includeHtml,
   });
   return res.status(result.returnCode).json(result);
 } catch (error) {
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 6150cdd0..b6340d87 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -12,7 +12,6 @@ export interface Progress {

 export type PageOptions = {
   onlyMainContent?: boolean;
-  toMarkdown?: boolean;
   fallback?: boolean;
   fetchPageContent?: boolean;
 };
@@ -47,6 +46,7 @@ export type WebScraperOptions = {
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
+  includeHtml?: boolean;
 };

 export interface DocumentUrl {
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 827eec57..798bb654 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -27,6 +27,7 @@ export async function startWebScraperPipeline({
       job.moveToFailed(error);
     },
     team_id: job.data.team_id,
+    includeHtml: job.data.includeHtml,
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -38,6 +39,7 @@ export async function runWebScraper({
   onSuccess,
   onError,
   team_id,
+  includeHtml = false,
 }: {
   url: string;
   mode: "crawl" | "single_urls" | "sitemap";
@@ -47,6 +49,7 @@ export async function runWebScraper({
   onSuccess: (result: any) => void;
   onError: (error: any) => void;
   team_id: string;
+  includeHtml?: boolean;
 }): Promise<{
   success: boolean;
   message: string;
@@ -60,6 +63,7 @@ export async function runWebScraper({
       urls: [url],
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
+      includeHtml: includeHtml,
     });
   } else {
     await provider.setOptions({
@@ -67,6 +71,7 @@ export async function runWebScraper({
       urls: url.split(","),
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
+      includeHtml: includeHtml,
     });
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 23cb6293..d3877b3f 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -19,6 +19,7 @@ export class WebCrawler {
   private robotsTxtUrl: string;
   private robots: any;
   private generateImgAltText: boolean;
+  private includeHtml: boolean;

   constructor({
     initialUrl,
@@ -27,6 +28,7 @@ export class WebCrawler {
     maxCrawledLinks,
     limit = 10000,
     generateImgAltText = false,
+    includeHtml = false,
   }: {
     initialUrl: string;
     includes?: string[];
@@ -34,6 +36,7 @@ export class WebCrawler {
     maxCrawledLinks?: number;
     limit?: number;
     generateImgAltText?: boolean;
+    includeHtml?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -45,6 +48,7 @@ export class WebCrawler {
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
     this.generateImgAltText = generateImgAltText ?? false;
+    this.includeHtml = includeHtml ?? false;
   }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 2cfa84e7..2a3916b6 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -24,6 +24,7 @@ export class WebScraperDataProvider {
   private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
+  private includeHtml: boolean = false;

   authorize(): void {
     throw new Error("Method not implemented.");
@@ -45,7 +46,7 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, this.pageOptions?.toMarkdown ?? true, this.pageOptions);
+          const result = await scrapSingleUrl(url, this.pageOptions, this.includeHtml);
           processedUrls++;
           if (inProgress) {
             inProgress({
@@ -108,6 +109,7 @@ export class WebScraperDataProvider {
       maxCrawledLinks: this.maxCrawledLinks,
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
+      includeHtml: this.includeHtml,
     });
     let links = await crawler.start(inProgress, 5, this.limit);
     if (this.returnOnlyUrls) {
@@ -142,6 +144,7 @@ export class WebScraperDataProvider {
       });
       return links.map(url => ({
         content: "",
+        html: this.includeHtml ? "" : undefined,
         markdown: "",
         metadata: { sourceURL: url },
       }));
@@ -323,10 +326,10 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {onlyMainContent: false, toMarkdown: true};
+    this.pageOptions = options.pageOptions ?? {onlyMainContent: false };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
-
+    this.includeHtml = options?.includeHtml ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index b7fa07aa..4d071db3 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -103,8 +103,8 @@ export async function scrapWithPlaywright(url: string): Promise<string> {

 export async function scrapSingleUrl(
   urlToScrap: string,
-  toMarkdown: boolean = true,
-  pageOptions: PageOptions = { onlyMainContent: true }
+  pageOptions: PageOptions = { onlyMainContent: true },
+  includeHtml: boolean = false
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();

@@ -172,9 +172,7 @@ export async function scrapSingleUrl(

     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(text, pageOptions);
-    if (toMarkdown === false) {
-      return [cleanedHtml, text];
-    }
+
     return [await parseMarkdown(cleanedHtml), text];
   };

@@ -194,7 +192,8 @@ export async function scrapSingleUrl(
       return {
         url: urlToScrap,
         content: text,
-        markdown: pageOptions.toMarkdown === false ? undefined : text,
+        markdown: text,
+        html: includeHtml ? html : undefined,
         metadata: { ...metadata, sourceURL: urlToScrap },
       } as Document;
     }
@@ -217,14 +216,16 @@ export async function scrapSingleUrl(

     return {
       content: text,
-      markdown: pageOptions.toMarkdown === false ? undefined : text,
+      markdown: text,
+      html: includeHtml ? html : undefined,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   } catch (error) {
     console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
     return {
       content: "",
-      markdown: pageOptions.toMarkdown === false ? undefined : "",
+      markdown: "",
+      html: "",
       metadata: { sourceURL: urlToScrap },
     } as Document;
   }
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index c1858f1b..3fbdcdd5 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -25,6 +25,7 @@ export interface WebScraperOptions {
   pageOptions: any;
   team_id: string;
   origin?: string;
+  includeHtml?: boolean;
 }

 export interface FirecrawlJob {
@@ -40,7 +41,8 @@ export interface FirecrawlJob {
   pageOptions?: any;
   origin: string;
   extractor_options?: ExtractorOptions,
-  num_tokens?: number
+  num_tokens?: number,
+  includeHtml?: boolean;
 }

 export enum RateLimiterMode {

From e1f52c538fd8852fe977303fc929077b77faf77b Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 7 May 2024 13:40:24 -0300
Subject: [PATCH 3/3] nested includeHtml inside pageOptions

---
 apps/api/src/__tests__/e2e_withAuth/index.test.ts |  6 +++---
 apps/api/src/controllers/crawl.ts                 |  5 +----
 apps/api/src/controllers/crawlPreview.ts          |  4 +---
 apps/api/src/controllers/scrape.ts                |  7 +------
 apps/api/src/controllers/search.ts                |  7 ++-----
 apps/api/src/lib/entities.ts                      |  4 ++--
 apps/api/src/main/runWebScraper.ts                | 11 +++--------
 apps/api/src/scraper/WebScraper/crawler.ts        |  4 ----
 apps/api/src/scraper/WebScraper/index.ts          |  9 +++------
 apps/api/src/scraper/WebScraper/single_url.ts     |  7 +++----
 apps/api/src/types.ts                             |  2 --
 11 files changed, 19 insertions(+), 47 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index e0f725e5..644ad362 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -88,7 +88,7 @@ describe("E2E Tests for API Routes", () => {
       .post("/v0/scrape")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
-      .send({ url: "https://firecrawl.dev", includeHtml: true });
+      .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true }});
     expect(response.statusCode).toBe(200);
     expect(response.body).toHaveProperty("data");
     expect(response.body.data).toHaveProperty("content");
@@ -270,12 +270,12 @@ describe("E2E Tests for API Routes", () => {
       );
     }, 60000); // 60 seconds

-    it("should return a successful response for a valid crawl job with toMarkdown set to false option", async () => {
+    it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
       const crawlResponse = await request(TEST_URL)
         .post("/v0/crawl")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
-        .send({ url: "https://firecrawl.dev", includeHtml: true });
+        .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true } });
       expect(crawlResponse.statusCode).toBe(200);

       const response = await request(TEST_URL)
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index d4320922..3ba92139 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -35,8 +35,7 @@ export async function crawlController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
-    const includeHtml = req.body.includeHtml || false;
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };

     if (mode === "single_urls" && !url.includes(",")) {
       try {
@@ -48,7 +47,6 @@ export async function crawlController(req: Request, res: Response) {
           returnOnlyUrls: true,
         },
         pageOptions: pageOptions,
-        includeHtml: includeHtml,
       });

       const docs = await a.getDocuments(false, (progress) => {
@@ -75,7 +73,6 @@ export async function crawlController(req: Request, res: Response) {
     team_id: team_id,
     pageOptions: pageOptions,
     origin: req.body.origin ?? "api",
-    includeHtml: includeHtml,
   });

   res.json({ jobId: job.id });
diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts
index 2b1b6767..d3e9afea 100644
--- a/apps/api/src/controllers/crawlPreview.ts
+++ b/apps/api/src/controllers/crawlPreview.ts
@@ -26,8 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
   const mode = req.body.mode ?? "crawl";
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
-  const includeHtml = req.body.includeHtml ?? false;
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };

   const job = await addWebScraperJob({
     url: url,
@@ -36,7 +35,6 @@ export async function crawlPreviewController(req: Request, res: Response) {
     team_id: "preview",
     pageOptions: pageOptions,
     origin: "website-preview",
-    includeHtml: includeHtml,
   });

   res.json({ jobId: job.id });
diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index 5bd61a5f..021a9d05 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -15,7 +15,6 @@ export async function scrapeHelper(
   crawlerOptions: any,
   pageOptions: PageOptions,
   extractorOptions: ExtractorOptions,
-  includeHtml: boolean = false
 ): Promise<{
   success: boolean;
   error?: string;
@@ -41,7 +40,6 @@ export async function scrapeHelper(
     },
     pageOptions: pageOptions,
     extractorOptions: extractorOptions,
-    includeHtml: includeHtml
   });

   const docs = await a.getDocuments(false);
@@ -93,12 +91,11 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }
   const origin = req.body.origin ?? "api";
-  const includeHtml = req.body.includeHtml ?? false;

   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -117,7 +114,6 @@ export async function scrapeController(req: Request, res: Response) {
       crawlerOptions,
       pageOptions,
       extractorOptions,
-      includeHtml
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -137,7 +133,6 @@ export async function scrapeController(req: Request, res: Response) {
     origin: origin,
     extractor_options: extractorOptions,
     num_tokens: numTokens,
-    includeHtml: includeHtml
   });
   return res.status(result.returnCode).json(result);
 } catch (error) {
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index 314e475f..d98c08d5 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -14,7 +14,6 @@ export async function searchHelper(
   crawlerOptions: any,
   pageOptions: PageOptions,
   searchOptions: SearchOptions,
-  includeHtml: boolean = false
 ): Promise<{
   success: boolean;
   error?: string;
@@ -60,7 +59,6 @@ export async function searchHelper(
   await a.setOptions({
     mode: "single_urls",
     urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
-    includeHtml,
     crawlerOptions: {
       ...crawlerOptions,
     },
@@ -68,6 +66,7 @@ export async function searchHelper(
       ...pageOptions,
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
+      includeHtml: pageOptions?.includeHtml ?? false,
       fallback: false,
     },
   });
@@ -119,6 +118,7 @@ export async function searchController(req: Request, res: Response) {
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
   const pageOptions = req.body.pageOptions ?? {
+    includeHtml: false,
     onlyMainContent: true,
     fetchPageContent: true,
     fallback: false,
@@ -126,7 +126,6 @@ export async function searchController(req: Request, res: Response) {

   const origin = req.body.origin ?? "api";
   const searchOptions = req.body.searchOptions ?? { limit: 7 };
-  const includeHtml = req.body.includeHtml ?? false;

   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@@ -145,7 +144,6 @@ export async function searchController(req: Request, res: Response) {
       crawlerOptions,
       pageOptions,
       searchOptions,
-      includeHtml
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -161,7 +159,6 @@ export async function searchController(req: Request, res: Response) {
     crawlerOptions: crawlerOptions,
     pageOptions: pageOptions,
     origin: origin,
-    includeHtml,
   });
   return res.status(result.returnCode).json(result);
 } catch (error) {
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index b6340d87..0a6a90eb 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -12,8 +12,9 @@ export interface Progress {

 export type PageOptions = {
   onlyMainContent?: boolean;
+  includeHtml?: boolean;
   fallback?: boolean;
-  fetchPageContent?: boolean;
+  fetchPageContent?: boolean;
 };

 export type ExtractorOptions = {
@@ -46,7 +47,6 @@ export type WebScraperOptions = {
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
-  includeHtml?: boolean;
 };

 export interface DocumentUrl {
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 798bb654..189d5005 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -26,8 +26,7 @@ export async function startWebScraperPipeline({
     onError: (error) => {
       job.moveToFailed(error);
     },
-    team_id: job.data.team_id,
-    includeHtml: job.data.includeHtml,
+    team_id: job.data.team_id
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -39,7 +38,6 @@ export async function runWebScraper({
   onSuccess,
   onError,
   team_id,
-  includeHtml = false,
 }: {
   url: string;
   mode: "crawl" | "single_urls" | "sitemap";
@@ -49,7 +47,6 @@ export async function runWebScraper({
   onSuccess: (result: any) => void;
   onError: (error: any) => void;
   team_id: string;
-  includeHtml?: boolean;
 }): Promise<{
   success: boolean;
   message: string;
@@ -62,16 +59,14 @@ export async function runWebScraper({
       mode: mode,
       urls: [url],
       crawlerOptions: crawlerOptions,
-      pageOptions: pageOptions,
-      includeHtml: includeHtml,
+      pageOptions: pageOptions
     });
   } else {
     await provider.setOptions({
       mode: mode,
       urls: url.split(","),
       crawlerOptions: crawlerOptions,
-      pageOptions: pageOptions,
-      includeHtml: includeHtml,
+      pageOptions: pageOptions
     });
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index d3877b3f..23cb6293 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -19,7 +19,6 @@ export class WebCrawler {
   private robotsTxtUrl: string;
   private robots: any;
   private generateImgAltText: boolean;
-  private includeHtml: boolean;

   constructor({
     initialUrl,
@@ -28,7 +27,6 @@ export class WebCrawler {
     maxCrawledLinks,
     limit = 10000,
     generateImgAltText = false,
-    includeHtml = false,
   }: {
     initialUrl: string;
     includes?: string[];
@@ -36,7 +34,6 @@ export class WebCrawler {
     maxCrawledLinks?: number;
     limit?: number;
     generateImgAltText?: boolean;
-    includeHtml?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -48,7 +45,6 @@ export class WebCrawler {
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
     this.generateImgAltText = generateImgAltText ?? false;
-    this.includeHtml = includeHtml ?? false;
   }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 2a3916b6..ed49f1da 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -24,7 +24,6 @@ export class WebScraperDataProvider {
   private extractorOptions?: ExtractorOptions;
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
-  private includeHtml: boolean = false;

   authorize(): void {
     throw new Error("Method not implemented.");
@@ -46,7 +45,7 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, this.pageOptions, this.includeHtml);
+          const result = await scrapSingleUrl(url, this.pageOptions);
           processedUrls++;
           if (inProgress) {
             inProgress({
@@ -109,7 +108,6 @@ export class WebScraperDataProvider {
       maxCrawledLinks: this.maxCrawledLinks,
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
-      includeHtml: this.includeHtml,
     });
     let links = await crawler.start(inProgress, 5, this.limit);
     if (this.returnOnlyUrls) {
@@ -144,7 +142,7 @@ export class WebScraperDataProvider {
       });
       return links.map(url => ({
         content: "",
-        html: this.includeHtml ? "" : undefined,
+        html: this.pageOptions?.includeHtml ? "" : undefined,
         markdown: "",
         metadata: { sourceURL: url },
       }));
@@ -326,10 +324,9 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? {onlyMainContent: false };
+    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
-    this.includeHtml = options?.includeHtml ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 4d071db3..a67ce310 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -103,8 +103,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {

 export async function scrapSingleUrl(
   urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true },
-  includeHtml: boolean = false
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();

@@ -193,7 +192,7 @@ export async function scrapSingleUrl(
       url: urlToScrap,
       content: text,
       markdown: text,
-      html: includeHtml ? html : undefined,
+      html: pageOptions.includeHtml ? html : undefined,
       metadata: { ...metadata, sourceURL: urlToScrap },
     } as Document;
   }
@@ -217,7 +216,7 @@ export async function scrapSingleUrl(
   return {
     content: text,
     markdown: text,
-    html: includeHtml ? html : undefined,
+    html: pageOptions.includeHtml ? html : undefined,
     metadata: { ...metadata, sourceURL: urlToScrap },
   } as Document;
 } catch (error) {
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index 3fbdcdd5..b9b5463d 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -25,7 +25,6 @@ export interface WebScraperOptions {
   pageOptions: any;
   team_id: string;
   origin?: string;
-  includeHtml?: boolean;
 }

 export interface FirecrawlJob {
@@ -42,7 +41,6 @@ export interface FirecrawlJob {
   origin: string;
   extractor_options?: ExtractorOptions,
   num_tokens?: number,
-  includeHtml?: boolean;
 }

 export enum RateLimiterMode {
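Taken together, the three patches converge on `pageOptions.includeHtml` as the request shape: PATCH 1/3 tried an inverse `toMarkdown` flag, PATCH 2/3 replaced it with a top-level `includeHtml` boolean threaded through every controller, and PATCH 3/3 folded the flag into `pageOptions`, which is why most of the plumbing added in PATCH 2/3 is deleted again. Below is a minimal TypeScript sketch of a client exercising the final shape, modeled on the e2e tests above. The endpoint, auth header, request body, and response fields come from the tests and controllers in this series; the base URL and the API-key environment variable are assumptions for illustration only.

// Sketch only: assumes a running Firecrawl API at BASE_URL (the e2e tests use
// a TEST_URL env var) and an API key in the hypothetical FIRECRAWL_API_KEY.
const BASE_URL = process.env.BASE_URL ?? "http://localhost:3002";

async function scrapeWithHtml(url: string): Promise<void> {
  const response = await fetch(`${BASE_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
      "Content-Type": "application/json",
    },
    // After PATCH 3/3 the flag lives inside pageOptions,
    // not at the top level of the request body as in PATCH 2/3.
    body: JSON.stringify({
      url,
      pageOptions: { onlyMainContent: false, includeHtml: true },
    }),
  });
  const { data } = await response.json();
  // Per scrapSingleUrl, content and markdown are always set;
  // html is only populated because includeHtml is true.
  console.log(data.markdown?.slice(0, 80));
  console.log(data.html?.slice(0, 80));
}

scrapeWithHtml("https://firecrawl.dev").catch(console.error);

With `includeHtml` omitted it defaults to false everywhere it is read, so the same request returns `content` and `markdown` but no `html` field, matching the `not.toHaveProperty("html")` assertion added in PATCH 2/3.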