From f4348024c61e9ce15feeb0928d4d87a91a3f352e Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 13 May 2024 09:13:42 -0300
Subject: [PATCH] Added check during scraping to deal with PDFs

Checks if the URL is a PDF during the scraping process (single_url.ts).

TODO: Run integration tests - Does this strategy affect the running time?

P.S. Some comments need to be removed if we decide to proceed with this strategy.
---
 .../src/__tests__/e2e_withAuth/index.test.ts  | 12 +++-----
 apps/api/src/scraper/WebScraper/index.ts      | 28 ++++++++++++++++++-
 apps/api/src/scraper/WebScraper/single_url.ts | 15 ++++++++--
 .../scraper/WebScraper/utils/pdfProcessor.ts  |  9 ++++--
 4 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index a49b1694..d69a70ba 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -117,7 +117,7 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('content');
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
-    }, 30000); // 30 seconds
+    }, 60000); // 60 seconds

     it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
       const response = await request(TEST_URL)
@@ -132,7 +132,7 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('content');
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
-    }, 30000); // 30 seconds
+    }, 60000); // 60 seconds
   });

   describe("POST /v0/crawl", () => {
@@ -427,10 +427,8 @@ describe("E2E Tests for API Routes", () => {
       .send({ url: "https://jestjs.io" });

     expect(crawlResponse.statusCode).toBe(200);
-
-    // wait for 30 seconds
-    await new Promise((r) => setTimeout(r, 10000));
+    await new Promise((r) => setTimeout(r, 20000));

     const response = await request(TEST_URL)
       .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@@ -439,7 +437,7 @@ describe("E2E Tests for API Routes", () => {
     expect(response.body).toHaveProperty("status");
     expect(response.body.status).toBe("cancelled");

-    await new Promise((r) => setTimeout(r, 20000));
+    await new Promise((r) => setTimeout(r, 10000));

     const completedResponse = await request(TEST_URL)
       .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
@@ -456,8 +454,6 @@ describe("E2E Tests for API Routes", () => {
   }, 60000); // 60 seconds

-
-
   describe("POST /v0/scrape with LLM Extraction", () => {
     it("should extract data using LLM extraction mode", async () => {
       const response = await request(TEST_URL)
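A note on the WebScraper/index.ts changes that follow: the commented-out lines sketch an alternative strategy in which PDF links are split out before processLinks and routed straight through the PDF pipeline, rather than each scraper detecting PDFs by content type. If that strategy is enabled, the wiring on WebScraperDataProvider could look roughly like the sketch below. fetchPdfDocuments is referenced in the comments but not defined in this patch, so its body here is an assumption, as is the exact Document shape:

    // Hypothetical helper on WebScraperDataProvider (not in this patch):
    // map each PDF link to a Document by running it through fetchAndProcessPdf.
    private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
      return Promise.all(
        pdfLinks.map(async (pdfLink) => {
          const content = await fetchAndProcessPdf(pdfLink);
          return {
            content,
            metadata: { sourceURL: pdfLink },
            provider: "web-scraper",
          };
        })
      );
    }

    // Enabled wiring in handleCrawlMode; note the commented code below still
    // passes `links` to processLinks, so PDFs would be scraped twice unless
    // it is switched to `notPdfLinks`:
    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
    // let documents = await this.processLinks(notPdfLinks, inProgress);
    // documents.push(...pdfDocuments);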
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 73eda444..de941e04 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -144,14 +144,23 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }

+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
     let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }

   private async handleSingleUrlsMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-    let documents = await this.processLinks(this.urls, inProgress);
+    const links = this.urls;
+    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
+    let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
     return documents;
   }

@@ -163,7 +172,11 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }

+    // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
+    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
+
     let documents = await this.processLinks(links, inProgress);
+    // documents.push(...pdfDocuments);
     return this.cacheAndFinalizeDocuments(documents, links);
   }

@@ -220,6 +233,19 @@ export class WebScraperDataProvider {
     );
   }

+  private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
+    const checks = links.map(async (link) => ({
+      link,
+      isPdf: await isUrlAPdf({ url: link })
+    }));
+
+    const results = await Promise.all(checks);
+    const pdfLinks = results.filter(result => result.isPdf).map(result => result.link);
+    const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link);
+
+    return [pdfLinks, notPdfLinks];
+  }
+
   private applyPathReplacements(documents: Document[]): Document[] {
     return this.replaceAllPathsWithAbsolutePaths
       ? replacePathsWithAbsolutePaths(documents)
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index c43ea408..33d85182 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";

 dotenv.config();

@@ -66,9 +67,17 @@ export async function scrapWithScrapingBee(
       );
       return "";
     }
-    const decoder = new TextDecoder();
-    const text = decoder.decode(response.data);
-    return text;
+    // Check the content type of the response
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      // Handle PDF content type
+      return fetchAndProcessPdf(url);
+    } else {
+      // Assume the content is text and decode it
+      const decoder = new TextDecoder();
+      const text = decoder.decode(response.data);
+      return text;
+    }
   } catch (error) {
     console.error(`Error scraping with Scraping Bee: ${error}`);
     return "";
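A note on the single_url.ts change above: the content-type branch only covers scrapWithScrapingBee, so the other scraping paths would need the same guard to avoid decoding PDF bytes as text. A shared helper would keep the logic in one place; the following is a minimal sketch, assuming the helper name and signature (neither is part of this patch):

    import { fetchAndProcessPdf } from "./utils/pdfProcessor";

    // Hypothetical shared helper (not in this patch): route PDF responses
    // through the PDF pipeline and decode everything else as text.
    async function decodeOrProcessPdf(
      url: string,
      contentType: string | undefined,
      data: ArrayBuffer
    ): Promise<string> {
      if (contentType && contentType.includes("application/pdf")) {
        return fetchAndProcessPdf(url);
      }
      return new TextDecoder().decode(data);
    }

    // Usage inside a scraper, mirroring the branch added above:
    // return decodeOrProcessPdf(url, response.headers["content-type"], response.data);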
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 67fb1343..a72de308 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
       } else {
         // If the status code is not 200, increment the attempt counter and wait
         attempt++;
-        await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds
+        await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
       }
     } catch (error) {
-      console.error("Error fetching result:", error);
+      console.error("Error fetching result:", error?.data?.detail || error);
       attempt++;
-      await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying
+      await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
       // You may want to handle specific errors differently
     }
   }
@@ -127,7 +127,10 @@ export async function isUrlAPdf({
     if (fastMode) {
       return false;
     }
+    const before = Date.now();
     const response = await axios.head(url);
+    const after = Date.now();
+    console.log(`${after - before}ms - HEAD Request for ${url}`);
     const contentType = response.headers['content-type'];
     return contentType.includes('application/pdf');
   } catch (error) {
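On the running-time TODO: with fastMode off, isUrlAPdf issues one HEAD request per link, and although splitPdfLinks runs the checks in parallel via Promise.all, a single slow host can still stall the whole batch; the console.log above measures that cost but does not bound it. A per-request timeout that falls back to "not a PDF", plus a cheap extension check, would cap the worst case. A minimal sketch under those assumptions (isProbablyPdf and the 2-second budget are not part of this patch):

    import axios from "axios";

    // Hypothetical variant of the HEAD check (not in this patch): cap each
    // request so a slow host cannot stall the crawl; on timeout or error,
    // treat the URL as non-PDF and let the normal scraper handle it.
    async function isProbablyPdf(url: string, timeoutMs = 2000): Promise<boolean> {
      if (url.toLowerCase().endsWith(".pdf")) return true; // cheap fast path
      try {
        const response = await axios.head(url, { timeout: timeoutMs });
        const contentType = response.headers["content-type"];
        // Also guards against a missing content-type header, which would make
        // the unguarded contentType.includes(...) above throw.
        return !!contentType && contentType.includes("application/pdf");
      } catch {
        return false;
      }
    }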