Added a check during scraping to handle PDFs

Checks if the URL is a PDF during the scraping process (single_url.ts).

TODO: Run integration tests — does this strategy affect the running time?

P.S.: Some commented-out code needs to be removed if we decide to proceed with this strategy.
This commit is contained in:
rafaelsideguide 2024-05-13 09:13:42 -03:00
parent 5a2712fa5a
commit f4348024c6
4 changed files with 49 additions and 15 deletions

View File

@ -117,7 +117,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 30000); // 30 seconds
}, 60000); // 60 seconds
it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
const response = await request(TEST_URL)
@ -132,7 +132,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 30000); // 30 seconds
}, 60000); // 60 seconds
});
describe("POST /v0/crawl", () => {
@ -427,10 +427,8 @@ describe("E2E Tests for API Routes", () => {
.send({ url: "https://jestjs.io" });
expect(crawlResponse.statusCode).toBe(200);
// wait for 30 seconds
await new Promise((r) => setTimeout(r, 10000));
await new Promise((r) => setTimeout(r, 20000));
const response = await request(TEST_URL)
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@ -439,7 +437,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("cancelled");
await new Promise((r) => setTimeout(r, 20000));
await new Promise((r) => setTimeout(r, 10000));
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
@ -456,8 +454,6 @@ describe("E2E Tests for API Routes", () => {
}, 60000); // 60 seconds
describe("POST /v0/scrape with LLM Extraction", () => {
it("should extract data using LLM extraction mode", async () => {
const response = await request(TEST_URL)

View File

@ -144,14 +144,23 @@ export class WebScraperDataProvider {
return this.returnOnlyUrlsResponse(links, inProgress);
}
// const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
let documents = await this.processLinks(links, inProgress);
// documents.push(...pdfDocuments);
return this.cacheAndFinalizeDocuments(documents, links);
}
/**
 * Scrapes the explicitly provided URLs (single-URL mode) and returns their
 * documents without caching/finalization.
 *
 * @param inProgress optional callback invoked with scraping progress updates
 * @returns the scraped documents for `this.urls`
 */
private async handleSingleUrlsMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const links = this.urls;
  // NOTE(review): PDF-splitting is disabled while we evaluate this strategy;
  // re-enable by uncommenting the lines below and the push of pdfDocuments.
  // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
  // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
  let documents = await this.processLinks(links, inProgress);
  // documents.push(...pdfDocuments);
  return documents;
}
@ -163,7 +172,11 @@ export class WebScraperDataProvider {
return this.returnOnlyUrlsResponse(links, inProgress);
}
// let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
let documents = await this.processLinks(links, inProgress);
// documents.push(...pdfDocuments);
return this.cacheAndFinalizeDocuments(documents, links);
}
@ -220,6 +233,19 @@ export class WebScraperDataProvider {
);
}
private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
const checks = links.map(async (link) => ({
link,
isPdf: await isUrlAPdf({ url: link })
}));
const results = await Promise.all(checks);
const pdfLinks = results.filter(result => result.isPdf).map(result => result.link);
const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link);
return [pdfLinks, notPdfLinks];
}
private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths
? replacePathsWithAbsolutePaths(documents)

View File

@ -6,6 +6,7 @@ import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
dotenv.config();
@ -66,9 +67,17 @@ export async function scrapWithScrapingBee(
);
return "";
}
const decoder = new TextDecoder();
const text = decoder.decode(response.data);
return text;
// Check the content type of the response
const contentType = response.headers['content-type'];
if (contentType && contentType.includes('application/pdf')) {
// Handle PDF content type
return fetchAndProcessPdf(url);
} else {
// Assume the content is text and decode it
const decoder = new TextDecoder();
const text = decoder.decode(response.data);
return text;
}
} catch (error) {
console.error(`Error scraping with Scraping Bee: ${error}`);
return "";

View File

@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
} else {
// If the status code is not 200, increment the attempt counter and wait
attempt++;
await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
}
} catch (error) {
console.error("Error fetching result:", error);
console.error("Error fetching result:", error.data.detail || '');
attempt++;
await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying
await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
// You may want to handle specific errors differently
}
}
@ -127,7 +127,10 @@ export async function isUrlAPdf({
if (fastMode) {
return false;
}
const before = Date.now();
const response = await axios.head(url);
const after = Date.now();
console.log(`${after - before}ms - HEAD Request for ${url}`);
const contentType = response.headers['content-type'];
return contentType.includes('application/pdf');
} catch (error) {