added parsePDF option to pageOptions

The user can decide whether to let us take care of parsing the PDF, or to receive the raw PDF content and parse it themselves.
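For illustration, a minimal sketch of calling the scrape endpoint with the new option (the endpoint, payload shape, and test PDF come from the E2E test below; FIRECRAWL_URL and TEST_API_KEY are placeholder environment variables):

// Hypothetical client call; not part of this commit.
const response = await fetch(`${process.env.FIRECRAWL_URL}/v0/scrape`, {
  method: "POST",
  headers: {
    "Authorization": `Bearer ${process.env.TEST_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
    // parsePDF defaults to true (the API parses the PDF to text);
    // set it to false to receive the raw PDF content instead.
    pageOptions: { parsePDF: false },
  }),
});
const { data } = await response.json();
console.log(data.content); // raw PDF content when parsePDF is false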
rafaelsideguide 2024-06-12 15:06:47 -03:00
parent 48f6c19a05
commit e37d151404
9 changed files with 57 additions and 21 deletions

View File

@@ -136,6 +136,21 @@ describe("E2E Tests for API Routes", () => {
     expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 60000); // 60 seconds
+  it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
+    const response = await request(TEST_URL)
+      .post('/v0/scrape')
+      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+      .set('Content-Type', 'application/json')
+      .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
+    await new Promise((r) => setTimeout(r, 6000));
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty('data');
+    expect(response.body.data).toHaveProperty('content');
+    expect(response.body.data).toHaveProperty('metadata');
+    expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
+  }, 60000); // 60 seconds
   // TODO: add this test back once we nail the waitFor option to be more deterministic
   // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
   //   const startTime = Date.now();

View File

@@ -56,7 +56,11 @@ export async function crawlController(req: Request, res: Response) {
   const mode = req.body.mode ?? "crawl";
   const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+  const pageOptions = req.body.pageOptions ?? {
+    onlyMainContent: false,
+    includeHtml: false,
+    parsePDF: true
+  };
   if (mode === "single_urls" && !url.includes(",")) {
     try {

View File

@@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
+  const pageOptions = req.body.pageOptions ?? {
+    onlyMainContent: false,
+    includeHtml: false,
+    waitFor: 0,
+    screenshot: false,
+    parsePDF: true
+  };
   const extractorOptions = req.body.extractorOptions ?? {
     mode: "markdown"
   }

View File

@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  parsePDF?: boolean
 };

 export type ExtractorOptions = {
View File

@@ -1,5 +1,3 @@
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 export async function handleCustomScraping(
   text: string,
   url: string

View File

@@ -280,7 +280,7 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const pdfContent = await fetchAndProcessPdf(pdfLink);
+        const pdfContent = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
         return {
           content: pdfContent,
           metadata: { sourceURL: pdfLink },
@@ -475,7 +475,12 @@
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+    this.pageOptions = options.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      replaceAllPathsWithAbsolutePaths: false,
+      parsePDF: true
+    };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check

View File

@@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
   url: string,
   waitFor: number = 0,
   screenshot: boolean = false,
-  pageOptions: { scrollXPaths?: string[] } = {},
+  pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
   headers?: Record<string, string>,
   options?: any
 ): Promise<FireEngineResponse> {
@@ -88,7 +88,7 @@
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return { html: await fetchAndProcessPdf(url), screenshot: "" };
+      return { html: await fetchAndProcessPdf(url, pageOptions?.parsePDF), screenshot: "" };
     } else {
       const data = response.data;
       const html = data.content;
@@ -108,7 +108,8 @@
 export async function scrapWithScrapingBee(
   url: string,
   wait_browser: string = "domcontentloaded",
-  timeout: number = universalTimeout
+  timeout: number = universalTimeout,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -129,7 +130,7 @@
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const decoder = new TextDecoder();
       const text = decoder.decode(response.data);
@@ -144,7 +145,8 @@
 export async function scrapWithPlaywright(
   url: string,
   waitFor: number = 0,
-  headers?: Record<string, string>
+  headers?: Record<string, string>,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<string> {
   try {
     const reqParams = await generateRequestParams(url);
@@ -172,7 +174,7 @@
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const textData = response.data;
       try {
@@ -194,7 +196,10 @@
   }
 }
-export async function scrapWithFetch(url: string): Promise<string> {
+export async function scrapWithFetch(
+  url: string,
+  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
+): Promise<string> {
   try {
     const response = await axios.get(url, {
       headers: {
@@ -213,7 +218,7 @@
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      return fetchAndProcessPdf(url);
+      return fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       const text = response.data;
       return text;
@@ -371,7 +376,7 @@ export async function scrapSingleUrl(
        }
        break;
      case "pdf":
-       customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url), screenshot }
+       customScrapedContent = { html: await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF), screenshot }
        break;
    }
  }

View File

@@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
 describe('PDF Processing Module - Integration Test', () => {
   it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
     delete process.env.LLAMAPARSE_API_KEY;
-    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
+    const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
     expect(pdfContent.trim()).toEqual("Dummy PDF file");
   });

View File

@@ -9,9 +9,9 @@ import os from "os";
 dotenv.config();
-export async function fetchAndProcessPdf(url: string): Promise<string> {
+export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<string> {
   const tempFilePath = await downloadPdf(url);
-  const content = await processPdfToText(tempFilePath);
+  const content = await processPdfToText(tempFilePath, parsePDF);
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
   return content;
 }
@@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<string> {
   });
 }
-export async function processPdfToText(filePath: string): Promise<string> {
+export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
   let content = "";
-  if (process.env.LLAMAPARSE_API_KEY) {
+  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
     const apiKey = process.env.LLAMAPARSE_API_KEY;
     const headers = {
       Authorization: `Bearer ${apiKey}`,
@@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
       console.error("Error processing pdf document w/ LlamaIndex(2)");
       content = await processPdf(filePath);
     }
-  } else {
+  } else if (parsePDF) {
     content = await processPdf(filePath);
+  } else {
+    content = fs.readFileSync(filePath, "utf-8");
   }
   return content;
 }