mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
fix(scrapeURL/pdf,docx): ignore SSL when downloading PDF
This commit is contained in:
parent
7081beff1f
commit
16e850288c
|
@ -113,6 +113,7 @@
|
|||
"turndown": "^7.1.3",
|
||||
"turndown-plugin-gfm": "^1.0.2",
|
||||
"typesense": "^1.5.4",
|
||||
"undici": "^6.20.1",
|
||||
"unstructured-client": "^0.11.3",
|
||||
"uuid": "^10.0.0",
|
||||
"winston": "^3.14.2",
|
||||
|
|
|
@ -197,6 +197,9 @@ importers:
|
|||
typesense:
|
||||
specifier: ^1.5.4
|
||||
version: 1.8.2(@babel/runtime@7.24.6)
|
||||
undici:
|
||||
specifier: ^6.20.1
|
||||
version: 6.20.1
|
||||
unstructured-client:
|
||||
specifier: ^0.11.3
|
||||
version: 0.11.3(zod@3.23.8)
|
||||
|
@ -3957,6 +3960,10 @@ packages:
|
|||
undici-types@5.26.5:
|
||||
resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
|
||||
|
||||
undici@6.20.1:
|
||||
resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==}
|
||||
engines: {node: '>=18.17'}
|
||||
|
||||
union@0.5.0:
|
||||
resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==}
|
||||
engines: {node: '>= 0.8.0'}
|
||||
|
@ -8341,6 +8348,8 @@ snapshots:
|
|||
|
||||
undici-types@5.26.5: {}
|
||||
|
||||
undici@6.20.1: {}
|
||||
|
||||
union@0.5.0:
|
||||
dependencies:
|
||||
qs: 6.12.2
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import { Meta } from "../..";
|
||||
import { EngineScrapeResult } from "..";
|
||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||
import { downloadFile } from "../utils/downloadFile";
|
||||
import mammoth from "mammoth";
|
||||
|
||||
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
||||
|
|
|
@ -62,7 +62,7 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
|
|||
schema: z.object({
|
||||
markdown: z.string(),
|
||||
}),
|
||||
tryCount: 16,
|
||||
tryCount: 32,
|
||||
tryCooldown: 250,
|
||||
});
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import { createWriteStream, promises as fs } from "node:fs";
|
|||
import { EngineError } from "../../error";
|
||||
import { Writable } from "stream";
|
||||
import { v4 as uuid } from "uuid";
|
||||
import * as undici from "undici";
|
||||
|
||||
export async function fetchFileToBuffer(url: string): Promise<{
|
||||
response: Response,
|
||||
|
@ -17,13 +18,21 @@ export async function fetchFileToBuffer(url: string): Promise<{
|
|||
}
|
||||
|
||||
export async function downloadFile(id: string, url: string): Promise<{
|
||||
response: Response
|
||||
response: undici.Response
|
||||
tempFilePath: string
|
||||
}> {
|
||||
const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
|
||||
const tempFileWrite = createWriteStream(tempFilePath);
|
||||
|
||||
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
|
||||
// TODO: maybe we could use tlsclient for this? for proxying
|
||||
// Use undici so we can skip TLS certificate verification (rejectUnauthorized: false) for now.
|
||||
const response = await undici.fetch(url, {
|
||||
dispatcher: new undici.Agent({
|
||||
connect: {
|
||||
rejectUnauthorized: false,
|
||||
},
|
||||
})
|
||||
});
|
||||
|
||||
// This should never happen in the current state of JS (2024), but let's check anyway.
|
||||
if (response.body === null) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user