mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
fix(scrapeURL/pdf): fix llamaparse upload
This commit is contained in:
parent
687ea69621
commit
9ace2ad071
|
@ -81,7 +81,6 @@
|
||||||
"escape-html": "^1.0.3",
|
"escape-html": "^1.0.3",
|
||||||
"express-rate-limit": "^7.3.1",
|
"express-rate-limit": "^7.3.1",
|
||||||
"express-ws": "^5.0.2",
|
"express-ws": "^5.0.2",
|
||||||
"form-data": "^4.0.0",
|
|
||||||
"glob": "^10.4.2",
|
"glob": "^10.4.2",
|
||||||
"gpt3-tokenizer": "^1.1.5",
|
"gpt3-tokenizer": "^1.1.5",
|
||||||
"ioredis": "^5.4.1",
|
"ioredis": "^5.4.1",
|
||||||
|
|
|
@ -101,9 +101,6 @@ importers:
|
||||||
express-ws:
|
express-ws:
|
||||||
specifier: ^5.0.2
|
specifier: ^5.0.2
|
||||||
version: 5.0.2(express@4.19.2)
|
version: 5.0.2(express@4.19.2)
|
||||||
form-data:
|
|
||||||
specifier: ^4.0.0
|
|
||||||
version: 4.0.0
|
|
||||||
glob:
|
glob:
|
||||||
specifier: ^10.4.2
|
specifier: ^10.4.2
|
||||||
version: 10.4.2
|
version: 10.4.2
|
||||||
|
@ -3932,8 +3929,8 @@ packages:
|
||||||
engines: {node: '>=14.17'}
|
engines: {node: '>=14.17'}
|
||||||
hasBin: true
|
hasBin: true
|
||||||
|
|
||||||
typescript@5.6.2:
|
typescript@5.6.3:
|
||||||
resolution: {integrity: sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==}
|
resolution: {integrity: sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==}
|
||||||
engines: {node: '>=14.17'}
|
engines: {node: '>=14.17'}
|
||||||
hasBin: true
|
hasBin: true
|
||||||
|
|
||||||
|
@ -7742,7 +7739,7 @@ snapshots:
|
||||||
csv-parse: 5.5.6
|
csv-parse: 5.5.6
|
||||||
gpt3-tokenizer: 1.1.5
|
gpt3-tokenizer: 1.1.5
|
||||||
openai: 3.3.0
|
openai: 3.3.0
|
||||||
typescript: 5.6.2
|
typescript: 5.6.3
|
||||||
uuid: 9.0.1
|
uuid: 9.0.1
|
||||||
zod: 3.23.8
|
zod: 3.23.8
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
|
@ -8320,7 +8317,7 @@ snapshots:
|
||||||
|
|
||||||
typescript@5.4.5: {}
|
typescript@5.4.5: {}
|
||||||
|
|
||||||
typescript@5.6.2: {}
|
typescript@5.6.3: {}
|
||||||
|
|
||||||
typesense@1.8.2(@babel/runtime@7.24.6):
|
typesense@1.8.2(@babel/runtime@7.24.6):
|
||||||
dependencies:
|
dependencies:
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import { createReadStream, promises as fs } from "node:fs";
|
import { createReadStream, promises as fs } from "node:fs";
|
||||||
import FormData from "form-data";
|
|
||||||
import { Meta } from "../..";
|
import { Meta } from "../..";
|
||||||
import { EngineScrapeResult } from "..";
|
import { EngineScrapeResult } from "..";
|
||||||
import * as marked from "marked";
|
import * as marked from "marked";
|
||||||
|
@ -16,10 +15,26 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
|
||||||
meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });
|
meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });
|
||||||
|
|
||||||
const uploadForm = new FormData();
|
const uploadForm = new FormData();
|
||||||
uploadForm.append("file", createReadStream(tempFilePath), {
|
|
||||||
filename: tempFilePath,
|
// This is utterly stupid but it works! - mogery
|
||||||
contentType: "application/pdf", // NOTE: request.headers["Content-Type"]?
|
uploadForm.append("file", {
|
||||||
});
|
[Symbol.toStringTag]: "Blob",
|
||||||
|
name: tempFilePath,
|
||||||
|
stream() {
|
||||||
|
return createReadStream(tempFilePath) as unknown as ReadableStream<Uint8Array>
|
||||||
|
},
|
||||||
|
arrayBuffer() {
|
||||||
|
throw Error("Unimplemented in mock Blob: arrayBuffer")
|
||||||
|
},
|
||||||
|
size: (await fs.stat(tempFilePath)).size,
|
||||||
|
text() {
|
||||||
|
throw Error("Unimplemented in mock Blob: text")
|
||||||
|
},
|
||||||
|
slice(start, end, contentType) {
|
||||||
|
throw Error("Unimplemented in mock Blob: slice")
|
||||||
|
},
|
||||||
|
type: "application/pdf",
|
||||||
|
} as Blob);
|
||||||
|
|
||||||
const upload = await robustFetch({
|
const upload = await robustFetch({
|
||||||
url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
|
url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
|
||||||
|
|
|
@ -2,7 +2,6 @@ import { Logger } from "winston";
|
||||||
import { z, ZodError } from "zod";
|
import { z, ZodError } from "zod";
|
||||||
import { v4 as uuid } from "uuid";
|
import { v4 as uuid } from "uuid";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import FormData from "form-data";
|
|
||||||
|
|
||||||
export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
||||||
url: string;
|
url: string;
|
||||||
|
@ -38,14 +37,14 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
|
||||||
method,
|
method,
|
||||||
headers: {
|
headers: {
|
||||||
...(body instanceof FormData
|
...(body instanceof FormData
|
||||||
? body.getHeaders()
|
? ({})
|
||||||
: body !== undefined ? ({
|
: body !== undefined ? ({
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
}) : {}),
|
}) : {}),
|
||||||
...(headers !== undefined ? headers : {}),
|
...(headers !== undefined ? headers : {}),
|
||||||
},
|
},
|
||||||
...(body instanceof FormData ? ({
|
...(body instanceof FormData ? ({
|
||||||
body: body.getBuffer(),
|
body,
|
||||||
}) : body !== undefined ? ({
|
}) : body !== undefined ? ({
|
||||||
body: JSON.stringify(body),
|
body: JSON.stringify(body),
|
||||||
}) : {}),
|
}) : {}),
|
||||||
|
|
Loading…
Reference in New Issue
Block a user