mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
fix(scrapeURL, logger): remove buggy ArrayTransport that causes memory leak
This commit is contained in:
parent
84ad45c01f
commit
49df553768
|
@ -1,7 +1,6 @@
|
||||||
import * as winston from "winston";
|
import * as winston from "winston";
|
||||||
|
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import Transport from "winston-transport";
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
const logFormat = winston.format.printf(info =>
|
const logFormat = winston.format.printf(info =>
|
||||||
|
@ -50,33 +49,3 @@ export const logger = winston.createLogger({
|
||||||
}),
|
}),
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
|
|
||||||
export type ArrayTransportOptions = Transport.TransportStreamOptions & {
|
|
||||||
array: any[];
|
|
||||||
scrapeId?: string;
|
|
||||||
};
|
|
||||||
|
|
||||||
export class ArrayTransport extends Transport {
|
|
||||||
private array: any[];
|
|
||||||
private scrapeId?: string;
|
|
||||||
|
|
||||||
constructor(opts: ArrayTransportOptions) {
|
|
||||||
super(opts);
|
|
||||||
this.array = opts.array;
|
|
||||||
this.scrapeId = opts.scrapeId;
|
|
||||||
}
|
|
||||||
|
|
||||||
log(info, next) {
|
|
||||||
setImmediate(() => {
|
|
||||||
this.emit("logged", info);
|
|
||||||
});
|
|
||||||
|
|
||||||
if (this.scrapeId !== undefined && info.scrapeId !== this.scrapeId) {
|
|
||||||
return next();
|
|
||||||
}
|
|
||||||
|
|
||||||
this.array.push(info);
|
|
||||||
|
|
||||||
next();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -2,7 +2,7 @@ import { Logger } from "winston";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
|
|
||||||
import { Document, ScrapeOptions } from "../../controllers/v1/types";
|
import { Document, ScrapeOptions } from "../../controllers/v1/types";
|
||||||
import { logger, ArrayTransport } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
|
import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||||
import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "./error";
|
import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "./error";
|
||||||
|
@ -97,9 +97,6 @@ function buildMetaObject(id: string, url: string, options: ScrapeOptions, intern
|
||||||
|
|
||||||
const _logger = logger.child({ module: "ScrapeURL", scrapeId: id });
|
const _logger = logger.child({ module: "ScrapeURL", scrapeId: id });
|
||||||
const logs: any[] = [];
|
const logs: any[] = [];
|
||||||
if (process.env.ENV !== "test") {
|
|
||||||
_logger.add(new ArrayTransport({ array: logs, scrapeId: id }));
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
id, url, options, internalOptions,
|
id, url, options, internalOptions,
|
||||||
|
|
|
@ -29,7 +29,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Basic scrape", async () => {
|
it("Basic scrape", async () => {
|
||||||
const out = await scrapeURL("test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL("test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -72,7 +72,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
formats: ["markdown", "html"],
|
formats: ["markdown", "html"],
|
||||||
}), { forceEngine });
|
}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -92,7 +92,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
}), { forceEngine });
|
}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -110,7 +110,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
excludeTags: ['.nav', '#footer', 'strong'],
|
excludeTags: ['.nav', '#footer', 'strong'],
|
||||||
}), { forceEngine });
|
}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Scrape of a page with 400 status code", async () => {
|
it("Scrape of a page with 400 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL("test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -138,7 +138,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Scrape of a page with 401 status code", async () => {
|
it("Scrape of a page with 401 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL("test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -151,7 +151,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Scrape of a page with 403 status code", async () => {
|
it("Scrape of a page with 403 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL("test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -164,7 +164,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Scrape of a page with 404 status code", async () => {
|
it("Scrape of a page with 404 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL("test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -177,7 +177,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Scrape of a page with 405 status code", async () => {
|
it("Scrape of a page with 405 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL("test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -190,7 +190,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Scrape of a page with 500 status code", async () => {
|
it("Scrape of a page with 500 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL("test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -203,7 +203,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Scrape a redirected page", async () => {
|
it("Scrape a redirected page", async () => {
|
||||||
const out = await scrapeURL("test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL("test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -224,7 +224,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
formats: ["screenshot"],
|
formats: ["screenshot"],
|
||||||
}), { forceEngine });
|
}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -243,7 +243,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
formats: ["screenshot@fullPage"],
|
formats: ["screenshot@fullPage"],
|
||||||
}), { forceEngine });
|
}), { forceEngine });
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -261,7 +261,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Scrape of a PDF file", async () => {
|
it("Scrape of a PDF file", async () => {
|
||||||
const out = await scrapeURL("test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({}));
|
const out = await scrapeURL("test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({}));
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -275,7 +275,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
it("Scrape a DOCX file", async () => {
|
it("Scrape a DOCX file", async () => {
|
||||||
const out = await scrapeURL("test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({}));
|
const out = await scrapeURL("test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({}));
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -304,7 +304,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -335,7 +335,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
|
@ -369,7 +369,7 @@ describe("Standalone scrapeURL tests", () => {
|
||||||
}
|
}
|
||||||
|
|
||||||
// verify that log collection works properly while concurrency is happening
|
// verify that log collection works properly while concurrency is happening
|
||||||
expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
const weirdLogs = out.logs.filter(x => x.scrapeId !== id);
|
const weirdLogs = out.logs.filter(x => x.scrapeId !== id);
|
||||||
if (weirdLogs.length > 0) {
|
if (weirdLogs.length > 0) {
|
||||||
console.warn(JSON.stringify(weirdLogs, replacer));
|
console.warn(JSON.stringify(weirdLogs, replacer));
|
||||||
|
|
Loading…
Reference in New Issue
Block a user