fix(scrapeURL, logger): remove buggy ArrayTransport that causes memory leak

This commit is contained in:
Móricz Gergő 2024-11-11 10:27:55 +01:00
parent 84ad45c01f
commit 49df553768
3 changed files with 19 additions and 53 deletions

View File

@ -1,7 +1,6 @@
import * as winston from "winston"; import * as winston from "winston";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import Transport from "winston-transport";
configDotenv(); configDotenv();
const logFormat = winston.format.printf(info => const logFormat = winston.format.printf(info =>
@ -50,33 +49,3 @@ export const logger = winston.createLogger({
}), }),
], ],
}); });
/**
 * Constructor options for ArrayTransport: the base transport stream options
 * extended with the collection target and an optional scrape filter.
 */
export type ArrayTransportOptions = Transport.TransportStreamOptions & {
/** Array that ArrayTransport appends each accepted log entry to. */
array: any[];
/** When set, ArrayTransport skips entries whose `scrapeId` differs from this value. */
scrapeId?: string;
};
/**
 * Transport that appends log entries to a caller-supplied array.
 *
 * If a `scrapeId` was given at construction, only entries whose `scrapeId`
 * field equals it are stored; all other entries are acknowledged and dropped.
 *
 * NOTE(review): nothing in this class caps or clears `array`, so it grows for
 * as long as the transport stays attached to a logger. The owner is
 * responsible for detaching the transport when collection is finished —
 * presumably the source of the leak mentioned in the commit title; confirm
 * against the call site.
 */
export class ArrayTransport extends Transport {
  /** Destination for accepted entries; shared with (and owned by) the caller. */
  private readonly array: any[];
  /** Optional filter: when defined, only entries with this scrapeId are kept. */
  private readonly scrapeId?: string;

  /**
   * @param opts Base transport options plus the target `array` and an
   *             optional `scrapeId` filter.
   */
  constructor(opts: ArrayTransportOptions) {
    super(opts);
    this.array = opts.array;
    this.scrapeId = opts.scrapeId;
  }

  /**
   * Handles one log entry.
   *
   * @param info Log entry object; its `scrapeId` field is compared against
   *             the configured filter.
   * @param next Callback invoked exactly once when this transport is done
   *             with the entry.
   */
  log(info: { scrapeId?: string }, next: () => void): void {
    // "logged" is emitted asynchronously for every entry, including ones the
    // scrapeId filter below rejects.
    setImmediate(() => {
      this.emit("logged", info);
    });

    // Entry belongs to a different scrape: acknowledge it without storing.
    if (this.scrapeId !== undefined && info.scrapeId !== this.scrapeId) {
      return next();
    }

    this.array.push(info);
    next();
  }
}

View File

@ -2,7 +2,7 @@ import { Logger } from "winston";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { Document, ScrapeOptions } from "../../controllers/v1/types"; import { Document, ScrapeOptions } from "../../controllers/v1/types";
import { logger, ArrayTransport } from "../../lib/logger"; import { logger } from "../../lib/logger";
import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines"; import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseMarkdown } from "../../lib/html-to-markdown";
import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "./error"; import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "./error";
@ -97,9 +97,6 @@ function buildMetaObject(id: string, url: string, options: ScrapeOptions, intern
const _logger = logger.child({ module: "ScrapeURL", scrapeId: id }); const _logger = logger.child({ module: "ScrapeURL", scrapeId: id });
const logs: any[] = []; const logs: any[] = [];
if (process.env.ENV !== "test") {
_logger.add(new ArrayTransport({ array: logs, scrapeId: id }));
}
return { return {
id, url, options, internalOptions, id, url, options, internalOptions,

View File

@ -29,7 +29,7 @@ describe("Standalone scrapeURL tests", () => {
it("Basic scrape", async () => { it("Basic scrape", async () => {
const out = await scrapeURL("test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), { forceEngine }); const out = await scrapeURL("test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -72,7 +72,7 @@ describe("Standalone scrapeURL tests", () => {
formats: ["markdown", "html"], formats: ["markdown", "html"],
}), { forceEngine }); }), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -92,7 +92,7 @@ describe("Standalone scrapeURL tests", () => {
onlyMainContent: false, onlyMainContent: false,
}), { forceEngine }); }), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -110,7 +110,7 @@ describe("Standalone scrapeURL tests", () => {
excludeTags: ['.nav', '#footer', 'strong'], excludeTags: ['.nav', '#footer', 'strong'],
}), { forceEngine }); }), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 400 status code", async () => { it("Scrape of a page with 400 status code", async () => {
const out = await scrapeURL("test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), { forceEngine }); const out = await scrapeURL("test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -138,7 +138,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 401 status code", async () => { it("Scrape of a page with 401 status code", async () => {
const out = await scrapeURL("test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), { forceEngine }); const out = await scrapeURL("test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -151,7 +151,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 403 status code", async () => { it("Scrape of a page with 403 status code", async () => {
const out = await scrapeURL("test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), { forceEngine }); const out = await scrapeURL("test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -164,7 +164,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 404 status code", async () => { it("Scrape of a page with 404 status code", async () => {
const out = await scrapeURL("test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), { forceEngine }); const out = await scrapeURL("test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -177,7 +177,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 405 status code", async () => { it("Scrape of a page with 405 status code", async () => {
const out = await scrapeURL("test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), { forceEngine }); const out = await scrapeURL("test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -190,7 +190,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 500 status code", async () => { it("Scrape of a page with 500 status code", async () => {
const out = await scrapeURL("test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), { forceEngine }); const out = await scrapeURL("test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -203,7 +203,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape a redirected page", async () => { it("Scrape a redirected page", async () => {
const out = await scrapeURL("test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), { forceEngine }); const out = await scrapeURL("test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -224,7 +224,7 @@ describe("Standalone scrapeURL tests", () => {
formats: ["screenshot"], formats: ["screenshot"],
}), { forceEngine }); }), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -243,7 +243,7 @@ describe("Standalone scrapeURL tests", () => {
formats: ["screenshot@fullPage"], formats: ["screenshot@fullPage"],
}), { forceEngine }); }), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -261,7 +261,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a PDF file", async () => { it("Scrape of a PDF file", async () => {
const out = await scrapeURL("test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({})); const out = await scrapeURL("test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({}));
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -275,7 +275,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape a DOCX file", async () => { it("Scrape a DOCX file", async () => {
const out = await scrapeURL("test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({})); const out = await scrapeURL("test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({}));
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -304,7 +304,7 @@ describe("Standalone scrapeURL tests", () => {
}, },
})); }));
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -335,7 +335,7 @@ describe("Standalone scrapeURL tests", () => {
}, },
})); }));
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true); expect(out.success).toBe(true);
if (out.success) { if (out.success) {
expect(out.document.warning).toBeUndefined(); expect(out.document.warning).toBeUndefined();
@ -369,7 +369,7 @@ describe("Standalone scrapeURL tests", () => {
} }
// verify that log collection works properly while concurrency is happening // verify that log collection works properly while concurrency is happening
expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
const weirdLogs = out.logs.filter(x => x.scrapeId !== id); const weirdLogs = out.logs.filter(x => x.scrapeId !== id);
if (weirdLogs.length > 0) { if (weirdLogs.length > 0) {
console.warn(JSON.stringify(weirdLogs, replacer)); console.warn(JSON.stringify(weirdLogs, replacer));