fix: another approach to suspected DoS abuse

This commit is contained in:
Yanlong Wang 2024-08-02 17:04:13 +08:00
parent e658e8102c
commit 0a33207f8f
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 23 additions and 10 deletions

View File

@@ -375,12 +375,22 @@ export class CrawlerHost extends RPCHost {
let contentText = '';
const imageSummary = {} as { [k: string]: string; };
const imageIdxTrack = new Map<string, number[]>();
const uid = this.threadLocal.get('uid');
do {
if (pdfMode) {
contentText = snapshot.parsed?.content || snapshot.text;
break;
}
if (
snapshot.maxElemDepth! > 256 ||
(!uid && snapshot.elemCount! > 10_000) ||
snapshot.text.length > 70_000
) {
contentText = snapshot.text;
break;
}
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
let toBeTurnedToMd = jsDomElementOfHTML;
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });

View File

@@ -334,6 +334,15 @@ export class SearcherHost extends RPCHost {
r.description = upstreamSearchResult.description;
return r;
}).catch((err)=> {
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
return {
url: upstreamSearchResult.url,
title: upstreamSearchResult.title,
description: upstreamSearchResult.description,
content: x.text,
};
});
});

View File

@@ -11,7 +11,6 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
import { TimeoutError } from 'puppeteer';
import { AsyncContext } from '../shared';
const tldExtract = require('tld-extract');
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@@ -129,7 +128,7 @@ function getMaxDepthAndCountUsingTreeWalker(root) {
NodeFilter.SHOW_ELEMENT,
(node) => {
const nodeName = node.nodeName.toLowerCase();
return (nodeName === 'svg' || nodeName === 'code') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
return (nodeName === 'svg') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
},
false
);
@@ -215,7 +214,6 @@ export class PuppeteerControl extends AsyncService {
constructor(
protected globalLogger: Logger,
protected threadLocal: AsyncContext,
) {
super(...arguments);
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
@@ -491,17 +489,13 @@ document.addEventListener('load', handlePageLoad);
if (snapshot === s) {
return;
}
snapshot = s;
if (s?.maxElemDepth && s.maxElemDepth > 256) {
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` });
return;
}
if (s?.elemCount && s.elemCount > 20_000) {
if (!this.threadLocal.get('uid')) {
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: too many DOM elements` });
return;
}
if (s?.elemCount && s.elemCount > 10_000) {
return;
}
snapshot = s;
nextSnapshotDeferred.resolve(s);
nextSnapshotDeferred = Defer();
this.once('crippled', crippleListener);