diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 9f8bd93..7f17b47 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -375,12 +375,22 @@ export class CrawlerHost extends RPCHost { let contentText = ''; const imageSummary = {} as { [k: string]: string; }; const imageIdxTrack = new Map(); + const uid = this.threadLocal.get('uid'); do { if (pdfMode) { contentText = snapshot.parsed?.content || snapshot.text; break; } + if ( + snapshot.maxElemDepth! > 256 || + (!uid && snapshot.elemCount! > 10_000) || + snapshot.text.length > 70_000 + ) { + contentText = snapshot.text; + break; + } + const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href); let toBeTurnedToMd = jsDomElementOfHTML; let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl }); diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts index d7a56a3..0a5a736 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/backend/functions/src/cloud-functions/searcher.ts @@ -334,6 +334,15 @@ export class SearcherHost extends RPCHost { r.description = upstreamSearchResult.description; return r; + }).catch((err)=> { + this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) }); + + return { + url: upstreamSearchResult.url, + title: upstreamSearchResult.title, + description: upstreamSearchResult.description, + content: x.text, + }; }); }); diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index a249df2..cbffb60 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -11,7 +11,6 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; import puppeteerPageProxy from 
'puppeteer-extra-plugin-page-proxy'; import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors'; import { TimeoutError } from 'puppeteer'; -import { AsyncContext } from '../shared'; const tldExtract = require('tld-extract'); const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); @@ -129,7 +128,7 @@ function getMaxDepthAndCountUsingTreeWalker(root) { NodeFilter.SHOW_ELEMENT, (node) => { const nodeName = node.nodeName.toLowerCase(); - return (nodeName === 'svg' || nodeName === 'code') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT; + return (nodeName === 'svg') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT; }, false ); @@ -215,7 +214,6 @@ export class PuppeteerControl extends AsyncService { constructor( protected globalLogger: Logger, - protected threadLocal: AsyncContext, ) { super(...arguments); this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); @@ -491,17 +489,13 @@ document.addEventListener('load', handlePageLoad); if (snapshot === s) { return; } + snapshot = s; if (s?.maxElemDepth && s.maxElemDepth > 256) { - page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` }); return; } - if (s?.elemCount && s.elemCount > 20_000) { - if (!this.threadLocal.get('uid')) { - page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: too many DOM elements` }); - return; - } + if (s?.elemCount && s.elemCount > 10_000) { + return; } - snapshot = s; nextSnapshotDeferred.resolve(s); nextSnapshotDeferred = Defer(); this.once('crippled', crippleListener);