From 19dc9df9cba5a62db1e5c3086eef66f1f10d9794 Mon Sep 17 00:00:00 2001 From: Harsh Gupta Date: Thu, 15 Aug 2024 15:01:48 +0530 Subject: [PATCH] more console logs --- backend/functions/src/cloud-functions/crawler.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 0c15ae1..2e20113 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -401,17 +401,27 @@ export class CrawlerHost extends RPCHost { let toBeTurnedToMd = jsDomElementOfHTML; let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); if (mode !== 'markdown' && snapshot.parsed?.content) { + console.log('Processing parsed content for non-markdown mode'); const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); + console.log('Created jsDomElementOfParsed'); const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); + console.log('Generated par1 from jsDomElementOfHTML'); const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : ''; + console.log('Generated par2 from jsDomElementOfParsed'); // If Readability did its job if (par2.length >= 0.3 * par1.length) { + console.log('Readability seems to have done its job, adjusting turnDownService'); turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); if (snapshot.parsed.content) { + console.log('Using parsed content for toBeTurnedToMd'); toBeTurnedToMd = jsDomElementOfParsed; } + } else { + console.log('Readability output not sufficient, using original HTML'); } + } else { + console.log('Skipping parsed content processing'); } for (const plugin of this.turnDownPlugins) { @@ -588,6 +598,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; console.log('Crawl method called with request:', req.url); // const rpcReflect: RPCReflection = {}; const ctx = { req, res }; + console.log(`req.headers: ${JSON.stringify(req.headers)}`); const crawlerOptionsHeaderOnly = CrawlerOptionsHeaderOnly.from(req.headers); const crawlerOptionsParamsAllowed = CrawlerOptions.from(req.method === 'POST' ? req.body : req.query); const noSlashURL = ctx.req.url.slice(1);