fix: detect when readability does not work

This commit is contained in:
yanlong.wang 2024-06-20 18:20:13 +08:00
parent eaa06781e3
commit 579f259cb9
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37

View File

@ -327,8 +327,19 @@ export class CrawlerHost extends RPCHost {
break;
}
const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
let turnDownService = mode === 'markdown' ? this.getTurndown({ url: snapshot.href }) : this.getTurndown({ noRules: true, url: snapshot.href });
let toBeTurnedToMd = snapshot.html;
let turnDownService = this.getTurndown({ url: nominalUrl });
if (mode !== 'markdown' && snapshot.parsed?.content) {
const par1 = turnDownService.turndown(toBeTurnedToMd);
const par2 = turnDownService.turndown(snapshot.parsed.content)
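// Heuristic: treat Readability's extraction as successful only if its markdown is at least 30% as long as the full page's markdown; otherwise keep using the full HTML.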
if (par2.length >= 0.3 * par1.length) {
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href });
toBeTurnedToMd = snapshot.parsed.content;
}
}
for (const plugin of this.turnDownPlugins) {
turnDownService = turnDownService.use(plugin);
}