feat: full markdown mode

This commit is contained in:
yanlong.wang 2024-04-25 18:21:04 +08:00
parent 0f70723d19
commit 69231ad59e
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37

View File

@ -58,7 +58,23 @@ export class CrawlerHost extends RPCHost {
this.emit('ready'); this.emit('ready');
} }
/**
 * Creates a new TurndownService instance, optionally pre-loaded with this
 * host's two default conversion rules.
 *
 * NOTE(review): the first line below is a fused side-by-side diff row. Its
 * left half is the removed `formatSnapshot(...)` signature; only the trailing
 * `getTurndown(noRules?: boolean | string) {` belongs to this method.
 *
 * @param noRules - When truthy (`true` or any non-empty string), the default
 *   rules are skipped and a bare TurndownService is returned. When omitted or
 *   falsy, both default rules are registered.
 * @returns The newly constructed TurndownService.
 */
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & { getTurndown(noRules?: boolean | string) {
const turnDownService = new TurndownService();
// Default rules are registered only when the caller did not opt out.
if (!noRules) {
// Replace the listed non-content elements with an empty string.
turnDownService.addRule('remove-irrelevant', {
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
replacement: () => ''
});
// Emit the <title> text followed by a line of '=' characters
// (the rule name indicates this is meant to render as an H1 heading).
turnDownService.addRule('title-as-h1', {
filter: ['title'],
replacement: (innerText) => `${innerText}\n===============\n`
});
}
return turnDownService;
}
async formatSnapshot(mode: string | 'markdown' | 'full-markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
screenshotUrl?: string; screenshotUrl?: string;
}, nominalUrl?: URL) { }, nominalUrl?: URL) {
if (mode === 'screenshot') { if (mode === 'screenshot') {
@ -96,8 +112,8 @@ export class CrawlerHost extends RPCHost {
}; };
} }
const toBeTurnedToMd = snapshot.parsed?.content; const toBeTurnedToMd = mode === 'full-markdown' ? snapshot.html : snapshot.parsed?.content;
let turnDownService = new TurndownService(); let turnDownService = mode === 'markdown' ? this.getTurndown('without any rule') : this.getTurndown();
for (const plugin of this.turnDownPlugins) { for (const plugin of this.turnDownPlugins) {
turnDownService = turnDownService.use(plugin); turnDownService = turnDownService.use(plugin);
} }
@ -129,7 +145,7 @@ export class CrawlerHost extends RPCHost {
if (mapped) { if (mapped) {
return `![Image ${imgIdx}: ${mapped || alt}](${src})`; return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
} }
return `![Image ${imgIdx}: ${alt}](${src})`; return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
} }
}); });
@ -139,7 +155,7 @@ export class CrawlerHost extends RPCHost {
contentText = turnDownService.turndown(toBeTurnedToMd).trim(); contentText = turnDownService.turndown(toBeTurnedToMd).trim();
} catch (err) { } catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = new TurndownService(); const vanillaTurnDownService = this.getTurndown();
try { try {
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim(); contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
} catch (err2) { } catch (err2) {
@ -148,12 +164,15 @@ export class CrawlerHost extends RPCHost {
} }
} }
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) { if (
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
&& toBeTurnedToMd !== snapshot.html
) {
try { try {
contentText = turnDownService.turndown(snapshot.html); contentText = turnDownService.turndown(snapshot.html);
} catch (err) { } catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = new TurndownService(); const vanillaTurnDownService = this.getTurndown();
try { try {
contentText = vanillaTurnDownService.turndown(snapshot.html); contentText = vanillaTurnDownService.turndown(snapshot.html);
} catch (err2) { } catch (err2) {
@ -179,6 +198,10 @@ export class CrawlerHost extends RPCHost {
mixins.push(`Published Time: ${this.publishedTime}`); mixins.push(`Published Time: ${this.publishedTime}`);
} }
if (mode === 'full-markdown') {
return this.content;
}
return `Title: ${this.title} return `Title: ${this.title}
URL Source: ${this.url} URL Source: ${this.url}
@ -233,6 +256,7 @@ ${this.content}
description: `Specifies the form factor of the crawled data you prefer. \n\n` + description: `Specifies the form factor of the crawled data you prefer. \n\n` +
`Supported formats:\n` + `Supported formats:\n` +
`- markdown\n` + `- markdown\n` +
`- full-markdown\n` +
`- html\n` + `- html\n` +
`- text\n` + `- text\n` +
`- screenshot\n\n` + `- screenshot\n\n` +