mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
feat: full markdown mode
This commit is contained in:
parent
0f70723d19
commit
69231ad59e
|
@ -58,7 +58,23 @@ export class CrawlerHost extends RPCHost {
|
|||
this.emit('ready');
|
||||
}
|
||||
|
||||
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
||||
getTurndown(noRules?: boolean | string) {
|
||||
const turnDownService = new TurndownService();
|
||||
if (!noRules) {
|
||||
turnDownService.addRule('remove-irrelevant', {
|
||||
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
|
||||
replacement: () => ''
|
||||
});
|
||||
turnDownService.addRule('title-as-h1', {
|
||||
filter: ['title'],
|
||||
replacement: (innerText) => `${innerText}\n===============\n`
|
||||
});
|
||||
}
|
||||
|
||||
return turnDownService;
|
||||
}
|
||||
|
||||
async formatSnapshot(mode: string | 'markdown' | 'full-markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
||||
screenshotUrl?: string;
|
||||
}, nominalUrl?: URL) {
|
||||
if (mode === 'screenshot') {
|
||||
|
@ -96,8 +112,8 @@ export class CrawlerHost extends RPCHost {
|
|||
};
|
||||
}
|
||||
|
||||
const toBeTurnedToMd = snapshot.parsed?.content;
|
||||
let turnDownService = new TurndownService();
|
||||
const toBeTurnedToMd = mode === 'full-markdown' ? snapshot.html : snapshot.parsed?.content;
|
||||
let turnDownService = mode === 'markdown' ? this.getTurndown('without any rule') : this.getTurndown();
|
||||
for (const plugin of this.turnDownPlugins) {
|
||||
turnDownService = turnDownService.use(plugin);
|
||||
}
|
||||
|
@ -129,7 +145,7 @@ export class CrawlerHost extends RPCHost {
|
|||
if (mapped) {
|
||||
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
|
||||
}
|
||||
return `![Image ${imgIdx}: ${alt}](${src})`;
|
||||
return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -139,7 +155,7 @@ export class CrawlerHost extends RPCHost {
|
|||
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = new TurndownService();
|
||||
const vanillaTurnDownService = this.getTurndown();
|
||||
try {
|
||||
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
||||
} catch (err2) {
|
||||
|
@ -148,12 +164,15 @@ export class CrawlerHost extends RPCHost {
|
|||
}
|
||||
}
|
||||
|
||||
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
||||
if (
|
||||
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
||||
&& toBeTurnedToMd !== snapshot.html
|
||||
) {
|
||||
try {
|
||||
contentText = turnDownService.turndown(snapshot.html);
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = new TurndownService();
|
||||
const vanillaTurnDownService = this.getTurndown();
|
||||
try {
|
||||
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
||||
} catch (err2) {
|
||||
|
@ -179,6 +198,10 @@ export class CrawlerHost extends RPCHost {
|
|||
mixins.push(`Published Time: ${this.publishedTime}`);
|
||||
}
|
||||
|
||||
if (mode === 'full-markdown') {
|
||||
return this.content;
|
||||
}
|
||||
|
||||
return `Title: ${this.title}
|
||||
|
||||
URL Source: ${this.url}
|
||||
|
@ -233,6 +256,7 @@ ${this.content}
|
|||
description: `Specifies the form factor of the crawled data you prefer. \n\n` +
|
||||
`Supported formats:\n` +
|
||||
`- markdown\n` +
|
||||
`- full-markdown\n` +
|
||||
`- html\n` +
|
||||
`- text\n` +
|
||||
`- screenshot\n\n` +
|
||||
|
|
Loading…
Reference in New Issue
Block a user