mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
feat: clean broken markdown
This commit is contained in:
parent
c7c039aeb1
commit
ef23d810f8
|
@ -7,6 +7,40 @@ import TurnDownService from 'turndown';
|
|||
import { Request, Response } from 'express';
|
||||
import normalizeUrl from "@esm2cjs/normalize-url";
|
||||
|
||||
/**
 * Normalizes the parenthesised part of a Markdown link or image.
 *
 * The destination has every whitespace character removed (URLs that were
 * broken across lines are glued back together). A trailing quoted title
 * (`"..."` or `'...'`), if present, is preserved and separated from the
 * destination by exactly one space, so `url "title"` stays valid Markdown
 * instead of being mangled into `url"title"`.
 *
 * @param rawTarget - Text captured between `(` and `)` of a link.
 * @returns The cleaned target, with an optional title kept intact.
 */
function normalizeLinkTarget(rawTarget: string): string {
    const target = rawTarget.trim();
    // A title is a quoted run at the very end, preceded by whitespace.
    const titleMatch = /\s+("[^"]*"|'[^']*')$/.exec(target);
    if (titleMatch) {
        const destination = target.slice(0, titleMatch.index).replace(/\s+/g, '');
        const title = (titleMatch[1] ?? '').replace(/\s+/g, ' ');
        return `${destination} ${title}`;
    }
    return target.replace(/\s+/g, '');
}

/**
 * Cleans up Markdown whose links were broken by stray whitespace/new lines.
 *
 * - Link text is collapsed to single spaces and trimmed.
 * - Link/image destinations have whitespace removed (titles are preserved).
 * - Runs of three or more new lines are reduced to exactly two.
 *
 * Link text is never allowed to span a `]`, so an unrelated bracket such as
 * a footnote marker (`[1]`) cannot be paired with a link that appears later
 * in the document; previously that swallowed every paragraph break between
 * the two.
 *
 * @param markdown - Markdown text to tidy. Non-string or empty input yields `''`.
 * @returns The tidied Markdown.
 */
function tidyMarkdown(markdown: string): string {
    // Callers may hand over a missing snapshot text; treat that as empty
    // instead of throwing on `.replace`.
    if (typeof markdown !== 'string' || markdown.length === 0) {
        return '';
    }

    const collapseSpaces = (value: string): string => value.replace(/\s+/g, ' ').trim();

    // Step 1: Handle broken links whose text/URL are spread across multiple lines.
    // `[^\]]` (instead of "any character") keeps the match inside one bracket pair.
    let normalizedMarkdown = markdown.replace(
        /\[\s*([^\]]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g,
        (_match: string, text: string, url: string) =>
            `[${collapseSpaces(text)}](${normalizeLinkTarget(url)})`,
    );

    // Links that wrap optional leading text plus an optional image:
    //   [ text ![alt](img) ](link)
    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^!\]]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g,
        (
            _match: string,
            text: string,
            alt: string | undefined,
            imgUrl: string | undefined,
            linkUrl: string,
        ) => {
            const cleanText = collapseSpaces(text);
            const cleanAlt = alt ? collapseSpaces(alt) : '';
            const cleanImgUrl = imgUrl ? normalizeLinkTarget(imgUrl) : '';
            const cleanLinkUrl = normalizeLinkTarget(linkUrl);

            if (cleanImgUrl) {
                return `[${cleanText} ![${cleanAlt}](${cleanImgUrl})](${cleanLinkUrl})`;
            }
            return `[${cleanText}](${cleanLinkUrl})`;
        },
    );

    // Step 2: Normalize regular links that may still be broken across lines.
    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g,
        (_match: string, text: string, url: string) =>
            `[${collapseSpaces(text)}](${normalizeLinkTarget(url)})`,
    );

    // Step 3: Replace more than two consecutive new lines with exactly two.
    return normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
}
|
||||
|
||||
@singleton()
|
||||
export class CrawlerHost extends RPCHost {
|
||||
|
@ -34,10 +68,12 @@ export class CrawlerHost extends RPCHost {
|
|||
|
||||
const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
|
||||
|
||||
const cleanText = tidyMarkdown(contentText).trim();
|
||||
|
||||
const formatted = {
|
||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||
url: snapshot.href?.trim(),
|
||||
content: contentText.trim(),
|
||||
content: cleanText,
|
||||
|
||||
toString() {
|
||||
return `Title: ${this.title}
|
||||
|
|
Loading…
Reference in New Issue
Block a user