mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
feat: clean broken markdown
This commit is contained in:
parent
c7c039aeb1
commit
ef23d810f8
|
@@ -7,6 +7,40 @@ import TurnDownService from 'turndown';
|
||||||
import { Request, Response } from 'express';
|
import { Request, Response } from 'express';
|
||||||
import normalizeUrl from "@esm2cjs/normalize-url";
|
import normalizeUrl from "@esm2cjs/normalize-url";
|
||||||
|
|
||||||
|
/**
 * Matches an inline link `[text](destination)`.
 *
 * The text may span several lines and may contain ONE level of balanced
 * nested brackets (which is what an embedded image `![alt](src)` needs), but
 * it can never run across a stray `]` or an unbalanced `[`. Both alternatives
 * of the text group are mutually exclusive on their first character, so the
 * pattern has no ambiguous backtracking paths.
 */
const MARKDOWN_LINK = /\[((?:\[[^\[\]]*\]|[^\[\]])*?)\]\s*\(([^)]+)\)/g;

/** Matches an inline image or link that sits inside the text of another link. */
const NESTED_INLINE = /(!?)\[([^\[\]]*)\]\(([^)]*)\)/g;

/** Matches a quoted link title (preceded by whitespace) at the very end of a destination. */
const TRAILING_TITLE = /\s("[^"]*"|'[^']*')$/;

/**
 * Normalizes the part between the parentheses of an inline link.
 *
 * All whitespace is removed from the URL, because a URL that was wrapped
 * across lines must be glued back together. A trailing quoted title is kept
 * separated from the URL by a single space so `(url "Title")` stays valid.
 */
function tidyLinkDestination(destination: string): string {
    const trimmed = destination.trim();
    const titleMatch = TRAILING_TITLE.exec(trimmed);
    if (!titleMatch) {
        return trimmed.replace(/\s+/g, '');
    }

    const url = trimmed.slice(0, titleMatch.index).replace(/\s+/g, '');
    const title = (titleMatch[1] ?? '').replace(/\s+/g, ' ');

    return `${url} ${title}`;
}

/**
 * Normalizes the part between the square brackets of an inline link.
 *
 * Nested images/links are tidied first: their URLs must lose whitespace
 * entirely, whereas the surrounding text only has whitespace runs collapsed
 * to a single space.
 */
function tidyLinkText(text: string): string {
    return text
        .replace(NESTED_INLINE, (_match, bang: string, label: string, target: string) => {
            const tidyLabel = label.replace(/\s+/g, ' ').trim();

            return `${bang}[${tidyLabel}](${tidyLinkDestination(target)})`;
        })
        .replace(/\s+/g, ' ')
        .trim();
}

/**
 * Repairs markdown whose inline links/images were broken across lines and
 * squeezes excessive vertical whitespace.
 *
 * - `[text](url)` constructs (optionally wrapping an image) are rewritten onto
 *   one line: whitespace runs in the text become a single space, whitespace
 *   inside URLs is removed, quoted link titles are preserved.
 * - Brackets that are not part of a link (footnote markers, task-list
 *   checkboxes, ...) are left alone and never get paired with a later link.
 * - Runs of three or more newlines are reduced to two newlines, i.e. at most
 *   one blank line between blocks.
 *
 * @param markdown - Markdown text to clean up.
 * @returns The cleaned markdown; an empty string for empty input.
 */
function tidyMarkdown(markdown: string): string {
    // Nothing to tidy; this also shields against untyped callers passing no content.
    if (!markdown) {
        return '';
    }

    // Single pass over every inline link. The match is replaced by its
    // one-line form, so well-formed links come out unchanged.
    const relinked = markdown.replace(
        MARKDOWN_LINK,
        (_match, text: string, destination: string) =>
            `[${tidyLinkText(text)}](${tidyLinkDestination(destination)})`,
    );

    // Collapse 3+ consecutive newlines into exactly two (one blank line).
    return relinked.replace(/\n{3,}/g, '\n\n');
}
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
export class CrawlerHost extends RPCHost {
|
export class CrawlerHost extends RPCHost {
|
||||||
|
@@ -34,10 +68,12 @@ export class CrawlerHost extends RPCHost {
|
||||||
|
|
||||||
const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
|
const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
|
||||||
|
|
||||||
|
const cleanText = tidyMarkdown(contentText).trim();
|
||||||
|
|
||||||
const formatted = {
|
const formatted = {
|
||||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||||
url: snapshot.href?.trim(),
|
url: snapshot.href?.trim(),
|
||||||
content: contentText.trim(),
|
content: cleanText,
|
||||||
|
|
||||||
toString() {
|
toString() {
|
||||||
return `Title: ${this.title}
|
return `Title: ${this.title}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user