feat: clean broken markdown

This commit is contained in:
Han Xiao 2024-04-13 19:21:35 -07:00
parent c7c039aeb1
commit ef23d810f8

View File

@ -7,6 +7,40 @@ import TurnDownService from 'turndown';
import { Request, Response } from 'express'; import { Request, Response } from 'express';
import normalizeUrl from "@esm2cjs/normalize-url"; import normalizeUrl from "@esm2cjs/normalize-url";
/**
 * Repairs Markdown links whose pieces were spread over several lines or padded
 * with stray whitespace, and squeezes runs of blank lines.
 *
 * Three passes are applied in order:
 *  1. Plain links/images `[text](destination)`: whitespace runs inside the text
 *     are collapsed to a single space and whitespace inside the URL is removed.
 *  2. Links that wrap an image, `[text ![alt](img)](destination)`: the same
 *     normalisation is applied to the text, the alt text and both destinations.
 *  3. Three or more consecutive newlines are reduced to two (one blank line).
 *
 * Link text is deliberately not allowed to contain `[` or `]`. This keeps a
 * match from starting at an unrelated or unclosed bracket and swallowing
 * (and flattening) whole paragraphs on its way to some later `](...)`. The
 * price is that link text which itself contains brackets is left untouched.
 *
 * An optional quoted title at the end of a destination (`url "title"`) is kept
 * and stays separated from the URL by one space. A title containing `)` is not
 * recognised, because the destination is taken to end at the first `)`.
 *
 * @param markdown Markdown source to clean up.
 * @returns The cleaned Markdown. Text outside matched links is changed only by
 *          the blank-line squeeze.
 */
function tidyMarkdown(markdown: string): string {
    // Collapses every whitespace run (line breaks included) to one space.
    const collapseWhitespace = (value: string): string => value.replace(/\s+/g, ' ').trim();

    // Removes whitespace from the URL part of a destination; a trailing quoted
    // title, if present, is preserved instead of being glued onto the URL.
    const tidyDestination = (destination: string): string => {
        const titleMatch = /\s+("[^"]*"|'[^']*')\s*$/.exec(destination);
        if (titleMatch) {
            const url = destination.slice(0, titleMatch.index).replace(/\s+/g, '');
            if (url) {
                const title = collapseWhitespace(destination.slice(titleMatch.index));
                return `${url} ${title}`;
            }
        }
        return destination.replace(/\s+/g, '');
    };

    // Pass 1: plain links and images. This also tidies an image nested in a
    // link, since `![alt](img)` contains the `[alt](img)` shape.
    let tidied = markdown.replace(
        /\[\s*([^\[\]]+?)\s*\]\s*\(\s*([^)]+)\)/g,
        (_match: string, text: string, destination: string) =>
            `[${collapseWhitespace(text)}](${tidyDestination(destination)})`,
    );

    // Pass 2: a link whose label is optional text followed by one image.
    tidied = tidied.replace(
        /\[\s*([^!\[\]]*?)\s*!\[([^\]]*)\]\(([^)]*)\)\s*\]\s*\(\s*([^)]+)\)/g,
        (_match: string, text: string, alt: string, imgDestination: string, linkDestination: string) => {
            const imgUrl = tidyDestination(imgDestination);
            // An image without a URL is dropped from the label.
            const image = imgUrl ? `![${collapseWhitespace(alt)}](${imgUrl})` : '';
            const label = [collapseWhitespace(text), image].filter(Boolean).join(' ');
            return `[${label}](${tidyDestination(linkDestination)})`;
        },
    );

    // Pass 3: three or more newlines in a row become exactly two, i.e. at most
    // one blank line between blocks.
    return tidied.replace(/\n{3,}/g, '\n\n');
}
@singleton() @singleton()
export class CrawlerHost extends RPCHost { export class CrawlerHost extends RPCHost {
@ -34,10 +68,12 @@ export class CrawlerHost extends RPCHost {
const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim(); const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
const cleanText = tidyMarkdown(contentText).trim();
const formatted = { const formatted = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(), title: (snapshot.parsed?.title || snapshot.title || '').trim(),
url: snapshot.href?.trim(), url: snapshot.href?.trim(),
content: contentText.trim(), content: cleanText,
toString() { toString() {
return `Title: ${this.title} return `Title: ${this.title}