/**
 * Matches a Markdown image `![alt](destination)`.
 * The alt text may contain backslash-escaped characters (e.g. `\[`, `\]`) but
 * no bare brackets, so a match can never run past the image's own `]`.
 */
const MARKDOWN_IMAGE = /!\[((?:\\[\s\S]|[^\[\]\\])*)\]\s*\(([^)]+)\)/g;

/**
 * Matches a Markdown link `[text](destination)`.
 * The text is limited to backslash-escaped characters, non-bracket characters
 * and complete inline images. Keeping bare brackets out of the text is what
 * stops an unrelated `[1]` from being merged with a link that appears later.
 */
const MARKDOWN_LINK =
    /\[((?:\\[\s\S]|[^\[\]\\]|!\[(?:\\[\s\S]|[^\[\]\\])*\]\s*\([^)]+\))*)\]\s*\(([^)]+)\)/g;

/** Matches a whitespace-collapsed destination that ends in a quoted title. */
const TITLED_DESTINATION = /^(.*?) ("[^"]*"|'[^']*')$/;

/** Collapses every whitespace run (including line breaks) to one space and trims the ends. */
function collapseWhitespace(value: string): string {
    return value.replace(/\s+/g, ' ').trim();
}

/**
 * Normalizes the part between the parentheses of a link or image.
 * All whitespace is removed from the URL; a trailing quoted title, if present,
 * is kept and separated from the URL by a single space.
 */
function tidyLinkDestination(rawDestination: string): string {
    const collapsed = collapseWhitespace(rawDestination);
    const titled = TITLED_DESTINATION.exec(collapsed);
    const url = titled?.[1] ?? collapsed;
    const title = titled?.[2];
    // After collapsing, the only whitespace left inside the URL is plain spaces.
    const compactUrl = url.replace(/ /g, '');
    return title === undefined ? compactUrl : `${compactUrl} ${title}`;
}

/** Normalizes link text: tidies any inline images, then collapses whitespace. */
function tidyLinkText(rawText: string): string {
    const withTidyImages = rawText.replace(
        MARKDOWN_IMAGE,
        (_match, alt: string, destination: string) =>
            `![${collapseWhitespace(alt)}](${tidyLinkDestination(destination)})`,
    );
    return collapseWhitespace(withTidyImages);
}

/**
 * Repairs links and images whose text or URL was broken across several lines
 * and squeezes excessive vertical whitespace.
 *
 * - `[text](url)` and `![alt](url)`: whitespace runs in the text are collapsed
 *   to single spaces and whitespace inside the URL is removed. Links that wrap
 *   an image (`[text ![alt](img)](url)`) are handled as a whole.
 * - A quoted title after the URL (`[text](url "title")`) is preserved.
 * - Text between an unrelated bracket pair and a later link is left untouched.
 * - Runs of three or more newlines are reduced to two newlines, i.e. at most
 *   one blank line between blocks.
 *
 * @param markdown Markdown text to clean up.
 * @returns The normalized Markdown; an empty string when the input is empty or missing.
 */
function tidyMarkdown(markdown: string): string {
    // Guard against an empty or (at runtime) missing value instead of throwing on `.replace`.
    if (!markdown) {
        return '';
    }

    const withTidyLinks = markdown.replace(
        MARKDOWN_LINK,
        (_match, text: string, destination: string) =>
            `[${tidyLinkText(text)}](${tidyLinkDestination(destination)})`,
    );

    return withTidyLinks.replace(/\n{3,}/g, '\n\n');
}