diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts index 872adc6e..849d209d 100644 --- a/apps/api/src/scraper/WebScraper/utils/utils.ts +++ b/apps/api/src/scraper/WebScraper/utils/utils.ts @@ -29,27 +29,28 @@ export function extractLinks(html: string, baseUrl: string): string[] { const $ = cheerio.load(html); const links: string[] = []; - // Parse the base URL to get the origin - const urlObject = new URL(baseUrl); - const origin = urlObject.origin; - $('a').each((_, element) => { const href = $(element).attr('href'); if (href) { - if (href.startsWith('http://') || href.startsWith('https://')) { - // Absolute URL, add as is - links.push(href); - } else if (href.startsWith('/')) { - // Relative URL starting with '/', append to origin - links.push(new URL(href, baseUrl).href); - } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { - // Relative URL not starting with '/', append to base URL - links.push(new URL(href, baseUrl).href); - } else if (href.startsWith('mailto:')) { - // mailto: links, add as is - links.push(href); + try { + if (href.startsWith('http://') || href.startsWith('https://')) { + // Absolute URL, add as is + links.push(href); + } else if (href.startsWith('/')) { + // Relative URL starting with '/', append to base URL + links.push(new URL(href, baseUrl).href); + } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { + // Relative URL not starting with '/', append to base URL + links.push(new URL(href, baseUrl).href); + } else if (href.startsWith('mailto:')) { + // mailto: links, add as is + links.push(href); + } + // Fragment-only links (#) are ignored + } catch (error) { + // Log the error and continue + console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error); } - // Fragment-only links (#) are ignored } });