Caleb: made changes per Rafael's requests

This commit is contained in:
Caleb Peffer 2024-07-17 11:29:05 -07:00
parent da3c6bca37
commit c5d1e7260d
3 changed files with 36 additions and 30 deletions

View File

@ -33,4 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => {
expect(result.linksOnPage).toBeDefined();
expect(Array.isArray(result.linksOnPage)).toBe(true);
expect(result.linksOnPage.length).toBeGreaterThan(0);
expect(result.linksOnPage).toContain('https://www.mendable.ai/blog')
}, 10000);

View File

@ -16,6 +16,7 @@ import { scrapWithFetch } from "./scrapers/fetch";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { scrapWithPlaywright } from "./scrapers/playwright";
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
import { extractLinks } from "./utils/utils";
dotenv.config();
@ -109,37 +110,7 @@ function getScrapingFallbackOrder(
return scrapersInOrder as (typeof baseScrapers)[number][];
}
/**
 * Collects the targets of every `<a href>` in an HTML document as absolute links.
 *
 * - `http://` / `https://` hrefs are kept exactly as written.
 * - `mailto:` hrefs are kept exactly as written.
 * - Fragment-only hrefs (`#...`) and empty hrefs are ignored.
 * - Every other href (root-relative, path-relative, protocol-relative, `?query`)
 *   is resolved against `baseUrl` using standard URL resolution, so
 *   `x.html` on `https://a.com/dir/page.html` yields `https://a.com/dir/x.html`.
 * - Hrefs that cannot be resolved, or that resolve to a non-navigational scheme
 *   such as `javascript:` or `tel:`, are skipped.
 *
 * @param html - HTML markup to scan for anchor elements.
 * @param baseUrl - Absolute URL of the page the markup came from.
 * @returns De-duplicated links in first-seen document order.
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  // Parse the base once up front: an invalid base fails fast instead of being
  // swallowed per link, and relative hrefs are resolved against the parsed URL.
  const base = new URL(baseUrl);
  // A Set keeps insertion order and removes duplicates as links are collected.
  const links = new Set<string>();

  $('a').each((_, element) => {
    const href = $(element).attr('href');
    if (!href || href.startsWith('#')) {
      // Missing/empty href or a fragment-only link: nothing to collect.
      return;
    }

    if (
      href.startsWith('http://') ||
      href.startsWith('https://') ||
      href.startsWith('mailto:')
    ) {
      // Already absolute (or a mail address): keep the author's exact text.
      links.add(href);
      return;
    }

    let resolved: URL;
    try {
      resolved = new URL(href, base);
    } catch (_error) {
      // Malformed href; one bad anchor must not abort the whole extraction.
      return;
    }

    // Keep web links (and links sharing the base's own scheme); drop things like
    // `javascript:` or `tel:` that would otherwise be glued onto the base URL.
    if (
      resolved.protocol === 'http:' ||
      resolved.protocol === 'https:' ||
      resolved.protocol === base.protocol
    ) {
      links.add(resolved.href);
    }
  });

  return [...links];
}
export async function scrapSingleUrl(
urlToScrap: string,

View File

@ -1,4 +1,6 @@
import axios from "axios";
import * as cheerio from "cheerio";
export async function attemptScrapWithRequests(
urlToScrap: string
@ -21,3 +23,35 @@ export async function attemptScrapWithRequests(
/**
 * Removes every NUL (`\u0000`) character from `text`.
 *
 * A string pattern passed to `String.prototype.replace` only replaces the first
 * match, so a global regular expression is used to strip all occurrences.
 *
 * @param text - Text that may contain NUL characters.
 * @returns `text` with all NUL characters removed.
 */
export function sanitizeText(text: string): string {
  return text.replace(/\u0000/g, "");
}
/**
 * Collects the targets of every `<a href>` in an HTML document as absolute links.
 *
 * - `http://` / `https://` hrefs are kept exactly as written.
 * - `mailto:` hrefs are kept exactly as written.
 * - Fragment-only hrefs (`#...`) and empty hrefs are ignored.
 * - Every other href (root-relative, path-relative, protocol-relative, `?query`)
 *   is resolved against `baseUrl` using standard URL resolution, so
 *   `x.html` on `https://a.com/dir/page.html` yields `https://a.com/dir/x.html`.
 * - Hrefs that cannot be resolved, or that resolve to a non-navigational scheme
 *   such as `javascript:` or `tel:`, are skipped.
 *
 * @param html - HTML markup to scan for anchor elements.
 * @param baseUrl - Absolute URL of the page the markup came from.
 * @returns De-duplicated links in first-seen document order.
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
export function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  // Parse the base once up front: an invalid base fails fast instead of being
  // swallowed per link, and relative hrefs are resolved against the parsed URL.
  const base = new URL(baseUrl);
  // A Set keeps insertion order and removes duplicates as links are collected.
  const links = new Set<string>();

  $('a').each((_, element) => {
    const href = $(element).attr('href');
    if (!href || href.startsWith('#')) {
      // Missing/empty href or a fragment-only link: nothing to collect.
      return;
    }

    if (
      href.startsWith('http://') ||
      href.startsWith('https://') ||
      href.startsWith('mailto:')
    ) {
      // Already absolute (or a mail address): keep the author's exact text.
      links.add(href);
      return;
    }

    let resolved: URL;
    try {
      resolved = new URL(href, base);
    } catch (_error) {
      // Malformed href; one bad anchor must not abort the whole extraction.
      return;
    }

    // Keep web links (and links sharing the base's own scheme); drop things like
    // `javascript:` or `tel:` that would otherwise be glued onto the base URL.
    if (
      resolved.protocol === 'http:' ||
      resolved.protocol === 'https:' ||
      resolved.protocol === base.protocol
    ) {
      links.add(resolved.href);
    }
  });

  return [...links];
}