mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Caleb: made changes per Rafael's requests
This commit is contained in:
parent
da3c6bca37
commit
c5d1e7260d
|
@ -33,4 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => {
|
||||||
expect(result.linksOnPage).toBeDefined();
|
expect(result.linksOnPage).toBeDefined();
|
||||||
expect(Array.isArray(result.linksOnPage)).toBe(true);
|
expect(Array.isArray(result.linksOnPage)).toBe(true);
|
||||||
expect(result.linksOnPage.length).toBeGreaterThan(0);
|
expect(result.linksOnPage.length).toBeGreaterThan(0);
|
||||||
|
expect(result.linksOnPage).toContain('https://www.mendable.ai/blog')
|
||||||
}, 10000);
|
}, 10000);
|
||||||
|
|
|
@ -16,6 +16,7 @@ import { scrapWithFetch } from "./scrapers/fetch";
|
||||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||||
import { scrapWithPlaywright } from "./scrapers/playwright";
|
import { scrapWithPlaywright } from "./scrapers/playwright";
|
||||||
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
|
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
|
||||||
|
import { extractLinks } from "./utils/utils";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
|
@ -109,37 +110,7 @@ function getScrapingFallbackOrder(
|
||||||
return scrapersInOrder as (typeof baseScrapers)[number][];
|
return scrapersInOrder as (typeof baseScrapers)[number][];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Collects the distinct link targets of every `<a href>` in an HTML document.
 *
 * - Absolute `http://` / `https://` hrefs and `mailto:` hrefs are returned unchanged.
 * - Fragment-only hrefs (`#...`) and empty hrefs are ignored.
 * - Every other href (root-relative, path-relative, `../`, `?query`,
 *   protocol-relative `//host/...`) is resolved against `baseUrl` using standard
 *   URL resolution, the same way a browser would resolve it.
 * - Hrefs that resolve to a non-http(s) scheme (e.g. `javascript:`, `tel:`) or
 *   that cannot be parsed are dropped.
 *
 * @param html - HTML markup to scan for anchors.
 * @param baseUrl - Absolute URL of the page the markup was fetched from.
 * @returns De-duplicated links in first-seen order.
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  // Parse the base once up front so an invalid base URL fails immediately.
  const base = new URL(baseUrl);
  // A Set both de-duplicates and preserves insertion order.
  const links = new Set<string>();

  $('a').each((_, element) => {
    const rawHref = $(element).attr('href');
    if (!rawHref) {
      return;
    }
    const href = rawHref.trim();
    if (href === '' || href.startsWith('#')) {
      // Fragment-only links point back into the same page.
      return;
    }
    if (/^(?:https?:\/\/|mailto:)/i.test(href)) {
      // Already absolute (or a mail link): keep exactly as written.
      links.add(href);
      return;
    }
    try {
      // Plain string concatenation mishandles bases that have a path, `../`
      // segments, query-only hrefs and protocol-relative URLs.
      const resolved = new URL(href, base);
      if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
        links.add(resolved.href);
      }
    } catch (_error) {
      // Unparsable href: skip it rather than abort the whole extraction.
    }
  });

  return [...links];
}
|
|
||||||
|
|
||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
|
import * as cheerio from "cheerio";
|
||||||
|
|
||||||
|
|
||||||
export async function attemptScrapWithRequests(
|
export async function attemptScrapWithRequests(
|
||||||
urlToScrap: string
|
urlToScrap: string
|
||||||
|
@ -21,3 +23,35 @@ export async function attemptScrapWithRequests(
|
||||||
/**
 * Removes every NUL (`\u0000`) character from `text`.
 *
 * @param text - Input string, possibly containing NUL characters.
 * @returns `text` with all NUL characters stripped; other characters are untouched.
 */
export function sanitizeText(text: string): string {
  // `String.prototype.replace` with a string pattern only replaces the first
  // match, so split/join is used to strip all occurrences.
  return text.split("\u0000").join("");
}
|
||||||
|
|
||||||
|
/**
 * Collects the distinct link targets of every `<a href>` in an HTML document.
 *
 * - Absolute `http://` / `https://` hrefs and `mailto:` hrefs are returned unchanged.
 * - Fragment-only hrefs (`#...`) and empty hrefs are ignored.
 * - Every other href (root-relative, path-relative, `../`, `?query`,
 *   protocol-relative `//host/...`) is resolved against `baseUrl` using standard
 *   URL resolution, the same way a browser would resolve it.
 * - Hrefs that resolve to a non-http(s) scheme (e.g. `javascript:`, `tel:`) or
 *   that cannot be parsed are dropped.
 *
 * @param html - HTML markup to scan for anchors.
 * @param baseUrl - Absolute URL of the page the markup was fetched from.
 * @returns De-duplicated links in first-seen order.
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
export function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  // Parse the base once up front so an invalid base URL fails immediately.
  const base = new URL(baseUrl);
  // A Set both de-duplicates and preserves insertion order.
  const links = new Set<string>();

  $('a').each((_, element) => {
    const rawHref = $(element).attr('href');
    if (!rawHref) {
      return;
    }
    const href = rawHref.trim();
    if (href === '' || href.startsWith('#')) {
      // Fragment-only links point back into the same page.
      return;
    }
    if (/^(?:https?:\/\/|mailto:)/i.test(href)) {
      // Already absolute (or a mail link): keep exactly as written.
      links.add(href);
      return;
    }
    try {
      // Plain string concatenation mishandles bases that have a path, `../`
      // segments, query-only hrefs and protocol-relative URLs.
      const resolved = new URL(href, base);
      if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
        links.add(resolved.href);
      }
    } catch (_error) {
      // Unparsable href: skip it rather than abort the whole extraction.
    }
  });

  return [...links];
}
|
Loading…
Reference in New Issue
Block a user