This commit is contained in:
Nicolas 2024-04-16 12:06:46 -04:00
parent 36fe5f5986
commit 93627ae87c
4 changed files with 52 additions and 20 deletions

View File

@ -3,7 +3,7 @@
Crawl and convert any website into LLM-ready markdown. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) Crawl and convert any website into LLM-ready markdown. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl)
*This repo is still in early development and its main purpose is to help improve accuracy of LLM response given clean data* *This repository is in the early stages of development. We are in the process of merging custom modules into this monorepo. The primary objective is to improve the accuracy of LLM responses by providing clean data. It is not yet ready for full self-hosting; we're working on it.*
## What is Firecrawl? ## What is Firecrawl?

View File

@ -2,5 +2,5 @@
Guide coming soon. Guide coming soon.
*This repository is in the early stages of development. We are in the process of merging custom modules into this monorepo. The primary objective is to improve the accuracy of LLM responses by providing clean data. It is not yet ready for full self-hosting; we're working on it.*

View File

@ -13,7 +13,6 @@ export function parseApi(api: string) {
return uuid; return uuid;
} }
console.log(parseApi("fc-a6a2d63aed2b46a9946d2a7207efed4d"))
export function uuidToFcUuid(uuid: string) { export function uuidToFcUuid(uuid: string) {
const uuidWithoutDashes = uuid.replace(/-/g, ""); const uuidWithoutDashes = uuid.replace(/-/g, "");

View File

@ -1,6 +1,5 @@
import * as cheerio from "cheerio"; import * as cheerio from "cheerio";
import { ScrapingBeeClient } from "scrapingbee"; import { ScrapingBeeClient } from "scrapingbee";
import { attemptScrapWithRequests, sanitizeText } from "./utils/utils";
import { extractMetadata } from "./utils/metadata"; import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv"; import dotenv from "dotenv";
import { Document } from "../../lib/entities"; import { Document } from "../../lib/entities";
@ -9,9 +8,23 @@ import { parseMarkdown } from "../../lib/html-to-markdown";
dotenv.config(); dotenv.config();
/**
 * Placeholder for scraping a URL with the custom firecrawl-scraper.
 *
 * The custom scraper has not been merged into this repository yet, so this
 * function currently always resolves to an empty string. An empty string is
 * the same "nothing scraped" value the error branch below returns, which keeps
 * the declared `Promise<string>` contract honest and lets callers treat the
 * result as a string without a null check.
 *
 * @param url - Address of the page to scrape (unused until the scraper is merged).
 * @param options - Reserved for scraper options (currently unused).
 * @returns The scraped content; currently always `""`.
 */
export async function scrapWithCustomFirecrawl(
  url: string,
  options?: any
): Promise<string> {
  try {
    // TODO: merge the custom firecrawl scraper into mono-repo when ready
    // Return "" rather than null: the declared return type is string, and the
    // failure path below already uses "" to signal that no content was obtained.
    return "";
  } catch (error) {
    console.error(`Error scraping with custom firecrawl-scraper: ${error}`);
    return "";
  }
}
export async function scrapWithScrapingBee(
export async function scrapWithScrapingBee(url: string, wait_browser:string = "domcontentloaded"): Promise<string> { url: string,
wait_browser: string = "domcontentloaded"
): Promise<string> {
try { try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
const response = await client.get({ const response = await client.get({
@ -35,11 +48,10 @@ export async function scrapWithScrapingBee(url: string, wait_browser:string = "d
} }
} }
export async function scrapWithPlaywright(url: string): Promise<string> { export async function scrapWithPlaywright(url: string): Promise<string> {
try { try {
const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
method: 'POST', method: "POST",
headers: { headers: {
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
@ -47,7 +59,9 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
}); });
if (!response.ok) { if (!response.ok) {
console.error(`Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}`); console.error(
`Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}`
);
return ""; return "";
} }
@ -73,29 +87,42 @@ export async function scrapSingleUrl(
return soup.html(); return soup.html();
}; };
const attemptScraping = async (url: string, method: 'scrapingBee' | 'playwright' | 'scrapingBeeLoad' | 'fetch') => { const attemptScraping = async (
url: string,
method:
| "firecrawl-scraper"
| "scrapingBee"
| "playwright"
| "scrapingBeeLoad"
| "fetch"
) => {
let text = ""; let text = "";
switch (method) { switch (method) {
case 'scrapingBee': case "firecrawl-scraper":
text = await scrapWithCustomFirecrawl(url);
break;
case "scrapingBee":
if (process.env.SCRAPING_BEE_API_KEY) { if (process.env.SCRAPING_BEE_API_KEY) {
text = await scrapWithScrapingBee(url); text = await scrapWithScrapingBee(url);
} }
break; break;
case 'playwright': case "playwright":
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
text = await scrapWithPlaywright(url); text = await scrapWithPlaywright(url);
} }
break; break;
case 'scrapingBeeLoad': case "scrapingBeeLoad":
if (process.env.SCRAPING_BEE_API_KEY) { if (process.env.SCRAPING_BEE_API_KEY) {
text = await scrapWithScrapingBee(url, "networkidle2"); text = await scrapWithScrapingBee(url, "networkidle2");
} }
break; break;
case 'fetch': case "fetch":
try { try {
const response = await fetch(url); const response = await fetch(url);
if (!response.ok) { if (!response.ok) {
console.error(`Error fetching URL: ${url} with status: ${response.status}`); console.error(
`Error fetching URL: ${url} with status: ${response.status}`
);
return ""; return "";
} }
text = await response.text(); text = await response.text();
@ -104,26 +131,32 @@ export async function scrapSingleUrl(
return ""; return "";
} }
break; break;
} }
const cleanedHtml = removeUnwantedElements(text); const cleanedHtml = removeUnwantedElements(text);
return [await parseMarkdown(cleanedHtml), text]; return [await parseMarkdown(cleanedHtml), text];
}; };
try { try {
let [text, html ] = await attemptScraping(urlToScrap, 'scrapingBee'); // TODO: uncomment the block below once we're ready to merge firecrawl-scraper into the mono-repo
// let [text, html] = await attemptScraping(urlToScrap, 'firecrawl-scraper');
// if (!text || text.length < 100) {
// console.log("Falling back to scraping bee load");
// [text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad');
// }
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
if (!text || text.length < 100) { if (!text || text.length < 100) {
console.log("Falling back to playwright"); console.log("Falling back to playwright");
[text, html] = await attemptScraping(urlToScrap, 'playwright'); [text, html] = await attemptScraping(urlToScrap, "playwright");
} }
if (!text || text.length < 100) { if (!text || text.length < 100) {
console.log("Falling back to scraping bee load"); console.log("Falling back to scraping bee load");
[text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad'); [text, html] = await attemptScraping(urlToScrap, "scrapingBeeLoad");
} }
if (!text || text.length < 100) { if (!text || text.length < 100) {
console.log("Falling back to fetch"); console.log("Falling back to fetch");
[text, html] = await attemptScraping(urlToScrap, 'fetch'); [text, html] = await attemptScraping(urlToScrap, "fetch");
} }
const soup = cheerio.load(html); const soup = cheerio.load(html);