This commit is contained in:
Nicolas 2024-05-09 18:01:04 -07:00
commit efc6fcb474
2 changed files with 76 additions and 39 deletions

View File

@ -21,7 +21,7 @@ export async function generateRequestParams(
}; };
try { try {
const urlKey = new URL(url).hostname; const urlKey = new URL(url).hostname.replace(/^www\./, "");
if (urlSpecificParams.hasOwnProperty(urlKey)) { if (urlSpecificParams.hasOwnProperty(urlKey)) {
return { ...defaultParams, ...urlSpecificParams[urlKey] }; return { ...defaultParams, ...urlSpecificParams[urlKey] };
} else { } else {
@ -57,7 +57,7 @@ export async function scrapWithScrapingBee(
wait_browser, wait_browser,
timeout timeout
); );
const response = await client.get(clientParams); const response = await client.get(clientParams);
if (response.status !== 200 && response.status !== 404) { if (response.status !== 200 && response.status !== 404) {
@ -77,12 +77,15 @@ export async function scrapWithScrapingBee(
export async function scrapWithPlaywright(url: string): Promise<string> { export async function scrapWithPlaywright(url: string): Promise<string> {
try { try {
const reqParams = await generateRequestParams(url);
const wait_playwright = reqParams["params"]?.wait ?? 0;
const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
method: "POST", method: "POST",
headers: { headers: {
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
body: JSON.stringify({ url: url }), body: JSON.stringify({ url: url, wait: wait_playwright }),
}); });
if (!response.ok) { if (!response.ok) {
@ -103,7 +106,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@ -169,56 +172,50 @@ export async function scrapSingleUrl(
break; break;
} }
//* TODO: add an option to return markdown or structured/extracted content //* TODO: add an option to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(text, pageOptions); let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text]; return [await parseMarkdown(cleanedHtml), text];
}; };
try { try {
// TODO: comment this out once we're ready to merge firecrawl-scraper into the mono-repo let [text, html] = ["", ""];
// let [text, html] = await attemptScraping(urlToScrap, 'firecrawl-scraper'); let urlKey = urlToScrap;
// if (!text || text.length < 100) { try {
// console.log("Falling back to scraping bee load"); urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
// [text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad'); } catch (error) {
// } console.error(`Invalid URL key, trying: ${urlToScrap}`);
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
// Basically means that it is using /search endpoint
if (pageOptions.fallback === false) {
const soup = cheerio.load(html);
const metadata = extractMetadata(soup, urlToScrap);
return {
url: urlToScrap,
content: text,
markdown: text,
html: pageOptions.includeHtml ? html : undefined,
metadata: { ...metadata, sourceURL: urlToScrap },
} as Document;
} }
if (!text || text.length < 100) { const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
console.log("Falling back to playwright"); const scrapersInOrder = defaultScraper
[text, html] = await attemptScraping(urlToScrap, "playwright"); ? [
defaultScraper,
"scrapingBee",
"playwright",
"scrapingBeeLoad",
"fetch",
]
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
for (const scraper of scrapersInOrder) {
[text, html] = await attemptScraping(urlToScrap, scraper);
if (text && text.length >= 100) break;
console.log(`Falling back to ${scraper}`);
} }
if (!text || text.length < 100) { if (!text) {
console.log("Falling back to scraping bee load"); throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
[text, html] = await attemptScraping(urlToScrap, "scrapingBeeLoad");
}
if (!text || text.length < 100) {
console.log("Falling back to fetch");
[text, html] = await attemptScraping(urlToScrap, "fetch");
} }
const soup = cheerio.load(html); const soup = cheerio.load(html);
const metadata = extractMetadata(soup, urlToScrap); const metadata = extractMetadata(soup, urlToScrap);
const document: Document = {
return {
content: text, content: text,
markdown: text, markdown: text,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
metadata: { ...metadata, sourceURL: urlToScrap }, metadata: { ...metadata, sourceURL: urlToScrap },
} as Document; };
return document;
} catch (error) { } catch (error) {
console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
return { return {

View File

@ -38,5 +38,45 @@ export const urlSpecificParams = {
accept: accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}, },
},
// Per-site overrides for docs.pdw.co. The key is compared by exact match
// against the URL's hostname after a leading "www." has been stripped.
"docs.pdw.co":{
// Scraper attempted first for this host; the standard fallback chain
// (scrapingBee, playwright, scrapingBeeLoad, fetch) is tried after it.
defaultScraper: "playwright",
params: {
// NOTE(review): wait_browser / block_resources look like ScrapingBee request
// options — confirm against the ScrapingBee client call.
wait_browser: "networkidle2",
block_resources: false,
// Forwarded as `wait` in the Playwright microservice request body
// (defaults to 0 when absent). Presumably milliseconds — TODO confirm.
wait: 3000,
},
// Browser-like request headers for this host.
// NOTE(review): where these headers are applied is not visible here — verify.
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
},
// Per-site overrides for ycombinator.com. The key is compared by exact match
// against the URL's hostname after a leading "www." has been stripped, so
// other subdomains would need their own entry.
"ycombinator.com":{
// Scraper attempted first for this host; the standard fallback chain
// (scrapingBee, playwright, scrapingBeeLoad, fetch) is tried after it.
defaultScraper: "playwright",
params: {
// NOTE(review): wait_browser / block_resources look like ScrapingBee request
// options — confirm against the ScrapingBee client call.
wait_browser: "networkidle2",
block_resources: false,
// Forwarded as `wait` in the Playwright microservice request body
// (defaults to 0 when absent). Presumably milliseconds — TODO confirm.
wait: 3000,
},
// Browser-like request headers for this host.
// NOTE(review): where these headers are applied is not visible here — verify.
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
} }
}; };