Merge branch 'main' of https://github.com/mendableai/firecrawl

Commit: efc6fcb474
@@ -21,7 +21,7 @@ export async function generateRequestParams(
   };

   try {
-    const urlKey = new URL(url).hostname;
+    const urlKey = new URL(url).hostname.replace(/^www\./, "");
     if (urlSpecificParams.hasOwnProperty(urlKey)) {
       return { ...defaultParams, ...urlSpecificParams[urlKey] };
     } else {
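Note on the change above: stripping a leading "www." means both host spellings of a URL resolve to the same urlSpecificParams entry. A minimal, self-contained sketch of that lookup behaviour (the params object and default values here are placeholders, not the project's real configuration):

// Hypothetical, trimmed-down lookup for illustration only.
const urlSpecificParams: Record<string, object> = {
  "ycombinator.com": { defaultScraper: "playwright" },
};
const defaultParams = { timeout: 15000 };

function paramsFor(url: string): object {
  // Drop a leading "www." so both host spellings hit the same override.
  const urlKey = new URL(url).hostname.replace(/^www\./, "");
  return urlSpecificParams.hasOwnProperty(urlKey)
    ? { ...defaultParams, ...urlSpecificParams[urlKey] }
    : defaultParams;
}

// Both calls now pick up the "ycombinator.com" overrides.
console.log(paramsFor("https://www.ycombinator.com/companies"));
console.log(paramsFor("https://ycombinator.com/companies"));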
@@ -77,12 +77,15 @@ export async function scrapWithScrapingBee(

 export async function scrapWithPlaywright(url: string): Promise<string> {
   try {
+    const reqParams = await generateRequestParams(url);
+    const wait_playwright = reqParams["params"]?.wait ?? 0;
+
     const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
       },
-      body: JSON.stringify({ url: url }),
+      body: JSON.stringify({ url: url, wait: wait_playwright }),
     });

     if (!response.ok) {
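In short, scrapWithPlaywright now resolves a per-site wait from generateRequestParams and forwards it to the browser microservice. A condensed sketch of the resulting request, for illustration only (error and response handling are omitted):

// Condensed restatement of the patched function.
async function renderWithPlaywright(url: string): Promise<Response> {
  const reqParams = await generateRequestParams(url);
  // Sites such as docs.pdw.co configure wait: 3000; everything else gets 0.
  const wait_playwright = reqParams["params"]?.wait ?? 0;

  return fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    // The new "wait" field tells the service how many milliseconds to pause
    // before returning the rendered page.
    body: JSON.stringify({ url: url, wait: wait_playwright }),
  });
}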
@@ -103,7 +106,7 @@ export async function scrapWithPlaywright(url: string): Promise<string> {

 export async function scrapSingleUrl(
   urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();

@@ -174,51 +177,45 @@ export async function scrapSingleUrl(

     return [await parseMarkdown(cleanedHtml), text];
   };

   try {
-    // TODO: comment this out once we're ready to merge firecrawl-scraper into the mono-repo
-    // let [text, html] = await attemptScraping(urlToScrap, 'firecrawl-scraper');
-    // if (!text || text.length < 100) {
-    //   console.log("Falling back to scraping bee load");
-    //   [text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad');
-    // }
-
-    let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
-    // Basically means that it is using /search endpoint
-    if (pageOptions.fallback === false) {
-      const soup = cheerio.load(html);
-      const metadata = extractMetadata(soup, urlToScrap);
-      return {
-        url: urlToScrap,
-        content: text,
-        markdown: text,
-        html: pageOptions.includeHtml ? html : undefined,
-        metadata: { ...metadata, sourceURL: urlToScrap },
-      } as Document;
+    let [text, html] = ["", ""];
+    let urlKey = urlToScrap;
+    try {
+      urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
+    } catch (error) {
+      console.error(`Invalid URL key, trying: ${urlToScrap}`);
     }
-    if (!text || text.length < 100) {
-      console.log("Falling back to playwright");
-      [text, html] = await attemptScraping(urlToScrap, "playwright");
+    const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
+    const scrapersInOrder = defaultScraper
+      ? [
+          defaultScraper,
+          "scrapingBee",
+          "playwright",
+          "scrapingBeeLoad",
+          "fetch",
+        ]
+      : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
+
+    for (const scraper of scrapersInOrder) {
+      [text, html] = await attemptScraping(urlToScrap, scraper);
+      if (text && text.length >= 100) break;
+      console.log(`Falling back to ${scraper}`);
     }

-    if (!text || text.length < 100) {
-      console.log("Falling back to scraping bee load");
-      [text, html] = await attemptScraping(urlToScrap, "scrapingBeeLoad");
-    }
-    if (!text || text.length < 100) {
-      console.log("Falling back to fetch");
-      [text, html] = await attemptScraping(urlToScrap, "fetch");
+    if (!text) {
+      throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
     }

     const soup = cheerio.load(html);
     const metadata = extractMetadata(soup, urlToScrap);
-    return {
+    const document: Document = {
       content: text,
       markdown: text,
       html: pageOptions.includeHtml ? html : undefined,
       metadata: { ...metadata, sourceURL: urlToScrap },
-    } as Document;
+    };

+    return document;
   } catch (error) {
     console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
     return {
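The rewritten body of scrapSingleUrl replaces the chain of hard-coded fallback ifs with an ordered list of scrapers: a site's defaultScraper (when configured in urlSpecificParams) is tried first, then scrapingBee, playwright, scrapingBeeLoad, and plain fetch, stopping at the first attempt that yields at least 100 characters of text. A simplified, self-contained sketch of that loop, with attemptScraping stubbed out purely for illustration:

type ScrapeResult = [text: string, html: string];

// Stub standing in for the real attemptScraping(url, method).
async function attemptScraping(url: string, method: string): Promise<ScrapeResult> {
  return ["", ""]; // a real implementation would invoke the chosen scraper here
}

async function scrapeWithFallbacks(url: string, defaultScraper = ""): Promise<ScrapeResult> {
  const baseOrder = ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
  // A configured defaultScraper is simply tried first; the rest remain fallbacks.
  const scrapersInOrder = defaultScraper ? [defaultScraper, ...baseOrder] : baseOrder;

  let text = "";
  let html = "";
  for (const scraper of scrapersInOrder) {
    [text, html] = await attemptScraping(url, scraper);
    if (text && text.length >= 100) break; // good enough, stop falling back
    console.log(`Falling back to ${scraper}`);
  }
  if (!text) {
    throw new Error(`All scraping methods failed for URL: ${url}`);
  }
  return [text, html];
}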
@@ -38,5 +38,45 @@ export const urlSpecificParams = {
       accept:
         "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
     },
+  },
+  "docs.pdw.co":{
+    defaultScraper: "playwright",
+    params: {
+      wait_browser: "networkidle2",
+      block_resources: false,
+      wait: 3000,
+    },
+    headers: {
+      "User-Agent":
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+      "sec-fetch-site": "same-origin",
+      "sec-fetch-mode": "cors",
+      "sec-fetch-dest": "empty",
+      referer: "https://www.google.com/",
+      "accept-language": "en-US,en;q=0.9",
+      "accept-encoding": "gzip, deflate, br",
+      accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    },
+  },
+  "ycombinator.com":{
+    defaultScraper: "playwright",
+    params: {
+      wait_browser: "networkidle2",
+      block_resources: false,
+      wait: 3000,
+    },
+    headers: {
+      "User-Agent":
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+      "sec-fetch-site": "same-origin",
+      "sec-fetch-mode": "cors",
+      "sec-fetch-dest": "empty",
+      referer: "https://www.google.com/",
+      "accept-language": "en-US,en;q=0.9",
+      "accept-encoding": "gzip, deflate, br",
+      accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    },
   }
 };
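Both new entries set defaultScraper: "playwright" together with a 3000 ms wait, so scrapSingleUrl tries the Playwright service first for these hosts while generateRequestParams merges their params and headers over the defaults. A hedged sketch of how such an entry could be typed and read; the interface and helper below are illustrative, not part of the diff:

// Illustrative typing of a urlSpecificParams entry, mirroring the fields added above.
interface SiteParams {
  defaultScraper?: string;
  params?: { wait_browser?: string; block_resources?: boolean; wait?: number };
  headers?: Record<string, string>;
}

function preferredScraper(params: Record<string, SiteParams>, hostname: string): string {
  const key = hostname.replace(/^www\./, "");
  // An empty string means "no site-specific preference"; the caller then uses
  // the standard order starting with scrapingBee.
  return params[key]?.defaultScraper ?? "";
}

// e.g. preferredScraper(urlSpecificParams, "www.ycombinator.com") === "playwright"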