diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 6ab30036..262a90c0 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -5,9 +5,28 @@ import dotenv from "dotenv";
 import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
+import { urlSpecificParams } from "./utils/custom/website_params";
 
 dotenv.config();
 
+export async function generateRequestParams(
+  url: string,
+  wait_browser: string = "domcontentloaded",
+  timeout: number = 15000
+): Promise<any> {
+  const defaultParams = {
+    url: url,
+    params: { timeout: timeout, wait_browser: wait_browser },
+    headers: { "ScrapingService-Request": "TRUE" },
+  };
+
+  const urlKey = new URL(url).hostname;
+  if (urlSpecificParams.hasOwnProperty(urlKey)) {
+    return { ...defaultParams, ...urlSpecificParams[urlKey] };
+  } else {
+    return defaultParams;
+  }
+}
 export async function scrapWithCustomFirecrawl(
   url: string,
   options?: any
@@ -28,11 +47,13 @@ export async function scrapWithScrapingBee(
 ): Promise<string> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
-    const response = await client.get({
-      url: url,
-      params: { timeout: timeout, wait_browser: wait_browser },
-      headers: { "ScrapingService-Request": "TRUE" },
-    });
+    const clientParams = await generateRequestParams(
+      url,
+      wait_browser,
+      timeout
+    );
+
+    const response = await client.get(clientParams);
 
     if (response.status !== 200 && response.status !== 404) {
       console.error(
@@ -107,11 +128,15 @@ export async function scrapSingleUrl(
     let text = "";
     switch (method) {
       case "firecrawl-scraper":
-        text = await scrapWithCustomFirecrawl(url,);
+        text = await scrapWithCustomFirecrawl(url);
         break;
      case "scrapingBee":
         if (process.env.SCRAPING_BEE_API_KEY) {
-          text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000);
+          text = await scrapWithScrapingBee(
+            url,
+            "domcontentloaded",
+            pageOptions.fallback === false ? 7000 : 15000
+          );
         }
         break;
       case "playwright":
@@ -141,7 +166,7 @@ export async function scrapSingleUrl(
         break;
     }
     let cleanedHtml = removeUnwantedElements(text, pageOptions);
-    
+
     return [await parseMarkdown(cleanedHtml), text];
   };
 
@@ -155,7 +180,7 @@ export async function scrapSingleUrl(
   let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
 
   // Basically means that it is using /search endpoint
-  if(pageOptions.fallback === false){
+  if (pageOptions.fallback === false) {
     const soup = cheerio.load(html);
     const metadata = extractMetadata(soup, urlToScrap);
     return {
diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
new file mode 100644
index 00000000..164b0741
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
@@ -0,0 +1,24 @@
+export const urlSpecificParams = {
+  "platform.openai.com": {
+    params: {
+      wait_browser: "networkidle2",
+      block_resources: false,
+    },
+    headers: {
+      "User-Agent":
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+      "sec-fetch-site": "same-origin",
+      "sec-fetch-mode": "cors",
+      "sec-fetch-dest": "empty",
+      referer: "https://www.google.com/",
+      "accept-language": "en-US,en;q=0.9",
+      "accept-encoding": "gzip, deflate, br",
+      accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    },
+    cookies: {
+      __cf_bm:
+        "mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
+    },
+  },
+};
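
Two notes on the new generateRequestParams helper follow, each with a short TypeScript sketch; identifiers such as defaults, override, merged, and plain are invented for illustration, while generateRequestParams and urlSpecificParams come from the diff above.

First, the object spread in generateRequestParams merges shallowly: when a urlSpecificParams entry supplies a top-level key such as params or headers, that object replaces the corresponding default wholesale rather than deep-merging with it, so the default timeout and the ScrapingService-Request header are dropped for any host that defines its own params and headers.

// Minimal sketch of the shallow-merge behaviour; the values are illustrative.
const defaults = {
  url: "https://platform.openai.com/docs",
  params: { timeout: 15000, wait_browser: "domcontentloaded" },
  headers: { "ScrapingService-Request": "TRUE" },
};
const override = {
  params: { wait_browser: "networkidle2", block_resources: false },
};

const merged = { ...defaults, ...override };
// merged.params => { wait_browser: "networkidle2", block_resources: false }
// The default timeout is gone, and merged.headers survives only because this
// particular override omits a headers key.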
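
Second, lookup is keyed on the exact hostname returned by new URL(url).hostname, so a www-prefixed or other subdomain variant of a listed host will not match its entry. A hypothetical check of both resolution paths, assuming generateRequestParams is imported from the patched module:

import { generateRequestParams } from "./single_url";

(async () => {
  // Unknown host: the defaults pass through unchanged.
  const plain = await generateRequestParams("https://example.com/page");
  console.log(plain.params); // { timeout: 15000, wait_browser: "domcontentloaded" }

  // Listed host: the platform.openai.com entry wins, and its cookies field
  // (the Cloudflare __cf_bm token) is presumably forwarded by the ScrapingBee client.
  const openai = await generateRequestParams("https://platform.openai.com/docs");
  console.log(openai.params); // { wait_browser: "networkidle2", block_resources: false }
})();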