diff --git a/apps/api/requests.http b/apps/api/requests.http
index 32f9f60e..3a1a9902 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -1,10 +1,10 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer fc-
+Authorization: Bearer fc
 content-type: application/json
 
 {
-    "url":"new.abb.com/sustainability/foundation"
+    "url":"firecrawl.dev"
 }
 
 
@@ -18,7 +18,7 @@ Authorization: Bearer fc-
 content-type: application/json
 
 {
-    "url": "new.abb.com/sustainability/foundation"
+    "url": "firecrawl.dev"
 }
 ## "reoveTags": [],
 
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 43edc579..8fd876d3 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -66,7 +66,6 @@ export async function crawlController(req: Request, res: Response) {
       parsePDF: true
     };
 
-    console.log('1. here OK!')
     if (mode === "single_urls" && !url.includes(",")) {
       try {
         const a = new WebScraperDataProvider();
@@ -85,7 +84,6 @@ export async function crawlController(req: Request, res: Response) {
             current_url: progress.currentDocumentUrl,
           });
         });
-        console.log('crawlController - return res.json...')
         return res.json({
           success: true,
           documents: docs,
diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts
index 86899598..233da921 100644
--- a/apps/api/src/lib/html-to-markdown.ts
+++ b/apps/api/src/lib/html-to-markdown.ts
@@ -1,6 +1,5 @@
 
 export function parseMarkdown(html: string) {
-  console.log('parseMarkdown - start!')
   var TurndownService = require("turndown");
   var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
 
@@ -51,6 +50,5 @@ export function parseMarkdown(html: string) {
     /\[Skip to Content\]\(#[^\)]*\)/gi,
     ""
   );
-  console.log('parseMarkdown - return')
   return markdownContent;
 }
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index a8c4e84a..dee89bc4 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -78,11 +78,9 @@ export async function runWebScraper({
       pageOptions: pageOptions,
     });
   }
-  console.log('runWebScraper - getDocuments')
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
     inProgress(progress);
   })) as Document[];
-  console.log('runWebScraper - getDocuments - done - docs.length:', docs.length)
 
   if (docs.length === 0) {
     return {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index d1e93cdf..2e2dec2b 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -130,31 +130,20 @@ export class WebCrawler {
   ): Promise<{ url: string, html: string }[]> {
     // Fetch and parse robots.txt
     try {
-      console.log('3.1 here OK')
-      console.log('this.robotsTxtUrl:', this.robotsTxtUrl)
       const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
-      console.log('????', {response})
-      console.log('3.2 here OK')
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
     } catch (error) {
       console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
-
     }
 
-    console.log('4. here OK!')
     if(!crawlerOptions?.ignoreSitemap){
-      console.log('4.1')
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
-      console.log('4.2')
       if (sitemapLinks.length > 0) {
-        console.log('4.3')
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
-        console.log('4.4')
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
     }
 
-    console.log('5. here OK!')
     const urls = await this.crawlUrls(
       [this.initialUrl],
       pageOptions,
@@ -162,7 +151,6 @@ export class WebCrawler {
       inProgress
     );
 
-    console.log('6. here OK!')
     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -172,7 +160,6 @@ export class WebCrawler {
 
     // make sure to run include exclude here again
     const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
-    console.log('7. here OK!')
     return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
   }
 
@@ -189,7 +176,6 @@ export class WebCrawler {
         }
         return;
       }
-      console.log('crawlUrls - crawl')
       const newUrls = await this.crawl(task, pageOptions);
       // add the initial url if not already added
       // if (this.visited.size === 1) {
@@ -202,7 +188,6 @@ export class WebCrawler {
       //   }
       // }
 
-      console.log('---??---')
       newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
 
       if (inProgress && newUrls.length > 0) {
@@ -220,14 +205,12 @@ export class WebCrawler {
           currentDocumentUrl: task,
         });
       }
-      console.log('----???----')
       await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
     }, concurrencyLimit);
 
-    console.log('crawlUrls - queue.push')
     queue.push(
       urls.filter(
         (url) =>
@@ -237,9 +220,7 @@ export class WebCrawler {
         if (err) console.error(err);
       }
     );
-    console.log('crawlUrls - queue.drain')
     await queue.drain();
-    console.log('crawlUrls - return')
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
 
@@ -267,22 +248,17 @@ export class WebCrawler {
 
       // If it is the first link, fetch with single url
      if (this.visited.size === 1) {
-        console.log('crawl scrapSingleUrl...')
         const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
-        console.log('got a page! lets continue...')
         content = page.html ?? "";
         pageStatusCode = page.metadata?.pageStatusCode;
         pageError = page.metadata?.pageError || undefined;
       } else {
-        // console.log('crawl - else')
         const response = await axios.get(url, { timeout: axiosTimeout });
-        console.log('crawl - else - response ok')
         content = response.data ?? "";
         pageStatusCode = response.status;
         pageError = response.statusText != "OK" ? response.statusText : undefined;
       }
 
-      console.log('crawl... keep going')
       const $ = load(content);
       let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
 
@@ -291,17 +267,14 @@ export class WebCrawler {
         links.push({ url, html: content, pageStatusCode, pageError });
       }
 
-      console.log('crawl... keep going 2')
      $("a").each((_, element) => {
         const href = $(element).attr("href");
         if (href) {
-          console.log('href:', href)
           let fullUrl = href;
           if (!href.startsWith("http")) {
             fullUrl = new URL(href, this.baseUrl).toString();
           }
           const urlObj = new URL(fullUrl);
-          console.log('urlObj:', urlObj)
 
           const path = urlObj.pathname;
 
@@ -313,19 +286,15 @@ export class WebCrawler {
             !this.matchesExcludes(path) &&
             this.isRobotsAllowed(fullUrl)
           ) {
-            console.log(fullUrl)
-
             links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
           }
         }
       });
-      console.log('crawl... keep going 3')
-      
+
       if (this.visited.size === 1) {
         return links;
       }
 
-      console.log('returning crawl...')
       // Create a new list to return to avoid modifying the visited list
       return links.filter((link) => !this.visited.has(link.url));
     } catch (error) {
@@ -412,7 +381,6 @@ export class WebCrawler {
   //
 
   private async tryFetchSitemapLinks(url: string): Promise<string[]> {
-    console.log("4.1.1 - Normalizing URL");
     const normalizeUrl = (url: string) => {
       url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
       if (url.endsWith("/")) {
@@ -421,18 +389,15 @@ export class WebCrawler {
       return url;
     };
 
-    console.log("4.1.2 - Constructing sitemap URL");
     const sitemapUrl = url.endsWith("/sitemap.xml")
       ? url
       : `${url}/sitemap.xml`;
 
     let sitemapLinks: string[] = [];
 
-    console.log("4.1.3 - Fetching sitemap from constructed URL");
     try {
       const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       if (response.status === 200) {
-        console.log("4.1.4 - Extracting links from sitemap");
         sitemapLinks = await getLinksFromSitemap(sitemapUrl);
       }
     } catch (error) {
@@ -440,12 +405,10 @@ export class WebCrawler {
     }
 
     if (sitemapLinks.length === 0) {
-      console.log("4.1.5 - Trying base URL sitemap as fallback");
       const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
       try {
         const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
         if (response.status === 200) {
-          console.log("4.1.6 - Extracting links from base URL sitemap");
           sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
         }
       } catch (error) {
@@ -453,16 +416,12 @@ export class WebCrawler {
       }
     }
 
-    console.log("4.1.7 - Normalizing sitemap links");
     const normalizedUrl = normalizeUrl(url);
     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
 
-    console.log("4.1.8 - Checking if normalized URL is already included");
     if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
-      console.log("4.1.9 - Adding initial URL to sitemap links");
       sitemapLinks.push(url);
     }
-    console.log("4.1.10 - Returning sitemap links");
     return sitemapLinks;
   }
 }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 037b6898..9e318505 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -63,13 +63,11 @@ export class WebScraperDataProvider {
       await Promise.all(
         batchUrls.map(async (url, index) => {
           const existingHTML = allHtmls ? allHtmls[i + index] : "";
-          console.log('convertUrlsToDocuments - scrapSingleUrl')
           const result = await scrapSingleUrl(
             url,
             this.pageOptions,
             existingHTML
           );
-          console.log('convertUrlsToDocuments - result ok')
           processedUrls++;
           if (inProgress) {
             inProgress({
@@ -100,7 +98,6 @@ export class WebScraperDataProvider {
           return [] as Document[];
         }
       }
-    console.log('returning results from convertUrlsToDocuments...')
     return results.filter((result) => result !== null) as Document[];
   }
 
@@ -109,7 +106,6 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     this.validateInitialUrl();
-    console.log('2. here OK!')
     if (!useCaching) {
       return this.processDocumentsWithoutCache(inProgress);
     }
@@ -178,7 +174,6 @@ export class WebScraperDataProvider {
       allowBackwardCrawling: this.allowBackwardCrawling,
     });
 
-    console.log('3. here OK!')
     let links = await crawler.start(
       inProgress,
       this.pageOptions,
@@ -190,28 +185,21 @@ export class WebScraperDataProvider {
       this.maxCrawledDepth
     );
 
-    console.log("8 - Mapping URLs from links");
     let allLinks = links.map((e) => e.url);
-    console.log("9 - Mapping HTML content from links");
     const allHtmls = links.map((e) => e.html);
 
-    console.log("10 - Checking if only URLs should be returned");
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(allLinks, inProgress);
     }
 
     let documents = [];
-    console.log("11 - Checking if crawler is in fast mode and HTML content is present");
     // check if fast mode is enabled and there is html inside the links
     if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
-      console.log("12 - Processing links with HTML content in fast mode");
       documents = await this.processLinks(allLinks, inProgress, allHtmls);
     } else {
-      console.log("13 - Processing links in normal mode");
       documents = await this.processLinks(allLinks, inProgress);
     }
 
-    console.log("14 - Caching and finalizing documents");
     return this.cacheAndFinalizeDocuments(documents, allLinks);
   }
 
@@ -270,22 +258,14 @@ export class WebScraperDataProvider {
 
     links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
 
-    console.log('processLinks - convertUrlsToDocuments...')
     let documents = await this.convertUrlsToDocuments(
       links,
       inProgress,
       allHtmls
     );
-    console.log('processLinks - convertUrlsToDocuments - done')
 
-    console.log('processLinks - getSitemapData...')
     documents = await this.getSitemapData(this.urls[0], documents);
-    console.log('processLinks - getSitemapData - done')
-
-    console.log('processLinks - applyPathReplacements...')
     documents = this.applyPathReplacements(documents);
-    console.log('processLinks - applyPathReplacements - done')
-
     // documents = await this.applyImgAltText(documents);
 
     if (
@@ -294,7 +274,6 @@ export class WebScraperDataProvider {
     ) {
       documents = await generateCompletions(documents, this.extractorOptions);
     }
-    console.log('processLinks - returning...')
     return documents.concat(pdfDocuments).concat(docxDocuments);
   }
 
@@ -340,11 +319,8 @@ export class WebScraperDataProvider {
     documents: Document[],
     links: string[]
   ): Promise<Document[]> {
-    console.log('cacheAndFinalizeDocuments - 1')
     await this.setCachedDocuments(documents, links);
-    console.log('cacheAndFinalizeDocuments - 2')
     documents = this.removeChildLinks(documents);
-    console.log('cacheAndFinalizeDocuments - 3')
     return documents.splice(0, this.limit);
   }
 
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 05a24c29..9f8d563d 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -113,25 +113,12 @@ export async function scrapWithScrapingBee(
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
   try {
-    console.log("13. scrapWithScrapingBee - 1")
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
-    console.log("13. scrapWithScrapingBee - 2")
     const clientParams = await generateRequestParams(
       url,
       wait_browser,
       timeout,
     );
-    console.log({ url,
-      wait_browser,
-      timeout })
-    console.log({
-      ...clientParams,
-      params: {
-        ...clientParams.params,
-        'transparent_status_code': 'True'
-      }
-    })
-    console.log("13. scrapWithScrapingBee - 3")
     const response = await client.get({
       ...clientParams,
       params: {
@@ -139,7 +126,6 @@ export async function scrapWithScrapingBee(
         'transparent_status_code': 'True'
       }
     });
-    console.log("13. scrapWithScrapingBee - 4")
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
       return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
@@ -152,7 +138,6 @@ export async function scrapWithScrapingBee(
       } catch (decodeError) {
         console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
       }
-      console.log("13. scrapWithScrapingBee - 5 - returning ok")
       return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
     }
   } catch (error) {
@@ -409,14 +394,8 @@ export async function scrapSingleUrl(
       screenshot = customScrapedContent.screenshot;
     }
 
-    console.log(
-      'chegou aqui'
-    )
-
     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
-    console.log('cleanedHtml')
-
     return {
       text: await parseMarkdown(cleanedHtml),
       html: cleanedHtml,
@@ -450,9 +429,7 @@ export async function scrapSingleUrl(
         break;
       }
 
-      console.log('attemptScraping - 1')
       const attempt = await attemptScraping(urlToScrap, scraper);
-      console.log('attemptScraping - 2 - return ok')
       text = attempt.text ?? '';
       html = attempt.html ?? '';
       screenshot = attempt.screenshot ?? '';
@@ -471,7 +448,6 @@ export async function scrapSingleUrl(
         console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
       }
     }
-    console.log('ok... here we are...')
 
     if (!text) {
       throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
@@ -508,7 +484,6 @@ export async function scrapSingleUrl(
       };
     }
 
-    console.log('returning document...')
     return document;
   } catch (error) {
     console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
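
Not part of the diff itself: the change above strips ad-hoc `console.log` tracing from the crawl and scrape paths. If similar step-by-step visibility is wanted later without re-littering the code, one option is a small env-gated helper. The sketch below is illustrative only; the `debugLog` name and the `FIRECRAWL_DEBUG` flag are assumptions, not anything introduced by this commit.

```ts
// Illustrative sketch (not from this commit): an env-gated tracer so debug
// output can be switched on without scattering bare console.log calls.
const DEBUG_ENABLED = process.env.FIRECRAWL_DEBUG === "true"; // assumed flag name

export function debugLog(...args: unknown[]): void {
  if (DEBUG_ENABLED) {
    // console.debug keeps trace output distinguishable from real application logs
    console.debug("[firecrawl:debug]", ...args);
  }
}

// Hypothetical usage, e.g. where console.log('crawlUrls - queue.drain') used to be:
// debugLog("crawlUrls - queue.drain");
```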