From e31a5007d5f6230ce6e33fdb2232c3a0ae855ea2 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 22 Jul 2024 18:30:58 -0400
Subject: [PATCH] Nick: speed improvements

---
 apps/api/src/controllers/scrape.ts          |  86 ++++++++-----
 apps/api/src/scraper/WebScraper/index.ts    | 114 +++++++++++++-----
 apps/api/src/scraper/WebScraper/sitemap.ts  |   4 +-
 .../src/services/billing/credit_billing.ts  |  60 +++++----
 4 files changed, 172 insertions(+), 92 deletions(-)

diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index f076425f..4870925f 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -73,28 +73,6 @@ export async function scrapeHelper(
     });
   }
 
-  let creditsToBeBilled = filteredDocs.length;
-  const creditsPerLLMExtract = 50;
-
-
-
-  if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
-    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
-  }
-
-  const billingResult = await billTeam(
-    team_id,
-    creditsToBeBilled
-  );
-  if (!billingResult.success) {
-    return {
-      success: false,
-      error:
-        "Failed to bill team. Insufficient credits or subscription not found.",
-      returnCode: 402,
-    };
-  }
-
   return {
     success: true,
     data: filteredDocs[0],
@@ -104,6 +82,7 @@ export async function scrapeHelper(
 
 export async function scrapeController(req: Request, res: Response) {
   try {
+    let earlyReturn = false;
     // make sure to authenticate user first, Bearer <token>
     const { success, team_id, error, status, plan } = await authenticateUser(
       req,
@@ -113,28 +92,41 @@ export async function scrapeController(req: Request, res: Response) {
     if (!success) {
       return res.status(status).json({ error });
     }
+
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
    const origin = req.body.origin ?? defaultOrigin;
    let timeout = req.body.timeout ?? defaultTimeout;
-    if (extractorOptions.mode === "llm-extraction") {
+    if (extractorOptions.mode.includes("llm-extraction")) {
      pageOptions.onlyMainContent = true;
      timeout = req.body.timeout ?? 90000;
    }
-
-    try {
-      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
-        await checkTeamCredits(team_id, 1);
-      if (!creditsCheckSuccess) {
-        return res.status(402).json({ error: "Insufficient credits" });
+    const checkCredits = async () => {
+      try {
+        const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
+        if (!creditsCheckSuccess) {
+          earlyReturn = true;
+          return res.status(402).json({ error: "Insufficient credits" });
+        }
+      } catch (error) {
+        console.error(error);
+        earlyReturn = true;
+        return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
      }
-    } catch (error) {
-      console.error(error);
-      return res.status(500).json({ error: "Internal server error" });
+    };
+
+
+    // Async check saves 500ms in average case
+    // Don't async check in llm extraction mode as it could be expensive
+    if (extractorOptions.mode.includes("llm-extraction")) {
+      await checkCredits();
+    } else {
+      checkCredits();
    }
+
    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      req,
@@ -149,6 +141,33 @@ export async function scrapeController(req: Request, res: Response) {
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
 
+    if (result.success) {
+      let creditsToBeBilled = 1; // Assuming 1 credit per document
+      const creditsPerLLMExtract = 50;
+
+      if (extractorOptions.mode.includes("llm-extraction")) {
+        creditsToBeBilled += creditsPerLLMExtract;
+      }
+
+      let startTimeBilling = new Date().getTime();
+
+      if (earlyReturn) {
+        // Don't bill if we're early returning
+        return;
+      }
+      const billingResult = await billTeam(
+        team_id,
+        creditsToBeBilled
+      );
+      if (!billingResult.success) {
+        return res.status(402).json({
+          success: false,
+          error: "Failed to bill team. Insufficient credits or subscription not found.",
+        });
+      }
+      console.log("Billed team in", new Date().getTime() - startTimeBilling, "ms");
+    }
+
    logJob({
      success: result.success,
      message: result.error,
@@ -164,6 +183,9 @@ export async function scrapeController(req: Request, res: Response) {
      extractor_options: extractorOptions,
      num_tokens: numTokens,
    });
+
+
+
    return res.status(result.returnCode).json(result);
  } catch (error) {
    console.error(error);
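The controller change above defers billing until after the scrape and overlaps the credit check with the request itself, which is where most of the saved latency comes from: expensive LLM-extraction requests still gate on the check up front, cheap ones let it run concurrently. A minimal sketch of the same fire-and-forget pattern, with hypothetical hasCredits/doScrape stand-ins rather than Firecrawl's actual helpers:

    // Sketch: overlap a cheap precondition check with the main work.
    // `hasCredits` and `doScrape` are illustrative stand-ins.
    async function handle(expensive: boolean): Promise<string> {
      let earlyReturn = false;

      const checkCredits = async () => {
        const ok = await hasCredits(); // e.g. a DB round-trip
        if (!ok) earlyReturn = true;   // flag instead of throwing
      };

      if (expensive) {
        await checkCredits();          // expensive work: gate up front
        if (earlyReturn) return "402 Insufficient credits";
      } else {
        checkCredits().catch(console.error); // cheap work: run concurrently
      }

      const result = await doScrape();
      if (earlyReturn) return "402 Insufficient credits"; // re-check before billing
      return result;
    }

    async function hasCredits(): Promise<boolean> { return true; }
    async function doScrape(): Promise<string> { return "scraped"; }

One caveat worth noting: in the unawaited branch, a failing check writes its own 402 while the main path may still write a response of its own. The earlyReturn flag prevents billing and skips the final response on the success path, but a check that completes mid-flight can still race the error path's write, and Node will then report a headers-already-sent error.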
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 87b5aee7..66bb15de 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -164,7 +164,6 @@ export class WebScraperDataProvider {
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
@@ -225,7 +224,6 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
-
     let documents = await this.processLinks(links, inProgress);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
@@ -253,35 +251,60 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void,
     allHtmls?: string[]
   ): Promise<Document[]> {
-    const pdfLinks = links.filter(link => link.endsWith(".pdf"));
-    const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
-
-    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-    const docxDocuments = await this.fetchDocxDocuments(docLinks);
-
-    links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
-
-    let documents = await this.convertUrlsToDocuments(
-      links,
-      inProgress,
-      allHtmls
+    const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+    const docLinks = links.filter(
+      (link) => link.endsWith(".doc") || link.endsWith(".docx")
     );
-    documents = await this.getSitemapData(this.urls[0], documents);
+
+    const [pdfDocuments, docxDocuments] = await Promise.all([
+      this.fetchPdfDocuments(pdfLinks),
+      this.fetchDocxDocuments(docLinks),
+    ]);
+
+    links = links.filter(
+      (link) => !pdfLinks.includes(link) && !docLinks.includes(link)
+    );
+
+    let [documents, sitemapData] = await Promise.all([
+      this.convertUrlsToDocuments(links, inProgress, allHtmls),
+      this.mode === "single_urls" && links.length > 0
+        ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
+            (error) => {
+              console.error("Failed to fetch sitemap data:", error);
+              return null;
+            }
+          )
+        : Promise.resolve(null),
+    ]);
+
+    if (this.mode === "single_urls" && documents.length > 0) {
+      documents[0].metadata.sitemap = sitemapData;
+    } else {
+      documents = await this.getSitemapData(this.urls[0], documents);
+    }
+
     documents = this.applyPathReplacements(documents);
     // documents = await this.applyImgAltText(documents);
-
     if (
-      (this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
+      (this.extractorOptions.mode === "llm-extraction" ||
+        this.extractorOptions.mode === "llm-extraction-from-markdown") &&
       this.mode === "single_urls"
     ) {
-      documents = await generateCompletions(documents, this.extractorOptions, "markdown");
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions,
+        "markdown"
+      );
     }
     if (
-      (this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
+      this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
      this.mode === "single_urls"
    ) {
-      documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions,
+        "raw-html"
+      );
    }
    return documents.concat(pdfDocuments).concat(docxDocuments);
  }
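processLinks now issues the PDF fetch, the DOCX fetch, and (for single URLs) the sitemap lookup concurrently instead of back to back, and the sitemap branch converts failure into null so one slow or missing sitemap.xml cannot reject the whole Promise.all. A simplified sketch of the pattern, using plain fetch in place of the scraper's converters:

    // Sketch: run independent fetches concurrently; give the optional
    // one a fallback so its failure degrades instead of propagating.
    async function fetchAll(urls: string[]): Promise<[string[], string | null]> {
      const page = (u: string) => fetch(u).then((r) => r.text());

      const [pages, sitemap] = await Promise.all([
        Promise.all(urls.map(page)),
        urls.length > 0
          ? page(new URL("/sitemap.xml", urls[0]).href).catch((err) => {
              console.error("Failed to fetch sitemap data:", err);
              return null; // optional data: never fail the batch for it
            })
          : Promise.resolve(null),
      ]);
      return [pages, sitemap];
    }

The 1500 ms budget passed to getSitemapDataForSingleUrl keeps the slowest member of the Promise.all bounded, since the combined await only resolves once every branch does.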
@@ -289,7 +312,10 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
+        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
+          pdfLink,
+          this.pageOptions.parsePDF
+        );
         return {
           content: content,
           metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
@@ -301,7 +327,8 @@ export class WebScraperDataProvider {
   private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
     return Promise.all(
       docxLinks.map(async (p) => {
-        const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
+        const { content, pageStatusCode, pageError } =
+          await fetchAndProcessDocx(p);
         return {
           content,
           metadata: { sourceURL: p, pageStatusCode, pageError },
@@ -489,16 +516,21 @@ export class WebScraperDataProvider {
       includeHtml: false,
       replaceAllPathsWithAbsolutePaths: false,
       parsePDF: true,
-      removeTags: []
+      removeTags: [],
     };
-    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
-    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
+    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
+    this.replaceAllPathsWithAbsolutePaths =
+      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
+      options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
+      false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
-    this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
-    this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;
+    this.allowBackwardCrawling =
+      options.crawlerOptions?.allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks =
+      options.crawlerOptions?.allowExternalContentLinks ?? false;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
@@ -537,6 +569,34 @@ export class WebScraperDataProvider {
     }
     return documents;
   }
+
+  private async getSitemapDataForSingleUrl(
+    baseUrl: string,
+    url: string,
+    timeout?: number
+  ) {
+    const sitemapData = await fetchSitemapData(baseUrl, timeout);
+    if (sitemapData) {
+      const docInSitemapData = sitemapData.find(
+        (data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
+      );
+      if (docInSitemapData) {
+        let sitemapDocData: Partial<SitemapEntry> = {};
+        if (docInSitemapData.changefreq) {
+          sitemapDocData.changefreq = docInSitemapData.changefreq;
+        }
+        if (docInSitemapData.priority) {
+          sitemapDocData.priority = Number(docInSitemapData.priority);
+        }
+        if (docInSitemapData.lastmod) {
+          sitemapDocData.lastmod = docInSitemapData.lastmod;
+        }
+        if (Object.keys(sitemapDocData).length !== 0) {
+          return sitemapDocData;
+        }
+      }
+    }
+    return null;
+  }
 
   generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
     await Promise.all(
       documents.map(async (document) => {
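getSitemapDataForSingleUrl copies only the fields that are actually present in the matching sitemap entry and returns null when nothing useful matches, so metadata.sitemap is either meaningful or absent. A standalone sketch of the lookup, with an assumed SitemapEntry shape (the real type lives in sitemap.ts) and a simplified normalizer:

    // Sketch: match one URL against sitemap entries after normalizing
    // scheme and trailing slashes, and keep only the populated fields.
    interface SitemapEntry {
      loc: string;
      lastmod?: string;
      changefreq?: string;
      priority?: string; // XML attribute values arrive as strings
    }

    const normalizeUrl = (url: string) =>
      url.replace(/^https?:\/\//, "").replace(/\/+$/, "");

    function findSitemapEntry(
      entries: SitemapEntry[],
      url: string
    ): { lastmod?: string; changefreq?: string; priority?: number } | null {
      const match = entries.find(
        (e) => normalizeUrl(e.loc) === normalizeUrl(url)
      );
      if (!match) return null;

      const data: { lastmod?: string; changefreq?: string; priority?: number } = {};
      if (match.changefreq) data.changefreq = match.changefreq;
      if (match.priority) data.priority = Number(match.priority);
      if (match.lastmod) data.lastmod = match.lastmod;
      return Object.keys(data).length !== 0 ? data : null;
    }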
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index 460aeca6..1dfbf3a1 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -54,10 +54,10 @@ export async function getLinksFromSitemap(
   return allUrls;
 }
 
-export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
+export const fetchSitemapData = async (url: string, timeout?: number): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
+    const response = await axios.get(sitemapUrl, { timeout: timeout || axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);
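Threading the optional timeout through fetchSitemapData lets the single-URL path cap the sitemap fetch at 1500 ms while every other caller keeps the module default. One subtlety of the `timeout || axiosTimeout` fallback: `||` also treats an explicit 0 as unset, whereas `??` would only fall back on null or undefined. A small sketch of the pattern (axiosTimeout here stands in for the module's existing default):

    import axios from "axios";

    const axiosTimeout = 10000; // illustrative default

    // Optional per-call budget with a module-wide fallback.
    async function fetchWithBudget(url: string, timeout?: number) {
      return axios.get(url, { timeout: timeout || axiosTimeout });
    }

    // A latency-sensitive caller passes a tight budget:
    // await fetchWithBudget("https://example.com/sitemap.xml", 1500);

Since axios itself treats a zero timeout as "no timeout", `||` is harmless here, but `??` would state the intent more precisely.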
diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts
index 6f06fa17..82668111 100644
--- a/apps/api/src/services/billing/credit_billing.ts
+++ b/apps/api/src/services/billing/credit_billing.ts
@@ -19,20 +19,20 @@ export async function supaBillTeam(team_id: string, credits: number) {
   // credits_used: The number of credits consumed by the API call.
   // created_at: The timestamp of the API usage.
 
-  // 1. get the subscription
-  const { data: subscription } = await supabase_service
-    .from("subscriptions")
-    .select("*")
-    .eq("team_id", team_id)
-    .eq("status", "active")
-    .single();
-
-  // 2. Check for available coupons
-  const { data: coupons } = await supabase_service
-    .from("coupons")
-    .select("id, credits")
-    .eq("team_id", team_id)
-    .eq("status", "active");
+  // 1. get the subscription and check for available coupons concurrently
+  const [{ data: subscription }, { data: coupons }] = await Promise.all([
+    supabase_service
+      .from("subscriptions")
+      .select("*")
+      .eq("team_id", team_id)
+      .eq("status", "active")
+      .single(),
+    supabase_service
+      .from("coupons")
+      .select("id, credits")
+      .eq("team_id", team_id)
+      .eq("status", "active"),
+  ]);
 
   let couponCredits = 0;
   if (coupons && coupons.length > 0) {
@@ -169,21 +169,21 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     return { success: true, message: "Preview team, no credits used" };
   }
 
-  // Retrieve the team's active subscription
-  const { data: subscription, error: subscriptionError } =
-    await supabase_service
-      .from("subscriptions")
-      .select("id, price_id, current_period_start, current_period_end")
-      .eq("team_id", team_id)
-      .eq("status", "active")
-      .single();
-
-  // Check for available coupons
-  const { data: coupons } = await supabase_service
-    .from("coupons")
-    .select("credits")
-    .eq("team_id", team_id)
-    .eq("status", "active");
+  // Retrieve the team's active subscription and check for available coupons concurrently
+  const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
+    await Promise.all([
+      supabase_service
+        .from("subscriptions")
+        .select("id, price_id, current_period_start, current_period_end")
+        .eq("team_id", team_id)
+        .eq("status", "active")
+        .single(),
+      supabase_service
+        .from("coupons")
+        .select("credits")
+        .eq("team_id", team_id)
+        .eq("status", "active"),
+    ]);
 
   let couponCredits = 0;
   if (coupons && coupons.length > 0) {
@@ -238,7 +238,6 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
   // 5. Compare the total credits used with the credits allowed by the plan.
   if (totalCreditsUsed + credits > FREE_CREDITS) {
     // Send email notification for insufficient credits
-
     await sendNotification(
       team_id,
       NotificationType.LIMIT_REACHED,
@@ -275,7 +274,6 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
 
   // Adjust total credits used by subtracting coupon value
   const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
-
   // Get the price details
   const { data: price, error: priceError } = await supabase_service
     .from("prices")
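Both billing helpers now issue their two independent Supabase reads with Promise.all instead of sequentially, which roughly halves the round-trips on the billing path. This works because supabase-js query builders are thenables: they only execute when awaited, so handing both to Promise.all starts them at once. A standalone sketch of the pattern (client setup and env variable names are illustrative; table and column names mirror the diff):

    import { createClient } from "@supabase/supabase-js";

    const supabase = createClient(
      process.env.SUPABASE_URL ?? "",
      process.env.SUPABASE_SERVICE_TOKEN ?? ""
    );

    // Two independent reads, started together and awaited together.
    async function getBillingContext(team_id: string) {
      const [{ data: subscription }, { data: coupons }] = await Promise.all([
        supabase
          .from("subscriptions")
          .select("*")
          .eq("team_id", team_id)
          .eq("status", "active")
          .single(),
        supabase
          .from("coupons")
          .select("id, credits")
          .eq("team_id", team_id)
          .eq("status", "active"),
      ]);
      return { subscription, coupons };
    }

The trade-off is error coupling: with Promise.all a rejection in either query rejects both, where Promise.allSettled would isolate them at the cost of unwrapping results. Supabase queries normally resolve with an error field rather than rejecting, so Promise.all is a reasonable fit here.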