From e31a5007d5f6230ce6e33fdb2232c3a0ae855ea2 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 22 Jul 2024 18:30:58 -0400
Subject: [PATCH] Nick: speed improvements

---
 apps/api/src/controllers/scrape.ts          |  86 ++++++++-----
 apps/api/src/scraper/WebScraper/index.ts    | 114 +++++++++++++-----
 apps/api/src/scraper/WebScraper/sitemap.ts  |   4 +-
 .../src/services/billing/credit_billing.ts  |  60 +++++----
 4 files changed, 172 insertions(+), 92 deletions(-)

diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts
index f076425f..4870925f 100644
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@@ -73,28 +73,6 @@ export async function scrapeHelper(
     });
   }
 
-  let creditsToBeBilled = filteredDocs.length;
-  const creditsPerLLMExtract = 50;
-
-
-
-  if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
-    creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
-  }
-
-  const billingResult = await billTeam(
-    team_id,
-    creditsToBeBilled
-  );
-  if (!billingResult.success) {
-    return {
-      success: false,
-      error:
-        "Failed to bill team. Insufficient credits or subscription not found.",
-      returnCode: 402,
-    };
-  }
-
   return {
     success: true,
     data: filteredDocs[0],
@@ -104,6 +82,7 @@ export async function scrapeHelper(
 
 export async function scrapeController(req: Request, res: Response) {
   try {
+    let earlyReturn = false;
     // make sure to authenticate user first, Bearer <token>
     const { success, team_id, error, status, plan } = await authenticateUser(
       req,
@@ -113,28 +92,41 @@ export async function scrapeController(req: Request, res: Response) {
     if (!success) {
       return res.status(status).json({ error });
     }
+
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
    const origin = req.body.origin ?? defaultOrigin;
    let timeout = req.body.timeout ?? defaultTimeout;
-    if (extractorOptions.mode === "llm-extraction") {
+    if (extractorOptions.mode.includes("llm-extraction")) {
      pageOptions.onlyMainContent = true;
      timeout = req.body.timeout ?? 90000;
    }
-
-    try {
-      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
-        await checkTeamCredits(team_id, 1);
-      if (!creditsCheckSuccess) {
-        return res.status(402).json({ error: "Insufficient credits" });
+    const checkCredits = async () => {
+      try {
+        const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
+        if (!creditsCheckSuccess) {
+          earlyReturn = true;
+          return res.status(402).json({ error: "Insufficient credits" });
+        }
+      } catch (error) {
+        console.error(error);
+        earlyReturn = true;
+        return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
      }
-    } catch (error) {
-      console.error(error);
-      return res.status(500).json({ error: "Internal server error" });
+    };
+
+
+    // Async check saves 500ms in average case
+    // Don't async check in llm extraction mode as it could be expensive
+    if (extractorOptions.mode.includes("llm-extraction")) {
+      await checkCredits();
+    } else {
+      checkCredits();
    }
+
    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      req,
@@ -149,6 +141,33 @@ export async function scrapeController(req: Request, res: Response) {
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
 
+    if (result.success) {
+      let creditsToBeBilled = 1; // Assuming 1 credit per document
+      const creditsPerLLMExtract = 50;
+
+      if (extractorOptions.mode.includes("llm-extraction")) {
+        creditsToBeBilled += creditsPerLLMExtract;
+      }
+
+      let startTimeBilling = new Date().getTime();
+
+      if (earlyReturn) {
+        // Don't bill if we're early returning
+        return;
+      }
+      const billingResult = await billTeam(
+        team_id,
+        creditsToBeBilled
+      );
+      if (!billingResult.success) {
+        return res.status(402).json({
+          success: false,
+          error: "Failed to bill team. Insufficient credits or subscription not found.",
+        });
+      }
+      console.log("Billed team in", new Date().getTime() - startTimeBilling, "ms");
+    }
+
    logJob({
      success: result.success,
      message: result.error,
@@ -164,6 +183,9 @@ export async function scrapeController(req: Request, res: Response) {
      extractor_options: extractorOptions,
      num_tokens: numTokens,
    });
+
+
+
    return res.status(result.returnCode).json(result);
  } catch (error) {
    console.error(error);
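The controller change above defers billing until after the scrape and overlaps the credit check with the request itself, which is where most of the saved latency comes from: expensive LLM-extraction requests still gate on the check up front, cheap ones let it run concurrently. A minimal sketch of the same fire-and-forget pattern, with hypothetical hasCredits/doScrape stand-ins rather than Firecrawl's actual helpers:

    // Sketch: overlap a cheap precondition check with the main work.
    // `hasCredits` and `doScrape` are illustrative stand-ins.
    async function handle(expensive: boolean): Promise<string> {
      let earlyReturn = false;

      const checkCredits = async () => {
        const ok = await hasCredits(); // e.g. a DB round-trip
        if (!ok) earlyReturn = true;   // flag instead of throwing
      };

      if (expensive) {
        await checkCredits();          // expensive work: gate up front
        if (earlyReturn) return "402 Insufficient credits";
      } else {
        checkCredits().catch(console.error); // cheap work: run concurrently
      }

      const result = await doScrape();
      if (earlyReturn) return "402 Insufficient credits"; // re-check before billing
      return result;
    }

    async function hasCredits(): Promise<boolean> { return true; }
    async function doScrape(): Promise<string> { return "scraped"; }

One caveat worth noting: in the unawaited branch, a failing check writes its own 402 while the main path may still write a response of its own. The earlyReturn flag prevents billing and skips the final response on the success path, but a check that completes mid-flight can still race the error path's write, and Node will then report a headers-already-sent error.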
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 87b5aee7..66bb15de 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -164,7 +164,6 @@ export class WebScraperDataProvider {
   private async handleCrawlMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-
     const crawler = new WebCrawler({
       initialUrl: this.urls[0],
       includes: this.includes,
@@ -225,7 +224,6 @@ export class WebScraperDataProvider {
       return this.returnOnlyUrlsResponse(links, inProgress);
     }
 
-
     let documents = await this.processLinks(links, inProgress);
     return this.cacheAndFinalizeDocuments(documents, links);
   }
@@ -253,35 +251,60 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void,
     allHtmls?: string[]
   ): Promise<Document[]> {
-    const pdfLinks = links.filter(link => link.endsWith(".pdf"));
-    const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
-
-    const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-    const docxDocuments = await this.fetchDocxDocuments(docLinks);
-
-    links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
-
-    let documents = await this.convertUrlsToDocuments(
-      links,
-      inProgress,
-      allHtmls
+    const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
+    const docLinks = links.filter(
+      (link) => link.endsWith(".doc") || link.endsWith(".docx")
     );
-    documents = await this.getSitemapData(this.urls[0], documents);
+
+    const [pdfDocuments, docxDocuments] = await Promise.all([
+      this.fetchPdfDocuments(pdfLinks),
+      this.fetchDocxDocuments(docLinks),
+    ]);
+
+    links = links.filter(
+      (link) => !pdfLinks.includes(link) && !docLinks.includes(link)
+    );
+
+    let [documents, sitemapData] = await Promise.all([
+      this.convertUrlsToDocuments(links, inProgress, allHtmls),
+      this.mode === "single_urls" && links.length > 0
+        ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
+            (error) => {
+              console.error("Failed to fetch sitemap data:", error);
+              return null;
+            }
+          )
+        : Promise.resolve(null),
+    ]);
+
+    if (this.mode === "single_urls" && documents.length > 0) {
+      documents[0].metadata.sitemap = sitemapData;
+    } else {
+      documents = await this.getSitemapData(this.urls[0], documents);
+    }
+
     documents = this.applyPathReplacements(documents);
     // documents = await this.applyImgAltText(documents);
-
     if (
-      (this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
+      (this.extractorOptions.mode === "llm-extraction" ||
+        this.extractorOptions.mode === "llm-extraction-from-markdown") &&
       this.mode === "single_urls"
     ) {
-      documents = await generateCompletions(documents, this.extractorOptions, "markdown");
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions,
+        "markdown"
+      );
     }
     if (
-      (this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
+      this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
      this.mode === "single_urls"
    ) {
-      documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
+      documents = await generateCompletions(
+        documents,
+        this.extractorOptions,
+        "raw-html"
+      );
    }
    return documents.concat(pdfDocuments).concat(docxDocuments);
  }
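processLinks now issues the PDF fetch, the DOCX fetch, and (for single URLs) the sitemap lookup concurrently instead of back to back, and the sitemap branch converts failure into null so one slow or missing sitemap.xml cannot reject the whole Promise.all. A simplified sketch of the pattern, using plain fetch in place of the scraper's converters:

    // Sketch: run independent fetches concurrently; give the optional
    // one a fallback so its failure degrades instead of propagating.
    async function fetchAll(urls: string[]): Promise<[string[], string | null]> {
      const page = (u: string) => fetch(u).then((r) => r.text());

      const [pages, sitemap] = await Promise.all([
        Promise.all(urls.map(page)),
        urls.length > 0
          ? page(new URL("/sitemap.xml", urls[0]).href).catch((err) => {
              console.error("Failed to fetch sitemap data:", err);
              return null; // optional data: never fail the batch for it
            })
          : Promise.resolve(null),
      ]);
      return [pages, sitemap];
    }

The 1500 ms budget passed to getSitemapDataForSingleUrl keeps the slowest member of the Promise.all bounded, since the combined await only resolves once every branch does.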
@@ -289,7 +312,10 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
-        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
+        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
+          pdfLink,
+          this.pageOptions.parsePDF
+        );
         return {
           content: content,
           metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
@@ -301,7 +327,8 @@ export class WebScraperDataProvider {
   private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
     return Promise.all(
       docxLinks.map(async (p) => {
-        const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
+        const { content, pageStatusCode, pageError } =
+          await fetchAndProcessDocx(p);
         return {
           content,
           metadata: { sourceURL: p, pageStatusCode, pageError },
@@ -489,16 +516,21 @@ export class WebScraperDataProvider {
       includeHtml: false,
       replaceAllPathsWithAbsolutePaths: false,
       parsePDF: true,
-      removeTags: []
+      removeTags: [],
     };
-    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
-    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
+    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
+    this.replaceAllPathsWithAbsolutePaths =
+      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
+      options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
+      false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
-    this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
-    this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;
+    this.allowBackwardCrawling =
+      options.crawlerOptions?.allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks =
+      options.crawlerOptions?.allowExternalContentLinks ?? false;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
@@ -537,6 +569,34 @@ export class WebScraperDataProvider {
     }
     return documents;
   }
+
+  private async getSitemapDataForSingleUrl(
+    baseUrl: string,
+    url: string,
+    timeout?: number
+  ) {
+    const sitemapData = await fetchSitemapData(baseUrl, timeout);
+    if (sitemapData) {
+      const docInSitemapData = sitemapData.find(
+        (data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
+      );
+      if (docInSitemapData) {
+        let sitemapDocData: Partial<SitemapEntry> = {};
+        if (docInSitemapData.changefreq) {
+          sitemapDocData.changefreq = docInSitemapData.changefreq;
+        }
+        if (docInSitemapData.priority) {
+          sitemapDocData.priority = Number(docInSitemapData.priority);
+        }
+        if (docInSitemapData.lastmod) {
+          sitemapDocData.lastmod = docInSitemapData.lastmod;
+        }
+        if (Object.keys(sitemapDocData).length !== 0) {
+          return sitemapDocData;
+        }
+      }
+    }
+    return null;
+  }
 
   generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
     await Promise.all(
       documents.map(async (document) => {
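getSitemapDataForSingleUrl copies only the fields that are actually present in the matching sitemap entry and returns null when nothing useful matches, so metadata.sitemap is either meaningful or absent. A standalone sketch of the lookup, with an assumed SitemapEntry shape (the real type lives in sitemap.ts) and a simplified normalizer:

    // Sketch: match one URL against sitemap entries after normalizing
    // scheme and trailing slashes, and keep only the populated fields.
    interface SitemapEntry {
      loc: string;
      lastmod?: string;
      changefreq?: string;
      priority?: string; // XML attribute values arrive as strings
    }

    const normalizeUrl = (url: string) =>
      url.replace(/^https?:\/\//, "").replace(/\/+$/, "");

    function findSitemapEntry(
      entries: SitemapEntry[],
      url: string
    ): { lastmod?: string; changefreq?: string; priority?: number } | null {
      const match = entries.find(
        (e) => normalizeUrl(e.loc) === normalizeUrl(url)
      );
      if (!match) return null;

      const data: { lastmod?: string; changefreq?: string; priority?: number } = {};
      if (match.changefreq) data.changefreq = match.changefreq;
      if (match.priority) data.priority = Number(match.priority);
      if (match.lastmod) data.lastmod = match.lastmod;
      return Object.keys(data).length !== 0 ? data : null;
    }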
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index 460aeca6..1dfbf3a1 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -54,10 +54,10 @@ export async function getLinksFromSitemap(
   return allUrls;
 }
 
-export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
+export const fetchSitemapData = async (url: string, timeout?: number): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
+    const response = await axios.get(sitemapUrl, { timeout: timeout || axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);
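Threading the optional timeout through fetchSitemapData lets the single-URL path cap the sitemap fetch at 1500 ms while every other caller keeps the module default. One subtlety of the `timeout || axiosTimeout` fallback: `||` also treats an explicit 0 as unset, whereas `??` would only fall back on null or undefined. A small sketch of the pattern (axiosTimeout here stands in for the module's existing default):

    import axios from "axios";

    const axiosTimeout = 10000; // illustrative default

    // Optional per-call budget with a module-wide fallback.
    async function fetchWithBudget(url: string, timeout?: number) {
      return axios.get(url, { timeout: timeout || axiosTimeout });
    }

    // A latency-sensitive caller passes a tight budget:
    // await fetchWithBudget("https://example.com/sitemap.xml", 1500);

Since axios itself treats a zero timeout as "no timeout", `||` is harmless here, but `??` would state the intent more precisely.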
diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts
index 6f06fa17..82668111 100644
--- a/apps/api/src/services/billing/credit_billing.ts
+++ b/apps/api/src/services/billing/credit_billing.ts
@@ -19,20 +19,20 @@ export async function supaBillTeam(team_id: string, credits: number) {
   // credits_used: The number of credits consumed by the API call.
   // created_at: The timestamp of the API usage.
 
-  // 1. get the subscription
-  const { data: subscription } = await supabase_service
-    .from("subscriptions")
-    .select("*")
-    .eq("team_id", team_id)
-    .eq("status", "active")
-    .single();
-
-  // 2. Check for available coupons
-  const { data: coupons } = await supabase_service
-    .from("coupons")
-    .select("id, credits")
-    .eq("team_id", team_id)
-    .eq("status", "active");
+  // 1. get the subscription and check for available coupons concurrently
+  const [{ data: subscription }, { data: coupons }] = await Promise.all([
+    supabase_service
+      .from("subscriptions")
+      .select("*")
+      .eq("team_id", team_id)
+      .eq("status", "active")
+      .single(),
+    supabase_service
+      .from("coupons")
+      .select("id, credits")
+      .eq("team_id", team_id)
+      .eq("status", "active"),
+  ]);
 
   let couponCredits = 0;
   if (coupons && coupons.length > 0) {
@@ -169,21 +169,21 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     return { success: true, message: "Preview team, no credits used" };
   }
 
-  // Retrieve the team's active subscription
-  const { data: subscription, error: subscriptionError } =
-    await supabase_service
-      .from("subscriptions")
-      .select("id, price_id, current_period_start, current_period_end")
-      .eq("team_id", team_id)
-      .eq("status", "active")
-      .single();
-
-  // Check for available coupons
-  const { data: coupons } = await supabase_service
-    .from("coupons")
-    .select("credits")
-    .eq("team_id", team_id)
-    .eq("status", "active");
+  // Retrieve the team's active subscription and check for available coupons concurrently
+  const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
+    await Promise.all([
+      supabase_service
+        .from("subscriptions")
+        .select("id, price_id, current_period_start, current_period_end")
+        .eq("team_id", team_id)
+        .eq("status", "active")
+        .single(),
+      supabase_service
+        .from("coupons")
+        .select("credits")
+        .eq("team_id", team_id)
+        .eq("status", "active"),
+    ]);
 
   let couponCredits = 0;
   if (coupons && coupons.length > 0) {
@@ -238,7 +238,6 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
   // 5. Compare the total credits used with the credits allowed by the plan.
   if (totalCreditsUsed + credits > FREE_CREDITS) {
     // Send email notification for insufficient credits
-
     await sendNotification(
       team_id,
       NotificationType.LIMIT_REACHED,
@@ -275,7 +274,6 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
 
   // Adjust total credits used by subtracting coupon value
   const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
-
   // Get the price details
   const { data: price, error: priceError } = await supabase_service
     .from("prices")
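Both billing helpers now issue their two independent Supabase reads with Promise.all instead of sequentially, which roughly halves the round-trips on the billing path. This works because supabase-js query builders are thenables: they only execute when awaited, so handing both to Promise.all starts them at once. A standalone sketch of the pattern (client setup and env variable names are illustrative; table and column names mirror the diff):

    import { createClient } from "@supabase/supabase-js";

    const supabase = createClient(
      process.env.SUPABASE_URL ?? "",
      process.env.SUPABASE_SERVICE_TOKEN ?? ""
    );

    // Two independent reads, started together and awaited together.
    async function getBillingContext(team_id: string) {
      const [{ data: subscription }, { data: coupons }] = await Promise.all([
        supabase
          .from("subscriptions")
          .select("*")
          .eq("team_id", team_id)
          .eq("status", "active")
          .single(),
        supabase
          .from("coupons")
          .select("id, credits")
          .eq("team_id", team_id)
          .eq("status", "active"),
      ]);
      return { subscription, coupons };
    }

The trade-off is error coupling: with Promise.all a rejection in either query rejects both, where Promise.allSettled would isolate them at the cost of unwrapping results. Supabase queries normally resolve with an error field rather than rejecting, so Promise.all is a reasonable fit here.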