diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts
index be99e8c1..8cb09cf0 100644
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@@ -1,19 +1,27 @@
-import { ExtractorOptions, PageOptions } from './../../lib/entities';
+import { ExtractorOptions, PageOptions } from "./../../lib/entities";
 import { Request, Response } from "express";
-import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
+import {
+  billTeam,
+  checkTeamCredits,
+} from "../../services/billing/credit_billing";
 import { authenticateUser } from "../auth";
 import { PlanType, RateLimiterMode } from "../../types";
 import { logJob } from "../../services/logging/log_job";
 import { Document } from "../../lib/entities";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
-import { numTokensFromString } from '../../lib/LLM-extraction/helpers';
-import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../../lib/default-values';
-import { addScrapeJob, waitForJob } from '../../services/queue-jobs';
-import { getScrapeQueue } from '../../services/queue-service';
+import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
+import {
+  defaultPageOptions,
+  defaultExtractorOptions,
+  defaultTimeout,
+  defaultOrigin,
+} from "../../lib/default-values";
+import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
+import { getScrapeQueue } from "../../services/queue-service";
 import { v4 as uuidv4 } from "uuid";
-import { Logger } from '../../lib/logger';
+import { Logger } from "../../lib/logger";
 import * as Sentry from "@sentry/node";
-import { getJobPriority } from '../../lib/job-priority';
+import { getJobPriority } from "../../lib/job-priority";
 
 export async function scrapeHelper(
   jobId: string,
@@ -36,47 +44,71 @@ export async function scrapeHelper(
   }
 
   if (isUrlBlocked(url)) {
-    return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
+    return {
+      success: false,
+      error:
+        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
+      returnCode: 403,
+    };
   }
 
-  const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
+  const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
 
-  const job = await addScrapeJob({
-    url,
-    mode: "single_urls",
-    crawlerOptions,
-    team_id,
-    pageOptions,
-    extractorOptions,
-    origin: req.body.origin ?? defaultOrigin,
-  }, {}, jobId, jobPriority);
+  const job = await addScrapeJob(
+    {
+      url,
+      mode: "single_urls",
+      crawlerOptions,
+      team_id,
+      pageOptions,
+      extractorOptions,
+      origin: req.body.origin ?? defaultOrigin,
+    },
+    {},
+    jobId,
+    jobPriority
+  );
 
   let doc;
 
-  const err = await Sentry.startSpan({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => {
-    try {
-      doc = (await waitForJob(job.id, timeout))[0];
-    } catch (e) {
-      if (e instanceof Error && e.message.startsWith("Job wait")) {
-        span.setAttribute("timedOut", true);
-        return {
-          success: false,
-          error: "Request timed out",
-          returnCode: 408,
+  const err = await Sentry.startSpan(
+    {
+      name: "Wait for job to finish",
+      op: "bullmq.wait",
+      attributes: { job: jobId },
+    },
+    async (span) => {
+      try {
+        doc = (await waitForJob(job.id, timeout))[0];
+      } catch (e) {
+        if (e instanceof Error && e.message.startsWith("Job wait")) {
+          span.setAttribute("timedOut", true);
+          return {
+            success: false,
+            error: "Request timed out",
+            returnCode: 408,
+          };
+        } else if (
+          typeof e === "string" &&
+          (e.includes("Error generating completions: ") ||
+            e.includes("Invalid schema for function") ||
+            e.includes(
+              "LLM extraction did not match the extraction schema you provided."
+            ))
+        ) {
+          return {
+            success: false,
+            error: e,
+            returnCode: 500,
+          };
+        } else {
+          throw e;
         }
-      } else if (typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function") || e.includes("LLM extraction did not match the extraction schema you provided."))) {
-        return {
-          success: false,
-          error: e,
-          returnCode: 500,
-        };
-      } else {
-        throw e;
       }
+      span.setAttribute("result", JSON.stringify(doc));
+      return null;
     }
-    span.setAttribute("result", JSON.stringify(doc));
-    return null;
-  });
+  );
 
   if (err !== null) {
     return err;
@@ -86,14 +118,22 @@ export async function scrapeHelper(
 
   if (!doc) {
     console.error("!!! PANIC DOC IS", doc, job);
-    return { success: true, error: "No page found", returnCode: 200, data: doc };
+    return {
+      success: true,
+      error: "No page found",
+      returnCode: 200,
+      data: doc,
+    };
   }
 
   delete doc.index;
   delete doc.provider;
 
   // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
-  if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
+  if (
+    !pageOptions.includeRawHtml &&
+    extractorOptions.mode == "llm-extraction-from-raw-html"
+  ) {
     if (doc.rawHtml) {
       delete doc.rawHtml;
     }
@@ -127,13 +167,24 @@ export async function scrapeController(req: Request, res: Response) {
 
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
-    const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
+    const extractorOptions = {
+      ...defaultExtractorOptions,
+      ...req.body.extractorOptions,
+    };
     const origin = req.body.origin ?? defaultOrigin;
     let timeout = req.body.timeout ?? defaultTimeout;
 
     if (extractorOptions.mode.includes("llm-extraction")) {
-      if (typeof extractorOptions.extractionSchema !== "object" || extractorOptions.extractionSchema === null) {
-        return res.status(400).json({ error: "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified" });
+      if (
+        typeof extractorOptions.extractionSchema !== "object" ||
+        extractorOptions.extractionSchema === null
+      ) {
+        return res
+          .status(400)
+          .json({
+            error:
+              "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
+          });
       }
 
       pageOptions.onlyMainContent = true;
@@ -142,7 +193,8 @@ export async function scrapeController(req: Request, res: Response) {
 
     // checkCredits
    try {
-      const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
+      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
+        await checkTeamCredits(team_id, 1);
      if (!creditsCheckSuccess) {
        earlyReturn = true;
        return res.status(402).json({ error: "Insufficient credits" });
@@ -150,7 +202,12 @@ export async function scrapeController(req: Request, res: Response) {
    } catch (error) {
      Logger.error(error);
      earlyReturn = true;
-      return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
+      return res
+        .status(500)
+        .json({
+          error:
+            "Error checking team credits. Please contact hello@firecrawl.com for help.",
+        });
    }
 
    const jobId = uuidv4();
@@ -168,7 +225,10 @@ export async function scrapeController(req: Request, res: Response) {
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
-    const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
+    const numTokens =
+      result.data && result.data.markdown
+        ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
+        : 0;
 
    if (result.success) {
      let creditsToBeBilled = 0; // billing for doc done on queue end
@@ -185,14 +245,12 @@ export async function scrapeController(req: Request, res: Response) {
        // Don't bill if we're early returning
        return;
      }
-      const billingResult = await billTeam(
-        team_id,
-        creditsToBeBilled
-      );
+      const billingResult = await billTeam(team_id, creditsToBeBilled);
      if (!billingResult.success) {
        return res.status(402).json({
          success: false,
-          error: "Failed to bill team. Insufficient credits or subscription not found.",
+          error:
+            "Failed to bill team. Insufficient credits or subscription not found.",
        });
      }
    }
@@ -209,17 +267,22 @@ export async function scrapeController(req: Request, res: Response) {
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
-      origin: origin, 
+      origin: origin,
      extractor_options: extractorOptions,
      num_tokens: numTokens,
    });
-
-    return res.status(result.returnCode).json(result);
  } catch (error) {
    Sentry.captureException(error);
    Logger.error(error);
-    return res.status(500).json({ error: typeof error === "string" ? error : (error?.message ?? "Internal Server Error") });
+    return res
+      .status(500)
+      .json({
+        error:
+          typeof error === "string"
+            ? error
+            : error?.message ?? "Internal Server Error",
"Internal Server Error", + }); } } diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 69e47a0b..f4c4586f 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -21,6 +21,7 @@ import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; import { addScrapeJob } from "../../services/queue-jobs"; import { Logger } from "../../lib/logger"; +import { getJobPriority } from "../../lib/job-priority"; export async function crawlController( req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, @@ -66,6 +67,7 @@ export async function crawlController( pageOptions, team_id: req.auth.team_id, createdAt: Date.now(), + plan: req.auth.plan, }; const crawler = crawlToCrawler(id, sc); @@ -86,7 +88,14 @@ export async function crawlController( ? null : await crawler.tryGetSitemap(); - if (sitemap !== null) { + if (sitemap !== null && sitemap.length > 0) { + let jobPriority = 20; + // If it is over 1000, we need to get the job priority, + // otherwise we can use the default priority of 20 + if(sitemap.length > 1000){ + // set base to 21 + jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21}) + } const jobs = sitemap.map((x) => { const url = x.url; const uuid = uuidv4(); diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 505628d8..19dc4165 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -39,6 +39,7 @@ export async function mapController( pageOptions: {}, team_id: req.auth.team_id, createdAt: Date.now(), + plan: req.auth.plan, }; const crawler = crawlToCrawler(id, sc); diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index cbd7fe2b..940296bf 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -6,6 +6,8 @@ import { v4 as uuidv4 } from 'uuid'; import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { logJob } from "../../services/logging/log_job"; +import { getJobPriority } from "../../lib/job-priority"; +import { PlanType } from "../../types"; export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response) { req.body = scrapeRequestSchema.parse(req.body); @@ -17,6 +19,8 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, const jobId = uuidv4(); const startTime = new Date().getTime(); + const jobPriority = await getJobPriority({plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10}) + const job = await addScrapeJob({ url: req.body.url, mode: "single_urls", @@ -25,7 +29,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, pageOptions, extractorOptions: {}, origin: req.body.origin, - }, {}, jobId); + }, {}, jobId, jobPriority); let doc: any | undefined; try { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index bcfca1fb..3697bd2a 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -3,6 +3,7 @@ import { z } from "zod"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { PageOptions } from "../../lib/entities"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; +import { PlanType } from "../../types"; 
 
 export type Format =
   | "markdown"
@@ -229,7 +230,7 @@ export type CrawlStatusResponse =
 
 type AuthObject = {
   team_id: string;
-  plan: string;
+  plan: PlanType;
 };
 
 type Account = {