Merge pull request #653 from mendableai/mog/fix-status-job-get

fix(v0/crawl-status): don't crash on big crawls when requesting jobs from supa
Nicolas 2024-09-12 11:39:42 -04:00 committed by GitHub
commit ee38273ff9
3 changed files with 44 additions and 9 deletions
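
The root cause: on large crawls, the v0 status endpoints looked up stored jobs by sending every job ID to Supabase in a single get_jobs_by_ids RPC call, and that request grows with the crawl. The fix swaps in one equality filter on the crawl ID. A minimal sketch of the signature change, with names taken from the diff:

    // Old helper: RPC-backed, takes the full ID list; request size tracks crawl size.
    declare function supabaseGetJobsById(jobIds: string[]): Promise<any[]>;
    // New helper: one crawl-scoped filter; request size is constant.
    declare function supabaseGetJobsByCrawlId(crawlId: string): Promise<any[]>;

    // getJobs now threads the crawl ID through (commented line is the old call, kept for contrast):
    async function getJobsSketch(crawlId: string, jobIDs: string[]) {
      // const supabaseData = await supabaseGetJobsById(jobIDs);    // before
      const supabaseData = await supabaseGetJobsByCrawlId(crawlId); // after
      return supabaseData;
    }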


@@ -4,16 +4,16 @@ import { RateLimiterMode } from "../../../src/types";
 import { getScrapeQueue } from "../../../src/services/queue-service";
 import { Logger } from "../../../src/lib/logger";
 import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
-import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
+import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
 import * as Sentry from "@sentry/node";
 import { configDotenv } from "dotenv";
 configDotenv();

-export async function getJobs(ids: string[]) {
+export async function getJobs(crawlId: string, ids: string[]) {
   const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);

   if (process.env.USE_DB_AUTHENTICATION === "true") {
-    const supabaseData = await supabaseGetJobsById(ids);
+    const supabaseData = await supabaseGetJobsByCrawlId(crawlId);

     supabaseData.forEach(x => {
       const job = jobs.find(y => y.id === x.job_id);
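
The hunk cuts off inside the forEach; for orientation, this is the shape of the merge getJobs performs. The docs field is an assumption, since the loop body past the find is not shown in this diff:

    // Sketch of the merge getJobs performs; assumes (not shown in this hunk) that the
    // Supabase row's docs column carries the scraped output.
    type SupaJobRow = { job_id: string; docs?: unknown }; // assumed row shape
    function mergeSupabaseData(jobs: any[], supabaseData: SupaJobRow[]) {
      supabaseData.forEach(x => {
        const job = jobs.find(y => y.id === x.job_id);
        if (job) {
          job.returnvalue = x.docs; // hand the stored result back as the BullMQ return value
        }
      });
    }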
@@ -52,7 +52,7 @@ export async function crawlStatusController(req: Request, res: Response) {
     const jobIDs = await getCrawlJobs(req.params.jobId);

-    const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
+    const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
     const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
     const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
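
The roll-up on the last line is a dense nested ternary, and it appears verbatim in both controllers touched by this PR. Unrolled into an equivalent helper (hypothetical; the controllers inline this logic):

    // Hypothetical helper spelling out the inlined nested ternary.
    function aggregateStatus(cancelled: boolean, states: string[]): "completed" | "failed" | "active" {
      if (cancelled) return "failed";                              // a cancelled crawl reports failed
      if (states.every(s => s === "completed")) return "completed";
      if (states.some(s => s === "failed")) return "failed";
      return "active";                                             // otherwise still in progress
    }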


@@ -22,7 +22,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Response) {
     //   }
     // }

-    const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
+    const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
     const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
     const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";


@@ -2,6 +2,11 @@ import { supabase_service } from "../services/supabase";
 import { Logger } from "./logger";
 import * as Sentry from "@sentry/node";

+/**
+ * Get a single firecrawl_job by ID
+ * @param jobId ID of Job
+ * @returns {any | null} Job
+ */
 export const supabaseGetJobById = async (jobId: string) => {
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
@@ -20,13 +25,43 @@ export const supabaseGetJobById = async (jobId: string) => {
   return data;
 };

+/**
+ * Get multiple firecrawl_jobs by ID. Use this if you're not requesting a lot (50+) of jobs at once.
+ * @param jobIds IDs of Jobs
+ * @returns {any[]} Jobs
+ */
 export const supabaseGetJobsById = async (jobIds: string[]) => {
-  const { data, error } = await supabase_service.rpc("get_jobs_by_ids", {
-    job_ids: jobIds,
-  });
+  const { data, error } = await supabase_service
+    .from("firecrawl_jobs")
+    .select()
+    .in("job_id", jobIds);

   if (error) {
-    Logger.error(`Error in get_jobs_by_ids: ${error}`);
+    Logger.error(`Error in supabaseGetJobsById: ${error}`);
     Sentry.captureException(error);
     return [];
   }

   if (!data) {
     return [];
   }

   return data;
 };
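
The "50+" caveat in the docstring above is worth unpacking: supabase-js encodes an .in() filter into the GET request's query string, so the request grows linearly with the ID list and will eventually exceed common URL-length limits. Back-of-the-envelope, with assumed numbers:

    // Rough request-size math for .in("job_id", jobIds); all numbers assumed for illustration.
    const jobCount = 10_000;   // jobs in a large crawl
    const uuidLength = 36;     // canonical UUID string
    const perIdOverhead = 3;   // separator and encoding overhead, roughly
    const filterBytes = jobCount * (uuidLength + perIdOverhead);
    console.log(`~${Math.round(filterBytes / 1024)} KiB of query string`); // ~381 KiB, far past typical 8-16 KiB URL limits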
+/**
+ * Get multiple firecrawl_jobs by crawl ID. Use this if you need a lot of jobs at once.
+ * @param crawlId ID of crawl
+ * @returns {any[]} Jobs
+ */
+export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
+  const { data, error } = await supabase_service
+    .from("firecrawl_jobs")
+    .select()
+    .eq("crawl_id", crawlId);
+
+  if (error) {
+    Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
+    Sentry.captureException(error);
+    return [];
+  }
+
+  if (!data) {
+    return [];
+  }
+
+  return data;
+};
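
A usage note: the controllers pass the crawl ID straight through, so one round-trip now covers a crawl of any size. This relies on firecrawl_jobs having a crawl_id column (the new filter targets it); an index on that column, not shown in this diff, is what would keep the lookup fast on big crawls. A standalone sketch, with a placeholder ID:

    // Standalone sketch with a placeholder crawl ID; in the controllers, getJobs
    // merges these rows with live BullMQ job state before computing crawl status.
    async function inspectCrawl(crawlId: string) {
      const rows = await supabaseGetJobsByCrawlId(crawlId);
      console.log(`found ${rows.length} stored jobs for this crawl`);
    }
    inspectCrawl("<crawl-uuid>");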