fix(v0/crawl-status): don't crash on big crawls when requesting jobs from supabase

Gergo Moricz 2024-09-10 08:51:58 +02:00
parent 2cbc4c59ce
commit a6bcf7b438
3 changed files with 39 additions and 5 deletions
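At its core, the fix swaps the per-ID job lookup for a crawl-scoped query, so the Supabase request no longer grows with the number of jobs in the crawl. A rough sketch of the two query shapes (both calls are lifted from the diffs below; the wrapper name compareQueryShapes and its parameters are only for illustration):

import { supabase_service } from "../services/supabase";

// Sketch only: contrasts the old and new query shapes used by the v0 crawl-status path.
async function compareQueryShapes(crawlId: string, jobIds: string[]) {
  // Before: one get_jobs_by_ids RPC whose payload lists every job ID in the crawl.
  const before = await supabase_service.rpc("get_jobs_by_ids", {
    job_ids: jobIds,
  });

  // After: a single select filtered by crawl_id, independent of how many jobs the crawl produced.
  const after = await supabase_service
    .from("firecrawl_jobs")
    .select()
    .eq("crawl_id", crawlId);

  return { before: before.data, after: after.data };
}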

View File

@@ -4,16 +4,16 @@ import { RateLimiterMode } from "../../../src/types";
 import { getScrapeQueue } from "../../../src/services/queue-service";
 import { Logger } from "../../../src/lib/logger";
 import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
-import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
+import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
 import * as Sentry from "@sentry/node";
 import { configDotenv } from "dotenv";
 configDotenv();
 
-export async function getJobs(ids: string[]) {
+export async function getJobs(crawlId: string, ids: string[]) {
   const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
 
   if (process.env.USE_DB_AUTHENTICATION === "true") {
-    const supabaseData = await supabaseGetJobsById(ids);
+    const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
 
     supabaseData.forEach(x => {
       const job = jobs.find(y => y.id === x.job_id);
@@ -52,7 +52,7 @@ export async function crawlStatusController(req: Request, res: Response) {
     const jobIDs = await getCrawlJobs(req.params.jobId);
 
-    const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
+    const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
     const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
     const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";

View File

@@ -22,7 +22,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Response) {
     //   }
     // }
 
-    const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
+    const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
     const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
     const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
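The next diff adds the crawl-scoped helper alongside the existing per-ID one and documents when each is appropriate. A hypothetical caller-side sketch of that choice (the helper name fetchCrawlJobRows, the import path, and the 50-job cutoff are illustrative; the cutoff echoes the doc comment on supabaseGetJobsById):

import { supabaseGetJobsById, supabaseGetJobsByCrawlId } from "./supabase-jobs";

// Hypothetical helper, not part of this commit: pick the cheaper Supabase query
// depending on how many job rows are needed, following the doc comments below.
async function fetchCrawlJobRows(crawlId: string, jobIds: string[]) {
  if (jobIds.length >= 50) {
    // Big crawl: one select keyed on crawl_id instead of shipping every job ID.
    return await supabaseGetJobsByCrawlId(crawlId);
  }
  // Small batch: the existing get_jobs_by_ids RPC remains fine.
  return await supabaseGetJobsById(jobIds);
}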

View File

@@ -2,6 +2,11 @@ import { supabase_service } from "../services/supabase";
 import { Logger } from "./logger";
 import * as Sentry from "@sentry/node";
 
+/**
+ * Get a single firecrawl_job by ID
+ * @param jobId ID of Job
+ * @returns {any | null} Job
+ */
 export const supabaseGetJobById = async (jobId: string) => {
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
@@ -20,6 +25,11 @@ export const supabaseGetJobById = async (jobId: string) => {
   return data;
 };
 
+/**
+ * Get multiple firecrawl_jobs by ID. Use this if you're not requesting a lot (50+) of jobs at once.
+ * @param jobIds IDs of Jobs
+ * @returns {any[]} Jobs
+ */
 export const supabaseGetJobsById = async (jobIds: string[]) => {
   const { data, error } = await supabase_service.rpc("get_jobs_by_ids", {
     job_ids: jobIds,
@@ -38,6 +48,30 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
   return data;
 };
 
+/**
+ * Get multiple firecrawl_jobs by crawl ID. Use this if you need a lot of jobs at once.
+ * @param crawlId ID of crawl
+ * @returns {any[]} Jobs
+ */
+export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
+  const { data, error } = await supabase_service
+    .from("firecrawl_jobs")
+    .select()
+    .eq("crawl_id", crawlId)
+
+  if (error) {
+    Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
+    Sentry.captureException(error);
+    return [];
+  }
+
+  if (!data) {
+    return [];
+  }
+
+  return data;
+};
+
 export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
   const { data, error } = await supabase_service