cleanup and fix cancelling

This commit is contained in:
Gergo Moricz 2024-08-06 16:26:46 +02:00
parent 52198f2991
commit 03c84a9372
5 changed files with 45 additions and 81 deletions

View File

@ -1,7 +1,6 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabase_service } from "../../src/services/supabase";
import { billTeam } from "../../src/services/billing/credit_billing";
@ -59,17 +58,12 @@ export async function crawlCancelController(req: Request, res: Response) {
}
try {
// TODO: FIX THIS, perhaps by storing a cancellation flag on the job data?
// await getWebScraperQueue().client.del(job.lockKey());
// await job.takeLock();
// await job.discard();
// await job.moveToFailed(Error("Job cancelled by user"), true);
await (await getWebScraperQueue().client).set("cancelled:" + job.id, "true", "EX", 60 * 60);
await job.discard();
} catch (error) {
Logger.error(error);
}
const newJobState = await job.getState();
res.json({
status: "cancelled"
});

View File

@ -21,6 +21,14 @@ export async function crawlStatusController(req: Request, res: Response) {
return res.status(404).json({ error: "Job not found" });
}
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + req.params.jobId);
if (isCancelled) {
return res.json({
status: "cancelled",
});
}
let progress = job.progress;
if(typeof progress !== 'object') {
progress = {

View File

@ -1,6 +1,5 @@
import { ExtractorOptions, PageOptions } from './../lib/entities';
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
@ -9,9 +8,8 @@ import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
import { addScrapeJob, addWebScraperJob } from '../services/queue-jobs';
import { getScrapeQueue, getWebScraperQueue, scrapeQueueEvents } from '../services/queue-service';
import { supabase_service } from '../services/supabase';
import { addScrapeJob } from '../services/queue-jobs';
import { scrapeQueueEvents } from '../services/queue-service';
import { v4 as uuidv4 } from "uuid";
import { Logger } from '../lib/logger';
@ -39,17 +37,6 @@ export async function scrapeHelper(
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
}
// const a = new WebScraperDataProvider();
// await a.setOptions({
// mode: "single_urls",
// urls: [url],
// crawlerOptions: {
// ...crawlerOptions,
// },
// pageOptions: pageOptions,
// extractorOptions: extractorOptions,
// });
const job = await addScrapeJob({
url,
mode: "single_urls",
@ -60,53 +47,16 @@ export async function scrapeHelper(
origin: req.body.origin ?? defaultOrigin,
});
// const docsPromise = new Promise((resolve) => {
// promiseResolve = resolve;
// });
// const listener = (j: string, res: any) => {
// console.log("JOB COMPLETED", j, "vs", job.id, res);
// if (j === job.id) {
// promiseResolve([j, res]);
// sq.removeListener("global:completed", listener);
// }
// }
const jobResult = await job.waitUntilFinished(scrapeQueueEvents, 60 * 1000);//60 seconds timeout
// wsq.on("global:completed", listener);
// const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
// setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
// );
// let j;
// try {
// j = await Promise.race([jobResult, timeoutPromise]);
// } catch (error) {
// // sq.removeListener("global:completed", listener);
// return error;
// }
// console.log("JOB RESULT", j[1]);
// let j1 = typeof j[1] === "string" ? JSON.parse(j[1]) : j[1];
const doc = jobResult !== null ? jobResult[0] : (await supabase_service
.from("firecrawl_jobs")
.select("docs")
.eq("job_id", job.id as string)).data[0]?.docs[0];
const doc = (await job.waitUntilFinished(scrapeQueueEvents, 60 * 1000))[0]; //60 seconds timeout
if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
return { success: true, error: "No page found", returnCode: 200, data: doc };
}
delete doc.index;
delete doc.provider;
// make sure doc.content is not empty
// Remove rawHtml if pageOptions.includeRawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
delete doc.rawHtml;

View File

@ -12,6 +12,7 @@ import { Document } from "../lib/entities";
import { supabase_service } from "../services/supabase";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { getWebScraperQueue } from "../services/queue-service";
export async function startWebScraperPipeline({
job,
@ -100,16 +101,20 @@ export async function runWebScraper({
}
})
: docs.filter((doc) => doc.content.trim().length > 0);
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + bull_job_id);
const billingResult = await billTeam(team_id, filteredDocs.length);
if (!isCancelled) {
const billingResult = await billTeam(team_id, filteredDocs.length);
if (!billingResult.success) {
// throw new Error("Failed to bill team, no subscription was found");
return {
success: false,
message: "Failed to bill team, no subscription was found",
docs: [],
};
if (!billingResult.success) {
// throw new Error("Failed to bill team, no subscription was found");
return {
success: false,
message: "Failed to bill team, no subscription was found",
docs: [],
};
}
}
// This is where the returnvalue from the job is set

View File

@ -12,7 +12,7 @@ import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job";
import { initSDK } from "@hyperdx/node-opentelemetry";
import { Job, tryCatch } from "bullmq";
import { Job, QueueEvents, tryCatch } from "bullmq";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { Worker } from "bullmq";
@ -131,10 +131,18 @@ async function processJob(job: Job, token: string) {
const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000;
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + job.id);
if (isCancelled) {
await job.discard();
await job.moveToFailed(Error("Job cancelled by user"), job.token);
await job.discard();
}
const data = {
success: success,
success,
result: {
links: docs.map((doc) => {
links: isCancelled ? [] : docs.map((doc) => {
return {
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
@ -142,20 +150,20 @@ async function processJob(job: Job, token: string) {
}),
},
project_id: job.data.project_id,
error: message /* etc... */,
docs: docs,
error: isCancelled ? "Job cancelled by user" : message /* etc... */,
docs: isCancelled ? [] : docs,
};
if (job.data.mode === "crawl") {
if (job.data.mode === "crawl" && !isCancelled) {
await callWebhook(job.data.team_id, job.id as string, data);
}
await logJob({
job_id: job.id as string,
success: success,
message: message,
num_docs: docs.length,
docs: docs,
success: success && !isCancelled,
message: isCancelled ? "Job cancelled by user" : message,
num_docs: isCancelled ? 0 : docs.length,
docs: isCancelled ? [] : docs,
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
@ -165,7 +173,6 @@ async function processJob(job: Job, token: string) {
origin: job.data.origin,
});
Logger.debug(`🐂 Job done ${job.id}`);
// done(null, data);
return data;
} catch (error) {
Logger.error(`🐂 Job errored ${job.id} - ${error}`);