cleanup and fix cancelling

Gergo Moricz 2024-08-06 16:26:46 +02:00
parent 52198f2991
commit 03c84a9372
5 changed files with 45 additions and 81 deletions

View File

@@ -1,7 +1,6 @@
 import { Request, Response } from "express";
 import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../../src/types";
-import { addWebScraperJob } from "../../src/services/queue-jobs";
 import { getWebScraperQueue } from "../../src/services/queue-service";
 import { supabase_service } from "../../src/services/supabase";
 import { billTeam } from "../../src/services/billing/credit_billing";
@@ -59,17 +58,12 @@ export async function crawlCancelController(req: Request, res: Response) {
   }

   try {
-    // TODO: FIX THIS by doing as a flag on the data?
-    // await getWebScraperQueue().client.del(job.lockKey());
-    // await job.takeLock();
-    // await job.discard();
-    // await job.moveToFailed(Error("Job cancelled by user"), true);
+    await (await getWebScraperQueue().client).set("cancelled:" + job.id, "true", "EX", 60 * 60);
+    await job.discard();
   } catch (error) {
     Logger.error(error);
   }

-  const newJobState = await job.getState();
-
   res.json({
     status: "cancelled"
   });
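Note: the cancel endpoint now records cancellation as a Redis flag ("cancelled:<jobId>" with a one-hour TTL) instead of the commented-out lock juggling, and everything downstream checks that flag. A minimal sketch of the pattern, assuming a standalone ioredis client (Bull exposes its underlying connection as queue.client); the key name and TTL mirror the diff above:

// Sketch of the cancellation-flag pattern, assuming a standalone ioredis client.
import Redis from "ioredis";

const redis = new Redis();

// Called by the cancel endpoint: the flag expires after an hour, so stale keys clean themselves up.
async function markCancelled(jobId: string): Promise<void> {
  await redis.set("cancelled:" + jobId, "true", "EX", 60 * 60);
}

// Called by the status endpoint and the worker: EXISTS returns 1 when the flag is set.
async function isCancelled(jobId: string): Promise<boolean> {
  return (await redis.exists("cancelled:" + jobId)) === 1;
}

Because the flag outlives the discarded job, later status checks can still report "cancelled".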

View File

@@ -21,6 +21,14 @@ export async function crawlStatusController(req: Request, res: Response) {
     return res.status(404).json({ error: "Job not found" });
   }

+  const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + req.params.jobId);
+
+  if (isCancelled) {
+    return res.json({
+      status: "cancelled",
+    });
+  }
+
   let progress = job.progress;
   if(typeof progress !== 'object') {
     progress = {
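This is the read side of the same flag: the status route can answer "cancelled" even after the job object itself is gone. A sketch, assuming Express and the same ioredis client as in the sketch above:

import { Request, Response } from "express";
import Redis from "ioredis";

const redis = new Redis();

// Sketch: report cancellation before falling through to normal progress reporting.
async function crawlStatus(req: Request, res: Response) {
  if (await redis.exists("cancelled:" + req.params.jobId)) {
    return res.json({ status: "cancelled" });
  }
  // ...otherwise continue with the usual progress/partial-docs response.
}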

View File

@@ -1,6 +1,5 @@
 import { ExtractorOptions, PageOptions } from './../lib/entities';
 import { Request, Response } from "express";
-import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
 import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../types";
@@ -9,9 +8,8 @@ import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';
 import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
-import { addScrapeJob, addWebScraperJob } from '../services/queue-jobs';
-import { getScrapeQueue, getWebScraperQueue, scrapeQueueEvents } from '../services/queue-service';
-import { supabase_service } from '../services/supabase';
+import { addScrapeJob } from '../services/queue-jobs';
+import { scrapeQueueEvents } from '../services/queue-service';
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from '../lib/logger';
@@ -39,17 +37,6 @@ export async function scrapeHelper(
     return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
   }

-  // const a = new WebScraperDataProvider();
-  // await a.setOptions({
-  //   mode: "single_urls",
-  //   urls: [url],
-  //   crawlerOptions: {
-  //     ...crawlerOptions,
-  //   },
-  //   pageOptions: pageOptions,
-  //   extractorOptions: extractorOptions,
-  // });

   const job = await addScrapeJob({
     url,
     mode: "single_urls",
@@ -60,53 +47,16 @@ export async function scrapeHelper(
     origin: req.body.origin ?? defaultOrigin,
   });
-  // const docsPromise = new Promise((resolve) => {
-  //   promiseResolve = resolve;
-  // });

-  // const listener = (j: string, res: any) => {
-  //   console.log("JOB COMPLETED", j, "vs", job.id, res);
-  //   if (j === job.id) {
-  //     promiseResolve([j, res]);
-  //     sq.removeListener("global:completed", listener);
-  //   }
-  // }

-  const jobResult = await job.waitUntilFinished(scrapeQueueEvents, 60 * 1000); //60 seconds timeout
-  // wsq.on("global:completed", listener);

-  // const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
-  //   setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
-  // );

-  // let j;
-  // try {
-  //   j = await Promise.race([jobResult, timeoutPromise]);
-  // } catch (error) {
-  //   // sq.removeListener("global:completed", listener);
-  //   return error;
-  // }
-  // console.log("JOB RESULT", j[1]);
-  // let j1 = typeof j[1] === "string" ? JSON.parse(j[1]) : j[1];

-  const doc = jobResult !== null ? jobResult[0] : (await supabase_service
-    .from("firecrawl_jobs")
-    .select("docs")
-    .eq("job_id", job.id as string)).data[0]?.docs[0];
+  const doc = (await job.waitUntilFinished(scrapeQueueEvents, 60 * 1000))[0]; //60 seconds timeout

   if (!doc) {
-    console.error("!!! PANIC DOC IS", doc, job);
     return { success: true, error: "No page found", returnCode: 200, data: doc };
   }

   delete doc.index;
   delete doc.provider;

-  // make sure doc.content is not empty
   // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
   if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
     delete doc.rawHtml;
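The scrape path is also simplified: instead of racing hand-rolled listeners, timeouts, and a Supabase fallback, it awaits BullMQ's waitUntilFinished, which resolves through a QueueEvents subscriber. A minimal sketch under assumed names (a local "scrape" queue; not Firecrawl's actual wiring):

import { Queue, QueueEvents } from "bullmq";

const connection = { host: "localhost", port: 6379 };
const scrapeQueue = new Queue("scrape", { connection });
const scrapeQueueEvents = new QueueEvents("scrape", { connection });

async function scrapeOnce(url: string) {
  const job = await scrapeQueue.add("scrape", { url });
  // Resolves with the processor's return value; rejects if 60 s pass first.
  return await job.waitUntilFinished(scrapeQueueEvents, 60 * 1000);
}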

View File

@@ -12,6 +12,7 @@ import { Document } from "../lib/entities";
 import { supabase_service } from "../services/supabase";
 import { Logger } from "../lib/logger";
 import { ScrapeEvents } from "../lib/scrape-events";
+import { getWebScraperQueue } from "../services/queue-service";

 export async function startWebScraperPipeline({
   job,
@@ -101,6 +102,9 @@ export async function runWebScraper({
       })
     : docs.filter((doc) => doc.content.trim().length > 0);

+  const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + bull_job_id);
+
+  if (!isCancelled) {
   const billingResult = await billTeam(team_id, filteredDocs.length);
   if (!billingResult.success) {
@@ -111,6 +115,7 @@
       docs: [],
     };
   }
+  }

   // This is where the returnvalue from the job is set
   onSuccess(filteredDocs, mode);
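Here the flag gates billing, so a cancelled crawl is not charged for documents scraped after the cancel. A sketch of the guard with billTeam injected as a stub, since only its success flag matters for the shape of the logic:

import Redis from "ioredis";

type BillFn = (teamId: string, credits: number) => Promise<{ success: boolean }>;

// Returns true only if billing actually ran and succeeded.
async function billUnlessCancelled(
  redis: Redis,
  jobId: string,
  teamId: string,
  docCount: number,
  billTeam: BillFn
): Promise<boolean> {
  // Skip the billable side effect entirely once the cancel flag exists.
  if (await redis.exists("cancelled:" + jobId)) return false;
  const { success } = await billTeam(teamId, docCount);
  return success;
}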

View File

@@ -12,7 +12,7 @@ import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
 import { logJob } from "./logging/log_job";
 import { initSDK } from "@hyperdx/node-opentelemetry";
-import { Job, tryCatch } from "bullmq";
+import { Job, QueueEvents, tryCatch } from "bullmq";
 import { Logger } from "../lib/logger";
 import { ScrapeEvents } from "../lib/scrape-events";
 import { Worker } from "bullmq";
@@ -131,10 +131,18 @@ async function processJob(job: Job, token: string) {
     const end = Date.now();
     const timeTakenInSeconds = (end - start) / 1000;

+    const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + job.id);
+
+    if (isCancelled) {
+      await job.discard();
+      await job.moveToFailed(Error("Job cancelled by user"), job.token);
+      await job.discard();
+    }
+
     const data = {
-      success: success,
+      success,
       result: {
-        links: docs.map((doc) => {
+        links: isCancelled ? [] : docs.map((doc) => {
           return {
             content: doc,
             source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
@@ -142,20 +150,20 @@ async function processJob(job: Job, token: string) {
         }),
       },
       project_id: job.data.project_id,
-      error: message /* etc... */,
-      docs: docs,
+      error: isCancelled ? "Job cancelled by user" : message /* etc... */,
+      docs: isCancelled ? [] : docs,
     };

-    if (job.data.mode === "crawl") {
+    if (job.data.mode === "crawl" && !isCancelled) {
       await callWebhook(job.data.team_id, job.id as string, data);
     }

     await logJob({
       job_id: job.id as string,
-      success: success,
-      message: message,
-      num_docs: docs.length,
-      docs: docs,
+      success: success && !isCancelled,
+      message: isCancelled ? "Job cancelled by user" : message,
+      num_docs: isCancelled ? 0 : docs.length,
+      docs: isCancelled ? [] : docs,
       time_taken: timeTakenInSeconds,
       team_id: job.data.team_id,
       mode: job.data.mode,
@@ -165,7 +173,6 @@ async function processJob(job: Job, token: string) {
       origin: job.data.origin,
     });
     Logger.debug(`🐂 Job done ${job.id}`);
-    // done(null, data);
     return data;
   } catch (error) {
     Logger.error(`🐂 Job errored ${job.id} - ${error}`);
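On the worker side, a cancelled job is discarded (no retries) and moved to failed against the worker's lock token. A sketch of the same idea, assuming BullMQ; it throws from the processor rather than calling moveToFailed directly, which is the more common idiom, and doScrape is a hypothetical stand-in for the real pipeline:

import { Worker, Job } from "bullmq";
import Redis from "ioredis";

const connection = { host: "localhost", port: 6379 };
const redis = new Redis(connection.port, connection.host);

// Hypothetical stand-in for the real scraping pipeline.
async function doScrape(url: string): Promise<string[]> {
  return ["scraped:" + url];
}

const worker = new Worker(
  "scrape",
  async (job: Job) => {
    const docs = await doScrape(job.data.url);
    if (await redis.exists("cancelled:" + job.id)) {
      await job.discard(); // no retries for a cancelled job
      throw new Error("Job cancelled by user"); // throwing marks the job failed
    }
    return docs;
  },
  { connection }
);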