mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 19:58:08 +08:00
cleanup and fix cancelling
This commit is contained in:
parent
52198f2991
commit
03c84a9372
|
@ -1,7 +1,6 @@
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { authenticateUser } from "./auth";
|
import { authenticateUser } from "./auth";
|
||||||
import { RateLimiterMode } from "../../src/types";
|
import { RateLimiterMode } from "../../src/types";
|
||||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
|
||||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||||
import { supabase_service } from "../../src/services/supabase";
|
import { supabase_service } from "../../src/services/supabase";
|
||||||
import { billTeam } from "../../src/services/billing/credit_billing";
|
import { billTeam } from "../../src/services/billing/credit_billing";
|
||||||
|
@ -59,17 +58,12 @@ export async function crawlCancelController(req: Request, res: Response) {
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// TODO: FIX THIS by doing as a flag on the data?
|
await (await getWebScraperQueue().client).set("cancelled:" + job.id, "true", "EX", 60 * 60);
|
||||||
// await getWebScraperQueue().client.del(job.lockKey());
|
await job.discard();
|
||||||
// await job.takeLock();
|
|
||||||
// await job.discard();
|
|
||||||
// await job.moveToFailed(Error("Job cancelled by user"), true);
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(error);
|
Logger.error(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
const newJobState = await job.getState();
|
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
status: "cancelled"
|
status: "cancelled"
|
||||||
});
|
});
|
||||||
|
|
|
@ -21,6 +21,14 @@ export async function crawlStatusController(req: Request, res: Response) {
|
||||||
return res.status(404).json({ error: "Job not found" });
|
return res.status(404).json({ error: "Job not found" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + req.params.jobId);
|
||||||
|
|
||||||
|
if (isCancelled) {
|
||||||
|
return res.json({
|
||||||
|
status: "cancelled",
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
let progress = job.progress;
|
let progress = job.progress;
|
||||||
if(typeof progress !== 'object') {
|
if(typeof progress !== 'object') {
|
||||||
progress = {
|
progress = {
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import { ExtractorOptions, PageOptions } from './../lib/entities';
|
import { ExtractorOptions, PageOptions } from './../lib/entities';
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
|
||||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||||
import { authenticateUser } from "./auth";
|
import { authenticateUser } from "./auth";
|
||||||
import { RateLimiterMode } from "../types";
|
import { RateLimiterMode } from "../types";
|
||||||
|
@ -9,9 +8,8 @@ import { Document } from "../lib/entities";
|
||||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||||
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
||||||
import { addScrapeJob, addWebScraperJob } from '../services/queue-jobs';
|
import { addScrapeJob } from '../services/queue-jobs';
|
||||||
import { getScrapeQueue, getWebScraperQueue, scrapeQueueEvents } from '../services/queue-service';
|
import { scrapeQueueEvents } from '../services/queue-service';
|
||||||
import { supabase_service } from '../services/supabase';
|
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { Logger } from '../lib/logger';
|
import { Logger } from '../lib/logger';
|
||||||
|
|
||||||
|
@ -39,17 +37,6 @@ export async function scrapeHelper(
|
||||||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||||
}
|
}
|
||||||
|
|
||||||
// const a = new WebScraperDataProvider();
|
|
||||||
// await a.setOptions({
|
|
||||||
// mode: "single_urls",
|
|
||||||
// urls: [url],
|
|
||||||
// crawlerOptions: {
|
|
||||||
// ...crawlerOptions,
|
|
||||||
// },
|
|
||||||
// pageOptions: pageOptions,
|
|
||||||
// extractorOptions: extractorOptions,
|
|
||||||
// });
|
|
||||||
|
|
||||||
const job = await addScrapeJob({
|
const job = await addScrapeJob({
|
||||||
url,
|
url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
|
@ -60,53 +47,16 @@ export async function scrapeHelper(
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const doc = (await job.waitUntilFinished(scrapeQueueEvents, 60 * 1000))[0]; //60 seconds timeout
|
||||||
// const docsPromise = new Promise((resolve) => {
|
|
||||||
// promiseResolve = resolve;
|
|
||||||
// });
|
|
||||||
|
|
||||||
// const listener = (j: string, res: any) => {
|
|
||||||
// console.log("JOB COMPLETED", j, "vs", job.id, res);
|
|
||||||
// if (j === job.id) {
|
|
||||||
// promiseResolve([j, res]);
|
|
||||||
// sq.removeListener("global:completed", listener);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
const jobResult = await job.waitUntilFinished(scrapeQueueEvents, 60 * 1000);//60 seconds timeout
|
|
||||||
|
|
||||||
|
|
||||||
// wsq.on("global:completed", listener);
|
|
||||||
|
|
||||||
// const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
|
|
||||||
// setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
|
|
||||||
// );
|
|
||||||
|
|
||||||
// let j;
|
|
||||||
// try {
|
|
||||||
// j = await Promise.race([jobResult, timeoutPromise]);
|
|
||||||
// } catch (error) {
|
|
||||||
// // sq.removeListener("global:completed", listener);
|
|
||||||
// return error;
|
|
||||||
// }
|
|
||||||
// console.log("JOB RESULT", j[1]);
|
|
||||||
|
|
||||||
// let j1 = typeof j[1] === "string" ? JSON.parse(j[1]) : j[1];
|
|
||||||
|
|
||||||
const doc = jobResult !== null ? jobResult[0] : (await supabase_service
|
|
||||||
.from("firecrawl_jobs")
|
|
||||||
.select("docs")
|
|
||||||
.eq("job_id", job.id as string)).data[0]?.docs[0];
|
|
||||||
|
|
||||||
if (!doc) {
|
if (!doc) {
|
||||||
|
console.error("!!! PANIC DOC IS", doc, job);
|
||||||
return { success: true, error: "No page found", returnCode: 200, data: doc };
|
return { success: true, error: "No page found", returnCode: 200, data: doc };
|
||||||
}
|
}
|
||||||
|
|
||||||
delete doc.index;
|
delete doc.index;
|
||||||
delete doc.provider;
|
delete doc.provider;
|
||||||
|
|
||||||
// make sure doc.content is not empty
|
|
||||||
|
|
||||||
|
|
||||||
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||||
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
||||||
delete doc.rawHtml;
|
delete doc.rawHtml;
|
||||||
|
|
|
@ -12,6 +12,7 @@ import { Document } from "../lib/entities";
|
||||||
import { supabase_service } from "../services/supabase";
|
import { supabase_service } from "../services/supabase";
|
||||||
import { Logger } from "../lib/logger";
|
import { Logger } from "../lib/logger";
|
||||||
import { ScrapeEvents } from "../lib/scrape-events";
|
import { ScrapeEvents } from "../lib/scrape-events";
|
||||||
|
import { getWebScraperQueue } from "../services/queue-service";
|
||||||
|
|
||||||
export async function startWebScraperPipeline({
|
export async function startWebScraperPipeline({
|
||||||
job,
|
job,
|
||||||
|
@ -101,6 +102,9 @@ export async function runWebScraper({
|
||||||
})
|
})
|
||||||
: docs.filter((doc) => doc.content.trim().length > 0);
|
: docs.filter((doc) => doc.content.trim().length > 0);
|
||||||
|
|
||||||
|
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + bull_job_id);
|
||||||
|
|
||||||
|
if (!isCancelled) {
|
||||||
const billingResult = await billTeam(team_id, filteredDocs.length);
|
const billingResult = await billTeam(team_id, filteredDocs.length);
|
||||||
|
|
||||||
if (!billingResult.success) {
|
if (!billingResult.success) {
|
||||||
|
@ -111,6 +115,7 @@ export async function runWebScraper({
|
||||||
docs: [],
|
docs: [],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// This is where the returnvalue from the job is set
|
// This is where the returnvalue from the job is set
|
||||||
onSuccess(filteredDocs, mode);
|
onSuccess(filteredDocs, mode);
|
||||||
|
|
|
@ -12,7 +12,7 @@ import { startWebScraperPipeline } from "../main/runWebScraper";
|
||||||
import { callWebhook } from "./webhook";
|
import { callWebhook } from "./webhook";
|
||||||
import { logJob } from "./logging/log_job";
|
import { logJob } from "./logging/log_job";
|
||||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||||
import { Job, tryCatch } from "bullmq";
|
import { Job, QueueEvents, tryCatch } from "bullmq";
|
||||||
import { Logger } from "../lib/logger";
|
import { Logger } from "../lib/logger";
|
||||||
import { ScrapeEvents } from "../lib/scrape-events";
|
import { ScrapeEvents } from "../lib/scrape-events";
|
||||||
import { Worker } from "bullmq";
|
import { Worker } from "bullmq";
|
||||||
|
@ -131,10 +131,18 @@ async function processJob(job: Job, token: string) {
|
||||||
const end = Date.now();
|
const end = Date.now();
|
||||||
const timeTakenInSeconds = (end - start) / 1000;
|
const timeTakenInSeconds = (end - start) / 1000;
|
||||||
|
|
||||||
|
const isCancelled = await (await getWebScraperQueue().client).exists("cancelled:" + job.id);
|
||||||
|
|
||||||
|
if (isCancelled) {
|
||||||
|
await job.discard();
|
||||||
|
await job.moveToFailed(Error("Job cancelled by user"), job.token);
|
||||||
|
await job.discard();
|
||||||
|
}
|
||||||
|
|
||||||
const data = {
|
const data = {
|
||||||
success: success,
|
success,
|
||||||
result: {
|
result: {
|
||||||
links: docs.map((doc) => {
|
links: isCancelled ? [] : docs.map((doc) => {
|
||||||
return {
|
return {
|
||||||
content: doc,
|
content: doc,
|
||||||
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
|
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
|
||||||
|
@ -142,20 +150,20 @@ async function processJob(job: Job, token: string) {
|
||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
project_id: job.data.project_id,
|
project_id: job.data.project_id,
|
||||||
error: message /* etc... */,
|
error: isCancelled ? "Job cancelled by user" : message /* etc... */,
|
||||||
docs: docs,
|
docs: isCancelled ? [] : docs,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (job.data.mode === "crawl") {
|
if (job.data.mode === "crawl" && !isCancelled) {
|
||||||
await callWebhook(job.data.team_id, job.id as string, data);
|
await callWebhook(job.data.team_id, job.id as string, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
await logJob({
|
await logJob({
|
||||||
job_id: job.id as string,
|
job_id: job.id as string,
|
||||||
success: success,
|
success: success && !isCancelled,
|
||||||
message: message,
|
message: isCancelled ? "Job cancelled by user" : message,
|
||||||
num_docs: docs.length,
|
num_docs: isCancelled ? 0 : docs.length,
|
||||||
docs: docs,
|
docs: isCancelled ? [] : docs,
|
||||||
time_taken: timeTakenInSeconds,
|
time_taken: timeTakenInSeconds,
|
||||||
team_id: job.data.team_id,
|
team_id: job.data.team_id,
|
||||||
mode: job.data.mode,
|
mode: job.data.mode,
|
||||||
|
@ -165,7 +173,6 @@ async function processJob(job: Job, token: string) {
|
||||||
origin: job.data.origin,
|
origin: job.data.origin,
|
||||||
});
|
});
|
||||||
Logger.debug(`🐂 Job done ${job.id}`);
|
Logger.debug(`🐂 Job done ${job.id}`);
|
||||||
// done(null, data);
|
|
||||||
return data;
|
return data;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
|
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user