feat: move scraper to queue

This commit is contained in:
Gergő Móricz 2024-07-25 00:14:25 +02:00
parent 15890772be
commit 6798695ee4
4 changed files with 60 additions and 27 deletions

View File

@ -9,6 +9,8 @@ import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { numTokensFromString } from '../lib/LLM-extraction/helpers';
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
import { addWebScraperJob } from '../services/queue-jobs';
import { getWebScraperQueue } from '../services/queue-service';
export async function scrapeHelper( export async function scrapeHelper(
req: Request, req: Request,
@ -33,49 +35,74 @@ export async function scrapeHelper(
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
} }
const a = new WebScraperDataProvider(); // const a = new WebScraperDataProvider();
await a.setOptions({ // await a.setOptions({
// mode: "single_urls",
// urls: [url],
// crawlerOptions: {
// ...crawlerOptions,
// },
// pageOptions: pageOptions,
// extractorOptions: extractorOptions,
// });
const job = await addWebScraperJob({
url,
mode: "single_urls", mode: "single_urls",
urls: [url], crawlerOptions,
crawlerOptions: { team_id,
...crawlerOptions, pageOptions,
}, extractorOptions,
pageOptions: pageOptions, origin: req.body.origin ?? defaultOrigin,
extractorOptions: extractorOptions,
}); });
const wsq = getWebScraperQueue();
let promiseResolve;
const docsPromise = new Promise((resolve) => {
promiseResolve = resolve;
});
const listener = (j: string) => {
console.log("JOB COMPLETED", j, "vs", job.id);
if (j === job.id) {
promiseResolve(j);
wsq.removeListener("global:completed", listener);
}
}
wsq.on("global:completed", listener);
const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) => const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout) setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
); );
const docsPromise = a.getDocuments(false); let j;
let docs;
try { try {
docs = await Promise.race([docsPromise, timeoutPromise]); j = await Promise.race([docsPromise, timeoutPromise]);
} catch (error) { } catch (error) {
wsq.removeListener("global:completed", listener);
return error; return error;
} }
// make sure doc.content is not empty const jobNew = (await wsq.getJob(j));
let filteredDocs = docs.filter( const doc = jobNew.progress().currentDocument;
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0 delete doc.index;
);
if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
}
// make sure doc.content is not empty
if (!doc) {
return { success: true, error: "No page found", returnCode: 200, data: doc };
}
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") { if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
filteredDocs.forEach(doc => {
delete doc.rawHtml; delete doc.rawHtml;
});
} }
return { return {
success: true, success: true,
data: filteredDocs[0], data: doc,
returnCode: 200, returnCode: 200,
}; };
} }

View File

@ -7,11 +7,12 @@ import { WebScraperOptions } from "../types";
export async function addWebScraperJob( export async function addWebScraperJob(
webScraperOptions: WebScraperOptions, webScraperOptions: WebScraperOptions,
options: any = {} options: any = {},
jobId: string = uuidv4(),
): Promise<Job> { ): Promise<Job> {
return await getWebScraperQueue().add(webScraperOptions, { return await getWebScraperQueue().add(webScraperOptions, {
...options, ...options,
jobId: uuidv4(), jobId,
}); });
} }

View File

@ -42,7 +42,9 @@ async function processJob(job: Job, done) {
error: message /* etc... */, error: message /* etc... */,
}; };
if (job.data.mode === "crawl") {
await callWebhook(job.data.team_id, job.id as string, data); await callWebhook(job.data.team_id, job.id as string, data);
}
await logJob({ await logJob({
job_id: job.id as string, job_id: job.id as string,
@ -52,7 +54,7 @@ async function processJob(job: Job, done) {
docs: docs, docs: docs,
time_taken: timeTakenInSeconds, time_taken: timeTakenInSeconds,
team_id: job.data.team_id, team_id: job.data.team_id,
mode: "crawl", mode: job.data.mode,
url: job.data.url, url: job.data.url,
crawlerOptions: job.data.crawlerOptions, crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions, pageOptions: job.data.pageOptions,
@ -90,7 +92,9 @@ async function processJob(job: Job, done) {
error: error:
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */, "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
}; };
if (job.data.mode === "crawl") {
await callWebhook(job.data.team_id, job.id as string, data); await callWebhook(job.data.team_id, job.id as string, data);
}
await logJob({ await logJob({
job_id: job.id as string, job_id: job.id as string,
success: false, success: false,

View File

@ -25,6 +25,7 @@ export interface WebScraperOptions {
mode: Mode; mode: Mode;
crawlerOptions: any; crawlerOptions: any;
pageOptions: any; pageOptions: any;
extractorOptions?: any;
team_id: string; team_id: string;
origin?: string; origin?: string;
} }