Merge branch 'v1-webscraper' of https://github.com/mendableai/firecrawl into v1-webscraper

This commit is contained in:
Nicolas 2024-08-27 11:12:00 -03:00
commit d30119707f
3 changed files with 55 additions and 73 deletions

View File

@@ -912,53 +912,41 @@ describe("GET /v1/crawl/:jobId", () => {
180000
); // 120 seconds
// E2E: cancelling a running crawl must leave the job in "cancelled" status
// while preserving the pages that were scraped before cancellation.
// (Title updated: the assertions below check "cancelled", not "failed".)
it.concurrent(
  "If someone cancels a crawl job, it should turn into cancelled status",
  async () => {
    // Kick off a crawl large enough (limit: 200) that it is still
    // running when we cancel it below.
    const crawlResponse = await request(TEST_URL)
      .post("/v1/crawl")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({ url: "https://docs.tatum.io", limit: 200 });
    expect(crawlResponse.statusCode).toBe(200);

    // Give the crawl time to start scraping before cancelling.
    await new Promise((r) => setTimeout(r, 10000));

    // Cancel the crawl; the DELETE endpoint should acknowledge with
    // status "cancelled" immediately.
    const responseCancel = await request(TEST_URL)
      .delete(`/v1/crawl/${crawlResponse.body.id}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    expect(responseCancel.statusCode).toBe(200);
    expect(responseCancel.body).toHaveProperty("status");
    expect(responseCancel.body.status).toBe("cancelled");

    // Wait for in-flight page scrapes to settle, then verify the job
    // reports "cancelled" and still exposes the partial data.
    await new Promise((r) => setTimeout(r, 10000));
    const completedResponse = await request(TEST_URL)
      .get(`/v1/crawl/${crawlResponse.body.id}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    expect(completedResponse.statusCode).toBe(200);
    expect(completedResponse.body).toHaveProperty("status");
    expect(completedResponse.body.status).toBe("cancelled");
    expect(completedResponse.body).toHaveProperty("data");
    expect(completedResponse.body.data[0]).toHaveProperty("markdown");
    expect(completedResponse.body.data[0]).toHaveProperty("metadata");
    expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
    expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
  },
  60000
); // 60 seconds
})
});

View File

@@ -1,11 +1,10 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabase_service } from "../../src/services/supabase";
import { billTeam } from "../../src/services/billing/credit_billing";
import { Logger } from "../../src/lib/logger";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../types";
import { supabase_service } from "../../services/supabase";
import { Logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node";
export async function crawlCancelController(req: Request, res: Response) {
try {
@@ -19,8 +18,9 @@ export async function crawlCancelController(req: Request, res: Response) {
if (!success) {
return res.status(status).json({ error });
}
const job = await getWebScraperQueue().getJob(req.params.jobId);
if (!job) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ error: "Job not found" });
}
@@ -40,31 +40,18 @@ export async function crawlCancelController(req: Request, res: Response) {
}
}
const jobState = await job.getState();
const { partialDocs } = await job.progress();
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
Logger.info("Billing team for partial docs...");
// Note: the credits that we will bill them here might be lower than the actual
// due to promises that are not yet resolved
await billTeam(team_id, partialDocs.length);
}
try {
await getWebScraperQueue().client.del(job.lockKey());
await job.takeLock();
await job.discard();
await job.moveToFailed(Error("Job cancelled by user"), true);
sc.cancelled = true;
await saveCrawl(req.params.jobId, sc);
} catch (error) {
Logger.error(error);
}
const newJobState = await job.getState();
res.json({
status: "cancelled"
});
} catch (error) {
Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}

View File

@@ -1,9 +1,9 @@
import express, { NextFunction, Request, Response } from "express";
import { crawlController } from "../../src/controllers/v1/crawl";
import { crawlController } from "../controllers/v1/crawl";
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { scrapeController } from "../../src/controllers/v1/scrape";
import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { mapController } from "../../src/controllers/v1/map";
import { crawlStatusController } from "../controllers/v1/crawl-status";
import { mapController } from "../controllers/v1/map";
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
import { RateLimiterMode } from "../types";
import { authenticateUser } from "../controllers/auth";
@@ -16,6 +16,7 @@ import { v4 as uuidv4 } from "uuid";
import expressWs from "express-ws";
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@@ -130,7 +131,13 @@ v1Router.ws(
);
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
// v1Router.delete("/crawl/:jobId", crawlCancelController);
// Cancel a crawl job. Guarded by crawl-mode auth/rate limiting; the
// controller marks the crawl record as cancelled (sc.cancelled = true,
// persisted via saveCrawl) rather than deleting any queued job state.
v1Router.delete(
"/crawl/:jobId",
authMiddleware(RateLimiterMode.Crawl),
crawlCancelController
);
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// // Auth route for key based authentication