wip: map, crawl, scrape mockups

This commit is contained in:
rafaelsideguide 2024-08-06 15:24:45 -03:00
parent b60ee30dba
commit 6cdf4c68ec
18 changed files with 1514 additions and 0 deletions


@ -0,0 +1,47 @@
import { crawlController } from '../crawl'
import { Request, Response } from 'express';
import { authenticateUser } from '../auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create';
import { validateIdempotencyKey } from '../../services/idempotency/validate';
import { v4 as uuidv4 } from 'uuid';
jest.mock('../auth', () => ({
authenticateUser: jest.fn().mockResolvedValue({
success: true,
team_id: 'team123',
error: null,
status: 200
}),
reduce: jest.fn()
}));
jest.mock('../../services/idempotency/validate');
describe('crawlController', () => {
it('should prevent duplicate requests using the same idempotency key', async () => {
const req = {
headers: {
'x-idempotency-key': uuidv4(),
'Authorization': `Bearer ${process.env.TEST_API_KEY}`
},
body: {
url: 'https://mendable.ai'
}
} as unknown as Request;
const res = {
status: jest.fn().mockReturnThis(),
json: jest.fn()
} as unknown as Response;
// Mock the idempotency key validation to return false for the second call
(validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false);
// First request should succeed
await crawlController(req, res);
expect(res.status).not.toHaveBeenCalledWith(409);
// Second request with the same key should fail
await crawlController(req, res);
expect(res.status).toHaveBeenCalledWith(409);
expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' });
});
});
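
The behaviour exercised by this test can also be seen from the client side. A minimal sketch, assuming the server from this commit runs on DEFAULT_PORT 3002 and that the placeholder API key is replaced with a real one:

import { v4 as uuidv4 } from "uuid";

// Sketch: reuse the same x-idempotency-key twice against POST /v1/crawl.
// The first call is expected to return 200 with { jobId, url };
// the second is expected to return 409 { error: "Idempotency key already used" }.
async function demoIdempotency() {
  const idempotencyKey = uuidv4();
  const send = () =>
    fetch("http://localhost:3002/v1/crawl", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Authorization": "Bearer <your-api-key>", // placeholder
        "x-idempotency-key": idempotencyKey,
      },
      body: JSON.stringify({ url: "https://mendable.ai" }),
    });
  const first = await send();
  const second = await send();
  console.log(first.status, second.status); // expected: 200 409
}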


@ -0,0 +1,87 @@
import { Request, Response } from "express";
import { Job } from "bull";
import { Logger } from "../../lib/logger";
import { getWebScraperQueue } from "../../services/queue-service";
import { checkAlerts } from "../../services/alerts";
export async function cleanBefore24hCompleteJobsController(
req: Request,
res: Response
) {
Logger.info("🐂 Cleaning jobs older than 24h");
try {
const webScraperQueue = getWebScraperQueue();
const batchSize = 10;
const numberOfBatches = 9; // Adjust based on your needs
const completedJobsPromises: Promise<Job[]>[] = [];
for (let i = 0; i < numberOfBatches; i++) {
completedJobsPromises.push(
webScraperQueue.getJobs(
["completed"],
i * batchSize,
i * batchSize + batchSize,
true
)
);
}
const completedJobs: Job[] = (
await Promise.all(completedJobsPromises)
).flat();
const before24hJobs = completedJobs.filter(
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
);
let count = 0;
if (before24hJobs.length === 0) {
return res.status(200).send(`No jobs to remove.`);
}
for (const job of before24hJobs) {
try {
await job.remove();
count++;
} catch (jobError) {
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
}
}
return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
return res.status(500).send("Failed to clean jobs");
}
}
export async function checkQueuesController(req: Request, res: Response) {
try {
await checkAlerts();
return res.status(200).send("Alerts initialized");
} catch (error) {
Logger.debug(`Failed to initialize alerts: ${error}`);
return res.status(500).send("Failed to initialize alerts");
}
}
// Use this as a "health check" so that we don't bring down the server
export async function queuesController(req: Request, res: Response) {
try {
const webScraperQueue = getWebScraperQueue();
const [webScraperActive] = await Promise.all([
webScraperQueue.getActiveCount(),
]);
const noActiveJobs = webScraperActive === 0;
// 200 if no active jobs, 500 if there are active jobs
return res.status(noActiveJobs ? 200 : 500).json({
webScraperActive,
noActiveJobs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,85 @@
import { Request, Response } from "express";
import Redis from "ioredis";
import { Logger } from "../../lib/logger";
import { redisRateLimitClient } from "../../services/rate-limiter";
export async function redisHealthController(req: Request, res: Response) {
const retryOperation = async (operation, retries = 3) => {
for (let attempt = 1; attempt <= retries; attempt++) {
try {
return await operation();
} catch (error) {
if (attempt === retries) throw error;
Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
}
}
};
try {
const queueRedis = new Redis(process.env.REDIS_URL);
const testKey = "test";
const testValue = "test";
// Test queueRedis
let queueRedisHealth;
try {
await retryOperation(() => queueRedis.set(testKey, testValue));
queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
await retryOperation(() => queueRedis.del(testKey));
} catch (error) {
Logger.error(`queueRedis health check failed: ${error}`);
queueRedisHealth = null;
}
// Test redisRateLimitClient
let redisRateLimitHealth;
try {
await retryOperation(() => redisRateLimitClient.set(testKey, testValue));
redisRateLimitHealth = await retryOperation(() =>
redisRateLimitClient.get(testKey)
);
await retryOperation(() => redisRateLimitClient.del(testKey));
} catch (error) {
Logger.error(`redisRateLimitClient health check failed: ${error}`);
redisRateLimitHealth = null;
}
const healthStatus = {
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
redisRateLimitClient:
redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
};
if (
healthStatus.queueRedis === "healthy" &&
healthStatus.redisRateLimitClient === "healthy"
) {
Logger.info("Both Redis instances are healthy");
return res.status(200).json({ status: "healthy", details: healthStatus });
} else {
Logger.info(
`Redis instances health check: ${JSON.stringify(healthStatus)}`
);
// await sendSlackWebhook(
// `[REDIS DOWN] Redis instances health check: ${JSON.stringify(
// healthStatus
// )}`,
// true
// );
return res
.status(500)
.json({ status: "unhealthy", details: healthStatus });
}
} catch (error) {
Logger.error(`Redis health check failed: ${error}`);
// await sendSlackWebhook(
// `[REDIS DOWN] Redis instances health check: ${error.message}`,
// true
// );
return res
.status(500)
.json({ status: "unhealthy", message: error.message });
}
}


@ -0,0 +1,220 @@
import { parseApi } from "../../../src/lib/parseApi";
import { getRateLimiter, } from "../../../src/services/rate-limiter";
import { AuthResponse, NotificationType, RateLimiterMode } from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase";
import { withAuth } from "../../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
import { sendNotification } from "../../../src/services/notification/email_notification";
import { Logger } from "../../../src/lib/logger";
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
return withAuth(supaAuthenticateUser)(req, res, mode);
}
function setTrace(team_id: string, api_key: string) {
try {
setTraceAttributes({
team_id,
api_key
});
} catch (error) {
Logger.error(`Error setting trace attributes: ${error.message}`);
}
}
export async function supaAuthenticateUser(
req,
res,
mode?: RateLimiterMode
): Promise<{
success: boolean;
team_id?: string;
error?: string;
status?: number;
plan?: string;
}> {
const authHeader = req.headers.authorization;
if (!authHeader) {
return { success: false, error: "Unauthorized", status: 401 };
}
const token = authHeader.split(" ")[1]; // Extract the token from "Bearer <token>"
if (!token) {
return {
success: false,
error: "Unauthorized: Token missing",
status: 401,
};
}
const incomingIP = (req.headers["x-forwarded-for"] ||
req.socket.remoteAddress) as string;
const iptoken = incomingIP + token;
let rateLimiter: RateLimiterRedis;
let subscriptionData: { team_id: string, plan: string } | null = null;
let normalizedApi: string;
let team_id: string;
if (token == "this_is_just_a_preview_token") {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
team_id = "preview";
} else {
normalizedApi = parseApi(token);
const { data, error } = await supabase_service.rpc(
'get_key_and_price_id_2', { api_key: normalizedApi }
);
// get_key_and_price_id_2 rpc definition:
// create or replace function get_key_and_price_id_2(api_key uuid)
// returns table(key uuid, team_id uuid, price_id text) as $$
// begin
// if api_key is null then
// return query
// select null::uuid as key, null::uuid as team_id, null::text as price_id;
// end if;
// return query
// select ak.key, ak.team_id, s.price_id
// from api_keys ak
// left join subscriptions s on ak.team_id = s.team_id
// where ak.key = api_key;
// end;
// $$ language plpgsql;
if (error) {
Logger.warn(`Error fetching key and price_id: ${error.message}`);
} else {
// console.log('Key and Price ID:', data);
}
if (error || !data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error?.message ?? "data is empty"}`);
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401,
};
}
const internal_team_id = data[0].team_id;
team_id = internal_team_id;
const plan = getPlanByPriceId(data[0].price_id);
// HyperDX Logging
setTrace(team_id, normalizedApi);
subscriptionData = {
team_id: team_id,
plan: plan
}
switch (mode) {
case RateLimiterMode.Crawl:
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
break;
case RateLimiterMode.Scrape:
rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
break;
case RateLimiterMode.Search:
rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan);
break;
case RateLimiterMode.CrawlStatus:
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
break;
case RateLimiterMode.Preview:
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
break;
default:
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token);
break;
// case RateLimiterMode.Search:
// rateLimiter = await searchRateLimiter(RateLimiterMode.Search, token);
// break;
}
}
const team_endpoint_token = token === "this_is_just_a_preview_token" ? iptoken : team_id;
try {
await rateLimiter.consume(team_endpoint_token);
} catch (rateLimiterRes) {
Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
// We can only send a rate limit email every 7 days; sendNotification already checks the date range
const startDate = new Date();
const endDate = new Date();
endDate.setDate(endDate.getDate() + 7);
// await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
return {
success: false,
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
status: 429,
};
}
if (
token === "this_is_just_a_preview_token" &&
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
) {
return { success: true, team_id: "preview" };
// check the origin of the request and make sure it's from firecrawl.dev
// const origin = req.headers.origin;
// if (origin && origin.includes("firecrawl.dev")){
// return { success: true, team_id: "preview" };
// }
// if(process.env.ENV !== "production") {
// return { success: true, team_id: "preview" };
// }
// return { success: false, error: "Unauthorized: Invalid token", status: 401 };
}
// make sure api key is valid, based on the api_keys table in supabase
if (!subscriptionData) {
normalizedApi = parseApi(token);
const { data, error } = await supabase_service
.from("api_keys")
.select("*")
.eq("key", normalizedApi);
if (error || !data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error?.message ?? "data is empty"}`);
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401,
};
}
subscriptionData = data[0];
}
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
}
function getPlanByPriceId(price_id: string) {
switch (price_id) {
case process.env.STRIPE_PRICE_ID_STARTER:
return 'starter';
case process.env.STRIPE_PRICE_ID_STANDARD:
return 'standard';
case process.env.STRIPE_PRICE_ID_SCALE:
return 'scale';
case process.env.STRIPE_PRICE_ID_HOBBY:
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
return 'hobby';
case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
return 'standardnew';
case process.env.STRIPE_PRICE_ID_GROWTH:
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
return 'growth';
default:
return 'free';
}
}


@ -0,0 +1,71 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabase_service } from "../../src/services/supabase";
import { billTeam } from "../../src/services/billing/credit_billing";
import { Logger } from "../../src/lib/logger";
export async function crawlCancelController(req: Request, res: Response) {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
if (!success) {
return res.status(status).json({ error });
}
const job = await getWebScraperQueue().getJob(req.params.jobId);
if (!job) {
return res.status(404).json({ error: "Job not found" });
}
// check if the job belongs to the team
if (useDbAuthentication) {
const { data, error: supaError } = await supabase_service
.from("bulljobs_teams")
.select("*")
.eq("job_id", req.params.jobId)
.eq("team_id", team_id);
if (supaError) {
return res.status(500).json({ error: supaError.message });
}
if (data.length === 0) {
return res.status(403).json({ error: "Unauthorized" });
}
}
const jobState = await job.getState();
const { partialDocs } = await job.progress();
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
Logger.info("Billing team for partial docs...");
// Note: the credits billed here might be lower than the actual amount
// because some scraping promises may not have resolved yet
await billTeam(team_id, partialDocs.length);
}
try {
await getWebScraperQueue().client.del(job.lockKey());
await job.takeLock();
await job.discard();
await job.moveToFailed(Error("Job cancelled by user"), true);
} catch (error) {
Logger.error(error);
}
const newJobState = await job.getState();
res.json({
status: "cancelled"
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,89 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../../src/services/queue-service";
import { supabaseGetJobById } from "../../../src/lib/supabase-jobs";
import { Logger } from "../../../src/lib/logger";
import { v4 as uuidv4 } from "uuid";
export async function crawlStatusController(req: Request, res: Response) {
// TODO: validate req.params.jobId
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
if (!success) {
return res.status(status).json({ error });
}
// const job = await getWebScraperQueue().getJob(req.params.jobId);
// if (!job) {
// return res.status(404).json({ error: "Job not found" });
// }
// const { current, current_url, total, current_step, partialDocs } = await job.progress();
// let data = job.returnvalue;
// if (process.env.USE_DB_AUTHENTICATION === "true") {
// const supabaseData = await supabaseGetJobById(req.params.jobId);
// if (supabaseData) {
// data = supabaseData.docs;
// }
// }
// const jobStatus = await job.getState();
// mock:
const id = uuidv4();
const result = {
totalCount: 100,
creditsUsed: 2,
expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
status: "scraping", // scraping, completed, failed
next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
data: [{
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
},
{
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
}]
}
res.status(200).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,139 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
import { billTeam } from "../../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
export async function crawlController(req: Request, res: Response) {
// expected req.body
// req.body = {
// url: string
// crawlerOptions: {
// includePaths: string[]
// excludePaths: string[]
// maxDepth: number
// limit: number
// allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME???
// allowExternalLinks: boolean
// ignoreSitemap: boolean
// }
// scrapeOptions: Exclude<Scrape, "url">
// }
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ error });
}
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ error: "Idempotency key already used" });
}
try {
createIdempotencyKey(req);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits" });
}
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
try {
url = checkAndUpdateURL(url).url;
} catch (error) {
return res.status(400).json({ error: 'Invalid Url' });
}
// TODO: add job to queue
const id = uuidv4();
return res.status(200).json({ jobId: id, url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}` });
// const mode = req.body.mode ?? "crawl";
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.progress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const job = await addWebScraperJob({
// url: url,
// mode: mode ?? "crawl", // fix for single urls not working
// crawlerOptions: crawlerOptions,
// team_id: team_id,
// pageOptions: pageOptions,
// origin: req.body.origin ?? defaultOrigin,
// });
// await logCrawl(job.id.toString(), team_id);
// res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,46 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { Logger } from "../../src/lib/logger";
export async function crawlPreviewController(req: Request, res: Response) {
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Preview
);
if (!success) {
return res.status(status).json({ error });
}
// authenticate on supabase
const url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
}
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
const job = await addWebScraperJob({
url: url,
mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
team_id: "preview",
pageOptions: pageOptions,
origin: "website-preview",
});
res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,24 @@
import { AuthResponse, RateLimiterMode } from "../types";
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
export const keyAuthController = async (req: Request, res: Response) => {
try {
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status } = await authenticateUser(
req,
res
);
if (!success) {
return res.status(status).json({ error });
}
// if success, return success: true
return res.status(200).json({ success: true });
} catch (error) {
return res.status(500).json({ error: error.message });
}
};


@ -0,0 +1,6 @@
import { Request, Response } from "express";
export async function livenessController(req: Request, res: Response) {
// TODO: add checks that the application is live and healthy, e.g. verify the Redis connection
res.status(200).json({ status: "ok" });
}


@ -0,0 +1,128 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
import { billTeam } from "../../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
export async function mapController(req: Request, res: Response) {
// expected req.body
// req.body = {
// url: string
// ignoreSitemap: true??
// other crawler options?
// }
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ error });
}
// if (req.headers["x-idempotency-key"]) {
// const isIdempotencyValid = await validateIdempotencyKey(req);
// if (!isIdempotencyValid) {
// return res.status(409).json({ error: "Idempotency key already used" });
// }
// try {
// createIdempotencyKey(req);
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const { success: creditsCheckSuccess, message: creditsCheckMessage } =
// await checkTeamCredits(team_id, 1);
// if (!creditsCheckSuccess) {
// return res.status(402).json({ error: "Insufficient credits" });
// }
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
try {
url = checkAndUpdateURL(url).url;
} catch (error) {
return res.status(400).json({ error: 'Invalid Url' });
}
return res.status(200).json({ urls: [ "test1", "test2" ] });
// const mode = req.body.mode ?? "crawl";
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.progress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const job = await addWebScraperJob({
// url: url,
// mode: mode ?? "crawl", // fix for single urls not working
// crawlerOptions: crawlerOptions,
// team_id: team_id,
// pageOptions: pageOptions,
// origin: req.body.origin ?? defaultOrigin,
// });
// await logCrawl(job.id.toString(), team_id);
// res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,6 @@
import { Request, Response } from "express";
export async function readinessController(req: Request, res: Response) {
// TODO: add checks to verify the application is ready to serve traffic
res.status(200).json({ status: "ok" });
}


@ -0,0 +1,253 @@
// import { ExtractorOptions, PageOptions } from './../../lib/entities';
import { Request, Response } from "express";
// import { WebScraperDataProvider } from "../../scraper/WebScraper";
// import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../types";
// import { logJob } from "../../services/logging/log_job";
// import { Document } from "../../lib/entities";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
// import { numTokensFromString } from '../../lib/LLM-extraction/helpers';
// import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../../../src/lib/default-values';
// import { v4 as uuidv4 } from "uuid";
import { Logger } from '../../lib/logger';
import { checkAndUpdateURL } from '../../lib/validateUrl';
export async function scrapeController(req: Request, res: Response) {
let url = req.body.url;
if (!url) {
return res.status(400).json({ success: false, error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res.status(403).json({ success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
}
try {
url = checkAndUpdateURL(url).url;
} catch (error) {
return res.status(400).json({ success: false, error: "Invalid URL" });
}
// TODO: check req.body
// mockup req.body
// req.body = {
// url: "test",
// headers: {
// "x-key": "test"
// },
// formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
// includeTags: ["test"],
// excludeTags: ["test"],
// onlyMainContent: false,
// timeout: 30000,
// waitFor: number
// }
try {
let earlyReturn = false;
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Scrape
);
if (!success) {
return res.status(status).json({ error });
}
// check credits
const result = {
success: true,
warning: "test",
data: {
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
}
}
return res.status(200).json(result);
// const crawlerOptions = req.body.crawlerOptions ?? {};
// const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
// const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
// const origin = req.body.origin ?? defaultOrigin;
// let timeout = req.body.timeout ?? defaultTimeout;
// if (extractorOptions.mode.includes("llm-extraction")) {
// pageOptions.onlyMainContent = true;
// timeout = req.body.timeout ?? 90000;
// }
// const checkCredits = async () => {
// try {
// const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
// if (!creditsCheckSuccess) {
// earlyReturn = true;
// return res.status(402).json({ error: "Insufficient credits" });
// }
// } catch (error) {
// Logger.error(error);
// earlyReturn = true;
// return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
// }
// };
// await checkCredits();
// const jobId = uuidv4();
// const startTime = new Date().getTime();
// const result = await scrapeHelper(
// jobId,
// req,
// team_id,
// crawlerOptions,
// pageOptions,
// extractorOptions,
// timeout,
// plan
// );
// const endTime = new Date().getTime();
// const timeTakenInSeconds = (endTime - startTime) / 1000;
// const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
// if (result.success) {
// let creditsToBeBilled = 1; // Assuming 1 credit per document
// const creditsPerLLMExtract = 50;
// if (extractorOptions.mode.includes("llm-extraction")) {
// // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
// creditsToBeBilled += creditsPerLLMExtract;
// }
// let startTimeBilling = new Date().getTime();
// if (earlyReturn) {
// // Don't bill if we're early returning
// return;
// }
// const billingResult = await billTeam(
// team_id,
// creditsToBeBilled
// );
// if (!billingResult.success) {
// return res.status(402).json({
// success: false,
// error: "Failed to bill team. Insufficient credits or subscription not found.",
// });
// }
// }
// logJob({
// job_id: jobId,
// success: result.success,
// message: result.error,
// num_docs: 1,
// docs: [result.data],
// time_taken: timeTakenInSeconds,
// team_id: team_id,
// mode: "scrape",
// url: req.body.url,
// crawlerOptions: crawlerOptions,
// pageOptions: pageOptions,
// origin: origin,
// extractor_options: extractorOptions,
// num_tokens: numTokens,
// });
// return res.status(result.returnCode).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
// export async function scrapeHelper(
// jobId: string,
// req: Request,
// team_id: string,
// crawlerOptions: any,
// pageOptions: PageOptions,
// extractorOptions: ExtractorOptions,
// timeout: number,
// plan?: string
// ): Promise<{
// success: boolean;
// error?: string;
// data?: Document;
// returnCode: number;
// }> {
// const url = req.body.url;
// if (!url) {
// return { success: false, error: "Url is required", returnCode: 400 };
// }
// if (isUrlBlocked(url)) {
// return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
// }
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId,
// mode: "single_urls",
// urls: [url],
// crawlerOptions: {
// ...crawlerOptions,
// },
// pageOptions: pageOptions,
// extractorOptions: extractorOptions,
// });
// const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
// setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
// );
// const docsPromise = a.getDocuments(false);
// let docs;
// try {
// docs = await Promise.race([docsPromise, timeoutPromise]);
// } catch (error) {
// return error;
// }
// // make sure doc.content is not empty
// let filteredDocs = docs.filter(
// (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
// );
// if (filteredDocs.length === 0) {
// return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
// }
// // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
// if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
// filteredDocs.forEach(doc => {
// delete doc.rawHtml;
// });
// }
// return {
// success: true,
// data: filteredDocs[0],
// returnCode: 200,
// };
// }


@ -0,0 +1,197 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../lib/logger";
export async function searchHelper(
jobId: string,
req: Request,
team_id: string,
crawlerOptions: any,
pageOptions: PageOptions,
searchOptions: SearchOptions,
): Promise<{
success: boolean;
error?: string;
data?: any;
returnCode: number;
}> {
const query = req.body.query;
const advanced = false;
if (!query) {
return { success: false, error: "Query is required", returnCode: 400 };
}
const tbs = searchOptions.tbs ?? null;
const filter = searchOptions.filter ?? null;
const num_results = searchOptions.limit ?? 7;
const num_results_buffer = Math.floor(num_results * 1.5);
let res = await search({
query: query,
advanced: advanced,
num_results: num_results_buffer,
tbs: tbs,
filter: filter,
lang: searchOptions.lang ?? "en",
country: searchOptions.country ?? "us",
location: searchOptions.location,
});
let justSearch = pageOptions.fetchPageContent === false;
if (justSearch) {
const billingResult = await billTeam(
team_id,
res.length
);
if (!billingResult.success) {
return {
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
returnCode: 402,
};
}
return { success: true, data: res, returnCode: 200 };
}
res = res.filter((r) => !isUrlBlocked(r.url));
if (res.length > num_results) {
res = res.slice(0, num_results);
}
if (res.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };
}
// filter out social media links
const a = new WebScraperDataProvider();
await a.setOptions({
jobId,
mode: "single_urls",
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
crawlerOptions: {
...crawlerOptions,
},
pageOptions: {
...pageOptions,
onlyMainContent: pageOptions?.onlyMainContent ?? true,
fetchPageContent: pageOptions?.fetchPageContent ?? true,
includeHtml: pageOptions?.includeHtml ?? false,
removeTags: pageOptions?.removeTags ?? [],
fallback: false,
},
});
const docs = await a.getDocuments(false);
if (docs.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };
}
// make sure doc.content is not empty
const filteredDocs = docs.filter(
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
);
if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200, data: docs };
}
const billingResult = await billTeam(
team_id,
filteredDocs.length
);
if (!billingResult.success) {
return {
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
returnCode: 402,
};
}
return {
success: true,
data: filteredDocs,
returnCode: 200,
};
}
export async function searchController(req: Request, res: Response) {
try {
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Search
);
if (!success) {
return res.status(status).json({ error });
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? {
includeHtml: false,
onlyMainContent: true,
fetchPageContent: true,
removeTags: [],
fallback: false,
};
const origin = req.body.origin ?? "api";
const searchOptions = req.body.searchOptions ?? { limit: 7 };
const jobId = uuidv4();
try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits" });
}
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: "Internal server error" });
}
const startTime = new Date().getTime();
const result = await searchHelper(
jobId,
req,
team_id,
crawlerOptions,
pageOptions,
searchOptions,
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({
job_id: jobId,
success: result.success,
message: result.error,
num_docs: result.data ? result.data.length : 0,
docs: result.data,
time_taken: timeTakenInSeconds,
team_id: team_id,
mode: "search",
url: req.body.query,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
origin: origin,
});
return res.status(result.returnCode).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,42 @@
import { Request, Response } from "express";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
try {
const job = await getWebScraperQueue().getJob(req.params.jobId);
if (!job) {
return res.status(404).json({ error: "Job not found" });
}
const { current, current_url, total, current_step, partialDocs } = await job.progress();
let data = job.returnvalue;
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(req.params.jobId);
if (supabaseData) {
data = supabaseData.docs;
}
}
let jobStatus = await job.getState();
if (jobStatus === 'waiting' || jobStatus === 'stuck') {
jobStatus = 'active';
}
res.json({
status: jobStatus,
// progress: job.progress(),
current,
current_url,
current_step,
total,
data: data ? data : null,
partial_data: jobStatus == 'completed' ? [] : partialDocs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -13,6 +13,7 @@ import { ScrapeEvents } from "./lib/scrape-events";
import http from 'node:http';
import https from 'node:https';
import CacheableLookup from 'cacheable-lookup';
import { v1Router } from "./routes/v1";
const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
@ -78,6 +79,7 @@ if (cluster.isMaster) {
// register router
app.use(v0Router);
app.use(v1Router);
app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002;


@ -0,0 +1,38 @@
const protocolIncluded = (url: string) => {
// if "://" is not at the start of the url, we later assume http (maybe https?)
// the regex checks whether "://" appears before any "."
return /^([^.:]+:\/\/)/.test(url);
}
const getURLobj = (s: string) => {
// new URL() throws if the protocol is missing, e.g. "google.com"
let error = false;
let urlObj = {};
try {
urlObj = new URL(s);
} catch (err) {
error = true;
}
return { error, urlObj };
};
export const checkAndUpdateURL = (url: string) => {
if (!protocolIncluded(url)) {
url = `http://${url}`;
}
const { error, urlObj } = getURLobj(url);
if (error) {
throw new Error("Invalid URL");
}
const typedUrlObj = urlObj as URL;
if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL");
}
return { urlObj: typedUrlObj, url: url };
}
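
A few illustrative calls (not part of the commit), based on the definition above:

checkAndUpdateURL("google.com");            // -> { url: "http://google.com", urlObj: URL { ... } }  (protocol added)
checkAndUpdateURL("https://firecrawl.dev"); // -> returned unchanged, protocol already present
checkAndUpdateURL("ftp://example.com");     // -> throws Error("Invalid URL"), only http and https are accepted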

apps/api/src/routes/v1.ts

@ -0,0 +1,34 @@
import express from "express";
import { crawlController } from "../../src/controllers/v1/crawl";
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { scrapeController } from "../../src/controllers/v1/scrape";
import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { mapController } from "../../src/controllers/v1/map";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel";
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";
export const v1Router = express.Router();
v1Router.post("/v1/scrape", scrapeController);
v1Router.post("/v1/crawl", crawlController);
v1Router.get("/v1/crawl/:jobId", crawlStatusController);
// v1Router.post("/v1/crawlWebsitePreview", crawlPreviewController);
// v1Router.delete("/v1/crawl/cancel/:jobId", crawlCancelController);
// v1Router.get("/v1/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// // Auth route for key based authentication
// v1Router.get("/v1/keyAuth", keyAuthController);
// // Search routes
// v1Router.post("/v1/search", searchController);
// Health/Probe routes
// v1Router.get("/v1/health/liveness", livenessController);
// v1Router.get("/v1/health/readiness", readinessController);
v1Router.post("/v1/map", mapController);