mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00

wip: map, crawl, scrape mockups

This commit is contained in:
parent b60ee30dba
commit 6cdf4c68ec
47
apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP
Normal file
@@ -0,0 +1,47 @@
import { crawlController } from '../crawl'
import { Request, Response } from 'express';
import { authenticateUser } from '../auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create';
import { validateIdempotencyKey } from '../../services/idempotency/validate';
import { v4 as uuidv4 } from 'uuid';

jest.mock('../auth', () => ({
  authenticateUser: jest.fn().mockResolvedValue({
    success: true,
    team_id: 'team123',
    error: null,
    status: 200
  }),
  reduce: jest.fn()
}));
jest.mock('../../services/idempotency/validate');

describe('crawlController', () => {
  it('should prevent duplicate requests using the same idempotency key', async () => {
    const req = {
      headers: {
        'x-idempotency-key': await uuidv4(),
        'Authorization': `Bearer ${process.env.TEST_API_KEY}`
      },
      body: {
        url: 'https://mendable.ai'
      }
    } as unknown as Request;
    const res = {
      status: jest.fn().mockReturnThis(),
      json: jest.fn()
    } as unknown as Response;

    // Mock the idempotency key validation to return false for the second call
    (validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false);

    // First request should succeed
    await crawlController(req, res);
    expect(res.status).not.toHaveBeenCalledWith(409);

    // Second request with the same key should fail
    await crawlController(req, res);
    expect(res.status).toHaveBeenCalledWith(409);
    expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' });
  });
});
87
apps/api/src/controllers/v1/admin/queue.ts.WIP
Normal file
@@ -0,0 +1,87 @@
import { Request, Response } from "express";

import { Job } from "bull";
import { Logger } from "../../lib/logger";
import { getWebScraperQueue } from "../../services/queue-service";
import { checkAlerts } from "../../services/alerts";

export async function cleanBefore24hCompleteJobsController(
  req: Request,
  res: Response
) {
  Logger.info("🐂 Cleaning jobs older than 24h");
  try {
    const webScraperQueue = getWebScraperQueue();
    const batchSize = 10;
    const numberOfBatches = 9; // Adjust based on your needs
    const completedJobsPromises: Promise<Job[]>[] = [];
    for (let i = 0; i < numberOfBatches; i++) {
      completedJobsPromises.push(
        webScraperQueue.getJobs(
          ["completed"],
          i * batchSize,
          i * batchSize + batchSize,
          true
        )
      );
    }
    const completedJobs: Job[] = (
      await Promise.all(completedJobsPromises)
    ).flat();
    const before24hJobs =
      completedJobs.filter(
        (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
      ) || [];

    let count = 0;

    if (!before24hJobs) {
      return res.status(200).send(`No jobs to remove.`);
    }

    for (const job of before24hJobs) {
      try {
        await job.remove();
        count++;
      } catch (jobError) {
        Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
      }
    }
    return res.status(200).send(`Removed ${count} completed jobs.`);
  } catch (error) {
    Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
    return res.status(500).send("Failed to clean jobs");
  }
}


export async function checkQueuesController(req: Request, res: Response) {
  try {
    await checkAlerts();
    return res.status(200).send("Alerts initialized");
  } catch (error) {
    Logger.debug(`Failed to initialize alerts: ${error}`);
    return res.status(500).send("Failed to initialize alerts");
  }
}

// Use this as a "health check" that way we dont destroy the server
export async function queuesController(req: Request, res: Response) {
  try {
    const webScraperQueue = getWebScraperQueue();

    const [webScraperActive] = await Promise.all([
      webScraperQueue.getActiveCount(),
    ]);

    const noActiveJobs = webScraperActive === 0;
    // 200 if no active jobs, 503 if there are active jobs
    return res.status(noActiveJobs ? 200 : 500).json({
      webScraperActive,
      noActiveJobs,
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
85
apps/api/src/controllers/v1/admin/redis-health.ts.WIP
Normal file
@@ -0,0 +1,85 @@
import { Request, Response } from "express";
import Redis from "ioredis";
import { Logger } from "../../lib/logger";
import { redisRateLimitClient } from "../../services/rate-limiter";

export async function redisHealthController(req: Request, res: Response) {
  const retryOperation = async (operation, retries = 3) => {
    for (let attempt = 1; attempt <= retries; attempt++) {
      try {
        return await operation();
      } catch (error) {
        if (attempt === retries) throw error;
        Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
        await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
      }
    }
  };

  try {
    const queueRedis = new Redis(process.env.REDIS_URL);

    const testKey = "test";
    const testValue = "test";

    // Test queueRedis
    let queueRedisHealth;
    try {
      await retryOperation(() => queueRedis.set(testKey, testValue));
      queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
      await retryOperation(() => queueRedis.del(testKey));
    } catch (error) {
      Logger.error(`queueRedis health check failed: ${error}`);
      queueRedisHealth = null;
    }

    // Test redisRateLimitClient
    let redisRateLimitHealth;
    try {
      await retryOperation(() => redisRateLimitClient.set(testKey, testValue));
      redisRateLimitHealth = await retryOperation(() =>
        redisRateLimitClient.get(testKey)
      );
      await retryOperation(() => redisRateLimitClient.del(testKey));
    } catch (error) {
      Logger.error(`redisRateLimitClient health check failed: ${error}`);
      redisRateLimitHealth = null;
    }

    const healthStatus = {
      queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
      redisRateLimitClient:
        redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
    };

    if (
      healthStatus.queueRedis === "healthy" &&
      healthStatus.redisRateLimitClient === "healthy"
    ) {
      Logger.info("Both Redis instances are healthy");
      return res.status(200).json({ status: "healthy", details: healthStatus });
    } else {
      Logger.info(
        `Redis instances health check: ${JSON.stringify(healthStatus)}`
      );
      // await sendSlackWebhook(
      //   `[REDIS DOWN] Redis instances health check: ${JSON.stringify(
      //     healthStatus
      //   )}`,
      //   true
      // );
      return res
        .status(500)
        .json({ status: "unhealthy", details: healthStatus });
    }
  } catch (error) {
    Logger.error(`Redis health check failed: ${error}`);
    // await sendSlackWebhook(
    //   `[REDIS DOWN] Redis instances health check: ${error.message}`,
    //   true
    // );
    return res
      .status(500)
      .json({ status: "unhealthy", message: error.message });
  }
}
220
apps/api/src/controllers/v1/auth.ts
Normal file
@@ -0,0 +1,220 @@
import { parseApi } from "../../../src/lib/parseApi";
import { getRateLimiter, } from "../../../src/services/rate-limiter";
import { AuthResponse, NotificationType, RateLimiterMode } from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase";
import { withAuth } from "../../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
import { sendNotification } from "../../../src/services/notification/email_notification";
import { Logger } from "../../../src/lib/logger";

export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
  return withAuth(supaAuthenticateUser)(req, res, mode);
}
function setTrace(team_id: string, api_key: string) {
  try {
    setTraceAttributes({
      team_id,
      api_key
    });
  } catch (error) {
    Logger.error(`Error setting trace attributes: ${error.message}`);
  }

}
export async function supaAuthenticateUser(
  req,
  res,
  mode?: RateLimiterMode
): Promise<{
  success: boolean;
  team_id?: string;
  error?: string;
  status?: number;
  plan?: string;
}> {
  const authHeader = req.headers.authorization;
  if (!authHeader) {
    return { success: false, error: "Unauthorized", status: 401 };
  }
  const token = authHeader.split(" ")[1]; // Extract the token from "Bearer <token>"
  if (!token) {
    return {
      success: false,
      error: "Unauthorized: Token missing",
      status: 401,
    };
  }

  const incomingIP = (req.headers["x-forwarded-for"] ||
    req.socket.remoteAddress) as string;
  const iptoken = incomingIP + token;

  let rateLimiter: RateLimiterRedis;
  let subscriptionData: { team_id: string, plan: string } | null = null;
  let normalizedApi: string;

  let team_id: string;

  if (token == "this_is_just_a_preview_token") {
    rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
    team_id = "preview";
  } else {
    normalizedApi = parseApi(token);

    const { data, error } = await supabase_service.rpc(
      'get_key_and_price_id_2', { api_key: normalizedApi }
    );
    // get_key_and_price_id_2 rpc definition:
    // create or replace function get_key_and_price_id_2(api_key uuid)
    // returns table(key uuid, team_id uuid, price_id text) as $$
    // begin
    //   if api_key is null then
    //     return query
    //     select null::uuid as key, null::uuid as team_id, null::text as price_id;
    //   end if;

    //   return query
    //   select ak.key, ak.team_id, s.price_id
    //   from api_keys ak
    //   left join subscriptions s on ak.team_id = s.team_id
    //   where ak.key = api_key;
    // end;
    // $$ language plpgsql;

    if (error) {
      Logger.warn(`Error fetching key and price_id: ${error.message}`);
    } else {
      // console.log('Key and Price ID:', data);
    }


    if (error || !data || data.length === 0) {
      Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
      return {
        success: false,
        error: "Unauthorized: Invalid token",
        status: 401,
      };
    }
    const internal_team_id = data[0].team_id;
    team_id = internal_team_id;

    const plan = getPlanByPriceId(data[0].price_id);
    // HyperDX Logging
    setTrace(team_id, normalizedApi);
    subscriptionData = {
      team_id: team_id,
      plan: plan
    }
    switch (mode) {
      case RateLimiterMode.Crawl:
        rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
        break;
      case RateLimiterMode.Scrape:
        rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
        break;
      case RateLimiterMode.Search:
        rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan);
        break;
      case RateLimiterMode.CrawlStatus:
        rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
        break;

      case RateLimiterMode.Preview:
        rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
        break;
      default:
        rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token);
        break;
      // case RateLimiterMode.Search:
      //   rateLimiter = await searchRateLimiter(RateLimiterMode.Search, token);
      //   break;
    }
  }

  const team_endpoint_token = token === "this_is_just_a_preview_token" ? iptoken : team_id;

  try {
    await rateLimiter.consume(team_endpoint_token);
  } catch (rateLimiterRes) {
    Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);

    // We can only send a rate limit email every 7 days, send notification already has the date in between checking
    const startDate = new Date();
    const endDate = new Date();
    endDate.setDate(endDate.getDate() + 7);
    // await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
    return {
      success: false,
      error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
      status: 429,
    };
  }

  if (
    token === "this_is_just_a_preview_token" &&
    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
  ) {
    return { success: true, team_id: "preview" };
    // check the origin of the request and make sure its from firecrawl.dev
    // const origin = req.headers.origin;
    // if (origin && origin.includes("firecrawl.dev")){
    //   return { success: true, team_id: "preview" };
    // }
    // if(process.env.ENV !== "production") {
    //   return { success: true, team_id: "preview" };
    // }

    // return { success: false, error: "Unauthorized: Invalid token", status: 401 };
  }

  // make sure api key is valid, based on the api_keys table in supabase
  if (!subscriptionData) {
    normalizedApi = parseApi(token);

    const { data, error } = await supabase_service
      .from("api_keys")
      .select("*")
      .eq("key", normalizedApi);


    if (error || !data || data.length === 0) {
      Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
      return {
        success: false,
        error: "Unauthorized: Invalid token",
        status: 401,
      };
    }

    subscriptionData = data[0];
  }

  return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
}
function getPlanByPriceId(price_id: string) {
  switch (price_id) {
    case process.env.STRIPE_PRICE_ID_STARTER:
      return 'starter';
    case process.env.STRIPE_PRICE_ID_STANDARD:
      return 'standard';
    case process.env.STRIPE_PRICE_ID_SCALE:
      return 'scale';
    case process.env.STRIPE_PRICE_ID_HOBBY:
    case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
      return 'hobby';
    case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
    case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
      return 'standardnew';
    case process.env.STRIPE_PRICE_ID_GROWTH:
    case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
      return 'growth';
    default:
      return 'free';
  }
}
71
apps/api/src/controllers/v1/crawl-cancel.ts.WIP
Normal file
@@ -0,0 +1,71 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabase_service } from "../../src/services/supabase";
import { billTeam } from "../../src/services/billing/credit_billing";
import { Logger } from "../../src/lib/logger";

export async function crawlCancelController(req: Request, res: Response) {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';

    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.CrawlStatus
    );
    if (!success) {
      return res.status(status).json({ error });
    }
    const job = await getWebScraperQueue().getJob(req.params.jobId);
    if (!job) {
      return res.status(404).json({ error: "Job not found" });
    }

    // check if the job belongs to the team
    if (useDbAuthentication) {
      const { data, error: supaError } = await supabase_service
        .from("bulljobs_teams")
        .select("*")
        .eq("job_id", req.params.jobId)
        .eq("team_id", team_id);
      if (supaError) {
        return res.status(500).json({ error: supaError.message });
      }

      if (data.length === 0) {
        return res.status(403).json({ error: "Unauthorized" });
      }
    }

    const jobState = await job.getState();
    const { partialDocs } = await job.progress();

    if (partialDocs && partialDocs.length > 0 && jobState === "active") {
      Logger.info("Billing team for partial docs...");
      // Note: the credits that we will bill them here might be lower than the actual
      // due to promises that are not yet resolved
      await billTeam(team_id, partialDocs.length);
    }

    try {
      await getWebScraperQueue().client.del(job.lockKey());
      await job.takeLock();
      await job.discard();
      await job.moveToFailed(Error("Job cancelled by user"), true);
    } catch (error) {
      Logger.error(error);
    }

    const newJobState = await job.getState();

    res.json({
      status: "cancelled"
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
89
apps/api/src/controllers/v1/crawl-status.ts
Normal file
@@ -0,0 +1,89 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../../src/services/queue-service";
import { supabaseGetJobById } from "../../../src/lib/supabase-jobs";
import { Logger } from "../../../src/lib/logger";
import { v4 as uuidv4 } from "uuid";

export async function crawlStatusController(req: Request, res: Response) {
  // TODO: validate req.params.jobId

  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.CrawlStatus
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    // const job = await getWebScraperQueue().getJob(req.params.jobId);
    // if (!job) {
    //   return res.status(404).json({ error: "Job not found" });
    // }

    // const { current, current_url, total, current_step, partialDocs } = await job.progress();

    // let data = job.returnvalue;
    // if (process.env.USE_DB_AUTHENTICATION === "true") {
    //   const supabaseData = await supabaseGetJobById(req.params.jobId);

    //   if (supabaseData) {
    //     data = supabaseData.docs;
    //   }
    // }

    // const jobStatus = await job.getState();

    // mock:
    const id = uuidv4();
    const result = {
      totalCount: 100,
      creditsUsed: 2,
      expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
      status: "scraping", // scraping, completed, failed
      next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
      data: [{
        markdown: "test",
        content: "test",
        html: "test",
        rawHtml: "test",
        linksOnPage: ["test1", "test2"],
        screenshot: "test",
        metadata: {
          title: "test",
          description: "test",
          language: "test",
          sourceURL: "test",
          statusCode: 200,
          error: "test"
        }
      },
      {
        markdown: "test",
        content: "test",
        html: "test",
        rawHtml: "test",
        linksOnPage: ["test1", "test2"],
        screenshot: "test",
        metadata: {
          title: "test",
          description: "test",
          language: "test",
          sourceURL: "test",
          statusCode: 200,
          error: "test"
        }
      }]
    }

    res.status(200).json(result);
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
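The mocked response above already carries the next polling link. Purely as an illustration (the host, API key, and jobId are placeholders, and the jobId would come from POST /v1/crawl once the queue is wired up), a client polling this endpoint might look like the sketch below.

// Hypothetical polling call against the mocked GET /v1/crawl/:jobId endpoint above.
// Host, API key, and jobId are placeholders; the controller currently returns the
// hard-coded result (totalCount, creditsUsed, expiresAt, status, next, data).
const jobId = "00000000-0000-0000-0000-000000000000"; // would be returned by POST /v1/crawl
const statusRes = await fetch(`http://localhost:3002/v1/crawl/${jobId}`, {
  headers: { Authorization: `Bearer ${process.env.TEST_API_KEY}` },
});
const { status, totalCount, next, data } = await statusRes.json();
if (status === "scraping") {
  // keep polling `next` until status becomes "completed" or "failed"
}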
139
apps/api/src/controllers/v1/crawl.ts
Normal file
@@ -0,0 +1,139 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
import { billTeam } from "../../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";

export async function crawlController(req: Request, res: Response) {
  // expected req.body

  // req.body = {
  //   url: string
  //   crawlerOptions: {
  //     includePaths: string[]
  //     excludePaths: string[]
  //     maxDepth: number
  //     limit: number
  //     allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME???
  //     allowExternalLinks: boolean
  //     ignoreSitemap: number
  //   }
  //   scrapeOptions: Exclude<Scrape, "url">
  // }


  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ error: "Idempotency key already used" });
      }
      try {
        createIdempotencyKey(req);
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const { success: creditsCheckSuccess, message: creditsCheckMessage } =
      await checkTeamCredits(team_id, 1);
    if (!creditsCheckSuccess) {
      return res.status(402).json({ error: "Insufficient credits" });
    }

    let url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }

    if (isUrlBlocked(url)) {
      return res
        .status(403)
        .json({
          error:
            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
        });
    }

    try {
      url = checkAndUpdateURL(url);
    } catch (error) {
      return res.status(400).json({ error: 'Invalid Url' });
    }

    // TODO: add job to queue

    const id = uuidv4();
    return res.status(200).json({ jobId: id, url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}` });

    // const mode = req.body.mode ?? "crawl";

    // const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
    // const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };

    // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
    //   try {
    //     const a = new WebScraperDataProvider();
    //     await a.setOptions({
    //       jobId: uuidv4(),
    //       mode: "single_urls",
    //       urls: [url],
    //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
    //       pageOptions: pageOptions,
    //     });

    //     const docs = await a.getDocuments(false, (progress) => {
    //       job.progress({
    //         current: progress.current,
    //         total: progress.total,
    //         current_step: "SCRAPING",
    //         current_url: progress.currentDocumentUrl,
    //       });
    //     });
    //     return res.json({
    //       success: true,
    //       documents: docs,
    //     });
    //   } catch (error) {
    //     Logger.error(error);
    //     return res.status(500).json({ error: error.message });
    //   }
    // }

    // const job = await addWebScraperJob({
    //   url: url,
    //   mode: mode ?? "crawl", // fix for single urls not working
    //   crawlerOptions: crawlerOptions,
    //   team_id: team_id,
    //   pageOptions: pageOptions,
    //   origin: req.body.origin ?? defaultOrigin,
    // });

    // await logCrawl(job.id.toString(), team_id);

    // res.json({ jobId: job.id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
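Going by the "expected req.body" comment in crawlController, a request against this mocked endpoint might look like the sketch below. The host, API key, and option values are placeholders rather than part of the commit; the idempotency header is optional and a reused key is rejected with 409.

import { randomUUID } from "node:crypto";

// Hypothetical client call against the mocked POST /v1/crawl endpoint above.
// Body fields mirror the "expected req.body" comment; the response is currently
// the mock { jobId, url } pair.
const crawlRes = await fetch("http://localhost:3002/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.TEST_API_KEY}`,
    "x-idempotency-key": randomUUID(), // optional; reusing a key returns 409
  },
  body: JSON.stringify({
    url: "https://mendable.ai",
    crawlerOptions: { includePaths: [], excludePaths: [], maxDepth: 2, limit: 10 },
  }),
});
const { jobId, url: statusUrl } = await crawlRes.json();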
46
apps/api/src/controllers/v1/crawlPreview.ts.WIP
Normal file
@@ -0,0 +1,46 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { Logger } from "../../src/lib/logger";

export async function crawlPreviewController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Preview
    );
    if (!success) {
      return res.status(status).json({ error });
    }
    // authenticate on supabase
    const url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }

    if (isUrlBlocked(url)) {
      return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
    }

    const mode = req.body.mode ?? "crawl";
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };

    const job = await addWebScraperJob({
      url: url,
      mode: mode ?? "crawl", // fix for single urls not working
      crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
      team_id: "preview",
      pageOptions: pageOptions,
      origin: "website-preview",
    });

    res.json({ jobId: job.id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
24
apps/api/src/controllers/v1/keyAuth.ts.WIP
Normal file
@@ -0,0 +1,24 @@

import { AuthResponse, RateLimiterMode } from "../types";

import { Request, Response } from "express";
import { authenticateUser } from "./auth";


export const keyAuthController = async (req: Request, res: Response) => {
  try {
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res
    );
    if (!success) {
      return res.status(status).json({ error });
    }
    // if success, return success: true
    return res.status(200).json({ success: true });
  } catch (error) {
    return res.status(500).json({ error: error.message });
  }
};
6
apps/api/src/controllers/v1/liveness.ts.WIP
Normal file
@@ -0,0 +1,6 @@
import { Request, Response } from "express";

export async function livenessController(req: Request, res: Response) {
  //TODO: add checks if the application is live and healthy like checking the redis connection
  res.status(200).json({ status: "ok" });
}
128
apps/api/src/controllers/v1/map.ts
Normal file
@@ -0,0 +1,128 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
import { billTeam } from "../../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";

export async function mapController(req: Request, res: Response) {
  // expected req.body

  // req.body = {
  //   url: string
  //   ignoreSitemap: true??
  //   other crawler options?
  // }


  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    // if (req.headers["x-idempotency-key"]) {
    //   const isIdempotencyValid = await validateIdempotencyKey(req);
    //   if (!isIdempotencyValid) {
    //     return res.status(409).json({ error: "Idempotency key already used" });
    //   }
    //   try {
    //     createIdempotencyKey(req);
    //   } catch (error) {
    //     Logger.error(error);
    //     return res.status(500).json({ error: error.message });
    //   }
    // }

    // const { success: creditsCheckSuccess, message: creditsCheckMessage } =
    //   await checkTeamCredits(team_id, 1);
    // if (!creditsCheckSuccess) {
    //   return res.status(402).json({ error: "Insufficient credits" });
    // }

    let url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }

    if (isUrlBlocked(url)) {
      return res
        .status(403)
        .json({
          error:
            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
        });
    }

    try {
      url = checkAndUpdateURL(url);
    } catch (error) {
      return res.status(400).json({ error: 'Invalid Url' });
    }

    return res.status(200).json({ urls: [ "test1", "test2" ] });

    // const mode = req.body.mode ?? "crawl";

    // const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
    // const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };

    // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
    //   try {
    //     const a = new WebScraperDataProvider();
    //     await a.setOptions({
    //       jobId: uuidv4(),
    //       mode: "single_urls",
    //       urls: [url],
    //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
    //       pageOptions: pageOptions,
    //     });

    //     const docs = await a.getDocuments(false, (progress) => {
    //       job.progress({
    //         current: progress.current,
    //         total: progress.total,
    //         current_step: "SCRAPING",
    //         current_url: progress.currentDocumentUrl,
    //       });
    //     });
    //     return res.json({
    //       success: true,
    //       documents: docs,
    //     });
    //   } catch (error) {
    //     Logger.error(error);
    //     return res.status(500).json({ error: error.message });
    //   }
    // }

    // const job = await addWebScraperJob({
    //   url: url,
    //   mode: mode ?? "crawl", // fix for single urls not working
    //   crawlerOptions: crawlerOptions,
    //   team_id: team_id,
    //   pageOptions: pageOptions,
    //   origin: req.body.origin ?? defaultOrigin,
    // });

    // await logCrawl(job.id.toString(), team_id);

    // res.json({ jobId: job.id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
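Same idea for the map mockup: based on its "expected req.body" comment, a call might look like the sketch below (host, key, and values are placeholders); the controller currently answers with the hard-coded urls array.

// Hypothetical client call against the mocked POST /v1/map endpoint above.
const mapRes = await fetch("http://localhost:3002/v1/map", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.TEST_API_KEY}`,
  },
  body: JSON.stringify({ url: "https://mendable.ai", ignoreSitemap: true }),
});
const { urls } = await mapRes.json(); // ["test1", "test2"] while mocked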
6
apps/api/src/controllers/v1/readiness.ts.WIP
Normal file
@@ -0,0 +1,6 @@
import { Request, Response } from "express";

export async function readinessController(req: Request, res: Response) {
  // TODO: add checks when the application is ready to serve traffic
  res.status(200).json({ status: "ok" });
}
253
apps/api/src/controllers/v1/scrape.ts
Normal file
@@ -0,0 +1,253 @@
// import { ExtractorOptions, PageOptions } from './../../lib/entities';
import { Request, Response } from "express";
// import { WebScraperDataProvider } from "../../scraper/WebScraper";
// import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../types";
// import { logJob } from "../../services/logging/log_job";
// import { Document } from "../../lib/entities";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
// import { numTokensFromString } from '../../lib/LLM-extraction/helpers';
// import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../../../src/lib/default-values';
// import { v4 as uuidv4 } from "uuid";
import { Logger } from '../../lib/logger';
import { checkAndUpdateURL } from '../../lib/validateUrl';

export async function scrapeController(req: Request, res: Response) {
  let url = req.body.url;
  if (!url) {
    return { success: false, error: "Url is required", returnCode: 400 };
  }

  if (isUrlBlocked(url)) {
    return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
  }

  try {
    url = checkAndUpdateURL(url);
  } catch (error) {
    return { success: false, error: "Invalid URL", returnCode: 400 };
  }

  // TODO: check req.body
  // mockup req.body
  // req.body = {
  //   url: "test",
  //   headers: {
  //     "x-key": "test"
  //   },
  //   formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
  //   includeTags: ["test"],
  //   excludeTags: ["test"],
  //   onlyMainContent: false,
  //   timeout: 30000,
  //   waitFor: number
  // }

  try {
    let earlyReturn = false;
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status, plan } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Scrape
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    // check credits

    const result = {
      success: true,
      warning: "test",
      data: {
        markdown: "test",
        content: "test",
        html: "test",
        rawHtml: "test",
        linksOnPage: ["test1", "test2"],
        screenshot: "test",
        metadata: {
          title: "test",
          description: "test",
          language: "test",
          sourceURL: "test",
          statusCode: 200,
          error: "test"
        }
      }
    }

    return res.status(200).json(result);

    // const crawlerOptions = req.body.crawlerOptions ?? {};
    // const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    // const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
    // const origin = req.body.origin ?? defaultOrigin;
    // let timeout = req.body.timeout ?? defaultTimeout;

    // if (extractorOptions.mode.includes("llm-extraction")) {
    //   pageOptions.onlyMainContent = true;
    //   timeout = req.body.timeout ?? 90000;
    // }

    // const checkCredits = async () => {
    //   try {
    //     const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
    //     if (!creditsCheckSuccess) {
    //       earlyReturn = true;
    //       return res.status(402).json({ error: "Insufficient credits" });
    //     }
    //   } catch (error) {
    //     Logger.error(error);
    //     earlyReturn = true;
    //     return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
    //   }
    // };


    // await checkCredits();

    // const jobId = uuidv4();

    // const startTime = new Date().getTime();
    // const result = await scrapeHelper(
    //   jobId,
    //   req,
    //   team_id,
    //   crawlerOptions,
    //   pageOptions,
    //   extractorOptions,
    //   timeout,
    //   plan
    // );
    // const endTime = new Date().getTime();
    // const timeTakenInSeconds = (endTime - startTime) / 1000;
    // const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;

    // if (result.success) {
    //   let creditsToBeBilled = 1; // Assuming 1 credit per document
    //   const creditsPerLLMExtract = 50;

    //   if (extractorOptions.mode.includes("llm-extraction")) {
    //     // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
    //     creditsToBeBilled += creditsPerLLMExtract;
    //   }

    //   let startTimeBilling = new Date().getTime();

    //   if (earlyReturn) {
    //     // Don't bill if we're early returning
    //     return;
    //   }
    //   const billingResult = await billTeam(
    //     team_id,
    //     creditsToBeBilled
    //   );
    //   if (!billingResult.success) {
    //     return res.status(402).json({
    //       success: false,
    //       error: "Failed to bill team. Insufficient credits or subscription not found.",
    //     });
    //   }
    // }

    // logJob({
    //   job_id: jobId,
    //   success: result.success,
    //   message: result.error,
    //   num_docs: 1,
    //   docs: [result.data],
    //   time_taken: timeTakenInSeconds,
    //   team_id: team_id,
    //   mode: "scrape",
    //   url: req.body.url,
    //   crawlerOptions: crawlerOptions,
    //   pageOptions: pageOptions,
    //   origin: origin,
    //   extractor_options: extractorOptions,
    //   num_tokens: numTokens,
    // });


    // return res.status(result.returnCode).json(result);
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}


// export async function scrapeHelper(
//   jobId: string,
//   req: Request,
//   team_id: string,
//   crawlerOptions: any,
//   pageOptions: PageOptions,
//   extractorOptions: ExtractorOptions,
//   timeout: number,
//   plan?: string
// ): Promise<{
//   success: boolean;
//   error?: string;
//   data?: Document;
//   returnCode: number;
// }> {

//   const url = req.body.url;
//   if (!url) {
//     return { success: false, error: "Url is required", returnCode: 400 };
//   }

//   if (isUrlBlocked(url)) {
//     return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
//   }

//   const a = new WebScraperDataProvider();
//   await a.setOptions({
//     jobId,
//     mode: "single_urls",
//     urls: [url],
//     crawlerOptions: {
//       ...crawlerOptions,
//     },
//     pageOptions: pageOptions,
//     extractorOptions: extractorOptions,
//   });

//   const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
//     setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
//   );

//   const docsPromise = a.getDocuments(false);

//   let docs;
//   try {
//     docs = await Promise.race([docsPromise, timeoutPromise]);
//   } catch (error) {
//     return error;
//   }

//   // make sure doc.content is not empty
//   let filteredDocs = docs.filter(
//     (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
//   );
//   if (filteredDocs.length === 0) {
//     return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
//   }


//   // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
//   if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
//     filteredDocs.forEach(doc => {
//       delete doc.rawHtml;
//     });
//   }

//   return {
//     success: true,
//     data: filteredDocs[0],
//     returnCode: 200,
//   };
// }
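Based on the "mockup req.body" comment in scrapeController, a request against the mocked endpoint might look like the sketch below; the host, API key, and values are placeholders, and the controller currently returns the hard-coded result object regardless of input.

// Hypothetical client call against the mocked POST /v1/scrape endpoint above.
const scrapeRes = await fetch("http://localhost:3002/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.TEST_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://mendable.ai",
    formats: ["markdown", "html", "linksOnPage"],
    onlyMainContent: true,
    timeout: 30000,
  }),
});
const { success, warning, data } = await scrapeRes.json(); // data.markdown, data.metadata, ...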
197
apps/api/src/controllers/v1/search.ts.WIP
Normal file
@@ -0,0 +1,197 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../lib/logger";

export async function searchHelper(
  jobId: string,
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  searchOptions: SearchOptions,
): Promise<{
  success: boolean;
  error?: string;
  data?: any;
  returnCode: number;
}> {
  const query = req.body.query;
  const advanced = false;
  if (!query) {
    return { success: false, error: "Query is required", returnCode: 400 };
  }

  const tbs = searchOptions.tbs ?? null;
  const filter = searchOptions.filter ?? null;
  const num_results = searchOptions.limit ?? 7;
  const num_results_buffer = Math.floor(num_results * 1.5);

  let res = await search({
    query: query,
    advanced: advanced,
    num_results: num_results_buffer,
    tbs: tbs,
    filter: filter,
    lang: searchOptions.lang ?? "en",
    country: searchOptions.country ?? "us",
    location: searchOptions.location,
  });

  let justSearch = pageOptions.fetchPageContent === false;


  if (justSearch) {
    const billingResult = await billTeam(
      team_id,
      res.length
    );
    if (!billingResult.success) {
      return {
        success: false,
        error:
          "Failed to bill team. Insufficient credits or subscription not found.",
        returnCode: 402,
      };
    }
    return { success: true, data: res, returnCode: 200 };
  }

  res = res.filter((r) => !isUrlBlocked(r.url));
  if (res.length > num_results) {
    res = res.slice(0, num_results);
  }

  if (res.length === 0) {
    return { success: true, error: "No search results found", returnCode: 200 };
  }

  // filter out social media links


  const a = new WebScraperDataProvider();
  await a.setOptions({
    jobId,
    mode: "single_urls",
    urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
    crawlerOptions: {
      ...crawlerOptions,
    },
    pageOptions: {
      ...pageOptions,
      onlyMainContent: pageOptions?.onlyMainContent ?? true,
      fetchPageContent: pageOptions?.fetchPageContent ?? true,
      includeHtml: pageOptions?.includeHtml ?? false,
      removeTags: pageOptions?.removeTags ?? [],
      fallback: false,
    },
  });

  const docs = await a.getDocuments(false);

  if (docs.length === 0) {
    return { success: true, error: "No search results found", returnCode: 200 };
  }

  // make sure doc.content is not empty
  const filteredDocs = docs.filter(
    (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
  );

  if (filteredDocs.length === 0) {
    return { success: true, error: "No page found", returnCode: 200, data: docs };
  }

  const billingResult = await billTeam(
    team_id,
    filteredDocs.length
  );
  if (!billingResult.success) {
    return {
      success: false,
      error:
        "Failed to bill team. Insufficient credits or subscription not found.",
      returnCode: 402,
    };
  }

  return {
    success: true,
    data: filteredDocs,
    returnCode: 200,
  };
}

export async function searchController(req: Request, res: Response) {
  try {
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Search
    );
    if (!success) {
      return res.status(status).json({ error });
    }
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? {
      includeHtml: false,
      onlyMainContent: true,
      fetchPageContent: true,
      removeTags: [],
      fallback: false,
    };
    const origin = req.body.origin ?? "api";

    const searchOptions = req.body.searchOptions ?? { limit: 7 };

    const jobId = uuidv4();

    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
        await checkTeamCredits(team_id, 1);
      if (!creditsCheckSuccess) {
        return res.status(402).json({ error: "Insufficient credits" });
      }
    } catch (error) {
      Logger.error(error);
      return res.status(500).json({ error: "Internal server error" });
    }
    const startTime = new Date().getTime();
    const result = await searchHelper(
      jobId,
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      searchOptions,
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    logJob({
      job_id: jobId,
      success: result.success,
      message: result.error,
      num_docs: result.data ? result.data.length : 0,
      docs: result.data,
      time_taken: timeTakenInSeconds,
      team_id: team_id,
      mode: "search",
      url: req.body.query,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
      origin: origin,
    });
    return res.status(result.returnCode).json(result);
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
42
apps/api/src/controllers/v1/status.ts.WIP
Normal file
@@ -0,0 +1,42 @@
import { Request, Response } from "express";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";

export async function crawlJobStatusPreviewController(req: Request, res: Response) {
  try {
    const job = await getWebScraperQueue().getJob(req.params.jobId);
    if (!job) {
      return res.status(404).json({ error: "Job not found" });
    }

    const { current, current_url, total, current_step, partialDocs } = await job.progress();
    let data = job.returnvalue;
    if (process.env.USE_DB_AUTHENTICATION === "true") {
      const supabaseData = await supabaseGetJobById(req.params.jobId);

      if (supabaseData) {
        data = supabaseData.docs;
      }
    }

    let jobStatus = await job.getState();
    if (jobStatus === 'waiting' || jobStatus === 'stuck') {
      jobStatus = 'active';
    }

    res.json({
      status: jobStatus,
      // progress: job.progress(),
      current,
      current_url,
      current_step,
      total,
      data: data ? data : null,
      partial_data: jobStatus == 'completed' ? [] : partialDocs,
    });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
@@ -13,6 +13,7 @@ import { ScrapeEvents } from "./lib/scrape-events";
import http from 'node:http';
import https from 'node:https';
import CacheableLookup from 'cacheable-lookup';
import { v1Router } from "./routes/v1";

const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");

@@ -78,6 +79,7 @@ if (cluster.isMaster) {

  // register router
  app.use(v0Router);
  app.use(v1Router);
  app.use(adminRouter);

  const DEFAULT_PORT = process.env.PORT ?? 3002;
38
apps/api/src/lib/validateUrl.ts
Normal file
@@ -0,0 +1,38 @@

const protocolIncluded = (url: string) => {
  // if :// not in the start of the url assume http (maybe https?)
  // regex checks if :// appears before any .
  return(/^([^.:]+:\/\/)/.test(url));
}

const getURLobj = (s: string) => {
  // URL fails if we dont include the protocol ie google.com
  let error = false;
  let urlObj = {};
  try {
    urlObj = new URL(s);
  } catch (err) {
    error = true;
  }
  return { error, urlObj };
};

export const checkAndUpdateURL = (url: string) => {

  if (!protocolIncluded(url)) {
    url = `http://${url}`;
  }

  const { error, urlObj } = getURLobj(url);
  if (error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = urlObj as URL;

  if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  return { urlObj: typedUrlObj, url: url };
}
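A short usage sketch for checkAndUpdateURL (illustrative, not part of the commit): a bare domain gets an http:// prefix, while non-http(s) schemes throw.

import { checkAndUpdateURL } from "./validateUrl";

const { url, urlObj } = checkAndUpdateURL("mendable.ai");
console.log(url);             // "http://mendable.ai"
console.log(urlObj.hostname); // "mendable.ai"

try {
  checkAndUpdateURL("ftp://example.com");
} catch (e) {
  console.log((e as Error).message); // "Invalid URL"
}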
34
apps/api/src/routes/v1.ts
Normal file
@@ -0,0 +1,34 @@
import express from "express";
import { crawlController } from "../../src/controllers/v1/crawl";
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { scrapeController } from "../../src/controllers/v1/scrape";
import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { mapController } from "../../src/controllers/v1/map";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel";
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";

export const v1Router = express.Router();

v1Router.post("/v1/scrape", scrapeController);
v1Router.post("/v1/crawl", crawlController);
v1Router.get("/v1/crawl/:jobId", crawlStatusController);
// v1Router.post("/v1/crawlWebsitePreview", crawlPreviewController);
// v1Router.delete("/v1/crawl/cancel/:jobId", crawlCancelController);
// v1Router.get("/v1/checkJobStatus/:jobId", crawlJobStatusPreviewController);

// // Auth route for key based authentication
// v1Router.get("/v1/keyAuth", keyAuthController);

// // Search routes
// v0Router.post("/v1/search", searchController);

// Health/Probe routes
// v1Router.get("/v1/health/liveness", livenessController);
// v1Router.get("/v1/health/readiness", readinessController);

v1Router.post("/v1/map", mapController);