wip: map, crawl, scrape mockups

This commit is contained in:
rafaelsideguide 2024-08-06 15:24:45 -03:00
parent b60ee30dba
commit 6cdf4c68ec
18 changed files with 1514 additions and 0 deletions


@ -0,0 +1,47 @@
import { crawlController } from '../crawl'
import { Request, Response } from 'express';
import { authenticateUser } from '../auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create';
import { validateIdempotencyKey } from '../../services/idempotency/validate';
import { v4 as uuidv4 } from 'uuid';
jest.mock('../auth', () => ({
authenticateUser: jest.fn().mockResolvedValue({
success: true,
team_id: 'team123',
error: null,
status: 200
}),
reduce: jest.fn()
}));
jest.mock('../../services/idempotency/validate');
describe('crawlController', () => {
it('should prevent duplicate requests using the same idempotency key', async () => {
const req = {
headers: {
'x-idempotency-key': uuidv4(),
'Authorization': `Bearer ${process.env.TEST_API_KEY}`
},
body: {
url: 'https://mendable.ai'
}
} as unknown as Request;
const res = {
status: jest.fn().mockReturnThis(),
json: jest.fn()
} as unknown as Response;
// Mock the idempotency key validation to return false for the second call
(validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false);
// First request should succeed
await crawlController(req, res);
expect(res.status).not.toHaveBeenCalledWith(409);
// Second request with the same key should fail
await crawlController(req, res);
expect(res.status).toHaveBeenCalledWith(409);
expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' });
});
});
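
The behaviour exercised by this test can also be seen from the client side. A minimal sketch, assuming the server from this commit runs on DEFAULT_PORT 3002 and that the placeholder API key is replaced with a real one:

import { v4 as uuidv4 } from "uuid";

// Sketch: reuse the same x-idempotency-key twice against POST /v1/crawl.
// The first call is expected to return 200 with { jobId, url };
// the second is expected to return 409 { error: "Idempotency key already used" }.
async function demoIdempotency() {
  const idempotencyKey = uuidv4();
  const send = () =>
    fetch("http://localhost:3002/v1/crawl", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Authorization": "Bearer <your-api-key>", // placeholder
        "x-idempotency-key": idempotencyKey,
      },
      body: JSON.stringify({ url: "https://mendable.ai" }),
    });
  const first = await send();
  const second = await send();
  console.log(first.status, second.status); // expected: 200 409
}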


@ -0,0 +1,87 @@
import { Request, Response } from "express";
import { Job } from "bull";
import { Logger } from "../../lib/logger";
import { getWebScraperQueue } from "../../services/queue-service";
import { checkAlerts } from "../../services/alerts";
export async function cleanBefore24hCompleteJobsController(
req: Request,
res: Response
) {
Logger.info("🐂 Cleaning jobs older than 24h");
try {
const webScraperQueue = getWebScraperQueue();
const batchSize = 10;
const numberOfBatches = 9; // Adjust based on your needs
const completedJobsPromises: Promise<Job[]>[] = [];
for (let i = 0; i < numberOfBatches; i++) {
completedJobsPromises.push(
webScraperQueue.getJobs(
["completed"],
i * batchSize,
i * batchSize + batchSize,
true
)
);
}
const completedJobs: Job[] = (
await Promise.all(completedJobsPromises)
).flat();
const before24hJobs = completedJobs.filter(
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
);
let count = 0;
if (before24hJobs.length === 0) {
return res.status(200).send(`No jobs to remove.`);
}
for (const job of before24hJobs) {
try {
await job.remove();
count++;
} catch (jobError) {
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
}
}
return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
return res.status(500).send("Failed to clean jobs");
}
}
export async function checkQueuesController(req: Request, res: Response) {
try {
await checkAlerts();
return res.status(200).send("Alerts initialized");
} catch (error) {
Logger.debug(`Failed to initialize alerts: ${error}`);
return res.status(500).send("Failed to initialize alerts");
}
}
// Use this as a "health check" so that we don't bring down the server
export async function queuesController(req: Request, res: Response) {
try {
const webScraperQueue = getWebScraperQueue();
const [webScraperActive] = await Promise.all([
webScraperQueue.getActiveCount(),
]);
const noActiveJobs = webScraperActive === 0;
// 200 if no active jobs, 500 if there are active jobs
return res.status(noActiveJobs ? 200 : 500).json({
webScraperActive,
noActiveJobs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,85 @@
import { Request, Response } from "express";
import Redis from "ioredis";
import { Logger } from "../../lib/logger";
import { redisRateLimitClient } from "../../services/rate-limiter";
export async function redisHealthController(req: Request, res: Response) {
const retryOperation = async (operation, retries = 3) => {
for (let attempt = 1; attempt <= retries; attempt++) {
try {
return await operation();
} catch (error) {
if (attempt === retries) throw error;
Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
}
}
};
try {
const queueRedis = new Redis(process.env.REDIS_URL);
const testKey = "test";
const testValue = "test";
// Test queueRedis
let queueRedisHealth;
try {
await retryOperation(() => queueRedis.set(testKey, testValue));
queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
await retryOperation(() => queueRedis.del(testKey));
} catch (error) {
Logger.error(`queueRedis health check failed: ${error}`);
queueRedisHealth = null;
}
// Test redisRateLimitClient
let redisRateLimitHealth;
try {
await retryOperation(() => redisRateLimitClient.set(testKey, testValue));
redisRateLimitHealth = await retryOperation(() =>
redisRateLimitClient.get(testKey)
);
await retryOperation(() => redisRateLimitClient.del(testKey));
} catch (error) {
Logger.error(`redisRateLimitClient health check failed: ${error}`);
redisRateLimitHealth = null;
}
const healthStatus = {
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
redisRateLimitClient:
redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
};
if (
healthStatus.queueRedis === "healthy" &&
healthStatus.redisRateLimitClient === "healthy"
) {
Logger.info("Both Redis instances are healthy");
return res.status(200).json({ status: "healthy", details: healthStatus });
} else {
Logger.info(
`Redis instances health check: ${JSON.stringify(healthStatus)}`
);
// await sendSlackWebhook(
// `[REDIS DOWN] Redis instances health check: ${JSON.stringify(
// healthStatus
// )}`,
// true
// );
return res
.status(500)
.json({ status: "unhealthy", details: healthStatus });
}
} catch (error) {
Logger.error(`Redis health check failed: ${error}`);
// await sendSlackWebhook(
// `[REDIS DOWN] Redis instances health check: ${error.message}`,
// true
// );
return res
.status(500)
.json({ status: "unhealthy", message: error.message });
}
}


@ -0,0 +1,220 @@
import { parseApi } from "../../../src/lib/parseApi";
import { getRateLimiter, } from "../../../src/services/rate-limiter";
import { AuthResponse, NotificationType, RateLimiterMode } from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase";
import { withAuth } from "../../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
import { sendNotification } from "../../../src/services/notification/email_notification";
import { Logger } from "../../../src/lib/logger";
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
return withAuth(supaAuthenticateUser)(req, res, mode);
}
function setTrace(team_id: string, api_key: string) {
try {
setTraceAttributes({
team_id,
api_key
});
} catch (error) {
Logger.error(`Error setting trace attributes: ${error.message}`);
}
}
export async function supaAuthenticateUser(
req,
res,
mode?: RateLimiterMode
): Promise<{
success: boolean;
team_id?: string;
error?: string;
status?: number;
plan?: string;
}> {
const authHeader = req.headers.authorization;
if (!authHeader) {
return { success: false, error: "Unauthorized", status: 401 };
}
const token = authHeader.split(" ")[1]; // Extract the token from "Bearer <token>"
if (!token) {
return {
success: false,
error: "Unauthorized: Token missing",
status: 401,
};
}
const incomingIP = (req.headers["x-forwarded-for"] ||
req.socket.remoteAddress) as string;
const iptoken = incomingIP + token;
let rateLimiter: RateLimiterRedis;
let subscriptionData: { team_id: string, plan: string } | null = null;
let normalizedApi: string;
let team_id: string;
if (token == "this_is_just_a_preview_token") {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
team_id = "preview";
} else {
normalizedApi = parseApi(token);
const { data, error } = await supabase_service.rpc(
'get_key_and_price_id_2', { api_key: normalizedApi }
);
// get_key_and_price_id_2 rpc definition:
// create or replace function get_key_and_price_id_2(api_key uuid)
// returns table(key uuid, team_id uuid, price_id text) as $$
// begin
// if api_key is null then
// return query
// select null::uuid as key, null::uuid as team_id, null::text as price_id;
// end if;
// return query
// select ak.key, ak.team_id, s.price_id
// from api_keys ak
// left join subscriptions s on ak.team_id = s.team_id
// where ak.key = api_key;
// end;
// $$ language plpgsql;
if (error) {
Logger.warn(`Error fetching key and price_id: ${error.message}`);
} else {
// console.log('Key and Price ID:', data);
}
if (error || !data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error?.message ?? "data is empty"}`);
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401,
};
}
const internal_team_id = data[0].team_id;
team_id = internal_team_id;
const plan = getPlanByPriceId(data[0].price_id);
// HyperDX Logging
setTrace(team_id, normalizedApi);
subscriptionData = {
team_id: team_id,
plan: plan
}
switch (mode) {
case RateLimiterMode.Crawl:
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
break;
case RateLimiterMode.Scrape:
rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
break;
case RateLimiterMode.Search:
rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan);
break;
case RateLimiterMode.CrawlStatus:
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
break;
case RateLimiterMode.Preview:
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
break;
default:
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token);
break;
// case RateLimiterMode.Search:
// rateLimiter = await searchRateLimiter(RateLimiterMode.Search, token);
// break;
}
}
const team_endpoint_token = token === "this_is_just_a_preview_token" ? iptoken : team_id;
try {
await rateLimiter.consume(team_endpoint_token);
} catch (rateLimiterRes) {
Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
// We can only send a rate limit email every 7 days; sendNotification already checks the date range
const startDate = new Date();
const endDate = new Date();
endDate.setDate(endDate.getDate() + 7);
// await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
return {
success: false,
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
status: 429,
};
}
if (
token === "this_is_just_a_preview_token" &&
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
) {
return { success: true, team_id: "preview" };
// check the origin of the request and make sure it's from firecrawl.dev
// const origin = req.headers.origin;
// if (origin && origin.includes("firecrawl.dev")){
// return { success: true, team_id: "preview" };
// }
// if(process.env.ENV !== "production") {
// return { success: true, team_id: "preview" };
// }
// return { success: false, error: "Unauthorized: Invalid token", status: 401 };
}
// make sure api key is valid, based on the api_keys table in supabase
if (!subscriptionData) {
normalizedApi = parseApi(token);
const { data, error } = await supabase_service
.from("api_keys")
.select("*")
.eq("key", normalizedApi);
if (error || !data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error?.message ?? "data is empty"}`);
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401,
};
}
subscriptionData = data[0];
}
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
}
function getPlanByPriceId(price_id: string) {
switch (price_id) {
case process.env.STRIPE_PRICE_ID_STARTER:
return 'starter';
case process.env.STRIPE_PRICE_ID_STANDARD:
return 'standard';
case process.env.STRIPE_PRICE_ID_SCALE:
return 'scale';
case process.env.STRIPE_PRICE_ID_HOBBY:
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
return 'hobby';
case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
return 'standardnew';
case process.env.STRIPE_PRICE_ID_GROWTH:
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
return 'growth';
default:
return 'free';
}
}


@ -0,0 +1,71 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabase_service } from "../../src/services/supabase";
import { billTeam } from "../../src/services/billing/credit_billing";
import { Logger } from "../../src/lib/logger";
export async function crawlCancelController(req: Request, res: Response) {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
if (!success) {
return res.status(status).json({ error });
}
const job = await getWebScraperQueue().getJob(req.params.jobId);
if (!job) {
return res.status(404).json({ error: "Job not found" });
}
// check if the job belongs to the team
if (useDbAuthentication) {
const { data, error: supaError } = await supabase_service
.from("bulljobs_teams")
.select("*")
.eq("job_id", req.params.jobId)
.eq("team_id", team_id);
if (supaError) {
return res.status(500).json({ error: supaError.message });
}
if (data.length === 0) {
return res.status(403).json({ error: "Unauthorized" });
}
}
const jobState = await job.getState();
const { partialDocs } = await job.progress();
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
Logger.info("Billing team for partial docs...");
// Note: the credits billed here might be lower than the actual amount
// because some scraping promises may not have resolved yet
await billTeam(team_id, partialDocs.length);
}
try {
await getWebScraperQueue().client.del(job.lockKey());
await job.takeLock();
await job.discard();
await job.moveToFailed(Error("Job cancelled by user"), true);
} catch (error) {
Logger.error(error);
}
const newJobState = await job.getState();
res.json({
status: "cancelled"
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,89 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../../src/services/queue-service";
import { supabaseGetJobById } from "../../../src/lib/supabase-jobs";
import { Logger } from "../../../src/lib/logger";
import { v4 as uuidv4 } from "uuid";
export async function crawlStatusController(req: Request, res: Response) {
// TODO: validate req.params.jobId
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
if (!success) {
return res.status(status).json({ error });
}
// const job = await getWebScraperQueue().getJob(req.params.jobId);
// if (!job) {
// return res.status(404).json({ error: "Job not found" });
// }
// const { current, current_url, total, current_step, partialDocs } = await job.progress();
// let data = job.returnvalue;
// if (process.env.USE_DB_AUTHENTICATION === "true") {
// const supabaseData = await supabaseGetJobById(req.params.jobId);
// if (supabaseData) {
// data = supabaseData.docs;
// }
// }
// const jobStatus = await job.getState();
// mock:
const id = uuidv4();
const result = {
totalCount: 100,
creditsUsed: 2,
expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
status: "scraping", // scraping, completed, failed
next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
data: [{
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
},
{
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
}]
}
res.status(200).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,139 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
import { billTeam } from "../../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
export async function crawlController(req: Request, res: Response) {
// expected req.body
// req.body = {
// url: string
// crawlerOptions: {
// includePaths: string[]
// excludePaths: string[]
// maxDepth: number
// limit: number
// allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME???
// allowExternalLinks: boolean
// ignoreSitemap: boolean
// }
// scrapeOptions: Exclude<Scrape, "url">
// }
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ error });
}
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ error: "Idempotency key already used" });
}
try {
createIdempotencyKey(req);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits" });
}
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
try {
url = checkAndUpdateURL(url).url;
} catch (error) {
return res.status(400).json({ error: 'Invalid Url' });
}
// TODO: add job to queue
const id = uuidv4();
return res.status(200).json({ jobId: id, url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}` });
// const mode = req.body.mode ?? "crawl";
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.progress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const job = await addWebScraperJob({
// url: url,
// mode: mode ?? "crawl", // fix for single urls not working
// crawlerOptions: crawlerOptions,
// team_id: team_id,
// pageOptions: pageOptions,
// origin: req.body.origin ?? defaultOrigin,
// });
// await logCrawl(job.id.toString(), team_id);
// res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,46 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { Logger } from "../../src/lib/logger";
export async function crawlPreviewController(req: Request, res: Response) {
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Preview
);
if (!success) {
return res.status(status).json({ error });
}
// authenticate on supabase
const url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
}
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
const job = await addWebScraperJob({
url: url,
mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
team_id: "preview",
pageOptions: pageOptions,
origin: "website-preview",
});
res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,24 @@
import { AuthResponse, RateLimiterMode } from "../types";
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
export const keyAuthController = async (req: Request, res: Response) => {
try {
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status } = await authenticateUser(
req,
res
);
if (!success) {
return res.status(status).json({ error });
}
// if success, return success: true
return res.status(200).json({ success: true });
} catch (error) {
return res.status(500).json({ error: error.message });
}
};


@ -0,0 +1,6 @@
import { Request, Response } from "express";
export async function livenessController(req: Request, res: Response) {
// TODO: add checks that the application is live and healthy, e.g. verify the Redis connection
res.status(200).json({ status: "ok" });
}


@ -0,0 +1,128 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../../src/scraper/WebScraper";
import { billTeam } from "../../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { addWebScraperJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
export async function mapController(req: Request, res: Response) {
// expected req.body
// req.body = {
// url: string
// ignoreSitemap: true??
// other crawler options?
// }
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ error });
}
// if (req.headers["x-idempotency-key"]) {
// const isIdempotencyValid = await validateIdempotencyKey(req);
// if (!isIdempotencyValid) {
// return res.status(409).json({ error: "Idempotency key already used" });
// }
// try {
// createIdempotencyKey(req);
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const { success: creditsCheckSuccess, message: creditsCheckMessage } =
// await checkTeamCredits(team_id, 1);
// if (!creditsCheckSuccess) {
// return res.status(402).json({ error: "Insufficient credits" });
// }
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
try {
url = checkAndUpdateURL(url).url;
} catch (error) {
return res.status(400).json({ error: 'Invalid Url' });
}
return res.status(200).json({ urls: [ "test1", "test2" ] });
// const mode = req.body.mode ?? "crawl";
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.progress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const job = await addWebScraperJob({
// url: url,
// mode: mode ?? "crawl", // fix for single urls not working
// crawlerOptions: crawlerOptions,
// team_id: team_id,
// pageOptions: pageOptions,
// origin: req.body.origin ?? defaultOrigin,
// });
// await logCrawl(job.id.toString(), team_id);
// res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,6 @@
import { Request, Response } from "express";
export async function readinessController(req: Request, res: Response) {
// TODO: add checks to verify the application is ready to serve traffic
res.status(200).json({ status: "ok" });
}


@ -0,0 +1,253 @@
// import { ExtractorOptions, PageOptions } from './../../lib/entities';
import { Request, Response } from "express";
// import { WebScraperDataProvider } from "../../scraper/WebScraper";
// import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../types";
// import { logJob } from "../../services/logging/log_job";
// import { Document } from "../../lib/entities";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
// import { numTokensFromString } from '../../lib/LLM-extraction/helpers';
// import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../../../src/lib/default-values';
// import { v4 as uuidv4 } from "uuid";
import { Logger } from '../../lib/logger';
import { checkAndUpdateURL } from '../../lib/validateUrl';
export async function scrapeController(req: Request, res: Response) {
let url = req.body.url;
if (!url) {
return res.status(400).json({ success: false, error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res.status(403).json({ success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
}
try {
url = checkAndUpdateURL(url).url;
} catch (error) {
return res.status(400).json({ success: false, error: "Invalid URL" });
}
// TODO: check req.body
// mockup req.body
// req.body = {
// url: "test",
// headers: {
// "x-key": "test"
// },
// formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
// includeTags: ["test"],
// excludeTags: ["test"],
// onlyMainContent: false,
// timeout: 30000,
// waitFor: number
// }
try {
let earlyReturn = false;
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Scrape
);
if (!success) {
return res.status(status).json({ error });
}
// check credits
const result = {
success: true,
warning: "test",
data: {
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
}
}
return res.status(200).json(result);
// const crawlerOptions = req.body.crawlerOptions ?? {};
// const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
// const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
// const origin = req.body.origin ?? defaultOrigin;
// let timeout = req.body.timeout ?? defaultTimeout;
// if (extractorOptions.mode.includes("llm-extraction")) {
// pageOptions.onlyMainContent = true;
// timeout = req.body.timeout ?? 90000;
// }
// const checkCredits = async () => {
// try {
// const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
// if (!creditsCheckSuccess) {
// earlyReturn = true;
// return res.status(402).json({ error: "Insufficient credits" });
// }
// } catch (error) {
// Logger.error(error);
// earlyReturn = true;
// return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
// }
// };
// await checkCredits();
// const jobId = uuidv4();
// const startTime = new Date().getTime();
// const result = await scrapeHelper(
// jobId,
// req,
// team_id,
// crawlerOptions,
// pageOptions,
// extractorOptions,
// timeout,
// plan
// );
// const endTime = new Date().getTime();
// const timeTakenInSeconds = (endTime - startTime) / 1000;
// const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
// if (result.success) {
// let creditsToBeBilled = 1; // Assuming 1 credit per document
// const creditsPerLLMExtract = 50;
// if (extractorOptions.mode.includes("llm-extraction")) {
// // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
// creditsToBeBilled += creditsPerLLMExtract;
// }
// let startTimeBilling = new Date().getTime();
// if (earlyReturn) {
// // Don't bill if we're early returning
// return;
// }
// const billingResult = await billTeam(
// team_id,
// creditsToBeBilled
// );
// if (!billingResult.success) {
// return res.status(402).json({
// success: false,
// error: "Failed to bill team. Insufficient credits or subscription not found.",
// });
// }
// }
// logJob({
// job_id: jobId,
// success: result.success,
// message: result.error,
// num_docs: 1,
// docs: [result.data],
// time_taken: timeTakenInSeconds,
// team_id: team_id,
// mode: "scrape",
// url: req.body.url,
// crawlerOptions: crawlerOptions,
// pageOptions: pageOptions,
// origin: origin,
// extractor_options: extractorOptions,
// num_tokens: numTokens,
// });
// return res.status(result.returnCode).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
// export async function scrapeHelper(
// jobId: string,
// req: Request,
// team_id: string,
// crawlerOptions: any,
// pageOptions: PageOptions,
// extractorOptions: ExtractorOptions,
// timeout: number,
// plan?: string
// ): Promise<{
// success: boolean;
// error?: string;
// data?: Document;
// returnCode: number;
// }> {
// const url = req.body.url;
// if (!url) {
// return { success: false, error: "Url is required", returnCode: 400 };
// }
// if (isUrlBlocked(url)) {
// return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
// }
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId,
// mode: "single_urls",
// urls: [url],
// crawlerOptions: {
// ...crawlerOptions,
// },
// pageOptions: pageOptions,
// extractorOptions: extractorOptions,
// });
// const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
// setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
// );
// const docsPromise = a.getDocuments(false);
// let docs;
// try {
// docs = await Promise.race([docsPromise, timeoutPromise]);
// } catch (error) {
// return error;
// }
// // make sure doc.content is not empty
// let filteredDocs = docs.filter(
// (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
// );
// if (filteredDocs.length === 0) {
// return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
// }
// // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
// if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
// filteredDocs.forEach(doc => {
// delete doc.rawHtml;
// });
// }
// return {
// success: true,
// data: filteredDocs[0],
// returnCode: 200,
// };
// }


@ -0,0 +1,197 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../lib/logger";
export async function searchHelper(
jobId: string,
req: Request,
team_id: string,
crawlerOptions: any,
pageOptions: PageOptions,
searchOptions: SearchOptions,
): Promise<{
success: boolean;
error?: string;
data?: any;
returnCode: number;
}> {
const query = req.body.query;
const advanced = false;
if (!query) {
return { success: false, error: "Query is required", returnCode: 400 };
}
const tbs = searchOptions.tbs ?? null;
const filter = searchOptions.filter ?? null;
const num_results = searchOptions.limit ?? 7;
const num_results_buffer = Math.floor(num_results * 1.5);
let res = await search({
query: query,
advanced: advanced,
num_results: num_results_buffer,
tbs: tbs,
filter: filter,
lang: searchOptions.lang ?? "en",
country: searchOptions.country ?? "us",
location: searchOptions.location,
});
let justSearch = pageOptions.fetchPageContent === false;
if (justSearch) {
const billingResult = await billTeam(
team_id,
res.length
);
if (!billingResult.success) {
return {
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
returnCode: 402,
};
}
return { success: true, data: res, returnCode: 200 };
}
res = res.filter((r) => !isUrlBlocked(r.url));
if (res.length > num_results) {
res = res.slice(0, num_results);
}
if (res.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };
}
// filter out social media links
const a = new WebScraperDataProvider();
await a.setOptions({
jobId,
mode: "single_urls",
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
crawlerOptions: {
...crawlerOptions,
},
pageOptions: {
...pageOptions,
onlyMainContent: pageOptions?.onlyMainContent ?? true,
fetchPageContent: pageOptions?.fetchPageContent ?? true,
includeHtml: pageOptions?.includeHtml ?? false,
removeTags: pageOptions?.removeTags ?? [],
fallback: false,
},
});
const docs = await a.getDocuments(false);
if (docs.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };
}
// make sure doc.content is not empty
const filteredDocs = docs.filter(
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
);
if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200, data: docs };
}
const billingResult = await billTeam(
team_id,
filteredDocs.length
);
if (!billingResult.success) {
return {
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
returnCode: 402,
};
}
return {
success: true,
data: filteredDocs,
returnCode: 200,
};
}
export async function searchController(req: Request, res: Response) {
try {
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Search
);
if (!success) {
return res.status(status).json({ error });
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? {
includeHtml: false,
onlyMainContent: true,
fetchPageContent: true,
removeTags: [],
fallback: false,
};
const origin = req.body.origin ?? "api";
const searchOptions = req.body.searchOptions ?? { limit: 7 };
const jobId = uuidv4();
try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits" });
}
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: "Internal server error" });
}
const startTime = new Date().getTime();
const result = await searchHelper(
jobId,
req,
team_id,
crawlerOptions,
pageOptions,
searchOptions,
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({
job_id: jobId,
success: result.success,
message: result.error,
num_docs: result.data ? result.data.length : 0,
docs: result.data,
time_taken: timeTakenInSeconds,
team_id: team_id,
mode: "search",
url: req.body.query,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
origin: origin,
});
return res.status(result.returnCode).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -0,0 +1,42 @@
import { Request, Response } from "express";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
try {
const job = await getWebScraperQueue().getJob(req.params.jobId);
if (!job) {
return res.status(404).json({ error: "Job not found" });
}
const { current, current_url, total, current_step, partialDocs } = await job.progress();
let data = job.returnvalue;
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(req.params.jobId);
if (supabaseData) {
data = supabaseData.docs;
}
}
let jobStatus = await job.getState();
if (jobStatus === 'waiting' || jobStatus === 'stuck') {
jobStatus = 'active';
}
res.json({
status: jobStatus,
// progress: job.progress(),
current,
current_url,
current_step,
total,
data: data ? data : null,
partial_data: jobStatus == 'completed' ? [] : partialDocs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}


@ -13,6 +13,7 @@ import { ScrapeEvents } from "./lib/scrape-events";
import http from 'node:http';
import https from 'node:https';
import CacheableLookup from 'cacheable-lookup';
import { v1Router } from "./routes/v1";
const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
@ -78,6 +79,7 @@ if (cluster.isMaster) {
// register router
app.use(v0Router);
app.use(v1Router);
app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002;


@ -0,0 +1,38 @@
const protocolIncluded = (url: string) => {
// if "://" is not at the start of the url, we later assume http (maybe https?)
// the regex checks whether "://" appears before any "."
return /^([^.:]+:\/\/)/.test(url);
}
const getURLobj = (s: string) => {
// new URL() throws if the protocol is missing, e.g. "google.com"
let error = false;
let urlObj = {};
try {
urlObj = new URL(s);
} catch (err) {
error = true;
}
return { error, urlObj };
};
export const checkAndUpdateURL = (url: string) => {
if (!protocolIncluded(url)) {
url = `http://${url}`;
}
const { error, urlObj } = getURLobj(url);
if (error) {
throw new Error("Invalid URL");
}
const typedUrlObj = urlObj as URL;
if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL");
}
return { urlObj: typedUrlObj, url: url };
}
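
A few illustrative calls (not part of the commit), based on the definition above:

checkAndUpdateURL("google.com");            // -> { url: "http://google.com", urlObj: URL { ... } }  (protocol added)
checkAndUpdateURL("https://firecrawl.dev"); // -> returned unchanged, protocol already present
checkAndUpdateURL("ftp://example.com");     // -> throws Error("Invalid URL"), only http and https are accepted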

apps/api/src/routes/v1.ts

@ -0,0 +1,34 @@
import express from "express";
import { crawlController } from "../../src/controllers/v1/crawl";
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { scrapeController } from "../../src/controllers/v1/scrape";
import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { mapController } from "../../src/controllers/v1/map";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel";
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";
export const v1Router = express.Router();
v1Router.post("/v1/scrape", scrapeController);
v1Router.post("/v1/crawl", crawlController);
v1Router.get("/v1/crawl/:jobId", crawlStatusController);
// v1Router.post("/v1/crawlWebsitePreview", crawlPreviewController);
// v1Router.delete("/v1/crawl/cancel/:jobId", crawlCancelController);
// v1Router.get("/v1/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// // Auth route for key based authentication
// v1Router.get("/v1/keyAuth", keyAuthController);
// // Search routes
// v1Router.post("/v1/search", searchController);
// Health/Probe routes
// v1Router.get("/v1/health/liveness", livenessController);
// v1Router.get("/v1/health/readiness", readinessController);
v1Router.post("/v1/map", mapController);