WebScraper refactor into scrapeURL (#714)

* feat: use strictNullChecking * feat: switch logger to Winston * feat(scrapeURL): first batch * fix(scrapeURL): error swallow * fix(scrapeURL): add timeout to EngineResultsTracker * fix(scrapeURL): report unexpected error to sentry * chore: remove unused modules * feat(transfomers/coerce): warn when a format's response is missing * feat(scrapeURL): feature flag priorities, engine quality sorting, PDF and DOCX support * (add note) * feat(scrapeURL): wip readme * feat(scrapeURL): LLM extract * feat(scrapeURL): better warnings * fix(scrapeURL/engines/fire-engine;playwright): fix screenshot * feat(scrapeURL): add forceEngine internal option * feat(scrapeURL/engines): scrapingbee * feat(scrapeURL/transformars): uploadScreenshot * feat(scrapeURL): more intense tests * bunch of stuff * get rid of WebScraper (mostly) * adapt batch scrape * add staging deploy workflow * fix yaml * fix logger issues * fix v1 test schema * feat(scrapeURL/fire-engine/chrome-cdp): remove wait inserts on actions * scrapeURL: v0 backwards compat * logger fixes * feat(scrapeurl): v0 returnOnlyUrls support * fix(scrapeURL/v0): URL leniency * fix(batch-scrape): ts non-nullable * fix(scrapeURL/fire-engine/chromecdp): fix wait action * fix(logger): remove error debug key * feat(requests.http): use dotenv expression * fix(scrapeURL/extractMetadata): extract custom metadata * fix crawl option conversion * feat(scrapeURL): Add retry logic to robustFetch * fix(scrapeURL): crawl stuff * fix(scrapeURL): LLM extract * fix(scrapeURL/v0): search fix * fix(tests/v0): grant larger response size to v0 crawl status * feat(scrapeURL): basic fetch engine * feat(scrapeURL): playwright engine * feat(scrapeURL): add url-specific parameters * Update readme and examples * added e2e tests for most parameters. Still a few actions, location and iframes to be done. * fixed type * Nick: * Update scrape.ts * Update index.ts * added actions and base64 check * Nick: skipTls feature flag? 
* 403 * todo * todo * fixes * yeet headers from url specific params * add warning when final engine has feature deficit * expose engine results tracker for ScrapeEvents implementation * ingest scrape events * fixed some tests * comment * Update index.test.ts * fixed rawHtml * Update index.test.ts * update comments * move geolocation to global f-e option, fix removeBase64Images * Nick: * trim url-specific params * Update index.ts --------- Co-authored-by: Eric Ciarla <ericciarla@yahoo.com> Co-authored-by: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com> Co-authored-by: Nicolas <nicolascamara29@gmail.com>
2024-11-16 03:32:22 +08:00 · 2024-11-07 20:57:33 +01:00 · 2024-11-07 20:57:33 +01:00 · 8d467c8ca7
commit 8d467c8ca7
parent ed5a0d3cf2
142 changed files with 4230 additions and 6334 deletions
--- a/.github/archive/js-sdk.yml
+++ b/.github/archive/js-sdk.yml
@ -8,7 +8,6 @@ env:
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
-  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -21,7 +20,6 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
-  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1

 jobs:  
--- a/.github/archive/python-sdk.yml
+++ b/.github/archive/python-sdk.yml
@ -8,7 +8,6 @@ env:
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
-  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -21,7 +20,6 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
-  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1

 jobs:  
--- a/.github/archive/rust-sdk.yml
+++ b/.github/archive/rust-sdk.yml
@ -8,7 +8,6 @@ env:
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
-  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -21,7 +20,6 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
-  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1


--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -12,7 +12,6 @@ env:
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
-  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -25,7 +24,6 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
-  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1
  FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
  USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
--- a/.github/workflows/deploy-image-staging.yml
+++ b/.github/workflows/deploy-image-staging.yml
@ -0,0 +1,32 @@
+# Builds the API Docker image from ./apps/api and pushes it to GHCR under the
+# staging tag. Runs on pushes to the refactor branch or on manual dispatch.
+name: STAGING Deploy Images to GHCR
+
+on:
+  push:
+    branches:
+      - mog/webscraper-refactor
+  workflow_dispatch:
+
+jobs:
+  push-app-image:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: './apps/api'
+    steps:
+      # Pinned to a release tag instead of the moving `main` branch so the
+      # workflow does not change behaviour underneath us.
+      - name: 'Checkout GitHub Action'
+        uses: actions/checkout@v4
+
+      - name: 'Login to GitHub Container Registry'
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{github.actor}}
+          password: ${{secrets.GITHUB_TOKEN}}
+
+      - name: 'Build and push staging image'
+        run: |
+          docker build . --tag ghcr.io/mendableai/firecrawl-staging:latest
+          docker push ghcr.io/mendableai/firecrawl-staging:latest
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -41,7 +41,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
 SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
 OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
 BULL_AUTH_KEY= @
-LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL=  # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
--- a/SELF_HOST.md
+++ b/SELF_HOST.md
@ -62,7 +62,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
 SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
 OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
 BULL_AUTH_KEY= @
-LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL=  # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@ -33,8 +33,6 @@ SCRAPING_BEE_API_KEY=
 # add for LLM dependednt features (image alt generation, etc.)
 OPENAI_API_KEY=
 BULL_AUTH_KEY=@
-# use if you're configuring basic logging with logtail
-LOGTAIL_KEY=
 # set if you have a llamaparse key you'd like to use to parse pdfs
 LLAMAPARSE_API_KEY=
 # set if you'd like to send slack server health status messages
@ -54,9 +52,6 @@ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
 STRIPE_PRICE_ID_GROWTH=
 STRIPE_PRICE_ID_GROWTH_YEARLY=

-HYPERDX_API_KEY=
-HDX_NODE_BETA_MODE=1
-
 # set if you'd like to use the fire engine closed beta
 FIRE_ENGINE_BETA_URL=

--- a/apps/api/jest.setup.js
+++ b/apps/api/jest.setup.js
@ -1 +1 @@
-global.fetch = require('jest-fetch-mock');
+// global.fetch = require('jest-fetch-mock');
--- a/apps/api/package.json
+++ b/apps/api/package.json
@ -32,9 +32,11 @@
    "@tsconfig/recommended": "^1.0.3",
    "@types/body-parser": "^1.19.2",
    "@types/cors": "^2.8.13",
+    "@types/escape-html": "^1.0.4",
    "@types/express": "^4.17.17",
    "@types/jest": "^29.5.12",
    "@types/node": "^20.14.1",
+    "@types/pdf-parse": "^1.1.4",
    "body-parser": "^1.20.1",
    "express": "^4.18.2",
    "jest": "^29.6.3",
@ -53,9 +55,7 @@
    "@bull-board/api": "^5.20.5",
    "@bull-board/express": "^5.20.5",
    "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.13",
-    "@hyperdx/node-opentelemetry": "^0.8.1",
-    "@logtail/node": "^0.4.12",
+    "@dqbd/tiktoken": "^1.0.16",
    "@nangohq/node": "^0.40.8",
    "@sentry/cli": "^2.33.1",
    "@sentry/node": "^8.26.0",
@ -78,6 +78,7 @@
    "date-fns": "^3.6.0",
    "dotenv": "^16.3.1",
    "dotenv-cli": "^7.4.2",
+    "escape-html": "^1.0.3",
    "express-rate-limit": "^7.3.1",
    "express-ws": "^5.0.2",
    "form-data": "^4.0.0",
@ -92,6 +93,7 @@
    "languagedetect": "^2.0.0",
    "logsnag": "^1.0.0",
    "luxon": "^3.4.3",
+    "marked": "^14.1.2",
    "md5": "^2.3.0",
    "moment": "^2.29.4",
    "mongoose": "^8.4.4",
@ -114,6 +116,8 @@
    "typesense": "^1.5.4",
    "unstructured-client": "^0.11.3",
    "uuid": "^10.0.0",
+    "winston": "^3.14.2",
+    "winston-transport": "^4.8.0",
    "wordpos": "^2.1.0",
    "ws": "^8.18.0",
    "xml2js": "^0.6.2",
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@ -1,15 +1,15 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer fc-
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json

 {
-    "url":"corterix.com"
+    "url":"firecrawl.dev"
 }

 ### Check Job Status
 GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
-Authorization: Bearer fc-
+Authorization: Bearer {{$dotenv TEST_API_KEY}}


 ### Check Job Status
@ -18,7 +18,7 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1

 ### Scrape Website
 POST http://localhost:3002/v0/crawl HTTP/1.1
-Authorization: Bearer fc-
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json

 {
@ -45,7 +45,7 @@ content-type: application/json

 ### Scrape Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer 
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json

 {
@ -56,12 +56,12 @@ content-type: application/json

 ### Check Job Status
 GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
-Authorization: Bearer 
+Authorization: Bearer {{$dotenv TEST_API_KEY}}

 ### Get Job Result

 POST https://api.firecrawl.dev/v0/crawl HTTP/1.1
-Authorization: Bearer 
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json

 {
@ -70,7 +70,7 @@ content-type: application/json

 ### Check Job Status
 GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
-Authorization: Bearer 
+Authorization: Bearer {{$dotenv TEST_API_KEY}}

 ### Get Active Jobs Count
 GET http://localhost:3002/serverHealthCheck
--- a/apps/api/sharedLibs/go-html-to-md/.gitignore
+++ b/apps/api/sharedLibs/go-html-to-md/.gitignore
@ -0,0 +1,2 @@
+html-to-markdown.so
+html-to-markdown.h
--- a/apps/api/src/tests/e2e_full_withAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_full_withAuth/index.test.ts
@ -844,7 +844,7 @@ describe("E2E Tests for API Routes", () => {
      expect(crawlInitResponse.statusCode).toBe(200);
      expect(crawlInitResponse.body).toHaveProperty("jobId");

-      let crawlStatus: string;
+      let crawlStatus: string = "scraping";
      let crawlData = [];
      while (crawlStatus !== "completed") {
        const statusResponse = await request(TEST_URL)
--- a/apps/api/src/tests/e2e_noAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_noAuth/index.test.ts
@ -20,7 +20,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
    process.env.SCRAPING_BEE_API_KEY = "";
    process.env.OPENAI_API_KEY = "";
    process.env.BULL_AUTH_KEY = "";
-    process.env.LOGTAIL_KEY = "";
    process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
    process.env.LLAMAPARSE_API_KEY = "";
    process.env.TEST_API_KEY = "";
--- a/apps/api/src/tests/e2e_v1_withAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_v1_withAuth/index.test.ts
@ -1,7 +1,7 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
 import {
-  ScrapeRequest,
+  ScrapeRequestInput,
  ScrapeResponseRequestTest,
 } from "../../controllers/v1/types";

@ -44,7 +44,7 @@ describe("E2E Tests for v1 API Routes", () => {
    });

    it.concurrent("should throw error for blocklisted URL", async () => {
-      const scrapeRequest: ScrapeRequest = {
+      const scrapeRequest: ScrapeRequestInput = {
        url: "https://facebook.com/fake-test",
      };

@ -73,7 +73,7 @@ describe("E2E Tests for v1 API Routes", () => {
    it.concurrent(
      "should return a successful response with a valid API key",
      async () => {
-        const scrapeRequest: ScrapeRequest = {
+        const scrapeRequest: ScrapeRequestInput = {
          url: "https://roastmywebsite.ai",
        };

@ -125,7 +125,7 @@ describe("E2E Tests for v1 API Routes", () => {
    it.concurrent(
      "should return a successful response with a valid API key",
      async () => {
-        const scrapeRequest: ScrapeRequest = {
+        const scrapeRequest: ScrapeRequestInput = {
          url: "https://arxiv.org/abs/2410.04840",
        };

@ -167,7 +167,7 @@ describe("E2E Tests for v1 API Routes", () => {
    it.concurrent(
      "should return a successful response with a valid API key and includeHtml set to true",
      async () => {
-        const scrapeRequest: ScrapeRequest = {
+        const scrapeRequest: ScrapeRequestInput = {
          url: "https://roastmywebsite.ai",
          formats: ["markdown", "html"],
        };
@ -194,7 +194,7 @@ describe("E2E Tests for v1 API Routes", () => {
      30000
    );
    it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
-        const scrapeRequest: ScrapeRequest = {
+        const scrapeRequest: ScrapeRequestInput = {
          url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
        //   formats: ["markdown", "html"],
        };
@ -217,7 +217,7 @@ describe("E2E Tests for v1 API Routes", () => {
      }, 60000);

      it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
-        const scrapeRequest: ScrapeRequest = {
+        const scrapeRequest: ScrapeRequestInput = {
          url: "https://arxiv.org/pdf/astro-ph/9301001"
        };
        const response: ScrapeResponseRequestTest = await request(TEST_URL)
@ -240,7 +240,7 @@ describe("E2E Tests for v1 API Routes", () => {
      }, 60000);

      it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
-        const scrapeRequest: ScrapeRequest = {
+        const scrapeRequest: ScrapeRequestInput = {
          url: "https://www.scrapethissite.com/",
          onlyMainContent: false // default is true
        };
@ -261,7 +261,7 @@ describe("E2E Tests for v1 API Routes", () => {
        expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
        expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
  
-        const scrapeRequestWithRemoveTags: ScrapeRequest = {
+        const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
            url: "https://www.scrapethissite.com/",
            excludeTags: ['.nav', '#footer', 'strong'],
            onlyMainContent: false // default is true
@ -407,7 +407,7 @@ describe("E2E Tests for v1 API Routes", () => {
      it.concurrent(
        "should return a successful response with a valid API key and includeHtml set to true",
        async () => {
-          const scrapeRequest: ScrapeRequest = {
+          const scrapeRequest: ScrapeRequestInput = {
            url: "https://roastmywebsite.ai",
            formats: ["html","rawHtml"],
          };
@ -438,7 +438,7 @@ describe("E2E Tests for v1 API Routes", () => {
      it.concurrent(
        "should return a successful response with waitFor",
        async () => {
-          const scrapeRequest: ScrapeRequest = {
+          const scrapeRequest: ScrapeRequestInput = {
            url: "https://ycombinator.com/companies",
            formats: ["markdown"],
            waitFor: 8000
@ -471,7 +471,7 @@ describe("E2E Tests for v1 API Routes", () => {
      it.concurrent(
        "should return a successful response with a valid links on page",
        async () => {
-          const scrapeRequest: ScrapeRequest = {
+          const scrapeRequest: ScrapeRequestInput = {
            url: "https://roastmywebsite.ai",
            formats: ["links"],
          };
@ -672,7 +672,7 @@ describe("POST /v1/crawl", () => {
  });
  
  it.concurrent("should throw error for blocklisted URL", async () => {
-    const scrapeRequest: ScrapeRequest = {
+    const scrapeRequest: ScrapeRequestInput = {
      url: "https://facebook.com/fake-test",
    };

--- a/apps/api/src/tests/e2e_v1_withAuth_all_params/index.test.ts
+++ b/apps/api/src/tests/e2e_v1_withAuth_all_params/index.test.ts
@ -0,0 +1,603 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import {
+  ScrapeRequest,
+  ScrapeResponseRequestTest,
+} from "../../controllers/v1/types";
+
+configDotenv();
+const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
+const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test
+
+describe("E2E Tests for v1 API Routes", () => {
+
+  // A target page answering 403 must still produce a successful scrape whose
+  // metadata reports that 403.
+  it.concurrent('should return a successful response for a scrape with 403 page', async () => {
+    const target = { url: 'https://httpstat.us/403' };
+    const authHeader = `Bearer ${process.env.TEST_API_KEY}`;
+
+    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+      .post('/v1/scrape')
+      .set('Authorization', authHeader)
+      .set('Content-Type', 'application/json')
+      .send(target);
+
+    expect(res.statusCode).toBe(200);
+    expect(res.body).toHaveProperty('data');
+    if (!("data" in res.body)) {
+      throw new Error("Expected response body to have 'data' property");
+    }
+
+    const { data } = res.body;
+    expect(data).toHaveProperty('markdown');
+    expect(data).toHaveProperty('metadata');
+    expect(data.metadata.statusCode).toBe(403);
+  }, 30000);
+
+  // With no explicit `formats`, the response is expected to carry markdown
+  // for the e2e fixture page, without header/footer/mobile-only text.
+  it.concurrent("should handle 'formats:markdown (default)' parameter correctly",
+    async () => {
+      const payload = { url: E2E_TEST_SERVER_URL } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+      expect(res.body).toHaveProperty("data");
+      if (!("data" in res.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      const { data } = res.body;
+      expect(data).toHaveProperty("markdown");
+
+      // The "Loading..." check is currently disabled.
+      const mustAppear = [
+        "This page is used for end-to-end (e2e) testing with Firecrawl.",
+        "Content with id #content-1",
+        "Click me!",
+        // firecrawl.dev inside an iframe
+        "Power your AI apps with clean data crawled from any website. It's also open-source.",
+        // the browser always scrolls to the bottom
+        "This content loads only when you see it. Don't blink! 👼",
+      ];
+      for (const snippet of mustAppear) {
+        expect(data.markdown).toContain(snippet);
+      }
+
+      const mustBeAbsent = [
+        "Header", // Only main content is returned by default
+        "footer", // Only main content is returned by default
+        "This content is only visible on mobile",
+      ];
+      for (const snippet of mustBeAbsent) {
+        expect(data.markdown).not.toContain(snippet);
+      }
+    },
+  30000);
+
+  // Requesting only "html" should return cleaned HTML (no page header) and
+  // no markdown field.
+  it.concurrent("should handle 'formats:html' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        formats: ["html"],
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+      expect(res.body).toHaveProperty("data");
+      if (!("data" in res.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      const { data } = res.body;
+      expect(data).not.toHaveProperty("markdown");
+      expect(data).toHaveProperty("html");
+
+      const headerMarkup = "<header class=\"row-start-1\" style=\"\">Header</header>";
+      const introMarkup = "<p style=\"\">This page is used for end-to-end (e2e) testing with Firecrawl.</p>";
+      expect(data.html).not.toContain(headerMarkup);
+      expect(data.html).toContain(introMarkup);
+    },
+  30000);
+
+  // Requesting only "rawHtml" should return the page markup including the
+  // header element, and no markdown field.
+  it.concurrent("should handle 'rawHtml' in 'formats' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        formats: ["rawHtml"],
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+      expect(res.body).toHaveProperty("data");
+      if (!("data" in res.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      const { data } = res.body;
+      expect(data).not.toHaveProperty("markdown");
+      expect(data).toHaveProperty("rawHtml");
+
+      const expectedFragments = [
+        ">This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
+        ">Header</header>",
+      ];
+      for (const fragment of expectedFragments) {
+        expect(data.rawHtml).toContain(fragment);
+      }
+    },
+  30000);
+  
+  // - TODO: tests for links
+  // - TODO: tests for screenshot
+  // - TODO: tests for screenshot@fullPage
+
+  // Sends a custom request header with the scrape and expects the fixture
+  // page to echo it back in the scraped markdown.
+  it.concurrent("should handle 'headers' parameter correctly", async () => {
+    // No `@ts-ignore` here: the `as ScrapeRequest` assertion is the same one
+    // the sibling tests use, and a blanket ignore would hide real type errors.
+    const scrapeRequest = {
+      url: E2E_TEST_SERVER_URL,
+      headers: { "e2e-header-test": "firecrawl" }
+    } as ScrapeRequest;
+
+    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+      .post("/v1/scrape")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send(scrapeRequest);
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("data");
+    if (!("data" in response.body)) {
+      throw new Error("Expected response body to have 'data' property");
+    }
+
+    expect(response.body.data.markdown).toContain("e2e-header-test: firecrawl");
+  }, 30000);
+  
+  // Restricting the scrape to `#content-1` should keep that element's text
+  // and drop the page's intro paragraph.
+  it.concurrent("should handle 'includeTags' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        includeTags: ['#content-1'],
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+      expect(res.body).toHaveProperty("data");
+      if (!("data" in res.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      const { markdown } = res.body.data;
+      expect(markdown).not.toContain("<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
+      expect(markdown).toContain("Content with id #content-1");
+    },
+  30000);
+  
+  // Excluding `#content-1` should remove that element's text while the
+  // page's intro sentence is still present.
+  it.concurrent("should handle 'excludeTags' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        excludeTags: ['#content-1'],
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+      expect(res.body).toHaveProperty("data");
+      if (!("data" in res.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      const { markdown } = res.body.data;
+      expect(markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
+      expect(markdown).not.toContain("Content with id #content-1");
+    },
+  30000);
+  
+  // With `onlyMainContent: false` the returned HTML is expected to include
+  // the page header element.
+  it.concurrent("should handle 'onlyMainContent' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        formats: ["html", "markdown"],
+        onlyMainContent: false,
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+      expect(res.body).toHaveProperty("data");
+      if (!("data" in res.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      const { markdown, html } = res.body.data;
+      expect(markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
+      expect(html).toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
+    },
+  30000);
+  
+  // A 500 ms scrape timeout is expected to fail with HTTP 408 and a
+  // "Request timed out" error body.
+  it.concurrent("should handle 'timeout' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        timeout: 500,
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(408);
+
+      if (!("error" in res.body)) {
+        throw new Error("Expected response body to have 'error' property");
+      }
+      const failure = res.body;
+      expect(failure.error).toBe("Request timed out");
+      expect(failure.success).toBe(false);
+    }, 30000);
+
+  
+  // With `mobile: true` the markdown should include the fixture text that is
+  // only rendered for mobile viewports.
+  it.concurrent("should handle 'mobile' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        mobile: true,
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+
+      if (!("data" in res.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      const { markdown } = res.body.data;
+      expect(markdown).toContain("This content is only visible on mobile");
+    },
+  30000);
+  
+  // Scrapes the same PDF twice. The first request omits `parsePDF` and expects
+  // extracted text (so parsing is presumably on by default); the second sends
+  // `parsePDF: false` and expects a marker string that must be absent from the
+  // parsed output.
+  it.concurrent("should handle 'parsePDF' parameter correctly",
+    async () => {
+      // No sleeps after the awaited requests: each response is already
+      // complete once `await` returns, so waiting would only eat into the
+      // 30 s jest budget shared by both scrapes.
+      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf'});
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      if (!("data" in response.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      expect(response.body.data.markdown).toContain('arXiv:astro-ph/9301001v1 7 Jan 1993');
+      expect(response.body.data.markdown).not.toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');
+
+      const responseNoParsePDF: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', parsePDF: false });
+
+      expect(responseNoParsePDF.statusCode).toBe(200);
+      expect(responseNoParsePDF.body).toHaveProperty('data');
+      if (!("data" in responseNoParsePDF.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      expect(responseNoParsePDF.body.data.markdown).toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');
+    },
+  30000);
+  
+  // it.concurrent("should handle 'location' parameter correctly",
+  //   async () => {
+  //     const scrapeRequest: ScrapeRequest = {
+  //       url: "https://roastmywebsite.ai",
+  //       location: {
+  //         country: "US",
+  //         languages: ["en"]
+  //       }
+  //     };
+  
+  //     const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+  //       .post("/v1/scrape")
+  //       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+  //       .set("Content-Type", "application/json")
+  //       .send(scrapeRequest);
+  
+  //     expect(response.statusCode).toBe(200);
+  //     // Add assertions to verify location is handled correctly
+  //   },
+  // 30000);
+  
+  // Scrapes a host with an expired certificate twice: first without
+  // `skipTlsVerification` (expects the page status to be reported as 500),
+  // then with it enabled (expects the page content to come through).
+  // NOTE(review): each request allows a 120 s scrape timeout, but the jest
+  // timeout for the whole test is 60 s — confirm this is intended.
+  it.concurrent("should handle 'skipTlsVerification' parameter correctly",
+    async () => {
+      const scrapeRequest = {
+        url: "https://expired.badssl.com/",
+        timeout: 120000
+      } as ScrapeRequest;
+
+      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(scrapeRequest);
+
+      expect(response.statusCode).toBe(200);
+      if (!("data" in response.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      // NOTE(review): the 403 test in this file reads `metadata.statusCode`;
+      // confirm `pageStatusCode` is the intended field here.
+      expect(response.body.data.metadata.pageStatusCode).toBe(500);
+
+      const scrapeRequestWithSkipTlsVerification = {
+        url: "https://expired.badssl.com/",
+        skipTlsVerification: true,
+        timeout: 120000
+      } as ScrapeRequest;
+
+      const responseWithSkipTlsVerification: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(scrapeRequestWithSkipTlsVerification);
+
+      expect(responseWithSkipTlsVerification.statusCode).toBe(200);
+      if (!("data" in responseWithSkipTlsVerification.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      expect(responseWithSkipTlsVerification.body.data.markdown).toContain("badssl.com");
+    },
+  60000);
+  
+  // Scrapes the e2e test page with `removeBase64Images: true`.
+  // Currently this only checks that the request succeeds (HTTP 200) and that the body
+  // carries a `data` payload; the content assertion is disabled (see TODO below).
+  it.concurrent("should handle 'removeBase64Images' parameter correctly",
+    async () => {
+      const scrapeRequest = {
+        url: E2E_TEST_SERVER_URL,
+        removeBase64Images: true
+      } as ScrapeRequest;
+  
+      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(scrapeRequest);
+  
+      expect(response.statusCode).toBe(200);
+      // The `in` check narrows the response body to the variant that has `data`.
+      if (!("data" in response.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      // console.log(response.body.data.markdown)
+      // TODO: image removal is not working for every image yet; re-enable once it is:
+      // expect(response.body.data.markdown).toContain("Image-Removed");
+    },
+  30000);
+
+  // A 10 second "wait" action is requested; the returned markdown must show the
+  // loaded content and no longer contain the loading placeholder.
+  it.concurrent("should handle 'action wait' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        actions: [{ type: "wait", milliseconds: 10000 }]
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+
+      // Narrow the body to the variant that carries `data` before reading it.
+      const body = res.body;
+      if (!("data" in body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      const { markdown } = body.data;
+      expect(markdown).not.toContain("Loading...");
+      expect(markdown).toContain("Content loaded after 5 seconds!");
+    },
+  30000);
+
+  // screenshot
+  // A single "screenshot" action must yield a non-empty screenshot entry that
+  // points at the public media storage URL.
+  it.concurrent("should handle 'action screenshot' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        actions: [{ type: "screenshot" }]
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+
+      // Narrow the body to the variant that carries `data` before reading it.
+      const body = res.body;
+      if (!("data" in body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      const shots = body.data.actions?.screenshots;
+      if (!shots) {
+        throw new Error("Expected response body to have screenshots array");
+      }
+
+      const firstShot = shots[0];
+      expect(firstShot.length).toBeGreaterThan(0);
+      expect(firstShot).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");
+
+      // TODO compare screenshot with expected screenshot
+    },
+  30000);
+
+  // Runs a full-page "screenshot" action followed by a "scrape" action and checks
+  // that both the screenshot entry and the scraped page snapshot are returned.
+  it.concurrent("should handle 'action screenshot@fullPage' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        actions: [
+          { type: "screenshot", fullPage: true },
+          { type: "scrape" }
+        ]
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+
+      // Narrow the body to the variant that carries `data` before reading it.
+      const body = res.body;
+      if (!("data" in body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      // Screenshot result of the first action.
+      const shots = body.data.actions?.screenshots;
+      if (!shots) {
+        throw new Error("Expected response body to have screenshots array");
+      }
+      const firstShot = shots[0];
+      expect(firstShot.length).toBeGreaterThan(0);
+      expect(firstShot).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");
+
+      // Scrape result of the second action.
+      const scrapes = body.data.actions?.scrapes;
+      if (!scrapes) {
+        throw new Error("Expected response body to have scrapes array");
+      }
+      const firstScrape = scrapes[0];
+      expect(firstScrape.url).toBe("https://firecrawl-e2e-test.vercel.app/");
+      expect(firstScrape.html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
+      // TODO compare screenshot with expected full page screenshot
+    },
+  30000);
+
+  // Clicks the "#click-me" element and expects the markdown to show the
+  // post-click text instead of the original button label.
+  it.concurrent("should handle 'action click' parameter correctly",
+    async () => {
+      const payload = {
+        url: E2E_TEST_SERVER_URL,
+        actions: [{ type: "click", selector: "#click-me" }]
+      } as ScrapeRequest;
+
+      const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(payload);
+
+      expect(res.statusCode).toBe(200);
+
+      // Narrow the body to the variant that carries `data` before reading it.
+      const body = res.body;
+      if (!("data" in body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+
+      const { markdown } = body.data;
+      expect(markdown).not.toContain("Click me!");
+      expect(markdown).toContain("Text changed after click!");
+    },
+  30000);
+
+  // Clicks the "#input-1" element and then issues a "write" action with "Hello, world!".
+  // Only the HTTP status and the presence of `data` are asserted for now; the check on
+  // the resulting HTML is disabled until fire-engine is fixed (see TODO below).
+  it.concurrent("should handle 'action write' parameter correctly",
+    async () => {
+      const scrapeRequest = {
+        url: E2E_TEST_SERVER_URL,
+        formats: ["html"],
+        actions: [{
+          type: "click",
+          selector: "#input-1"
+        },
+        {
+          type: "write",
+          text: "Hello, world!"
+        }
+      ]} as ScrapeRequest;
+  
+      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(scrapeRequest);
+  
+      expect(response.statusCode).toBe(200);
+      // The `in` check narrows the response body to the variant that has `data`.
+      if (!("data" in response.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      
+      // TODO: fix this test (need to fix fire-engine first)
+      // uncomment the following line:
+      // expect(response.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
+    },
+  30000);
+
+  // TODO: fix this test (need to fix fire-engine first)
+  // Sends a scrape request with a single "press" action for the ArrowDown key.
+  // All assertions are currently commented out, so this test only passes or fails
+  // on whether the HTTP request itself resolves.
+  it.concurrent("should handle 'action pressKey' parameter correctly",
+    async () => {
+      const scrapeRequest = {
+        url: E2E_TEST_SERVER_URL,
+        formats: ["markdown"],
+        actions: [
+          {
+            type: "press",
+            key: "ArrowDown"
+          }
+        ]
+      } as ScrapeRequest;
+  
+      // `response` is intentionally unused until the assertions below are re-enabled.
+      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(scrapeRequest);
+  
+      // TODO: fix this test (need to fix fire-engine first).
+      // Right now response.body is: { success: false, error: '(Internal server error) - null' }
+      // expect(response.statusCode).toBe(200);
+      // if (!("data" in response.body)) {
+      //   throw new Error("Expected response body to have 'data' property");
+      // }
+      // expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
+    },
+  30000);
+
+  // TODO: fix this test (need to fix fire-engine first)
+  // Clicks "#scroll-bottom-loader" and then scrolls down by 2000.
+  // All assertions are currently commented out, so this test only passes or fails
+  // on whether the HTTP request itself resolves.
+  it.concurrent("should handle 'action scroll' parameter correctly",
+    async () => {
+      const scrapeRequest = {
+        url: E2E_TEST_SERVER_URL,
+        formats: ["markdown"],
+        actions: [
+          {
+            type: "click",
+            selector: "#scroll-bottom-loader"
+          },
+          {
+            type: "scroll",
+            direction: "down",
+            amount: 2000
+          }
+        ]
+      } as ScrapeRequest;
+  
+      // `response` is intentionally unused until the assertions below are re-enabled.
+      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(scrapeRequest);
+  
+      // TODO: uncomment these assertions
+      // expect(response.statusCode).toBe(200);
+      // if (!("data" in response.body)) {
+      //   throw new Error("Expected response body to have 'data' property");
+      // }
+      //
+      // expect(response.body.data.markdown).toContain("You have reached the bottom!")
+    },
+  30000);
+
+  // TODO: test scrape action
+
+});
--- a/apps/api/src/tests/e2e_withAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_withAuth/index.test.ts
@ -776,7 +776,8 @@ describe("E2E Tests for v0 API Routes", () => {
        await new Promise((r) => setTimeout(r, 10000));
        const completedResponse = await request(TEST_URL)
          .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
-          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+          .maxResponseSize(4000000000);

        expect(completedResponse.statusCode).toBe(200);
        expect(completedResponse.body).toHaveProperty("status");
--- a/apps/api/src/controllers/auth.ts
+++ b/apps/api/src/controllers/auth.ts
@ -9,9 +9,8 @@ import {
 import { supabase_service } from "../services/supabase";
 import { withAuth } from "../lib/withAuth";
 import { RateLimiterRedis } from "rate-limiter-flexible";
-import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
 import { sendNotification } from "../services/notification/email_notification";
-import { Logger } from "../lib/logger";
+import { logger } from "../lib/logger";
 import { redlock } from "../services/redlock";
 import { deleteKey, getValue } from "../services/redis";
 import { setValue } from "../services/redis";
@ -40,8 +39,8 @@ function normalizedApiIsUuid(potentialUuid: string): boolean {
 export async function setCachedACUC(
  api_key: string,
  acuc:
-    | AuthCreditUsageChunk
-    | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)
+    | AuthCreditUsageChunk | null
+    | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
 ) {
  const cacheKeyACUC = `acuc_${api_key}`;
  const redLockKey = `lock_${cacheKeyACUC}`;
@ -49,7 +48,7 @@ export async function setCachedACUC(
  try {
    await redlock.using([redLockKey], 10000, {}, async (signal) => {
      if (typeof acuc === "function") {
-        acuc = acuc(JSON.parse(await getValue(cacheKeyACUC)));
+        acuc = acuc(JSON.parse(await getValue(cacheKeyACUC) ?? "null"));

        if (acuc === null) {
          if (signal.aborted) {
@ -69,7 +68,7 @@ export async function setCachedACUC(
      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
    });
  } catch (error) {
-    Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
+    logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
  }
 }

@ -103,7 +102,7 @@ export async function getACUC(
        break;
      }

-      Logger.warn(
+      logger.warn(
        `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
      );
      retries++;
@ -146,33 +145,14 @@ export async function authenticateUser(
  res,
  mode?: RateLimiterMode
 ): Promise<AuthResponse> {
-  return withAuth(supaAuthenticateUser)(req, res, mode);
-}
-
-function setTrace(team_id: string, api_key: string) {
-  try {
-    setTraceAttributes({
-      team_id,
-      api_key,
-    });
-  } catch (error) {
-    Sentry.captureException(error);
-    Logger.error(`Error setting trace attributes: ${error.message}`);
-  }
+  return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode);
 }

 export async function supaAuthenticateUser(
  req,
  res,
  mode?: RateLimiterMode
-): Promise<{
-  success: boolean;
-  team_id?: string;
-  error?: string;
-  status?: number;
-  plan?: PlanType;
-  chunk?: AuthCreditUsageChunk;
-}> {
+): Promise<AuthResponse> {
  const authHeader =
    req.headers.authorization ??
    (req.headers["sec-websocket-protocol"]
@ -200,7 +180,7 @@ export async function supaAuthenticateUser(

  let teamId: string | null = null;
  let priceId: string | null = null;
-  let chunk: AuthCreditUsageChunk;
+  let chunk: AuthCreditUsageChunk | null = null;

  if (token == "this_is_just_a_preview_token") {
    if (mode == RateLimiterMode.CrawlStatus) {
@ -233,8 +213,6 @@ export async function supaAuthenticateUser(
    priceId = chunk.price_id;

    const plan = getPlanByPriceId(priceId);
-    // HyperDX Logging
-    setTrace(teamId, normalizedApi);
    subscriptionData = {
      team_id: teamId,
      plan,
@ -291,7 +269,7 @@ export async function supaAuthenticateUser(
  try {
    await rateLimiter.consume(team_endpoint_token);
  } catch (rateLimiterRes) {
-    Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
+    logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);

@ -318,7 +296,7 @@ export async function supaAuthenticateUser(
      mode === RateLimiterMode.CrawlStatus ||
      mode === RateLimiterMode.Search)
  ) {
-    return { success: true, team_id: "preview" };
+    return { success: true, team_id: "preview", chunk: null };
    // check the origin of the request and make sure its from firecrawl.dev
    // const origin = req.headers.origin;
    // if (origin && origin.includes("firecrawl.dev")){
@ -333,12 +311,12 @@ export async function supaAuthenticateUser(

  return {
    success: true,
-    team_id: subscriptionData.team_id,
-    plan: (subscriptionData.plan ?? "") as PlanType,
+    team_id: teamId ?? undefined,
+    plan: (subscriptionData?.plan ?? "") as PlanType,
    chunk,
  };
 }
-function getPlanByPriceId(price_id: string): PlanType {
+function getPlanByPriceId(price_id: string | null): PlanType {
  switch (price_id) {
    case process.env.STRIPE_PRICE_ID_STARTER:
      return "starter";
--- a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts
+++ b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts
@ -1,7 +1,7 @@
 import { Request, Response } from "express";
 import { supabase_service } from "../../../services/supabase";
 import { clearACUC } from "../../auth";
-import { Logger } from "../../../lib/logger";
+import { logger } from "../../../lib/logger";

 export async function acucCacheClearController(req: Request, res: Response) {
  try {
@ -12,11 +12,11 @@ export async function acucCacheClearController(req: Request, res: Response) {
      .select("*")
      .eq("team_id", team_id);

-    await Promise.all(keys.data.map((x) => clearACUC(x.key)));
+    await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key)));

    res.json({ ok: true });
  } catch (error) {
-    Logger.error(`Error clearing ACUC cache via API route: ${error}`);
+    logger.error(`Error clearing ACUC cache via API route: ${error}`);
    res.status(500).json({ error: "Internal server error" });
  }
 }
--- a/apps/api/src/controllers/v0/admin/queue.ts
+++ b/apps/api/src/controllers/v0/admin/queue.ts
@ -1,7 +1,7 @@
 import { Request, Response } from "express";

 import { Job } from "bullmq";
-import { Logger } from "../../../lib/logger";
+import { logger } from "../../../lib/logger";
 import { getScrapeQueue } from "../../../services/queue-service";
 import { checkAlerts } from "../../../services/alerts";
 import { sendSlackWebhook } from "../../../services/alerts/slack";
@ -10,7 +10,7 @@ export async function cleanBefore24hCompleteJobsController(
  req: Request,
  res: Response
 ) {
-  Logger.info("🐂 Cleaning jobs older than 24h");
+  logger.info("🐂 Cleaning jobs older than 24h");
  try {
    const scrapeQueue = getScrapeQueue();
    const batchSize = 10;
@ -31,7 +31,7 @@ export async function cleanBefore24hCompleteJobsController(
    ).flat();
    const before24hJobs =
      completedJobs.filter(
-        (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
+        (job) => job.finishedOn !== undefined && job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
      ) || [];

    let count = 0;
@ -45,12 +45,12 @@ export async function cleanBefore24hCompleteJobsController(
        await job.remove();
        count++;
      } catch (jobError) {
-        Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
+        logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
      }
    }
    return res.status(200).send(`Removed ${count} completed jobs.`);
  } catch (error) {
-    Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
+    logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
    return res.status(500).send("Failed to clean jobs");
  }
 }
@ -60,7 +60,7 @@ export async function checkQueuesController(req: Request, res: Response) {
    await checkAlerts();
    return res.status(200).send("Alerts initialized");
  } catch (error) {
-    Logger.debug(`Failed to initialize alerts: ${error}`);
+    logger.debug(`Failed to initialize alerts: ${error}`);
    return res.status(500).send("Failed to initialize alerts");
  }
 }
@ -81,7 +81,7 @@ export async function queuesController(req: Request, res: Response) {
      noActiveJobs,
    });
  } catch (error) {
-    Logger.error(error);
+    logger.error(error);
    return res.status(500).json({ error: error.message });
  }
 }
@ -165,7 +165,7 @@ export async function autoscalerController(req: Request, res: Response) {
    }

    if (targetMachineCount !== activeMachines) {
-      Logger.info(
+      logger.info(
        `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
      );

@ -193,7 +193,7 @@ export async function autoscalerController(req: Request, res: Response) {
      count: activeMachines,
    });
  } catch (error) {
-    Logger.error(error);
+    logger.error(error);
    return res.status(500).send("Failed to initialize autoscaler");
  }
 }
--- a/apps/api/src/controllers/v0/admin/redis-health.ts
+++ b/apps/api/src/controllers/v0/admin/redis-health.ts
@ -1,6 +1,6 @@
 import { Request, Response } from "express";
 import Redis from "ioredis";
-import { Logger } from "../../../lib/logger";
+import { logger } from "../../../lib/logger";
 import { redisRateLimitClient } from "../../../services/rate-limiter";

 export async function redisHealthController(req: Request, res: Response) {
@ -10,14 +10,14 @@ export async function redisHealthController(req: Request, res: Response) {
        return await operation();
      } catch (error) {
        if (attempt === retries) throw error;
-        Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
+        logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
        await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
      }
    }
  };

  try {
-    const queueRedis = new Redis(process.env.REDIS_URL);
+    const queueRedis = new Redis(process.env.REDIS_URL!);

    const testKey = "test";
    const testValue = "test";
@ -29,7 +29,7 @@ export async function redisHealthController(req: Request, res: Response) {
      queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
      await retryOperation(() => queueRedis.del(testKey));
    } catch (error) {
-      Logger.error(`queueRedis health check failed: ${error}`);
+      logger.error(`queueRedis health check failed: ${error}`);
      queueRedisHealth = null;
    }

@ -42,7 +42,7 @@ export async function redisHealthController(req: Request, res: Response) {
      );
      await retryOperation(() => redisRateLimitClient.del(testKey));
    } catch (error) {
-      Logger.error(`redisRateLimitClient health check failed: ${error}`);
+      logger.error(`redisRateLimitClient health check failed: ${error}`);
      redisRateLimitHealth = null;
    }

@ -56,10 +56,10 @@ export async function redisHealthController(req: Request, res: Response) {
      healthStatus.queueRedis === "healthy" &&
      healthStatus.redisRateLimitClient === "healthy"
    ) {
-      Logger.info("Both Redis instances are healthy");
+      logger.info("Both Redis instances are healthy");
      return res.status(200).json({ status: "healthy", details: healthStatus });
    } else {
-      Logger.info(
+      logger.info(
        `Redis instances health check: ${JSON.stringify(healthStatus)}`
      );
      // await sendSlackWebhook(
@ -73,7 +73,7 @@ export async function redisHealthController(req: Request, res: Response) {
        .json({ status: "unhealthy", details: healthStatus });
    }
  } catch (error) {
-    Logger.error(`Redis health check failed: ${error}`);
+    logger.error(`Redis health check failed: ${error}`);
    // await sendSlackWebhook(
    //   `[REDIS DOWN] Redis instances health check: ${error.message}`,
    //   true
--- a/apps/api/src/controllers/v0/crawl-cancel.ts
+++ b/apps/api/src/controllers/v0/crawl-cancel.ts
@ -2,7 +2,7 @@ import { Request, Response } from "express";
 import { authenticateUser } from "../auth";
 import { RateLimiterMode } from "../../../src/types";
 import { supabase_service } from "../../../src/services/supabase";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
 import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
 import * as Sentry from "@sentry/node";
 import { configDotenv } from "dotenv";
@ -12,15 +12,17 @@ export async function crawlCancelController(req: Request, res: Response) {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';

-    const { success, team_id, error, status } = await authenticateUser(
+    const auth = await authenticateUser(
      req,
      res,
      RateLimiterMode.CrawlStatus
    );
-    if (!success) {
-      return res.status(status).json({ error });
+    if (!auth.success) {
+      return res.status(auth.status).json({ error: auth.error });
    }

+    const { team_id } = auth;
+
    const sc = await getCrawl(req.params.jobId);
    if (!sc) {
      return res.status(404).json({ error: "Job not found" });
@ -46,7 +48,7 @@ export async function crawlCancelController(req: Request, res: Response) {
      sc.cancelled = true;
      await saveCrawl(req.params.jobId, sc);
    } catch (error) {
-      Logger.error(error);
+      logger.error(error);
    }

    res.json({
@ -54,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) {
    });
  } catch (error) {
    Sentry.captureException(error);
-    Logger.error(error);
+    logger.error(error);
    return res.status(500).json({ error: error.message });
  }
 }
--- a/apps/api/src/controllers/v0/crawl-status.ts
+++ b/apps/api/src/controllers/v0/crawl-status.ts
@ -2,15 +2,17 @@ import { Request, Response } from "express";
 import { authenticateUser } from "../auth";
 import { RateLimiterMode } from "../../../src/types";
 import { getScrapeQueue } from "../../../src/services/queue-service";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
 import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
 import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
 import * as Sentry from "@sentry/node";
 import { configDotenv } from "dotenv";
+import { Job } from "bullmq";
+import { toLegacyDocument } from "../v1/types";
 configDotenv();

 export async function getJobs(crawlId: string, ids: string[]) {
-  const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
+  const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as Job[];
  
  if (process.env.USE_DB_AUTHENTICATION === "true") {
    const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
@ -32,15 +34,17 @@ export async function getJobs(crawlId: string, ids: string[]) {

 export async function crawlStatusController(req: Request, res: Response) {
  try {
-    const { success, team_id, error, status } = await authenticateUser(
+    const auth = await authenticateUser(
      req,
      res,
      RateLimiterMode.CrawlStatus
    );
-    if (!success) {
-      return res.status(status).json({ error });
+    if (!auth.success) {
+      return res.status(auth.status).json({ error: auth.error });
    }

+    const { team_id } = auth;
+
    const sc = await getCrawl(req.params.jobId);
    if (!sc) {
      return res.status(404).json({ error: "Job not found" });
@ -90,12 +94,12 @@ export async function crawlStatusController(req: Request, res: Response) {
      status: jobStatus,
      current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
      total: jobs.length,
-      data: jobStatus === "completed" ? data : null,
-      partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
+      data: jobStatus === "completed" ? data.map(x => toLegacyDocument(x, sc.internalOptions)) : null,
+      partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null).map(x => toLegacyDocument(x, sc.internalOptions)),
    });
  } catch (error) {
    Sentry.captureException(error);
-    Logger.error(error);
+    logger.error(error);
    return res.status(500).json({ error: error.message });
  }
 }
--- a/apps/api/src/controllers/v0/crawl.ts
+++ b/apps/api/src/controllers/v0/crawl.ts
@ -9,24 +9,28 @@ import { validateIdempotencyKey } from "../../../src/services/idempotency/valida
 import { createIdempotencyKey } from "../../../src/services/idempotency/create";
 import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
 import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
 import * as Sentry from "@sentry/node";
 import { getJobPriority } from "../../lib/job-priority";
+import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
+import { ZodError } from "zod";

 export async function crawlController(req: Request, res: Response) {
  try {
-    const { success, team_id, error, status, plan, chunk } = await authenticateUser(
+    const auth = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
    );
-    if (!success) {
-      return res.status(status).json({ error });
+    if (!auth.success) {
+      return res.status(auth.status).json({ error: auth.error });
    }

+    const { team_id, plan, chunk } = auth;
+
    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
@ -35,7 +39,7 @@ export async function crawlController(req: Request, res: Response) {
      try {
        createIdempotencyKey(req);
      } catch (error) {
-        Logger.error(error);
+        logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }
@ -77,7 +81,7 @@ export async function crawlController(req: Request, res: Response) {
    // TODO: need to do this to v1
    crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
    
-    let url = req.body.url;
+    let url = urlSchema.parse(req.body.url);
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }
@ -123,7 +127,7 @@ export async function crawlController(req: Request, res: Response) {
    //       documents: docs,
    //     });
    //   } catch (error) {
-    //     Logger.error(error);
+    //     logger.error(error);
    //     return res.status(500).json({ error: error.message });
    //   }
    // }
@ -132,10 +136,13 @@ export async function crawlController(req: Request, res: Response) {

    await logCrawl(id, team_id);

+    const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
+
    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
-      pageOptions,
+      scrapeOptions,
+      internalOptions,
      team_id,
      plan,
      createdAt: Date.now(),
@ -170,10 +177,11 @@ export async function crawlController(req: Request, res: Response) {
          data: {
            url,
            mode: "single_urls",
-            crawlerOptions: crawlerOptions,
+            crawlerOptions,
+            scrapeOptions,
+            internalOptions,
            team_id,
            plan,
-            pageOptions: pageOptions,
            origin: req.body.origin ?? defaultOrigin,
            crawl_id: id,
            sitemapped: true,
@ -208,10 +216,11 @@ export async function crawlController(req: Request, res: Response) {
        {
          url,
          mode: "single_urls",
-          crawlerOptions: crawlerOptions,
+          crawlerOptions,
+          scrapeOptions,
+          internalOptions,
          team_id,
-          plan,
-          pageOptions: pageOptions,
+          plan: plan!,
          origin: req.body.origin ?? defaultOrigin,
          crawl_id: id,
        },
@ -226,7 +235,9 @@ export async function crawlController(req: Request, res: Response) {
    res.json({ jobId: id });
  } catch (error) {
    Sentry.captureException(error);
-    Logger.error(error);
-    return res.status(500).json({ error: error.message });
+    logger.error(error);
+    return res.status(500).json({ error: error instanceof ZodError
+      ? "Invalid URL"
+      : error.message });
  }
 }
--- a/apps/api/src/controllers/v0/crawlPreview.ts
+++ b/apps/api/src/controllers/v0/crawlPreview.ts
@ -3,15 +3,16 @@ import { authenticateUser } from "../auth";
 import { RateLimiterMode } from "../../../src/types";
 import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
 import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
 import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
 import { addScrapeJob } from "../../../src/services/queue-jobs";
 import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
 import * as Sentry from "@sentry/node";
+import { fromLegacyScrapeOptions } from "../v1/types";

 export async function crawlPreviewController(req: Request, res: Response) {
  try {
-    const { success, error, status, team_id:a, plan } = await authenticateUser(
+    const auth = await authenticateUser(
      req,
      res,
      RateLimiterMode.Preview
@ -19,10 +20,12 @@ export async function crawlPreviewController(req: Request, res: Response) {

    const team_id = "preview";

-    if (!success) {
-      return res.status(status).json({ error });
+    if (!auth.success) {
+      return res.status(auth.status).json({ error: auth.error });
    }

+    const { plan } = auth;
+
    let url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
@ -71,7 +74,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
    //       documents: docs,
    //     });
    //   } catch (error) {
-    //     Logger.error(error);
+    //     logger.error(error);
    //     return res.status(500).json({ error: error.message });
    //   }
    // }
@ -84,10 +87,13 @@ export async function crawlPreviewController(req: Request, res: Response) {
      robots = await this.getRobotsTxt();
    } catch (_) {}

+    const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
+
    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
-      pageOptions,
+      scrapeOptions,
+      internalOptions,
      team_id,
      plan,
      robots,
@ -107,10 +113,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
        await addScrapeJob({
          url,
          mode: "single_urls",
-          crawlerOptions: crawlerOptions,
          team_id,
-          plan,
-          pageOptions: pageOptions,
+          plan: plan!,
+          crawlerOptions,
+          scrapeOptions,
+          internalOptions,
          origin: "website-preview",
          crawl_id: id,
          sitemapped: true,
@ -123,10 +130,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
      await addScrapeJob({
        url,
        mode: "single_urls",
-        crawlerOptions: crawlerOptions,
        team_id,
-        plan,
-        pageOptions: pageOptions,
+        plan: plan!,
+        crawlerOptions,
+        scrapeOptions,
+        internalOptions,
        origin: "website-preview",
        crawl_id: id,
      }, {}, jobId);
@ -136,7 +144,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
    res.json({ jobId: id });
  } catch (error) {
    Sentry.captureException(error);
-    Logger.error(error);
+    logger.error(error);
    return res.status(500).json({ error: error.message });
  }
 }
--- a/apps/api/src/controllers/v0/keyAuth.ts
+++ b/apps/api/src/controllers/v0/keyAuth.ts
@ -8,13 +8,14 @@ import { authenticateUser } from "../auth";
 export const keyAuthController = async (req: Request, res: Response) => {
  try {
    // make sure to authenticate user first, Bearer <token>
-    const { success, team_id, error, status } = await authenticateUser(
+    const auth = await authenticateUser(
      req,
      res
    );
-    if (!success) {
-      return res.status(status).json({ error });
+    if (!auth.success) {
+      return res.status(auth.status).json({ error: auth.error });
    }
+
    // if success, return success: true
    return res.status(200).json({ success: true });
  } catch (error) {
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@ -7,7 +7,7 @@ import {
 import { authenticateUser } from "../auth";
 import { PlanType, RateLimiterMode } from "../../types";
 import { logJob } from "../../services/logging/log_job";
-import { Document } from "../../lib/entities";
+import { Document, fromLegacyCombo, toLegacyDocument, url as urlSchema } from "../v1/types";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
 import {
@ -19,9 +19,11 @@ import {
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import { getScrapeQueue } from "../../services/queue-service";
 import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
 import * as Sentry from "@sentry/node";
 import { getJobPriority } from "../../lib/job-priority";
+import { fromLegacyScrapeOptions } from "../v1/types";
+import { ZodError } from "zod";

 export async function scrapeHelper(
  jobId: string,
@ -35,10 +37,10 @@ export async function scrapeHelper(
 ): Promise<{
  success: boolean;
  error?: string;
-  data?: Document;
+  data?: Document | { url: string };
  returnCode: number;
 }> {
-  const url = req.body.url;
+  const url = urlSchema.parse(req.body.url);
  if (typeof url !== "string") {
    return { success: false, error: "Url is required", returnCode: 400 };
  }
@ -54,15 +56,16 @@ export async function scrapeHelper(

  const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });

+  const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions);
+
  await addScrapeJob(
    {
      url,
      mode: "single_urls",
-      crawlerOptions,
      team_id,
-      pageOptions,
-      plan,
-      extractorOptions,
+      scrapeOptions,
+      internalOptions,
+      plan: plan!,
      origin: req.body.origin ?? defaultOrigin,
      is_scrape: true,
    },
@ -81,7 +84,7 @@ export async function scrapeHelper(
    },
    async (span) => {
      try {
-        doc = (await waitForJob(jobId, timeout))[0];
+        doc = (await waitForJob<Document>(jobId, timeout));
      } catch (e) {
        if (e instanceof Error && e.message.startsWith("Job wait")) {
          span.setAttribute("timedOut", true);
@ -149,7 +152,7 @@ export async function scrapeHelper(

  return {
    success: true,
-    data: doc,
+    data: toLegacyDocument(doc, internalOptions),
    returnCode: 200,
  };
 }
@ -158,15 +161,17 @@ export async function scrapeController(req: Request, res: Response) {
  try {
    let earlyReturn = false;
    // make sure to authenticate user first, Bearer <token>
-    const { success, team_id, error, status, plan, chunk } = await authenticateUser(
+    const auth = await authenticateUser(
      req,
      res,
      RateLimiterMode.Scrape
    );
-    if (!success) {
-      return res.status(status).json({ error });
+    if (!auth.success) {
+      return res.status(auth.status).json({ error: auth.error });
    }

+    const { team_id, plan, chunk } = auth;
+
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    const extractorOptions = {
@ -200,7 +205,7 @@ export async function scrapeController(req: Request, res: Response) {
        return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
      }
    } catch (error) {
-      Logger.error(error);
+      logger.error(error);
      earlyReturn = true;
      return res.status(500).json({
        error:
@ -224,8 +229,8 @@ export async function scrapeController(req: Request, res: Response) {
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    const numTokens =
-      result.data && result.data.markdown
-        ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
+      result.data && (result.data as Document).markdown
+        ? numTokensFromString((result.data as Document).markdown!, "gpt-3.5-turbo")
        : 0;

    if (result.success) {
@ -246,7 +251,7 @@ export async function scrapeController(req: Request, res: Response) {
      if (creditsToBeBilled > 0) {
        // billing for doc done on queue end, bill only for llm extraction
        billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => {
-          Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
+          logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
          // Optionally, you could notify an admin or add to a retry queue here
        });
      }
@ -254,17 +259,19 @@ export async function scrapeController(req: Request, res: Response) {
    
    let doc = result.data;
    if (!pageOptions || !pageOptions.includeRawHtml) {
-      if (doc && doc.rawHtml) {
-        delete doc.rawHtml;
+      if (doc && (doc as Document).rawHtml) {
+        delete (doc as Document).rawHtml;
      }
    }
  
    if(pageOptions && pageOptions.includeExtract) {
-      if(!pageOptions.includeMarkdown && doc && doc.markdown) {
-        delete doc.markdown;
+      if(!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
+        delete (doc as Document).markdown;
      }
    }

+    const { scrapeOptions } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
+
    logJob({
      job_id: jobId,
      success: result.success,
@ -276,19 +283,20 @@ export async function scrapeController(req: Request, res: Response) {
      mode: "scrape",
      url: req.body.url,
      crawlerOptions: crawlerOptions,
-      pageOptions: pageOptions,
+      scrapeOptions,
      origin: origin,
-      extractor_options: extractorOptions,
      num_tokens: numTokens,
    });

    return res.status(result.returnCode).json(result);
  } catch (error) {
    Sentry.captureException(error);
-    Logger.error(error);
+    logger.error(error);
    return res.status(500).json({
      error:
-        typeof error === "string"
+        error instanceof ZodError
+          ? "Invalid URL"
+          : typeof error === "string"
            ? error
            : error?.message ?? "Internal Server Error",
    });
--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@ -1,5 +1,4 @@
 import { Request, Response } from "express";
-import { WebScraperDataProvider } from "../../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
 import { authenticateUser } from "../auth";
 import { PlanType, RateLimiterMode } from "../../types";
@ -8,21 +7,23 @@ import { PageOptions, SearchOptions } from "../../lib/entities";
 import { search } from "../../search";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
 import { getScrapeQueue } from "../../services/queue-service";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import * as Sentry from "@sentry/node";
 import { getJobPriority } from "../../lib/job-priority";
+import { Job } from "bullmq";
+import { Document, fromLegacyCombo, fromLegacyScrapeOptions, toLegacyDocument } from "../v1/types";

 export async function searchHelper(
  jobId: string,
  req: Request,
  team_id: string,
-  subscription_id: string,
+  subscription_id: string | null | undefined,
  crawlerOptions: any,
  pageOptions: PageOptions,
  searchOptions: SearchOptions,
-  plan: PlanType
+  plan: PlanType | undefined
 ): Promise<{
  success: boolean;
  error?: string;
@ -35,8 +36,8 @@ export async function searchHelper(
    return { success: false, error: "Query is required", returnCode: 400 };
  }

-  const tbs = searchOptions.tbs ?? null;
-  const filter = searchOptions.filter ?? null;
+  const tbs = searchOptions.tbs ?? undefined;
+  const filter = searchOptions.filter ?? undefined;
  let num_results = Math.min(searchOptions.limit ?? 7, 10);

  if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
@ -58,10 +59,11 @@ export async function searchHelper(

  let justSearch = pageOptions.fetchPageContent === false;

+  const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, undefined, 60000, crawlerOptions);

  if (justSearch) {
    billTeam(team_id, subscription_id, res.length).catch(error => {
-      Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
+      logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
      // Optionally, you could notify an admin or add to a retry queue here
    });
    return { success: true, data: res, returnCode: 200 };
@ -88,9 +90,9 @@ export async function searchHelper(
      data: {
        url,
        mode: "single_urls",
-        crawlerOptions: crawlerOptions,
        team_id: team_id,
-        pageOptions: pageOptions,
+        scrapeOptions,
+        internalOptions,
      },
      opts: {
        jobId: uuid,
@ -104,7 +106,7 @@ export async function searchHelper(
    await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority)
  }

-  const docs = (await Promise.all(jobDatas.map(x => waitForJob(x.opts.jobId, 60000)))).map(x => x[0]);
+  const docs = (await Promise.all(jobDatas.map(x => waitForJob<Document>(x.opts.jobId, 60000)))).map(x => toLegacyDocument(x, internalOptions));
  
  if (docs.length === 0) {
    return { success: true, error: "No search results found", returnCode: 200 };
@ -115,7 +117,7 @@ export async function searchHelper(

  // make sure doc.content is not empty
  const filteredDocs = docs.filter(
-    (doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0
+    (doc: any) => doc && doc.content && doc.content.trim().length > 0
  );

  if (filteredDocs.length === 0) {
@ -132,14 +134,15 @@ export async function searchHelper(
 export async function searchController(req: Request, res: Response) {
  try {
    // make sure to authenticate user first, Bearer <token>
-    const { success, team_id, error, status, plan, chunk } = await authenticateUser(
+    const auth = await authenticateUser(
      req,
      res,
      RateLimiterMode.Search
    );
-    if (!success) {
-      return res.status(status).json({ error });
+    if (!auth.success) {
+      return res.status(auth.status).json({ error: auth.error });
    }
+    const { team_id, plan, chunk } = auth;
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? {
      includeHtml: req.body.pageOptions?.includeHtml ?? false,
@ -162,7 +165,7 @@ export async function searchController(req: Request, res: Response) {
      }
    } catch (error) {
      Sentry.captureException(error);
-      Logger.error(error);
+      logger.error(error);
      return res.status(500).json({ error: "Internal server error" });
    }
    const startTime = new Date().getTime();
@ -189,7 +192,6 @@ export async function searchController(req: Request, res: Response) {
      mode: "search",
      url: req.body.query,
      crawlerOptions: crawlerOptions,
-      pageOptions: pageOptions,
      origin: origin,
    });
    return res.status(result.returnCode).json(result);
@ -199,7 +201,7 @@ export async function searchController(req: Request, res: Response) {
    }

    Sentry.captureException(error);
-    Logger.error(error);
+    logger.error("Unhandled error occurred in search", { error });
    return res.status(500).json({ error: error.message });
  }
 }
--- a/apps/api/src/controllers/v0/status.ts
+++ b/apps/api/src/controllers/v0/status.ts
@ -1,5 +1,5 @@
 import { Request, Response } from "express";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
 import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
 import { getJobs } from "./crawl-status";
 import * as Sentry from "@sentry/node";
@ -37,7 +37,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
    });
  } catch (error) {
    Sentry.captureException(error);
-    Logger.error(error);
+    logger.error(error);
    return res.status(500).json({ error: error.message });
  }
 }
--- a/apps/api/src/controllers/v1/batch-scrape.ts
+++ b/apps/api/src/controllers/v1/batch-scrape.ts
@ -4,8 +4,6 @@ import {
  BatchScrapeRequest,
  batchScrapeRequestSchema,
  CrawlResponse,
-  legacyExtractorOptions,
-  legacyScrapeOptions,
  RequestWithAuth,
 } from "./types";
 import {
@ -29,19 +27,16 @@ export async function batchScrapeController(

  await logCrawl(id, req.auth.team_id);

-  let { remainingCredits } = req.account;
+  let { remainingCredits } = req.account!;
  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
  if(!useDbAuthentication){
    remainingCredits = Infinity;
  }

-  const pageOptions = legacyScrapeOptions(req.body);
-  const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
-
-
  const sc: StoredCrawl = {
    crawlerOptions: null,
-    pageOptions,
+    scrapeOptions: req.body,
+    internalOptions: {},
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan,
@ -64,10 +59,9 @@ export async function batchScrapeController(
        url: x,
        mode: "single_urls" as const,
        team_id: req.auth.team_id,
-        plan: req.auth.plan,
+        plan: req.auth.plan!,
        crawlerOptions: null,
-        pageOptions,
-        extractorOptions,
+        scrapeOptions: req.body,
        origin: "api",
        crawl_id: id,
        sitemapped: true,
--- a/apps/api/src/controllers/v1/crawl-cancel.ts
+++ b/apps/api/src/controllers/v1/crawl-cancel.ts
@ -1,6 +1,6 @@
 import { Response } from "express";
 import { supabase_service } from "../../services/supabase";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
 import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
 import * as Sentry from "@sentry/node";
 import { configDotenv } from "dotenv";
@ -36,7 +36,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
      sc.cancelled = true;
      await saveCrawl(req.params.jobId, sc);
    } catch (error) {
-      Logger.error(error);
+      logger.error(error);
    }

    res.json({
@ -44,7 +44,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
    });
  } catch (error) {
    Sentry.captureException(error);
-    Logger.error(error);
+    logger.error(error);
    return res.status(500).json({ error: error.message });
  }
 }
--- a/apps/api/src/controllers/v1/crawl-status-ws.ts
+++ b/apps/api/src/controllers/v1/crawl-status-ws.ts
@ -1,14 +1,15 @@
 import { authMiddleware } from "../../routes/v1";
 import { RateLimiterMode } from "../../types";
 import { authenticateUser } from "../auth";
-import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
+import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types";
 import { WebSocket } from "ws";
 import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
 import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
 import { getScrapeQueue } from "../../services/queue-service";
 import { getJob, getJobs } from "./crawl-status";
 import * as Sentry from "@sentry/node";
+import { Job, JobState } from "bullmq";

 type ErrorMessage = {
  type: "error",
@ -56,7 +57,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
    return close(ws, 3003, { type: "error", error: "Forbidden" });
  }

-  let doneJobIDs = [];
+  let doneJobIDs: string[] = [];
  let finished = false;

  const loop = async () => {
@ -70,15 +71,14 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara

    const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
    const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
-    const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
-
-    for (const jobID of newlyDoneJobIDs) {
-      const job = await getJob(jobID);
+    const newlyDoneJobIDs: string[] = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
+    const newlyDoneJobs: Job[] = (await Promise.all(newlyDoneJobIDs.map(x => getJob(x)))).filter(x => x !== undefined) as Job[]

+    for (const job of newlyDoneJobs) {
      if (job.returnvalue) {
        send(ws, {
          type: "document",
-          data: legacyDocumentConverter(job.returnvalue),
+          data: job.returnvalue,
        })
      } else {
        return close(ws, 3000, { type: "error", error: job.failedReason });
@ -100,8 +100,8 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara

  const throttledJobsSet = new Set(throttledJobs);

-  const validJobStatuses = [];
-  const validJobIDs = [];
+  const validJobStatuses: [string, JobState | "unknown"][] = [];
+  const validJobIDs: string[] = [];

  for (const [id, status] of jobStatuses) {
    if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
@ -126,7 +126,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
      completed: doneJobIDs.length,
      creditsUsed: jobIDs.length,
      expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
-      data: data.map(x => legacyDocumentConverter(x)),
+      data: data,
    }
  });

@ -139,19 +139,21 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
 // Basically just middleware and error wrapping
 export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
  try {
-    const { success, team_id, error, status, plan } = await authenticateUser(
+    const auth = await authenticateUser(
      req,
      null,
      RateLimiterMode.CrawlStatus,
    );

-    if (!success) {
+    if (!auth.success) {
      return close(ws, 3000, {
        type: "error",
-        error,
+        error: auth.error,
      });
    }

+    const { team_id, plan } = auth;
+
    req.auth = { team_id, plan };

    await crawlStatusWS(ws, req);
@ -170,7 +172,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
      }
    }

-    Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
+    logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
    return close(ws, 1011, {
      type: "error",
      error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@ -1,9 +1,10 @@
 import { Response } from "express";
-import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
+import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types";
 import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis";
 import { getScrapeQueue } from "../../services/queue-service";
 import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
 import { configDotenv } from "dotenv";
+import { Job, JobState } from "bullmq";
 configDotenv();

 export async function getJob(id: string) {
@ -24,7 +25,7 @@ export async function getJob(id: string) {
 }

 export async function getJobs(ids: string[]) {
-  const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
+  const jobs: (Job & { id: string })[] = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as (Job & {id: string})[];
  
  if (process.env.USE_DB_AUTHENTICATION === "true") {
    const supabaseData = await supabaseGetJobsById(ids);
@ -63,8 +64,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara

  const throttledJobsSet = new Set(throttledJobs);

-  const validJobStatuses = [];
-  const validJobIDs = [];
+  const validJobStatuses: [string, JobState | "unknown"][] = [];
+  const validJobIDs: string[] = [];

  for (const [id, status] of jobStatuses) {
    if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
@ -81,7 +82,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
  const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
  const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);

-  let doneJobs = [];
+  let doneJobs: Job[] = [];

  if (end === undefined) { // determine 10 megabyte limit
    let bytes = 0;
@ -98,7 +99,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
      for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
        const job = jobs[ii];
        doneJobs.push(job);
-        bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
+        bytes += JSON.stringify(job.returnvalue).length;
      }
    }

@ -122,7 +123,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
  }

  if (data.length > 0) {
-    if (!doneJobs[0].data.pageOptions.includeRawHtml) {
+    if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) {
      for (let ii = 0; ii < doneJobs.length; ii++) {
        if (data[ii]) {
          delete data[ii].rawHtml;
@ -142,7 +143,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
      status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
        ? undefined
        : nextURL.href,
-    data: data.map(x => legacyDocumentConverter(x)),
+    data: data,
  });
 }

--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@ -4,9 +4,8 @@ import {
  CrawlRequest,
  crawlRequestSchema,
  CrawlResponse,
-  legacyCrawlerOptions,
-  legacyScrapeOptions,
  RequestWithAuth,
+  toLegacyCrawlerOptions,
 } from "./types";
 import {
  addCrawlJob,
@ -20,9 +19,10 @@ import {
 import { logCrawl } from "../../services/logging/crawl_log";
 import { getScrapeQueue } from "../../services/queue-service";
 import { addScrapeJob } from "../../services/queue-jobs";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
 import { getJobPriority } from "../../lib/job-priority";
 import { callWebhook } from "../../services/webhook";
+import { scrapeOptions as scrapeOptionsSchema } from "./types";

 export async function crawlController(
  req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
@ -34,18 +34,22 @@ export async function crawlController(

  await logCrawl(id, req.auth.team_id);

-  let { remainingCredits } = req.account;
+  let { remainingCredits } = req.account!;
  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
  if(!useDbAuthentication){
    remainingCredits = Infinity;
  }

-  const crawlerOptions = legacyCrawlerOptions(req.body);
-  const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
+  const crawlerOptions = {
+    ...req.body,
+    url: undefined,
+    scrapeOptions: undefined,
+  };
+  const scrapeOptions = req.body.scrapeOptions;

  // TODO: @rafa, is this right? copied from v0
-  if (Array.isArray(crawlerOptions.includes)) {
-    for (const x of crawlerOptions.includes) {
+  if (Array.isArray(crawlerOptions.includePaths)) {
+    for (const x of crawlerOptions.includePaths) {
      try {
        new RegExp(x);
      } catch (e) {
@ -54,8 +58,8 @@ export async function crawlController(
    }
  }

-  if (Array.isArray(crawlerOptions.excludes)) {
-    for (const x of crawlerOptions.excludes) {
+  if (Array.isArray(crawlerOptions.excludePaths)) {
+    for (const x of crawlerOptions.excludePaths) {
      try {
        new RegExp(x);
      } catch (e) {
@ -68,8 +72,9 @@ export async function crawlController(
  
  const sc: StoredCrawl = {
    originUrl: req.body.url,
-    crawlerOptions,
-    pageOptions,
+    crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
+    scrapeOptions,
+    internalOptions: {},
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan,
@ -78,9 +83,9 @@ export async function crawlController(
  const crawler = crawlToCrawler(id, sc);

  try {
-    sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
+    sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
  } catch (e) {
-    Logger.debug(
+    logger.debug(
      `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
        e
      )}`
@ -112,7 +117,7 @@ export async function crawlController(
          team_id: req.auth.team_id,
          plan: req.auth.plan,
          crawlerOptions,
-          pageOptions,
+          scrapeOptions,
          origin: "api",
          crawl_id: id,
          sitemapped: true,
@ -142,10 +147,10 @@ export async function crawlController(
      {
        url: req.body.url,
        mode: "single_urls",
-        crawlerOptions: crawlerOptions,
        team_id: req.auth.team_id,
-        plan: req.auth.plan,
-        pageOptions: pageOptions,
+        crawlerOptions,
+        scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
+        plan: req.auth.plan!,
        origin: "api",
        crawl_id: id,
        webhook: req.body.webhook,
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@ -1,9 +1,9 @@
 import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
 import {
-  legacyCrawlerOptions,
  mapRequestSchema,
  RequestWithAuth,
+  scrapeOptions,
 } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
 import { MapResponse, MapRequest } from "./types";
@ -18,11 +18,11 @@ import { fireEngineMap } from "../../search/fireEngine";
 import { billTeam } from "../../services/billing/credit_billing";
 import { logJob } from "../../services/logging/log_job";
 import { performCosineSimilarity } from "../../lib/map-cosine";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
 import Redis from "ioredis";

 configDotenv();
-const redis = new Redis(process.env.REDIS_URL);
+const redis = new Redis(process.env.REDIS_URL!);

 // Max Links that /map can return
 const MAX_MAP_LIMIT = 5000;
@ -44,8 +44,12 @@ export async function mapController(

  const sc: StoredCrawl = {
    originUrl: req.body.url,
-    crawlerOptions: legacyCrawlerOptions(req.body),
-    pageOptions: {},
+    crawlerOptions: {
+      ...req.body,
+      scrapeOptions: undefined,
+    },
+    scrapeOptions: scrapeOptions.parse({}),
+    internalOptions: {},
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan,
@ -65,8 +69,8 @@ export async function mapController(
  const cacheKey = `fireEngineMap:${mapUrl}`;
  const cachedResult = null;

-  let allResults: any[];
-  let pagePromises: Promise<any>[];
+  let allResults: any[] = [];
+  let pagePromises: Promise<any>[] = [];

  if (cachedResult) {
    allResults = JSON.parse(cachedResult);
@ -139,7 +143,7 @@ export async function mapController(
        return null;
      }
    })
-    .filter((x) => x !== null);
+    .filter((x) => x !== null) as string[];

  // allows for subdomains to be included
  links = links.filter((x) => isSameDomain(x, req.body.url));
@ -153,7 +157,7 @@ export async function mapController(
  links = removeDuplicateUrls(links);

  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
-    Logger.error(
+    logger.error(
      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
    );
    // Optionally, you could notify an admin or add to a retry queue here
@ -175,9 +179,8 @@ export async function mapController(
    mode: "map",
    url: req.body.url,
    crawlerOptions: {},
-    pageOptions: {},
+    scrapeOptions: {},
    origin: req.body.origin,
-    extractor_options: { mode: "markdown" },
    num_tokens: 0,
  });

--- a/apps/api/src/controllers/v1/scrape-status.ts
+++ b/apps/api/src/controllers/v1/scrape-status.ts
@ -12,7 +12,7 @@ export async function scrapeStatusController(req: any, res: any) {

    const job = await supabaseGetJobByIdOnlyData(req.params.jobId);

-    if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
+    if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
      return res.status(403).json({
        success: false,
        error: "You are not allowed to access this resource.",
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@ -1,10 +1,7 @@
-import { Request, Response } from "express";
-import { Logger } from "../../lib/logger";
+import { Response } from "express";
+import { logger } from "../../lib/logger";
 import {
  Document,
-  legacyDocumentConverter,
-  legacyExtractorOptions,
-  legacyScrapeOptions,
  RequestWithAuth,
  ScrapeRequest,
  scrapeRequestSchema,
@ -12,7 +9,6 @@ import {
 } from "./types";
 import { billTeam } from "../../services/billing/credit_billing";
 import { v4 as uuidv4 } from "uuid";
-import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import { logJob } from "../../services/logging/log_job";
 import { getJobPriority } from "../../lib/job-priority";
@ -28,8 +24,6 @@ export async function scrapeController(

  const origin = req.body.origin;
  const timeout = req.body.timeout;
-  const pageOptions = legacyScrapeOptions(req.body);
-  const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
  const jobId = uuidv4();

  const startTime = new Date().getTime();
@ -43,11 +37,10 @@ export async function scrapeController(
    {
      url: req.body.url,
      mode: "single_urls",
-      crawlerOptions: {},
      team_id: req.auth.team_id,
-      plan: req.auth.plan,
-      pageOptions,
-      extractorOptions,
+      scrapeOptions: req.body,
+      internalOptions: {},
+      plan: req.auth.plan!,
      origin: req.body.origin,
      is_scrape: true,
    },
@ -56,13 +49,13 @@ export async function scrapeController(
    jobPriority
  );

-  const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);
+  const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);

-  let doc: any | undefined;
+  let doc: Document;
  try {
-    doc = (await waitForJob(jobId, timeout + totalWait))[0];
+    doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
  } catch (e) {
-    Logger.error(`Error in scrapeController: ${e}`);
+    logger.error(`Error in scrapeController: ${e}`);
    if (e instanceof Error && e.message.startsWith("Job wait")) {
      return res.status(408).json({
        success: false,
@ -71,34 +64,19 @@ export async function scrapeController(
    } else {
      return res.status(500).json({
        success: false,
-        error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
-          extractorOptions && extractorOptions.mode !== "markdown"
-            ? " - Could be due to LLM parsing issues"
-            : ""
-        }`,
+        error: `(Internal server error) - ${e && e?.message ? e.message : e}`,
      });
    }
  }

  await getScrapeQueue().remove(jobId);

-  if (!doc) {
-    console.error("!!! PANIC DOC IS", doc);
-    return res.status(200).json({
-      success: true,
-      warning: "No page found",
-      data: doc,
-    });
-  }
-
-  delete doc.index;
-  delete doc.provider;
-
  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;
  const numTokens =
-    doc && doc.markdown
-      ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
+    doc && doc.extract
+      // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
+      ? 0 // TODO: fix
      : 0;

  let creditsToBeBilled = 1; // Assuming 1 credit per document
@ -111,22 +89,16 @@ export async function scrapeController(
  }

  billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
-    Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
+    logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
    // Optionally, you could notify an admin or add to a retry queue here
  });

-  if (!pageOptions || !pageOptions.includeRawHtml) {
+  if (!req.body.formats.includes("rawHtml")) {
    if (doc && doc.rawHtml) {
      delete doc.rawHtml;
    }
  }

-  if(pageOptions && pageOptions.includeExtract) {
-    if(!pageOptions.includeMarkdown && doc && doc.markdown) {
-      delete doc.markdown;
-    }
-  }
-
  logJob({
    job_id: jobId,
    success: true,
@ -137,16 +109,14 @@ export async function scrapeController(
    team_id: req.auth.team_id,
    mode: "scrape",
    url: req.body.url,
-    crawlerOptions: {},
-    pageOptions: pageOptions,
+    scrapeOptions: req.body,
    origin: origin,
-    extractor_options: extractorOptions,
    num_tokens: numTokens,
  });

  return res.status(200).json({
    success: true,
-    data: legacyDocumentConverter(doc),
+    data: doc,
    scrape_id: origin?.includes("website") ? jobId : undefined,
  });
 }
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@ -1,10 +1,11 @@
 import { Request, Response } from "express";
 import { z } from "zod";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
-import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
 import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
 import { PlanType } from "../../types";
 import { countries } from "../../lib/validate-country";
+import { ExtractorOptions, PageOptions, ScrapeActionContent, Document as V0Document } from "../../lib/entities";
+import { InternalOptions } from "../../scraper/scrapeURL";

 export type Format =
  | "markdown"
@ -167,6 +168,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({
 });

 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
+export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;

 export const batchScrapeRequestSchema = scrapeOptions.extend({
  urls: url.array(),
@ -240,7 +242,7 @@ export const mapRequestSchema = crawlerOptions.extend({
  includeSubdomains: z.boolean().default(true),
  search: z.string().optional(),
  ignoreSitemap: z.boolean().default(false),
-  limit: z.number().min(1).max(5000).default(5000).optional(),
+  limit: z.number().min(1).max(5000).default(5000),
 }).strict(strictMessage);

 // export type MapRequest = {
@ -252,13 +254,14 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;

 export type Document = {
  markdown?: string;
-  extract?: string;
+  extract?: any;
  html?: string;
  rawHtml?: string;
  links?: string[];
  screenshot?: string;
  actions?: {
-    screenshots: string[];
+    screenshots?: string[];
+    scrapes?: ScrapeActionContent[];
  };
  warning?: string;
  metadata: {
@ -291,11 +294,11 @@ export type Document = {
    publishedTime?: string;
    articleTag?: string;
    articleSection?: string;
+    url?: string;
    sourceURL?: string;
    statusCode?: number;
    error?: string;
    [key: string]: string | string[] | number | undefined;
-
  };
 };

@ -366,7 +369,7 @@ export type CrawlStatusResponse =

+// `plan` is a required key whose value may be undefined, so every consumer
+// has to handle the case where no plan is known.
 type AuthObject = {
   team_id: string;
-  plan: PlanType;
+  plan: PlanType | undefined;
 };

 type Account = {
@ -439,7 +442,7 @@ export interface ResponseWithSentry<
  sentry?: string,
 }

-export function legacyCrawlerOptions(x: CrawlerOptions) {
+export function toLegacyCrawlerOptions(x: CrawlerOptions) {
  return {
    includes: x.includePaths,
    excludes: x.excludePaths,
@ -453,68 +456,90 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
  };
 }

-export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
+/**
+ * Converts v0 ("legacy") crawler options into their v1 equivalents.
+ *
+ * Legacy field names (`includes`, `excludes`, `maxCrawledLinks`, ...) are
+ * mapped onto the v1 names and the result is passed to `crawlerOptions.parse`.
+ * `returnOnlyUrls` is not mapped to a crawler option; it is carried in
+ * `internalOptions.v0CrawlOnlyUrls` instead.
+ *
+ * @param x - Legacy crawler options (untyped). Must be an object: its
+ *   properties are read directly, so null/undefined throws a TypeError.
+ * @returns The parsed v1 crawler options plus the internal options derived
+ *   from the legacy input.
+ */
+export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } {
   return {
-    includeMarkdown: x.formats.includes("markdown"),
-    includeHtml: x.formats.includes("html"),
-    includeRawHtml: x.formats.includes("rawHtml"),
-    includeExtract: x.formats.includes("extract"),
-    onlyIncludeTags: x.includeTags,
-    removeTags: x.excludeTags,
-    onlyMainContent: x.onlyMainContent,
-    waitFor: x.waitFor,
-    headers: x.headers,
-    includeLinks: x.formats.includes("links"),
-    screenshot: x.formats.includes("screenshot"),
-    fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
-    parsePDF: x.parsePDF,
-    actions: x.actions as Action[], // no strict null checking grrrr - mogery
-    geolocation: x.location ?? x.geolocation,
-    skipTlsVerification: x.skipTlsVerification,
-    removeBase64Images: x.removeBase64Images,
-    mobile: x.mobile,
-  };
-}
-
-export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
-  return {
-    mode: x.mode ? "llm-extraction" : "markdown",
-    extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
-    extractionSchema: x.schema,
-    userPrompt: x.prompt ?? "",
-  };
-}
-
-export function legacyDocumentConverter(doc: any): Document {
-  if (doc === null || doc === undefined) return null;
-
-  if (doc.metadata) {
-    if (doc.metadata.screenshot) {
-      doc.screenshot = doc.metadata.screenshot;
-      delete doc.metadata.screenshot;
-    }
-
-    if (doc.metadata.fullPageScreenshot) {
-      doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
-      delete doc.metadata.fullPageScreenshot;
-    }
-  }
-
-  return {
-    markdown: doc.markdown,
-    links: doc.linksOnPage,
-    rawHtml: doc.rawHtml,
-    html: doc.html,
-    extract: doc.llm_extraction,
-    screenshot: doc.screenshot ?? doc.fullPageScreenshot,
-    actions: doc.actions ?? undefined,
-    warning: doc.warning ?? undefined,
-    metadata: {
-      ...doc.metadata,
-      pageError: undefined,
-      pageStatusCode: undefined,
-      error: doc.metadata?.pageError,
-      statusCode: doc.metadata?.pageStatusCode,
+    crawlOptions: crawlerOptions.parse({
+      includePaths: x.includes,
+      excludePaths: x.excludes,
+      limit: x.maxCrawledLinks ?? x.limit,
+      maxDepth: x.maxDepth,
+      allowBackwardLinks: x.allowBackwardCrawling,
+      allowExternalLinks: x.allowExternalContentLinks,
+      ignoreSitemap: x.ignoreSitemap,
+      // TODO: returnOnlyUrls support
+    }),
+    internalOptions: {
+      v0CrawlOnlyUrls: x.returnOnlyUrls,
     },
   };
 }
+
+/**
+ * Converts v0 page/extractor options into v1 scrape options.
+ *
+ * The v0 boolean `include*` / screenshot flags are turned into the v1
+ * `formats` list, the remaining page options are renamed to their v1 fields,
+ * and the assembled object is passed to `scrapeOptions.parse`. Options that
+ * have no field in the parsed object (`atsv`, `disableJsDom`, `useFastMode`)
+ * are returned separately as internal options.
+ *
+ * @param pageOptions - Legacy page options.
+ * @param extractorOptions - Legacy extractor options; only used when its
+ *   `mode` includes "llm-extraction".
+ * @param timeout - Timeout forwarded unchanged into the v1 options.
+ * @returns The parsed v1 scrape options and the derived internal options.
+ */
+export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
+  return {
+    scrapeOptions: scrapeOptions.parse({
+      // Each entry is either a format name or null; nulls are filtered out
+      // below. Markdown defaults to on, every other optional format to off,
+      // and "links" is always requested.
+      formats: [
+        (pageOptions.includeMarkdown ?? true) ? "markdown" as const : null,
+        (pageOptions.includeHtml ?? false) ? "html" as const : null,
+        (pageOptions.includeRawHtml ?? false) ? "rawHtml" as const : null,
+        (pageOptions.screenshot ?? false) ? "screenshot" as const : null,
+        (pageOptions.fullPageScreenshot ?? false) ? "screenshot@fullPage" as const : null,
+        (extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction")) ? "extract" as const : null,
+        "links"
+      ].filter(x => x !== null),
+      waitFor: pageOptions.waitFor,
+      headers: pageOptions.headers,
+      // A single tag given as a string is wrapped into a one-element array.
+      includeTags: (typeof pageOptions.onlyIncludeTags === "string" ? [pageOptions.onlyIncludeTags] : pageOptions.onlyIncludeTags),
+      excludeTags: (typeof pageOptions.removeTags === "string" ? [pageOptions.removeTags] : pageOptions.removeTags),
+      onlyMainContent: pageOptions.onlyMainContent ?? false,
+      timeout: timeout,
+      parsePDF: pageOptions.parsePDF,
+      actions: pageOptions.actions,
+      location: pageOptions.geolocation,
+      skipTlsVerification: pageOptions.skipTlsVerification,
+      removeBase64Images: pageOptions.removeBase64Images,
+      // Same condition as the "extract" format entry above: extraction
+      // settings are only passed on for llm-extraction modes.
+      extract: extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction") ? {
+        systemPrompt: extractorOptions.extractionPrompt,
+        prompt: extractorOptions.userPrompt,
+        schema: extractorOptions.extractionSchema,
+      } : undefined,
+      mobile: pageOptions.mobile,
+    }),
+    internalOptions: {
+      atsv: pageOptions.atsv,
+      v0DisableJsDom: pageOptions.disableJsDom,
+      v0UseFastMode: pageOptions.useFastMode,
+    },
+    // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
+  }
+}
+
+/**
+ * Converts a full set of v0 options (page, extractor, timeout and crawler
+ * options) into v1 scrape options plus the merged internal options.
+ *
+ * @param pageOptions - Legacy page options, forwarded to `fromLegacyScrapeOptions`.
+ * @param extractorOptions - Legacy extractor options, or undefined if none were given.
+ * @param timeout - Legacy timeout, forwarded unchanged.
+ * @param crawlerOptions - Legacy crawler options; null/undefined is treated as `{}`.
+ * @returns The v1 scrape options and the internal options of both
+ *   conversions merged into one object; crawler-derived keys win on conflict.
+ */
+export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} {
+  const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
+  // fromLegacyCrawlerOptions reads properties straight off its argument, so a
+  // missing crawlerOptions object would otherwise throw a TypeError here.
+  const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions ?? {});
+  // i1 is a fresh object created by fromLegacyScrapeOptions, so merging into
+  // it in place does not touch any caller-owned state.
+  return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
+}
+
+/**
+ * Converts a v1 document into the v0 ("legacy") response shape.
+ *
+ * @param document - The v1 document to convert.
+ * @param internalOptions - When `v0CrawlOnlyUrls` is truthy, only the source
+ *   URL is returned instead of a full document.
+ * @returns Either `{ url }` (URL-only mode) or a v0 document whose field
+ *   names follow the legacy schema.
+ */
+export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } {
+  if (internalOptions.v0CrawlOnlyUrls) {
+    // `sourceURL` is optional on the v1 metadata; the `!` only silences the
+    // compiler and does not guarantee a value at runtime.
+    return { url: document.metadata.sourceURL! };
+  }
+
+  return {
+    // v0 exposes the markdown under both `content` and `markdown`. `markdown`
+    // is optional on the v1 document, so both may be undefined at runtime
+    // despite the non-null assertions.
+    content: document.markdown!,
+    markdown: document.markdown!,
+    html: document.html,
+    rawHtml: document.rawHtml,
+    linksOnPage: document.links,
+    llm_extraction: document.extract,
+    metadata: {
+      ...document.metadata,
+      // The v1 `error` / `statusCode` keys are blanked after the spread and
+      // their values re-exposed under the legacy `page*` names.
+      error: undefined,
+      statusCode: undefined,
+      pageError: document.metadata.error,
+      pageStatusCode: document.metadata.statusCode,
+      // The top-level v1 screenshot is placed inside the metadata here.
+      screenshot: document.screenshot,
+    },
+    actions: document.actions ,
+    warning: document.warning,
+  }
+}
--- a/apps/api/src/example.ts
+++ b/apps/api/src/example.ts
@ -1,19 +0,0 @@
-import { WebScraperDataProvider } from "./scraper/WebScraper";
-
-async function example() {
-  const example = new WebScraperDataProvider();
-
-  await example.setOptions({
-    jobId: "TEST",
-    mode: "crawl",
-    urls: ["https://mendable.ai"],
-    crawlerOptions: {},
-  });
-  const docs = await example.getDocuments(false);
-  docs.map((doc) => {
-    console.log(doc.metadata.sourceURL);
-  });
-  console.log(docs.length);
-}
-
-// example();
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@ -6,28 +6,24 @@ import bodyParser from "body-parser";
 import cors from "cors";
 import { getScrapeQueue } from "./services/queue-service";
 import { v0Router } from "./routes/v0";
-import { initSDK } from "@hyperdx/node-opentelemetry";
 import os from "os";
-import { Logger } from "./lib/logger";
+import { logger } from "./lib/logger";
 import { adminRouter } from "./routes/admin";
-import { ScrapeEvents } from "./lib/scrape-events";
 import http from 'node:http';
 import https from 'node:https';
 import CacheableLookup  from 'cacheable-lookup';
 import { v1Router } from "./routes/v1";
 import expressWs from "express-ws";
-import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
 import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
 import { ZodError } from "zod";
 import { v4 as uuidv4 } from "uuid";
-import dns from 'node:dns';

 const { createBullBoard } = require("@bull-board/api");
 const { BullAdapter } = require("@bull-board/api/bullAdapter");
 const { ExpressAdapter } = require("@bull-board/express");

 const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
-Logger.info(`Number of CPUs: ${numCPUs} available`);
+logger.info(`Number of CPUs: ${numCPUs} available`);

 const cacheable = new CacheableLookup()

@ -55,7 +51,6 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
  serverAdapter: serverAdapter,
 });

-
 app.use(
  `/admin/${process.env.BULL_AUTH_KEY}/queues`,
  serverAdapter.getRouter()
@ -78,15 +73,10 @@ app.use(adminRouter);
 const DEFAULT_PORT = process.env.PORT ?? 3002;
 const HOST = process.env.HOST ?? "localhost";

-// HyperDX OpenTelemetry
-if (process.env.ENV === "production") {
-  initSDK({ consoleCapture: true, additionalInstrumentations: [] });
-}
-
 function startServer(port = DEFAULT_PORT) {
  const server = app.listen(Number(port), HOST, () => {
-    Logger.info(`Worker ${process.pid} listening on port ${port}`);
-    Logger.info(
+    logger.info(`Worker ${process.pid} listening on port ${port}`);
+    logger.info(
      `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
    );
  });
@ -103,7 +93,6 @@ app.get(`/serverHealthCheck`, async (req, res) => {
    const [waitingJobs] = await Promise.all([
      scrapeQueue.getWaitingCount(),
    ]);
-
    const noWaitingJobs = waitingJobs === 0;
    // 200 if no active jobs, 503 if there are active jobs
    return res.status(noWaitingJobs ? 200 : 500).json({
@ -111,7 +100,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
    });
  } catch (error) {
    Sentry.captureException(error);
-    Logger.error(error);
+    logger.error(error);
    return res.status(500).json({ error: error.message });
  }
 });
@ -140,7 +129,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
            // Re-check the waiting jobs count after the timeout
            waitingJobsCount = await getWaitingJobsCount();
            if (waitingJobsCount >= treshold) {
-              const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
+              const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL!;
              const message = {
                text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
                  timeout / 60000
@ -156,14 +145,14 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
              });

              if (!response.ok) {
-                Logger.error("Failed to send Slack notification");
+                logger.error("Failed to send Slack notification");
              }
            }
          }, timeout);
        }
      } catch (error) {
        Sentry.captureException(error);
-        Logger.debug(error);
+        logger.debug(error);
      }
    };

@ -178,7 +167,7 @@ app.get("/is-production", (req, res) => {
 app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
  if (err instanceof ZodError) {
      if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) {
-        Logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
+        logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
      }

      res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
@ -206,11 +195,11 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
    }
  }

-  Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id  + " -- " + verbose);
+  logger.error("Error occurred in request! (" + req.path + ") -- ID " + id  + " -- " + verbose);
  res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
 });

-Logger.info(`Worker ${process.pid} started`);
+logger.info(`Worker ${process.pid} started`);

 // const sq = getScrapeQueue();

--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@ -4,19 +4,19 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation

 import { generateOpenAICompletions } from "./models";
 import { Document, ExtractorOptions } from "../entities";
-import { Logger } from "../logger";
+import { logger } from "../logger";

 // Generate completion using OpenAI
 export async function generateCompletions(
  documents: Document[],
-  extractionOptions: ExtractorOptions,
+  extractionOptions: ExtractorOptions | undefined,
  mode: "markdown" | "raw-html"
 ): Promise<Document[]> {
  // const schema = zodToJsonSchema(options.schema)

-  const schema = extractionOptions.extractionSchema;
-  const systemPrompt = extractionOptions.extractionPrompt;
-  const prompt = extractionOptions.userPrompt;
+  const schema = extractionOptions?.extractionSchema;
+  const systemPrompt = extractionOptions?.extractionPrompt;
+  const prompt = extractionOptions?.userPrompt;

  const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider

@ -51,7 +51,7 @@ export async function generateCompletions(

            return completionResult;
          } catch (error) {
-            Logger.error(`Error generating completions: ${error}`);
+            logger.error(`Error generating completions: ${error}`);
            throw error;
          }
        default:
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@ -95,7 +95,7 @@ export async function generateOpenAICompletions({

    try {
      llmExtraction = JSON.parse(
-        jsonCompletion.choices[0].message.content.trim()
+        (jsonCompletion.choices[0].message.content ?? "").trim()
      );
    } catch (e) {
      throw new Error("Invalid JSON");
--- a/apps/api/src/lib/batch-process.ts
+++ b/apps/api/src/lib/batch-process.ts
@ -3,7 +3,7 @@ export async function batchProcess<T>(
    batchSize: number,
    asyncFunction: (item: T, index: number) => Promise<void>
  ): Promise<void> {
-    const batches = [];
+    const batches: T[][] = [];
    for (let i = 0; i < array.length; i += batchSize) {
      const batch = array.slice(i, i + batchSize);
      batches.push(batch);
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@ -1,13 +1,16 @@
+import { InternalOptions } from "../scraper/scrapeURL";
+import { ScrapeOptions } from "../controllers/v1/types";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
 import { redisConnection } from "../services/queue-service";
-import { Logger } from "./logger";
+import { logger } from "./logger";

 export type StoredCrawl = {
    originUrl?: string;
    crawlerOptions: any;
-    pageOptions: any;
+    scrapeOptions: Omit<ScrapeOptions, "timeout">;
+    internalOptions: InternalOptions;
    team_id: string;
-    plan: string;
+    plan?: string;
    robots?: string;
    cancelled?: boolean;
    createdAt: number;
@ -100,7 +103,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
        urlO.hash = "";
        url = urlO.href;
    } catch (error) {
-        Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
+        logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
    }

    const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
@ -117,7 +120,7 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
            urlO.hash = "";
            return urlO.href;
        } catch (error) {
-            Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
+            logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
        }

        return url;
@ -131,7 +134,7 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
 export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
    const crawler = new WebCrawler({
        jobId: id,
-        initialUrl: sc.originUrl,
+        initialUrl: sc.originUrl!,
        includes: sc.crawlerOptions?.includes ?? [],
        excludes: sc.crawlerOptions?.excludes ?? [],
        maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@ -1,3 +1,5 @@
+import type { Document as V1Document } from "../controllers/v1/types";
+
 export interface Progress {
  current: number;
  total: number;
@ -129,7 +131,8 @@ export class Document {
  provider?: string;
  warning?: string;
  actions?: {
-    screenshots: string[];
+    screenshots?: string[];
+    scrapes?: ScrapeActionContent[];
  }

  index?: number;
--- a/apps/api/src/lib/html-to-markdown.ts
+++ b/apps/api/src/lib/html-to-markdown.ts
@ -5,7 +5,7 @@ import "../services/sentry"
 import * as Sentry from "@sentry/node";

 import dotenv from 'dotenv';
-import { Logger } from './logger';
+import { logger } from './logger';
 dotenv.config();

 // TODO: add a timeout to the Go parser
@ -40,7 +40,7 @@ class GoMarkdownConverter {
  }
 }

-export async function parseMarkdown(html: string): Promise<string> {
+export async function parseMarkdown(html: string | null | undefined): Promise<string> {
  if (!html) {
    return '';
  }
@ -52,12 +52,12 @@ export async function parseMarkdown(html: string): Promise<string> {

      markdownContent = processMultiLineLinks(markdownContent);
      markdownContent = removeSkipToContentLinks(markdownContent);
-      Logger.info(`HTML to Markdown conversion using Go parser successful`);
+      logger.info(`HTML to Markdown conversion using Go parser successful`);
      return markdownContent;
    }
  } catch (error) {
    Sentry.captureException(error);
-    Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
+    logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
  }

  // Fallback to TurndownService if Go parser fails or is not enabled
--- a/apps/api/src/lib/job-priority.ts
+++ b/apps/api/src/lib/job-priority.ts
@ -1,6 +1,6 @@
 import { redisConnection } from "../../src/services/queue-service";
 import { PlanType } from "../../src/types";
-import { Logger } from "./logger";
+import { logger } from "./logger";

 const SET_KEY_PREFIX = "limit_team_id:";
 export async function addJobPriority(team_id, job_id) {
@ -13,7 +13,7 @@ export async function addJobPriority(team_id, job_id) {
    // This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
    await redisConnection.expire(setKey, 60);
  } catch (e) {
-    Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
+    logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
  }
 }

@ -24,7 +24,7 @@ export async function deleteJobPriority(team_id, job_id) {
    // remove job_id from the set
    await redisConnection.srem(setKey, job_id);
  } catch (e) {
-    Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
+    logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
  }
 }

@ -33,7 +33,7 @@ export async function getJobPriority({
  team_id,
  basePriority = 10,
 }: {
-  plan: PlanType;
+  plan: PlanType | undefined;
  team_id: string;
  basePriority?: number;
 }): Promise<number> {
@ -95,7 +95,7 @@ export async function getJobPriority({
      );
    }
  } catch (e) {
-    Logger.error(
+    logger.error(
      `Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
    );
    return basePriority;
--- a/apps/api/src/lib/load-testing-example.ts
+++ b/apps/api/src/lib/load-testing-example.ts
@ -1,42 +0,0 @@
-// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
-
-// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
-
-// const scrapInBatches = async (
-//   urls: string[],
-//   batchSize: number,
-//   delayMs: number
-// ) => {
-//   let successCount = 0;
-//   let errorCount = 0;
-
-//   for (let i = 0; i < urls.length; i += batchSize) {
-//     const batch = urls
-//       .slice(i, i + batchSize)
-//       .map((url) => scrapWithFireEngine(url));
-//     try {
-//       const results = await Promise.all(batch);
-//       results.forEach((data, index) => {
-//         if (data.trim() === "") {
-//           errorCount++;
-//         } else {
-//           successCount++;
-//           console.log(
-//             `Scraping result ${i + index + 1}:`,
-//             data.trim().substring(0, 20) + "..."
-//           );
-//         }
-//       });
-//     } catch (error) {
-//       console.error("Error during scraping:", error);
-//     }
-//     await delay(delayMs);
-//   }
-
-//   console.log(`Total successful scrapes: ${successCount}`);
-//   console.log(`Total errored scrapes: ${errorCount}`);
-// };
-// function run() {
-//   const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
-//   scrapInBatches(urls, 10, 1000);
-// }
--- a/apps/api/src/lib/logger.ts
+++ b/apps/api/src/lib/logger.ts
@ -1,57 +1,82 @@
+import * as winston from "winston";
+
 import { configDotenv } from "dotenv";
+import Transport from "winston-transport";
 configDotenv();

-enum LogLevel {
-  NONE = 'NONE',    // No logs will be output.
-  ERROR = 'ERROR',  // For logging error messages that indicate a failure in a specific operation.
-  WARN = 'WARN',    // For logging potentially harmful situations that are not necessarily errors.
-  INFO = 'INFO',    // For logging informational messages that highlight the progress of the application.
-  DEBUG = 'DEBUG',  // For logging detailed information on the flow through the system, primarily used for debugging.
-  TRACE = 'TRACE'   // For logging more detailed information than the DEBUG level.
-}
-export class Logger {
-  static colors = {
-    ERROR: '\x1b[31m%s\x1b[0m', // Red
-    WARN: '\x1b[33m%s\x1b[0m',  // Yellow
-    INFO: '\x1b[34m%s\x1b[0m',  // Blue
-    DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
-    TRACE: '\x1b[35m%s\x1b[0m'  // Magenta
-  };
-
-  static log (message: string, level: LogLevel) {
-    const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE;
-    const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
-    const currentLevelIndex = levels.indexOf(logLevel);
-    const messageLevelIndex = levels.indexOf(level);
-
-    if (currentLevelIndex >= messageLevelIndex) {
-      const color = Logger.colors[level];
-      console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
-
-      // const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
-      // if (useDbAuthentication) {
-      // save to supabase? another place?
-      // supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
-      // }
+// Human-readable line format for console output:
+//   "<timestamp> <level> [<module>:<method>]: <message> <metadata JSON>"
+// The metadata JSON is appended only when the level string contains "error"
+// or "warn"; Error instances inside it are expanded into plain objects with
+// name/message/stack/cause so they survive JSON.stringify.
+// Reads info.metadata unconditionally, so it relies on the metadata() format
+// having run first.
+// NOTE(review): `includes` rather than `===` is presumably needed because
+// colorize() runs before this format and wraps the level in ANSI codes — confirm.
+const logFormat = winston.format.printf(info => 
+  `${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${info.level.includes("error") || info.level.includes("warn") ? JSON.stringify(
+    info.metadata,
+    (_, value) => {
+      if (value instanceof Error) {
+        return {
+          ...value,
+          name: value.name,
+          message: value.message,
+          stack: value.stack,
+          cause: value.cause,
+        }
+      } else {
+        return value;
       }
     }
-  static error(message: string | any) {
-    Logger.log(message, LogLevel.ERROR);
+  ) : ""}`
+)
+
+// Shared application logger.
+//
+// The level comes from LOGGING_LEVEL (lower-cased) and defaults to "debug".
+// NOTE(review): the removed Logger accepted NONE and TRACE for LOGGING_LEVEL;
+// "none"/"trace" do not appear to be among winston's default npm levels —
+// confirm how existing deployments that set those values behave.
+export const logger = winston.createLogger({
+  level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug",
+  format: winston.format.json({
+    // Error's name/message/stack are not enumerable, so JSON serialization
+    // would drop them; copy them out explicitly.
+    replacer(key, value) {
+      if (value instanceof Error) {
+        return {
+          ...value,
+          name: value.name,
+          message: value.message,
+          stack: value.stack,
+          cause: value.cause,
+        }
+      } else {
+        return value;
+      }
+    }
+  }),
+  transports: [
+    new winston.transports.Console({
+      format: winston.format.combine(
+        winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
+        // Everything except message/level/timestamp is gathered under
+        // `metadata`, which is what logFormat reads.
+        winston.format.metadata({ fillExcept: ["message", "level", "timestamp"] }),
+        // Colorized, human-readable lines are only added outside production,
+        // or in production when SENTRY_ENVIRONMENT is "dev"; otherwise no
+        // extra console formatting is applied here.
+        ...(((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || (process.env.ENV !== "production")) ? [winston.format.colorize(), logFormat] : []),
+      ),
+    }),
+  ],
+});
+
+// Constructor options for ArrayTransport: the standard transport stream
+// options plus the destination array and an optional scrape ID filter.
+export type ArrayTransportOptions = Transport.TransportStreamOptions & {
+  // Array that receives the log entries; it is stored by reference and
+  // appended to in place.
+  array: any[];
+  // When set, only entries whose `scrapeId` equals this value are stored.
+  scrapeId?: string;
+};
+
+/**
+ * Winston transport that collects log entries in a caller-supplied array.
+ *
+ * When `scrapeId` is set, only entries whose `scrapeId` field equals it are
+ * stored; otherwise every entry is stored.
+ */
+export class ArrayTransport extends Transport {
+  private array: any[];
+  private scrapeId?: string;
+
+  constructor(opts: ArrayTransportOptions) {
+    super(opts);
+    this.array = opts.array;
+    this.scrapeId = opts.scrapeId;
   }

-  static warn(message: string) {
-    Logger.log(message, LogLevel.WARN);
+  /**
+   * Handles one log entry.
+   *
+   * Emits "logged" asynchronously for every entry (including ones that are
+   * filtered out), appends the entry to the array when it passes the
+   * scrape ID filter, and always invokes `next`.
+   */
+  log(info, next) {
+    setImmediate(() => {
+      this.emit("logged", info);
+    });
+
+    // With a scrape ID configured, drop entries tagged with a different ID
+    // (or with none at all).
+    if (this.scrapeId !== undefined && info.scrapeId !== this.scrapeId) {
+      return next();
     }

-  static info(message: string) {
-    Logger.log(message, LogLevel.INFO);
-  }
+    this.array.push(info);

-  static debug(message: string) {
-    Logger.log(message, LogLevel.DEBUG);
-  }
-
-  static trace(message: string) {
-    Logger.log(message, LogLevel.TRACE);
+    next();
   }
 }
--- a/apps/api/src/lib/map-cosine.ts
+++ b/apps/api/src/lib/map-cosine.ts
@ -1,4 +1,4 @@
-import { Logger } from "./logger";
+import { logger } from "./logger";

 export function performCosineSimilarity(links: string[], searchQuery: string) {
  try {
@ -40,7 +40,7 @@ export function performCosineSimilarity(links: string[], searchQuery: string) {
    links = a.map((item) => item.link);
    return links;
  } catch (error) {
-    Logger.error(`Error performing cosine similarity: ${error}`);
+    logger.error(`Error performing cosine similarity: ${error}`);
    return links;
  }
 }
--- a/apps/api/src/lib/scrape-events.ts
+++ b/apps/api/src/lib/scrape-events.ts
@ -1,8 +1,8 @@
 import { Job } from "bullmq";
-import type { baseScrapers } from "../scraper/WebScraper/single_url";
 import { supabase_service as supabase } from "../services/supabase";
-import { Logger } from "./logger";
+import { logger } from "./logger";
 import { configDotenv } from "dotenv";
+import { Engine } from "../scraper/scrapeURL/engines";
 configDotenv();

 export type ScrapeErrorEvent = {
@ -15,7 +15,7 @@ export type ScrapeScrapeEvent = {
  type: "scrape",
  url: string,
  worker?: string,
-  method: (typeof baseScrapers)[number],
+  method: Engine,
  result: null | {
    success: boolean,
    response_code?: number,
@ -49,7 +49,7 @@ export class ScrapeEvents {
        }).select().single();
        return (result.data as any).id;
      } catch (error) {
-        // Logger.error(`Error inserting scrape event: ${error}`);
+        // logger.error(`Error inserting scrape event: ${error}`);
        return null;
      }
    }
@ -69,7 +69,7 @@ export class ScrapeEvents {
        }
      }).eq("id", logId);
    } catch (error) {
-      Logger.error(`Error updating scrape result: ${error}`);
+      logger.error(`Error updating scrape result: ${error}`);
    }
  }

@ -81,7 +81,7 @@ export class ScrapeEvents {
        worker: process.env.FLY_MACHINE_ID,
      });
    } catch (error) {
-      Logger.error(`Error logging job event: ${error}`);
+      logger.error(`Error logging job event: ${error}`);
    }
  }
 }
--- a/apps/api/src/lib/supabase-jobs.ts
+++ b/apps/api/src/lib/supabase-jobs.ts
@ -1,5 +1,5 @@
 import { supabase_service } from "../services/supabase";
-import { Logger } from "./logger";
+import { logger } from "./logger";
 import * as Sentry from "@sentry/node";

 /**
@ -37,7 +37,7 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
    .in("job_id", jobIds);

  if (error) {
-    Logger.error(`Error in supabaseGetJobsById: ${error}`);
+    logger.error(`Error in supabaseGetJobsById: ${error}`);
    Sentry.captureException(error);
    return [];
  }
@ -61,7 +61,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
    .eq("crawl_id", crawlId)

  if (error) {
-    Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
+    logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
    Sentry.captureException(error);
    return [];
  }
--- a/apps/api/src/lib/withAuth.ts
+++ b/apps/api/src/lib/withAuth.ts
@ -1,30 +1,25 @@
 import { AuthResponse } from "../../src/types";
-import { Logger } from "./logger";
+import { logger } from "./logger";
 import * as Sentry from "@sentry/node";
 import { configDotenv } from "dotenv";
 configDotenv();

 let warningCount = 0;

-export function withAuth<T extends AuthResponse, U extends any[]>(
-  originalFunction: (...args: U) => Promise<T>
+/**
+ * Wraps a function so that it is bypassed when database authentication is
+ * disabled.
+ *
+ * When USE_DB_AUTHENTICATION is not "true", the wrapped function is never
+ * called: a warning is logged (at most five times in total) and
+ * `mockSuccess` is returned instead. Otherwise the call is forwarded and any
+ * rejection propagates to the caller.
+ *
+ * @param originalFunction - Function to call when database authentication is enabled.
+ * @param mockSuccess - Value returned in place of a real result when it is disabled.
+ * @returns An async function with the same parameters and result type as `originalFunction`.
+ */
+export function withAuth<T, U extends any[]>(
+  originalFunction: (...args: U) => Promise<T>,
+  mockSuccess: T,
 ) {
  return async function (...args: U): Promise<T> {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
    if (!useDbAuthentication) {
      if (warningCount < 5) {
-        Logger.warn("You're bypassing authentication");
+        logger.warn("You're bypassing authentication");
        warningCount++;
      }
-      return { success: true } as T;
+      // Return the caller-supplied stand-in rather than a hard-coded
+      // `{ success: true }`, which is not guaranteed to satisfy T.
+      return mockSuccess;
    } else {
-      try {
      return await originalFunction(...args);
-      } catch (error) {
-        Sentry.captureException(error);
-        Logger.error(`Error in withAuth function: ${error}`);
-        return { success: false, error: error.message } as T;
-      }
    }
  };
 }
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@ -1,151 +1,127 @@
 import { Job } from "bullmq";
 import {
-  CrawlResult,
  WebScraperOptions,
  RunWebScraperParams,
  RunWebScraperResult,
 } from "../types";
-import { WebScraperDataProvider } from "../scraper/WebScraper";
-import { DocumentUrl, Progress } from "../lib/entities";
 import { billTeam } from "../services/billing/credit_billing";
-import { Document } from "../lib/entities";
+import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
-import { Logger } from "../lib/logger";
+import { logger } from "../lib/logger";
 import { ScrapeEvents } from "../lib/scrape-events";
 import { configDotenv } from "dotenv";
+import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
+import { Engine } from "../scraper/scrapeURL/engines";
 configDotenv();

+/**
+ * Runs the scrape described by a queue job and returns the result of
+ * runWebScraper.
+ *
+ * The job's data supplies the URL, mode, scrape options, internal options,
+ * team id and `is_scrape` flag (defaulting to false); the job id becomes the
+ * `bull_job_id` and the job's priority is forwarded as-is.
+ *
+ * @param job - Queue job to run; its `id` must be present.
+ * @param token - Not referenced by the current implementation.
+ */
 export async function startWebScraperPipeline({
  job,
  token,
 }: {
-  job: Job<WebScraperOptions>;
+  job: Job<WebScraperOptions> & { id: string };
  token: string;
 }) {
-  let partialDocs: Document[] = [];
  return (await runWebScraper({
    url: job.data.url,
    mode: job.data.mode,
-    crawlerOptions: job.data.crawlerOptions,
-    extractorOptions: job.data.extractorOptions,
-    pageOptions: {
-      ...job.data.pageOptions,
+    scrapeOptions: {
+      ...job.data.scrapeOptions,
      ...(job.data.crawl_id ? ({
-        includeRawHtml: true,
+        // Jobs that belong to a crawl always request "rawHtml" on top of the
+        // caller's formats. NOTE(review): presumably needed for link
+        // discovery downstream -- confirm against the crawl worker.
+        formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
      }): {}),
    },
-    inProgress: (progress) => {
-      Logger.debug(`🐂 Job in progress ${job.id}`);
-      if (progress.currentDocument) {
-        partialDocs.push(progress.currentDocument);
-        if (partialDocs.length > 50) {
-          partialDocs = partialDocs.slice(-50);
-        }
-        // job.updateProgress({ ...progress, partialDocs: partialDocs });
-      }
-    },
-    onSuccess: (result, mode) => {
-      Logger.debug(`🐂 Job completed ${job.id}`);
-      saveJob(job, result, token, mode);
-    },
-    onError: (error) => {
-      Logger.error(`🐂 Job failed ${job.id}`);
-      ScrapeEvents.logJobEvent(job, "failed");
-      job.moveToFailed(error, token, false);
-    },
+    internalOptions: job.data.internalOptions,
+    // onSuccess: (result, mode) => {
+    //   logger.debug(`🐂 Job completed ${job.id}`);
+    //   saveJob(job, result, token, mode);
+    // },
+    // onError: (error) => {
+    //   logger.error(`🐂 Job failed ${job.id}`);
+    //   ScrapeEvents.logJobEvent(job, "failed");
+    // },
    team_id: job.data.team_id,
    bull_job_id: job.id.toString(),
    priority: job.opts.priority,
    is_scrape: job.data.is_scrape ?? false,
-  })) as { success: boolean; message: string; docs: Document[] };
+  }));
 }

+/**
+ * Scrapes one URL via scrapeURL, bills the team when applicable, and records
+ * a scrape event for every engine that was attempted.
+ *
+ * Errors raised inside the scrape are caught and returned as an unsuccessful
+ * response (`success: false` plus the error) instead of being rethrown.
+ *
+ * Billing is skipped when `is_scrape` is true; otherwise 5 credits are billed
+ * if `scrapeOptions.extract` is set and 1 credit if it is not. A billing
+ * failure is only logged.
+ *
+ * @returns The scrapeURL response on success, or an unsuccessful response
+ *   carrying the caught error.
+ */
 export async function runWebScraper({
  url,
  mode,
-  crawlerOptions,
-  pageOptions,
-  extractorOptions,
-  inProgress,
-  onSuccess,
-  onError,
+  scrapeOptions,
+  internalOptions,
+  // onSuccess,
+  // onError,
  team_id,
  bull_job_id,
  priority,
  is_scrape=false,
-}: RunWebScraperParams): Promise<RunWebScraperResult> {
+}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
+  let response: ScrapeUrlResponse | undefined = undefined;
+  let engines: EngineResultsTracker = {};
  try {
-    const provider = new WebScraperDataProvider();
-    if (mode === "crawl") {
-      await provider.setOptions({
-        jobId: bull_job_id,
-        mode: mode,
-        urls: [url],
-        extractorOptions,
-        crawlerOptions: crawlerOptions,
-        pageOptions: pageOptions,
-        bullJobId: bull_job_id,
-        priority,
-      });
+    // Keys in internalOptions override the job priority when both are present.
+    response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
+    // An unsuccessful response is rethrown so that it is handled by the same
+    // catch block as an exception thrown by scrapeURL itself.
+    if (!response.success) {
+      if (response.error instanceof Error) {
+        throw response.error;
      } else {
-      await provider.setOptions({
-        jobId: bull_job_id,
-        mode: mode,
-        urls: url.split(","),
-        extractorOptions,
-        crawlerOptions: crawlerOptions,
-        pageOptions: pageOptions,
-        priority,
-        teamId: team_id
-      });
+        throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error));
      }
-    const docs = (await provider.getDocuments(false, (progress: Progress) => {
-      inProgress(progress);
-    })) as Document[];
-
-    if (docs.length === 0) {
-      return {
-        success: true,
-        message: "No pages found",
-        docs: [],
-      };
    }

-    // remove docs with empty content
-    const filteredDocs = crawlerOptions?.returnOnlyUrls
-      ? docs.map((doc) => {
-          if (doc.metadata.sourceURL) {
-            return { url: doc.metadata.sourceURL };
-          }
-        })
-      : docs;
-
    if(is_scrape === false) {
      let creditsToBeBilled = 1; // Assuming 1 credit per document
-      if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
+      if (scrapeOptions.extract) {
        creditsToBeBilled = 5;
      }

-      billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
-        Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
+      // Billing is not awaited; a failure is logged and does not affect the result.
+      billTeam(team_id, undefined, creditsToBeBilled).catch(error => {
+        logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
        // Optionally, you could notify an admin or add to a retry queue here
      });
    }

-    
-
    // This is where the returnvalue from the job is set
-    onSuccess(filteredDocs, mode);
+    // onSuccess(response.document, mode);

-    // this return doesn't matter too much for the job completion result
-    return { success: true, message: "", docs: filteredDocs };
+    engines = response.engines;
+    return response;
  } catch (error) {
-    onError(error);
-    return { success: false, message: error.message, docs: [] };
+    // Use the engine tracker from the response when there is one; otherwise
+    // fall back to a `results` property on the thrown error, if it has one.
+    engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));
+
+    if (response !== undefined) {
+      return {
+        ...response,
+        success: false,
+        error,
+      }
+    } else {
+      return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
+    }
+    // onError(error);
+  } finally {
+    // Runs on both the success and the failure path: insert one scrape event
+    // per tracked engine, in order of each engine's start time.
+    const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];
+
+    for (const engine of engineOrder) {
+      const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
+      ScrapeEvents.insert(bull_job_id, {
+        type: "scrape",
+        url,
+        method: engine,
+        result: {
+          success: result.state === "success",
+          response_code: (result.state === "success" ? result.result.statusCode : undefined),
+          response_size: (result.state === "success" ? result.result.html.length : undefined),
+          error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
+          time_taken: result.finishedAt - result.startedAt,
+        },
+      });
+    }
  }
 }

-const saveJob = async (job: Job, result: any, token: string, mode: string) => {
+const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
    if (useDbAuthentication) {
@ -173,6 +149,6 @@ const saveJob = async (job: Job, result: any, token: string, mode: string) => {
    }
    ScrapeEvents.logJobEvent(job, "completed");
  } catch (error) {
-    Logger.error(`🐂 Failed to update job status: ${error}`);
+    logger.error(`🐂 Failed to update job status: ${error}`);
  }
 };
--- a/apps/api/src/routes/admin.ts
+++ b/apps/api/src/routes/admin.ts
@ -6,8 +6,8 @@ import {
  cleanBefore24hCompleteJobsController,
  queuesController,
 } from "../controllers/v0/admin/queue";
-import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
 import { wrap } from "./v1";
+import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";

 export const adminRouter = express.Router();

--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@ -14,7 +14,7 @@ import expressWs from "express-ws";
 import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { crawlCancelController } from "../controllers/v1/crawl-cancel";
-import { Logger } from "../lib/logger";
+import { logger } from "../lib/logger";
 import { scrapeStatusController } from "../controllers/v1/scrape-status";
 import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
 import { batchScrapeController } from "../controllers/v1/batch-scrape";
@ -32,10 +32,12 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
            if (!minimum && req.body) {
                minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
            }
-            const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
+            const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum ?? 1);
+            if (chunk) {
                req.acuc = chunk;
+            }
            if (!success) {
-                Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
+                logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
                if (!res.headersSent) {
                    return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
                }
@ -50,20 +52,27 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
 export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
    return (req, res, next) => {
        (async () => {
-            const { success, team_id, error, status, plan, chunk } = await authenticateUser(
+            const auth = await authenticateUser(
                req,
                res,
                rateLimiterMode,
            );

-            if (!success) {
+            if (!auth.success) {
                if (!res.headersSent) {
-                    return res.status(status).json({ success: false, error });
+                    return res.status(auth.status).json({ success: false, error: auth.error });
+                } else {
+                    return;
                }
            }

+            const { team_id, plan, chunk } = auth;
+
            req.auth = { team_id, plan };
-            req.acuc = chunk;
+            req.acuc = chunk ?? undefined;
+            if (chunk) {
+                req.account = { remainingCredits: chunk.remaining_credits };
+            }
            next();
        })()
            .catch(err => next(err));
--- a/apps/api/src/scraper/WebScraper/tests/crawler.test.ts
+++ b/apps/api/src/scraper/WebScraper/tests/crawler.test.ts
@ -2,7 +2,6 @@
 import { WebCrawler } from '../crawler';
 import axios from 'axios';
 import robotsParser from 'robots-parser';
-import { getAdjustedMaxDepth } from '../utils/maxDepthUtils';

 jest.mock('axios');
 jest.mock('robots-parser');
@ -35,165 +34,6 @@ describe('WebCrawler', () => {
    });
  });

-  it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => {
-    const initialUrl = 'http://example.com'; // Set initial URL for this test
-    const enteredMaxCrawledDepth = 2;
-    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
-
-
-    crawler = new WebCrawler({
-      jobId: "TEST",
-      initialUrl: initialUrl,
-      includes: [],
-      excludes: [],
-      limit: 100,
-      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
-    });
-
-    // Mock sitemap fetching function to return controlled links
-    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
-      initialUrl, // depth 0
-      initialUrl + '/page1', // depth 1
-      initialUrl + '/page1/page2', // depth 2
-      initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
-    ]);
-
-    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
-    expect(results).toEqual([
-      { url: initialUrl, html: '' },
-      { url: initialUrl + '/page1', html: '' },
-      { url: initialUrl + '/page1/page2', html: '' }
-    ]);
-
-
-    // Ensure that the link with depth 3 is not included
-    expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false);
-  });
-
-  it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => {
-    const initialUrl = 'http://example.com'; // Set initial URL for this test
-    const enteredMaxCrawledDepth = 0;
-    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
-   
-
-    crawler = new WebCrawler({
-      jobId: "TEST",
-      initialUrl: initialUrl,
-      includes: [],
-      excludes: [],
-      limit: 100,
-      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
-    });
-
-    // Mock sitemap fetching function to return controlled links
-    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
-      initialUrl, // depth 0
-      initialUrl + '/page1', // depth 1
-      initialUrl + '/page1/page2', // depth 2
-      initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
-    ]);
-
-    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
-    expect(results).toEqual([
-      { url: initialUrl, html: '' },
-    ]);  
-  });
-
-  it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => {
-    const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
-    const enteredMaxCrawledDepth = 1;
-    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
-  
-
-    crawler = new WebCrawler({
-      jobId: "TEST",
-      initialUrl: initialUrl,
-      includes: [],
-      excludes: [],
-      limit: 100,
-      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
-    });
-
-    // Mock sitemap fetching function to return controlled links
-    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
-      initialUrl, // depth 0
-      initialUrl + '/page2', // depth 1
-      initialUrl + '/page2/page3', // depth 2
-      initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
-    ]);
-
-    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
-    expect(results).toEqual([
-      { url: initialUrl, html: '' },
-      { url: initialUrl + '/page2', html: '' }
-    ]);
-  });
-
-  it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 2 ', async () => {
-    const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
-    const enteredMaxCrawledDepth = 2;
-    maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
- 
-
-    crawler = new WebCrawler({
-      jobId: "TEST",
-      initialUrl: initialUrl,
-      includes: [],
-      excludes: [],
-      limit: 100,
-      maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
-    });
-
-    // Mock sitemap fetching function to return controlled links
-    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
-      initialUrl, // depth 0
-      initialUrl + '/page2', // depth 1
-      initialUrl + '/page2/page3', // depth 2
-      initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
-    ]);
-
-    const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
-    expect(results).toEqual([
-      { url: initialUrl, html: '' },
-      { url: initialUrl + '/page2', html: '' },
-      { url: initialUrl + '/page2/page3', html: '' }
-    ]);   
-  });
-
-  it('should handle allowBackwardCrawling option correctly', async () => {
-    const initialUrl = 'https://mendable.ai/blog';
-  
-    // Setup the crawler with the specific test case options
-    const crawler = new WebCrawler({
-      jobId: "TEST",
-      initialUrl: initialUrl,
-      includes: [],
-      excludes: [],
-      limit: 100,
-      maxCrawledDepth: 3, // Example depth
-      allowBackwardCrawling: true
-    });
-  
-    // Mock the sitemap fetching function to simulate backward crawling
-    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
-      initialUrl,
-      'https://mendable.ai', // backward link
-      initialUrl + '/page1',
-      initialUrl + '/page1/page2'
-    ]);
-  
-    const results = await crawler.start();
-    expect(results).toEqual([
-      { url: initialUrl, html: '' },
-      { url: 'https://mendable.ai', html: '' }, // Expect the backward link to be included
-      { url: initialUrl + '/page1', html: '' },
-      { url: initialUrl + '/page1/page2', html: '' }
-    ]);
-  
-    // Check that the backward link is included if allowBackwardCrawling is true
-    expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true);
-  });
-
  it('should respect the limit parameter by not returning more links than specified', async () => {
    const initialUrl = 'http://example.com';
    const limit = 2;  // Set a limit for the number of links
--- a/apps/api/src/scraper/WebScraper/tests/single_url.test.ts
+++ b/apps/api/src/scraper/WebScraper/tests/single_url.test.ts
@ -1,37 +0,0 @@
-import { scrapSingleUrl } from '../single_url';
-import { PageOptions } from '../../../lib/entities';
-
-
-jest.mock('../single_url', () => {
-  const originalModule = jest.requireActual('../single_url');
-  originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>');
-
-  return originalModule;
-});
-
-describe('scrapSingleUrl', () => {
-  it('should handle includeHtml option correctly', async () => {
-    const url = 'https://roastmywebsite.ai';
-    const pageOptionsWithHtml: PageOptions = { includeHtml: true };
-    const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
-
-    const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
-    const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
-
-    expect(resultWithHtml.html).toBeDefined();
-    expect(resultWithoutHtml.html).toBeUndefined();
-  }, 10000);
-});
-
-it('should return a list of links on the firecrawl.ai page', async () => {
-  const url = 'https://flutterbricks.com';
-  const pageOptions: PageOptions = { includeHtml: true };
-
-  const result = await scrapSingleUrl("TEST", url, pageOptions);
-
-  // Check if the result contains a list of links
-  expect(result.linksOnPage).toBeDefined();
-  expect(Array.isArray(result.linksOnPage)).toBe(true);
-  expect(result.linksOnPage.length).toBeGreaterThan(0);
-  expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
-}, 15000);
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@ -2,13 +2,10 @@ import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
-import async from "async";
-import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
-import { scrapSingleUrl } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../../src/lib/timeout";
-import { Logger } from "../../../src/lib/logger";
+import { logger } from "../../../src/lib/logger";
 import https from "https";
 export class WebCrawler {
  private jobId: string;
@ -73,7 +70,7 @@ export class WebCrawler {
        try {
          url = new URL(link.trim(), this.baseUrl);
        } catch (error) {
-          Logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
+          logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
          return false;
        }
        const path = url.pathname;
@ -132,7 +129,7 @@ export class WebCrawler {
        const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
        // Check if the link is disallowed by robots.txt
        if (!isAllowed) {
-          Logger.debug(`Link disallowed by robots.txt: ${link}`);
+          logger.debug(`Link disallowed by robots.txt: ${link}`);
          return false;
        }

@ -161,7 +158,7 @@ export class WebCrawler {
  }

  public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
-    Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
+    logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
    if (sitemapLinks.length > 0) {
      let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
@ -170,115 +167,6 @@ export class WebCrawler {
    return null;
  }

-  public async start(
-    inProgress?: (progress: Progress) => void,
-    pageOptions?: PageOptions,
-    crawlerOptions?: CrawlerOptions,
-    concurrencyLimit: number = 5,
-    limit: number = 10000,
-    maxDepth: number = 10
-  ): Promise<{ url: string, html: string }[]> {
-
-    Logger.debug(`Crawler starting with ${this.initialUrl}`);
-    // Fetch and parse robots.txt
-    try {
-      const txt = await this.getRobotsTxt();
-      this.importRobotsTxt(txt);
-      Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
-    } catch (error) {
-      Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
-    }
-
-    if (!crawlerOptions?.ignoreSitemap){
-      const sm = await this.tryGetSitemap();
-      if (sm !== null) {
-        return sm;
-      }
-    }
-
-    const urls = await this.crawlUrls(
-      [this.initialUrl],
-      pageOptions,
-      concurrencyLimit,
-      inProgress
-    );
-    
-    if (
-      urls.length === 0 &&
-      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
-    ) {
-      return [{ url: this.initialUrl, html: "" }];
-    }
-
-    // make sure to run include exclude here again
-    const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
-    return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
-  }
-
-  private async crawlUrls(
-    urls: string[],
-    pageOptions: PageOptions,
-    concurrencyLimit: number,
-    inProgress?: (progress: Progress) => void,
-  ): Promise<{ url: string, html: string }[]> {
-    const queue = async.queue(async (task: string, callback) => {
-      Logger.debug(`Crawling ${task}`);
-      if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
-        if (callback && typeof callback === "function") {
-          callback();
-        }
-        return;
-      }
-      const newUrls = await this.crawl(task, pageOptions);
-      // add the initial url if not already added
-      // if (this.visited.size === 1) {
-      //   let normalizedInitial = this.initialUrl;
-      //   if (!normalizedInitial.endsWith("/")) {
-      //     normalizedInitial = normalizedInitial + "/";
-      //   }
-      //   if (!newUrls.some(page => page.url === this.initialUrl)) {
-      //     newUrls.push({ url: this.initialUrl, html: "" });
-      //   }
-      // }
-
-      newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
-      
-      if (inProgress && newUrls.length > 0) {
-        inProgress({
-          current: this.crawledUrls.size,
-          total: Math.min(this.maxCrawledLinks, this.limit),
-          status: "SCRAPING",
-          currentDocumentUrl: newUrls[newUrls.length - 1].url,
-        });
-      } else if (inProgress) {
-        inProgress({
-          current: this.crawledUrls.size,
-          total: Math.min(this.maxCrawledLinks, this.limit),
-          status: "SCRAPING",
-          currentDocumentUrl: task,
-        });
-      }
-      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
-      if (callback && typeof callback === "function") {
-        callback();
-      }
-    }, concurrencyLimit);
-
-    Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
-    queue.push(
-      urls.filter(
-        (url) =>
-          !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
-      ),
-      (err) => {
-        if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
-      }
-    );
-    await queue.drain();
-    Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
-    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
-  }
-
  public filterURL(href: string, url: string): string | null {
    let fullUrl = href;
    if (!href.startsWith("http")) {
@ -346,79 +234,9 @@ export class WebCrawler {
    return links;
  }

-  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
-    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
-      return [];
-    }
-    this.visited.add(url);
-
-    if (!url.startsWith("http")) {
-      url = "https://" + url;
-    }
-    if (url.endsWith("/")) {
-      url = url.slice(0, -1);
-    }
-
-    if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
-      return [];
-    }
-
-    try {
-      let content: string = "";
-      let pageStatusCode: number;
-      let pageError: string | undefined = undefined;
-
-      // If it is the first link, fetch with single url
-      if (this.visited.size === 1) {
-        const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
-        content = page.html ?? "";
-        pageStatusCode = page.metadata?.pageStatusCode;
-        pageError = page.metadata?.pageError || undefined;
-      } else {
-        const response = await axios.get(url, { timeout: axiosTimeout });
-        content = response.data ?? "";
-        pageStatusCode = response.status;
-        pageError = response.statusText != "OK" ? response.statusText : undefined;
-      }
-
-      const $ = load(content);
-      let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
-
-      // Add the initial URL to the list of links
-      if (this.visited.size === 1) {
-        links.push({ url, html: content, pageStatusCode, pageError });
-      }
-
-      links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
-      
-      if (this.visited.size === 1) {
-        return links;
-      }
-
-      // Create a new list to return to avoid modifying the visited list
-      return links.filter((link) => !this.visited.has(link.url));
-    } catch (error) {
-      return [];
-    }
-  }
-
  private isRobotsAllowed(url: string): boolean {
    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
  }
-  private normalizeCrawlUrl(url: string): string {
-    try{
-      const urlObj = new URL(url);
-      urlObj.searchParams.sort(); // Sort query parameters to normalize
-      return urlObj.toString();
-    } catch (error) {
-      return url;
-    }
-  }
-
-  private matchesIncludes(url: string): boolean {
-    if (this.includes.length === 0 || this.includes[0] == "") return true;
-    return this.includes.some((pattern) => new RegExp(pattern).test(url));
-  }

  private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
    return this.excludes.some((pattern) => {
@ -503,7 +321,7 @@ export class WebCrawler {
      const urlWithoutQuery = url.split('?')[0].toLowerCase();
      return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
    } catch (error) {
-      Logger.error(`Error processing URL in isFile: ${error}`);
+      logger.error(`Error processing URL in isFile: ${error}`);
      return false;
    }
  }
@ -524,7 +342,6 @@ export class WebCrawler {
    return socialMediaOrEmail.some((ext) => url.includes(ext));
  }

-  // 
  private async tryFetchSitemapLinks(url: string): Promise<string[]> {
    const normalizeUrl = (url: string) => {
      url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
@ -546,7 +363,7 @@ export class WebCrawler {
        sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
      }
    } catch (error) { 
-      Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
+      logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
      if (error instanceof AxiosError && error.response?.status === 404) {
        // ignore 404
      } else {
@ -565,7 +382,7 @@ export class WebCrawler {
          sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
        }
      } catch (error) {
-        Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+        logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
        if (error instanceof AxiosError && error.response?.status === 404) {
          // ignore 404
        } else {
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@ -1,4 +1,4 @@
-import { Logger } from "../../../lib/logger";
+import { logger } from "../../../lib/logger";

 export async function handleCustomScraping(
  text: string,
@ -6,7 +6,7 @@ export async function handleCustomScraping(
 ): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
  // Check for Readme Docs special case
  if (text.includes('<meta name="readme-deploy"') && !url.includes('developers.notion.com')) {
-    Logger.debug(
+    logger.debug(
      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
    );
    return {
@ -21,7 +21,7 @@ export async function handleCustomScraping(

  // Check for Vanta security portals
  if (text.includes('<link href="https://static.vanta.com')) {
-    Logger.debug(
+    logger.debug(
      `Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
    );
    return {
@ -36,7 +36,7 @@ export async function handleCustomScraping(
  const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
  if (googleDriveMetaMatch) {
    const url = googleDriveMetaMatch[1];
-    Logger.debug(`Google Drive PDF link detected: ${url}`);
+    logger.debug(`Google Drive PDF link detected: ${url}`);

    const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
    if (fileIdMatch) {
--- a/apps/api/src/scraper/WebScraper/global.ts
+++ b/apps/api/src/scraper/WebScraper/global.ts
@ -1 +0,0 @@
-export const universalTimeout = 15000;
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -1,743 +0,0 @@
-import {
-  Document,
-  ExtractorOptions,
-  PageOptions,
-  WebScraperOptions,
-} from "../../lib/entities";
-import { Progress } from "../../lib/entities";
-import { scrapSingleUrl } from "./single_url";
-import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
-import { WebCrawler } from "./crawler";
-import { getValue, setValue } from "../../services/redis";
-import { getImageDescription } from "./utils/imageDescription";
-import { fetchAndProcessPdf } from "./utils/pdfProcessor";
-import {
-  replaceImgPathsWithAbsolutePaths,
-  replacePathsWithAbsolutePaths,
-} from "./utils/replacePaths";
-import { generateCompletions } from "../../lib/LLM-extraction";
-import { fetchAndProcessDocx } from "./utils/docxProcessor";
-import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
-import { Logger } from "../../lib/logger";
-import { ScrapeEvents } from "../../lib/scrape-events";
-
-export class WebScraperDataProvider {
-  private jobId: string;
-  private bullJobId: string;
-  private urls: string[] = [""];
-  private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
-  private includes: string | string[];
-  private excludes: string | string[];
-  private maxCrawledLinks: number;
-  private maxCrawledDepth: number = 10;
-  private returnOnlyUrls: boolean;
-  private limit: number = 10000;
-  private concurrentRequests: number = 20;
-  private generateImgAltText: boolean = false;
-  private ignoreSitemap: boolean = false;
-  private pageOptions?: PageOptions;
-  private extractorOptions?: ExtractorOptions;
-  private replaceAllPathsWithAbsolutePaths?: boolean = false;
-  private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
-    "gpt-4-turbo";
-  private crawlerMode: string = "default";
-  private allowBackwardCrawling: boolean = false;
-  private allowExternalContentLinks: boolean = false;
-  private priority?: number;
-  private teamId?: string;
-
-  authorize(): void {
-    throw new Error("Method not implemented.");
-  }
-
-  authorizeNango(): Promise<void> {
-    throw new Error("Method not implemented.");
-  }
-
-  private async convertUrlsToDocuments(
-    urls: string[],
-    inProgress?: (progress: Progress) => void,
-    allHtmls?: string[]
-  ): Promise<Document[]> {
-    const totalUrls = urls.length;
-    let processedUrls = 0;
-
-    const results: (Document | null)[] = new Array(urls.length).fill(null);
-    for (let i = 0; i < urls.length; i += this.concurrentRequests) {
-      const batchUrls = urls.slice(i, i + this.concurrentRequests);
-      await Promise.all(
-        batchUrls.map(async (url, index) => {
-          const existingHTML = allHtmls ? allHtmls[i + index] : "";
-          const result = await scrapSingleUrl(
-            this.jobId,
-            url,
-            this.pageOptions,
-            this.extractorOptions,
-            existingHTML,
-            this.priority,
-            this.teamId,
-          );
-          processedUrls++;
-          if (inProgress) {
-            inProgress({
-              current: processedUrls,
-              total: totalUrls,
-              status: "SCRAPING",
-              currentDocumentUrl: url,
-              currentDocument: { ...result, index: processedUrls },
-            });
-          }
-
-          results[i + index] = result;
-        })
-      );
-    }
-    return results.filter((result) => result !== null) as Document[];
-  }
-
-  async getDocuments(
-    useCaching: boolean = false,
-    inProgress?: (progress: Progress) => void
-  ): Promise<Document[]> {
-    this.validateInitialUrl();
-    if (!useCaching) {
-      return this.processDocumentsWithoutCache(inProgress);
-    }
-
-    return this.processDocumentsWithCache(inProgress);
-  }
-
-  private validateInitialUrl(): void {
-    if (this.urls[0].trim() === "") {
-      throw new Error("Url is required");
-    }
-  }
-
-  /**
-   * Process documents without cache handling each mode
-   * @param inProgress inProgress
-   * @returns documents
-   */
-  private async processDocumentsWithoutCache(
-    inProgress?: (progress: Progress) => void
-  ): Promise<Document[]> {
-    switch (this.mode) {
-      case "crawl":
-        return this.handleCrawlMode(inProgress);
-      case "single_urls":
-        return this.handleSingleUrlsMode(inProgress);
-      case "sitemap":
-        return this.handleSitemapMode(inProgress);
-      default:
-        return [];
-    }
-  }
-
-  private async cleanIrrelevantPath(links: string[]) {
-    return links.filter((link) => {
-      const normalizedInitialUrl = new URL(this.urls[0]);
-      const normalizedLink = new URL(link);
-
-      // Normalize the hostname to account for www and non-www versions
-      const initialHostname = normalizedInitialUrl.hostname.replace(
-        /^www\./,
-        ""
-      );
-      const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
-
-      // Ensure the protocol and hostname match, and the path starts with the initial URL's path
-      return (
-        linkHostname === initialHostname &&
-        normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
-      );
-    });
-  }
-
-  private async handleCrawlMode(
-    inProgress?: (progress: Progress) => void
-  ): Promise<Document[]> {
-    let includes: string[];
-    if (Array.isArray(this.includes)) {
-      if (this.includes[0] != "") {
-        includes = this.includes;
-      }
-    } else {
-      includes = this.includes.split(',');
-    }
-
-    let excludes: string[];
-    if (Array.isArray(this.excludes)) {
-      if (this.excludes[0] != "") {
-        excludes = this.excludes;
-      }
-    } else {
-      excludes = this.excludes.split(',');
-    }
-
-    const crawler = new WebCrawler({
-      jobId: this.jobId,
-      initialUrl: this.urls[0],
-      includes,
-      excludes,
-      maxCrawledLinks: this.maxCrawledLinks,
-      maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
-      limit: this.limit,
-      generateImgAltText: this.generateImgAltText,
-      allowBackwardCrawling: this.allowBackwardCrawling,
-      allowExternalContentLinks: this.allowExternalContentLinks,
-    });
-
-    let links = await crawler.start(
-      inProgress,
-      this.pageOptions,
-      {
-        ignoreSitemap: this.ignoreSitemap,
-      },
-      5,
-      this.limit,
-      this.maxCrawledDepth
-    );
-
-    let allLinks = links.map((e) => e.url);
-    const allHtmls = links.map((e) => e.html);
-
-    if (this.returnOnlyUrls) {
-      return this.returnOnlyUrlsResponse(allLinks, inProgress);
-    }
-
-    let documents = [];
-    // check if fast mode is enabled and there is html inside the links
-    if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
-      documents = await this.processLinks(allLinks, inProgress, allHtmls);
-    } else {
-      documents = await this.processLinks(allLinks, inProgress);
-    }
-
-    return this.cacheAndFinalizeDocuments(documents, allLinks);
-  }
-
-  private async handleSingleUrlsMode(
-    inProgress?: (progress: Progress) => void
-  ): Promise<Document[]> {
-    const links = this.urls;
-
-    let documents = await this.processLinks(links, inProgress);
-    return documents;
-  }
-
-  private async handleSitemapMode(
-    inProgress?: (progress: Progress) => void
-  ): Promise<Document[]> {
-    let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
-    links = await this.cleanIrrelevantPath(links);
-
-    if (this.returnOnlyUrls) {
-      return this.returnOnlyUrlsResponse(links, inProgress);
-    }
-
-    let documents = await this.processLinks(links, inProgress);
-    return this.cacheAndFinalizeDocuments(documents, links);
-  }
-
-  private async returnOnlyUrlsResponse(
-    links: string[],
-    inProgress?: (progress: Progress) => void
-  ): Promise<Document[]> {
-    inProgress?.({
-      current: links.length,
-      total: links.length,
-      status: "COMPLETED",
-      currentDocumentUrl: this.urls[0],
-    });
-    return links.map((url) => ({
-      content: "",
-      html: this.pageOptions?.includeHtml ? "" : undefined,
-      markdown: "",
-      metadata: { sourceURL: url, pageStatusCode: 200 },
-    }));
-  }
-
-  private async processLinks(
-    links: string[],
-    inProgress?: (progress: Progress) => void,
-    allHtmls?: string[]
-  ): Promise<Document[]> {
-    const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
-    const docLinks = links.filter(
-      (link) => link.endsWith(".doc") || link.endsWith(".docx")
-    );
-
-    const [pdfDocuments, docxDocuments] = await Promise.all([
-      this.fetchPdfDocuments(pdfLinks),
-      this.fetchDocxDocuments(docLinks),
-    ]);
-
-    links = links.filter(
-      (link) => !pdfLinks.includes(link) && !docLinks.includes(link)
-    );
-
-    let [documents, sitemapData] = await Promise.all([
-      this.convertUrlsToDocuments(links, inProgress, allHtmls),
-      this.mode === "single_urls" && links.length > 0
-        ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
-            (error) => {
-              Logger.debug(`Failed to fetch sitemap data: ${error}`);
-              return null;
-            }
-          )
-        : Promise.resolve(null),
-    ]);
-
-    if (this.mode === "single_urls" && documents.length > 0) {
-      documents[0].metadata.sitemap = sitemapData ?? undefined;
-    } else {
-      documents = await this.getSitemapData(this.urls[0], documents);
-    }
-
-    if (this.pageOptions.includeMarkdown) {
-      documents = this.applyPathReplacements(documents);
-    }
-
-    if (!this.pageOptions.includeHtml) {
-      for (let document of documents) {
-        delete document.html;
-      }
-    }
-    
-    // documents = await this.applyImgAltText(documents);
-    if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
-      const extractionMode = this.extractorOptions?.mode ?? "markdown";
-      const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
-
-      if (
-        extractionMode === "llm-extraction" ||
-        extractionMode === "llm-extraction-from-markdown" ||
-        extractionMode === "llm-extraction-from-raw-html"
-      ) {
-        documents = await generateCompletions(
-          documents,
-          this.extractorOptions,
-          completionMode
-        );
-      }
-    }
-    return documents.concat(pdfDocuments).concat(docxDocuments);
-  }
-
-  private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
-    return Promise.all(
-      pdfLinks.map(async (pdfLink) => {
-        const timer = Date.now();
-        const logInsertPromise = ScrapeEvents.insert(this.jobId, {
-          type: "scrape",
-          url: pdfLink,
-          worker: process.env.FLY_MACHINE_ID,
-          method: "pdf-scrape",
-          result: null,
-        });
-
-        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
-          pdfLink,
-          this.pageOptions.parsePDF
-        );
-
-        const insertedLogId = await logInsertPromise;
-        ScrapeEvents.updateScrapeResult(insertedLogId, {
-          response_size: content.length,
-          success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
-          error: pageError,
-          response_code: pageStatusCode,
-          time_taken: Date.now() - timer,
-        });
-        return {
-          content: content,
-          markdown: content,
-          metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
-          provider: "web-scraper",
-        };
-      })
-    );
-  }
-  private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
-    return Promise.all(
-      docxLinks.map(async (docxLink) => {
-        const timer = Date.now();
-        const logInsertPromise = ScrapeEvents.insert(this.jobId, {
-          type: "scrape",
-          url: docxLink,
-          worker: process.env.FLY_MACHINE_ID,
-          method: "docx-scrape",
-          result: null,
-        });
-
-        const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
-          docxLink
-        );
-
-        const insertedLogId = await logInsertPromise;
-        ScrapeEvents.updateScrapeResult(insertedLogId, {
-          response_size: content.length,
-          success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
-          error: pageError,
-          response_code: pageStatusCode,
-          time_taken: Date.now() - timer,
-        });
-
-        return {
-          content,
-          metadata: { sourceURL: docxLink, pageStatusCode, pageError },
-          provider: "web-scraper",
-        };
-      })
-    );
-  }
-
-  private applyPathReplacements(documents: Document[]): Document[] {
-    if (this.replaceAllPathsWithAbsolutePaths) {
-      documents = replacePathsWithAbsolutePaths(documents);
-    }
-    return replaceImgPathsWithAbsolutePaths(documents);
-  }
-
-  private async applyImgAltText(documents: Document[]): Promise<Document[]> {
-    return this.generateImgAltText
-      ? this.generatesImgAltText(documents)
-      : documents;
-  }
-
-  private async cacheAndFinalizeDocuments(
-    documents: Document[],
-    links: string[]
-  ): Promise<Document[]> {
-    // await this.setCachedDocuments(documents, links);
-    documents = this.removeChildLinks(documents);
-    return documents.splice(0, this.limit);
-  }
-
-  private async processDocumentsWithCache(
-    inProgress?: (progress: Progress) => void
-  ): Promise<Document[]> {
-    let documents = await this.getCachedDocuments(
-      this.urls.slice(0, this.limit)
-    );
-    if (documents.length < this.limit) {
-      const newDocuments: Document[] = await this.getDocuments(
-        false,
-        inProgress
-      );
-      documents = this.mergeNewDocuments(documents, newDocuments);
-    }
-    documents = this.filterDocsExcludeInclude(documents);
-    documents = this.filterDepth(documents);
-    documents = this.removeChildLinks(documents);
-    return documents.splice(0, this.limit);
-  }
-
-  private mergeNewDocuments(
-    existingDocuments: Document[],
-    newDocuments: Document[]
-  ): Document[] {
-    newDocuments.forEach((doc) => {
-      if (
-        !existingDocuments.some(
-          (d) =>
-            this.normalizeUrl(d.metadata.sourceURL) ===
-            this.normalizeUrl(doc.metadata?.sourceURL)
-        )
-      ) {
-        existingDocuments.push(doc);
-      }
-    });
-    return existingDocuments;
-  }
-
-  private filterDocsExcludeInclude(documents: Document[]): Document[] {
-    return documents.filter((document) => {
-      const url = new URL(document.metadata.sourceURL);
-      const path = url.pathname;
-
-      if (!Array.isArray(this.excludes)) {
-        this.excludes = this.excludes.split(',');
-      }
-
-      if (this.excludes.length > 0 && this.excludes[0] !== "") {
-        // Check if the link should be excluded
-        if (
-          this.excludes.some((excludePattern) =>
-            new RegExp(excludePattern).test(path)
-          )
-        ) {
-          return false;
-        }
-      }
-
-      if (!Array.isArray(this.includes)) {
-        this.includes = this.includes.split(',');
-      }
-
-      if (this.includes.length > 0 && this.includes[0] !== "") {
-        // Check if the link matches the include patterns, if any are specified
-        if (this.includes.length > 0) {
-          return this.includes.some((includePattern) =>
-            new RegExp(includePattern).test(path)
-          );
-        }
-      }
-      return true;
-    });
-  }
-
-  private normalizeUrl(url: string): string {
-    if (url.includes("//www.")) {
-      return url.replace("//www.", "//");
-    }
-    return url;
-  }
-
-  private removeChildLinks(documents: Document[]): Document[] {
-    for (let document of documents) {
-      if (document?.childrenLinks) delete document.childrenLinks;
-    }
-    return documents;
-  }
-
-  async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {
-    for (const document of documents) {
-      if (document.content.trim().length === 0) {
-        continue;
-      }
-      const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
-      await setValue(
-        "web-scraper-cache:" + normalizedUrl,
-        JSON.stringify({
-          ...document,
-          childrenLinks: childrenLinks || [],
-        }),
-        60 * 60
-      ); // 10 days
-    }
-  }
-
-  async getCachedDocuments(urls: string[]): Promise<Document[]> {
-    let documents: Document[] = [];
-    for (const url of urls) {
-      const normalizedUrl = this.normalizeUrl(url);
-      Logger.debug(
-        "Getting cached document for web-scraper-cache:" + normalizedUrl
-      );
-      const cachedDocumentString = await getValue(
-        "web-scraper-cache:" + normalizedUrl
-      );
-      if (cachedDocumentString) {
-        const cachedDocument = JSON.parse(cachedDocumentString);
-        documents.push(cachedDocument);
-
-        // get children documents
-        for (const childUrl of cachedDocument.childrenLinks || []) {
-          const normalizedChildUrl = this.normalizeUrl(childUrl);
-          const childCachedDocumentString = await getValue(
-            "web-scraper-cache:" + normalizedChildUrl
-          );
-          if (childCachedDocumentString) {
-            const childCachedDocument = JSON.parse(childCachedDocumentString);
-            if (
-              !documents.find(
-                (doc) =>
-                  doc.metadata.sourceURL ===
-                  childCachedDocument.metadata.sourceURL
-              )
-            ) {
-              documents.push(childCachedDocument);
-            }
-          }
-        }
-      }
-    }
-    return documents;
-  }
-
-  setOptions(options: WebScraperOptions): void {
-    if (!options.urls) {
-      throw new Error("Urls are required");
-    }
-
-    this.jobId = options.jobId;
-    this.bullJobId = options.bullJobId;
-    this.urls = options.urls;
-    this.mode = options.mode;
-    this.concurrentRequests = options.concurrentRequests ?? 20;
-    this.includes = options.crawlerOptions?.includes ?? [];
-    this.excludes = options.crawlerOptions?.excludes ?? [];
-    this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
-    this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
-    this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
-    this.limit = options.crawlerOptions?.limit ?? 10000;
-    this.generateImgAltText =
-      options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = {
-      onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
-      includeHtml: options.pageOptions?.includeHtml ?? false,
-      replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
-      parsePDF: options.pageOptions?.parsePDF ?? true,
-      onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
-      removeTags: options.pageOptions?.removeTags ?? [],
-      includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
-      includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
-      includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false, 
-      waitFor: options.pageOptions?.waitFor ?? undefined,
-      headers: options.pageOptions?.headers ?? undefined,
-      includeLinks: options.pageOptions?.includeLinks ?? true,
-      fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
-      screenshot: options.pageOptions?.screenshot ?? false,
-      useFastMode: options.pageOptions?.useFastMode ?? false,
-      disableJsDom: options.pageOptions?.disableJsDom ?? false,
-      atsv: options.pageOptions?.atsv ?? false,
-      actions: options.pageOptions?.actions ?? undefined,
-      geolocation: options.pageOptions?.geolocation ?? undefined,
-      skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
-      removeBase64Images: options.pageOptions?.removeBase64Images ?? true,
-      mobile: options.pageOptions?.mobile ?? false,
-    };
-    this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
-    this.replaceAllPathsWithAbsolutePaths =
-      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
-      options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
-      false;
-
-    if (typeof options.crawlerOptions?.excludes === 'string') {
-      this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
-    }
-
-    if (typeof options.crawlerOptions?.includes === 'string') {
-      this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
-    }
-
-    this.crawlerMode = options.crawlerOptions?.mode ?? "default";
-    this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
-    this.allowBackwardCrawling =
-      options.crawlerOptions?.allowBackwardCrawling ?? false;
-    this.allowExternalContentLinks =
-      options.crawlerOptions?.allowExternalContentLinks ?? false;
-    this.priority = options.priority;
-    this.teamId = options.teamId ?? null;
-
-
-
-    // make sure all urls start with https://
-    this.urls = this.urls.map((url) => {
-      if (!url.trim().startsWith("http")) {
-        return `https://${url}`;
-      }
-      return url;
-    });
-  }
-
-  private async getSitemapData(baseUrl: string, documents: Document[]) {
-    const sitemapData = await fetchSitemapData(baseUrl);
-    if (sitemapData) {
-      for (let i = 0; i < documents.length; i++) {
-        const docInSitemapData = sitemapData.find(
-          (data) =>
-            this.normalizeUrl(data.loc) ===
-            this.normalizeUrl(documents[i].metadata.sourceURL)
-        );
-        if (docInSitemapData) {
-          let sitemapDocData: Partial<SitemapEntry> = {};
-          if (docInSitemapData.changefreq) {
-            sitemapDocData.changefreq = docInSitemapData.changefreq;
-          }
-          if (docInSitemapData.priority) {
-            sitemapDocData.priority = Number(docInSitemapData.priority);
-          }
-          if (docInSitemapData.lastmod) {
-            sitemapDocData.lastmod = docInSitemapData.lastmod;
-          }
-          if (Object.keys(sitemapDocData).length !== 0) {
-            documents[i].metadata.sitemap = sitemapDocData;
-          }
-        }
-      }
-    }
-    return documents;
-  }
-  private async getSitemapDataForSingleUrl(
-    baseUrl: string,
-    url: string,
-    timeout?: number
-  ) {
-    const sitemapData = await fetchSitemapData(baseUrl, timeout);
-    if (sitemapData) {
-      const docInSitemapData = sitemapData.find(
-        (data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
-      );
-      if (docInSitemapData) {
-        let sitemapDocData: Partial<SitemapEntry> = {};
-        if (docInSitemapData.changefreq) {
-          sitemapDocData.changefreq = docInSitemapData.changefreq;
-        }
-        if (docInSitemapData.priority) {
-          sitemapDocData.priority = Number(docInSitemapData.priority);
-        }
-        if (docInSitemapData.lastmod) {
-          sitemapDocData.lastmod = docInSitemapData.lastmod;
-        }
-        if (Object.keys(sitemapDocData).length !== 0) {
-          return sitemapDocData;
-        }
-      }
-    }
-    return null;
-  }
-  generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
-    await Promise.all(
-      documents.map(async (document) => {
-        const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
-
-        await Promise.all(
-          images.map(async (image: string) => {
-            let imageUrl = image.match(/\(([^)]+)\)/)[1];
-            let altText = image.match(/\[(.*?)\]/)[1];
-
-            if (
-              !altText &&
-              !imageUrl.startsWith("data:image") &&
-              /\.(png|jpeg|gif|webp)$/.test(imageUrl)
-            ) {
-              const imageIndex = document.content.indexOf(image);
-              const contentLength = document.content.length;
-              let backText = document.content.substring(
-                imageIndex + image.length,
-                Math.min(imageIndex + image.length + 1000, contentLength)
-              );
-              let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
-              let frontText = document.content.substring(
-                frontTextStartIndex,
-                imageIndex
-              );
-              altText = await getImageDescription(
-                imageUrl,
-                backText,
-                frontText,
-                this.generateImgAltTextModel
-              );
-            }
-
-            document.content = document.content.replace(
-              image,
-              `![${altText}](${imageUrl})`
-            );
-          })
-        );
-      })
-    );
-
-    return documents;
-  };
-
-  filterDepth(documents: Document[]): Document[] {
-    return documents.filter((document) => {
-      const url = new URL(document.metadata.sourceURL);
-      return getURLDepth(url.toString()) <= this.maxCrawledDepth;
-    });
-  }
-}
--- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts
@ -1,89 +0,0 @@
-import axios from "axios";
-import { logScrape } from "../../../services/logging/scrape_log";
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-import { universalTimeout } from "../global";
-import { Logger } from "../../../lib/logger";
-
-/**
- * Scrapes a URL with Axios
- * @param url The URL to scrape
- * @param pageOptions The options for the page
- * @returns The scraped content
- */
-export async function scrapWithFetch(
-  url: string,
-  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
-  const logParams = {
-    url,
-    scraper: "fetch",
-    success: false,
-    response_code: null,
-    time_taken_seconds: null,
-    error_message: null,
-    html: "",
-    startTime: Date.now(),
-  };
-
-  try {
-    const response = await axios.get(url, {
-      headers: {
-        "Content-Type": "application/json",
-      },
-      timeout: universalTimeout,
-      transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
-    });
-
-    if (response.status !== 200) {
-      Logger.debug(
-        `⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`
-      );
-      logParams.error_message = response.statusText;
-      logParams.response_code = response.status;
-      return {
-        content: "",
-        pageStatusCode: response.status,
-        pageError: response.statusText,
-      };
-    }
-
-    const contentType = response.headers["content-type"];
-    if (contentType && contentType.includes("application/pdf")) {
-      logParams.success = true;
-      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
-        url,
-        pageOptions?.parsePDF
-      );
-      logParams.response_code = pageStatusCode;
-      logParams.error_message = pageError;
-      return { content, pageStatusCode: response.status, pageError };
-    } else {
-      const text = response.data;
-      logParams.success = true;
-      logParams.html = text;
-      logParams.response_code = response.status;
-      return {
-        content: text,
-        pageStatusCode: response.status,
-        pageError: null,
-      };
-    }
-  } catch (error) {
-    if (error.code === "ECONNABORTED") {
-      logParams.error_message = "Request timed out";
-      Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
-    } else {
-      logParams.error_message = error.message || error;
-      Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
-    }
-    return {
-      content: "",
-      pageStatusCode: error.response?.status ?? null,
-      pageError: logParams.error_message,
-    };
-  } finally {
-    const endTime = Date.now();
-    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
-    await logScrape(logParams);
-  }
-}
--- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
@ -1,230 +0,0 @@
-import axios from "axios";
-import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
-import { logScrape } from "../../../services/logging/scrape_log";
-import { generateRequestParams } from "../single_url";
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-import { universalTimeout } from "../global";
-import { Logger } from "../../../lib/logger";
-import * as Sentry from "@sentry/node";
-import axiosRetry from 'axios-retry';
-
-axiosRetry(axios, { retries: 3 , onRetry:()=>{
-  console.log("Retrying (fire-engine)...");
-}, retryDelay: axiosRetry.exponentialDelay});
-/**
- * Scrapes a URL with Fire-Engine
- * @param url The URL to scrape
- * @param waitFor The time to wait for the page to load
- * @param screenshot Whether to take a screenshot
- * @param fullPageScreenshot Whether to take a full page screenshot
- * @param pageOptions The options for the page
- * @param headers The headers to send with the request
- * @param options The options for the request
- * @returns The scraped content
- */
-export async function scrapWithFireEngine({
-  url,
-  actions,
-  waitFor = 0,
-  screenshot = false,
-  fullPageScreenshot = false,
-  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, removeBase64Images: true, mobile: false },
-  fireEngineOptions = {},
-  headers,
-  options,
-  priority,
-  teamId,
-}: {
-  url: string;
-  actions?: Action[];
-  waitFor?: number;
-  screenshot?: boolean;
-  fullPageScreenshot?: boolean;
-  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, removeBase64Images?: boolean, mobile?: boolean };
-  fireEngineOptions?: FireEngineOptions;
-  headers?: Record<string, string>;
-  options?: any;
-  priority?: number;
-  teamId?: string;
-}): Promise<FireEngineResponse> {
-  const logParams = {
-    url,
-    scraper: "fire-engine",
-    success: false,
-    response_code: null,
-    time_taken_seconds: null,
-    error_message: null,
-    html: "",
-    startTime: Date.now(),
-  };
-
-  try {
-    const reqParams = await generateRequestParams(url);
-    let waitParam = reqParams["params"]?.wait ?? waitFor;
-    let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine  ?? "chrome-cdp";
-    let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
-    let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
-    let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
-
-
-    let endpoint = "/scrape";
-
-    if(options?.endpoint === "request") {
-      endpoint = "/request";
-    }
-
-    let engine = engineParam; // do we want fireEngineOptions as first choice?
-
-    if (pageOptions?.useFastMode) {
-      fireEngineOptionsParam.engine = "tlsclient";
-      engine = "tlsclient";
-    }
-
-    Logger.info(
-      `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
-    );
-
-    // atsv is only available for beta customers
-    const betaCustomersString = process.env.BETA_CUSTOMERS;
-    const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
-
-    if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
-      fireEngineOptionsParam.atsv = true;
-    } else {
-      pageOptions.atsv = false;
-    }
-
-    const axiosInstance = axios.create({
-      headers: { "Content-Type": "application/json" }
-    });
-
-    const startTime = Date.now();
-    const _response = await Sentry.startSpan({
-      name: "Call to fire-engine"
-    }, async span => {
-      
-      return await axiosInstance.post(
-        process.env.FIRE_ENGINE_BETA_URL + endpoint,
-        {
-          url: url,
-          headers: headers,
-          wait: waitParam,
-          screenshot: screenshotParam,
-          fullPageScreenshot: fullPageScreenshotParam,
-          disableJsDom: pageOptions?.disableJsDom ?? false,
-          priority,
-          engine,
-          instantReturn: true,
-          mobile: pageOptions?.mobile ?? false,
-          ...fireEngineOptionsParam,
-          atsv: pageOptions?.atsv ?? false,
-          scrollXPaths: pageOptions?.scrollXPaths ?? [],
-          geolocation: pageOptions?.geolocation,
-          skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
-          removeBase64Images: pageOptions?.removeBase64Images ?? true,
-          actions: actions,
-        },
-        {
-          headers: {
-            "Content-Type": "application/json",
-            ...(Sentry.isInitialized() ? ({
-                "sentry-trace": Sentry.spanToTraceHeader(span),
-                "baggage": Sentry.spanToBaggageHeader(span),
-            }) : {}),
-          }
-        }
-      );
-    });
-
-    const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);
-
-    let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
-
-    // added 5 seconds to the timeout to account for 'smart wait'
-    while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal + 5000) {
-      await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
-      checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
-    }
-
-    if (checkStatusResponse.data.processing) {
-      Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
-      axiosInstance.delete(
-        process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`, {
-          validateStatus: (status) => true
-        }
-      ).catch((error) => {
-        Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);        
-      });
-      
-      Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
-      logParams.error_message = "Request timed out";
-      return { html: "", pageStatusCode: null, pageError: "" };
-    }
-
-    if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
-      Logger.debug(
-        `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}`
-      );
-      
-      logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
-      logParams.response_code = checkStatusResponse.data?.pageStatusCode;
-
-      if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
-        Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
-      }
-
-      const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;
-
-      return {
-        html: "",
-        pageStatusCode,
-        pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
-      };
-    }
-
-    const contentType = checkStatusResponse.data.responseHeaders?.["content-type"];
-
-    if (contentType && contentType.includes("application/pdf")) {
-      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
-        url,
-        pageOptions?.parsePDF
-      );
-      logParams.success = true;
-      logParams.response_code = pageStatusCode;
-      logParams.error_message = pageError;
-      return { html: content, pageStatusCode, pageError };
-    } else {
-      const data = checkStatusResponse.data;
-      
-      logParams.success =
-        (data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
-        data.pageStatusCode === 404;
-      logParams.html = data.content ?? "";
-      logParams.response_code = data.pageStatusCode;
-      logParams.error_message = data.pageError ?? data.error;
-
-      return {
-        html: data.content ?? "",
-        screenshots: data.screenshots ?? [data.screenshot] ?? [],
-        pageStatusCode: data.pageStatusCode,
-        pageError: data.pageError ?? data.error,
-        scrapeActionContent: data?.actionContent ?? [],
-      };
-    }
-  } catch (error) {
-    if (error.code === "ECONNABORTED") {
-      Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`);
-      logParams.error_message = "Request timed out";
-    } else {
-      Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`);
-      logParams.error_message = error.message || error;
-    }
-    return { html: "", pageStatusCode: null, pageError: logParams.error_message };
-  } finally {
-    const endTime = Date.now();
-    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
-    await logScrape(logParams, pageOptions);
-  }
-}
-
-
--- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts
@ -1,111 +0,0 @@
-import axios from "axios";
-import { logScrape } from "../../../services/logging/scrape_log";
-import { generateRequestParams } from "../single_url";
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-import { universalTimeout } from "../global";
-import { Logger } from "../../../lib/logger";
-
-/**
- * Scrapes a URL with Playwright
- * @param url The URL to scrape
- * @param waitFor The time to wait for the page to load
- * @param headers The headers to send with the request
- * @param pageOptions The options for the page
- * @returns The scraped content
- */
-export async function scrapWithPlaywright(
-  url: string,
-  waitFor: number = 0,
-  headers?: Record<string, string>,
-  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
-  const logParams = {
-    url,
-    scraper: "playwright",
-    success: false,
-    response_code: null,
-    time_taken_seconds: null,
-    error_message: null,
-    html: "",
-    startTime: Date.now(),
-  };
-
-  try {
-    const reqParams = await generateRequestParams(url);
-    // If the user has passed a wait parameter in the request, use that
-    const waitParam = reqParams["params"]?.wait ?? waitFor;
-
-    const response = await axios.post(
-      process.env.PLAYWRIGHT_MICROSERVICE_URL,
-      {
-        url: url,
-        wait_after_load: waitParam,
-        timeout: universalTimeout + waitParam,
-        headers: headers,
-      },
-      {
-        headers: {
-          "Content-Type": "application/json",
-        },
-        timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
-        transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
-      }
-    );
-
-    if (response.status !== 200) {
-      Logger.debug(
-        `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
-      );
-      logParams.error_message = response.data?.pageError;
-      logParams.response_code = response.data?.pageStatusCode;
-      return {
-        content: "",
-        pageStatusCode: response.data?.pageStatusCode,
-        pageError: response.data?.pageError,
-      };
-    }
-
-    const contentType = response.headers["content-type"];
-    if (contentType && contentType.includes("application/pdf")) {
-      logParams.success = true;
-      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
-      logParams.response_code = pageStatusCode;
-      logParams.error_message = pageError;
-      return { content, pageStatusCode, pageError };
-    } else {
-      const textData = response.data;
-      try {
-        const data = JSON.parse(textData);
-        const html = data.content;
-        logParams.success = true;
-        logParams.html = html;
-        logParams.response_code = data.pageStatusCode;
-        logParams.error_message = data.pageError;
-        return {
-          content: html ?? "",
-          pageStatusCode: data.pageStatusCode,
-          pageError: data.pageError,
-        };
-      } catch (jsonError) {
-        logParams.error_message = jsonError.message || jsonError;
-        Logger.debug(
-          `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
-        );
-        return { content: "", pageStatusCode: null, pageError: logParams.error_message };
-      }
-    }
-  } catch (error) {
-    if (error.code === "ECONNABORTED") {
-      logParams.error_message = "Request timed out";
-      Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
-    } else {
-      logParams.error_message = error.message || error;
-      Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
-    }
-    return { content: "", pageStatusCode: null, pageError: logParams.error_message };
-  } finally {
-    const endTime = Date.now();
-    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
-    await logScrape(logParams);
-  }
-}
--- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
@ -1,92 +0,0 @@
-import { logScrape } from "../../../services/logging/scrape_log";
-import { generateRequestParams } from "../single_url";
-import { fetchAndProcessPdf } from "../utils/pdfProcessor";
-import { universalTimeout } from "../global";
-import { ScrapingBeeClient } from "scrapingbee";
-import { Logger } from "../../../lib/logger";
-
-/**
- * Scrapes a URL with ScrapingBee
- * @param url The URL to scrape
- * @param wait_browser The browser event to wait for
- * @param timeout The timeout for the scrape
- * @param pageOptions The options for the page
- * @returns The scraped content
- */
-export async function scrapWithScrapingBee(
-    url: string,
-    wait_browser: string = "domcontentloaded",
-    timeout: number = universalTimeout,
-    pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-  ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
-    const logParams = {
-      url,
-      scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee",
-      success: false,
-      response_code: null,
-      time_taken_seconds: null,
-      error_message: null,
-      html: "",
-      startTime: Date.now(),
-    };
-    try {
-      const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
-      const clientParams = await generateRequestParams(
-        url,
-        wait_browser,
-        timeout
-      );
-      const response = await client.get({
-        ...clientParams,
-        params: {
-          ...clientParams.params,
-          transparent_status_code: "True",
-        },
-      });
-      Logger.info(
-        `⛏️ ScrapingBee: Scraping ${url}`
-      );
-      const contentType = response.headers["content-type"];
-      if (contentType && contentType.includes("application/pdf")) {
-        logParams.success = true;
-        const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
-        logParams.response_code = pageStatusCode;
-        logParams.error_message = pageError;
-        return { content, pageStatusCode, pageError };
-      } else {
-        let text = "";
-        try {
-          const decoder = new TextDecoder();
-          text = decoder.decode(response.data);
-          logParams.success = true;
-        } catch (decodeError) {
-          Logger.debug(
-            `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
-          );
-          logParams.error_message = decodeError.message || decodeError;
-        }
-        logParams.response_code = response.status;
-        logParams.html = text;
-        logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
-        logParams.error_message = response.statusText !== "OK" ? response.statusText : undefined;
-        return {
-          content: text,
-          pageStatusCode: response.status,
-          pageError: response.statusText !== "OK" ? response.statusText : undefined,
-        };
-      }
-    } catch (error) {
-      Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
-      logParams.error_message = error.message || error;
-      logParams.response_code = error.response?.status;
-      return {
-        content: "",
-        pageStatusCode: error.response?.status,
-        pageError: error.response?.statusText,
-      };
-    } finally {
-      const endTime = Date.now();
-      logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
-      await logScrape(logParams);
-    }
-  }
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -1,506 +0,0 @@
-import * as cheerio from "cheerio";
-import { extractMetadata } from "./utils/metadata";
-import dotenv from "dotenv";
-import {
-  Document,
-  PageOptions,
-  FireEngineResponse,
-  ExtractorOptions,
-  Action,
-} from "../../lib/entities";
-import { parseMarkdown } from "../../lib/html-to-markdown";
-import { urlSpecificParams } from "./utils/custom/website_params";
-import { fetchAndProcessPdf } from "./utils/pdfProcessor";
-import { handleCustomScraping } from "./custom/handleCustomScraping";
-import { removeUnwantedElements } from "./utils/removeUnwantedElements";
-import { scrapWithFetch } from "./scrapers/fetch";
-import { scrapWithFireEngine } from "./scrapers/fireEngine";
-import { scrapWithPlaywright } from "./scrapers/playwright";
-import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
-import { extractLinks } from "./utils/utils";
-import { Logger } from "../../lib/logger";
-import { ScrapeEvents } from "../../lib/scrape-events";
-import { clientSideError } from "../../strings";
-import { ScrapeActionContent } from "../../lib/entities";
-import { removeBase64Images } from "./utils/removeBase64Images";
-
-dotenv.config();
-
-const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
-const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
-
-export const baseScrapers = [
-  useFireEngine ? "fire-engine;chrome-cdp" : undefined,
-  useFireEngine ? "fire-engine" : undefined,
-  useScrapingBee ? "scrapingBee" : undefined,
-  useFireEngine ? undefined : "playwright",
-  useScrapingBee ? "scrapingBeeLoad" : undefined,
-  "fetch",
-].filter(Boolean);
-
-export async function generateRequestParams(
-  url: string,
-  wait_browser: string = "domcontentloaded",
-  timeout: number = 15000
-): Promise<any> {
-  const defaultParams = {
-    url: url,
-    params: { timeout: timeout, wait_browser: wait_browser },
-    headers: { "ScrapingService-Request": "TRUE" },
-  };
-
-  try {
-    const urlKey = new URL(url).hostname.replace(/^www\./, "");
-    if (urlSpecificParams.hasOwnProperty(urlKey)) {
-      return { ...defaultParams, ...urlSpecificParams[urlKey] };
-    } else {
-      return defaultParams;
-    }
-  } catch (error) {
-    Logger.error(`Error generating URL key: ${error}`);
-    return defaultParams;
-  }
-}
-
-/**
- * Get the order of scrapers to be used for scraping a URL
- * If the user doesn't have envs set for a specific scraper, it will be removed from the order.
- * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
- * @returns The order of scrapers to be used for scraping a URL
- */
-function getScrapingFallbackOrder(
-  defaultScraper?: string,
-  isWaitPresent: boolean = false,
-  isScreenshotPresent: boolean = false,
-  isHeadersPresent: boolean = false,
-  isActionsPresent: boolean = false,
-) {
-  if (isActionsPresent) {
-    return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
-  }
-
-  const availableScrapers = baseScrapers.filter((scraper) => {
-    switch (scraper) {
-      case "scrapingBee":
-      case "scrapingBeeLoad":
-        return !!process.env.SCRAPING_BEE_API_KEY;
-      case "fire-engine":
-        return !!process.env.FIRE_ENGINE_BETA_URL;
-      case "fire-engine;chrome-cdp":
-        return !!process.env.FIRE_ENGINE_BETA_URL;  
-      case "playwright":
-        return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
-      default:
-        return true;
-    }
-  });
-
-  let defaultOrder = [
-    useFireEngine ? "fire-engine;chrome-cdp" : undefined,
-    useFireEngine ? "fire-engine" : undefined,
-    useScrapingBee ? "scrapingBee" : undefined,
-    useScrapingBee ? "scrapingBeeLoad" : undefined,
-    useFireEngine ? undefined : "playwright",
-    "fetch",
-  ].filter(Boolean);
-
-  // if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
-  //   defaultOrder = [
-  //     "fire-engine",
-  //     useFireEngine ? undefined : "playwright",
-  //     ...defaultOrder.filter(
-  //       (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
-  //     ),
-  //   ].filter(Boolean);
-  // }
-
-  const filteredDefaultOrder = defaultOrder.filter(
-    (scraper: (typeof baseScrapers)[number]) =>
-      availableScrapers.includes(scraper)
-  );
-  const uniqueScrapers = new Set(
-    defaultScraper
-      ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
-      : [...filteredDefaultOrder, ...availableScrapers]
-  );
-
-  const scrapersInOrder = Array.from(uniqueScrapers);
-  return scrapersInOrder as (typeof baseScrapers)[number][];
-}
-
-
-
-export async function scrapSingleUrl(
-  jobId: string,
-  urlToScrap: string,
-  pageOptions: PageOptions,
-  extractorOptions?: ExtractorOptions,
-  existingHtml?: string,
-  priority?: number,
-  teamId?: string
-): Promise<Document> {
-  pageOptions = {
-    includeMarkdown: pageOptions.includeMarkdown ?? true,
-    includeExtract: pageOptions.includeExtract ?? false,
-    onlyMainContent: pageOptions.onlyMainContent ?? false,
-    includeHtml: pageOptions.includeHtml ?? false,
-    includeRawHtml: pageOptions.includeRawHtml ?? false,
-    waitFor: pageOptions.waitFor ?? undefined,
-    screenshot: pageOptions.screenshot ?? false,
-    fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
-    headers: pageOptions.headers ?? undefined,
-    includeLinks: pageOptions.includeLinks ?? true,
-    replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
-    parsePDF: pageOptions.parsePDF ?? true,
-    removeTags: pageOptions.removeTags ?? [],
-    onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
-    useFastMode: pageOptions.useFastMode ?? false,
-    disableJsDom: pageOptions.disableJsDom ?? false,
-    atsv: pageOptions.atsv ?? false,
-    actions: pageOptions.actions ?? undefined,
-    geolocation: pageOptions.geolocation ?? undefined,
-    skipTlsVerification: pageOptions.skipTlsVerification ?? false,
-    removeBase64Images: pageOptions.removeBase64Images ?? true,
-    mobile: pageOptions.mobile ?? false,
-  }
-
-  if (extractorOptions) {
-    extractorOptions = {
-      mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
-    }
-  }
-
-  if (!existingHtml) {
-    existingHtml = "";
-  }
-
-  urlToScrap = urlToScrap.trim();
-
-  const attemptScraping = async (
-    url: string,
-    method: (typeof baseScrapers)[number]
-  ) => {
-    let scraperResponse: {
-      text: string;
-      screenshot: string;
-      actions?: {
-        screenshots?: string[];
-        scrapes?: ScrapeActionContent[];
-      };
-      metadata: { pageStatusCode?: number; pageError?: string | null };
-    } = { text: "", screenshot: "", metadata: {} };
-    let screenshot = "";
-
-    const timer = Date.now();
-    const logInsertPromise = ScrapeEvents.insert(jobId, {
-      type: "scrape",
-      url,
-      worker: process.env.FLY_MACHINE_ID,
-      method,
-      result: null,
-    });
-
-    switch (method) {
-      case "fire-engine":
-      case "fire-engine;chrome-cdp":  
-
-        let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
-        if (method === "fire-engine;chrome-cdp") {
-          engine = "chrome-cdp";
-        }
-
-        if (process.env.FIRE_ENGINE_BETA_URL) {
-          const processedActions: Action[] = pageOptions.actions?.flatMap((action: Action, index: number, array: Action[]) => {
-            if (action.type === "click" || action.type === "write" || action.type === "press") {
-              const result: Action[] = [];
-              // Don't add a wait if the previous action is a wait
-              // if (index === 0 || array[index - 1].type !== "wait") {
-              //   result.push({ type: "wait", milliseconds: 1200 } as Action);
-              // }
-              // Fire-engine now handles wait times automatically, leaving the code here for now
-              result.push(action);
-              // Don't add a wait if the next action is a wait
-              // if (index === array.length - 1 || array[index + 1].type !== "wait") {
-              //   result.push({ type: "wait", milliseconds: 1200 } as Action);
-              // }
-              return result;
-            }
-            return [action as Action];
-          }) ?? [] as Action[];
-          
-          const response = await scrapWithFireEngine({
-            url,
-            ...(engine === "chrome-cdp" ? ({
-              actions: [
-                ...(pageOptions.waitFor ? [{
-                  type: "wait" as const,
-                  milliseconds: pageOptions.waitFor,
-                }] : []),
-                ...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
-                  type: "screenshot" as const,
-                  fullPage: !!pageOptions.fullPageScreenshot,
-                }] : []),
-                ...processedActions,
-              ],
-            }) : ({
-              waitFor: pageOptions.waitFor,
-              screenshot: pageOptions.screenshot,
-              fullPageScreenshot: pageOptions.fullPageScreenshot,
-            })),
-            pageOptions: pageOptions,
-            headers: pageOptions.headers,
-            fireEngineOptions: {
-              engine: engine,
-              atsv: pageOptions.atsv,
-              disableJsDom: pageOptions.disableJsDom,
-            },
-            priority,
-            teamId,
-          });
-          scraperResponse.text = response.html;
-          if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
-            scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
-          }
-          if (pageOptions.actions) {
-            scraperResponse.actions = {
-              screenshots: response.screenshots ?? [],
-              scrapes: response.scrapeActionContent ?? [],
-            };
-          }
-          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
-          scraperResponse.metadata.pageError = response.pageError;
-        }
-        break;
-      case "scrapingBee":
-        if (process.env.SCRAPING_BEE_API_KEY) {
-          const response = await scrapWithScrapingBee(
-            url,
-            "domcontentloaded",
-            pageOptions.fallback === false ? 7000 : 15000
-          );
-          scraperResponse.text = response.content;
-          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
-          scraperResponse.metadata.pageError = response.pageError;
-        }
-        break;
-      case "playwright":
-        if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
-          const response = await scrapWithPlaywright(
-            url,
-            pageOptions.waitFor,
-            pageOptions.headers
-          );
-          scraperResponse.text = response.content;
-          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
-          scraperResponse.metadata.pageError = response.pageError;
-        }
-        break;
-      case "scrapingBeeLoad":
-        if (process.env.SCRAPING_BEE_API_KEY) {
-          const response = await scrapWithScrapingBee(url, "networkidle2");
-          scraperResponse.text = response.content;
-          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
-          scraperResponse.metadata.pageError = response.pageError;
-        }
-        break;
-      case "fetch":
-        const response = await scrapWithFetch(url);
-        scraperResponse.text = response.content;
-        scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
-        scraperResponse.metadata.pageError = response.pageError;
-        break;
-    }
-
-    let customScrapedContent: FireEngineResponse | null = null;
-
-    // Check for custom scraping conditions
-    const customScraperResult = await handleCustomScraping(
-      scraperResponse.text,
-      url
-    );
-
-    if (customScraperResult) {
-      switch (customScraperResult.scraper) {
-        case "fire-engine":
-          customScrapedContent = await scrapWithFireEngine({
-            url: customScraperResult.url,
-            actions: customScraperResult.waitAfterLoad ? ([
-              {
-                type: "wait",
-                milliseconds: customScraperResult.waitAfterLoad,
-              }
-            ]) : ([]),
-            pageOptions: customScraperResult.pageOptions,
-          });
-          break;
-        case "pdf":
-          const { content, pageStatusCode, pageError } =
-            await fetchAndProcessPdf(
-              customScraperResult.url,
-              pageOptions?.parsePDF
-            );
-          customScrapedContent = {
-            html: content,
-            pageStatusCode,
-            pageError,
-          };
-          break;
-      }
-    }
-
-    if (customScrapedContent) {
-      scraperResponse.text = customScrapedContent.html;
-    }
-    //* TODO: add an optional to return markdown or structured/extracted content
-    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
-    let text = await parseMarkdown(cleanedHtml);
-    if (pageOptions.removeBase64Images) {
-      text = await removeBase64Images(text);
-    }
-
-    const insertedLogId = await logInsertPromise;
-    ScrapeEvents.updateScrapeResult(insertedLogId, {
-      response_size: scraperResponse.text.length,
-      success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
-      error: scraperResponse.metadata.pageError,
-      response_code: scraperResponse.metadata.pageStatusCode,
-      time_taken: Date.now() - timer,
-    });
-
-    return {
-      text,
-      html: cleanedHtml,
-      rawHtml: scraperResponse.text,
-      screenshot: scraperResponse.screenshot,
-      actions: scraperResponse.actions,
-      pageStatusCode: scraperResponse.metadata.pageStatusCode,
-      pageError: scraperResponse.metadata.pageError || undefined,
-    };
-  };
-
-  let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
-    text: "",
-    html: "",
-    rawHtml: "",
-    screenshot: "",
-    actions: undefined,
-    pageStatusCode: 200,
-    pageError: undefined,
-  };
-  try {
-    let urlKey = urlToScrap;
-    try {
-      urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
-    } catch (error) {
-      Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
-    }
-    const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
-    const scrapersInOrder = getScrapingFallbackOrder(
-      defaultScraper,
-      pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
-      pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
-      pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
-      pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
-    );
-
-    for (const scraper of scrapersInOrder) {
-      // If exists text coming from crawler, use it
-      if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) {
-        let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
-        text = await parseMarkdown(cleanedHtml);
-        html = cleanedHtml;
-        break;
-      }
-
-      const attempt = await attemptScraping(urlToScrap, scraper);
-      text = attempt.text ?? "";
-      html = attempt.html ?? "";
-      rawHtml = attempt.rawHtml ?? "";
-      screenshot = attempt.screenshot ?? "";
-      actions = attempt.actions ?? undefined;
-
-      if (attempt.pageStatusCode) {
-        pageStatusCode = attempt.pageStatusCode;
-      }
-      
-      if (attempt.pageError && (attempt.pageStatusCode >= 400 || scrapersInOrder.indexOf(scraper) === scrapersInOrder.length - 1)) { // force pageError if it's the last scraper and it failed too
-        pageError = attempt.pageError;
-        
-        if (attempt.pageStatusCode < 400 || !attempt.pageStatusCode) {
-          pageStatusCode = 500;
-        }
-      } else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
-        pageError = undefined;
-      }
-
-      if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
-        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
-        break;
-      }
-      if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) {
-        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`);
-        break;
-      }
-      // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
-      // if (nextScraperIndex < scrapersInOrder.length) {
-      //   Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
-      // }
-    }
-
-    if (!text) {
-      throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
-    }
-
-    const soup = cheerio.load(rawHtml);
-    const metadata = extractMetadata(soup, urlToScrap);
-
-    let linksOnPage: string[] | undefined;
-
-    if (pageOptions.includeLinks) {
-      linksOnPage = extractLinks(rawHtml, urlToScrap);
-    }
-
-    let document: Document = {
-      content: text,
-      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
-      html: pageOptions.includeHtml ? html : undefined,
-      rawHtml:
-        pageOptions.includeRawHtml ||
-          (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
-          ? rawHtml
-          : undefined,
-      linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
-      actions,
-      metadata: {
-        ...metadata,
-        ...(screenshot && screenshot.length > 0 ? ({
-          screenshot,
-        }) : {}),
-        sourceURL: urlToScrap,
-        pageStatusCode: pageStatusCode,
-        pageError: pageError,
-      },
-    };
-
-    return document;
-  } catch (error) {
-    Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
-    ScrapeEvents.insert(jobId, {
-      type: "error",
-      message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
-      stack: error.stack,
-    });
-
-    return {
-      content: "",
-      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
-      html: "",
-      linksOnPage: pageOptions.includeLinks ? [] : undefined,
-      metadata: {
-        sourceURL: urlToScrap,
-        pageStatusCode: pageStatusCode,
-        pageError: pageError,
-      },
-    } as Document;
-  }
-}
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@ -1,9 +1,10 @@
 import axios from "axios";
 import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
-import { scrapWithFireEngine } from "./scrapers/fireEngine";
 import { WebCrawler } from "./crawler";
-import { Logger } from "../../lib/logger";
+import { logger } from "../../lib/logger";
+import { scrapeURL } from "../scrapeURL";
+import { scrapeOptions } from "../../controllers/v1/types";

 export async function getLinksFromSitemap(
  {
@ -17,17 +18,20 @@ export async function getLinksFromSitemap(
  }
 ): Promise<string[]> {
  try {
-    let content: string;
+    let content: string = "";
    try {
      if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
        content = response.data;
      } else if (mode === 'fire-engine') {
-        const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"playwright" } });
-        content = response.html;
+        const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });;
+        if (!response.success) {
+          throw response.error;
+        }
+        content = response.document.rawHtml!;
      }
    } catch (error) {
-      Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
+      logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);

      return allUrls;
    }
@ -47,7 +51,7 @@ export async function getLinksFromSitemap(
      allUrls.push(...validUrls);
    }
  } catch (error) {
-    Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
+    logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
  }

  return allUrls;
--- a/apps/api/src/scraper/WebScraper/utils/tests/docxProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/tests/docxProcessor.test.ts
@ -1,15 +0,0 @@
-import * as docxProcessor from "../docxProcessor";
-
-describe("DOCX Processing Module - Integration Test", () => {
-  it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
-    delete process.env.LLAMAPARSE_API_KEY;
-    const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
-      "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
-    );
-    expect(content.trim()).toContain(
-      "SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
-    );
-    expect(pageStatusCode).toBe(200);
-    expect(pageError).toBeUndefined();
-  });
-});
--- a/apps/api/src/scraper/WebScraper/utils/tests/parseTable.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/tests/parseTable.test.ts
@ -1,128 +0,0 @@
-import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
-import cheerio from 'cheerio';
-
-describe('parseTablesToMarkdown', () => {
-  it('converts a simple HTML table to Markdown', async () => {
-    const html = `
-      <table>
-        <tr><th>Header 1</th><th>Header 2</th></tr>
-        <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
-        <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with a single row to Markdown', async () => {
-    const html = `
-      <table>
-        <tr><th>Header 1</th><th>Header 2</th></tr>
-        <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with a single column to Markdown', async () => {
-    const html = `
-      <table>
-        <tr><th>Header 1</th></tr>
-        <tr><td>Row 1 Col 1</td></tr>
-        <tr><td>Row 2 Col 1</td></tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with a single cell to Markdown', async () => {
-    const html = `
-      <table>
-        <tr><th>Header 1</th></tr>
-        <tr><td>Row 1 Col 1</td></tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no header to Markdown', async () => {
-    const html = `
-      <table>
-        <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
-        <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no rows to Markdown', async () => {
-    const html = `
-      <table>
-      </table>
-    `;
-    const expectedMarkdown = `<div></div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no cells to Markdown', async () => {
-    const html = `
-      <table>
-        <tr></tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div></div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no columns to Markdown', async () => {
-    const html = `
-      <table>
-        <tr><th></th></tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div></div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no table to Markdown', async () => {
-    const html = ``;
-    const expectedMarkdown = ``;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-it('converts a table inside of a bunch of html noise', async () => {
-  const html = `
-    <div>
-      <p>Some text before</p>
-      <table>
-        <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
-        <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
-      </table>
-      <p>Some text after</p>
-    </div>
-  `;
-  const expectedMarkdown = `<div>
-      <p>Some text before</p>
-      <div>| Row 1 Col 1 | Row 1 Col 2 |
-| Row 2 Col 1 | Row 2 Col 2 |</div>
-      <p>Some text after</p>
-    </div>`;
-    
-  const markdown = await parseTablesToMarkdown(html);
-  expect(markdown).toBe(expectedMarkdown);
-});
-
-});
--- a/apps/api/src/scraper/WebScraper/utils/tests/pdfProcessor.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/tests/pdfProcessor.test.ts
@ -1,19 +0,0 @@
-import * as pdfProcessor from '../pdfProcessor';
-
-describe('PDF Processing Module - Integration Test', () => {
-  it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
-    delete process.env.LLAMAPARSE_API_KEY;
-    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
-    expect(content.trim()).toEqual("Dummy PDF file");
-    expect(pageStatusCode).toEqual(200);
-    expect(pageError).toBeUndefined();
-  });
-
-  it('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
-    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/astro-ph/9301001.pdf', false);
-    expect(pageStatusCode).toBe(200);
-    expect(pageError).toBeUndefined();
-    expect(content).toContain('/Title(arXiv:astro-ph/9301001v1  7 Jan 1993)>>endobj');
-  }, 60000); // 60 seconds
-
-});
--- a/apps/api/src/scraper/WebScraper/utils/tests/removeUnwantedElements.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/tests/removeUnwantedElements.test.ts
--- a/apps/api/src/scraper/WebScraper/utils/tests/replacePaths.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/tests/replacePaths.test.ts
@ -1,127 +0,0 @@
-import { Document } from "../../../../lib/entities";
-import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths";
-
-describe('replacePaths', () => {
-  describe('replacePathsWithAbsolutePaths', () => {
-    it('should replace relative paths with absolute paths', () => {
-      const documents: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'This is a [link](/path/to/resource).',
-        markdown: 'This is a [link](/path/to/resource).'
-      }];
-
-      const expectedDocuments: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'This is a [link](https://example.com/path/to/resource).',
-        markdown: 'This is a [link](https://example.com/path/to/resource).'
-      }];
-
-      const result = replacePathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-
-    it('should not alter absolute URLs', () => {
-      const documents: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'This is an [external link](https://external.com/path).',
-        markdown: 'This is an [external link](https://external.com/path).'
-      }];
-
-      const result = replacePathsWithAbsolutePaths(documents);
-      expect(result).toEqual(documents); // Expect no change
-    });
-
-    it('should not alter data URLs for images', () => {
-      const documents: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).',
-        markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
-      }];
-
-      const result = replacePathsWithAbsolutePaths(documents);
-      expect(result).toEqual(documents); // Expect no change
-    });
-
-    it('should handle multiple links and images correctly', () => {
-      const documents: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'Here are two links: [link1](/path1) and [link2](/path2).',
-        markdown: 'Here are two links: [link1](/path1) and [link2](/path2).'
-      }];
-
-      const expectedDocuments: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).',
-        markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).'
-      }];
-
-      const result = replacePathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-
-    it('should correctly handle a mix of absolute and relative paths', () => {
-      const documents: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
-        markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
-      }];
-
-      const expectedDocuments: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
-        markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
-      }];
-
-      const result = replacePathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-    
-  });
-
-  describe('replaceImgPathsWithAbsolutePaths', () => {
-    it('should replace relative image paths with absolute paths', () => {
-      const documents: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'Here is an image: ![alt text](/path/to/image.jpg).',
-        markdown: 'Here is an image: ![alt text](/path/to/image.jpg).'
-      }];
-
-      const expectedDocuments: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).',
-        markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
-      }];
-
-      const result = replaceImgPathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-
-    it('should not alter data:image URLs', () => {
-      const documents: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).',
-        markdown: 'An image with a data URL: ![alt text](data:image/png;base4,ABC123==).'
-      }];
-
-      const result = replaceImgPathsWithAbsolutePaths(documents);
-      expect(result).toEqual(documents); // Expect no change
-    });
-
-    it('should handle multiple images with a mix of data and relative URLs', () => {
-      const documents: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).',
-        markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
-      }];
-
-      const expectedDocuments: Document[] = [{
-        metadata: { sourceURL: 'https://example.com' },
-        content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).',
-        markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
-      }];
-
-      const result = replaceImgPathsWithAbsolutePaths(documents);
-      expect(result).toEqual(expectedDocuments);
-    });
-  });
-});
--- a/apps/api/src/scraper/WebScraper/utils/tests/socialBlockList.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/tests/socialBlockList.test.ts
@ -1,66 +0,0 @@
-import { Logger } from '../../../../lib/logger';
-import { isUrlBlocked } from '../blocklist';
-
-describe('isUrlBlocked', () => {
-  it('should return true for blocked social media URLs', () => {
-    const blockedUrls = [
-      'https://www.facebook.com',
-      'https://twitter.com/someuser',
-      'https://instagram.com/someuser',
-      'https://www.linkedin.com/in/someuser',
-      'https://snapchat.com/someuser',
-      'https://tiktok.com/@someuser',
-      'https://reddit.com/r/somesubreddit',
-      'https://flickr.com/photos/someuser',
-      'https://whatsapp.com/someuser',
-      'https://wechat.com/someuser',
-      'https://telegram.org/someuser',
-    ];
-
-    blockedUrls.forEach(url => {
-      if (!isUrlBlocked(url)) {
-        Logger.debug(`URL not blocked: ${url}`);
-      }
-      expect(isUrlBlocked(url)).toBe(true);
-    });
-  });
-
-  it('should return false for URLs containing allowed keywords', () => {
-    const allowedUrls = [
-      'https://www.facebook.com/privacy',
-      'https://twitter.com/terms',
-      'https://instagram.com/legal',
-      'https://www.linkedin.com/help',
-      'https://pinterest.com/about',
-      'https://snapchat.com/support',
-      'https://tiktok.com/contact',
-      'https://reddit.com/user-agreement',
-      'https://tumblr.com/policy',
-      'https://flickr.com/blog',
-      'https://whatsapp.com/press',
-      'https://wechat.com/careers',
-      'https://telegram.org/conditions',
-      'https://wix.com/careers',
-    ];
-
-    allowedUrls.forEach(url => {
-      expect(isUrlBlocked(url)).toBe(false);
-    });
-  });
-
-  it('should return false for non-blocked URLs', () => {
-    const nonBlockedUrls = [
-      'https://www.example.com',
-      'https://www.somewebsite.org',
-      'https://subdomain.example.com',
-      'firecrawl.dev',
-      'amazon.com',
-      'wix.com',
-      'https://wix.com'
-    ];
-
-    nonBlockedUrls.forEach(url => {
-      expect(isUrlBlocked(url)).toBe(false);
-    });
-  });
-});
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@ -1,4 +1,4 @@
-import { Logger } from "../../../lib/logger";
+import { logger } from "../../../lib/logger";

 const socialMediaBlocklist = [
  'facebook.com',
@ -68,7 +68,7 @@ export function isUrlBlocked(url: string): boolean {
    return isBlocked;
  } catch (e) {
    // If an error occurs (e.g., invalid URL), return false
-    Logger.error(`Error parsing the following URL: ${url}`);
+    logger.error(`Error parsing the following URL: ${url}`);
    return false;
  }
 }
--- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
+++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
@ -1,198 +0,0 @@
-export const urlSpecificParams = {
-  
-  "support.greenpay.me":{
-    defaultScraper: "fire-engine",
-    params: {
-        wait_browser: "networkidle2",
-        block_resources: false,
-        wait: 2000,
-
-      },
-      headers: {
-        "User-Agent":
-          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-        "sec-fetch-site": "same-origin",
-        "sec-fetch-mode": "cors",
-        "sec-fetch-dest": "empty",
-        referer: "https://www.google.com/",
-        "accept-language": "en-US,en;q=0.9",
-        "accept-encoding": "gzip, deflate, br",
-        accept:
-          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-      },
-  },
-  "docs.pdw.co":{
-    defaultScraper: "fire-engine",
-    params: {
-        wait_browser: "networkidle2",
-        block_resources: false,
-        wait: 3000,
-      },
-      headers: {
-        "User-Agent":
-          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-        "sec-fetch-site": "same-origin",
-        "sec-fetch-mode": "cors",
-        "sec-fetch-dest": "empty",
-        referer: "https://www.google.com/",
-        "accept-language": "en-US,en;q=0.9",
-        "accept-encoding": "gzip, deflate, br",
-        accept:
-          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-      },
-  },
-  "developers.notion.com":{
-    defaultScraper: "fire-engine",
-    params: {
-        wait_browser: "networkidle2",
-        block_resources: false,
-        wait: 2000,
-      },
-      headers: {
-        "User-Agent":
-          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-        "sec-fetch-site": "same-origin",
-        "sec-fetch-mode": "cors",
-        "sec-fetch-dest": "empty",
-        referer: "https://www.google.com/",
-        "accept-language": "en-US,en;q=0.9",
-        "accept-encoding": "gzip, deflate, br",
-        accept:
-          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-      },
-  },
-  "docs2.hubitat.com":{
-    defaultScraper: "fire-engine",
-    params: {
-        wait_browser: "networkidle2",
-        block_resources: false,
-        wait: 2000,
-      },
-      headers: {
-        "User-Agent":
-          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-        "sec-fetch-site": "same-origin",
-        "sec-fetch-mode": "cors",
-        "sec-fetch-dest": "empty",
-        referer: "https://www.google.com/",
-        "accept-language": "en-US,en;q=0.9",
-        "accept-encoding": "gzip, deflate, br",
-        accept:
-          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-      },
-  },
-  "scrapethissite.com":{
-    defaultScraper: "fetch",
-    headers: {
-      "User-Agent":
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-      "sec-fetch-site": "same-origin",
-      "sec-fetch-mode": "cors",
-      "sec-fetch-dest": "empty",
-      referer: "https://www.google.com/",
-      "accept-language": "en-US,en;q=0.9",
-      "accept-encoding": "gzip, deflate, br",
-      accept:
-        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-    },
-  },
-  "rsseau.fr":{
-    defaultScraper: "fetch",
-    headers: {
-      "User-Agent":
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-      "sec-fetch-site": "same-origin",
-      "sec-fetch-mode": "cors",
-      "sec-fetch-dest": "empty",
-      referer: "https://www.google.com/",
-      "accept-language": "en-US,en;q=0.9",
-      "accept-encoding": "gzip, deflate, br",
-      accept:
-        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-    },
-  },
-  "help.salesforce.com":{
-    defaultScraper: "fire-engine",
-    params: {
-        wait_browser: "networkidle2",
-        block_resources: false,
-        wait: 2000,
-      },
-      headers: {
-        "User-Agent":
-          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-        "sec-fetch-site": "same-origin",
-        "sec-fetch-mode": "cors",
-        "sec-fetch-dest": "empty",
-        referer: "https://www.google.com/",
-        "accept-language": "en-US,en;q=0.9",
-        "accept-encoding": "gzip, deflate, br",
-        accept:
-          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-      },
-  },
-  "ir.veeva.com":{
-    defaultScraper: "fire-engine",
-  },
-  "eonhealth.com":{
-    defaultScraper: "fire-engine",
-    params:{
-      fireEngineOptions:{
-        mobileProxy: true,
-        method: "get",
-        engine: "request",
-      },
-    },
-  },
-  "notion.com":{
-    defaultScraper: "fire-engine",
-    params: {
-        wait_browser: "networkidle2",
-        block_resources: false,
-        wait: 2000,
-        engine: "playwright",
-      }
-  },
-  "developer.apple.com":{
-    defaultScraper: "fire-engine",
-    params:{
-      engine: "playwright",
-      wait: 2000,
-      fireEngineOptions: {
-        blockMedia: false,
-      }
-    },
-  },
-  "amazon.com":{
-    defaultScraper: "fire-engine",
-    params:{
-      fireEngineOptions:{
-        engine: "chrome-cdp",
-      },
-    },
-  },
-  "digikey.com":{
-    defaultScraper: "fire-engine",
-    params:{
-      fireEngineOptions:{
-        engine: "tlsclient",
-      },
-    },
-  },
-  "zoopla.co.uk":{
-    defaultScraper: "fire-engine",
-    params:{
-      fireEngineOptions:{
-        engine: "chrome-cdp",
-      },
-    },
-  },
-  "lorealparis.hu":{
-    defaultScraper: "fire-engine",
-    params:{
-      fireEngineOptions:{
-        engine: "tlsclient",
-      },
-    },
-  }
-};
--- a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
@ -1,79 +0,0 @@
-import axios from "axios";
-import fs from "fs";
-import { createWriteStream } from "node:fs";
-import path from "path";
-import os from "os";
-import mammoth from "mammoth";
-import { Logger } from "../../../lib/logger";
-
-export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
-  let tempFilePath = '';
-  let pageStatusCode = 200;
-  let pageError = '';
-  let content = '';
-
-  try {
-    const downloadResult = await downloadDocx(url);
-    tempFilePath = downloadResult.tempFilePath;
-    pageStatusCode = downloadResult.pageStatusCode;
-    pageError = downloadResult.pageError;
-    content = await processDocxToText(tempFilePath);
-  } catch (error) {
-    Logger.error(`Failed to fetch and process DOCX: ${error.message}`);
-    pageStatusCode = 500;
-    pageError = error.message;
-    content = '';
-  } finally {
-    if (tempFilePath) {
-      fs.unlinkSync(tempFilePath); // Clean up the temporary file
-    }
-  }
-
-  return { content, pageStatusCode, pageError };
-}
-
-async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
-  try {
-    const response = await axios({
-      url,
-      method: "GET",
-      responseType: "stream",
-    });
-
-    const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
-    const writer = createWriteStream(tempFilePath);
-
-    response.data.pipe(writer);
-
-    return new Promise((resolve, reject) => {
-      writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
-      writer.on("error", () => {
-        Logger.error('Failed to write DOCX file to disk');
-        reject(new Error('Failed to write DOCX file to disk'));
-      });
-    });
-  } catch (error) {
-    Logger.error(`Failed to download DOCX: ${error.message}`);
-    return { tempFilePath: "", pageStatusCode: 500, pageError: error.message };
-  }
-}
-
-export async function processDocxToText(filePath: string): Promise<string> {
-  try {
-    const content = await extractTextFromDocx(filePath);
-    return content;
-  } catch (error) {
-    Logger.error(`Failed to process DOCX to text: ${error.message}`);
-    return "";
-  }
-}
-
-async function extractTextFromDocx(filePath: string): Promise<string> {
-  try {
-    const result = await mammoth.extractRawText({ path: filePath });
-    return result.value;
-  } catch (error) {
-    Logger.error(`Failed to extract text from DOCX: ${error.message}`);
-    return "";
-  }
-}
--- a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts
+++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts
@ -1,42 +0,0 @@
-export const excludeNonMainTags = [
-  "header",
-  "footer",
-  "nav",
-  "aside",
-  ".top",
-  ".navbar",
-  ".footer",
-  ".bottom",
-  "#footer",
-  ".sidebar",
-  ".side",
-  ".aside",
-  "#sidebar",
-  ".modal",
-  ".popup",
-  "#modal",
-  ".overlay",
-  ".ad",
-  ".ads",
-  ".advert",
-  "#ad",
-  ".lang-selector",
-  ".language",
-  "#language-selector",
-  ".social",
-  ".social-media",
-  ".social-links",
-  "#social",
-  ".menu",
-  ".navigation",
-  "#nav",
-  ".breadcrumbs",
-  "#breadcrumbs",
-  "#search-form",
-  ".search",
-  "#search",
-  ".share",
-  "#share",
-  ".cookie",
-  "#cookie"
-];
--- a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts
+++ b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts
@ -1,89 +0,0 @@
-import Anthropic from '@anthropic-ai/sdk';
-import axios from 'axios';
-import { Logger } from '../../../lib/logger';
-
-export async function getImageDescription(
-  imageUrl: string,
-  backText: string,
-  frontText: string,
-  model: string = "gpt-4-turbo"
-): Promise<string> {
-  try {
-    const prompt = "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " +
-      backText +
-      " and the following text: " +
-      frontText +
-      ". Be super concise."
-
-    switch (model) {
-      case 'claude-3-opus': {
-        if (!process.env.ANTHROPIC_API_KEY) {
-          throw new Error("No Anthropic API key provided");
-        }
-        const imageRequest = await axios.get(imageUrl, { responseType: 'arraybuffer' });
-        const imageMediaType = 'image/png';
-        const imageData = Buffer.from(imageRequest.data, 'binary').toString('base64');
-
-        const anthropic = new Anthropic();
-        const response = await anthropic.messages.create({
-          model: "claude-3-opus-20240229",
-          max_tokens: 1024,
-          messages: [
-            {
-              role: "user",
-              content: [
-                {
-                  type: "image",
-                  source: {
-                    type: "base64",
-                    media_type: imageMediaType,
-                    data: imageData,
-                  },
-                },
-                {
-                  type: "text",
-                  text: prompt
-                }
-              ],
-            }
-          ]
-        });
-
-        return response[0].content.text;
-      }
-      default: {
-        if (!process.env.OPENAI_API_KEY) {
-          throw new Error("No OpenAI API key provided");
-        }
-
-        const { OpenAI } = require("openai");
-        const openai = new OpenAI();
-      
-        const response = await openai.chat.completions.create({
-          model: "gpt-4-turbo",
-          messages: [
-            {
-              role: "user",
-              content: [
-                {
-                  type: "text",
-                  text: prompt,
-                },
-                {
-                  type: "image_url",
-                  image_url: {
-                    url: imageUrl,
-                  },
-                },
-              ],
-            },
-          ],
-        });
-        return response.choices[0].message.content;
-      }
-    }
-  } catch (error) {
-    Logger.error(`Error generating image alt text: ${error}`);
-    return "";
-  }
-}
--- a/apps/api/src/scraper/WebScraper/utils/metadata.ts
+++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts
@ -1,185 +0,0 @@
-import { CheerioAPI } from "cheerio";
-import { Logger } from "../../../lib/logger";
-
-interface Metadata {
-  title?: string;
-  description?: string;
-  language?: string;
-  keywords?: string;
-  robots?: string;
-  ogTitle?: string;
-  ogDescription?: string;
-  ogUrl?: string;
-  ogImage?: string;
-  ogAudio?: string;
-  ogDeterminer?: string;
-  ogLocale?: string;
-  ogLocaleAlternate?: string[];
-  ogSiteName?: string;
-  ogVideo?: string;
-  dctermsCreated?: string;
-  dcDateCreated?: string;
-  dcDate?: string;
-  dctermsType?: string;
-  dcType?: string;
-  dctermsAudience?: string;
-  dctermsSubject?: string;
-  dcSubject?: string;
-  dcDescription?: string;
-  dctermsKeywords?: string;
-  modifiedTime?: string;
-  publishedTime?: string;
-  articleTag?: string;
-  articleSection?: string;
-  sourceURL?: string;
-  pageStatusCode?: number;
-  pageError?: string;
-  [key: string]: string | string[] | number | undefined;
-}
-
-export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
-  let title: string | null = null;
-  let description: string | null = null;
-  let language: string | null = null;
-  let keywords: string | null = null;
-  let robots: string | null = null;
-  let ogTitle: string | null = null;
-  let ogDescription: string | null = null;
-  let ogUrl: string | null = null;
-  let ogImage: string | null = null;
-  let ogAudio: string | null = null;
-  let ogDeterminer: string | null = null;
-  let ogLocale: string | null = null;
-  let ogLocaleAlternate: string[] | null = null;
-  let ogSiteName: string | null = null;
-  let ogVideo: string | null = null;
-  let dctermsCreated: string | null = null;
-  let dcDateCreated: string | null = null;
-  let dcDate: string | null = null;
-  let dctermsType: string | null = null;
-  let dcType: string | null = null;
-  let dctermsAudience: string | null = null;
-  let dctermsSubject: string | null = null;
-  let dcSubject: string | null = null;
-  let dcDescription: string | null = null;
-  let dctermsKeywords: string | null = null;
-  let modifiedTime: string | null = null;
-  let publishedTime: string | null = null;
-  let articleTag: string | null = null;
-  let articleSection: string | null = null;
-  let sourceURL: string | null = null;
-  let pageStatusCode: number | null = null;
-  let pageError: string | null = null;
-
-  const customMetadata: Record<string, string | string[]> = {};
-
-  try {
-    // TODO: remove this as it is redundant with the below implementation
-    title = soup("title").text() || null;
-    description = soup('meta[name="description"]').attr("content") || null;
-
-    language = soup("html").attr("lang") || null;
-
-    keywords = soup('meta[name="keywords"]').attr("content") || null;
-    robots = soup('meta[name="robots"]').attr("content") || null;
-    ogTitle = soup('meta[property="og:title"]').attr("content") || null;
-    ogDescription =
-      soup('meta[property="og:description"]').attr("content") || null;
-    ogUrl = soup('meta[property="og:url"]').attr("content") || null;
-    ogImage = soup('meta[property="og:image"]').attr("content") || null;
-    ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
-    ogDeterminer =
-      soup('meta[property="og:determiner"]').attr("content") || null;
-    ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
-    ogLocaleAlternate =
-      soup('meta[property="og:locale:alternate"]')
-        .map((i, el) => soup(el).attr("content"))
-        .get() || null;
-    ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
-    ogVideo = soup('meta[property="og:video"]').attr("content") || null;
-    articleSection =
-      soup('meta[name="article:section"]').attr("content") || null;
-    articleTag = soup('meta[name="article:tag"]').attr("content") || null;
-    publishedTime =
-      soup('meta[property="article:published_time"]').attr("content") || null;
-    modifiedTime =
-      soup('meta[property="article:modified_time"]').attr("content") || null;
-    dctermsKeywords =
-      soup('meta[name="dcterms.keywords"]').attr("content") || null;
-    dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
-    dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
-    dctermsSubject =
-      soup('meta[name="dcterms.subject"]').attr("content") || null;
-    dctermsAudience =
-      soup('meta[name="dcterms.audience"]').attr("content") || null;
-    dcType = soup('meta[name="dc.type"]').attr("content") || null;
-    dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null;
-    dcDate = soup('meta[name="dc.date"]').attr("content") || null;
-    dcDateCreated =
-      soup('meta[name="dc.date.created"]').attr("content") || null;
-    dctermsCreated =
-      soup('meta[name="dcterms.created"]').attr("content") || null;
-
-    try {
-      // Extract all meta tags for custom metadata
-      soup("meta").each((i, elem) => {
-        try {
-          const name = soup(elem).attr("name") || soup(elem).attr("property");
-          const content = soup(elem).attr("content");
-
-          if (name && content) {
-            if (customMetadata[name] === undefined) {
-              customMetadata[name] = content;
-            } else if (Array.isArray(customMetadata[name])) {
-              (customMetadata[name] as string[]).push(content);
-            } else {
-              customMetadata[name] = [customMetadata[name] as string, content];
-            }
-          }
-        } catch (error) {
-          Logger.error(`Error extracting custom metadata (in): ${error}`);
-        }
-      });
-    } catch (error) {
-      Logger.error(`Error extracting custom metadata: ${error}`);
-    }
-  } catch (error) {
-    Logger.error(`Error extracting metadata: ${error}`);
-  }
-
-  return {
-    ...(title ? { title } : {}),
-    ...(description ? { description } : {}),
-    ...(language ? { language } : {}),
-    ...(keywords ? { keywords } : {}),
-    ...(robots ? { robots } : {}),
-    ...(ogTitle ? { ogTitle } : {}),
-    ...(ogDescription ? { ogDescription } : {}),
-    ...(ogUrl ? { ogUrl } : {}),
-    ...(ogImage ? { ogImage } : {}),
-    ...(ogAudio ? { ogAudio } : {}),
-    ...(ogDeterminer ? { ogDeterminer } : {}),
-    ...(ogLocale ? { ogLocale } : {}),
-    ...(ogLocaleAlternate ? { ogLocaleAlternate } : {}),
-    ...(ogSiteName ? { ogSiteName } : {}),
-    ...(ogVideo ? { ogVideo } : {}),
-    ...(dctermsCreated ? { dctermsCreated } : {}),
-    ...(dcDateCreated ? { dcDateCreated } : {}),
-    ...(dcDate ? { dcDate } : {}),
-    ...(dctermsType ? { dctermsType } : {}),
-    ...(dcType ? { dcType } : {}),
-    ...(dctermsAudience ? { dctermsAudience } : {}),
-    ...(dctermsSubject ? { dctermsSubject } : {}),
-    ...(dcSubject ? { dcSubject } : {}),
-    ...(dcDescription ? { dcDescription } : {}),
-    ...(dctermsKeywords ? { dctermsKeywords } : {}),
-    ...(modifiedTime ? { modifiedTime } : {}),
-    ...(publishedTime ? { publishedTime } : {}),
-    ...(articleTag ? { articleTag } : {}),
-    ...(articleSection ? { articleSection } : {}),
-    ...(sourceURL ? { sourceURL } : {}),
-    ...(pageStatusCode ? { pageStatusCode } : {}),
-    ...(pageError ? { pageError } : {}),
-    ...customMetadata,
-  };
-}
--- a/apps/api/src/scraper/WebScraper/utils/parseTable.ts
+++ b/apps/api/src/scraper/WebScraper/utils/parseTable.ts
@ -1,74 +0,0 @@
-import cheerio, { CheerioAPI } from "cheerio";
-
-interface Replacement {
-  start: number;
-  end: number;
-  markdownTable: string;
-}
-
-export const parseTablesToMarkdown = async (html: string): Promise<string> => {
-  const soup: CheerioAPI = cheerio.load(html, {
-    xmlMode: true,
-    withStartIndices: true,
-    withEndIndices: true
-  });
-  let tables = soup("table");
-  let replacements: Replacement[] = [];
-
-  if (tables.length) {
-    tables.each((_, tableElement) => {
-      const start: number = tableElement.startIndex;
-      const end: number = tableElement.endIndex + 1; // Include the closing tag properly
-      let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement));
-      const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0;
-      if (isTableEmpty) {
-        markdownTable = '';
-      }
-      replacements.push({ start, end, markdownTable });
-    });
-  }
-
-  replacements.sort((a, b) => b.start - a.start);
-  
-  let modifiedHtml: string = html;
-  replacements.forEach(({ start, end, markdownTable }) => {
-    modifiedHtml = modifiedHtml.slice(0, start) + `<div>${markdownTable}</div>` + modifiedHtml.slice(end);
-  });
-
-  return modifiedHtml.trim();
-};
-
-export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
-  let rows: string[] = [];
-  let headerRowFound: boolean = false;
-  tableSoup("tr").each((i, tr) => {
-    const cells: string = tableSoup(tr).find("th, td").map((_, cell) => {
-      let cellText: string = tableSoup(cell).text().trim();
-      if (tableSoup(cell).is("th") && !headerRowFound) {
-        headerRowFound = true;
-      }
-      return ` ${cellText} |`;
-    }).get().join("");
-    if (cells) {
-      rows.push(`|${cells}`);
-    }
-    if (headerRowFound && i === 0) { // Header row
-      rows.push(createMarkdownDividerRow(tableSoup(tr).find("th, td").length));
-    }
-  });
-
-  return rows.join('\n').trim();
-};
-
-export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
-  const cells: string = rowSoup("td, th").map((_, cell) => {
-    let cellText: string = rowSoup(cell).text().trim();
-    return ` ${cellText} |`;
-  }).get().join("");
-
-  return `|${cells}`;
-};
-
-export function createMarkdownDividerRow(cellCount: number): string {
-  return '| ' + Array(cellCount).fill('---').join(' | ') + ' |';
-}
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@ -1,140 +0,0 @@
-import axios, { AxiosResponse } from "axios";
-import fs from "fs/promises";
-import { createReadStream, createWriteStream } from "node:fs";
-import FormData from "form-data";
-import dotenv from "dotenv";
-import pdf from "pdf-parse";
-import path from "path";
-import os from "os";
-import { axiosTimeout } from "../../../lib/timeout";
-import { Logger } from "../../../lib/logger";
-
-dotenv.config();
-
-export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
-  try {
-    const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
-    const content = await processPdfToText(tempFilePath, parsePDF);
-    await fs.unlink(tempFilePath); // Clean up the temporary file
-    return { content, pageStatusCode, pageError };
-  } catch (error) {
-    Logger.error(`Failed to fetch and process PDF: ${error.message}`);
-    return { content: "", pageStatusCode: 500, pageError: error.message };
-  }
-}
-
-async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
-  const response = await axios({
-    url,
-    method: "GET",
-    responseType: "stream",
-  });
-
-  const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
-  const writer = createWriteStream(tempFilePath);
-
-  response.data.pipe(writer);
-
-  return new Promise((resolve, reject) => {
-    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
-    writer.on("error", reject);
-  });
-}
-
-export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
-  let content = "";
-
-  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
-    Logger.debug("Processing pdf document w/ LlamaIndex");
-    const apiKey = process.env.LLAMAPARSE_API_KEY;
-    const headers = {
-      Authorization: `Bearer ${apiKey}`,
-    };
-    const base_url = "https://api.cloud.llamaindex.ai/api/parsing";
-    const fileType2 = "application/pdf";
-
-    try {
-      const formData = new FormData();
-      formData.append("file", createReadStream(filePath), {
-        filename: filePath,
-        contentType: fileType2,
-      });
-
-      const uploadUrl = `${base_url}/upload`;
-      const uploadResponse = await axios.post(uploadUrl, formData, {
-        headers: {
-          ...headers,
-          ...formData.getHeaders(),
-        },
-      });
-
-      const jobId = uploadResponse.data.id;
-      const resultType = "text";
-      const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`;
-
-      let resultResponse: AxiosResponse;
-      let attempt = 0;
-      const maxAttempts = 10; // Maximum number of attempts
-      let resultAvailable = false;
-      while (attempt < maxAttempts && !resultAvailable) {
-        try {
-          resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
-          if (resultResponse.status === 200) {
-            resultAvailable = true; // Exit condition met
-          } else {
-            // If the status code is not 200, increment the attempt counter and wait
-            attempt++;
-            await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
-          }
-        } catch (error) {
-          Logger.debug("Error fetching result w/ LlamaIndex");
-          attempt++;
-          if (attempt >= maxAttempts) {
-            Logger.error("Max attempts reached, unable to fetch result.");
-            break; // Exit the loop if max attempts are reached
-          }
-          await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
-          // You may want to handle specific errors differently
-        }
-      }
-
-      if (!resultAvailable) {
-        try {
-          content = await processPdf(filePath);
-        } catch (error) {
-          Logger.error(`Failed to process PDF: ${error}`);
-          content = "";
-        }
-      }
-      content = resultResponse.data[resultType];
-    } catch (error) {
-      Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
-      content = await processPdf(filePath);
-    }
-  } else if (parsePDF) {
-    try {
-      content = await processPdf(filePath);
-    } catch (error) {
-      Logger.error(`Failed to process PDF: ${error}`);
-      content = "";
-    }
-  } else {
-    try {
-      content = await fs.readFile(filePath, "utf-8");
-    } catch (error) {
-      Logger.error(`Failed to read PDF file: ${error}`);
-      content = "";
-    }
-  }
-  return content;
-}
-
-async function processPdf(file: string) {
-  try {
-    const fileContent = await fs.readFile(file);
-    const data = await pdf(fileContent);
-    return data.text;
-  } catch (error) {
-    throw error;
-  }
-}
--- a/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts
+++ b/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts
@ -1,82 +0,0 @@
-import { AnyNode, Cheerio, load } from "cheerio";
-import { PageOptions } from "../../../lib/entities";
-import { excludeNonMainTags } from "./excludeTags";
-
-export const removeUnwantedElements = (
-  html: string,
-  pageOptions: PageOptions,
-) => {
-  let soup = load(html);
-
-  if (
-    pageOptions.onlyIncludeTags &&
-    pageOptions.onlyIncludeTags.length > 0 &&
-    pageOptions.onlyIncludeTags[0] !== ""
-  ) {
-    if (typeof pageOptions.onlyIncludeTags === "string") {
-      pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
-    }
-    if (pageOptions.onlyIncludeTags.length !== 0) {
-      // Create a new root element to hold the tags to keep
-      const newRoot = load("<div></div>")("div");
-      pageOptions.onlyIncludeTags.forEach((tag) => {
-        soup(tag).each((index, element) => {
-          newRoot.append(soup(element).clone());
-        });
-      });
-
-      soup = load(newRoot.html());
-    }
-  }
-
-  soup("script, style, noscript, meta, head").remove();
-
-  if (
-    pageOptions.removeTags &&
-    pageOptions.removeTags.length > 0 &&
-    pageOptions.removeTags[0] !== ""
-  ) {
-    if (typeof pageOptions.removeTags === "string") {
-      pageOptions.removeTags = [pageOptions.removeTags];
-    }
-
-    if (Array.isArray(pageOptions.removeTags)) {
-      pageOptions.removeTags.forEach((tag) => {
-        let elementsToRemove: Cheerio<AnyNode>;
-        if (tag.startsWith("*") && tag.endsWith("*")) {
-          let classMatch = false;
-
-          const regexPattern = new RegExp(tag.slice(1, -1), "i");
-          elementsToRemove = soup("*").filter((i, element) => {
-            if (element.type === "tag") {
-              const attributes = element.attribs;
-              const tagNameMatches = regexPattern.test(element.name);
-              const attributesMatch = Object.keys(attributes).some((attr) =>
-                regexPattern.test(`${attr}="${attributes[attr]}"`),
-              );
-              if (tag.startsWith("*.")) {
-                classMatch = Object.keys(attributes).some((attr) =>
-                  regexPattern.test(`class="${attributes[attr]}"`),
-                );
-              }
-              return tagNameMatches || attributesMatch || classMatch;
-            }
-            return false;
-          });
-        } else {
-          elementsToRemove = soup(tag);
-        }
-        elementsToRemove.remove();
-      });
-    }
-  }
-
-  if (pageOptions.onlyMainContent) {
-    excludeNonMainTags.forEach((tag) => {
-      const elementsToRemove = soup(tag);
-      elementsToRemove.remove();
-    });
-  }
-  const cleanedHtml = soup.html();
-  return cleanedHtml;
-};
--- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts
+++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts
@ -1,85 +0,0 @@
-import { Logger } from "../../../lib/logger";
-import { Document } from "../../../lib/entities";
-
-export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
-  try {
-    documents.forEach((document) => {
-      const baseUrl = new URL(document.metadata.sourceURL).origin;
-      const paths =
-        document.content.match(
-          /!?\[.*?\]\(.*?\)|href=".+?"/g
-        ) || [];
-
-      paths.forEach((path: string) => {
-        try {
-          const isImage = path.startsWith("!");
-        let matchedUrl = path.match(/\((.*?)\)/) || path.match(/href="([^"]+)"/);
-        let url = matchedUrl[1];
-
-        if (!url.startsWith("data:") && !url.startsWith("http")) {
-          if (url.startsWith("/")) {
-            url = url.substring(1);
-          }
-          url = new URL(url, baseUrl).toString();
-        }
-
-        const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
-        // Image is handled afterwards
-        if (!isImage) {
-          document.content = document.content.replace(
-            path,
-            `${markdownLinkOrImageText}(${url})`
-          );
-          }
-        } catch (error) {
-          
-        }
-      });
-      document.markdown = document.content;
-    });
-
-    return documents;
-  } catch (error) {
-    Logger.debug(`Error replacing paths with absolute paths: ${error}`);
-    return documents;
-  }
-};
-
-export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
-  try {
-    documents.forEach((document) => {
-      const baseUrl = new URL(document.metadata.sourceURL).origin;
-      const images =
-        document.content.match(
-          /!\[.*?\]\(.*?\)/g
-        ) || [];
-
-      images.forEach((image: string) => {
-        let imageUrl = image.match(/\((.*?)\)/)[1];
-        let altText = image.match(/\[(.*?)\]/)[1];
-
-        if (!imageUrl.startsWith("data:image")) {
-          if (!imageUrl.startsWith("http")) {
-            if (imageUrl.startsWith("/")) {
-              imageUrl = imageUrl.substring(1);
-              imageUrl = new URL(imageUrl, baseUrl).toString();
-            } else {
-              imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
-            }
-          }
-        }
-
-        document.content = document.content.replace(
-          image,
-          `![${altText}](${imageUrl})`
-        );
-      });
-      document.markdown = document.content;
-    });
-
-    return documents;
-  } catch (error) {
-    Logger.error(`Error replacing img paths with absolute paths: ${error}`);
-    return documents;
-  }
-};
--- a/apps/api/src/scraper/WebScraper/utils/utils.ts
+++ b/apps/api/src/scraper/WebScraper/utils/utils.ts
@ -1,59 +0,0 @@
-import axios from "axios";
-import * as cheerio from "cheerio";
-import { Logger } from "../../../lib/logger";
-
-
-export async function attemptScrapWithRequests(
-  urlToScrap: string
-): Promise<string | null> {
-  try {
-    const response = await axios.get(urlToScrap, { timeout: 15000 });
-
-    if (!response.data) {
-      Logger.debug("Failed normal requests as well");
-      return null;
-    }
-
-    return response.data;
-  } catch (error) {
-    Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
-    return null;
-  }
-}
-
-export function sanitizeText(text: string): string {
-  return text.replace("\u0000", "");
-}
-
-export function extractLinks(html: string, baseUrl: string): string[] {
-  const $ = cheerio.load(html);
-  const links: string[] = [];
-
-  $('a').each((_, element) => {
-    const href = $(element).attr('href');
-    if (href) {
-      try {
-        if (href.startsWith('http://') || href.startsWith('https://')) {
-          // Absolute URL, add as is
-          links.push(href);
-        } else if (href.startsWith('/')) {
-          // Relative URL starting with '/', append to base URL
-          links.push(new URL(href, baseUrl).href);
-        } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
-          // Relative URL not starting with '/', append to base URL
-          links.push(new URL(href, baseUrl).href);
-        } else if (href.startsWith('mailto:')) {
-          // mailto: links, add as is
-          links.push(href);
-        }
-        // Fragment-only links (#) are ignored
-      } catch (error) {
-        // Log the error and continue
-        console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
-      }
-    }
-  });
-
-  // Remove duplicates and return
-  return [...new Set(links)];
-}
--- a/apps/api/src/scraper/scrapeURL/README.md
+++ b/apps/api/src/scraper/scrapeURL/README.md
@ -0,0 +1,25 @@
+# `scrapeURL`
+New URL scraper for Firecrawl
+
+## Signal flow
+```mermaid
+flowchart TD;
+    scrapeURL-.->buildFallbackList;
+    buildFallbackList-.->scrapeURLWithEngine;
+    scrapeURLWithEngine-.->parseMarkdown;
+    parseMarkdown-.->wasScrapeSuccessful{{Was scrape successful?}};
+    wasScrapeSuccessful-."No".->areEnginesLeft{{Are there engines left to try?}};
+    areEnginesLeft-."Yes, try next engine".->scrapeURLWithEngine;
+    areEnginesLeft-."No".->NoEnginesLeftError[/NoEnginesLeftError/]
+    wasScrapeSuccessful-."Yes".->asd;
+```
+
+## Differences from `WebScraperDataProvider`
+ - The job of `WebScraperDataProvider.validateInitialUrl` has been delegated to the zod layer above `scrapeURL`.
+ - `WebScraperDataProvider.mode` has no equivalent, only `scrape_url` is supported.
+ - You may no longer specify multiple URLs.
+ - Built on `v1` definitions, instead of `v0`.
+ - PDFs are now converted straight to markdown using LlamaParse, instead of converting to just plaintext.
+ - DOCXs are now converted straight to HTML (and then later to markdown) using mammoth, instead of converting to just plaintext.
+ - Uses the new JSON Schema OpenAI API -- schema failures with LLM Extract will be basically non-existent.
+        
--- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
@ -0,0 +1,15 @@
+import { Meta } from "../..";
+import { EngineScrapeResult } from "..";
+import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
+import mammoth from "mammoth";
+
+export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> { // Engine: fetch the document via downloadFile(meta.id, meta.url) and return it converted to HTML.
+    const { response, tempFilePath } = await downloadFile(meta.id, meta.url); // NOTE(review): tempFilePath is not removed anywhere in this function -- confirm cleanup happens elsewhere.
+
+    return {
+        url: response.url,
+        statusCode: response.status,
+
+        html: (await mammoth.convertToHtml({ path: tempFilePath })).value, // mammoth reads the downloaded file from disk; only `.value` of its result is kept.
+    }
+}
--- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
@ -0,0 +1,28 @@
+import { EngineScrapeResult } from "..";
+import { Meta } from "../..";
+import { TimeoutError } from "../../error";
+import { specialtyScrapeCheck } from "../utils/specialtyHandler";
+
+export async function scrapeURLWithFetch(meta: Meta): Promise<EngineScrapeResult> { // Engine: plain fetch() of meta.url, raced against a fixed timeout.
+    const timeout = 20000; // milliseconds (handed to setTimeout below)
+
+    const response = await Promise.race([ // whichever arm settles first wins: the fetch or the timeout rejection
+        fetch(meta.url, {
+            redirect: "follow",
+            headers: meta.options.headers,
+        }),
+        (async () => { // timeout arm: never resolves with a value, only rejects with TimeoutError after `timeout` ms
+            await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
+            throw new TimeoutError("Fetch was unable to scrape the page before timing out", { cause: { timeout } });
+        })() // NOTE(review): the timer is not cleared and the fetch is not aborted when the other arm wins.
+    ]);
+
+    specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), Object.fromEntries(response.headers as any)); // receives the response headers as a plain object; presumably throws when another engine should handle the content -- confirm in specialtyHandler
+
+    return {
+        url: response.url,
+        html: await response.text(),
+        statusCode: response.status,
+        // TODO: error?
+    };
+}
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts
@ -0,0 +1,107 @@
+import { Logger } from "winston";
+import * as Sentry from "@sentry/node";
+import { z } from "zod";
+
+import { robustFetch } from "../../lib/fetch";
+import { EngineError } from "../../error";
+
+const successSchema = z.object({ // Status payload accepted as a finished job: state "completed" and processing false.
+    jobId: z.string(),
+    state: z.literal("completed"),
+    processing: z.literal(false),
+
+    // timeTaken: z.number(),
+    content: z.string(),
+    url: z.string().optional(),
+
+    pageStatusCode: z.number(),
+    pageError: z.string().optional(),
+
+    // TODO: this needs to be non-optional, might need fixes on f-e side to ensure reliability
+    responseHeaders: z.record(z.string(), z.string()).optional(),
+
+    // timeTakenCookie: z.number().optional(),
+    // timeTakenRequest: z.number().optional(),
+
+    // legacy: playwright only
+    screenshot: z.string().optional(),
+
+    // new: actions
+    screenshots: z.string().array().optional(),
+    actionContent: z.object({
+        url: z.string(),
+        html: z.string(),
+    }).array().optional(),
+})
+
+export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>; // Static type derived from successSchema; this is what fireEngineCheckStatus resolves with.
+
+const processingSchema = z.object({ // Status payload for a job in any of the non-terminal states listed below.
+    jobId: z.string(),
+    state: z.enum(["delayed", "active", "waiting", "waiting-children", "unknown"]),
+    processing: z.boolean(),
+});
+
+const failedSchema = z.object({ // Status payload for a failed job: state "failed", processing false, plus an error string.
+    jobId: z.string(),
+    state: z.literal("failed"),
+    processing: z.literal(false),
+    error: z.string(),
+});
+
+export class StillProcessingError extends Error { // Thrown by fireEngineCheckStatus when the status response matches processingSchema.
+    constructor(jobId: string) {
+        super("Job is still under processing", { cause: { jobId } }) // the job id is carried in `cause`, not in the message
+    }
+}
+
+export async function fireEngineCheckStatus(logger: Logger, jobId: string): Promise<FireEngineCheckStatusSuccess> { // Fetches a job's status once; resolves only for a completed job and throws in every other case.
+    const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; // non-null asserted: no runtime check that the variable is set
+
+    const status = await Sentry.startSpan({ // the request runs inside a Sentry span tagged with the job id
+        name: "fire-engine: Check status",
+        attributes: {
+            jobId,
+        }
+    }, async span => {
+        return await robustFetch(
+            {
+                url: `${fireEngineURL}/scrape/${jobId}`,
+                method: "GET",
+                logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }),
+                headers: {
+                    ...(Sentry.isInitialized() ? ({ // trace headers are sent only when Sentry is initialized
+                        "sentry-trace": Sentry.spanToTraceHeader(span),
+                        "baggage": Sentry.spanToBaggageHeader(span),
+                    }) : {}),
+                },
+            }
+        )
+    });
+
+    const successParse = successSchema.safeParse(status); // all three schemas are tried up front; the branches below take the first match in this order
+    const processingParse = processingSchema.safeParse(status);
+    const failedParse = failedSchema.safeParse(status);
+
+    if (successParse.success) {
+        logger.debug("Scrape succeeded!", { jobId });
+        return successParse.data; // the schema-parsed payload, not the raw response
+    } else if (processingParse.success) {
+        logger.debug("Scrape is still processing", { jobId });
+        throw new StillProcessingError(jobId); // signalled as an exception rather than a return value
+    } else if (failedParse.success) {
+        logger.debug("Scrape job failed", { status, jobId });
+        throw new EngineError("Scrape job failed", { // the raw status is attached as `cause`
+            cause: {
+                status, jobId
+            }
+        });
+    } else {
+        logger.debug("Check status returned response not matched by any schema", { status, jobId });
+        throw new Error("Check status returned response not matched by any schema", { // plain Error: the response matched none of the three schemas
+            cause: {
+                status, jobId
+            }
+        });
+    }
+}
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts
@ -0,0 +1,33 @@
+import { Logger } from "winston";
+import * as Sentry from "@sentry/node";
+
+import { robustFetch } from "../../lib/fetch";
+
+/**
+ * Sends a DELETE for the given scrape job to fire-engine.
+ *
+ * The request is issued with `ignoreResponse` and `ignoreFailure` set and
+ * nothing is returned to the caller.
+ *
+ * @param logger Parent logger; a child tagged with the method name and job ID is handed to the fetch.
+ * @param jobId Identifier of the scrape job to delete.
+ */
+export async function fireEngineDelete(logger: Logger, jobId: string) {
+    const baseURL = process.env.FIRE_ENGINE_BETA_URL!;
+
+    await Sentry.startSpan(
+        { name: "fire-engine: Delete scrape", attributes: { jobId } },
+        async span => {
+            // Trace propagation headers are only attached when Sentry is initialized.
+            const traceHeaders = Sentry.isInitialized()
+                ? {
+                    "sentry-trace": Sentry.spanToTraceHeader(span),
+                    "baggage": Sentry.spanToBaggageHeader(span),
+                }
+                : {};
+
+            await robustFetch({
+                url: `${baseURL}/scrape/${jobId}`,
+                method: "DELETE",
+                headers: { ...traceHeaders },
+                ignoreResponse: true,
+                ignoreFailure: true,
+                logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
+            });
+        },
+    );
+
+    // We do not care whether this fails or not.
+}
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@ -0,0 +1,198 @@
+import { Logger } from "winston";
+import { Meta } from "../..";
+import { fireEngineScrape, FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, FireEngineScrapeRequestPlaywright, FireEngineScrapeRequestTLSClient } from "./scrape";
+import { EngineScrapeResult } from "..";
+import { fireEngineCheckStatus, FireEngineCheckStatusSuccess, StillProcessingError } from "./checkStatus";
+import { EngineError, TimeoutError } from "../../error";
+import * as Sentry from "@sentry/node";
+import { Action } from "../../../../lib/entities";
+import { specialtyScrapeCheck } from "../utils/specialtyHandler";
+
+// Default polling budget, in milliseconds, for a single fire-engine scrape.
+const defaultTimeout = 20000;
+
+/**
+ * Submits a scrape request to fire-engine and polls its status until the job
+ * succeeds, fails, times out, or too many unexpected errors have piled up.
+ *
+ * This function does not take `Meta` on purpose. It may not access any
+ * meta values to construct the request -- that must be done by the
+ * `scrapeURLWithFireEngine*` functions.
+ *
+ * @param logger Logger used for progress output; children of it are handed to the helpers.
+ * @param request Fully built fire-engine request.
+ * @param timeout Polling budget in milliseconds, measured from just after the job was submitted.
+ * @returns The successful status payload.
+ * @throws EngineError re-thrown as soon as a status check reports one.
+ * @throws TimeoutError when the polling budget is exhausted.
+ * @throws Error when 3 unexpected status-check errors have been collected.
+ */
+async function performFireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient>(
+    logger: Logger,
+    request: FireEngineScrapeRequestCommon & Engine,
+    timeout = defaultTimeout,
+): Promise<FireEngineCheckStatusSuccess> {
+    const scrape = await fireEngineScrape(logger.child({ method: "fireEngineScrape" }), request);
+
+    const startTime = Date.now();
+    const errorLimit = 3;
+    const errors: unknown[] = [];
+
+    while (true) {
+        if (errors.length >= errorLimit) {
+            logger.error("Error limit hit.", { errors });
+            throw new Error("Error limit hit. See e.cause.errors for errors.", { cause: { errors } });
+        }
+
+        if (Date.now() - startTime > timeout) {
+            logger.info("Fire-engine was unable to scrape the page before timing out.", { errors, timeout });
+            throw new TimeoutError("Fire-engine was unable to scrape the page before timing out", { cause: { errors, timeout } });
+        }
+
+        try {
+            // Return as soon as the job is done instead of sleeping one more
+            // polling interval. `return await` keeps rejections inside this try.
+            return await fireEngineCheckStatus(logger.child({ method: "fireEngineCheckStatus" }), scrape.jobId);
+        } catch (error) {
+            if (error instanceof StillProcessingError) {
+                logger.debug("Scrape is still processing...");
+            } else if (error instanceof EngineError) {
+                logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId });
+                throw error;
+            } else {
+                // Unexpected failure: report it and count it towards the error limit.
+                Sentry.captureException(error);
+                errors.push(error);
+                logger.debug(`An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`, { error, jobId: scrape.jobId });
+            }
+        }
+
+        // Pause between polls.
+        await new Promise((resolve) => setTimeout(resolve, 250));
+    }
+}
+
+/**
+ * Scrapes `meta.url` through fire-engine's chrome-cdp engine.
+ *
+ * The `waitFor` option and the screenshot formats are not sent as dedicated
+ * request fields; they are converted into leading actions, and the first
+ * screenshot produced by the actions is moved back into the `screenshot` field.
+ *
+ * @param meta Scrape context (URL, options, internal options, logger).
+ * @returns The engine result; `actions` is only present when at least one action was sent.
+ */
+export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<EngineScrapeResult> {
+    const { formats, waitFor } = meta.options;
+    const wantsFullPage = formats.includes("screenshot@fullPage");
+    const wantsScreenshot = formats.includes("screenshot") || wantsFullPage;
+
+    const actions: Action[] = [];
+
+    // Transform waitFor option into an action (unsupported by chrome-cdp)
+    if (waitFor !== 0) {
+        actions.push({ type: "wait", milliseconds: waitFor });
+    }
+
+    // Transform screenshot format into an action (unsupported by chrome-cdp)
+    if (wantsScreenshot) {
+        actions.push({ type: "screenshot", fullPage: wantsFullPage });
+    }
+
+    // Include specified actions
+    actions.push(...(meta.options.actions ?? []));
+
+    const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = {
+        url: meta.url,
+        engine: "chrome-cdp",
+        instantReturn: true,
+        skipTlsVerification: meta.options.skipTlsVerification,
+        headers: meta.options.headers,
+        // The `actions` key is left out entirely when there is nothing to run.
+        ...(actions.length > 0 ? { actions } : {}),
+        priority: meta.internalOptions.priority,
+        geolocation: meta.options.geolocation,
+        mobile: meta.options.mobile,
+        // TODO: scrollXPaths
+    };
+
+    const response = await performFireEngineScrape(
+        meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
+        request,
+    );
+
+    specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);
+
+    if (wantsScreenshot) {
+        // The injected screenshot action comes before the user's actions, so its
+        // result is the first entry; move it out of the list into `screenshot`.
+        meta.logger.debug("Transforming screenshots from actions into screenshot field", { screenshots: response.screenshots });
+        response.screenshot = response.screenshots?.[0];
+        response.screenshots?.shift();
+        meta.logger.debug("Screenshot transformation done", { screenshots: response.screenshots, screenshot: response.screenshot });
+    }
+
+    if (!response.url) {
+        meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
+    }
+
+    const actionResults = actions.length > 0
+        ? {
+            actions: {
+                screenshots: response.screenshots ?? [],
+                scrapes: response.actionContent ?? [],
+            },
+        }
+        : {};
+
+    return {
+        url: response.url ?? meta.url,
+
+        html: response.content,
+        error: response.pageError,
+        statusCode: response.pageStatusCode,
+
+        screenshot: response.screenshot,
+        ...actionResults,
+    };
+}
+
+/**
+ * Scrapes `meta.url` through fire-engine's playwright engine.
+ *
+ * Screenshot formats and `waitFor` are passed as dedicated request fields.
+ *
+ * @param meta Scrape context (URL, options, internal options, logger).
+ * @returns The engine result; `screenshot` is only present when fire-engine returned at least one.
+ */
+export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<EngineScrapeResult> {
+    const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = {
+        url: meta.url,
+        engine: "playwright",
+        instantReturn: true,
+
+        headers: meta.options.headers,
+        priority: meta.internalOptions.priority,
+        screenshot: meta.options.formats.includes("screenshot"),
+        fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
+        wait: meta.options.waitFor,
+        geolocation: meta.options.geolocation,
+    };
+
+    // The logger label names this function (it previously carried the chrome-cdp
+    // label, which mis-attributed these log lines).
+    const response = await performFireEngineScrape(
+        meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/callFireEngine", request }),
+        request,
+    );
+
+    specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);
+
+    if (!response.url) {
+        meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
+    }
+
+    return {
+        url: response.url ?? meta.url,
+
+        html: response.content,
+        error: response.pageError,
+        statusCode: response.pageStatusCode,
+
+        ...(response.screenshots !== undefined && response.screenshots.length > 0 ? ({
+            screenshot: response.screenshots[0],
+        }) : {}),
+    };
+}
+
+/**
+ * Scrapes `meta.url` through fire-engine's tlsclient engine.
+ *
+ * @param meta Scrape context (URL, options, internal options, logger).
+ * @returns The engine result (URL, HTML, page error and status code).
+ */
+export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<EngineScrapeResult> {
+    const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = {
+        url: meta.url,
+        engine: "tlsclient",
+        instantReturn: true,
+
+        headers: meta.options.headers,
+        priority: meta.internalOptions.priority,
+
+        atsv: meta.internalOptions.atsv,
+        geolocation: meta.options.geolocation,
+        disableJsDom: meta.internalOptions.v0DisableJsDom,
+    };
+
+    // The logger label names this function (it previously carried the chrome-cdp
+    // label, which mis-attributed these log lines).
+    const response = await performFireEngineScrape(
+        meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/callFireEngine", request }),
+        request,
+    );
+
+    specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);
+
+    if (!response.url) {
+        meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
+    }
+
+    return {
+        url: response.url ?? meta.url,
+
+        html: response.content,
+        error: response.pageError,
+        statusCode: response.pageStatusCode,
+    };
+}
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts
@ -0,0 +1,94 @@
+import { Logger } from "winston";
+import * as Sentry from "@sentry/node";
+import { z } from "zod";
+
+import { Action } from "../../../../lib/entities";
+import { robustFetch } from "../../lib/fetch";
+
+/**
+ * Request fields shared by every fire-engine engine. It is intersected with
+ * one of the engine-specific request types below to form the body that
+ * `fireEngineScrape` sends.
+ */
+export type FireEngineScrapeRequestCommon = {
+    url: string;
+    
+    headers?: { [K: string]: string };
+
+    blockMedia?: boolean; // default: true
+    blockAds?: boolean; // default: true
+    // pageOptions?: any; // unused, .scrollXPaths is considered on FE side
+
+    // useProxy?: boolean; // unused, default: true
+    // customProxy?: string; // unused
+
+    // disableSmartWaitCache?: boolean; // unused, default: false
+    // skipDnsCheck?: boolean; // unused, default: false
+
+    priority?: number; // default: 1
+    // team_id?: string; // unused
+    logRequest?: boolean; // default: true
+    instantReturn?: boolean; // default: false
+    geolocation?: { country?: string; languages?: string[]; };
+}
+
+/** Request fields specific to the "chrome-cdp" engine. */
+export type FireEngineScrapeRequestChromeCDP = {
+    engine: "chrome-cdp";
+    skipTlsVerification?: boolean;
+    actions?: Action[];
+    blockMedia?: true; // cannot be false
+    mobile?: boolean;
+};
+
+/** Request fields specific to the "playwright" engine. */
+export type FireEngineScrapeRequestPlaywright = {
+    engine: "playwright";
+    blockAds?: boolean; // default: true
+
+    // mutually exclusive, default: false
+    screenshot?: boolean;
+    fullPageScreenshot?: boolean;
+
+    wait?: number; // default: 0
+};
+
+/** Request fields specific to the "tlsclient" engine. */
+export type FireEngineScrapeRequestTLSClient = {
+    engine: "tlsclient";
+    atsv?: boolean; // v0 only, default: false
+    disableJsDom?: boolean; // v0 only, default: false
+    // blockAds?: boolean; // default: true
+};
+
+// Shape the response to the scrape submission is validated against.
+const schema = z.object({
+    jobId: z.string(),
+    processing: z.boolean(),
+});
+
+/**
+ * Submits a scrape request to fire-engine (POST `/scrape`) inside a Sentry span.
+ *
+ * @param logger Parent logger; a child of it is handed to the fetch.
+ * @param request Common request fields combined with the engine-specific ones; sent as the body.
+ * @returns The response validated against `schema` (`jobId`, `processing`).
+ */
+export async function fireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient> (
+    logger: Logger,
+    request: FireEngineScrapeRequestCommon & Engine,
+): Promise<z.infer<typeof schema>> {
+    const baseURL = process.env.FIRE_ENGINE_BETA_URL!;
+
+    // TODO: retries
+    // NOTE(review): `tryCount: 3` below may already cover this -- confirm against robustFetch.
+
+    return await Sentry.startSpan(
+        { name: "fire-engine: Scrape", attributes: { url: request.url } },
+        async span => {
+            // Trace propagation headers are only attached when Sentry is initialized.
+            const traceHeaders = Sentry.isInitialized()
+                ? {
+                    "sentry-trace": Sentry.spanToTraceHeader(span),
+                    "baggage": Sentry.spanToBaggageHeader(span),
+                }
+                : {};
+
+            return await robustFetch({
+                url: `${baseURL}/scrape`,
+                method: "POST",
+                headers: { ...traceHeaders },
+                body: request,
+                logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
+                schema,
+                tryCount: 3,
+            });
+        },
+    );
+}
--- a/apps/api/src/scraper/scrapeURL/engines/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/index.ts
@ -0,0 +1,295 @@
+import { ScrapeActionContent } from "../../../lib/entities";
+import { Meta } from "..";
+import { scrapeDOCX } from "./docx";
+import { scrapeURLWithFireEngineChromeCDP, scrapeURLWithFireEnginePlaywright, scrapeURLWithFireEngineTLSClient } from "./fire-engine";
+import { scrapePDF } from "./pdf";
+import { scrapeURLWithScrapingBee } from "./scrapingbee";
+import { scrapeURLWithFetch } from "./fetch";
+import { scrapeURLWithPlaywright } from "./playwright";
+
+/** Identifier of every scrape engine this module knows how to dispatch to. */
+export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
+
+// An optional backend counts as enabled when its environment variable is set to a non-empty value.
+const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
+const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
+const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
+
+/**
+ * Engines available in this process: the enabled optional backends followed by
+ * the always-present "fetch", "pdf" and "docx" engines. Evaluated once at module load.
+ */
+export const engines: Engine[] = [
+    ...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
+    ...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
+    ...(usePlaywright ? [ "playwright" as const ] : []),
+    "fetch",
+    "pdf",
+    "docx",
+];
+
+/** Names of the features a scrape may require from an engine. */
+export const featureFlags = [
+    "actions",
+    "waitFor",
+    "screenshot",
+    "screenshot@fullScreen",
+    "pdf",
+    "docx",
+    "atsv",
+    "location",
+    "mobile",
+    "skipTlsVerification",
+    "useFastMode",
+] as const;
+
+/** Union of the string literals listed in `featureFlags`. */
+export type FeatureFlag = typeof featureFlags[number];
+
+/**
+ * Per-flag weighting. `buildFallbackList` sums the priorities of the requested
+ * flags and only keeps engines whose supported flags add up to at least half
+ * of that sum (rounded down).
+ */
+export const featureFlagOptions: {
+    [F in FeatureFlag]: {
+        priority: number;
+    }
+} = {
+    "actions": { priority: 20 },
+    "waitFor": { priority: 1 },
+    "screenshot": { priority: 10 },
+    "screenshot@fullScreen": { priority: 10 },
+    "pdf": { priority: 100 },
+    "docx": { priority: 100 },
+    "atsv": { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
+    "useFastMode": { priority: 90 },
+    "location": { priority: 10 },
+    "mobile": { priority: 10 },
+    "skipTlsVerification": { priority: 10 },
+} as const;
+
+/** Result shape every engine handler resolves to. */
+export type EngineScrapeResult = {
+    url: string;
+
+    html: string;
+    markdown?: string;
+    statusCode: number;
+    error?: string;
+
+    screenshot?: string;
+    // Outputs collected from actions, when the engine ran any.
+    actions?: {
+        screenshots: string[];
+        scrapes: ScrapeActionContent[];
+    };
+}
+
+// Dispatch table from engine identifier to the function that performs the scrape.
+// The two ScrapingBee entries share one factory and differ only in the
+// `wait_browser` value passed to it.
+const engineHandlers: {
+    [E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
+} = {
+    "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
+    "fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
+    "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
+    "scrapingbee": scrapeURLWithScrapingBee("domcontentloaded"),
+    "scrapingbeeLoad": scrapeURLWithScrapingBee("networkidle2"),
+    "playwright": scrapeURLWithPlaywright,
+    "fetch": scrapeURLWithFetch,
+    "pdf": scrapePDF,
+    "docx": scrapeDOCX,
+};
+
+// Baseline with every feature switched off. Each engine entry below spreads it
+// and then turns on only the features that engine supports.
+const noFeatures: { [F in FeatureFlag]: boolean } = {
+    "actions": false,
+    "waitFor": false,
+    "screenshot": false,
+    "screenshot@fullScreen": false,
+    "pdf": false,
+    "docx": false,
+    "atsv": false,
+    "location": false,
+    "mobile": false,
+    "skipTlsVerification": false,
+    "useFastMode": false,
+};
+
+/**
+ * Capabilities and general ranking of every engine, consulted by
+ * `buildFallbackList` when choosing which engines to try and in what order.
+ */
+export const engineOptions: {
+    [E in Engine]: {
+        // A list of feature flags the engine supports.
+        features: { [F in FeatureFlag]: boolean },
+
+        // This defines the order of engines in general. The engine with the highest quality will be used the most.
+        // Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
+        quality: number,
+    }
+} = {
+    "fire-engine;chrome-cdp": {
+        features: {
+            ...noFeatures,
+            "actions": true,
+            "waitFor": true, // through actions transform
+            "screenshot": true, // through actions transform
+            "screenshot@fullScreen": true, // through actions transform
+            "location": true,
+            "mobile": true,
+            "skipTlsVerification": true,
+        },
+        quality: 50,
+    },
+    "fire-engine;playwright": {
+        features: {
+            ...noFeatures,
+            "waitFor": true,
+            "screenshot": true,
+            "screenshot@fullScreen": true,
+        },
+        quality: 40,
+    },
+    "scrapingbee": {
+        features: {
+            ...noFeatures,
+            "waitFor": true,
+            "screenshot": true,
+            "screenshot@fullScreen": true,
+        },
+        quality: 30,
+    },
+    "scrapingbeeLoad": {
+        features: {
+            ...noFeatures,
+            "waitFor": true,
+            "screenshot": true,
+            "screenshot@fullScreen": true,
+        },
+        quality: 29,
+    },
+    "playwright": {
+        features: {
+            ...noFeatures,
+            "waitFor": true,
+        },
+        quality: 20,
+    },
+    "fire-engine;tlsclient": {
+        features: {
+            ...noFeatures,
+            "atsv": true,
+            "location": true,
+            "useFastMode": true,
+        },
+        quality: 10,
+    },
+    "fetch": {
+        features: {
+            ...noFeatures,
+            "useFastMode": true,
+        },
+        quality: 5,
+    },
+    "pdf": {
+        features: {
+            ...noFeatures,
+            "pdf": true,
+            "useFastMode": true,
+        },
+        quality: -10,
+    },
+    "docx": {
+        features: {
+            ...noFeatures,
+            "docx": true,
+            "useFastMode": true,
+        },
+        quality: -10,
+    },
+};
+
+/**
+ * Builds the ordered list of engines to try for a scrape.
+ *
+ * An engine qualifies when the summed priority of the requested feature flags
+ * it supports reaches half (rounded down) of the summed priority of all
+ * requested flags. If any qualifying engine has a positive quality, engines
+ * with non-positive quality are dropped. The remainder is ordered by support
+ * score, then by quality, both descending.
+ *
+ * @param meta Scrape context; reads `featureFlags`, `internalOptions.forceEngine` and `logger`.
+ * @returns Qualifying engines in the order they should be attempted, each with
+ *          the requested flags it does not support.
+ */
+export function buildFallbackList(meta: Meta): {
+    engine: Engine,
+    unsupportedFeatures: Set<FeatureFlag>,
+}[] {
+    const requestedFlags = [...meta.featureFlags];
+
+    let prioritySum = 0;
+    for (const flag of requestedFlags) {
+        prioritySum += featureFlagOptions[flag].priority;
+    }
+    const priorityThreshold = Math.floor(prioritySum / 2);
+
+    // A forced engine replaces the whole candidate list.
+    const forced = meta.internalOptions.forceEngine;
+    const candidates = forced !== undefined ? [forced] : engines;
+
+    let selectedEngines: {
+        engine: Engine,
+        supportScore: number,
+        unsupportedFeatures: Set<FeatureFlag>,
+    }[] = [];
+
+    for (const engine of candidates) {
+        const { features } = engineOptions[engine];
+
+        // One pass over the requested flags: supported ones add to the score,
+        // the rest are recorded as unsupported.
+        let supportScore = 0;
+        const unsupportedFeatures = new Set<FeatureFlag>();
+        for (const flag of requestedFlags) {
+            if (features[flag] === true) {
+                supportScore += featureFlagOptions[flag].priority;
+            } else {
+                unsupportedFeatures.add(flag);
+            }
+        }
+
+        if (supportScore < priorityThreshold) {
+            meta.logger.debug(`Engine ${engine} does not meet feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures});
+            continue;
+        }
+
+        selectedEngines.push({ engine, supportScore, unsupportedFeatures });
+        meta.logger.debug(`Engine ${engine} meets feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures });
+    }
+
+    // Keep non-positive-quality (specialty) engines only when nothing else qualified.
+    const positiveQuality = selectedEngines.filter(x => engineOptions[x.engine].quality > 0);
+    if (positiveQuality.length > 0) {
+        selectedEngines = positiveQuality;
+    }
+
+    selectedEngines.sort((a, b) => {
+        const byScore = b.supportScore - a.supportScore;
+        return byScore !== 0 ? byScore : engineOptions[b.engine].quality - engineOptions[a.engine].quality;
+    });
+
+    return selectedEngines;
+}
+
+/**
+ * Runs the handler registered for `engine`, giving it a copy of `meta` whose
+ * logger is tagged with the handler's name and the engine identifier.
+ *
+ * @param meta Scrape context.
+ * @param engine Engine whose handler should perform the scrape.
+ * @returns Whatever the engine handler resolves to.
+ */
+export async function scrapeURLWithEngine(meta: Meta, engine: Engine): Promise<EngineScrapeResult> {
+    const fn = engineHandlers[engine];
+    // `Function.name` is "" (never null or undefined) for anonymous functions such as
+    // the closures returned by `scrapeURLWithScrapingBee`, so `||` is required for the
+    // fallback label to ever apply.
+    const logger = meta.logger.child({ method: fn.name || "scrapeURLWithEngine", engine });
+    const _meta = {
+        ...meta,
+        logger,
+    };
+
+    return await fn(_meta);
+}
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@ -0,0 +1,114 @@
+import { createReadStream, promises as fs } from "node:fs";
+import FormData from "form-data";
+import { Meta } from "../..";
+import { EngineScrapeResult } from "..";
+import * as marked from "marked";
+import { robustFetch } from "../../lib/fetch";
+import { z } from "zod";
+import * as Sentry from "@sentry/node";
+import escapeHtml from "escape-html";
+import PdfParse from "pdf-parse";
+import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
+
+/** What a PDF processor hands back: always HTML, optionally markdown. */
+type PDFProcessorResult = {html: string, markdown?: string};
+
+/**
+ * Converts a downloaded PDF via the LlamaIndex cloud parsing API: uploads the
+ * file, then requests the job's markdown result and renders it to HTML.
+ *
+ * @param meta Scrape context; only its logger is used here.
+ * @param tempFilePath Path of the PDF on local disk.
+ * @returns The markdown returned by the API and its HTML rendering.
+ */
+async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
+    meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });
+
+    const form = new FormData();
+    form.append("file", createReadStream(tempFilePath), {
+        filename: tempFilePath,
+        contentType: "application/pdf", // NOTE: request.headers["Content-Type"]?
+    });
+
+    const { id: jobId } = await robustFetch({
+        url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
+        method: "POST",
+        headers: {
+            "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
+        },
+        body: form,
+        logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/upload/robustFetch" }),
+        schema: z.object({
+            id: z.string(),
+        }),
+    });
+
+    // TODO: timeout, retries
+    const { markdown } = await robustFetch({
+        url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
+        method: "GET",
+        headers: {
+            "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
+        },
+        logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
+        schema: z.object({
+            markdown: z.string(),
+        }),
+    });
+
+    const html = await marked.parse(markdown, { async: true });
+
+    return { markdown, html };
+}
+
+/**
+ * Extracts the text of a downloaded PDF locally with `PdfParse` and
+ * HTML-escapes it. The same escaped text is returned as markdown and as HTML.
+ *
+ * @param meta Scrape context; only its logger is used here.
+ * @param tempFilePath Path of the PDF on local disk.
+ * @returns The escaped text under both `markdown` and `html`.
+ */
+async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
+    meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });
+
+    const fileContents = await fs.readFile(tempFilePath);
+    const parsed = await PdfParse(fileContents);
+    const safeText = escapeHtml(parsed.text);
+
+    return { markdown: safeText, html: safeText };
+}
+
+/**
+ * Engine handler for PDF documents.
+ *
+ * When `meta.options.parsePDF` is falsy, the file is fetched and returned
+ * base64-encoded in both `html` and `markdown`. Otherwise the file is
+ * downloaded to a temp file and parsed: with LlamaParse when
+ * `LLAMAPARSE_API_KEY` is set, falling back to the local parser when
+ * LlamaParse is unavailable or fails.
+ *
+ * @param meta Scrape context (ID, URL, options, logger).
+ * @returns The final URL and status of the download plus the extracted content.
+ * @throws Whatever the download or the fallback parser throws.
+ */
+export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
+    if (!meta.options.parsePDF) {
+        const file = await fetchFileToBuffer(meta.url);
+        const content = file.buffer.toString("base64");
+        return {
+            url: file.response.url,
+            statusCode: file.response.status,
+
+            html: content,
+            markdown: content,
+        };
+    }
+
+    const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
+
+    try {
+        let result: PDFProcessorResult | null = null;
+        if (process.env.LLAMAPARSE_API_KEY) {
+            try {
+                result = await scrapePDFWithLlamaParse({
+                    ...meta,
+                    logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
+                }, tempFilePath);
+            } catch (error) {
+                meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
+                Sentry.captureException(error);
+            }
+        }
+
+        if (result === null) {
+            result = await scrapePDFWithParsePDF({
+                ...meta,
+                logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }),
+            }, tempFilePath);
+        }
+
+        return {
+            url: response.url,
+            statusCode: response.status,
+
+            html: result.html,
+            markdown: result.markdown,
+        };
+    } finally {
+        // Remove the temp file even when the fallback parser throws, so that
+        // failed scrapes do not leave downloads behind in the temp directory.
+        await fs.unlink(tempFilePath);
+    }
+}
--- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
@ -0,0 +1,42 @@
+import { z } from "zod";
+import { EngineScrapeResult } from "..";
+import { Meta } from "../..";
+import { TimeoutError } from "../../error";
+import { robustFetch } from "../../lib/fetch";
+
+/**
+ * Scrapes `meta.url` through the Playwright microservice configured in
+ * `PLAYWRIGHT_MICROSERVICE_URL`, giving up after 20 seconds plus `waitFor`.
+ *
+ * @param meta Scrape context (URL, options, logger).
+ * @returns The page content, status code and page error reported by the service.
+ * @throws TimeoutError when the service does not answer within the deadline.
+ */
+export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeResult> {
+    const timeout = 20000 + meta.options.waitFor;
+
+    // Local deadline matching the timeout sent to the service. The handle is
+    // kept so the timer can be cancelled once the race is decided.
+    let timer: ReturnType<typeof setTimeout> | undefined;
+    const deadline = new Promise<never>((_resolve, reject) => {
+        timer = setTimeout(() => {
+            reject(new TimeoutError("Playwright was unable to scrape the page before timing out", { cause: { timeout } }));
+        }, timeout);
+    });
+
+    try {
+        // The fetch promise is passed to the race un-awaited; awaiting it inside
+        // the array would let it finish before the deadline was even created.
+        const response = await Promise.race([
+            robustFetch({
+                url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
+                headers: {
+                    "Content-Type": "application/json",
+                },
+                // NOTE(review): other robustFetch callers in this module pass a plain
+                // object as `body` -- confirm a pre-stringified body is not encoded twice.
+                body: JSON.stringify({
+                    url: meta.url,
+                    wait_after_load: meta.options.waitFor,
+                    timeout,
+                    headers: meta.options.headers,
+                }),
+                method: "POST",
+                logger: meta.logger.child({ method: "scrapeURLWithPlaywright/robustFetch" }),
+                schema: z.object({
+                    content: z.string(),
+                    pageStatusCode: z.number(),
+                    pageError: z.string().optional(),
+                }),
+            }),
+            deadline,
+        ]);
+
+        return {
+            url: meta.url, // TODO: impove redirect following
+            html: response.content,
+            statusCode: response.pageStatusCode,
+            error: response.pageError,
+        };
+    } finally {
+        clearTimeout(timer);
+    }
+}
--- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
@ -0,0 +1,66 @@
+import { ScrapingBeeClient } from "scrapingbee";
+import { Meta } from "../..";
+import { EngineScrapeResult } from "..";
+import { specialtyScrapeCheck } from "../utils/specialtyHandler";
+import { AxiosError, type AxiosResponse } from "axios";
+import { EngineError } from "../../error";
+
+// Shared ScrapingBee client, created once at module load from SCRAPING_BEE_API_KEY.
+const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
+
+/**
+ * Creates a ScrapingBee engine handler bound to one `wait_browser` mode.
+ *
+ * @param wait_browser Value forwarded as the `wait_browser` request parameter.
+ * @returns A handler that scrapes `meta.url` through ScrapingBee. It throws
+ *          `EngineError` when the decoded response carries an error, has none of
+ *          the Date / Content-Type headers, or has a non-string body.
+ */
+export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "networkidle2"): ((meta: Meta) => Promise<EngineScrapeResult>) {
+    return async (meta: Meta): Promise<EngineScrapeResult> => {
+        const { formats, waitFor } = meta.options;
+
+        let response: AxiosResponse<any>;
+        try {
+            response = await client.get({
+                url: meta.url,
+                params: {
+                    timeout: 15000, // TODO: dynamic timeout based on request timeout
+                    wait_browser,
+                    wait: Math.min(waitFor, 35000),
+                    transparent_status_code: true,
+                    json_response: true,
+                    screenshot: formats.includes("screenshot"),
+                    screenshot_full_page: formats.includes("screenshot@fullPage"),
+                },
+                headers: {
+                    "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
+                },
+            });
+        } catch (error) {
+            // An HTTP error that still carries a response is processed like any
+            // other response; everything else propagates.
+            if (!(error instanceof AxiosError) || error.response === undefined) {
+                throw error;
+            }
+            response = error.response;
+        }
+
+        const raw: Buffer = response.data;
+        const body = JSON.parse(new TextDecoder().decode(raw));
+        const statusCode = response.status;
+
+        // A response with none of these headers is treated as an engine error.
+        const headers = body.headers ?? {};
+        const isHiddenEngineError = !(headers["Date"] ?? headers["date"] ?? headers["Content-Type"] ?? headers["content-type"]);
+
+        if (body.errors || body.body?.error || isHiddenEngineError) {
+            meta.logger.error("ScrapingBee threw an error", { body: body.body?.error ?? body.errors ?? body.body ?? body });
+            throw new EngineError("Engine error #34", { cause: { body, statusCode } });
+        }
+
+        if (typeof body.body !== "string") {
+            meta.logger.error("ScrapingBee: Body is not string??", { body });
+            throw new EngineError("Engine error #35", { cause: { body, statusCode } });
+        }
+
+        specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithScrapingBee/specialtyScrapeCheck" }), body.headers);
+
+        const screenshotField = body.screenshot
+            ? { screenshot: `data:image/png;base64,${body.screenshot}` }
+            : {};
+
+        return {
+            url: body["resolved-url"] ?? meta.url,
+
+            html: body.body,
+            error: statusCode >= 300 ? response.statusText : undefined,
+            statusCode,
+            ...screenshotField,
+        };
+    };
+}
--- a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
@ -0,0 +1,45 @@
+import path from "path";
+import os from "os";
+import { createWriteStream, promises as fs } from "node:fs";
+import { EngineError } from "../../error";
+import { Writable } from "stream";
+import { v4 as uuid } from "uuid";
+
+/**
+ * Downloads `url` with the global `fetch` and reads the entire response body into memory.
+ *
+ * The HTTP status is not inspected here; the `Response` is handed back so the caller can check it.
+ *
+ * @param url - Address of the resource to download.
+ * @returns The `fetch` response and its full body as a `Buffer`.
+ */
+export async function fetchFileToBuffer(url: string): Promise<{
+    response: Response,
+    buffer: Buffer
+}> {
+    // TODO: maybe we could use tlsclient for this? for proxying
+    const response = await fetch(url);
+
+    // Drain the whole body first, then wrap the bytes; nothing is streamed to disk here.
+    const rawBytes = await response.arrayBuffer();
+    const buffer = Buffer.from(rawBytes);
+
+    return { response, buffer };
+}
+
+/**
+ * Streams the response body of `url` into a uniquely named file in the OS temp directory.
+ *
+ * The temp file is only opened once the request has produced a response with a body, so a
+ * failed request or a body-less response leaves no file or open handle behind. If streaming
+ * fails part-way, the partially written file is removed before the error is reported.
+ * The HTTP status is not inspected here, and the file is not deleted on success.
+ *
+ * @param id - Identifier embedded in the temp file name (a random UUID is appended as well).
+ * @param url - Address of the file to download.
+ * @returns The `fetch` response and the path of the written temp file.
+ * @throws EngineError when the response has no body, or when the body could not be written to the temp file.
+ */
+export async function downloadFile(id: string, url: string): Promise<{
+    response: Response
+    tempFilePath: string
+}> {
+    const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
+
+    const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
+
+    // This should never happen in the current state of JS (2024), but let's check anyways.
+    if (response.body === null) {
+        throw new EngineError("Response body was null", { cause: { response } });
+    }
+
+    // Open the file only after the checks above, so the early exits cannot leak a handle or an empty file.
+    const tempFileWrite = createWriteStream(tempFilePath);
+
+    try {
+        // pipeTo() settles once the source is drained and the destination has been closed,
+        // and rejects if either side errors; awaiting it avoids an unhandled rejection.
+        await response.body.pipeTo(Writable.toWeb(tempFileWrite));
+    } catch (error) {
+        tempFileWrite.destroy();
+        // Best-effort cleanup of the partial file; the original failure is what gets reported.
+        await fs.rm(tempFilePath, { force: true }).catch(() => undefined);
+        throw new EngineError("Failed to write to temp file", { cause: { error } });
+    }
+
+    return {
+        response,
+        tempFilePath,
+    };
+}
--- a/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
@ -0,0 +1,14 @@
+import { Logger } from "winston";
+import { AddFeatureError } from "../../error";
+
+/**
+ * Inspects response headers for a content type that needs a dedicated feature flag.
+ *
+ * The `Content-Type` header is looked up case-insensitively by name. Its media type (the part
+ * before any `;` parameters) is compared case-insensitively and with surrounding whitespace
+ * ignored, so values such as `Application/PDF` or `application/pdf ; charset=binary` match.
+ *
+ * @param logger - Logger that receives a warning when no `Content-Type` header is present.
+ * @param headers - Response headers to inspect; may be `undefined`.
+ * @throws AddFeatureError with `["pdf"]` for PDF responses and `["docx"]` for DOCX responses.
+ */
+export function specialtyScrapeCheck(logger: Logger, headers: Record<string, string> | undefined) {
+    const contentTypeEntry = Object.entries(headers ?? {}).find(([name]) => name.toLowerCase() === "content-type");
+    const contentType = contentTypeEntry?.[1];
+
+    if (contentType === undefined) {
+        logger.warn("Failed to check contentType -- was not present in headers", { headers });
+        return;
+    }
+
+    // Drop parameters such as "; charset=..." and normalise: media types are case-insensitive.
+    const [rawMediaType = ""] = contentType.split(";");
+    const mediaType = rawMediaType.trim().toLowerCase();
+
+    if (mediaType === "application/pdf") { // .pdf
+        throw new AddFeatureError(["pdf"]);
+    } else if (mediaType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") { // .docx
+        throw new AddFeatureError(["docx"]);
+    }
+}
--- a/apps/api/src/scraper/scrapeURL/error.ts
+++ b/apps/api/src/scraper/scrapeURL/error.ts
@ -0,0 +1,34 @@
+import { EngineResultsTracker } from "."
+import { Engine, FeatureFlag } from "./engines"
+
+/**
+ * Error raised when a scraping engine (or one of its helpers) fails.
+ *
+ * `name` is set explicitly so logs and stack traces show "EngineError" instead of the
+ * generic "Error" inherited from the base class.
+ */
+export class EngineError extends Error {
+    /**
+     * @param message - Human-readable description of the failure.
+     * @param options - Standard `Error` options; `cause` carries the underlying details.
+     */
+    constructor(message?: string, options?: ErrorOptions) {
+        super(message, options);
+        this.name = "EngineError";
+    }
+}
+
+/**
+ * Error type used to signal a timeout.
+ *
+ * `name` is set explicitly so logs and stack traces show "TimeoutError" instead of the
+ * generic "Error" inherited from the base class.
+ */
+export class TimeoutError extends Error {
+    /**
+     * @param message - Human-readable description of what timed out.
+     * @param options - Standard `Error` options; `cause` carries the underlying details.
+     */
+    constructor(message?: string, options?: ErrorOptions) {
+        super(message, options);
+        this.name = "TimeoutError";
+    }
+}
+
+/**
+ * Error carrying the message "All scraping engines failed!", together with the engine list
+ * and results tracker supplied by the thrower for diagnostics.
+ *
+ * `name` is set explicitly so logs and stack traces show "NoEnginesLeftError" instead of the
+ * generic "Error" inherited from the base class.
+ */
+export class NoEnginesLeftError extends Error {
+    /** Engine list handed to the constructor. */
+    public fallbackList: Engine[];
+    /** Results tracker handed to the constructor. */
+    public results: EngineResultsTracker;
+
+    /**
+     * @param fallbackList - Engine list to attach to the error.
+     * @param results - Results tracker to attach to the error.
+     */
+    constructor(fallbackList: Engine[], results: EngineResultsTracker) {
+        super("All scraping engines failed!");
+        this.name = "NoEnginesLeftError";
+        this.fallbackList = fallbackList;
+        this.results = results;
+    }
+}
+
+/**
+ * Error announcing that additional feature flags were discovered; the flags are listed in
+ * the message and exposed on `featureFlags`.
+ *
+ * `name` is set explicitly so logs and stack traces show "AddFeatureError" instead of the
+ * generic "Error" inherited from the base class.
+ */
+export class AddFeatureError extends Error {
+    /** Feature flags handed to the constructor. */
+    public featureFlags: FeatureFlag[];
+
+    /**
+     * @param featureFlags - Newly discovered feature flags; joined with ", " into the message.
+     */
+    constructor(featureFlags: FeatureFlag[]) {
+        super("New feature flags have been discovered: " + featureFlags.join(", "));
+        this.name = "AddFeatureError";
+        this.featureFlags = featureFlags;
+    }
+}
--- a/Show More
+++ b/Show More