mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
WebScraper
refactor into scrapeURL
(#714)
* feat: use strictNullChecking * feat: switch logger to Winston * feat(scrapeURL): first batch * fix(scrapeURL): error swallow * fix(scrapeURL): add timeout to EngineResultsTracker * fix(scrapeURL): report unexpected error to sentry * chore: remove unused modules * feat(transfomers/coerce): warn when a format's response is missing * feat(scrapeURL): feature flag priorities, engine quality sorting, PDF and DOCX support * (add note) * feat(scrapeURL): wip readme * feat(scrapeURL): LLM extract * feat(scrapeURL): better warnings * fix(scrapeURL/engines/fire-engine;playwright): fix screenshot * feat(scrapeURL): add forceEngine internal option * feat(scrapeURL/engines): scrapingbee * feat(scrapeURL/transformars): uploadScreenshot * feat(scrapeURL): more intense tests * bunch of stuff * get rid of WebScraper (mostly) * adapt batch scrape * add staging deploy workflow * fix yaml * fix logger issues * fix v1 test schema * feat(scrapeURL/fire-engine/chrome-cdp): remove wait inserts on actions * scrapeURL: v0 backwards compat * logger fixes * feat(scrapeurl): v0 returnOnlyUrls support * fix(scrapeURL/v0): URL leniency * fix(batch-scrape): ts non-nullable * fix(scrapeURL/fire-engine/chromecdp): fix wait action * fix(logger): remove error debug key * feat(requests.http): use dotenv expression * fix(scrapeURL/extractMetadata): extract custom metadata * fix crawl option conversion * feat(scrapeURL): Add retry logic to robustFetch * fix(scrapeURL): crawl stuff * fix(scrapeURL): LLM extract * fix(scrapeURL/v0): search fix * fix(tests/v0): grant larger response size to v0 crawl status * feat(scrapeURL): basic fetch engine * feat(scrapeURL): playwright engine * feat(scrapeURL): add url-specific parameters * Update readme and examples * added e2e tests for most parameters. Still a few actions, location and iframes to be done. * fixed type * Nick: * Update scrape.ts * Update index.ts * added actions and base64 check * Nick: skipTls feature flag? 
* 403 * todo * todo * fixes * yeet headers from url specific params * add warning when final engine has feature deficit * expose engine results tracker for ScrapeEvents implementation * ingest scrape events * fixed some tests * comment * Update index.test.ts * fixed rawHtml * Update index.test.ts * update comments * move geolocation to global f-e option, fix removeBase64Images * Nick: * trim url-specific params * Update index.ts --------- Co-authored-by: Eric Ciarla <ericciarla@yahoo.com> Co-authored-by: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com> Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
parent
ed5a0d3cf2
commit
8d467c8ca7
2
.github/archive/js-sdk.yml
vendored
2
.github/archive/js-sdk.yml
vendored
|
@ -8,7 +8,6 @@ env:
|
|||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||
HOST: ${{ secrets.HOST }}
|
||||
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
||||
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
|
||||
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
||||
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
||||
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
||||
|
@ -21,7 +20,6 @@ env:
|
|||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
||||
HDX_NODE_BETA_MODE: 1
|
||||
|
||||
jobs:
|
||||
|
|
2
.github/archive/python-sdk.yml
vendored
2
.github/archive/python-sdk.yml
vendored
|
@ -8,7 +8,6 @@ env:
|
|||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||
HOST: ${{ secrets.HOST }}
|
||||
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
||||
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
|
||||
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
||||
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
||||
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
||||
|
@ -21,7 +20,6 @@ env:
|
|||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
||||
HDX_NODE_BETA_MODE: 1
|
||||
|
||||
jobs:
|
||||
|
|
2
.github/archive/rust-sdk.yml
vendored
2
.github/archive/rust-sdk.yml
vendored
|
@ -8,7 +8,6 @@ env:
|
|||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||
HOST: ${{ secrets.HOST }}
|
||||
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
||||
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
|
||||
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
||||
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
||||
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
||||
|
@ -21,7 +20,6 @@ env:
|
|||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
||||
HDX_NODE_BETA_MODE: 1
|
||||
|
||||
|
||||
|
|
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
|
@ -12,7 +12,6 @@ env:
|
|||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||
HOST: ${{ secrets.HOST }}
|
||||
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
||||
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
|
||||
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
||||
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
||||
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
||||
|
@ -25,7 +24,6 @@ env:
|
|||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
||||
HDX_NODE_BETA_MODE: 1
|
||||
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
|
||||
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
|
||||
|
|
32
.github/workflows/deploy-image-staging.yml
vendored
Normal file
32
.github/workflows/deploy-image-staging.yml
vendored
Normal file
|
@ -0,0 +1,32 @@
|
|||
name: STAGING Deploy Images to GHCR
|
||||
|
||||
env:
|
||||
DOTNET_VERSION: '6.0.x'
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- mog/webscraper-refactor
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
push-app-image:
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
working-directory: './apps/api'
|
||||
steps:
|
||||
- name: 'Checkout GitHub Action'
|
||||
uses: actions/checkout@main
|
||||
|
||||
- name: 'Login to GitHub Container Registry'
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{github.actor}}
|
||||
password: ${{secrets.GITHUB_TOKEN}}
|
||||
|
||||
- name: 'Build Inventory Image'
|
||||
run: |
|
||||
docker build . --tag ghcr.io/mendableai/firecrawl-staging:latest
|
||||
docker push ghcr.io/mendableai/firecrawl-staging:latest
|
|
@ -41,7 +41,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
|
|||
SCRAPING_BEE_API_KEY= # Set if you'd like to use ScrapingBee to handle JS blocking
|
||||
OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
|
||||
BULL_AUTH_KEY= @
|
||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
|
|
|
@ -62,7 +62,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
|
|||
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
|
||||
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
|
||||
BULL_AUTH_KEY= @
|
||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
|
|
|
@ -33,8 +33,6 @@ SCRAPING_BEE_API_KEY=
|
|||
# add for LLM-dependent features (image alt generation, etc.)
|
||||
OPENAI_API_KEY=
|
||||
BULL_AUTH_KEY=@
|
||||
# use if you're configuring basic logging with logtail
|
||||
LOGTAIL_KEY=
|
||||
# set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
LLAMAPARSE_API_KEY=
|
||||
# set if you'd like to send slack server health status messages
|
||||
|
@ -54,9 +52,6 @@ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
|
|||
STRIPE_PRICE_ID_GROWTH=
|
||||
STRIPE_PRICE_ID_GROWTH_YEARLY=
|
||||
|
||||
HYPERDX_API_KEY=
|
||||
HDX_NODE_BETA_MODE=1
|
||||
|
||||
# set if you'd like to use the fire engine closed beta
|
||||
FIRE_ENGINE_BETA_URL=
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
global.fetch = require('jest-fetch-mock');
|
||||
// global.fetch = require('jest-fetch-mock');
|
||||
|
|
|
@ -32,9 +32,11 @@
|
|||
"@tsconfig/recommended": "^1.0.3",
|
||||
"@types/body-parser": "^1.19.2",
|
||||
"@types/cors": "^2.8.13",
|
||||
"@types/escape-html": "^1.0.4",
|
||||
"@types/express": "^4.17.17",
|
||||
"@types/jest": "^29.5.12",
|
||||
"@types/node": "^20.14.1",
|
||||
"@types/pdf-parse": "^1.1.4",
|
||||
"body-parser": "^1.20.1",
|
||||
"express": "^4.18.2",
|
||||
"jest": "^29.6.3",
|
||||
|
@ -53,9 +55,7 @@
|
|||
"@bull-board/api": "^5.20.5",
|
||||
"@bull-board/express": "^5.20.5",
|
||||
"@devil7softwares/pos": "^1.0.2",
|
||||
"@dqbd/tiktoken": "^1.0.13",
|
||||
"@hyperdx/node-opentelemetry": "^0.8.1",
|
||||
"@logtail/node": "^0.4.12",
|
||||
"@dqbd/tiktoken": "^1.0.16",
|
||||
"@nangohq/node": "^0.40.8",
|
||||
"@sentry/cli": "^2.33.1",
|
||||
"@sentry/node": "^8.26.0",
|
||||
|
@ -78,6 +78,7 @@
|
|||
"date-fns": "^3.6.0",
|
||||
"dotenv": "^16.3.1",
|
||||
"dotenv-cli": "^7.4.2",
|
||||
"escape-html": "^1.0.3",
|
||||
"express-rate-limit": "^7.3.1",
|
||||
"express-ws": "^5.0.2",
|
||||
"form-data": "^4.0.0",
|
||||
|
@ -92,6 +93,7 @@
|
|||
"languagedetect": "^2.0.0",
|
||||
"logsnag": "^1.0.0",
|
||||
"luxon": "^3.4.3",
|
||||
"marked": "^14.1.2",
|
||||
"md5": "^2.3.0",
|
||||
"moment": "^2.29.4",
|
||||
"mongoose": "^8.4.4",
|
||||
|
@ -114,6 +116,8 @@
|
|||
"typesense": "^1.5.4",
|
||||
"unstructured-client": "^0.11.3",
|
||||
"uuid": "^10.0.0",
|
||||
"winston": "^3.14.2",
|
||||
"winston-transport": "^4.8.0",
|
||||
"wordpos": "^2.1.0",
|
||||
"ws": "^8.18.0",
|
||||
"xml2js": "^0.6.2",
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,15 +1,15 @@
|
|||
### Scrape Website
|
||||
POST http://localhost:3002/v0/scrape HTTP/1.1
|
||||
Authorization: Bearer fc-
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"url":"corterix.com"
|
||||
"url":"firecrawl.dev"
|
||||
}
|
||||
|
||||
### Check Job Status
|
||||
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
|
||||
Authorization: Bearer fc-
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
|
||||
|
||||
### Check Job Status
|
||||
|
@ -18,7 +18,7 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
|
|||
|
||||
### Crawl Website
|
||||
POST http://localhost:3002/v0/crawl HTTP/1.1
|
||||
Authorization: Bearer fc-
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
|
@ -45,7 +45,7 @@ content-type: application/json
|
|||
|
||||
### Scrape Website
|
||||
POST http://localhost:3002/v0/scrape HTTP/1.1
|
||||
Authorization: Bearer
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
|
@ -56,12 +56,12 @@ content-type: application/json
|
|||
|
||||
### Check Job Status
|
||||
GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
|
||||
Authorization: Bearer
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
|
||||
### Crawl Website (hosted API)
|
||||
|
||||
POST https://api.firecrawl.dev/v0/crawl HTTP/1.1
|
||||
Authorization: Bearer
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
|
@ -70,7 +70,7 @@ content-type: application/json
|
|||
|
||||
### Check Job Status
|
||||
GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
|
||||
Authorization: Bearer
|
||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||
|
||||
### Get Active Jobs Count
|
||||
GET http://localhost:3002/serverHealthCheck
|
||||
|
|
2
apps/api/sharedLibs/go-html-to-md/.gitignore
vendored
Normal file
2
apps/api/sharedLibs/go-html-to-md/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
html-to-markdown.so
|
||||
html-to-markdown.h
|
|
@ -844,7 +844,7 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(crawlInitResponse.statusCode).toBe(200);
|
||||
expect(crawlInitResponse.body).toHaveProperty("jobId");
|
||||
|
||||
let crawlStatus: string;
|
||||
let crawlStatus: string = "scraping";
|
||||
let crawlData = [];
|
||||
while (crawlStatus !== "completed") {
|
||||
const statusResponse = await request(TEST_URL)
|
||||
|
|
|
@ -20,7 +20,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||
process.env.SCRAPING_BEE_API_KEY = "";
|
||||
process.env.OPENAI_API_KEY = "";
|
||||
process.env.BULL_AUTH_KEY = "";
|
||||
process.env.LOGTAIL_KEY = "";
|
||||
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
|
||||
process.env.LLAMAPARSE_API_KEY = "";
|
||||
process.env.TEST_API_KEY = "";
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import request from "supertest";
|
||||
import { configDotenv } from "dotenv";
|
||||
import {
|
||||
ScrapeRequest,
|
||||
ScrapeRequestInput,
|
||||
ScrapeResponseRequestTest,
|
||||
} from "../../controllers/v1/types";
|
||||
|
||||
|
@ -44,7 +44,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
});
|
||||
|
||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://facebook.com/fake-test",
|
||||
};
|
||||
|
||||
|
@ -73,7 +73,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
it.concurrent(
|
||||
"should return a successful response with a valid API key",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
};
|
||||
|
||||
|
@ -125,7 +125,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
it.concurrent(
|
||||
"should return a successful response with a valid API key",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://arxiv.org/abs/2410.04840",
|
||||
};
|
||||
|
||||
|
@ -167,7 +167,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
it.concurrent(
|
||||
"should return a successful response with a valid API key and includeHtml set to true",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["markdown", "html"],
|
||||
};
|
||||
|
@ -194,7 +194,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
30000
|
||||
);
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
||||
// formats: ["markdown", "html"],
|
||||
};
|
||||
|
@ -217,7 +217,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://arxiv.org/pdf/astro-ph/9301001"
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
|
@ -240,7 +240,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
}, 60000);
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://www.scrapethissite.com/",
|
||||
onlyMainContent: false // default is true
|
||||
};
|
||||
|
@ -261,7 +261,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
|
||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
|
||||
|
||||
const scrapeRequestWithRemoveTags: ScrapeRequest = {
|
||||
const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
|
||||
url: "https://www.scrapethissite.com/",
|
||||
excludeTags: ['.nav', '#footer', 'strong'],
|
||||
onlyMainContent: false // default is true
|
||||
|
@ -407,7 +407,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
it.concurrent(
|
||||
"should return a successful response with a valid API key and includeHtml set to true",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["html","rawHtml"],
|
||||
};
|
||||
|
@ -438,7 +438,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
it.concurrent(
|
||||
"should return a successful response with waitFor",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://ycombinator.com/companies",
|
||||
formats: ["markdown"],
|
||||
waitFor: 8000
|
||||
|
@ -471,7 +471,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||
it.concurrent(
|
||||
"should return a successful response with a valid links on page",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["links"],
|
||||
};
|
||||
|
@ -672,7 +672,7 @@ describe("POST /v1/crawl", () => {
|
|||
});
|
||||
|
||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
const scrapeRequest: ScrapeRequestInput = {
|
||||
url: "https://facebook.com/fake-test",
|
||||
};
|
||||
|
||||
|
|
603
apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts
Normal file
603
apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts
Normal file
|
@ -0,0 +1,603 @@
|
|||
import request from "supertest";
|
||||
import { configDotenv } from "dotenv";
|
||||
import {
|
||||
ScrapeRequest,
|
||||
ScrapeResponseRequestTest,
|
||||
} from "../../controllers/v1/types";
|
||||
|
||||
configDotenv();
|
||||
const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
|
||||
const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test
|
||||
|
||||
describe("E2E Tests for v1 API Routes", () => {
|
||||
|
||||
// A target that answers HTTP 403 is still expected to yield a 200 from the
// API, with the upstream status reported in `metadata.statusCode`.
it.concurrent('should return a successful response for a scrape with 403 page', async () => {
  const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
    .post('/v1/scrape')
    .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
    .set('Content-Type', 'application/json')
    .send({ url: 'https://httpstat.us/403' });
  const { statusCode, body } = res;

  expect(statusCode).toBe(200);
  expect(body).toHaveProperty('data');
  // Narrow the response union so `data` is typed for the checks below.
  if (!("data" in body)) {
    throw new Error("Expected response body to have 'data' property");
  }

  const { data } = body;
  expect(data).toHaveProperty('markdown');
  expect(data).toHaveProperty('metadata');
  expect(data.metadata.statusCode).toBe(403);
}, 30000);
|
||||
|
||||
// With no `formats` in the request, the response is expected to carry markdown
// of the main content only.
it.concurrent("should handle 'formats:markdown (default)' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);
    const { statusCode, body } = res;

    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` is typed for the checks below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    expect(body.data).toHaveProperty("markdown");
    const markdown = body.data.markdown;

    // Snippets that must be present in the markdown.
    const expectedSnippets = [
      "This page is used for end-to-end (e2e) testing with Firecrawl.",
      "Content with id #content-1",
      // "Loading...",
      "Click me!",
      "Power your AI apps with clean data crawled from any website. It's also open-source.", // firecrawl.dev inside an iframe
      "This content loads only when you see it. Don't blink! 👼", // the browser always scrolls to the bottom
    ];
    for (const snippet of expectedSnippets) {
      expect(markdown).toContain(snippet);
    }

    // Snippets that must be absent from the markdown.
    const forbiddenSnippets = [
      "Header", // Only main content is returned by default
      "footer", // Only main content is returned by default
      "This content is only visible on mobile",
    ];
    for (const snippet of forbiddenSnippets) {
      expect(markdown).not.toContain(snippet);
    }
  },
  30000);
|
||||
|
||||
// Requesting only `html` should return cleaned HTML and no markdown.
it.concurrent("should handle 'formats:html' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, formats: ["html"] } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);
    const { statusCode, body } = res;

    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` is typed for the checks below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { data } = body;
    expect(data).not.toHaveProperty("markdown");
    expect(data).toHaveProperty("html");

    // The page header must be gone while the intro paragraph remains.
    expect(data.html).not.toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
    expect(data.html).toContain("<p style=\"\">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
  },
  30000);
|
||||
|
||||
// Requesting only `rawHtml` should return HTML that still includes the page
// header, and no markdown.
it.concurrent("should handle 'rawHtml' in 'formats' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, formats: ["rawHtml"] } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);
    const { statusCode, body } = res;

    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` is typed for the checks below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { data } = body;
    expect(data).not.toHaveProperty("markdown");
    expect(data).toHaveProperty("rawHtml");

    // Only closing fragments are matched, so tag attributes are not pinned.
    expect(data.rawHtml).toContain(">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
    expect(data.rawHtml).toContain(">Header</header>");
  },
  30000);
|
||||
|
||||
// - TODO: tests for links
|
||||
// - TODO: tests for screenshot
|
||||
// - TODO: tests for screenshot@fullPage
|
||||
|
||||
// Sends a custom request header with the scrape and expects it to show up in
// the scraped markdown.
// NOTE(review): presumably the e2e test page echoes the request headers it
// receives — confirm against the test server.
it.concurrent("should handle 'headers' parameter correctly", async () => {
  // The blanket `@ts-ignore` that used to sit here was dropped: it would hide
  // any type error on the request literal, and no sibling test needs it.
  const scrapeRequest = {
    url: E2E_TEST_SERVER_URL,
    headers: { "e2e-header-test": "firecrawl" }
  } as ScrapeRequest;

  const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(scrapeRequest);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("data");
  // Narrow the response union so `data` is typed for the check below.
  if (!("data" in response.body)) {
    throw new Error("Expected response body to have 'data' property");
  }

  expect(response.body.data.markdown).toContain("e2e-header-test: firecrawl");
}, 30000);
|
||||
|
||||
// Restricting the scrape to `#content-1` should keep that element's text and
// drop the page's intro paragraph.
it.concurrent("should handle 'includeTags' parameter correctly",
  async () => {
    const scrapeRequest = {
      url: E2E_TEST_SERVER_URL,
      includeTags: ['#content-1']
    } as ScrapeRequest;

    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    // Narrow the response union so `data` is typed for the checks below.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    // Check the plain text, not `<p>…</p>`: the old HTML-wrapped needle would
    // not be expected in markdown, so it passed whether or not the filter worked.
    expect(response.body.data.markdown).not.toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
    expect(response.body.data.markdown).toContain("Content with id #content-1");
  },
  30000);
|
||||
|
||||
// Excluding `#content-1` should drop that element's text while keeping the
// rest of the page.
it.concurrent("should handle 'excludeTags' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, excludeTags: ['#content-1'] } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);
    const { statusCode, body } = res;

    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` is typed for the checks below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const markdown = body.data.markdown;
    expect(markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
    expect(markdown).not.toContain("Content with id #content-1");
  },
  30000);
|
||||
|
||||
// With `onlyMainContent: false` the page header must remain in the HTML.
it.concurrent("should handle 'onlyMainContent' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["html", "markdown"],
      onlyMainContent: false
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);
    const { statusCode, body } = res;

    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` is typed for the checks below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { markdown, html } = body.data;
    expect(markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
    expect(html).toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
  },
  30000);
|
||||
|
||||
// A 500 ms request timeout is expected to make the API answer 408 with an
// error body.
it.concurrent("should handle 'timeout' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, timeout: 500 } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);
    const { statusCode, body } = res;

    expect(statusCode).toBe(408);

    // Narrow the response union to its error variant.
    if (!("error" in body)) {
      throw new Error("Expected response body to have 'error' property");
    }
    expect(body.error).toBe("Request timed out");
    expect(body.success).toBe(false);
  }, 30000);
|
||||
|
||||
|
||||
// With `mobile: true` the markdown is expected to include the page's
// mobile-only text.
it.concurrent("should handle 'mobile' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, mobile: true } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);
    const { statusCode, body } = res;

    expect(statusCode).toBe(200);

    // Narrow the response union so `data` is typed for the check below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(body.data.markdown).toContain("This content is only visible on mobile");
  },
  30000);
|
||||
|
||||
// Scrapes the same arXiv PDF twice: once with default options, where the
// markdown should hold the parsed text, and once with `parsePDF: false`, where
// it should hold the unparsed payload.
it.concurrent("should handle 'parsePDF' parameter correctly",
  async () => {
    const pdfUrl = 'https://arxiv.org/pdf/astro-ph/9301001.pdf';
    // Marker the test expects only in the unparsed output.
    const rawPdfMarker = 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm';

    // The two fixed 6 s sleeps that followed each request were removed: both
    // responses are already awaited, so the sleeps only ate 12 s of the 30 s
    // test budget.
    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({ url: pdfUrl });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty('data');
    // Narrow the response union so `data` is typed for the checks below.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    expect(response.body.data.markdown).toContain('arXiv:astro-ph/9301001v1 7 Jan 1993');
    expect(response.body.data.markdown).not.toContain(rawPdfMarker);

    const responseNoParsePDF: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({ url: pdfUrl, parsePDF: false });

    expect(responseNoParsePDF.statusCode).toBe(200);
    expect(responseNoParsePDF.body).toHaveProperty('data');
    if (!("data" in responseNoParsePDF.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(responseNoParsePDF.body.data.markdown).toContain(rawPdfMarker);
  },
  30000);
|
||||
|
||||
// it.concurrent("should handle 'location' parameter correctly",
|
||||
// async () => {
|
||||
// const scrapeRequest: ScrapeRequest = {
|
||||
// url: "https://roastmywebsite.ai",
|
||||
// location: {
|
||||
// country: "US",
|
||||
// languages: ["en"]
|
||||
// }
|
||||
// };
|
||||
|
||||
// const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
||||
// .post("/v1/scrape")
|
||||
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
// .set("Content-Type", "application/json")
|
||||
// .send(scrapeRequest);
|
||||
|
||||
// expect(response.statusCode).toBe(200);
|
||||
// // Add assertions to verify location is handled correctly
|
||||
// },
|
||||
// 30000);
|
||||
|
||||
it.concurrent("should handle 'skipTlsVerification' parameter correctly",
  /**
   * Scrapes https://expired.badssl.com/ twice through POST /v1/scrape:
   *
   *  1. without `skipTlsVerification` - the API call itself must succeed (HTTP 200)
   *     while the scraped page is reported with `metadata.pageStatusCode` 500;
   *  2. with `skipTlsVerification: true` - the API call must succeed and the
   *     returned markdown must contain "badssl.com".
   *
   * NOTE(review): each request asks for a 120000 ms scrape timeout while the Jest
   * timeout for the whole test is 60000 ms - confirm this mismatch is intended.
   */
  async () => {
    // Phase 1: default TLS handling.
    const scrapeRequest = {
      url: "https://expired.badssl.com/",
      timeout: 120000
    } as ScrapeRequest;

    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    // The explicit guard narrows the response union so `data` can be accessed below.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data.metadata.pageStatusCode).toBe(500);

    // Phase 2: same URL with TLS verification skipped.
    const scrapeRequestWithSkipTlsVerification = {
      url: "https://expired.badssl.com/",
      skipTlsVerification: true,
      timeout: 120000
    } as ScrapeRequest;

    const responseWithSkipTlsVerification: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequestWithSkipTlsVerification);

    expect(responseWithSkipTlsVerification.statusCode).toBe(200);
    if (!("data" in responseWithSkipTlsVerification.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(responseWithSkipTlsVerification.body.data.markdown).toContain("badssl.com");
  },
  60000);
|
||||
|
||||
it.concurrent("should handle 'removeBase64Images' parameter correctly",
  /**
   * Posts a scrape of E2E_TEST_SERVER_URL with `removeBase64Images: true` and
   * checks that the API answers 200 with a `data` payload.
   */
  async () => {
    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        url: E2E_TEST_SERVER_URL,
        removeBase64Images: true
      } as ScrapeRequest);

    expect(res.statusCode).toBe(200);
    if (!("data" in res.body)) throw new Error("Expected response body to have 'data' property");

    // TODO: the placeholder is not emitted for every image yet, so the content
    // assertion stays disabled:
    // expect(res.body.data.markdown).toContain("Image-Removed");
  },
  30000);
|
||||
|
||||
it.concurrent("should handle 'action wait' parameter correctly",
  /**
   * Runs a single 10000 ms `wait` action before scraping E2E_TEST_SERVER_URL and
   * expects the markdown to show the loaded text rather than the "Loading..." text.
   */
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [{ type: "wait", milliseconds: 10000 }]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    if (!("data" in res.body)) throw new Error("Expected response body to have 'data' property");

    const pageMarkdown = res.body.data.markdown;
    expect(pageMarkdown).not.toContain("Loading...");
    expect(pageMarkdown).toContain("Content loaded after 5 seconds!");
  },
  30000);
|
||||
|
||||
// screenshot
|
||||
it.concurrent("should handle 'action screenshot' parameter correctly",
  /**
   * Requests one `screenshot` action on E2E_TEST_SERVER_URL and checks that the
   * first returned screenshot is a non-empty string containing the expected
   * storage URL prefix.
   */
  async () => {
    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        url: E2E_TEST_SERVER_URL,
        actions: [{ type: "screenshot" }]
      } as ScrapeRequest);

    expect(res.statusCode).toBe(200);
    if (!("data" in res.body)) throw new Error("Expected response body to have 'data' property");

    const shots = res.body.data.actions?.screenshots;
    if (!shots) throw new Error("Expected response body to have screenshots array");

    expect(shots[0].length).toBeGreaterThan(0);
    expect(shots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");

    // TODO compare screenshot with expected screenshot
  },
  30000);
|
||||
|
||||
it.concurrent("should handle 'action screenshot@fullPage' parameter correctly",
  /**
   * Chains a full-page `screenshot` action with a `scrape` action on
   * E2E_TEST_SERVER_URL, then verifies the first screenshot URL and the first
   * scrape result (its url and a known fragment of its html).
   */
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [
        { type: "screenshot", fullPage: true },
        { type: "scrape" }
      ]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    if (!("data" in res.body)) throw new Error("Expected response body to have 'data' property");

    // Screenshot half of the action results.
    const shots = res.body.data.actions?.screenshots;
    if (!shots) throw new Error("Expected response body to have screenshots array");
    expect(shots[0].length).toBeGreaterThan(0);
    expect(shots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");

    // Scrape half of the action results.
    const scrapes = res.body.data.actions?.scrapes;
    if (!scrapes) throw new Error("Expected response body to have scrapes array");
    expect(scrapes[0].url).toBe("https://firecrawl-e2e-test.vercel.app/");
    expect(scrapes[0].html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.</p>");

    // TODO compare screenshot with expected full page screenshot
  },
  30000);
|
||||
|
||||
it.concurrent("should handle 'action click' parameter correctly",
  /**
   * Clicks the `#click-me` element on E2E_TEST_SERVER_URL before scraping and
   * expects the markdown to contain the post-click text instead of "Click me!".
   */
  async () => {
    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        url: E2E_TEST_SERVER_URL,
        actions: [{ type: "click", selector: "#click-me" }]
      } as ScrapeRequest);

    expect(res.statusCode).toBe(200);
    if (!("data" in res.body)) throw new Error("Expected response body to have 'data' property");

    const pageMarkdown = res.body.data.markdown;
    expect(pageMarkdown).not.toContain("Click me!");
    expect(pageMarkdown).toContain("Text changed after click!");
  },
  30000);
|
||||
|
||||
it.concurrent("should handle 'action write' parameter correctly",
  /**
   * Clicks `#input-1` and then writes "Hello, world!" on E2E_TEST_SERVER_URL,
   * requesting the `html` format. Only the 200 status and the presence of
   * `data` are asserted for now; the html assertion is disabled (see TODO).
   */
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["html"],
      actions: [
        { type: "click", selector: "#input-1" },
        { type: "write", text: "Hello, world!" }
      ]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    if (!("data" in res.body)) throw new Error("Expected response body to have 'data' property");

    // TODO: fix this test (need to fix fire-engine first), then enable:
    // expect(res.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
  },
  30000);
|
||||
|
||||
// TODO: fix this test (need to fix fire-engine first)
|
||||
it.concurrent("should handle 'action pressKey' parameter correctly",
  /**
   * Sends a scrape of E2E_TEST_SERVER_URL with a single `press` action for the
   * "ArrowDown" key. The request is issued but nothing is asserted yet: all
   * expectations are disabled until the TODO below is resolved.
   */
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["markdown"],
      actions: [{ type: "press", key: "ArrowDown" }]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    // TODO: fix this test (need to fix fire-engine first).
    // Right now the response body is: { success: false, error: '(Internal server error) - null' }
    // expect(res.statusCode).toBe(200);
    // if (!("data" in res.body)) {
    //   throw new Error("Expected response body to have 'data' property");
    // }
    // expect(res.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
  },
  30000);
|
||||
|
||||
// TODO: fix this test (need to fix fire-engine first)
|
||||
it.concurrent("should handle 'action scroll' parameter correctly",
  /**
   * Sends a scrape of E2E_TEST_SERVER_URL that clicks `#scroll-bottom-loader`
   * and then scrolls down by 2000. The request is issued but nothing is
   * asserted yet: the expectations below are still disabled.
   */
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["markdown"],
      actions: [
        { type: "click", selector: "#scroll-bottom-loader" },
        { type: "scroll", direction: "down", amount: 2000 }
      ]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    // TODO: re-enable these assertions
    // expect(res.statusCode).toBe(200);
    // if (!("data" in res.body)) {
    //   throw new Error("Expected response body to have 'data' property");
    // }
    //
    // expect(res.body.data.markdown).toContain("You have reached the bottom!")
  },
  30000);
|
||||
|
||||
// TODO: test scrape action
|
||||
|
||||
});
|
|
@ -776,7 +776,8 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||
await new Promise((r) => setTimeout(r, 10000));
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.maxResponseSize(4000000000);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
|
|
|
@ -9,9 +9,8 @@ import {
|
|||
import { supabase_service } from "../services/supabase";
|
||||
import { withAuth } from "../lib/withAuth";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
|
||||
import { sendNotification } from "../services/notification/email_notification";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { logger } from "../lib/logger";
|
||||
import { redlock } from "../services/redlock";
|
||||
import { deleteKey, getValue } from "../services/redis";
|
||||
import { setValue } from "../services/redis";
|
||||
|
@ -40,8 +39,8 @@ function normalizedApiIsUuid(potentialUuid: string): boolean {
|
|||
export async function setCachedACUC(
|
||||
api_key: string,
|
||||
acuc:
|
||||
| AuthCreditUsageChunk
|
||||
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)
|
||||
| AuthCreditUsageChunk | null
|
||||
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
|
||||
) {
|
||||
const cacheKeyACUC = `acuc_${api_key}`;
|
||||
const redLockKey = `lock_${cacheKeyACUC}`;
|
||||
|
@ -49,7 +48,7 @@ export async function setCachedACUC(
|
|||
try {
|
||||
await redlock.using([redLockKey], 10000, {}, async (signal) => {
|
||||
if (typeof acuc === "function") {
|
||||
acuc = acuc(JSON.parse(await getValue(cacheKeyACUC)));
|
||||
acuc = acuc(JSON.parse(await getValue(cacheKeyACUC) ?? "null"));
|
||||
|
||||
if (acuc === null) {
|
||||
if (signal.aborted) {
|
||||
|
@ -69,7 +68,7 @@ export async function setCachedACUC(
|
|||
await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
|
||||
logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -103,7 +102,7 @@ export async function getACUC(
|
|||
break;
|
||||
}
|
||||
|
||||
Logger.warn(
|
||||
logger.warn(
|
||||
`Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
|
||||
);
|
||||
retries++;
|
||||
|
@ -146,33 +145,14 @@ export async function authenticateUser(
|
|||
res,
|
||||
mode?: RateLimiterMode
|
||||
): Promise<AuthResponse> {
|
||||
return withAuth(supaAuthenticateUser)(req, res, mode);
|
||||
}
|
||||
|
||||
function setTrace(team_id: string, api_key: string) {
|
||||
try {
|
||||
setTraceAttributes({
|
||||
team_id,
|
||||
api_key,
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error setting trace attributes: ${error.message}`);
|
||||
}
|
||||
return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode);
|
||||
}
|
||||
|
||||
export async function supaAuthenticateUser(
|
||||
req,
|
||||
res,
|
||||
mode?: RateLimiterMode
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
team_id?: string;
|
||||
error?: string;
|
||||
status?: number;
|
||||
plan?: PlanType;
|
||||
chunk?: AuthCreditUsageChunk;
|
||||
}> {
|
||||
): Promise<AuthResponse> {
|
||||
const authHeader =
|
||||
req.headers.authorization ??
|
||||
(req.headers["sec-websocket-protocol"]
|
||||
|
@ -200,7 +180,7 @@ export async function supaAuthenticateUser(
|
|||
|
||||
let teamId: string | null = null;
|
||||
let priceId: string | null = null;
|
||||
let chunk: AuthCreditUsageChunk;
|
||||
let chunk: AuthCreditUsageChunk | null = null;
|
||||
|
||||
if (token == "this_is_just_a_preview_token") {
|
||||
if (mode == RateLimiterMode.CrawlStatus) {
|
||||
|
@ -233,8 +213,6 @@ export async function supaAuthenticateUser(
|
|||
priceId = chunk.price_id;
|
||||
|
||||
const plan = getPlanByPriceId(priceId);
|
||||
// HyperDX Logging
|
||||
setTrace(teamId, normalizedApi);
|
||||
subscriptionData = {
|
||||
team_id: teamId,
|
||||
plan,
|
||||
|
@ -291,7 +269,7 @@ export async function supaAuthenticateUser(
|
|||
try {
|
||||
await rateLimiter.consume(team_endpoint_token);
|
||||
} catch (rateLimiterRes) {
|
||||
Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
|
||||
logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
|
||||
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
||||
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
||||
|
||||
|
@ -318,7 +296,7 @@ export async function supaAuthenticateUser(
|
|||
mode === RateLimiterMode.CrawlStatus ||
|
||||
mode === RateLimiterMode.Search)
|
||||
) {
|
||||
return { success: true, team_id: "preview" };
|
||||
return { success: true, team_id: "preview", chunk: null };
|
||||
// check the origin of the request and make sure its from firecrawl.dev
|
||||
// const origin = req.headers.origin;
|
||||
// if (origin && origin.includes("firecrawl.dev")){
|
||||
|
@ -333,12 +311,12 @@ export async function supaAuthenticateUser(
|
|||
|
||||
return {
|
||||
success: true,
|
||||
team_id: subscriptionData.team_id,
|
||||
plan: (subscriptionData.plan ?? "") as PlanType,
|
||||
team_id: teamId ?? undefined,
|
||||
plan: (subscriptionData?.plan ?? "") as PlanType,
|
||||
chunk,
|
||||
};
|
||||
}
|
||||
function getPlanByPriceId(price_id: string): PlanType {
|
||||
function getPlanByPriceId(price_id: string | null): PlanType {
|
||||
switch (price_id) {
|
||||
case process.env.STRIPE_PRICE_ID_STARTER:
|
||||
return "starter";
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import { Request, Response } from "express";
|
||||
import { supabase_service } from "../../../services/supabase";
|
||||
import { clearACUC } from "../../auth";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { logger } from "../../../lib/logger";
|
||||
|
||||
export async function acucCacheClearController(req: Request, res: Response) {
|
||||
try {
|
||||
|
@ -12,11 +12,11 @@ export async function acucCacheClearController(req: Request, res: Response) {
|
|||
.select("*")
|
||||
.eq("team_id", team_id);
|
||||
|
||||
await Promise.all(keys.data.map((x) => clearACUC(x.key)));
|
||||
await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key)));
|
||||
|
||||
res.json({ ok: true });
|
||||
} catch (error) {
|
||||
Logger.error(`Error clearing ACUC cache via API route: ${error}`);
|
||||
logger.error(`Error clearing ACUC cache via API route: ${error}`);
|
||||
res.status(500).json({ error: "Internal server error" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import { Request, Response } from "express";
|
||||
|
||||
import { Job } from "bullmq";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { logger } from "../../../lib/logger";
|
||||
import { getScrapeQueue } from "../../../services/queue-service";
|
||||
import { checkAlerts } from "../../../services/alerts";
|
||||
import { sendSlackWebhook } from "../../../services/alerts/slack";
|
||||
|
@ -10,7 +10,7 @@ export async function cleanBefore24hCompleteJobsController(
|
|||
req: Request,
|
||||
res: Response
|
||||
) {
|
||||
Logger.info("🐂 Cleaning jobs older than 24h");
|
||||
logger.info("🐂 Cleaning jobs older than 24h");
|
||||
try {
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const batchSize = 10;
|
||||
|
@ -31,7 +31,7 @@ export async function cleanBefore24hCompleteJobsController(
|
|||
).flat();
|
||||
const before24hJobs =
|
||||
completedJobs.filter(
|
||||
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||
(job) => job.finishedOn !== undefined && job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||
) || [];
|
||||
|
||||
let count = 0;
|
||||
|
@ -45,12 +45,12 @@ export async function cleanBefore24hCompleteJobsController(
|
|||
await job.remove();
|
||||
count++;
|
||||
} catch (jobError) {
|
||||
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
|
||||
logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
|
||||
}
|
||||
}
|
||||
return res.status(200).send(`Removed ${count} completed jobs.`);
|
||||
} catch (error) {
|
||||
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
|
||||
logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
|
||||
return res.status(500).send("Failed to clean jobs");
|
||||
}
|
||||
}
|
||||
|
@ -60,7 +60,7 @@ export async function checkQueuesController(req: Request, res: Response) {
|
|||
await checkAlerts();
|
||||
return res.status(200).send("Alerts initialized");
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to initialize alerts: ${error}`);
|
||||
logger.debug(`Failed to initialize alerts: ${error}`);
|
||||
return res.status(500).send("Failed to initialize alerts");
|
||||
}
|
||||
}
|
||||
|
@ -81,7 +81,7 @@ export async function queuesController(req: Request, res: Response) {
|
|||
noActiveJobs,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -165,7 +165,7 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||
}
|
||||
|
||||
if (targetMachineCount !== activeMachines) {
|
||||
Logger.info(
|
||||
logger.info(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
|
||||
);
|
||||
|
||||
|
@ -193,7 +193,7 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||
count: activeMachines,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).send("Failed to initialize autoscaler");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import { Request, Response } from "express";
|
||||
import Redis from "ioredis";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { logger } from "../../../lib/logger";
|
||||
import { redisRateLimitClient } from "../../../services/rate-limiter";
|
||||
|
||||
export async function redisHealthController(req: Request, res: Response) {
|
||||
|
@ -10,14 +10,14 @@ export async function redisHealthController(req: Request, res: Response) {
|
|||
return await operation();
|
||||
} catch (error) {
|
||||
if (attempt === retries) throw error;
|
||||
Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
|
||||
logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
|
||||
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
const queueRedis = new Redis(process.env.REDIS_URL);
|
||||
const queueRedis = new Redis(process.env.REDIS_URL!);
|
||||
|
||||
const testKey = "test";
|
||||
const testValue = "test";
|
||||
|
@ -29,7 +29,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
|||
queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
|
||||
await retryOperation(() => queueRedis.del(testKey));
|
||||
} catch (error) {
|
||||
Logger.error(`queueRedis health check failed: ${error}`);
|
||||
logger.error(`queueRedis health check failed: ${error}`);
|
||||
queueRedisHealth = null;
|
||||
}
|
||||
|
||||
|
@ -42,7 +42,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
|||
);
|
||||
await retryOperation(() => redisRateLimitClient.del(testKey));
|
||||
} catch (error) {
|
||||
Logger.error(`redisRateLimitClient health check failed: ${error}`);
|
||||
logger.error(`redisRateLimitClient health check failed: ${error}`);
|
||||
redisRateLimitHealth = null;
|
||||
}
|
||||
|
||||
|
@ -56,10 +56,10 @@ export async function redisHealthController(req: Request, res: Response) {
|
|||
healthStatus.queueRedis === "healthy" &&
|
||||
healthStatus.redisRateLimitClient === "healthy"
|
||||
) {
|
||||
Logger.info("Both Redis instances are healthy");
|
||||
logger.info("Both Redis instances are healthy");
|
||||
return res.status(200).json({ status: "healthy", details: healthStatus });
|
||||
} else {
|
||||
Logger.info(
|
||||
logger.info(
|
||||
`Redis instances health check: ${JSON.stringify(healthStatus)}`
|
||||
);
|
||||
// await sendSlackWebhook(
|
||||
|
@ -73,7 +73,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
|||
.json({ status: "unhealthy", details: healthStatus });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Redis health check failed: ${error}`);
|
||||
logger.error(`Redis health check failed: ${error}`);
|
||||
// await sendSlackWebhook(
|
||||
// `[REDIS DOWN] Redis instances health check: ${error.message}`,
|
||||
// true
|
||||
|
|
|
@ -2,7 +2,7 @@ import { Request, Response } from "express";
|
|||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { supabase_service } from "../../../src/services/supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
|
@ -12,15 +12,17 @@ export async function crawlCancelController(req: Request, res: Response) {
|
|||
try {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
const auth = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.CrawlStatus
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
if (!auth.success) {
|
||||
return res.status(auth.status).json({ error: auth.error });
|
||||
}
|
||||
|
||||
const { team_id } = auth;
|
||||
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
|
@ -46,7 +48,7 @@ export async function crawlCancelController(req: Request, res: Response) {
|
|||
sc.cancelled = true;
|
||||
await saveCrawl(req.params.jobId, sc);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
}
|
||||
|
||||
res.json({
|
||||
|
@ -54,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) {
|
|||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,15 +2,17 @@ import { Request, Response } from "express";
|
|||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { Job } from "bullmq";
|
||||
import { toLegacyDocument } from "../v1/types";
|
||||
configDotenv();
|
||||
|
||||
export async function getJobs(crawlId: string, ids: string[]) {
|
||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as Job[];
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
|
||||
|
@ -32,15 +34,17 @@ export async function getJobs(crawlId: string, ids: string[]) {
|
|||
|
||||
export async function crawlStatusController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
const auth = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.CrawlStatus
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
if (!auth.success) {
|
||||
return res.status(auth.status).json({ error: auth.error });
|
||||
}
|
||||
|
||||
const { team_id } = auth;
|
||||
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
|
@ -90,12 +94,12 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||
status: jobStatus,
|
||||
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
|
||||
total: jobs.length,
|
||||
data: jobStatus === "completed" ? data : null,
|
||||
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
|
||||
data: jobStatus === "completed" ? data.map(x => toLegacyDocument(x, sc.internalOptions)) : null,
|
||||
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null).map(x => toLegacyDocument(x, sc.internalOptions)),
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,24 +9,28 @@ import { validateIdempotencyKey } from "../../../src/services/idempotency/valida
|
|||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
|
||||
import { ZodError } from "zod";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
|
||||
const auth = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Crawl
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
if (!auth.success) {
|
||||
return res.status(auth.status).json({ error: auth.error });
|
||||
}
|
||||
|
||||
const { team_id, plan, chunk } = auth;
|
||||
|
||||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
|
@ -35,7 +39,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||
try {
|
||||
createIdempotencyKey(req);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -77,7 +81,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||
// TODO: need to do this to v1
|
||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||
|
||||
let url = req.body.url;
|
||||
let url = urlSchema.parse(req.body.url);
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
|
@ -123,7 +127,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
|
@ -132,10 +136,13 @@ export async function crawlController(req: Request, res: Response) {
|
|||
|
||||
await logCrawl(id, team_id);
|
||||
|
||||
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
team_id,
|
||||
plan,
|
||||
createdAt: Date.now(),
|
||||
|
@ -170,10 +177,11 @@ export async function crawlController(req: Request, res: Response) {
|
|||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
team_id,
|
||||
plan,
|
||||
pageOptions: pageOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
|
@ -208,10 +216,11 @@ export async function crawlController(req: Request, res: Response) {
|
|||
{
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
team_id,
|
||||
plan,
|
||||
pageOptions: pageOptions,
|
||||
plan: plan!,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
crawl_id: id,
|
||||
},
|
||||
|
@ -226,7 +235,9 @@ export async function crawlController(req: Request, res: Response) {
|
|||
res.json({ jobId: id });
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: error instanceof ZodError
|
||||
? "Invalid URL"
|
||||
: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,15 +3,16 @@ import { authenticateUser } from "../auth";
|
|||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { fromLegacyScrapeOptions } from "../v1/types";
|
||||
|
||||
export async function crawlPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, error, status, team_id:a, plan } = await authenticateUser(
|
||||
const auth = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Preview
|
||||
|
@ -19,10 +20,12 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||
|
||||
const team_id = "preview";
|
||||
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
if (!auth.success) {
|
||||
return res.status(auth.status).json({ error: auth.error });
|
||||
}
|
||||
|
||||
const { plan } = auth;
|
||||
|
||||
let url = req.body.url;
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
|
@ -71,7 +74,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
|
@ -84,10 +87,13 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||
robots = await this.getRobotsTxt();
|
||||
} catch (_) {}
|
||||
|
||||
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
team_id,
|
||||
plan,
|
||||
robots,
|
||||
|
@ -107,10 +113,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||
await addScrapeJob({
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id,
|
||||
plan,
|
||||
pageOptions: pageOptions,
|
||||
plan: plan!,
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
origin: "website-preview",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
|
@ -123,10 +130,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||
await addScrapeJob({
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id,
|
||||
plan,
|
||||
pageOptions: pageOptions,
|
||||
plan: plan!,
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
origin: "website-preview",
|
||||
crawl_id: id,
|
||||
}, {}, jobId);
|
||||
|
@ -136,7 +144,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||
res.json({ jobId: id });
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,13 +8,14 @@ import { authenticateUser } from "../auth";
|
|||
export const keyAuthController = async (req: Request, res: Response) => {
|
||||
try {
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
const auth = await authenticateUser(
|
||||
req,
|
||||
res
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
if (!auth.success) {
|
||||
return res.status(auth.status).json({ error: auth.error });
|
||||
}
|
||||
|
||||
// if success, return success: true
|
||||
return res.status(200).json({ success: true });
|
||||
} catch (error) {
|
||||
|
|
|
@ -7,7 +7,7 @@ import {
|
|||
import { authenticateUser } from "../auth";
|
||||
import { PlanType, RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { Document } from "../../lib/entities";
|
||||
import { Document, fromLegacyCombo, toLegacyDocument, url as urlSchema } from "../v1/types";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||
import {
|
||||
|
@ -19,9 +19,11 @@ import {
|
|||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { logger } from "../../lib/logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
import { fromLegacyScrapeOptions } from "../v1/types";
|
||||
import { ZodError } from "zod";
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
|
@ -35,10 +37,10 @@ export async function scrapeHelper(
|
|||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
data?: Document;
|
||||
data?: Document | { url: string };
|
||||
returnCode: number;
|
||||
}> {
|
||||
const url = req.body.url;
|
||||
const url = urlSchema.parse(req.body.url);
|
||||
if (typeof url !== "string") {
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
|
@ -54,15 +56,16 @@ export async function scrapeHelper(
|
|||
|
||||
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
|
||||
|
||||
const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions);
|
||||
|
||||
await addScrapeJob(
|
||||
{
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
team_id,
|
||||
pageOptions,
|
||||
plan,
|
||||
extractorOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
plan: plan!,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
is_scrape: true,
|
||||
},
|
||||
|
@ -81,7 +84,7 @@ export async function scrapeHelper(
|
|||
},
|
||||
async (span) => {
|
||||
try {
|
||||
doc = (await waitForJob(jobId, timeout))[0];
|
||||
doc = (await waitForJob<Document>(jobId, timeout));
|
||||
} catch (e) {
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
span.setAttribute("timedOut", true);
|
||||
|
@ -149,7 +152,7 @@ export async function scrapeHelper(
|
|||
|
||||
return {
|
||||
success: true,
|
||||
data: doc,
|
||||
data: toLegacyDocument(doc, internalOptions),
|
||||
returnCode: 200,
|
||||
};
|
||||
}
|
||||
|
@ -158,15 +161,17 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
try {
|
||||
let earlyReturn = false;
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
|
||||
const auth = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Scrape
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
if (!auth.success) {
|
||||
return res.status(auth.status).json({ error: auth.error });
|
||||
}
|
||||
|
||||
const { team_id, plan, chunk } = auth;
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
const extractorOptions = {
|
||||
|
@ -200,7 +205,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
earlyReturn = true;
|
||||
return res.status(500).json({
|
||||
error:
|
||||
|
@ -224,8 +229,8 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens =
|
||||
result.data && result.data.markdown
|
||||
? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
|
||||
result.data && (result.data as Document).markdown
|
||||
? numTokensFromString((result.data as Document).markdown!, "gpt-3.5-turbo")
|
||||
: 0;
|
||||
|
||||
if (result.success) {
|
||||
|
@ -246,7 +251,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
if (creditsToBeBilled > 0) {
|
||||
// billing for doc done on queue end, bill only for llm extraction
|
||||
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
}
|
||||
|
@ -254,17 +259,19 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
|
||||
let doc = result.data;
|
||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
||||
if (doc && doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
if (doc && (doc as Document).rawHtml) {
|
||||
delete (doc as Document).rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if(pageOptions && pageOptions.includeExtract) {
|
||||
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
|
||||
delete doc.markdown;
|
||||
if(!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
|
||||
delete (doc as Document).markdown;
|
||||
}
|
||||
}
|
||||
|
||||
const { scrapeOptions } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
|
@ -276,19 +283,20 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
scrapeOptions,
|
||||
origin: origin,
|
||||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({
|
||||
error:
|
||||
typeof error === "string"
|
||||
error instanceof ZodError
|
||||
? "Invalid URL"
|
||||
: typeof error === "string"
|
||||
? error
|
||||
: error?.message ?? "Internal Server Error",
|
||||
});
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { PlanType, RateLimiterMode } from "../../types";
|
||||
|
@ -8,21 +7,23 @@ import { PageOptions, SearchOptions } from "../../lib/entities";
|
|||
import { search } from "../../search";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { logger } from "../../lib/logger";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
import { Job } from "bullmq";
|
||||
import { Document, fromLegacyCombo, fromLegacyScrapeOptions, toLegacyDocument } from "../v1/types";
|
||||
|
||||
export async function searchHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
subscription_id: string,
|
||||
subscription_id: string | null | undefined,
|
||||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
searchOptions: SearchOptions,
|
||||
plan: PlanType
|
||||
plan: PlanType | undefined
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
|
@ -35,8 +36,8 @@ export async function searchHelper(
|
|||
return { success: false, error: "Query is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
const tbs = searchOptions.tbs ?? null;
|
||||
const filter = searchOptions.filter ?? null;
|
||||
const tbs = searchOptions.tbs ?? undefined;
|
||||
const filter = searchOptions.filter ?? undefined;
|
||||
let num_results = Math.min(searchOptions.limit ?? 7, 10);
|
||||
|
||||
if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
|
||||
|
@ -58,10 +59,11 @@ export async function searchHelper(
|
|||
|
||||
let justSearch = pageOptions.fetchPageContent === false;
|
||||
|
||||
const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, undefined, 60000, crawlerOptions);
|
||||
|
||||
if (justSearch) {
|
||||
billTeam(team_id, subscription_id, res.length).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
|
||||
logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
return { success: true, data: res, returnCode: 200 };
|
||||
|
@ -88,9 +90,9 @@ export async function searchHelper(
|
|||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
|
@ -104,7 +106,7 @@ export async function searchHelper(
|
|||
await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority)
|
||||
}
|
||||
|
||||
const docs = (await Promise.all(jobDatas.map(x => waitForJob(x.opts.jobId, 60000)))).map(x => x[0]);
|
||||
const docs = (await Promise.all(jobDatas.map(x => waitForJob<Document>(x.opts.jobId, 60000)))).map(x => toLegacyDocument(x, internalOptions));
|
||||
|
||||
if (docs.length === 0) {
|
||||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
|
@ -115,7 +117,7 @@ export async function searchHelper(
|
|||
|
||||
// make sure doc.content is not empty
|
||||
const filteredDocs = docs.filter(
|
||||
(doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0
|
||||
(doc: any) => doc && doc.content && doc.content.trim().length > 0
|
||||
);
|
||||
|
||||
if (filteredDocs.length === 0) {
|
||||
|
@ -132,14 +134,15 @@ export async function searchHelper(
|
|||
export async function searchController(req: Request, res: Response) {
|
||||
try {
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
|
||||
const auth = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Search
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
if (!auth.success) {
|
||||
return res.status(auth.status).json({ error: auth.error });
|
||||
}
|
||||
const { team_id, plan, chunk } = auth;
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? {
|
||||
includeHtml: req.body.pageOptions?.includeHtml ?? false,
|
||||
|
@ -162,7 +165,7 @@ export async function searchController(req: Request, res: Response) {
|
|||
}
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: "Internal server error" });
|
||||
}
|
||||
const startTime = new Date().getTime();
|
||||
|
@ -189,7 +192,6 @@ export async function searchController(req: Request, res: Response) {
|
|||
mode: "search",
|
||||
url: req.body.query,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
});
|
||||
return res.status(result.returnCode).json(result);
|
||||
|
@ -199,7 +201,7 @@ export async function searchController(req: Request, res: Response) {
|
|||
}
|
||||
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
logger.error("Unhandled error occurred in search", { error });
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import { Request, Response } from "express";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { getJobs } from "./crawl-status";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
@ -37,7 +37,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
|||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,8 +4,6 @@ import {
|
|||
BatchScrapeRequest,
|
||||
batchScrapeRequestSchema,
|
||||
CrawlResponse,
|
||||
legacyExtractorOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import {
|
||||
|
@ -29,19 +27,16 @@ export async function batchScrapeController(
|
|||
|
||||
await logCrawl(id, req.auth.team_id);
|
||||
|
||||
let { remainingCredits } = req.account;
|
||||
let { remainingCredits } = req.account!;
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if(!useDbAuthentication){
|
||||
remainingCredits = Infinity;
|
||||
}
|
||||
|
||||
const pageOptions = legacyScrapeOptions(req.body);
|
||||
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
|
||||
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
crawlerOptions: null,
|
||||
pageOptions,
|
||||
scrapeOptions: req.body,
|
||||
internalOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
|
@ -64,10 +59,9 @@ export async function batchScrapeController(
|
|||
url: x,
|
||||
mode: "single_urls" as const,
|
||||
team_id: req.auth.team_id,
|
||||
plan: req.auth.plan,
|
||||
plan: req.auth.plan!,
|
||||
crawlerOptions: null,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
scrapeOptions: req.body,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import { Response } from "express";
|
||||
import { supabase_service } from "../../services/supabase";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { logger } from "../../lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
|
@ -36,7 +36,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
|
|||
sc.cancelled = true;
|
||||
await saveCrawl(req.params.jobId, sc);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
}
|
||||
|
||||
res.json({
|
||||
|
@ -44,7 +44,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
|
|||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,14 +1,15 @@
|
|||
import { authMiddleware } from "../../routes/v1";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types";
|
||||
import { WebSocket } from "ws";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { logger } from "../../lib/logger";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { getJob, getJobs } from "./crawl-status";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { Job, JobState } from "bullmq";
|
||||
|
||||
type ErrorMessage = {
|
||||
type: "error",
|
||||
|
@ -56,7 +57,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||
return close(ws, 3003, { type: "error", error: "Forbidden" });
|
||||
}
|
||||
|
||||
let doneJobIDs = [];
|
||||
let doneJobIDs: string[] = [];
|
||||
let finished = false;
|
||||
|
||||
const loop = async () => {
|
||||
|
@ -70,15 +71,14 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||
|
||||
const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
|
||||
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
|
||||
const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
|
||||
|
||||
for (const jobID of newlyDoneJobIDs) {
|
||||
const job = await getJob(jobID);
|
||||
const newlyDoneJobIDs: string[] = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
|
||||
const newlyDoneJobs: Job[] = (await Promise.all(newlyDoneJobIDs.map(x => getJob(x)))).filter(x => x !== undefined) as Job[]
|
||||
|
||||
for (const job of newlyDoneJobs) {
|
||||
if (job.returnvalue) {
|
||||
send(ws, {
|
||||
type: "document",
|
||||
data: legacyDocumentConverter(job.returnvalue),
|
||||
data: job.returnvalue,
|
||||
})
|
||||
} else {
|
||||
return close(ws, 3000, { type: "error", error: job.failedReason });
|
||||
|
@ -100,8 +100,8 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||
|
||||
const throttledJobsSet = new Set(throttledJobs);
|
||||
|
||||
const validJobStatuses = [];
|
||||
const validJobIDs = [];
|
||||
const validJobStatuses: [string, JobState | "unknown"][] = [];
|
||||
const validJobIDs: string[] = [];
|
||||
|
||||
for (const [id, status] of jobStatuses) {
|
||||
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
||||
|
@ -126,7 +126,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||
completed: doneJobIDs.length,
|
||||
creditsUsed: jobIDs.length,
|
||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||
data: data.map(x => legacyDocumentConverter(x)),
|
||||
data: data,
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -139,19 +139,21 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||
// Basically just middleware and error wrapping
|
||||
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
||||
try {
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
const auth = await authenticateUser(
|
||||
req,
|
||||
null,
|
||||
RateLimiterMode.CrawlStatus,
|
||||
);
|
||||
|
||||
if (!success) {
|
||||
if (!auth.success) {
|
||||
return close(ws, 3000, {
|
||||
type: "error",
|
||||
error,
|
||||
error: auth.error,
|
||||
});
|
||||
}
|
||||
|
||||
const { team_id, plan } = auth;
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
|
||||
await crawlStatusWS(ws, req);
|
||||
|
@ -170,7 +172,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
|
|||
}
|
||||
}
|
||||
|
||||
Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
return close(ws, 1011, {
|
||||
type: "error",
|
||||
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
import { Response } from "express";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { Job, JobState } from "bullmq";
|
||||
configDotenv();
|
||||
|
||||
export async function getJob(id: string) {
|
||||
|
@ -24,7 +25,7 @@ export async function getJob(id: string) {
|
|||
}
|
||||
|
||||
export async function getJobs(ids: string[]) {
|
||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
||||
const jobs: (Job & { id: string })[] = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as (Job & {id: string})[];
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobsById(ids);
|
||||
|
@ -63,8 +64,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||
|
||||
const throttledJobsSet = new Set(throttledJobs);
|
||||
|
||||
const validJobStatuses = [];
|
||||
const validJobIDs = [];
|
||||
const validJobStatuses: [string, JobState | "unknown"][] = [];
|
||||
const validJobIDs: string[] = [];
|
||||
|
||||
for (const [id, status] of jobStatuses) {
|
||||
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
||||
|
@ -81,7 +82,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
|
||||
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
|
||||
|
||||
let doneJobs = [];
|
||||
let doneJobs: Job[] = [];
|
||||
|
||||
if (end === undefined) { // determine 10 megabyte limit
|
||||
let bytes = 0;
|
||||
|
@ -98,7 +99,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
|
||||
const job = jobs[ii];
|
||||
doneJobs.push(job);
|
||||
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
|
||||
bytes += JSON.stringify(job.returnvalue).length;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -122,7 +123,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||
}
|
||||
|
||||
if (data.length > 0) {
|
||||
if (!doneJobs[0].data.pageOptions.includeRawHtml) {
|
||||
if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) {
|
||||
for (let ii = 0; ii < doneJobs.length; ii++) {
|
||||
if (data[ii]) {
|
||||
delete data[ii].rawHtml;
|
||||
|
@ -142,7 +143,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
|
||||
? undefined
|
||||
: nextURL.href,
|
||||
data: data.map(x => legacyDocumentConverter(x)),
|
||||
data: data,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -4,9 +4,8 @@ import {
|
|||
CrawlRequest,
|
||||
crawlRequestSchema,
|
||||
CrawlResponse,
|
||||
legacyCrawlerOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
toLegacyCrawlerOptions,
|
||||
} from "./types";
|
||||
import {
|
||||
addCrawlJob,
|
||||
|
@ -20,9 +19,10 @@ import {
|
|||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob } from "../../services/queue-jobs";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { logger } from "../../lib/logger";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
import { callWebhook } from "../../services/webhook";
|
||||
import { scrapeOptions as scrapeOptionsSchema } from "./types";
|
||||
|
||||
export async function crawlController(
|
||||
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
||||
|
@ -34,18 +34,22 @@ export async function crawlController(
|
|||
|
||||
await logCrawl(id, req.auth.team_id);
|
||||
|
||||
let { remainingCredits } = req.account;
|
||||
let { remainingCredits } = req.account!;
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if(!useDbAuthentication){
|
||||
remainingCredits = Infinity;
|
||||
}
|
||||
|
||||
const crawlerOptions = legacyCrawlerOptions(req.body);
|
||||
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
|
||||
const crawlerOptions = {
|
||||
...req.body,
|
||||
url: undefined,
|
||||
scrapeOptions: undefined,
|
||||
};
|
||||
const scrapeOptions = req.body.scrapeOptions;
|
||||
|
||||
// TODO: @rafa, is this right? copied from v0
|
||||
if (Array.isArray(crawlerOptions.includes)) {
|
||||
for (const x of crawlerOptions.includes) {
|
||||
if (Array.isArray(crawlerOptions.includePaths)) {
|
||||
for (const x of crawlerOptions.includePaths) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
|
@ -54,8 +58,8 @@ export async function crawlController(
|
|||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(crawlerOptions.excludes)) {
|
||||
for (const x of crawlerOptions.excludes) {
|
||||
if (Array.isArray(crawlerOptions.excludePaths)) {
|
||||
for (const x of crawlerOptions.excludePaths) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
|
@ -68,8 +72,9 @@ export async function crawlController(
|
|||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
|
||||
scrapeOptions,
|
||||
internalOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
|
@ -78,9 +83,9 @@ export async function crawlController(
|
|||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
try {
|
||||
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
|
||||
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
|
||||
} catch (e) {
|
||||
Logger.debug(
|
||||
logger.debug(
|
||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||
e
|
||||
)}`
|
||||
|
@ -112,7 +117,7 @@ export async function crawlController(
|
|||
team_id: req.auth.team_id,
|
||||
plan: req.auth.plan,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
scrapeOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
|
@ -142,10 +147,10 @@ export async function crawlController(
|
|||
{
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: req.auth.team_id,
|
||||
plan: req.auth.plan,
|
||||
pageOptions: pageOptions,
|
||||
crawlerOptions,
|
||||
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
|
||||
plan: req.auth.plan!,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
webhook: req.body.webhook,
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import {
|
||||
legacyCrawlerOptions,
|
||||
mapRequestSchema,
|
||||
RequestWithAuth,
|
||||
scrapeOptions,
|
||||
} from "./types";
|
||||
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
||||
import { MapResponse, MapRequest } from "./types";
|
||||
|
@ -18,11 +18,11 @@ import { fireEngineMap } from "../../search/fireEngine";
|
|||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { performCosineSimilarity } from "../../lib/map-cosine";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { logger } from "../../lib/logger";
|
||||
import Redis from "ioredis";
|
||||
|
||||
configDotenv();
|
||||
const redis = new Redis(process.env.REDIS_URL);
|
||||
const redis = new Redis(process.env.REDIS_URL!);
|
||||
|
||||
// Max Links that /map can return
|
||||
const MAX_MAP_LIMIT = 5000;
|
||||
|
@ -44,8 +44,12 @@ export async function mapController(
|
|||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions: legacyCrawlerOptions(req.body),
|
||||
pageOptions: {},
|
||||
crawlerOptions: {
|
||||
...req.body,
|
||||
scrapeOptions: undefined,
|
||||
},
|
||||
scrapeOptions: scrapeOptions.parse({}),
|
||||
internalOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
|
@ -65,8 +69,8 @@ export async function mapController(
|
|||
const cacheKey = `fireEngineMap:${mapUrl}`;
|
||||
const cachedResult = null;
|
||||
|
||||
let allResults: any[];
|
||||
let pagePromises: Promise<any>[];
|
||||
let allResults: any[] = [];
|
||||
let pagePromises: Promise<any>[] = [];
|
||||
|
||||
if (cachedResult) {
|
||||
allResults = JSON.parse(cachedResult);
|
||||
|
@ -139,7 +143,7 @@ export async function mapController(
|
|||
return null;
|
||||
}
|
||||
})
|
||||
.filter((x) => x !== null);
|
||||
.filter((x) => x !== null) as string[];
|
||||
|
||||
// allows for subdomains to be included
|
||||
links = links.filter((x) => isSameDomain(x, req.body.url));
|
||||
|
@ -153,7 +157,7 @@ export async function mapController(
|
|||
links = removeDuplicateUrls(links);
|
||||
|
||||
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
|
||||
Logger.error(
|
||||
logger.error(
|
||||
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
|
||||
);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
|
@ -175,9 +179,8 @@ export async function mapController(
|
|||
mode: "map",
|
||||
url: req.body.url,
|
||||
crawlerOptions: {},
|
||||
pageOptions: {},
|
||||
scrapeOptions: {},
|
||||
origin: req.body.origin,
|
||||
extractor_options: { mode: "markdown" },
|
||||
num_tokens: 0,
|
||||
});
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ export async function scrapeStatusController(req: any, res: any) {
|
|||
|
||||
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
|
||||
|
||||
if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
|
||||
if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
|
||||
return res.status(403).json({
|
||||
success: false,
|
||||
error: "You are not allowed to access this resource.",
|
||||
|
|
|
@ -1,10 +1,7 @@
|
|||
import { Request, Response } from "express";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { Response } from "express";
|
||||
import { logger } from "../../lib/logger";
|
||||
import {
|
||||
Document,
|
||||
legacyDocumentConverter,
|
||||
legacyExtractorOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
ScrapeRequest,
|
||||
scrapeRequestSchema,
|
||||
|
@ -12,7 +9,6 @@ import {
|
|||
} from "./types";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
@ -28,8 +24,6 @@ export async function scrapeController(
|
|||
|
||||
const origin = req.body.origin;
|
||||
const timeout = req.body.timeout;
|
||||
const pageOptions = legacyScrapeOptions(req.body);
|
||||
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
|
@ -43,11 +37,10 @@ export async function scrapeController(
|
|||
{
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
plan: req.auth.plan,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
scrapeOptions: req.body,
|
||||
internalOptions: {},
|
||||
plan: req.auth.plan!,
|
||||
origin: req.body.origin,
|
||||
is_scrape: true,
|
||||
},
|
||||
|
@ -56,13 +49,13 @@ export async function scrapeController(
|
|||
jobPriority
|
||||
);
|
||||
|
||||
const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);
|
||||
const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
|
||||
|
||||
let doc: any | undefined;
|
||||
let doc: Document;
|
||||
try {
|
||||
doc = (await waitForJob(jobId, timeout + totalWait))[0];
|
||||
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
|
||||
} catch (e) {
|
||||
Logger.error(`Error in scrapeController: ${e}`);
|
||||
logger.error(`Error in scrapeController: ${e}`);
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
return res.status(408).json({
|
||||
success: false,
|
||||
|
@ -71,34 +64,19 @@ export async function scrapeController(
|
|||
} else {
|
||||
return res.status(500).json({
|
||||
success: false,
|
||||
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
|
||||
extractorOptions && extractorOptions.mode !== "markdown"
|
||||
? " - Could be due to LLM parsing issues"
|
||||
: ""
|
||||
}`,
|
||||
error: `(Internal server error) - ${e && e?.message ? e.message : e}`,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
await getScrapeQueue().remove(jobId);
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc);
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
warning: "No page found",
|
||||
data: doc,
|
||||
});
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens =
|
||||
doc && doc.markdown
|
||||
? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
|
||||
doc && doc.extract
|
||||
// ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
|
||||
? 0 // TODO: fix
|
||||
: 0;
|
||||
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
|
@ -111,22 +89,16 @@ export async function scrapeController(
|
|||
}
|
||||
|
||||
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
|
||||
Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
|
||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
||||
if (!req.body.formats.includes("rawHtml")) {
|
||||
if (doc && doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if(pageOptions && pageOptions.includeExtract) {
|
||||
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
|
||||
delete doc.markdown;
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: true,
|
||||
|
@ -137,16 +109,14 @@ export async function scrapeController(
|
|||
team_id: req.auth.team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: {},
|
||||
pageOptions: pageOptions,
|
||||
scrapeOptions: req.body,
|
||||
origin: origin,
|
||||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
data: legacyDocumentConverter(doc),
|
||||
data: doc,
|
||||
scrape_id: origin?.includes("website") ? jobId : undefined,
|
||||
});
|
||||
}
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
import { Request, Response } from "express";
|
||||
import { z } from "zod";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
|
||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||
import { PlanType } from "../../types";
|
||||
import { countries } from "../../lib/validate-country";
|
||||
import { ExtractorOptions, PageOptions, ScrapeActionContent, Document as V0Document } from "../../lib/entities";
|
||||
import { InternalOptions } from "../../scraper/scrapeURL";
|
||||
|
||||
export type Format =
|
||||
| "markdown"
|
||||
|
@ -167,6 +168,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({
|
|||
});
|
||||
|
||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
|
||||
|
||||
export const batchScrapeRequestSchema = scrapeOptions.extend({
|
||||
urls: url.array(),
|
||||
|
@ -240,7 +242,7 @@ export const mapRequestSchema = crawlerOptions.extend({
|
|||
includeSubdomains: z.boolean().default(true),
|
||||
search: z.string().optional(),
|
||||
ignoreSitemap: z.boolean().default(false),
|
||||
limit: z.number().min(1).max(5000).default(5000).optional(),
|
||||
limit: z.number().min(1).max(5000).default(5000),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type MapRequest = {
|
||||
|
@ -252,13 +254,14 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;
|
|||
|
||||
export type Document = {
|
||||
markdown?: string;
|
||||
extract?: string;
|
||||
extract?: any;
|
||||
html?: string;
|
||||
rawHtml?: string;
|
||||
links?: string[];
|
||||
screenshot?: string;
|
||||
actions?: {
|
||||
screenshots: string[];
|
||||
screenshots?: string[];
|
||||
scrapes?: ScrapeActionContent[];
|
||||
};
|
||||
warning?: string;
|
||||
metadata: {
|
||||
|
@ -291,11 +294,11 @@ export type Document = {
|
|||
publishedTime?: string;
|
||||
articleTag?: string;
|
||||
articleSection?: string;
|
||||
url?: string;
|
||||
sourceURL?: string;
|
||||
statusCode?: number;
|
||||
error?: string;
|
||||
[key: string]: string | string[] | number | undefined;
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -366,7 +369,7 @@ export type CrawlStatusResponse =
|
|||
|
||||
type AuthObject = {
|
||||
team_id: string;
|
||||
plan: PlanType;
|
||||
plan: PlanType | undefined;
|
||||
};
|
||||
|
||||
type Account = {
|
||||
|
@ -439,7 +442,7 @@ export interface ResponseWithSentry<
|
|||
sentry?: string,
|
||||
}
|
||||
|
||||
export function legacyCrawlerOptions(x: CrawlerOptions) {
|
||||
export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
||||
return {
|
||||
includes: x.includePaths,
|
||||
excludes: x.excludePaths,
|
||||
|
@ -453,68 +456,90 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
|
|||
};
|
||||
}
|
||||
|
||||
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
||||
export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } {
|
||||
return {
|
||||
includeMarkdown: x.formats.includes("markdown"),
|
||||
includeHtml: x.formats.includes("html"),
|
||||
includeRawHtml: x.formats.includes("rawHtml"),
|
||||
includeExtract: x.formats.includes("extract"),
|
||||
onlyIncludeTags: x.includeTags,
|
||||
removeTags: x.excludeTags,
|
||||
onlyMainContent: x.onlyMainContent,
|
||||
waitFor: x.waitFor,
|
||||
headers: x.headers,
|
||||
includeLinks: x.formats.includes("links"),
|
||||
screenshot: x.formats.includes("screenshot"),
|
||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||
parsePDF: x.parsePDF,
|
||||
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
||||
geolocation: x.location ?? x.geolocation,
|
||||
skipTlsVerification: x.skipTlsVerification,
|
||||
removeBase64Images: x.removeBase64Images,
|
||||
mobile: x.mobile,
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
|
||||
return {
|
||||
mode: x.mode ? "llm-extraction" : "markdown",
|
||||
extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
|
||||
extractionSchema: x.schema,
|
||||
userPrompt: x.prompt ?? "",
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyDocumentConverter(doc: any): Document {
|
||||
if (doc === null || doc === undefined) return null;
|
||||
|
||||
if (doc.metadata) {
|
||||
if (doc.metadata.screenshot) {
|
||||
doc.screenshot = doc.metadata.screenshot;
|
||||
delete doc.metadata.screenshot;
|
||||
}
|
||||
|
||||
if (doc.metadata.fullPageScreenshot) {
|
||||
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
|
||||
delete doc.metadata.fullPageScreenshot;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
markdown: doc.markdown,
|
||||
links: doc.linksOnPage,
|
||||
rawHtml: doc.rawHtml,
|
||||
html: doc.html,
|
||||
extract: doc.llm_extraction,
|
||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
||||
actions: doc.actions ?? undefined,
|
||||
warning: doc.warning ?? undefined,
|
||||
metadata: {
|
||||
...doc.metadata,
|
||||
pageError: undefined,
|
||||
pageStatusCode: undefined,
|
||||
error: doc.metadata?.pageError,
|
||||
statusCode: doc.metadata?.pageStatusCode,
|
||||
crawlOptions: crawlerOptions.parse({
|
||||
includePaths: x.includes,
|
||||
excludePaths: x.excludes,
|
||||
limit: x.maxCrawledLinks ?? x.limit,
|
||||
maxDepth: x.maxDepth,
|
||||
allowBackwardLinks: x.allowBackwardCrawling,
|
||||
allowExternalLinks: x.allowExternalContentLinks,
|
||||
ignoreSitemap: x.ignoreSitemap,
|
||||
// TODO: returnOnlyUrls support
|
||||
}),
|
||||
internalOptions: {
|
||||
v0CrawlOnlyUrls: x.returnOnlyUrls,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
|
||||
return {
|
||||
scrapeOptions: scrapeOptions.parse({
|
||||
formats: [
|
||||
(pageOptions.includeMarkdown ?? true) ? "markdown" as const : null,
|
||||
(pageOptions.includeHtml ?? false) ? "html" as const : null,
|
||||
(pageOptions.includeRawHtml ?? false) ? "rawHtml" as const : null,
|
||||
(pageOptions.screenshot ?? false) ? "screenshot" as const : null,
|
||||
(pageOptions.fullPageScreenshot ?? false) ? "screenshot@fullPage" as const : null,
|
||||
(extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction")) ? "extract" as const : null,
|
||||
"links"
|
||||
].filter(x => x !== null),
|
||||
waitFor: pageOptions.waitFor,
|
||||
headers: pageOptions.headers,
|
||||
includeTags: (typeof pageOptions.onlyIncludeTags === "string" ? [pageOptions.onlyIncludeTags] : pageOptions.onlyIncludeTags),
|
||||
excludeTags: (typeof pageOptions.removeTags === "string" ? [pageOptions.removeTags] : pageOptions.removeTags),
|
||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||
timeout: timeout,
|
||||
parsePDF: pageOptions.parsePDF,
|
||||
actions: pageOptions.actions,
|
||||
location: pageOptions.geolocation,
|
||||
skipTlsVerification: pageOptions.skipTlsVerification,
|
||||
removeBase64Images: pageOptions.removeBase64Images,
|
||||
extract: extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction") ? {
|
||||
systemPrompt: extractorOptions.extractionPrompt,
|
||||
prompt: extractorOptions.userPrompt,
|
||||
schema: extractorOptions.extractionSchema,
|
||||
} : undefined,
|
||||
mobile: pageOptions.mobile,
|
||||
}),
|
||||
internalOptions: {
|
||||
atsv: pageOptions.atsv,
|
||||
v0DisableJsDom: pageOptions.disableJsDom,
|
||||
v0UseFastMode: pageOptions.useFastMode,
|
||||
},
|
||||
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
|
||||
}
|
||||
}
|
||||
|
||||
export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} {
|
||||
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
|
||||
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
|
||||
return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
|
||||
}
|
||||
|
||||
export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } {
|
||||
if (internalOptions.v0CrawlOnlyUrls) {
|
||||
return { url: document.metadata.sourceURL! };
|
||||
}
|
||||
|
||||
return {
|
||||
content: document.markdown!,
|
||||
markdown: document.markdown!,
|
||||
html: document.html,
|
||||
rawHtml: document.rawHtml,
|
||||
linksOnPage: document.links,
|
||||
llm_extraction: document.extract,
|
||||
metadata: {
|
||||
...document.metadata,
|
||||
error: undefined,
|
||||
statusCode: undefined,
|
||||
pageError: document.metadata.error,
|
||||
pageStatusCode: document.metadata.statusCode,
|
||||
screenshot: document.screenshot,
|
||||
},
|
||||
actions: document.actions ,
|
||||
warning: document.warning,
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,19 +0,0 @@
|
|||
import { WebScraperDataProvider } from "./scraper/WebScraper";
|
||||
|
||||
async function example() {
|
||||
const example = new WebScraperDataProvider();
|
||||
|
||||
await example.setOptions({
|
||||
jobId: "TEST",
|
||||
mode: "crawl",
|
||||
urls: ["https://mendable.ai"],
|
||||
crawlerOptions: {},
|
||||
});
|
||||
const docs = await example.getDocuments(false);
|
||||
docs.map((doc) => {
|
||||
console.log(doc.metadata.sourceURL);
|
||||
});
|
||||
console.log(docs.length);
|
||||
}
|
||||
|
||||
// example();
|
|
@ -6,28 +6,24 @@ import bodyParser from "body-parser";
|
|||
import cors from "cors";
|
||||
import { getScrapeQueue } from "./services/queue-service";
|
||||
import { v0Router } from "./routes/v0";
|
||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||
import os from "os";
|
||||
import { Logger } from "./lib/logger";
|
||||
import { logger } from "./lib/logger";
|
||||
import { adminRouter } from "./routes/admin";
|
||||
import { ScrapeEvents } from "./lib/scrape-events";
|
||||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import CacheableLookup from 'cacheable-lookup';
|
||||
import { v1Router } from "./routes/v1";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
|
||||
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||
import { ZodError } from "zod";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import dns from 'node:dns';
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
const { ExpressAdapter } = require("@bull-board/express");
|
||||
|
||||
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
||||
Logger.info(`Number of CPUs: ${numCPUs} available`);
|
||||
logger.info(`Number of CPUs: ${numCPUs} available`);
|
||||
|
||||
const cacheable = new CacheableLookup()
|
||||
|
||||
|
@ -55,7 +51,6 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
|||
serverAdapter: serverAdapter,
|
||||
});
|
||||
|
||||
|
||||
app.use(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
||||
serverAdapter.getRouter()
|
||||
|
@ -78,15 +73,10 @@ app.use(adminRouter);
|
|||
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
const HOST = process.env.HOST ?? "localhost";
|
||||
|
||||
// HyperDX OpenTelemetry
|
||||
if (process.env.ENV === "production") {
|
||||
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
|
||||
}
|
||||
|
||||
function startServer(port = DEFAULT_PORT) {
|
||||
const server = app.listen(Number(port), HOST, () => {
|
||||
Logger.info(`Worker ${process.pid} listening on port ${port}`);
|
||||
Logger.info(
|
||||
logger.info(`Worker ${process.pid} listening on port ${port}`);
|
||||
logger.info(
|
||||
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
||||
);
|
||||
});
|
||||
|
@ -103,7 +93,6 @@ app.get(`/serverHealthCheck`, async (req, res) => {
|
|||
const [waitingJobs] = await Promise.all([
|
||||
scrapeQueue.getWaitingCount(),
|
||||
]);
|
||||
|
||||
const noWaitingJobs = waitingJobs === 0;
|
||||
// 200 if no active jobs, 503 if there are active jobs
|
||||
return res.status(noWaitingJobs ? 200 : 500).json({
|
||||
|
@ -111,7 +100,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
|
|||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
|
@ -140,7 +129,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
|||
// Re-check the waiting jobs count after the timeout
|
||||
waitingJobsCount = await getWaitingJobsCount();
|
||||
if (waitingJobsCount >= treshold) {
|
||||
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
|
||||
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL!;
|
||||
const message = {
|
||||
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
|
||||
timeout / 60000
|
||||
|
@ -156,14 +145,14 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
|||
});
|
||||
|
||||
if (!response.ok) {
|
||||
Logger.error("Failed to send Slack notification");
|
||||
logger.error("Failed to send Slack notification");
|
||||
}
|
||||
}
|
||||
}, timeout);
|
||||
}
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.debug(error);
|
||||
logger.debug(error);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -178,7 +167,7 @@ app.get("/is-production", (req, res) => {
|
|||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
|
||||
if (err instanceof ZodError) {
|
||||
if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) {
|
||||
Logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
|
||||
logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
|
||||
}
|
||||
|
||||
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
|
||||
|
@ -206,11 +195,11 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
|
|||
}
|
||||
}
|
||||
|
||||
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
|
||||
});
|
||||
|
||||
Logger.info(`Worker ${process.pid} started`);
|
||||
logger.info(`Worker ${process.pid} started`);
|
||||
|
||||
// const sq = getScrapeQueue();
|
||||
|
||||
|
|
|
@ -4,19 +4,19 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
|||
|
||||
import { generateOpenAICompletions } from "./models";
|
||||
import { Document, ExtractorOptions } from "../entities";
|
||||
import { Logger } from "../logger";
|
||||
import { logger } from "../logger";
|
||||
|
||||
// Generate completion using OpenAI
|
||||
export async function generateCompletions(
|
||||
documents: Document[],
|
||||
extractionOptions: ExtractorOptions,
|
||||
extractionOptions: ExtractorOptions | undefined,
|
||||
mode: "markdown" | "raw-html"
|
||||
): Promise<Document[]> {
|
||||
// const schema = zodToJsonSchema(options.schema)
|
||||
|
||||
const schema = extractionOptions.extractionSchema;
|
||||
const systemPrompt = extractionOptions.extractionPrompt;
|
||||
const prompt = extractionOptions.userPrompt;
|
||||
const schema = extractionOptions?.extractionSchema;
|
||||
const systemPrompt = extractionOptions?.extractionPrompt;
|
||||
const prompt = extractionOptions?.userPrompt;
|
||||
|
||||
const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider
|
||||
|
||||
|
@ -51,7 +51,7 @@ export async function generateCompletions(
|
|||
|
||||
return completionResult;
|
||||
} catch (error) {
|
||||
Logger.error(`Error generating completions: ${error}`);
|
||||
logger.error(`Error generating completions: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
default:
|
||||
|
|
|
@ -95,7 +95,7 @@ export async function generateOpenAICompletions({
|
|||
|
||||
try {
|
||||
llmExtraction = JSON.parse(
|
||||
jsonCompletion.choices[0].message.content.trim()
|
||||
(jsonCompletion.choices[0].message.content ?? "").trim()
|
||||
);
|
||||
} catch (e) {
|
||||
throw new Error("Invalid JSON");
|
||||
|
|
|
@ -3,7 +3,7 @@ export async function batchProcess<T>(
|
|||
batchSize: number,
|
||||
asyncFunction: (item: T, index: number) => Promise<void>
|
||||
): Promise<void> {
|
||||
const batches = [];
|
||||
const batches: T[][] = [];
|
||||
for (let i = 0; i < array.length; i += batchSize) {
|
||||
const batch = array.slice(i, i + batchSize);
|
||||
batches.push(batch);
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
import { InternalOptions } from "../scraper/scrapeURL";
|
||||
import { ScrapeOptions } from "../controllers/v1/types";
|
||||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||
import { redisConnection } from "../services/queue-service";
|
||||
import { Logger } from "./logger";
|
||||
import { logger } from "./logger";
|
||||
|
||||
export type StoredCrawl = {
|
||||
originUrl?: string;
|
||||
crawlerOptions: any;
|
||||
pageOptions: any;
|
||||
scrapeOptions: Omit<ScrapeOptions, "timeout">;
|
||||
internalOptions: InternalOptions;
|
||||
team_id: string;
|
||||
plan: string;
|
||||
plan?: string;
|
||||
robots?: string;
|
||||
cancelled?: boolean;
|
||||
createdAt: number;
|
||||
|
@ -100,7 +103,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
|||
urlO.hash = "";
|
||||
url = urlO.href;
|
||||
} catch (error) {
|
||||
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||
logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||
}
|
||||
|
||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||
|
@ -117,7 +120,7 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
|
|||
urlO.hash = "";
|
||||
return urlO.href;
|
||||
} catch (error) {
|
||||
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||
logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||
}
|
||||
|
||||
return url;
|
||||
|
@ -131,7 +134,7 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
|
|||
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
||||
const crawler = new WebCrawler({
|
||||
jobId: id,
|
||||
initialUrl: sc.originUrl,
|
||||
initialUrl: sc.originUrl!,
|
||||
includes: sc.crawlerOptions?.includes ?? [],
|
||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import type { Document as V1Document } from "../controllers/v1/types";
|
||||
|
||||
export interface Progress {
|
||||
current: number;
|
||||
total: number;
|
||||
|
@ -129,7 +131,8 @@ export class Document {
|
|||
provider?: string;
|
||||
warning?: string;
|
||||
actions?: {
|
||||
screenshots: string[];
|
||||
screenshots?: string[];
|
||||
scrapes?: ScrapeActionContent[];
|
||||
}
|
||||
|
||||
index?: number;
|
||||
|
|
|
@ -5,7 +5,7 @@ import "../services/sentry"
|
|||
import * as Sentry from "@sentry/node";
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import { Logger } from './logger';
|
||||
import { logger } from './logger';
|
||||
dotenv.config();
|
||||
|
||||
// TODO: add a timeout to the Go parser
|
||||
|
@ -40,7 +40,7 @@ class GoMarkdownConverter {
|
|||
}
|
||||
}
|
||||
|
||||
export async function parseMarkdown(html: string): Promise<string> {
|
||||
export async function parseMarkdown(html: string | null | undefined): Promise<string> {
|
||||
if (!html) {
|
||||
return '';
|
||||
}
|
||||
|
@ -52,12 +52,12 @@ export async function parseMarkdown(html: string): Promise<string> {
|
|||
|
||||
markdownContent = processMultiLineLinks(markdownContent);
|
||||
markdownContent = removeSkipToContentLinks(markdownContent);
|
||||
Logger.info(`HTML to Markdown conversion using Go parser successful`);
|
||||
logger.info(`HTML to Markdown conversion using Go parser successful`);
|
||||
return markdownContent;
|
||||
}
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
|
||||
logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
|
||||
}
|
||||
|
||||
// Fallback to TurndownService if Go parser fails or is not enabled
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import { redisConnection } from "../../src/services/queue-service";
|
||||
import { PlanType } from "../../src/types";
|
||||
import { Logger } from "./logger";
|
||||
import { logger } from "./logger";
|
||||
|
||||
const SET_KEY_PREFIX = "limit_team_id:";
|
||||
export async function addJobPriority(team_id, job_id) {
|
||||
|
@ -13,7 +13,7 @@ export async function addJobPriority(team_id, job_id) {
|
|||
// This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
|
||||
await redisConnection.expire(setKey, 60);
|
||||
} catch (e) {
|
||||
Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
|
||||
logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -24,7 +24,7 @@ export async function deleteJobPriority(team_id, job_id) {
|
|||
// remove job_id from the set
|
||||
await redisConnection.srem(setKey, job_id);
|
||||
} catch (e) {
|
||||
Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
|
||||
logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -33,7 +33,7 @@ export async function getJobPriority({
|
|||
team_id,
|
||||
basePriority = 10,
|
||||
}: {
|
||||
plan: PlanType;
|
||||
plan: PlanType | undefined;
|
||||
team_id: string;
|
||||
basePriority?: number;
|
||||
}): Promise<number> {
|
||||
|
@ -95,7 +95,7 @@ export async function getJobPriority({
|
|||
);
|
||||
}
|
||||
} catch (e) {
|
||||
Logger.error(
|
||||
logger.error(
|
||||
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
|
||||
);
|
||||
return basePriority;
|
||||
|
|
|
@ -1,42 +0,0 @@
|
|||
// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
|
||||
|
||||
// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
|
||||
|
||||
// const scrapInBatches = async (
|
||||
// urls: string[],
|
||||
// batchSize: number,
|
||||
// delayMs: number
|
||||
// ) => {
|
||||
// let successCount = 0;
|
||||
// let errorCount = 0;
|
||||
|
||||
// for (let i = 0; i < urls.length; i += batchSize) {
|
||||
// const batch = urls
|
||||
// .slice(i, i + batchSize)
|
||||
// .map((url) => scrapWithFireEngine(url));
|
||||
// try {
|
||||
// const results = await Promise.all(batch);
|
||||
// results.forEach((data, index) => {
|
||||
// if (data.trim() === "") {
|
||||
// errorCount++;
|
||||
// } else {
|
||||
// successCount++;
|
||||
// console.log(
|
||||
// `Scraping result ${i + index + 1}:`,
|
||||
// data.trim().substring(0, 20) + "..."
|
||||
// );
|
||||
// }
|
||||
// });
|
||||
// } catch (error) {
|
||||
// console.error("Error during scraping:", error);
|
||||
// }
|
||||
// await delay(delayMs);
|
||||
// }
|
||||
|
||||
// console.log(`Total successful scrapes: ${successCount}`);
|
||||
// console.log(`Total errored scrapes: ${errorCount}`);
|
||||
// };
|
||||
// function run() {
|
||||
// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
|
||||
// scrapInBatches(urls, 10, 1000);
|
||||
// }
|
|
@ -1,57 +1,82 @@
|
|||
import * as winston from "winston";
|
||||
|
||||
import { configDotenv } from "dotenv";
|
||||
import Transport from "winston-transport";
|
||||
configDotenv();
|
||||
|
||||
enum LogLevel {
|
||||
NONE = 'NONE', // No logs will be output.
|
||||
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
|
||||
WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors.
|
||||
INFO = 'INFO', // For logging informational messages that highlight the progress of the application.
|
||||
DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging.
|
||||
TRACE = 'TRACE' // For logging more detailed information than the DEBUG level.
|
||||
}
|
||||
export class Logger {
|
||||
static colors = {
|
||||
ERROR: '\x1b[31m%s\x1b[0m', // Red
|
||||
WARN: '\x1b[33m%s\x1b[0m', // Yellow
|
||||
INFO: '\x1b[34m%s\x1b[0m', // Blue
|
||||
DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
|
||||
TRACE: '\x1b[35m%s\x1b[0m' // Magenta
|
||||
};
|
||||
|
||||
static log (message: string, level: LogLevel) {
|
||||
const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE;
|
||||
const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
|
||||
const currentLevelIndex = levels.indexOf(logLevel);
|
||||
const messageLevelIndex = levels.indexOf(level);
|
||||
|
||||
if (currentLevelIndex >= messageLevelIndex) {
|
||||
const color = Logger.colors[level];
|
||||
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
|
||||
|
||||
// const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
// if (useDbAuthentication) {
|
||||
// save to supabase? another place?
|
||||
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
|
||||
// }
|
||||
const logFormat = winston.format.printf(info =>
|
||||
`${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${info.level.includes("error") || info.level.includes("warn") ? JSON.stringify(
|
||||
info.metadata,
|
||||
(_, value) => {
|
||||
if (value instanceof Error) {
|
||||
return {
|
||||
...value,
|
||||
name: value.name,
|
||||
message: value.message,
|
||||
stack: value.stack,
|
||||
cause: value.cause,
|
||||
}
|
||||
} else {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
static error(message: string | any) {
|
||||
Logger.log(message, LogLevel.ERROR);
|
||||
) : ""}`
|
||||
)
|
||||
|
||||
export const logger = winston.createLogger({
|
||||
level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug",
|
||||
format: winston.format.json({
|
||||
replacer(key, value) {
|
||||
if (value instanceof Error) {
|
||||
return {
|
||||
...value,
|
||||
name: value.name,
|
||||
message: value.message,
|
||||
stack: value.stack,
|
||||
cause: value.cause,
|
||||
}
|
||||
} else {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
}),
|
||||
transports: [
|
||||
new winston.transports.Console({
|
||||
format: winston.format.combine(
|
||||
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
||||
winston.format.metadata({ fillExcept: ["message", "level", "timestamp"] }),
|
||||
...(((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || (process.env.ENV !== "production")) ? [winston.format.colorize(), logFormat] : []),
|
||||
),
|
||||
}),
|
||||
],
|
||||
});
|
||||
|
||||
export type ArrayTransportOptions = Transport.TransportStreamOptions & {
|
||||
array: any[];
|
||||
scrapeId?: string;
|
||||
};
|
||||
|
||||
export class ArrayTransport extends Transport {
|
||||
private array: any[];
|
||||
private scrapeId?: string;
|
||||
|
||||
constructor(opts: ArrayTransportOptions) {
|
||||
super(opts);
|
||||
this.array = opts.array;
|
||||
this.scrapeId = opts.scrapeId;
|
||||
}
|
||||
|
||||
static warn(message: string) {
|
||||
Logger.log(message, LogLevel.WARN);
|
||||
log(info, next) {
|
||||
setImmediate(() => {
|
||||
this.emit("logged", info);
|
||||
});
|
||||
|
||||
if (this.scrapeId !== undefined && info.scrapeId !== this.scrapeId) {
|
||||
return next();
|
||||
}
|
||||
|
||||
static info(message: string) {
|
||||
Logger.log(message, LogLevel.INFO);
|
||||
}
|
||||
this.array.push(info);
|
||||
|
||||
static debug(message: string) {
|
||||
Logger.log(message, LogLevel.DEBUG);
|
||||
}
|
||||
|
||||
static trace(message: string) {
|
||||
Logger.log(message, LogLevel.TRACE);
|
||||
next();
|
||||
}
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
import { Logger } from "./logger";
|
||||
import { logger } from "./logger";
|
||||
|
||||
export function performCosineSimilarity(links: string[], searchQuery: string) {
|
||||
try {
|
||||
|
@ -40,7 +40,7 @@ export function performCosineSimilarity(links: string[], searchQuery: string) {
|
|||
links = a.map((item) => item.link);
|
||||
return links;
|
||||
} catch (error) {
|
||||
Logger.error(`Error performing cosine similarity: ${error}`);
|
||||
logger.error(`Error performing cosine similarity: ${error}`);
|
||||
return links;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import { Job } from "bullmq";
|
||||
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
||||
import { supabase_service as supabase } from "../services/supabase";
|
||||
import { Logger } from "./logger";
|
||||
import { logger } from "./logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { Engine } from "../scraper/scrapeURL/engines";
|
||||
configDotenv();
|
||||
|
||||
export type ScrapeErrorEvent = {
|
||||
|
@ -15,7 +15,7 @@ export type ScrapeScrapeEvent = {
|
|||
type: "scrape",
|
||||
url: string,
|
||||
worker?: string,
|
||||
method: (typeof baseScrapers)[number],
|
||||
method: Engine,
|
||||
result: null | {
|
||||
success: boolean,
|
||||
response_code?: number,
|
||||
|
@ -49,7 +49,7 @@ export class ScrapeEvents {
|
|||
}).select().single();
|
||||
return (result.data as any).id;
|
||||
} catch (error) {
|
||||
// Logger.error(`Error inserting scrape event: ${error}`);
|
||||
// logger.error(`Error inserting scrape event: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
@ -69,7 +69,7 @@ export class ScrapeEvents {
|
|||
}
|
||||
}).eq("id", logId);
|
||||
} catch (error) {
|
||||
Logger.error(`Error updating scrape result: ${error}`);
|
||||
logger.error(`Error updating scrape result: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -81,7 +81,7 @@ export class ScrapeEvents {
|
|||
worker: process.env.FLY_MACHINE_ID,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Error logging job event: ${error}`);
|
||||
logger.error(`Error logging job event: ${error}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import { supabase_service } from "../services/supabase";
|
||||
import { Logger } from "./logger";
|
||||
import { logger } from "./logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
/**
|
||||
|
@ -37,7 +37,7 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
|
|||
.in("job_id", jobIds);
|
||||
|
||||
if (error) {
|
||||
Logger.error(`Error in supabaseGetJobsById: ${error}`);
|
||||
logger.error(`Error in supabaseGetJobsById: ${error}`);
|
||||
Sentry.captureException(error);
|
||||
return [];
|
||||
}
|
||||
|
@ -61,7 +61,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
|
|||
.eq("crawl_id", crawlId)
|
||||
|
||||
if (error) {
|
||||
Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
|
||||
logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
|
||||
Sentry.captureException(error);
|
||||
return [];
|
||||
}
|
||||
|
|
|
@ -1,30 +1,25 @@
|
|||
import { AuthResponse } from "../../src/types";
|
||||
import { Logger } from "./logger";
|
||||
import { logger } from "./logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
let warningCount = 0;
|
||||
|
||||
export function withAuth<T extends AuthResponse, U extends any[]>(
|
||||
originalFunction: (...args: U) => Promise<T>
|
||||
export function withAuth<T, U extends any[]>(
|
||||
originalFunction: (...args: U) => Promise<T>,
|
||||
mockSuccess: T,
|
||||
) {
|
||||
return async function (...args: U): Promise<T> {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
if (warningCount < 5) {
|
||||
Logger.warn("You're bypassing authentication");
|
||||
logger.warn("You're bypassing authentication");
|
||||
warningCount++;
|
||||
}
|
||||
return { success: true } as T;
|
||||
} else {
|
||||
try {
|
||||
return await originalFunction(...args);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error in withAuth function: ${error}`);
|
||||
return { success: false, error: error.message } as T;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -1,151 +1,127 @@
|
|||
import { Job } from "bullmq";
|
||||
import {
|
||||
CrawlResult,
|
||||
WebScraperOptions,
|
||||
RunWebScraperParams,
|
||||
RunWebScraperResult,
|
||||
} from "../types";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { DocumentUrl, Progress } from "../lib/entities";
|
||||
import { billTeam } from "../services/billing/credit_billing";
|
||||
import { Document } from "../lib/entities";
|
||||
import { Document } from "../controllers/v1/types";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
|
||||
import { Engine } from "../scraper/scrapeURL/engines";
|
||||
configDotenv();
|
||||
|
||||
export async function startWebScraperPipeline({
|
||||
job,
|
||||
token,
|
||||
}: {
|
||||
job: Job<WebScraperOptions>;
|
||||
job: Job<WebScraperOptions> & { id: string };
|
||||
token: string;
|
||||
}) {
|
||||
let partialDocs: Document[] = [];
|
||||
return (await runWebScraper({
|
||||
url: job.data.url,
|
||||
mode: job.data.mode,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
extractorOptions: job.data.extractorOptions,
|
||||
pageOptions: {
|
||||
...job.data.pageOptions,
|
||||
scrapeOptions: {
|
||||
...job.data.scrapeOptions,
|
||||
...(job.data.crawl_id ? ({
|
||||
includeRawHtml: true,
|
||||
formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
|
||||
}): {}),
|
||||
},
|
||||
inProgress: (progress) => {
|
||||
Logger.debug(`🐂 Job in progress ${job.id}`);
|
||||
if (progress.currentDocument) {
|
||||
partialDocs.push(progress.currentDocument);
|
||||
if (partialDocs.length > 50) {
|
||||
partialDocs = partialDocs.slice(-50);
|
||||
}
|
||||
// job.updateProgress({ ...progress, partialDocs: partialDocs });
|
||||
}
|
||||
},
|
||||
onSuccess: (result, mode) => {
|
||||
Logger.debug(`🐂 Job completed ${job.id}`);
|
||||
saveJob(job, result, token, mode);
|
||||
},
|
||||
onError: (error) => {
|
||||
Logger.error(`🐂 Job failed ${job.id}`);
|
||||
ScrapeEvents.logJobEvent(job, "failed");
|
||||
job.moveToFailed(error, token, false);
|
||||
},
|
||||
internalOptions: job.data.internalOptions,
|
||||
// onSuccess: (result, mode) => {
|
||||
// logger.debug(`🐂 Job completed ${job.id}`);
|
||||
// saveJob(job, result, token, mode);
|
||||
// },
|
||||
// onError: (error) => {
|
||||
// logger.error(`🐂 Job failed ${job.id}`);
|
||||
// ScrapeEvents.logJobEvent(job, "failed");
|
||||
// },
|
||||
team_id: job.data.team_id,
|
||||
bull_job_id: job.id.toString(),
|
||||
priority: job.opts.priority,
|
||||
is_scrape: job.data.is_scrape ?? false,
|
||||
})) as { success: boolean; message: string; docs: Document[] };
|
||||
}));
|
||||
}
|
||||
|
||||
export async function runWebScraper({
|
||||
url,
|
||||
mode,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
inProgress,
|
||||
onSuccess,
|
||||
onError,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
// onSuccess,
|
||||
// onError,
|
||||
team_id,
|
||||
bull_job_id,
|
||||
priority,
|
||||
is_scrape=false,
|
||||
}: RunWebScraperParams): Promise<RunWebScraperResult> {
|
||||
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
|
||||
let response: ScrapeUrlResponse | undefined = undefined;
|
||||
let engines: EngineResultsTracker = {};
|
||||
try {
|
||||
const provider = new WebScraperDataProvider();
|
||||
if (mode === "crawl") {
|
||||
await provider.setOptions({
|
||||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: [url],
|
||||
extractorOptions,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
bullJobId: bull_job_id,
|
||||
priority,
|
||||
});
|
||||
response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
|
||||
if (!response.success) {
|
||||
if (response.error instanceof Error) {
|
||||
throw response.error;
|
||||
} else {
|
||||
await provider.setOptions({
|
||||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: url.split(","),
|
||||
extractorOptions,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
priority,
|
||||
teamId: team_id
|
||||
});
|
||||
throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error));
|
||||
}
|
||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||
inProgress(progress);
|
||||
})) as Document[];
|
||||
|
||||
if (docs.length === 0) {
|
||||
return {
|
||||
success: true,
|
||||
message: "No pages found",
|
||||
docs: [],
|
||||
};
|
||||
}
|
||||
|
||||
// remove docs with empty content
|
||||
const filteredDocs = crawlerOptions?.returnOnlyUrls
|
||||
? docs.map((doc) => {
|
||||
if (doc.metadata.sourceURL) {
|
||||
return { url: doc.metadata.sourceURL };
|
||||
}
|
||||
})
|
||||
: docs;
|
||||
|
||||
if(is_scrape === false) {
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
|
||||
if (scrapeOptions.extract) {
|
||||
creditsToBeBilled = 5;
|
||||
}
|
||||
|
||||
billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
|
||||
billTeam(team_id, undefined, creditsToBeBilled).catch(error => {
|
||||
logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
// This is where the returnvalue from the job is set
|
||||
onSuccess(filteredDocs, mode);
|
||||
// onSuccess(response.document, mode);
|
||||
|
||||
// this return doesn't matter too much for the job completion result
|
||||
return { success: true, message: "", docs: filteredDocs };
|
||||
engines = response.engines;
|
||||
return response;
|
||||
} catch (error) {
|
||||
onError(error);
|
||||
return { success: false, message: error.message, docs: [] };
|
||||
engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));
|
||||
|
||||
if (response !== undefined) {
|
||||
return {
|
||||
...response,
|
||||
success: false,
|
||||
error,
|
||||
}
|
||||
} else {
|
||||
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
|
||||
}
|
||||
// onError(error);
|
||||
} finally {
|
||||
const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];
|
||||
|
||||
for (const engine of engineOrder) {
|
||||
const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
|
||||
ScrapeEvents.insert(bull_job_id, {
|
||||
type: "scrape",
|
||||
url,
|
||||
method: engine,
|
||||
result: {
|
||||
success: result.state === "success",
|
||||
response_code: (result.state === "success" ? result.result.statusCode : undefined),
|
||||
response_size: (result.state === "success" ? result.result.html.length : undefined),
|
||||
error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
|
||||
time_taken: result.finishedAt - result.startedAt,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
|
||||
const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
|
||||
try {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
|
@ -173,6 +149,6 @@ const saveJob = async (job: Job, result: any, token: string, mode: string) => {
|
|||
}
|
||||
ScrapeEvents.logJobEvent(job, "completed");
|
||||
} catch (error) {
|
||||
Logger.error(`🐂 Failed to update job status: ${error}`);
|
||||
logger.error(`🐂 Failed to update job status: ${error}`);
|
||||
}
|
||||
};
|
||||
|
|
|
@ -6,8 +6,8 @@ import {
|
|||
cleanBefore24hCompleteJobsController,
|
||||
queuesController,
|
||||
} from "../controllers/v0/admin/queue";
|
||||
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
|
||||
import { wrap } from "./v1";
|
||||
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
|
||||
|
||||
export const adminRouter = express.Router();
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ import expressWs from "express-ws";
|
|||
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { logger } from "../lib/logger";
|
||||
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
||||
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
|
||||
import { batchScrapeController } from "../controllers/v1/batch-scrape";
|
||||
|
@ -32,10 +32,12 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
|
|||
if (!minimum && req.body) {
|
||||
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
|
||||
}
|
||||
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
|
||||
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum ?? 1);
|
||||
if (chunk) {
|
||||
req.acuc = chunk;
|
||||
}
|
||||
if (!success) {
|
||||
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||
logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||
if (!res.headersSent) {
|
||||
return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
|
||||
}
|
||||
|
@ -50,20 +52,27 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
|
|||
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
||||
return (req, res, next) => {
|
||||
(async () => {
|
||||
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
|
||||
const auth = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
rateLimiterMode,
|
||||
);
|
||||
|
||||
if (!success) {
|
||||
if (!auth.success) {
|
||||
if (!res.headersSent) {
|
||||
return res.status(status).json({ success: false, error });
|
||||
return res.status(auth.status).json({ success: false, error: auth.error });
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const { team_id, plan, chunk } = auth;
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
req.acuc = chunk;
|
||||
req.acuc = chunk ?? undefined;
|
||||
if (chunk) {
|
||||
req.account = { remainingCredits: chunk.remaining_credits };
|
||||
}
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
import { WebCrawler } from '../crawler';
|
||||
import axios from 'axios';
|
||||
import robotsParser from 'robots-parser';
|
||||
import { getAdjustedMaxDepth } from '../utils/maxDepthUtils';
|
||||
|
||||
jest.mock('axios');
|
||||
jest.mock('robots-parser');
|
||||
|
@ -35,165 +34,6 @@ describe('WebCrawler', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => {
|
||||
const initialUrl = 'http://example.com'; // Set initial URL for this test
|
||||
const enteredMaxCrawledDepth = 2;
|
||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
||||
});
|
||||
|
||||
// Mock sitemap fetching function to return controlled links
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl, // depth 0
|
||||
initialUrl + '/page1', // depth 1
|
||||
initialUrl + '/page1/page2', // depth 2
|
||||
initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
|
||||
]);
|
||||
|
||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
{ url: initialUrl + '/page1', html: '' },
|
||||
{ url: initialUrl + '/page1/page2', html: '' }
|
||||
]);
|
||||
|
||||
|
||||
// Ensure that the link with depth 3 is not included
|
||||
expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false);
|
||||
});
|
||||
|
||||
it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => {
|
||||
const initialUrl = 'http://example.com'; // Set initial URL for this test
|
||||
const enteredMaxCrawledDepth = 0;
|
||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
||||
});
|
||||
|
||||
// Mock sitemap fetching function to return controlled links
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl, // depth 0
|
||||
initialUrl + '/page1', // depth 1
|
||||
initialUrl + '/page1/page2', // depth 2
|
||||
initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
|
||||
]);
|
||||
|
||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
]);
|
||||
});
|
||||
|
||||
it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => {
|
||||
const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
|
||||
const enteredMaxCrawledDepth = 1;
|
||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
||||
});
|
||||
|
||||
// Mock sitemap fetching function to return controlled links
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl, // depth 0
|
||||
initialUrl + '/page2', // depth 1
|
||||
initialUrl + '/page2/page3', // depth 2
|
||||
initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
|
||||
]);
|
||||
|
||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
{ url: initialUrl + '/page2', html: '' }
|
||||
]);
|
||||
});
|
||||
|
||||
it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 2 ', async () => {
|
||||
const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
|
||||
const enteredMaxCrawledDepth = 2;
|
||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
||||
});
|
||||
|
||||
// Mock sitemap fetching function to return controlled links
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl, // depth 0
|
||||
initialUrl + '/page2', // depth 1
|
||||
initialUrl + '/page2/page3', // depth 2
|
||||
initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
|
||||
]);
|
||||
|
||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
{ url: initialUrl + '/page2', html: '' },
|
||||
{ url: initialUrl + '/page2/page3', html: '' }
|
||||
]);
|
||||
});
|
||||
|
||||
it('should handle allowBackwardCrawling option correctly', async () => {
|
||||
const initialUrl = 'https://mendable.ai/blog';
|
||||
|
||||
// Setup the crawler with the specific test case options
|
||||
const crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: 3, // Example depth
|
||||
allowBackwardCrawling: true
|
||||
});
|
||||
|
||||
// Mock the sitemap fetching function to simulate backward crawling
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl,
|
||||
'https://mendable.ai', // backward link
|
||||
initialUrl + '/page1',
|
||||
initialUrl + '/page1/page2'
|
||||
]);
|
||||
|
||||
const results = await crawler.start();
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
{ url: 'https://mendable.ai', html: '' }, // Expect the backward link to be included
|
||||
{ url: initialUrl + '/page1', html: '' },
|
||||
{ url: initialUrl + '/page1/page2', html: '' }
|
||||
]);
|
||||
|
||||
// Check that the backward link is included if allowBackwardCrawling is true
|
||||
expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true);
|
||||
});
|
||||
|
||||
it('should respect the limit parameter by not returning more links than specified', async () => {
|
||||
const initialUrl = 'http://example.com';
|
||||
const limit = 2; // Set a limit for the number of links
|
||||
|
|
|
@ -1,37 +0,0 @@
|
|||
import { scrapSingleUrl } from '../single_url';
|
||||
import { PageOptions } from '../../../lib/entities';
|
||||
|
||||
|
||||
jest.mock('../single_url', () => {
|
||||
const originalModule = jest.requireActual('../single_url');
|
||||
originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>');
|
||||
|
||||
return originalModule;
|
||||
});
|
||||
|
||||
describe('scrapSingleUrl', () => {
|
||||
it('should handle includeHtml option correctly', async () => {
|
||||
const url = 'https://roastmywebsite.ai';
|
||||
const pageOptionsWithHtml: PageOptions = { includeHtml: true };
|
||||
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
|
||||
|
||||
const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
|
||||
const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
|
||||
|
||||
expect(resultWithHtml.html).toBeDefined();
|
||||
expect(resultWithoutHtml.html).toBeUndefined();
|
||||
}, 10000);
|
||||
});
|
||||
|
||||
it('should return a list of links on the firecrawl.ai page', async () => {
|
||||
const url = 'https://flutterbricks.com';
|
||||
const pageOptions: PageOptions = { includeHtml: true };
|
||||
|
||||
const result = await scrapSingleUrl("TEST", url, pageOptions);
|
||||
|
||||
// Check if the result contains a list of links
|
||||
expect(result.linksOnPage).toBeDefined();
|
||||
expect(Array.isArray(result.linksOnPage)).toBe(true);
|
||||
expect(result.linksOnPage.length).toBeGreaterThan(0);
|
||||
expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
|
||||
}, 15000);
|
|
@ -2,13 +2,10 @@ import axios, { AxiosError } from "axios";
|
|||
import cheerio, { load } from "cheerio";
|
||||
import { URL } from "url";
|
||||
import { getLinksFromSitemap } from "./sitemap";
|
||||
import async from "async";
|
||||
import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
|
||||
import { scrapSingleUrl } from "./single_url";
|
||||
import robotsParser from "robots-parser";
|
||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { logger } from "../../../src/lib/logger";
|
||||
import https from "https";
|
||||
export class WebCrawler {
|
||||
private jobId: string;
|
||||
|
@ -73,7 +70,7 @@ export class WebCrawler {
|
|||
try {
|
||||
url = new URL(link.trim(), this.baseUrl);
|
||||
} catch (error) {
|
||||
Logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
|
||||
logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
|
||||
return false;
|
||||
}
|
||||
const path = url.pathname;
|
||||
|
@ -132,7 +129,7 @@ export class WebCrawler {
|
|||
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
||||
// Check if the link is disallowed by robots.txt
|
||||
if (!isAllowed) {
|
||||
Logger.debug(`Link disallowed by robots.txt: ${link}`);
|
||||
logger.debug(`Link disallowed by robots.txt: ${link}`);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -161,7 +158,7 @@ export class WebCrawler {
|
|||
}
|
||||
|
||||
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
|
||||
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||
logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||
if (sitemapLinks.length > 0) {
|
||||
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
|
||||
|
@ -170,115 +167,6 @@ export class WebCrawler {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
 * Runs a crawl starting at `this.initialUrl` and returns the discovered
 * pages as `{ url, html }` pairs.
 *
 * Steps: load robots.txt (failure is only logged), return the sitemap
 * result directly when one is available and not ignored, otherwise crawl
 * page by page and re-apply the link filters to the outcome.
 *
 * @param inProgress Optional progress callback forwarded to the crawl.
 * @param pageOptions Page options forwarded to the crawl.
 * @param crawlerOptions Only `ignoreSitemap` is read here.
 * @param concurrencyLimit Worker count handed to the crawl queue.
 * @param limit Upper bound passed to `filterLinks`.
 * @param maxDepth Accepted for compatibility but not read; depth filtering
 *   uses `this.maxCrawledDepth`.
 */
public async start(
  inProgress?: (progress: Progress) => void,
  pageOptions?: PageOptions,
  crawlerOptions?: CrawlerOptions,
  concurrencyLimit: number = 5,
  limit: number = 10000,
  maxDepth: number = 10
): Promise<{ url: string, html: string }[]> {

  Logger.debug(`Crawler starting with ${this.initialUrl}`);
  // Fetch and parse robots.txt
  try {
    const txt = await this.getRobotsTxt();
    this.importRobotsTxt(txt);
    Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
  } catch (error) {
    Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
  }

  if (!crawlerOptions?.ignoreSitemap) {
    const sm = await this.tryGetSitemap();
    if (sm !== null) {
      return sm;
    }
  }

  const urls = await this.crawlUrls(
    [this.initialUrl],
    pageOptions,
    concurrencyLimit,
    inProgress
  );

  // Nothing was crawled but the initial URL itself passes the filters:
  // report it with empty HTML rather than returning nothing.
  if (
    urls.length === 0 &&
    this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
  ) {
    return [{ url: this.initialUrl, html: "" }];
  }

  // Index HTML by URL once instead of scanning `urls` for every filtered
  // link. The first entry for a URL wins, matching Array.prototype.find.
  const htmlByUrl = new Map<string, string>();
  for (const page of urls) {
    if (!htmlByUrl.has(page.url)) {
      htmlByUrl.set(page.url, page.html);
    }
  }

  // make sure to run include exclude here again
  const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
  return filteredUrls.map(url => ({ url, html: htmlByUrl.get(url) || "" }));
}
|
||||
|
||||
/**
 * Crawls `urls` through an `async.queue` with `concurrencyLimit` workers and
 * returns every entry currently held in `this.crawledUrls` (the shared map,
 * not only the pages found by this particular call).
 *
 * Each worker crawls one URL, records the returned pages in
 * `this.crawledUrls`, reports progress, and then recurses into the newly
 * found URLs with a fresh queue before finishing.
 */
private async crawlUrls(
  urls: string[],
  pageOptions: PageOptions,
  concurrencyLimit: number,
  inProgress?: (progress: Progress) => void,
): Promise<{ url: string, html: string }[]> {
  const queue = async.queue(async (task: string, callback) => {
    Logger.debug(`Crawling ${task}`);
    // Stop doing work once the smaller of the two caps has been reached.
    if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
      // The callback is guarded because it may not be supplied to the worker.
      if (callback && typeof callback === "function") {
        callback();
      }
      return;
    }
    const newUrls = await this.crawl(task, pageOptions);
    // add the initial url if not already added
    // if (this.visited.size === 1) {
    //   let normalizedInitial = this.initialUrl;
    //   if (!normalizedInitial.endsWith("/")) {
    //     normalizedInitial = normalizedInitial + "/";
    //   }
    //   if (!newUrls.some(page => page.url === this.initialUrl)) {
    //     newUrls.push({ url: this.initialUrl, html: "" });
    //   }
    // }

    newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));

    // Report progress; the "current document" is the last page found, or
    // the task URL itself when the crawl returned nothing.
    if (inProgress && newUrls.length > 0) {
      inProgress({
        current: this.crawledUrls.size,
        total: Math.min(this.maxCrawledLinks, this.limit),
        status: "SCRAPING",
        currentDocumentUrl: newUrls[newUrls.length - 1].url,
      });
    } else if (inProgress) {
      inProgress({
        current: this.crawledUrls.size,
        total: Math.min(this.maxCrawledLinks, this.limit),
        status: "SCRAPING",
        currentDocumentUrl: task,
      });
    }
    // Recurse: this builds a new queue (with its own concurrencyLimit) for
    // the links discovered on this page and waits for it to drain.
    await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
    if (callback && typeof callback === "function") {
      callback();
    }
  }, concurrencyLimit);

  Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
  // Only enqueue URLs that are unvisited and for which robots returns a
  // truthy verdict.
  // NOTE(review): unlike other call sites there is no `?? true` fallback
  // here, so a non-boolean verdict drops the URL — confirm this is intended.
  queue.push(
    urls.filter(
      (url) =>
        !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
    ),
    (err) => {
      if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
    }
  );
  await queue.drain();
  Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
  return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
}
|
||||
|
||||
public filterURL(href: string, url: string): string | null {
|
||||
let fullUrl = href;
|
||||
if (!href.startsWith("http")) {
|
||||
|
@ -346,79 +234,9 @@ export class WebCrawler {
|
|||
return links;
|
||||
}
|
||||
|
||||
/**
 * Fetches a single URL and returns the links found on it.
 *
 * Returns an empty array when the URL was already visited, is disallowed by
 * robots, looks like a file or a social-media/email link, or when anything
 * inside the fetch/parse section throws.
 *
 * Every returned entry carries the HTML, status code and error of the page
 * that was fetched here (the parent page), not of the linked page.
 */
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
  if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
    return [];
  }
  // The URL is marked visited in its original form, before normalization.
  this.visited.add(url);

  if (!url.startsWith("http")) {
    url = "https://" + url;
  }
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }

  if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
    return [];
  }

  try {
    let content: string = "";
    let pageStatusCode: number;
    let pageError: string | undefined = undefined;

    // If it is the first link, fetch with single url
    // (`visited.size === 1` is the "first URL" test used throughout.)
    if (this.visited.size === 1) {
      const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
      content = page.html ?? "";
      pageStatusCode = page.metadata?.pageStatusCode;
      pageError = page.metadata?.pageError || undefined;
    } else {
      const response = await axios.get(url, { timeout: axiosTimeout });
      content = response.data ?? "";
      pageStatusCode = response.status;
      pageError = response.statusText != "OK" ? response.statusText : undefined;
    }

    // NOTE(review): `$` is not referenced again in this method.
    const $ = load(content);
    let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];

    // Add the initial URL to the list of links
    // NOTE(review): `visited.size` is re-read after the awaits above; with
    // concurrent workers it may no longer be 1 — confirm.
    if (this.visited.size === 1) {
      links.push({ url, html: content, pageStatusCode, pageError });
    }

    links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));

    if (this.visited.size === 1) {
      return links;
    }

    // Create a new list to return to avoid modifying the visited list
    return links.filter((link) => !this.visited.has(link.url));
  } catch (error) {
    // Any failure above is swallowed: the page contributes no links.
    return [];
  }
}
|
||||
|
||||
/**
 * Tells whether `url` may be fetched by the "FireCrawlAgent" user agent.
 * Defaults to allowed when no robots object is set or when it gives no
 * verdict (null/undefined).
 */
private isRobotsAllowed(url: string): boolean {
  if (!this.robots) {
    return true;
  }
  const verdict = this.robots.isAllowed(url, "FireCrawlAgent");
  return verdict ?? true;
}
|
||||
/**
 * Returns `url` with its query parameters sorted, so URLs that differ only
 * in parameter order compare equal. Input that cannot be parsed as a URL is
 * returned unchanged.
 */
private normalizeCrawlUrl(url: string): string {
  try {
    const parsed = new URL(url);
    const query = parsed.searchParams;
    query.sort(); // canonical parameter order
    return parsed.toString();
  } catch (_unparsable) {
    return url;
  }
}
|
||||
|
||||
/**
 * Checks `url` against the include patterns (each compiled as a RegExp).
 * An empty pattern list, or one whose first entry is the empty string,
 * matches everything.
 */
private matchesIncludes(url: string): boolean {
  const patterns = this.includes;
  const unrestricted = patterns.length === 0 || patterns[0] == "";
  if (unrestricted) {
    return true;
  }
  for (const pattern of patterns) {
    if (new RegExp(pattern).test(url)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
|
||||
return this.excludes.some((pattern) => {
|
||||
|
@ -503,7 +321,7 @@ export class WebCrawler {
|
|||
const urlWithoutQuery = url.split('?')[0].toLowerCase();
|
||||
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
|
||||
} catch (error) {
|
||||
Logger.error(`Error processing URL in isFile: ${error}`);
|
||||
logger.error(`Error processing URL in isFile: ${error}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -524,7 +342,6 @@ export class WebCrawler {
|
|||
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
||||
}
|
||||
|
||||
//
|
||||
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
|
||||
const normalizeUrl = (url: string) => {
|
||||
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
|
||||
|
@ -546,7 +363,7 @@ export class WebCrawler {
|
|||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
||||
logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||
// ignore 404
|
||||
} else {
|
||||
|
@ -565,7 +382,7 @@ export class WebCrawler {
|
|||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||
logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||
// ignore 404
|
||||
} else {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import { Logger } from "../../../lib/logger";
|
||||
import { logger } from "../../../lib/logger";
|
||||
|
||||
export async function handleCustomScraping(
|
||||
text: string,
|
||||
|
@ -6,7 +6,7 @@ export async function handleCustomScraping(
|
|||
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
||||
// Check for Readme Docs special case
|
||||
if (text.includes('<meta name="readme-deploy"') && !url.includes('developers.notion.com')) {
|
||||
Logger.debug(
|
||||
logger.debug(
|
||||
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
||||
);
|
||||
return {
|
||||
|
@ -21,7 +21,7 @@ export async function handleCustomScraping(
|
|||
|
||||
// Check for Vanta security portals
|
||||
if (text.includes('<link href="https://static.vanta.com')) {
|
||||
Logger.debug(
|
||||
logger.debug(
|
||||
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
|
||||
);
|
||||
return {
|
||||
|
@ -36,7 +36,7 @@ export async function handleCustomScraping(
|
|||
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
||||
if (googleDriveMetaMatch) {
|
||||
const url = googleDriveMetaMatch[1];
|
||||
Logger.debug(`Google Drive PDF link detected: ${url}`);
|
||||
logger.debug(`Google Drive PDF link detected: ${url}`);
|
||||
|
||||
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
|
||||
if (fileIdMatch) {
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
// Shared timeout value. NOTE(review): presumably milliseconds (15 s) — confirm against callers.
export const universalTimeout = 15000;
|
|
@ -1,743 +0,0 @@
|
|||
import {
|
||||
Document,
|
||||
ExtractorOptions,
|
||||
PageOptions,
|
||||
WebScraperOptions,
|
||||
} from "../../lib/entities";
|
||||
import { Progress } from "../../lib/entities";
|
||||
import { scrapSingleUrl } from "./single_url";
|
||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||
import { WebCrawler } from "./crawler";
|
||||
import { getValue, setValue } from "../../services/redis";
|
||||
import { getImageDescription } from "./utils/imageDescription";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
import {
|
||||
replaceImgPathsWithAbsolutePaths,
|
||||
replacePathsWithAbsolutePaths,
|
||||
} from "./utils/replacePaths";
|
||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
||||
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||
|
||||
export class WebScraperDataProvider {
|
||||
private jobId: string;
|
||||
private bullJobId: string;
|
||||
private urls: string[] = [""];
|
||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||
private includes: string | string[];
|
||||
private excludes: string | string[];
|
||||
private maxCrawledLinks: number;
|
||||
private maxCrawledDepth: number = 10;
|
||||
private returnOnlyUrls: boolean;
|
||||
private limit: number = 10000;
|
||||
private concurrentRequests: number = 20;
|
||||
private generateImgAltText: boolean = false;
|
||||
private ignoreSitemap: boolean = false;
|
||||
private pageOptions?: PageOptions;
|
||||
private extractorOptions?: ExtractorOptions;
|
||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
||||
"gpt-4-turbo";
|
||||
private crawlerMode: string = "default";
|
||||
private allowBackwardCrawling: boolean = false;
|
||||
private allowExternalContentLinks: boolean = false;
|
||||
private priority?: number;
|
||||
private teamId?: string;
|
||||
|
||||
/** Unimplemented stub: always throws. */
authorize(): void {
  const reason = "Method not implemented.";
  throw new Error(reason);
}
|
||||
|
||||
/**
 * Unimplemented stub: always throws synchronously (it is not declared
 * `async`, so callers get an exception rather than a rejected promise).
 */
authorizeNango(): Promise<void> {
  const reason = "Method not implemented.";
  throw new Error(reason);
}
|
||||
|
||||
/**
 * Scrapes `urls` in batches of `this.concurrentRequests` and returns the
 * resulting documents in input order.
 *
 * @param urls URLs to scrape.
 * @param inProgress Optional callback invoked after each scrape completes.
 * @param allHtmls Optional HTML per URL, indexed like `urls`; when omitted
 *   an empty string is passed to the scraper.
 */
private async convertUrlsToDocuments(
  urls: string[],
  inProgress?: (progress: Progress) => void,
  allHtmls?: string[]
): Promise<Document[]> {
  const total = urls.length;
  const slots: (Document | null)[] = new Array(urls.length).fill(null);
  let completed = 0;

  // Scrape one URL and store the result at its original position.
  const scrapeInto = async (url: string, position: number) => {
    const knownHtml = allHtmls ? allHtmls[position] : "";
    const doc = await scrapSingleUrl(
      this.jobId,
      url,
      this.pageOptions,
      this.extractorOptions,
      knownHtml,
      this.priority,
      this.teamId,
    );
    completed++;
    if (inProgress) {
      inProgress({
        current: completed,
        total: total,
        status: "SCRAPING",
        currentDocumentUrl: url,
        currentDocument: { ...doc, index: completed },
      });
    }
    slots[position] = doc;
  };

  // Batches run one after another; URLs inside a batch run concurrently.
  for (let offset = 0; offset < urls.length; offset += this.concurrentRequests) {
    const batch = urls.slice(offset, offset + this.concurrentRequests);
    await Promise.all(batch.map((url, k) => scrapeInto(url, offset + k)));
  }

  return slots.filter((doc) => doc !== null) as Document[];
}
|
||||
|
||||
/**
 * Entry point: validates the first URL, then produces documents either via
 * the cache-aware path or the direct path.
 *
 * @param useCaching When true, cached documents are consulted first.
 * @param inProgress Optional progress callback.
 */
async getDocuments(
  useCaching: boolean = false,
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  this.validateInitialUrl();
  return useCaching
    ? this.processDocumentsWithCache(inProgress)
    : this.processDocumentsWithoutCache(inProgress);
}
|
||||
|
||||
/**
 * Ensures there is a usable first URL.
 *
 * @throws Error("Url is required") when the URL list is empty or its first
 *   entry is blank. An empty list used to surface as a TypeError from
 *   calling `.trim()` on `undefined`.
 */
private validateInitialUrl(): void {
  const firstUrl = this.urls[0];
  if (firstUrl == null || firstUrl.trim() === "") {
    throw new Error("Url is required");
  }
}
|
||||
|
||||
/**
 * Dispatches to the handler for the configured mode, bypassing the cache.
 * @param inProgress Optional progress callback handed to the handler.
 * @returns The handler's documents, or an empty list for an unknown mode.
 */
private async processDocumentsWithoutCache(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const mode = this.mode;
  if (mode === "crawl") {
    return this.handleCrawlMode(inProgress);
  }
  if (mode === "single_urls") {
    return this.handleSingleUrlsMode(inProgress);
  }
  if (mode === "sitemap") {
    return this.handleSitemapMode(inProgress);
  }
  return [];
}
|
||||
|
||||
/**
 * Keeps only the links that live under the initial URL: same hostname
 * (ignoring a leading "www.") and a path that starts with the initial
 * URL's path. The protocol is not compared.
 *
 * Links that cannot be parsed as URLs are dropped instead of aborting the
 * whole call.
 *
 * @param links Candidate absolute URLs.
 * @returns The subset of `links` under the initial URL, in input order.
 */
private async cleanIrrelevantPath(links: string[]) {
  if (links.length === 0) {
    // Nothing to compare; also avoids parsing the base URL needlessly.
    return [];
  }

  // The base URL is the same for every link, so parse it once.
  const base = new URL(this.urls[0]);
  const baseHostname = base.hostname.replace(/^www\./, "");
  const basePath = base.pathname;

  return links.filter((link) => {
    let candidate: URL;
    try {
      candidate = new URL(link);
    } catch (error) {
      Logger.debug(`Skipping unparsable link: ${link}`);
      return false;
    }

    // Normalize the hostname to account for www and non-www versions
    const candidateHostname = candidate.hostname.replace(/^www\./, "");

    return (
      candidateHostname === baseHostname &&
      candidate.pathname.startsWith(basePath)
    );
  });
}
|
||||
|
||||
/**
 * Crawl mode: discovers links with a `WebCrawler`, then either returns
 * URL-only stubs or scrapes every discovered link.
 *
 * The include/exclude settings are always turned into arrays before being
 * handed to the crawler. Previously an array whose first entry was `""`
 * left the local variable unassigned, so `undefined` was passed on.
 *
 * @param inProgress Optional progress callback.
 */
private async handleCrawlMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  // A comma-separated string is split; an array starting with "" means
  // "no patterns".
  const toPatternList = (raw: string | string[]): string[] => {
    if (!Array.isArray(raw)) {
      return raw.split(',');
    }
    return raw[0] != "" ? raw : [];
  };

  const includes = toPatternList(this.includes);
  const excludes = toPatternList(this.excludes);

  const crawler = new WebCrawler({
    jobId: this.jobId,
    initialUrl: this.urls[0],
    includes,
    excludes,
    maxCrawledLinks: this.maxCrawledLinks,
    maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
    limit: this.limit,
    generateImgAltText: this.generateImgAltText,
    allowBackwardCrawling: this.allowBackwardCrawling,
    allowExternalContentLinks: this.allowExternalContentLinks,
  });

  let links = await crawler.start(
    inProgress,
    this.pageOptions,
    {
      ignoreSitemap: this.ignoreSitemap,
    },
    5,
    this.limit,
    this.maxCrawledDepth
  );

  let allLinks = links.map((e) => e.url);
  const allHtmls = links.map((e) => e.html);

  if (this.returnOnlyUrls) {
    return this.returnOnlyUrlsResponse(allLinks, inProgress);
  }

  // check if fast mode is enabled and there is html inside the links;
  // only then is the crawler's HTML reused.
  const reuseHtml = this.crawlerMode === "fast" && links.some((link) => link.html);
  const documents = reuseHtml
    ? await this.processLinks(allLinks, inProgress, allHtmls)
    : await this.processLinks(allLinks, inProgress);

  return this.cacheAndFinalizeDocuments(documents, allLinks);
}
|
||||
|
||||
/**
 * Single-URL mode: scrapes exactly the configured URLs.
 * @param inProgress Optional progress callback.
 */
private async handleSingleUrlsMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  return await this.processLinks(this.urls, inProgress);
}
|
||||
|
||||
/**
 * Sitemap mode: reads links from the sitemap at the first URL, keeps those
 * under the initial URL, and either returns URL-only stubs or scrapes them.
 * @param inProgress Optional progress callback.
 */
private async handleSitemapMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const sitemapLinks = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
  const relevantLinks = await this.cleanIrrelevantPath(sitemapLinks);

  if (this.returnOnlyUrls) {
    return this.returnOnlyUrlsResponse(relevantLinks, inProgress);
  }

  const scraped = await this.processLinks(relevantLinks, inProgress);
  return this.cacheAndFinalizeDocuments(scraped, relevantLinks);
}
|
||||
|
||||
/**
 * Builds content-less placeholder documents for `links` and reports the
 * job as completed.
 *
 * Each placeholder has empty `content`/`markdown`, a hard-coded
 * `pageStatusCode` of 200, and an empty `html` only when `includeHtml` is
 * set in the page options.
 *
 * @param links URLs to wrap.
 * @param inProgress Optional progress callback, called once.
 */
private async returnOnlyUrlsResponse(
  links: string[],
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  if (inProgress) {
    inProgress({
      current: links.length,
      total: links.length,
      status: "COMPLETED",
      currentDocumentUrl: this.urls[0],
    });
  }

  return links.map((url) => {
    const html = this.pageOptions?.includeHtml ? "" : undefined;
    return {
      content: "",
      html,
      markdown: "",
      metadata: { sourceURL: url, pageStatusCode: 200 },
    };
  });
}
|
||||
|
||||
/**
 * Turns a list of links into documents.
 *
 * PDF and DOC/DOCX links (detected by file extension) are fetched by their
 * dedicated fetchers; all other links are scraped as pages. Sitemap
 * metadata is attached, path replacements and HTML stripping are applied
 * according to the page options, and in single-URL mode LLM extraction is
 * run when requested.
 *
 * `allHtmls` is indexed like the incoming `links`. It is filtered together
 * with the links so that each page still receives its own HTML after the
 * PDF/DOC links are removed. Previously only `links` was filtered, which
 * shifted every HTML entry after a PDF/DOC link onto the wrong URL.
 *
 * @param links Links to process.
 * @param inProgress Optional progress callback for the page scrapes.
 * @param allHtmls Optional pre-fetched HTML, parallel to `links`.
 * @returns Page documents followed by PDF and then DOCX documents.
 */
private async processLinks(
  links: string[],
  inProgress?: (progress: Progress) => void,
  allHtmls?: string[]
): Promise<Document[]> {
  const isPdfLink = (link: string) => link.endsWith(".pdf");
  const isDocLink = (link: string) =>
    link.endsWith(".doc") || link.endsWith(".docx");

  const pdfLinks = links.filter(isPdfLink);
  const docLinks = links.filter(isDocLink);

  const [pdfDocuments, docxDocuments] = await Promise.all([
    this.fetchPdfDocuments(pdfLinks),
    this.fetchDocxDocuments(docLinks),
  ]);

  // Split off the page links, carrying each link's HTML along with it.
  const pageLinks: string[] = [];
  const pageHtmls: string[] = [];
  links.forEach((link, position) => {
    if (isPdfLink(link) || isDocLink(link)) {
      return;
    }
    pageLinks.push(link);
    if (allHtmls) {
      pageHtmls.push(allHtmls[position]);
    }
  });

  let [documents, sitemapData] = await Promise.all([
    this.convertUrlsToDocuments(
      pageLinks,
      inProgress,
      allHtmls ? pageHtmls : undefined
    ),
    this.mode === "single_urls" && pageLinks.length > 0
      ? this.getSitemapDataForSingleUrl(this.urls[0], pageLinks[0], 1500).catch(
          (error) => {
            Logger.debug(`Failed to fetch sitemap data: ${error}`);
            return null;
          }
        )
      : Promise.resolve(null),
  ]);

  if (this.mode === "single_urls" && documents.length > 0) {
    documents[0].metadata.sitemap = sitemapData ?? undefined;
  } else {
    documents = await this.getSitemapData(this.urls[0], documents);
  }

  if (this.pageOptions.includeMarkdown) {
    documents = this.applyPathReplacements(documents);
  }

  if (!this.pageOptions.includeHtml) {
    for (let document of documents) {
      delete document.html;
    }
  }

  // documents = await this.applyImgAltText(documents);
  if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
    const extractionMode = this.extractorOptions?.mode ?? "markdown";
    const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";

    if (
      extractionMode === "llm-extraction" ||
      extractionMode === "llm-extraction-from-markdown" ||
      extractionMode === "llm-extraction-from-raw-html"
    ) {
      documents = await generateCompletions(
        documents,
        this.extractorOptions,
        completionMode
      );
    }
  }
  return documents.concat(pdfDocuments).concat(docxDocuments);
}
|
||||
|
||||
/**
 * Fetches and converts every PDF link concurrently, recording a scrape
 * event per link (inserted before the fetch starts, updated after it).
 *
 * A scrape counts as successful when the status code is not >= 400 and the
 * trimmed content has at least 100 characters.
 *
 * @param pdfLinks PDF URLs.
 * @returns One document per link, with the text as `content` and `markdown`.
 */
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
  const jobs = pdfLinks.map(async (pdfLink) => {
    const startedAt = Date.now();
    // Start the log insert without awaiting it so it overlaps the fetch.
    const pendingLogId = ScrapeEvents.insert(this.jobId, {
      type: "scrape",
      url: pdfLink,
      worker: process.env.FLY_MACHINE_ID,
      method: "pdf-scrape",
      result: null,
    });

    const fetched = await fetchAndProcessPdf(
      pdfLink,
      this.pageOptions.parsePDF
    );
    const { content, pageStatusCode, pageError } = fetched;

    const logId = await pendingLogId;
    const hasErrorStatus = pageStatusCode && pageStatusCode >= 400;
    ScrapeEvents.updateScrapeResult(logId, {
      response_size: content.length,
      success: !hasErrorStatus && !!content && (content.trim().length >= 100),
      error: pageError,
      response_code: pageStatusCode,
      time_taken: Date.now() - startedAt,
    });

    return {
      content: content,
      markdown: content,
      metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
      provider: "web-scraper",
    };
  });

  return Promise.all(jobs);
}
|
||||
/**
 * Fetches and converts every DOC/DOCX link concurrently, recording a scrape
 * event per link (inserted before the fetch starts, updated after it).
 *
 * A scrape counts as successful when the status code is not >= 400 and the
 * trimmed content has at least 100 characters.
 *
 * @param docxLinks DOC/DOCX URLs.
 * @returns One document per link; unlike the PDF path, no `markdown` field
 *   is set.
 */
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
  const jobs = docxLinks.map(async (docxLink) => {
    const startedAt = Date.now();
    // Start the log insert without awaiting it so it overlaps the fetch.
    const pendingLogId = ScrapeEvents.insert(this.jobId, {
      type: "scrape",
      url: docxLink,
      worker: process.env.FLY_MACHINE_ID,
      method: "docx-scrape",
      result: null,
    });

    const fetched = await fetchAndProcessDocx(
      docxLink
    );
    const { content, pageStatusCode, pageError } = fetched;

    const logId = await pendingLogId;
    const hasErrorStatus = pageStatusCode && pageStatusCode >= 400;
    ScrapeEvents.updateScrapeResult(logId, {
      response_size: content.length,
      success: !hasErrorStatus && !!content && (content.trim().length >= 100),
      error: pageError,
      response_code: pageStatusCode,
      time_taken: Date.now() - startedAt,
    });

    return {
      content,
      metadata: { sourceURL: docxLink, pageStatusCode, pageError },
      provider: "web-scraper",
    };
  });

  return Promise.all(jobs);
}
|
||||
|
||||
/**
 * Rewrites relative paths in the documents to absolute ones. Image paths
 * are always rewritten; all other paths only when
 * `replaceAllPathsWithAbsolutePaths` is enabled.
 */
private applyPathReplacements(documents: Document[]): Document[] {
  const withAbsolutePaths = this.replaceAllPathsWithAbsolutePaths
    ? replacePathsWithAbsolutePaths(documents)
    : documents;
  return replaceImgPathsWithAbsolutePaths(withAbsolutePaths);
}
|
||||
|
||||
/**
 * Generates image alt text for the documents when `generateImgAltText` is
 * enabled; otherwise returns them untouched.
 */
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
  if (this.generateImgAltText) {
    return this.generatesImgAltText(documents);
  }
  return documents;
}
|
||||
|
||||
/**
 * Final pass over crawl/sitemap results: strips `childrenLinks` from each
 * document and caps the list at `this.limit`.
 *
 * The cap is taken with `slice`, so the caller's array is no longer
 * shortened as a side effect (it used to be, via `splice`). The limit is
 * clamped at zero to keep the old result for a negative limit (an empty
 * list).
 *
 * @param documents Documents to finalize.
 * @param links Currently unused (caching is disabled).
 */
private async cacheAndFinalizeDocuments(
  documents: Document[],
  links: string[]
): Promise<Document[]> {
  // await this.setCachedDocuments(documents, links);
  const cleaned = this.removeChildLinks(documents);
  return cleaned.slice(0, Math.max(0, this.limit));
}
|
||||
|
||||
/**
 * Cache-aware path: loads cached documents for the first `limit` URLs and,
 * if that yields fewer than `limit` documents, runs the uncached path and
 * merges its results in. The merged list is then filtered by
 * include/exclude patterns and depth, stripped of `childrenLinks`, and
 * capped at `limit`.
 *
 * @param inProgress Optional progress callback for the uncached run.
 */
private async processDocumentsWithCache(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const requestedUrls = this.urls.slice(0, this.limit);
  let docs = await this.getCachedDocuments(requestedUrls);

  const cacheIsShort = docs.length < this.limit;
  if (cacheIsShort) {
    const fresh: Document[] = await this.getDocuments(
      false,
      inProgress
    );
    docs = this.mergeNewDocuments(docs, fresh);
  }

  docs = this.filterDocsExcludeInclude(docs);
  docs = this.filterDepth(docs);
  docs = this.removeChildLinks(docs);
  return docs.splice(0, this.limit);
}
|
||||
|
||||
/**
 * Appends to `existingDocuments` every new document whose normalized source
 * URL is not already present, and returns that same (mutated) array.
 * Documents appended earlier in the call count as present for later ones.
 */
private mergeNewDocuments(
  existingDocuments: Document[],
  newDocuments: Document[]
): Document[] {
  for (const incoming of newDocuments) {
    const alreadyPresent = existingDocuments.some((known) => {
      const knownUrl = this.normalizeUrl(known.metadata.sourceURL);
      const incomingUrl = this.normalizeUrl(incoming.metadata?.sourceURL);
      return knownUrl === incomingUrl;
    });
    if (alreadyPresent) {
      continue;
    }
    existingDocuments.push(incoming);
  }
  return existingDocuments;
}
|
||||
|
||||
/**
 * Filters documents by the path of their source URL: a document is dropped
 * when any exclude pattern matches, and otherwise kept only if no include
 * patterns are configured or at least one of them matches.
 *
 * Patterns are compiled as regular expressions. A comma-separated string
 * in `this.excludes` / `this.includes` is converted to an array in place
 * the first time it is needed.
 */
private filterDocsExcludeInclude(documents: Document[]): Document[] {
  return documents.filter((document) => {
    const path = new URL(document.metadata.sourceURL).pathname;

    if (!Array.isArray(this.excludes)) {
      this.excludes = this.excludes.split(',');
    }
    const excludesActive = this.excludes.length > 0 && this.excludes[0] !== "";
    if (
      excludesActive &&
      this.excludes.some((excludePattern) => new RegExp(excludePattern).test(path))
    ) {
      // Check if the link should be excluded
      return false;
    }

    if (!Array.isArray(this.includes)) {
      this.includes = this.includes.split(',');
    }
    const includesActive = this.includes.length > 0 && this.includes[0] !== "";
    if (!includesActive) {
      return true;
    }
    // Check if the link matches the include patterns, if any are specified
    return this.includes.some((includePattern) =>
      new RegExp(includePattern).test(path)
    );
  });
}
|
||||
|
||||
/**
 * Removes the first "//www." from `url` (replacing it with "//") so www and
 * non-www forms compare equal. URLs without it are returned unchanged.
 */
private normalizeUrl(url: string): string {
  // String replace with a string pattern is a no-op when nothing matches.
  return url.replace("//www.", "//");
}
|
||||
|
||||
/**
 * Deletes the `childrenLinks` property from every document that has a
 * truthy one. Mutates the documents in place and returns the same array.
 */
private removeChildLinks(documents: Document[]): Document[] {
  documents.forEach((doc) => {
    if (doc?.childrenLinks) {
      delete doc.childrenLinks;
    }
  });
  return documents;
}
|
||||
|
||||
/**
 * Stores each non-empty document under "web-scraper-cache:<normalized URL>",
 * serialized as JSON together with `childrenLinks`. Writes are issued one
 * at a time.
 *
 * @param documents Documents to cache; ones with blank content are skipped.
 * @param childrenLinks Links stored alongside every document (default `[]`).
 */
async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {
  // 60 * 60 = 3600. The previous inline comment said "10 days", which does
  // not match this value.
  // NOTE(review): confirm the unit `setValue` expects and the intended lifetime.
  const expiry = 60 * 60;
  const children = childrenLinks || [];

  for (const document of documents) {
    const isBlank = document.content.trim().length === 0;
    if (!isBlank) {
      const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
      const payload = JSON.stringify({
        ...document,
        childrenLinks: children,
      });
      await setValue("web-scraper-cache:" + normalizedUrl, payload, expiry);
    }
  }
}
|
||||
|
||||
/**
 * Reads cached documents for `urls`, plus the cached documents of each
 * hit's `childrenLinks`. A child is added only if no document with the same
 * `metadata.sourceURL` is already in the result. Lookups run sequentially.
 *
 * @param urls URLs to look up (normalized before building the cache key).
 * @returns Cached documents in discovery order; misses are skipped.
 */
async getCachedDocuments(urls: string[]): Promise<Document[]> {
  const found: Document[] = [];

  for (const url of urls) {
    const normalizedUrl = this.normalizeUrl(url);
    Logger.debug(
      "Getting cached document for web-scraper-cache:" + normalizedUrl
    );
    const rawParent = await getValue("web-scraper-cache:" + normalizedUrl);
    if (!rawParent) {
      continue;
    }

    const parent = JSON.parse(rawParent);
    found.push(parent);

    // get children documents
    for (const childUrl of parent.childrenLinks || []) {
      const rawChild = await getValue(
        "web-scraper-cache:" + this.normalizeUrl(childUrl)
      );
      if (!rawChild) {
        continue;
      }
      const child = JSON.parse(rawChild);
      const isDuplicate = found.some(
        (doc) => doc.metadata.sourceURL === child.metadata.sourceURL
      );
      if (!isDuplicate) {
        found.push(child);
      }
    }
  }

  return found;
}
|
||||
|
||||
/**
 * Copies `options` onto the provider, filling in defaults for everything
 * that is not supplied.
 *
 * URL handling: every URL is trimmed, and "https://" is prepended when the
 * trimmed URL does not start with "http". Blank URLs are left blank so
 * that the later "Url is required" validation can reject them. Previously
 * the prefix was added to the untrimmed string, and a blank URL became
 * "https://", which slipped past validation.
 *
 * @param options Scraper configuration; `urls` is mandatory.
 * @throws Error("Urls are required") when `options.urls` is missing.
 */
setOptions(options: WebScraperOptions): void {
  if (!options.urls) {
    throw new Error("Urls are required");
  }

  this.jobId = options.jobId;
  this.bullJobId = options.bullJobId;
  this.urls = options.urls;
  this.mode = options.mode;
  this.concurrentRequests = options.concurrentRequests ?? 20;
  this.includes = options.crawlerOptions?.includes ?? [];
  this.excludes = options.crawlerOptions?.excludes ?? [];
  this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
  this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
  this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
  this.limit = options.crawlerOptions?.limit ?? 10000;
  this.generateImgAltText =
    options.crawlerOptions?.generateImgAltText ?? false;
  this.pageOptions = {
    onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
    includeHtml: options.pageOptions?.includeHtml ?? false,
    replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
    parsePDF: options.pageOptions?.parsePDF ?? true,
    onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
    removeTags: options.pageOptions?.removeTags ?? [],
    includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
    includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
    // Defaults to "extract" whenever an extractor mode other than
    // "markdown" is configured.
    includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
    waitFor: options.pageOptions?.waitFor ?? undefined,
    headers: options.pageOptions?.headers ?? undefined,
    includeLinks: options.pageOptions?.includeLinks ?? true,
    fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
    screenshot: options.pageOptions?.screenshot ?? false,
    useFastMode: options.pageOptions?.useFastMode ?? false,
    disableJsDom: options.pageOptions?.disableJsDom ?? false,
    atsv: options.pageOptions?.atsv ?? false,
    actions: options.pageOptions?.actions ?? undefined,
    geolocation: options.pageOptions?.geolocation ?? undefined,
    skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
    removeBase64Images: options.pageOptions?.removeBase64Images ?? true,
    mobile: options.pageOptions?.mobile ?? false,
  };
  this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
  this.replaceAllPathsWithAbsolutePaths =
    options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
    options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
    false;

  // Comma-separated pattern strings are split and blank entries dropped.
  if (typeof options.crawlerOptions?.excludes === 'string') {
    this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
  }

  if (typeof options.crawlerOptions?.includes === 'string') {
    this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
  }

  this.crawlerMode = options.crawlerOptions?.mode ?? "default";
  this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
  this.allowBackwardCrawling =
    options.crawlerOptions?.allowBackwardCrawling ?? false;
  this.allowExternalContentLinks =
    options.crawlerOptions?.allowExternalContentLinks ?? false;
  this.priority = options.priority;
  this.teamId = options.teamId ?? null;

  // make sure all urls start with https://
  this.urls = this.urls.map((url) => {
    const trimmed = url.trim();
    if (trimmed === "") {
      // Keep blank entries blank so validation can reject them.
      return trimmed;
    }
    return trimmed.startsWith("http") ? trimmed : `https://${trimmed}`;
  });
}
|
||||
|
||||
/**
 * Attaches sitemap metadata to every document that has an entry in the site's sitemap.
 *
 * The sitemap for `baseUrl` is fetched once; each document is matched against it by comparing
 * normalized URLs. Only the truthy `changefreq`, `priority` (converted with `Number`) and
 * `lastmod` fields are copied, and `metadata.sitemap` is set only when at least one was copied.
 *
 * @param baseUrl URL handed to `fetchSitemapData`.
 * @param documents Documents to annotate; they are mutated in place.
 * @returns The same `documents` array.
 */
private async getSitemapData(baseUrl: string, documents: Document[]) {
  const entries = await fetchSitemapData(baseUrl);
  if (!entries) {
    return documents;
  }

  for (const doc of documents) {
    const match = entries.find(
      (entry) =>
        this.normalizeUrl(entry.loc) ===
        this.normalizeUrl(doc.metadata.sourceURL)
    );
    if (!match) {
      continue;
    }

    const sitemapInfo: Partial<SitemapEntry> = {};
    if (match.changefreq) {
      sitemapInfo.changefreq = match.changefreq;
    }
    if (match.priority) {
      sitemapInfo.priority = Number(match.priority);
    }
    if (match.lastmod) {
      sitemapInfo.lastmod = match.lastmod;
    }

    // Leave the document untouched when the entry carried nothing useful.
    if (Object.keys(sitemapInfo).length !== 0) {
      doc.metadata.sitemap = sitemapInfo;
    }
  }

  return documents;
}
|
||||
/**
 * Looks up the sitemap entry for a single URL.
 *
 * @param baseUrl URL handed to `fetchSitemapData`.
 * @param url Page URL to find; compared to sitemap `loc` values after normalization.
 * @param timeout Optional timeout forwarded to `fetchSitemapData`.
 * @returns An object with the truthy `changefreq`, `priority` (as a number) and `lastmod`
 *   fields of the matching entry, or `null` when there is no sitemap, no matching entry, or
 *   the entry has none of those fields.
 */
private async getSitemapDataForSingleUrl(
  baseUrl: string,
  url: string,
  timeout?: number
) {
  const entries = await fetchSitemapData(baseUrl, timeout);
  if (!entries) {
    return null;
  }

  const match = entries.find(
    (entry) => this.normalizeUrl(entry.loc) === this.normalizeUrl(url)
  );
  if (!match) {
    return null;
  }

  const sitemapInfo: Partial<SitemapEntry> = {};
  if (match.changefreq) {
    sitemapInfo.changefreq = match.changefreq;
  }
  if (match.priority) {
    sitemapInfo.priority = Number(match.priority);
  }
  if (match.lastmod) {
    sitemapInfo.lastmod = match.lastmod;
  }

  return Object.keys(sitemapInfo).length !== 0 ? sitemapInfo : null;
}
|
||||
/**
 * Generates alt text for markdown images that have none.
 *
 * Every `![alt](url)` occurrence in `document.content` is inspected. When the alt text is
 * empty, the URL is not a `data:image` URI and it ends in `.png`, `.jpeg`, `.gif` or `.webp`,
 * a description is requested from `getImageDescription` using up to 1000 characters of
 * content on each side of the image as context, and the image markup is rewritten with that
 * description as alt text.
 *
 * @param documents Documents whose `content` is rewritten in place.
 * @returns The same `documents` array.
 */
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
  await Promise.all(
    documents.map(async (document) => {
      const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];

      await Promise.all(
        images.map(async (image: string) => {
          // Split alt text and URL with the same shape as the search pattern above. Taking
          // "the first (...) group" of the whole image would pick up parentheses inside the
          // alt text, and requiring a non-empty URL would throw on `![]()`.
          const parts = /!\[(.*?)\]\((.*?)\)/.exec(image);
          if (!parts) {
            return;
          }
          let altText = parts[1];
          const imageUrl = parts[2];

          if (
            !altText &&
            !imageUrl.startsWith("data:image") &&
            /\.(png|jpeg|gif|webp)$/.test(imageUrl)
          ) {
            const imageIndex = document.content.indexOf(image);
            const contentLength = document.content.length;
            const backText = document.content.substring(
              imageIndex + image.length,
              Math.min(imageIndex + image.length + 1000, contentLength)
            );
            const frontTextStartIndex = Math.max(imageIndex - 1000, 0);
            const frontText = document.content.substring(
              frontTextStartIndex,
              imageIndex
            );
            altText = await getImageDescription(
              imageUrl,
              backText,
              frontText,
              this.generateImgAltTextModel
            );
          }

          // A replacer function keeps `$&`, `$1`, ... inside generated alt text literal;
          // a replacement string would have them expanded by String.prototype.replace.
          const rewritten = `![${altText}](${imageUrl})`;
          document.content = document.content.replace(image, () => rewritten);
        })
      );
    })
  );

  return documents;
};
|
||||
|
||||
/**
 * Keeps only the documents whose source URL depth does not exceed `this.maxCrawledDepth`.
 *
 * @param documents Documents to filter; each `metadata.sourceURL` is parsed with `new URL`,
 *   which throws on a malformed URL.
 * @returns A new array with the documents that are within the depth limit.
 */
filterDepth(documents: Document[]): Document[] {
  const isWithinDepth = (doc: Document): boolean => {
    const parsedUrl = new URL(doc.metadata.sourceURL);
    const depth = getURLDepth(parsedUrl.toString());
    return depth <= this.maxCrawledDepth;
  };

  return documents.filter((doc) => isWithinDepth(doc));
}
|
||||
}
|
|
@ -1,89 +0,0 @@
|
|||
import axios from "axios";
|
||||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
 * Scrapes a URL with a plain Axios GET request.
 *
 * A response whose content type contains `application/pdf` is handed to `fetchAndProcessPdf`;
 * any other response body is returned as-is (JSON parsing is disabled). The outcome is always
 * recorded through `logScrape`, including on failure.
 *
 * @param url The URL to scrape
 * @param pageOptions Page options; only `parsePDF` is read, and it is forwarded to the PDF processor
 * @returns The scraped content together with the page status code and error, if any
 */
export async function scrapWithFetch(
  url: string,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
  const logParams = {
    url,
    scraper: "fetch",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const response = await axios.get(url, {
      headers: {
        "Content-Type": "application/json",
      },
      timeout: universalTimeout,
      // Identity transform: keep the raw body instead of letting axios parse JSON.
      transformResponse: [(data) => data],
    });

    const status = response.status;
    if (status !== 200) {
      Logger.debug(
        `⛏️ Axios: Failed to fetch url: ${url} with status: ${status}`
      );
      logParams.error_message = response.statusText;
      logParams.response_code = status;
      return {
        content: "",
        pageStatusCode: status,
        pageError: response.statusText,
      };
    }

    const contentType = response.headers["content-type"];
    const isPdf = contentType && contentType.includes("application/pdf");

    if (!isPdf) {
      const body = response.data;
      logParams.success = true;
      logParams.html = body;
      logParams.response_code = status;
      return {
        content: body,
        pageStatusCode: status,
        pageError: null,
      };
    }

    logParams.success = true;
    const pdfResult = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
    logParams.response_code = pdfResult.pageStatusCode;
    logParams.error_message = pdfResult.pageError;
    // The status reported to the caller is the one from the initial GET; the PDF
    // processor's status is only logged.
    return {
      content: pdfResult.content,
      pageStatusCode: status,
      pageError: pdfResult.pageError,
    };
  } catch (error) {
    const timedOut = error.code === "ECONNABORTED";
    if (timedOut) {
      logParams.error_message = "Request timed out";
      Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
    } else {
      logParams.error_message = error.message || error;
      Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
    }
    return {
      content: "",
      pageStatusCode: error.response?.status ?? null,
      pageError: logParams.error_message,
    };
  } finally {
    logParams.time_taken_seconds = (Date.now() - logParams.startTime) / 1000;
    await logScrape(logParams);
  }
}
|
|
@ -1,230 +0,0 @@
|
|||
import axios from "axios";
|
||||
import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
||||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { generateRequestParams } from "../single_url";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import axiosRetry from 'axios-retry';
|
||||
|
||||
// Retry failed requests up to 3 times with exponential backoff, logging each retry through
// the shared Logger like the other fire-engine diagnostics in this module.
// NOTE(review): this registers retry on the default axios export, while scrapWithFireEngine
// sends its requests through an instance from axios.create() — confirm that instance is
// actually covered, and that retrying on the shared default export is intended.
axiosRetry(axios, {
  retries: 3,
  onRetry: () => {
    Logger.debug("Retrying (fire-engine)...");
  },
  retryDelay: axiosRetry.exponentialDelay,
});
|
||||
/**
 * Scrapes a URL with Fire-Engine.
 *
 * A job is submitted with `instantReturn: true` and its status is polled every 250 ms until
 * it stops reporting `processing` or the deadline (`universalTimeout` + the sum of all "wait"
 * actions + 5 s) passes; a job still processing at that point is deleted. URL-specific
 * parameters from `generateRequestParams` take precedence over the arguments. The outcome is
 * always recorded through `logScrape`.
 *
 * Neither `pageOptions` nor `fireEngineOptions` (nor the URL-specific options object) is
 * modified: engine and atsv overrides are applied to local copies.
 *
 * @param url The URL to scrape
 * @param actions Actions forwarded to Fire-Engine; "wait" actions extend the polling deadline
 * @param waitFor The time to wait for the page to load
 * @param screenshot Whether to take a screenshot
 * @param fullPageScreenshot Whether to take a full page screenshot
 * @param pageOptions The options for the page; `useFastMode` forces the "tlsclient" engine,
 *   `atsv` is honoured only for teams listed in the BETA_CUSTOMERS environment variable
 * @param fireEngineOptions Extra Fire-Engine options spread into the request body
 * @param headers The headers to send with the request
 * @param options The options for the request; `endpoint: "request"` selects "/request"
 * @param priority Priority forwarded to Fire-Engine
 * @param teamId Team used for the atsv beta check
 * @returns The scraped content
 */
export async function scrapWithFireEngine({
  url,
  actions,
  waitFor = 0,
  screenshot = false,
  fullPageScreenshot = false,
  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, removeBase64Images: true, mobile: false },
  fireEngineOptions = {},
  headers,
  options,
  priority,
  teamId,
}: {
  url: string;
  actions?: Action[];
  waitFor?: number;
  screenshot?: boolean;
  fullPageScreenshot?: boolean;
  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, removeBase64Images?: boolean, mobile?: boolean };
  fireEngineOptions?: FireEngineOptions;
  headers?: Record<string, string>;
  options?: any;
  priority?: number;
  teamId?: string;
}): Promise<FireEngineResponse> {
  const logParams = {
    url,
    scraper: "fire-engine",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };
  // Page options reported to logScrape; replaced once the atsv gate has been evaluated.
  let loggedPageOptions = pageOptions;

  try {
    const reqParams = await generateRequestParams(url);
    const waitParam = reqParams["params"]?.wait ?? waitFor;
    const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
    const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
    // Shallow copy: the source object belongs either to the caller or to the URL-specific
    // configuration, and the overrides below must not leak into later requests.
    const fireEngineOptionsParam: FireEngineOptions = {
      ...(reqParams["params"]?.fireEngineOptions ?? fireEngineOptions),
    };

    const endpoint = options?.endpoint === "request" ? "/request" : "/scrape";

    let engine = engineParam; // do we want fireEngineOptions as first choice?

    if (pageOptions?.useFastMode) {
      fireEngineOptionsParam.engine = "tlsclient";
      engine = "tlsclient";
    }

    Logger.info(
      `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
    );

    // atsv is only available for beta customers
    const betaCustomersString = process.env.BETA_CUSTOMERS;
    const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
    const atsvEnabled =
      Boolean(pageOptions?.atsv) && teamId != null && betaCustomers.includes(teamId);

    if (atsvEnabled) {
      fireEngineOptionsParam.atsv = true;
    }
    loggedPageOptions = { ...pageOptions, atsv: atsvEnabled };

    const axiosInstance = axios.create({
      headers: { "Content-Type": "application/json" }
    });

    const startTime = Date.now();
    const _response = await Sentry.startSpan({
      name: "Call to fire-engine"
    }, async span => {
      return await axiosInstance.post(
        process.env.FIRE_ENGINE_BETA_URL + endpoint,
        {
          url: url,
          headers: headers,
          wait: waitParam,
          screenshot: screenshotParam,
          fullPageScreenshot: fullPageScreenshotParam,
          disableJsDom: pageOptions?.disableJsDom ?? false,
          priority,
          engine,
          instantReturn: true,
          mobile: pageOptions?.mobile ?? false,
          // Spread after the keys above so explicit Fire-Engine options win over them;
          // the keys below in turn override whatever the spread supplied.
          ...fireEngineOptionsParam,
          atsv: atsvEnabled,
          scrollXPaths: pageOptions?.scrollXPaths ?? [],
          geolocation: pageOptions?.geolocation,
          skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
          removeBase64Images: pageOptions?.removeBase64Images ?? true,
          actions: actions,
        },
        {
          headers: {
            "Content-Type": "application/json",
            ...(Sentry.isInitialized() ? ({
              "sentry-trace": Sentry.spanToTraceHeader(span),
              "baggage": Sentry.spanToBaggageHeader(span),
            }) : {}),
          }
        }
      );
    });

    const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);

    const jobUrl = `${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`;
    let checkStatusResponse = await axiosInstance.get(jobUrl);

    // added 5 seconds to the timeout to account for 'smart wait'
    while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal + 5000) {
      await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
      checkStatusResponse = await axiosInstance.get(jobUrl);
    }

    if (checkStatusResponse.data.processing) {
      Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
      // Best-effort cleanup: not awaited, any status is accepted, failures are only logged.
      axiosInstance.delete(jobUrl, {
        validateStatus: (status) => true
      }).catch((error) => {
        Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
      });

      Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
      logParams.error_message = "Request timed out";
      return { html: "", pageStatusCode: null, pageError: "" };
    }

    if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
      Logger.debug(
        `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}`
      );

      logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
      logParams.response_code = checkStatusResponse.data?.pageStatusCode;

      if (checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
        Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
      }

      // Prefer the page's own status; a DNS resolution failure is reported as 404.
      const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;

      return {
        html: "",
        pageStatusCode,
        pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
      };
    }

    const contentType = checkStatusResponse.data.responseHeaders?.["content-type"];

    if (contentType && contentType.includes("application/pdf")) {
      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
        url,
        pageOptions?.parsePDF
      );
      logParams.success = true;
      logParams.response_code = pageStatusCode;
      logParams.error_message = pageError;
      return { html: content, pageStatusCode, pageError };
    } else {
      const data = checkStatusResponse.data;

      logParams.success =
        (data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
        data.pageStatusCode === 404;
      logParams.html = data.content ?? "";
      logParams.response_code = data.pageStatusCode;
      logParams.error_message = data.pageError ?? data.error;

      return {
        html: data.content ?? "",
        // Fall back to the single `screenshot` field only when it is present, so the
        // list never contains an undefined entry.
        screenshots: data.screenshots ?? (data.screenshot ? [data.screenshot] : []),
        pageStatusCode: data.pageStatusCode,
        pageError: data.pageError ?? data.error,
        scrapeActionContent: data?.actionContent ?? [],
      };
    }
  } catch (error) {
    if (error.code === "ECONNABORTED") {
      Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`);
      logParams.error_message = "Request timed out";
    } else {
      Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`);
      logParams.error_message = error.message || error;
    }
    return { html: "", pageStatusCode: null, pageError: logParams.error_message };
  } finally {
    const endTime = Date.now();
    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
    await logScrape(logParams, loggedPageOptions);
  }
}
|
||||
|
||||
|
|
@ -1,111 +0,0 @@
|
|||
import axios from "axios";
|
||||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { generateRequestParams } from "../single_url";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
 * Scrapes a URL through the Playwright microservice.
 *
 * The service at PLAYWRIGHT_MICROSERVICE_URL is called with the URL, the wait time and the
 * headers; its JSON reply is parsed manually because automatic parsing is disabled. A reply
 * whose content type contains `application/pdf` is handed to `fetchAndProcessPdf` instead.
 * The outcome is always recorded through `logScrape`.
 *
 * @param url The URL to scrape
 * @param waitFor The time to wait for the page to load; a URL-specific `wait` value from
 *   `generateRequestParams` takes precedence
 * @param headers The headers to send with the request
 * @param pageOptions The options for the page; only `parsePDF` is read
 * @returns The scraped content
 */
export async function scrapWithPlaywright(
  url: string,
  waitFor: number = 0,
  headers?: Record<string, string>,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
  const logParams = {
    url,
    scraper: "playwright",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const reqParams = await generateRequestParams(url);
    // A wait configured for this specific URL overrides the caller's value.
    const waitParam = reqParams["params"]?.wait ?? waitFor;
    // The wait time is added to the timeout so waiting does not eat into the scrape budget.
    const requestTimeout = universalTimeout + waitParam;

    const payload = {
      url: url,
      wait_after_load: waitParam,
      timeout: requestTimeout,
      headers: headers,
    };
    const requestConfig = {
      headers: {
        "Content-Type": "application/json",
      },
      timeout: requestTimeout,
      // Identity transform: keep the raw body instead of letting axios parse JSON.
      transformResponse: [(data) => data],
    };
    const response = await axios.post(
      process.env.PLAYWRIGHT_MICROSERVICE_URL,
      payload,
      requestConfig
    );

    if (response.status !== 200) {
      Logger.debug(
        `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
      );
      logParams.error_message = response.data?.pageError;
      logParams.response_code = response.data?.pageStatusCode;
      return {
        content: "",
        pageStatusCode: response.data?.pageStatusCode,
        pageError: response.data?.pageError,
      };
    }

    const contentType = response.headers["content-type"];
    const isPdf = contentType && contentType.includes("application/pdf");

    if (isPdf) {
      logParams.success = true;
      const pdfResult = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
      logParams.response_code = pdfResult.pageStatusCode;
      logParams.error_message = pdfResult.pageError;
      return {
        content: pdfResult.content,
        pageStatusCode: pdfResult.pageStatusCode,
        pageError: pdfResult.pageError,
      };
    }

    const rawBody = response.data;
    try {
      const parsed = JSON.parse(rawBody);
      const html = parsed.content;
      logParams.success = true;
      logParams.html = html;
      logParams.response_code = parsed.pageStatusCode;
      logParams.error_message = parsed.pageError;
      return {
        content: html ?? "",
        pageStatusCode: parsed.pageStatusCode,
        pageError: parsed.pageError,
      };
    } catch (jsonError) {
      logParams.error_message = jsonError.message || jsonError;
      Logger.debug(
        `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
      );
      return { content: "", pageStatusCode: null, pageError: logParams.error_message };
    }
  } catch (error) {
    const timedOut = error.code === "ECONNABORTED";
    if (timedOut) {
      logParams.error_message = "Request timed out";
      Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
    } else {
      logParams.error_message = error.message || error;
      Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
    }
    return { content: "", pageStatusCode: null, pageError: logParams.error_message };
  } finally {
    logParams.time_taken_seconds = (Date.now() - logParams.startTime) / 1000;
    await logScrape(logParams);
  }
}
|
|
@ -1,92 +0,0 @@
|
|||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { generateRequestParams } from "../single_url";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
 * Scrapes a URL with ScrapingBee.
 *
 * Request parameters come from `generateRequestParams`, with `transparent_status_code`
 * added. A response whose content type contains `application/pdf` is handed to
 * `fetchAndProcessPdf`; any other body is decoded with `TextDecoder`. The outcome is always
 * recorded through `logScrape`, under "scrapingBeeLoad" when `wait_browser` is
 * "networkidle2" and "scrapingBee" otherwise.
 *
 * @param url The URL to scrape
 * @param wait_browser The browser event to wait for
 * @param timeout The timeout for the scrape
 * @param pageOptions The options for the page; only `parsePDF` is read
 * @returns The scraped content
 */
export async function scrapWithScrapingBee(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = universalTimeout,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
  const scraperName =
    wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee";
  const logParams = {
    url,
    scraper: scraperName,
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
    const clientParams = await generateRequestParams(url, wait_browser, timeout);
    const requestParams = {
      ...clientParams.params,
      transparent_status_code: "True",
    };
    const response = await client.get({ ...clientParams, params: requestParams });
    Logger.info(
      `⛏️ ScrapingBee: Scraping ${url}`
    );

    const contentType = response.headers["content-type"];
    const isPdf = contentType && contentType.includes("application/pdf");

    if (isPdf) {
      logParams.success = true;
      const pdfResult = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
      logParams.response_code = pdfResult.pageStatusCode;
      logParams.error_message = pdfResult.pageError;
      return {
        content: pdfResult.content,
        pageStatusCode: pdfResult.pageStatusCode,
        pageError: pdfResult.pageError,
      };
    }

    let text = "";
    try {
      text = new TextDecoder().decode(response.data);
      logParams.success = true;
    } catch (decodeError) {
      Logger.debug(
        `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
      );
      logParams.error_message = decodeError.message || decodeError;
    }

    // The values logged and returned are derived from the HTTP status and status text;
    // they replace whatever the decode step above recorded.
    const status = response.status;
    const statusError = response.statusText !== "OK" ? response.statusText : undefined;
    logParams.response_code = status;
    logParams.html = text;
    logParams.success = (status >= 200 && status < 300) || status === 404;
    logParams.error_message = statusError;
    return {
      content: text,
      pageStatusCode: status,
      pageError: statusError,
    };
  } catch (error) {
    Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
    logParams.error_message = error.message || error;
    logParams.response_code = error.response?.status;
    return {
      content: "",
      pageStatusCode: error.response?.status,
      pageError: error.response?.statusText,
    };
  } finally {
    logParams.time_taken_seconds = (Date.now() - logParams.startTime) / 1000;
    await logScrape(logParams);
  }
}
|
|
@ -1,506 +0,0 @@
|
|||
import * as cheerio from "cheerio";
|
||||
import { extractMetadata } from "./utils/metadata";
|
||||
import dotenv from "dotenv";
|
||||
import {
|
||||
Document,
|
||||
PageOptions,
|
||||
FireEngineResponse,
|
||||
ExtractorOptions,
|
||||
Action,
|
||||
} from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
import { handleCustomScraping } from "./custom/handleCustomScraping";
|
||||
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
|
||||
import { scrapWithFetch } from "./scrapers/fetch";
|
||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
import { scrapWithPlaywright } from "./scrapers/playwright";
|
||||
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
|
||||
import { extractLinks } from "./utils/utils";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||
import { clientSideError } from "../../strings";
|
||||
import { ScrapeActionContent } from "../../lib/entities";
|
||||
import { removeBase64Images } from "./utils/removeBase64Images";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
// A scraper backend counts as configured when its environment variable is set and non-empty.
const useScrapingBee = !(
  process.env.SCRAPING_BEE_API_KEY === undefined ||
  process.env.SCRAPING_BEE_API_KEY === ''
);
const useFireEngine = !(
  process.env.FIRE_ENGINE_BETA_URL === undefined ||
  process.env.FIRE_ENGINE_BETA_URL === ''
);

/**
 * Names of the scrapers enabled for this process, in their base order.
 * Playwright is listed only when Fire-Engine is not configured; "fetch" is always present.
 */
export const baseScrapers = [
  { name: "fire-engine;chrome-cdp", enabled: useFireEngine },
  { name: "fire-engine", enabled: useFireEngine },
  { name: "scrapingBee", enabled: useScrapingBee },
  { name: "playwright", enabled: !useFireEngine },
  { name: "scrapingBeeLoad", enabled: useScrapingBee },
  { name: "fetch", enabled: true },
]
  .map((candidate) => (candidate.enabled ? candidate.name : undefined))
  .filter(Boolean);
|
||||
|
||||
/**
 * Builds the request parameters for scraping a URL.
 *
 * Starts from defaults (the URL, `timeout` / `wait_browser` params and a
 * "ScrapingService-Request" header) and, when `urlSpecificParams` has an entry for the URL's
 * hostname (leading "www." removed), shallow-merges that entry over the defaults, so a
 * top-level key in the entry replaces the default one entirely.
 *
 * @param url The URL being scraped
 * @param wait_browser The browser event to wait for
 * @param timeout The timeout in milliseconds
 * @returns The merged parameters; the plain defaults when there is no entry for the host or
 *   the URL cannot be parsed (the error is logged)
 */
export async function generateRequestParams(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = 15000
): Promise<any> {
  const baseParams = {
    url: url,
    params: { timeout: timeout, wait_browser: wait_browser },
    headers: { "ScrapingService-Request": "TRUE" },
  };

  try {
    const hostKey = new URL(url).hostname.replace(/^www\./, "");
    if (!urlSpecificParams.hasOwnProperty(hostKey)) {
      return baseParams;
    }
    return { ...baseParams, ...urlSpecificParams[hostKey] };
  } catch (error) {
    Logger.error(`Error generating URL key: ${error}`);
    return baseParams;
  }
}
|
||||
|
||||
/**
 * Get the order of scrapers to be used for scraping a URL
 *
 * When actions are requested, only "fire-engine;chrome-cdp" is returned (or nothing if
 * Fire-Engine is not configured). Otherwise the result is `defaultScraper` (if given), then
 * the preferred order restricted to scrapers whose environment variable is set, then any
 * remaining available scrapers, with duplicates removed. `defaultScraper` itself is not
 * checked for availability.
 *
 * @param defaultScraper Scraper to try first, if any
 * @param isWaitPresent Currently unused; the heuristic that read it is disabled
 * @param isScreenshotPresent Currently unused; the heuristic that read it is disabled
 * @param isHeadersPresent Currently unused; the heuristic that read it is disabled
 * @param isActionsPresent Whether the request contains actions
 * @returns The order of scrapers to be used for scraping a URL
 */
function getScrapingFallbackOrder(
  defaultScraper?: string,
  isWaitPresent: boolean = false,
  isScreenshotPresent: boolean = false,
  isHeadersPresent: boolean = false,
  isActionsPresent: boolean = false,
) {
  if (isActionsPresent) {
    return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
  }

  // Environment variable each scraper depends on; scrapers without an entry are always available.
  const requiredEnvVar = new Map<string | undefined, string>([
    ["scrapingBee", "SCRAPING_BEE_API_KEY"],
    ["scrapingBeeLoad", "SCRAPING_BEE_API_KEY"],
    ["fire-engine", "FIRE_ENGINE_BETA_URL"],
    ["fire-engine;chrome-cdp", "FIRE_ENGINE_BETA_URL"],
    ["playwright", "PLAYWRIGHT_MICROSERVICE_URL"],
  ]);
  const availableScrapers = baseScrapers.filter((scraper) => {
    const envVar = requiredEnvVar.get(scraper);
    return envVar === undefined ? true : !!process.env[envVar];
  });

  // Preferred order. Unlike baseScrapers, "scrapingBeeLoad" comes before "playwright" here.
  // (A former rule that promoted fire-engine/playwright when wait, screenshot or headers
  // were requested is disabled, which is why those flags are not consulted.)
  const preferredOrder = [
    useFireEngine ? "fire-engine;chrome-cdp" : undefined,
    useFireEngine ? "fire-engine" : undefined,
    useScrapingBee ? "scrapingBee" : undefined,
    useScrapingBee ? "scrapingBeeLoad" : undefined,
    useFireEngine ? undefined : "playwright",
    "fetch",
  ].filter(Boolean);

  const preferredAndAvailable = preferredOrder.filter(
    (scraper: (typeof baseScrapers)[number]) =>
      availableScrapers.includes(scraper)
  );

  const candidates = [...preferredAndAvailable, ...availableScrapers];
  if (defaultScraper) {
    candidates.unshift(defaultScraper);
  }

  // A Set keeps first-insertion order, so this drops duplicates without reordering.
  const orderedUnique = Array.from(new Set(candidates));
  return orderedUnique as (typeof baseScrapers)[number][];
}
|
||||
|
||||
|
||||
|
||||
export async function scrapSingleUrl(
|
||||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions?: ExtractorOptions,
|
||||
existingHtml?: string,
|
||||
priority?: number,
|
||||
teamId?: string
|
||||
): Promise<Document> {
|
||||
pageOptions = {
|
||||
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
||||
includeExtract: pageOptions.includeExtract ?? false,
|
||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||
includeHtml: pageOptions.includeHtml ?? false,
|
||||
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
||||
waitFor: pageOptions.waitFor ?? undefined,
|
||||
screenshot: pageOptions.screenshot ?? false,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
|
||||
headers: pageOptions.headers ?? undefined,
|
||||
includeLinks: pageOptions.includeLinks ?? true,
|
||||
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
parsePDF: pageOptions.parsePDF ?? true,
|
||||
removeTags: pageOptions.removeTags ?? [],
|
||||
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
|
||||
useFastMode: pageOptions.useFastMode ?? false,
|
||||
disableJsDom: pageOptions.disableJsDom ?? false,
|
||||
atsv: pageOptions.atsv ?? false,
|
||||
actions: pageOptions.actions ?? undefined,
|
||||
geolocation: pageOptions.geolocation ?? undefined,
|
||||
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
|
||||
removeBase64Images: pageOptions.removeBase64Images ?? true,
|
||||
mobile: pageOptions.mobile ?? false,
|
||||
}
|
||||
|
||||
if (extractorOptions) {
|
||||
extractorOptions = {
|
||||
mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
|
||||
}
|
||||
}
|
||||
|
||||
if (!existingHtml) {
|
||||
existingHtml = "";
|
||||
}
|
||||
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
const attemptScraping = async (
|
||||
url: string,
|
||||
method: (typeof baseScrapers)[number]
|
||||
) => {
|
||||
let scraperResponse: {
|
||||
text: string;
|
||||
screenshot: string;
|
||||
actions?: {
|
||||
screenshots?: string[];
|
||||
scrapes?: ScrapeActionContent[];
|
||||
};
|
||||
metadata: { pageStatusCode?: number; pageError?: string | null };
|
||||
} = { text: "", screenshot: "", metadata: {} };
|
||||
let screenshot = "";
|
||||
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(jobId, {
|
||||
type: "scrape",
|
||||
url,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method,
|
||||
result: null,
|
||||
});
|
||||
|
||||
switch (method) {
|
||||
case "fire-engine":
|
||||
case "fire-engine;chrome-cdp":
|
||||
|
||||
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
|
||||
if (method === "fire-engine;chrome-cdp") {
|
||||
engine = "chrome-cdp";
|
||||
}
|
||||
|
||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||
const processedActions: Action[] = pageOptions.actions?.flatMap((action: Action, index: number, array: Action[]) => {
|
||||
if (action.type === "click" || action.type === "write" || action.type === "press") {
|
||||
const result: Action[] = [];
|
||||
// Don't add a wait if the previous action is a wait
|
||||
// if (index === 0 || array[index - 1].type !== "wait") {
|
||||
// result.push({ type: "wait", milliseconds: 1200 } as Action);
|
||||
// }
|
||||
// Fire-engine now handles wait times automatically, leaving the code here for now
|
||||
result.push(action);
|
||||
// Don't add a wait if the next action is a wait
|
||||
// if (index === array.length - 1 || array[index + 1].type !== "wait") {
|
||||
// result.push({ type: "wait", milliseconds: 1200 } as Action);
|
||||
// }
|
||||
return result;
|
||||
}
|
||||
return [action as Action];
|
||||
}) ?? [] as Action[];
|
||||
|
||||
const response = await scrapWithFireEngine({
|
||||
url,
|
||||
...(engine === "chrome-cdp" ? ({
|
||||
actions: [
|
||||
...(pageOptions.waitFor ? [{
|
||||
type: "wait" as const,
|
||||
milliseconds: pageOptions.waitFor,
|
||||
}] : []),
|
||||
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
|
||||
type: "screenshot" as const,
|
||||
fullPage: !!pageOptions.fullPageScreenshot,
|
||||
}] : []),
|
||||
...processedActions,
|
||||
],
|
||||
}) : ({
|
||||
waitFor: pageOptions.waitFor,
|
||||
screenshot: pageOptions.screenshot,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
||||
})),
|
||||
pageOptions: pageOptions,
|
||||
headers: pageOptions.headers,
|
||||
fireEngineOptions: {
|
||||
engine: engine,
|
||||
atsv: pageOptions.atsv,
|
||||
disableJsDom: pageOptions.disableJsDom,
|
||||
},
|
||||
priority,
|
||||
teamId,
|
||||
});
|
||||
scraperResponse.text = response.html;
|
||||
if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
|
||||
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
|
||||
}
|
||||
if (pageOptions.actions) {
|
||||
scraperResponse.actions = {
|
||||
screenshots: response.screenshots ?? [],
|
||||
scrapes: response.scrapeActionContent ?? [],
|
||||
};
|
||||
}
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "scrapingBee":
|
||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
||||
const response = await scrapWithScrapingBee(
|
||||
url,
|
||||
"domcontentloaded",
|
||||
pageOptions.fallback === false ? 7000 : 15000
|
||||
);
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "playwright":
|
||||
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
|
||||
const response = await scrapWithPlaywright(
|
||||
url,
|
||||
pageOptions.waitFor,
|
||||
pageOptions.headers
|
||||
);
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "scrapingBeeLoad":
|
||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
||||
const response = await scrapWithScrapingBee(url, "networkidle2");
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "fetch":
|
||||
const response = await scrapWithFetch(url);
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
break;
|
||||
}
|
||||
|
||||
let customScrapedContent: FireEngineResponse | null = null;
|
||||
|
||||
// Check for custom scraping conditions
|
||||
const customScraperResult = await handleCustomScraping(
|
||||
scraperResponse.text,
|
||||
url
|
||||
);
|
||||
|
||||
if (customScraperResult) {
|
||||
switch (customScraperResult.scraper) {
|
||||
case "fire-engine":
|
||||
customScrapedContent = await scrapWithFireEngine({
|
||||
url: customScraperResult.url,
|
||||
actions: customScraperResult.waitAfterLoad ? ([
|
||||
{
|
||||
type: "wait",
|
||||
milliseconds: customScraperResult.waitAfterLoad,
|
||||
}
|
||||
]) : ([]),
|
||||
pageOptions: customScraperResult.pageOptions,
|
||||
});
|
||||
break;
|
||||
case "pdf":
|
||||
const { content, pageStatusCode, pageError } =
|
||||
await fetchAndProcessPdf(
|
||||
customScraperResult.url,
|
||||
pageOptions?.parsePDF
|
||||
);
|
||||
customScrapedContent = {
|
||||
html: content,
|
||||
pageStatusCode,
|
||||
pageError,
|
||||
};
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (customScrapedContent) {
|
||||
scraperResponse.text = customScrapedContent.html;
|
||||
}
|
||||
//* TODO: add an optional to return markdown or structured/extracted content
|
||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||
let text = await parseMarkdown(cleanedHtml);
|
||||
if (pageOptions.removeBase64Images) {
|
||||
text = await removeBase64Images(text);
|
||||
}
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: scraperResponse.text.length,
|
||||
success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
|
||||
error: scraperResponse.metadata.pageError,
|
||||
response_code: scraperResponse.metadata.pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
|
||||
return {
|
||||
text,
|
||||
html: cleanedHtml,
|
||||
rawHtml: scraperResponse.text,
|
||||
screenshot: scraperResponse.screenshot,
|
||||
actions: scraperResponse.actions,
|
||||
pageStatusCode: scraperResponse.metadata.pageStatusCode,
|
||||
pageError: scraperResponse.metadata.pageError || undefined,
|
||||
};
|
||||
};
|
||||
|
||||
let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
|
||||
text: "",
|
||||
html: "",
|
||||
rawHtml: "",
|
||||
screenshot: "",
|
||||
actions: undefined,
|
||||
pageStatusCode: 200,
|
||||
pageError: undefined,
|
||||
};
|
||||
try {
|
||||
let urlKey = urlToScrap;
|
||||
try {
|
||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
||||
} catch (error) {
|
||||
Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
|
||||
}
|
||||
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
|
||||
const scrapersInOrder = getScrapingFallbackOrder(
|
||||
defaultScraper,
|
||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
|
||||
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
|
||||
);
|
||||
|
||||
for (const scraper of scrapersInOrder) {
|
||||
// If exists text coming from crawler, use it
|
||||
if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) {
|
||||
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
|
||||
text = await parseMarkdown(cleanedHtml);
|
||||
html = cleanedHtml;
|
||||
break;
|
||||
}
|
||||
|
||||
const attempt = await attemptScraping(urlToScrap, scraper);
|
||||
text = attempt.text ?? "";
|
||||
html = attempt.html ?? "";
|
||||
rawHtml = attempt.rawHtml ?? "";
|
||||
screenshot = attempt.screenshot ?? "";
|
||||
actions = attempt.actions ?? undefined;
|
||||
|
||||
if (attempt.pageStatusCode) {
|
||||
pageStatusCode = attempt.pageStatusCode;
|
||||
}
|
||||
|
||||
if (attempt.pageError && (attempt.pageStatusCode >= 400 || scrapersInOrder.indexOf(scraper) === scrapersInOrder.length - 1)) { // force pageError if it's the last scraper and it failed too
|
||||
pageError = attempt.pageError;
|
||||
|
||||
if (attempt.pageStatusCode < 400 || !attempt.pageStatusCode) {
|
||||
pageStatusCode = 500;
|
||||
}
|
||||
} else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
|
||||
pageError = undefined;
|
||||
}
|
||||
|
||||
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
|
||||
break;
|
||||
}
|
||||
if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`);
|
||||
break;
|
||||
}
|
||||
// const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
// if (nextScraperIndex < scrapersInOrder.length) {
|
||||
// Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
||||
// }
|
||||
}
|
||||
|
||||
if (!text) {
|
||||
throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
|
||||
}
|
||||
|
||||
const soup = cheerio.load(rawHtml);
|
||||
const metadata = extractMetadata(soup, urlToScrap);
|
||||
|
||||
let linksOnPage: string[] | undefined;
|
||||
|
||||
if (pageOptions.includeLinks) {
|
||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||
}
|
||||
|
||||
let document: Document = {
|
||||
content: text,
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||
? rawHtml
|
||||
: undefined,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
actions,
|
||||
metadata: {
|
||||
...metadata,
|
||||
...(screenshot && screenshot.length > 0 ? ({
|
||||
screenshot,
|
||||
}) : {}),
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
};
|
||||
|
||||
return document;
|
||||
} catch (error) {
|
||||
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
|
||||
ScrapeEvents.insert(jobId, {
|
||||
type: "error",
|
||||
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
|
||||
stack: error.stack,
|
||||
});
|
||||
|
||||
return {
|
||||
content: "",
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
|
||||
html: "",
|
||||
linksOnPage: pageOptions.includeLinks ? [] : undefined,
|
||||
metadata: {
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
} as Document;
|
||||
}
|
||||
}
|
|
@ -1,9 +1,10 @@
|
|||
import axios from "axios";
|
||||
import { axiosTimeout } from "../../lib/timeout";
|
||||
import { parseStringPromise } from "xml2js";
|
||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
import { WebCrawler } from "./crawler";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { logger } from "../../lib/logger";
|
||||
import { scrapeURL } from "../scrapeURL";
|
||||
import { scrapeOptions } from "../../controllers/v1/types";
|
||||
|
||||
export async function getLinksFromSitemap(
|
||||
{
|
||||
|
@ -17,17 +18,20 @@ export async function getLinksFromSitemap(
|
|||
}
|
||||
): Promise<string[]> {
|
||||
try {
|
||||
let content: string;
|
||||
let content: string = "";
|
||||
try {
|
||||
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = response.data;
|
||||
} else if (mode === 'fire-engine') {
|
||||
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"playwright" } });
|
||||
content = response.html;
|
||||
const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });;
|
||||
if (!response.success) {
|
||||
throw response.error;
|
||||
}
|
||||
content = response.document.rawHtml!;
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
|
||||
logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
|
||||
|
||||
return allUrls;
|
||||
}
|
||||
|
@ -47,7 +51,7 @@ export async function getLinksFromSitemap(
|
|||
allUrls.push(...validUrls);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
|
||||
logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
|
||||
}
|
||||
|
||||
return allUrls;
|
||||
|
|
|
@ -1,15 +0,0 @@
|
|||
import * as docxProcessor from "../docxProcessor";
|
||||
|
||||
// Integration suite for the DOCX processor: passes a public DOCX URL to
// fetchAndProcessDocx and checks the extracted text, status code and error.
describe("DOCX Processing Module - Integration Test", () => {
  it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
    // Make sure the processor runs with LLAMAPARSE_API_KEY unset.
    delete process.env.LLAMAPARSE_API_KEY;

    const docxUrl =
      "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx";
    const outcome = await docxProcessor.fetchAndProcessDocx(docxUrl);

    expect(outcome.content.trim()).toContain("SERIES A PREFERRED STOCK PURCHASE AGREEMENT");
    expect(outcome.pageStatusCode).toBe(200);
    expect(outcome.pageError).toBeUndefined();
  });
});
|
|
@ -1,128 +0,0 @@
|
|||
import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
// Test suite for parseTablesToMarkdown. Each case feeds an HTML snippet to the
// function and compares the awaited result with an exact expected string; in
// every non-empty case the expected Markdown table is wrapped in <div>...</div>.
describe('parseTablesToMarkdown', () => {
|
||||
// Header row plus two data rows: a `| --- |` divider row is expected after the header.
it('converts a simple HTML table to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th>Header 1</th><th>Header 2</th></tr>
|
||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
||||
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
// Header row plus a single data row.
it('converts a table with a single row to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th>Header 1</th><th>Header 2</th></tr>
|
||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
// One-column table: every expected row holds a single cell.
it('converts a table with a single column to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th>Header 1</th></tr>
|
||||
<tr><td>Row 1 Col 1</td></tr>
|
||||
<tr><td>Row 2 Col 1</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
// One header cell and one data cell.
it('converts a table with a single cell to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th>Header 1</th></tr>
|
||||
<tr><td>Row 1 Col 1</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
// No <th> cells: the expected output contains no `| --- |` divider row.
it('converts a table with no header to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
||||
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
// A <table> with no rows is expected to become an empty <div>.
it('converts a table with no rows to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div></div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
// A row without any cells is also expected to become an empty <div>.
it('converts a table with no cells to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div></div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
// A single empty <th> cell is expected to become an empty <div> as well.
it('converts a table with no columns to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th></th></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div></div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
// Empty input is expected to come back as an empty string.
it('converts a table with no table to Markdown', async () => {
|
||||
const html = ``;
|
||||
const expectedMarkdown = ``;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
// Table embedded in other markup: the surrounding <div>/<p> elements are
// expected to be kept and only the <table> replaced by its Markdown form.
it('converts a table inside of a bunch of html noise', async () => {
|
||||
const html = `
|
||||
<div>
|
||||
<p>Some text before</p>
|
||||
<table>
|
||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
||||
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
|
||||
</table>
|
||||
<p>Some text after</p>
|
||||
</div>
|
||||
`;
|
||||
const expectedMarkdown = `<div>
|
||||
<p>Some text before</p>
|
||||
<div>| Row 1 Col 1 | Row 1 Col 2 |
|
||||
| Row 2 Col 1 | Row 2 Col 2 |</div>
|
||||
<p>Some text after</p>
|
||||
</div>`;
|
||||
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
});
|
|
@ -1,19 +0,0 @@
|
|||
import * as pdfProcessor from '../pdfProcessor';
|
||||
|
||||
// Integration suite for the PDF processor: calls fetchAndProcessPdf with public
// PDF URLs, once with parsing enabled and once with parsing disabled.
describe('PDF Processing Module - Integration Test', () => {
  it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
    // Make sure the processor runs with LLAMAPARSE_API_KEY unset.
    delete process.env.LLAMAPARSE_API_KEY;

    const pdfUrl =
      'https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf';
    const outcome = await pdfProcessor.fetchAndProcessPdf(pdfUrl, true);

    expect(outcome.content.trim()).toEqual("Dummy PDF file");
    expect(outcome.pageStatusCode).toEqual(200);
    expect(outcome.pageError).toBeUndefined();
  });

  it('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
    const pdfUrl = 'https://arxiv.org/pdf/astro-ph/9301001.pdf';
    const outcome = await pdfProcessor.fetchAndProcessPdf(pdfUrl, false);

    expect(outcome.pageStatusCode).toBe(200);
    expect(outcome.pageError).toBeUndefined();
    // With the second argument false, the content is expected to still contain raw PDF markers.
    expect(outcome.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
  }, 60000); // 60 seconds
});
|
File diff suppressed because one or more lines are too long
|
@ -1,127 +0,0 @@
|
|||
import { Document } from "../../../../lib/entities";
|
||||
import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths";
|
||||
|
||||
// Test suite for the Markdown path-rewriting helpers in ../replacePaths.
//
// Every case builds a one-document fixture, runs it through the helper and
// compares the result with an INDEPENDENTLY built expected fixture. The
// "should not alter ..." cases previously compared the result with the very
// array handed to the helper; if the helper mutates its input in place, that
// comparison passes regardless of what was changed.
describe('replacePaths', () => {
  const SOURCE_URL = 'https://example.com';

  /**
   * Builds a fresh single-document fixture.
   *
   * @param content  value for the document's `content` field
   * @param markdown value for the document's `markdown` field (defaults to `content`)
   * @returns a new array holding one new document object
   */
  const makeDocuments = (content: string, markdown: string = content): Document[] => [
    { metadata: { sourceURL: SOURCE_URL }, content, markdown },
  ];

  describe('replacePathsWithAbsolutePaths', () => {
    it('should replace relative paths with absolute paths', () => {
      const documents = makeDocuments('This is a [link](/path/to/resource).');
      const expectedDocuments = makeDocuments('This is a [link](https://example.com/path/to/resource).');

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should not alter absolute URLs', () => {
      const text = 'This is an [external link](https://external.com/path).';

      const result = replacePathsWithAbsolutePaths(makeDocuments(text));
      expect(result).toEqual(makeDocuments(text)); // Expect no change
    });

    it('should not alter data URLs for images', () => {
      const text = 'This is an image: ![alt text]().';

      const result = replacePathsWithAbsolutePaths(makeDocuments(text));
      expect(result).toEqual(makeDocuments(text)); // Expect no change
    });

    it('should handle multiple links and images correctly', () => {
      const documents = makeDocuments('Here are two links: [link1](/path1) and [link2](/path2).');
      const expectedDocuments = makeDocuments(
        'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).'
      );

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should correctly handle a mix of absolute and relative paths', () => {
      const documents = makeDocuments(
        'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().'
      );
      const expectedDocuments = makeDocuments(
        'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().'
      );

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });
  });

  describe('replaceImgPathsWithAbsolutePaths', () => {
    it('should replace relative image paths with absolute paths', () => {
      const documents = makeDocuments('Here is an image: ![alt text](/path/to/image.jpg).');
      const expectedDocuments = makeDocuments(
        'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
      );

      const result = replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should not alter data:image URLs', () => {
      // The markdown fixture used the misspelled media-type parameter "base4";
      // a data URL marks its payload encoding with ";base64".
      const content = 'An image with a data URL: ![alt text]().';
      const markdown = 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).';

      const result = replaceImgPathsWithAbsolutePaths(makeDocuments(content, markdown));
      expect(result).toEqual(makeDocuments(content, markdown)); // Expect no change
    });

    it('should handle multiple images with a mix of data and relative URLs', () => {
      const documents = makeDocuments('Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).');
      const expectedDocuments = makeDocuments(
        'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).'
      );

      const result = replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });
  });
});
|
|
@ -1,66 +0,0 @@
|
|||
import { Logger } from '../../../../lib/logger';
|
||||
import { isUrlBlocked } from '../blocklist';
|
||||
|
||||
// Test suite for isUrlBlocked, covering three URL groups: social-media URLs
// expected to be blocked, social-media URLs with an allowed keyword in the path,
// and unrelated URLs (including bare host names without a scheme).
describe('isUrlBlocked', () => {
  it('should return true for blocked social media URLs', () => {
    const socialMediaUrls = [
      'https://www.facebook.com',
      'https://twitter.com/someuser',
      'https://instagram.com/someuser',
      'https://www.linkedin.com/in/someuser',
      'https://snapchat.com/someuser',
      'https://tiktok.com/@someuser',
      'https://reddit.com/r/somesubreddit',
      'https://flickr.com/photos/someuser',
      'https://whatsapp.com/someuser',
      'https://wechat.com/someuser',
      'https://telegram.org/someuser',
    ];

    for (const candidate of socialMediaUrls) {
      // Log the offending URL first so a failing assertion is easy to trace.
      if (!isUrlBlocked(candidate)) {
        Logger.debug(`URL not blocked: ${candidate}`);
      }
      expect(isUrlBlocked(candidate)).toBe(true);
    }
  });

  it('should return false for URLs containing allowed keywords', () => {
    const keywordUrls = [
      'https://www.facebook.com/privacy',
      'https://twitter.com/terms',
      'https://instagram.com/legal',
      'https://www.linkedin.com/help',
      'https://pinterest.com/about',
      'https://snapchat.com/support',
      'https://tiktok.com/contact',
      'https://reddit.com/user-agreement',
      'https://tumblr.com/policy',
      'https://flickr.com/blog',
      'https://whatsapp.com/press',
      'https://wechat.com/careers',
      'https://telegram.org/conditions',
      'https://wix.com/careers',
    ];

    for (const candidate of keywordUrls) {
      expect(isUrlBlocked(candidate)).toBe(false);
    }
  });

  it('should return false for non-blocked URLs', () => {
    const unrelatedUrls = [
      'https://www.example.com',
      'https://www.somewebsite.org',
      'https://subdomain.example.com',
      'firecrawl.dev',
      'amazon.com',
      'wix.com',
      'https://wix.com'
    ];

    for (const candidate of unrelatedUrls) {
      expect(isUrlBlocked(candidate)).toBe(false);
    }
  });
});
|
|
@ -1,4 +1,4 @@
|
|||
import { Logger } from "../../../lib/logger";
|
||||
import { logger } from "../../../lib/logger";
|
||||
|
||||
const socialMediaBlocklist = [
|
||||
'facebook.com',
|
||||
|
@ -68,7 +68,7 @@ export function isUrlBlocked(url: string): boolean {
|
|||
return isBlocked;
|
||||
} catch (e) {
|
||||
// If an error occurs (e.g., invalid URL), return false
|
||||
Logger.error(`Error parsing the following URL: ${url}`);
|
||||
logger.error(`Error parsing the following URL: ${url}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,198 +0,0 @@
|
|||
/**
 * Builds the browser-like request header set that several host entries share.
 * A fresh object is returned on every call so no two entries share a reference.
 */
const browserLikeHeaders = () => ({
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  "sec-fetch-site": "same-origin",
  "sec-fetch-mode": "cors",
  "sec-fetch-dest": "empty",
  referer: "https://www.google.com/",
  "accept-language": "en-US,en;q=0.9",
  "accept-encoding": "gzip, deflate, br",
  accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
});

/**
 * Builds the `networkidle2` / unblocked-resources parameter preset used by
 * several entries; only the `wait` value differs between them.
 */
const networkIdleParams = (wait: number) => ({
  wait_browser: "networkidle2",
  block_resources: false,
  wait,
});

/**
 * Per-host scraping overrides, keyed by host name. Each entry names a
 * `defaultScraper` and may carry scraper `params` and request `headers`.
 */
export const urlSpecificParams = {
  "support.greenpay.me": {
    defaultScraper: "fire-engine",
    params: networkIdleParams(2000),
    headers: browserLikeHeaders(),
  },
  "docs.pdw.co": {
    defaultScraper: "fire-engine",
    params: networkIdleParams(3000),
    headers: browserLikeHeaders(),
  },
  "developers.notion.com": {
    defaultScraper: "fire-engine",
    params: networkIdleParams(2000),
    headers: browserLikeHeaders(),
  },
  "docs2.hubitat.com": {
    defaultScraper: "fire-engine",
    params: networkIdleParams(2000),
    headers: browserLikeHeaders(),
  },
  "scrapethissite.com": {
    defaultScraper: "fetch",
    headers: browserLikeHeaders(),
  },
  "rsseau.fr": {
    defaultScraper: "fetch",
    headers: browserLikeHeaders(),
  },
  "help.salesforce.com": {
    defaultScraper: "fire-engine",
    params: networkIdleParams(2000),
    headers: browserLikeHeaders(),
  },
  "ir.veeva.com": {
    defaultScraper: "fire-engine",
  },
  "eonhealth.com": {
    defaultScraper: "fire-engine",
    params: {
      fireEngineOptions: {
        mobileProxy: true,
        method: "get",
        engine: "request",
      },
    },
  },
  "notion.com": {
    defaultScraper: "fire-engine",
    params: {
      ...networkIdleParams(2000),
      engine: "playwright",
    },
  },
  "developer.apple.com": {
    defaultScraper: "fire-engine",
    params: {
      engine: "playwright",
      wait: 2000,
      fireEngineOptions: {
        blockMedia: false,
      },
    },
  },
  "amazon.com": {
    defaultScraper: "fire-engine",
    params: {
      fireEngineOptions: {
        engine: "chrome-cdp",
      },
    },
  },
  "digikey.com": {
    defaultScraper: "fire-engine",
    params: {
      fireEngineOptions: {
        engine: "tlsclient",
      },
    },
  },
  "zoopla.co.uk": {
    defaultScraper: "fire-engine",
    params: {
      fireEngineOptions: {
        engine: "chrome-cdp",
      },
    },
  },
  "lorealparis.hu": {
    defaultScraper: "fire-engine",
    params: {
      fireEngineOptions: {
        engine: "tlsclient",
      },
    },
  },
};
|
|
@ -1,79 +0,0 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import { createWriteStream } from "node:fs";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import mammoth from "mammoth";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
 * Downloads the DOCX document at `url` to a temporary file, extracts its raw
 * text and removes the temporary file again.
 *
 * @param url Location of the DOCX document.
 * @returns The extracted text plus the status code / error text reported by
 *   the download. On failure `content` is empty, `pageStatusCode` is 500 and
 *   `pageError` carries the error message. This function never rejects.
 */
export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
  let tempFilePath = '';
  let pageStatusCode = 200;
  let pageError = '';
  let content = '';

  try {
    const downloadResult = await downloadDocx(url);
    tempFilePath = downloadResult.tempFilePath;
    pageStatusCode = downloadResult.pageStatusCode;
    pageError = downloadResult.pageError;

    // downloadDocx signals a failed request with an empty path; there is
    // nothing to parse in that case, so skip the (guaranteed to fail) extraction.
    if (tempFilePath) {
      content = await processDocxToText(tempFilePath);
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to fetch and process DOCX: ${message}`);
    pageStatusCode = 500;
    pageError = message;
    content = '';
  } finally {
    if (tempFilePath) {
      // Cleanup is best-effort: a failed unlink must not throw out of
      // `finally` and discard the result computed above.
      try {
        fs.unlinkSync(tempFilePath);
      } catch (cleanupError) {
        Logger.debug(`Failed to remove temporary DOCX file ${tempFilePath}: ${cleanupError}`);
      }
    }
  }

  return { content, pageStatusCode, pageError };
}
|
||||
|
||||
/**
 * Streams the document at `url` into a uniquely named file in the OS temp
 * directory.
 *
 * @param url Location of the DOCX document.
 * @returns The temp file path with the HTTP status and status text. When the
 *   request itself fails, resolves with an empty path, status 500 and the
 *   error message. Rejects when the body cannot be streamed to disk.
 */
async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
  try {
    const response = await axios({
      url,
      method: "GET",
      responseType: "stream",
    });

    // Date.now() alone collides when two downloads start in the same millisecond.
    const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
    const tempFilePath = path.join(os.tmpdir(), `tempDocx-${uniqueSuffix}.docx`);
    const writer = createWriteStream(tempFilePath);

    return new Promise((resolve, reject) => {
      // Shared failure path: stop writing, drop the partial file, reject.
      const fail = (reason: string) => {
        Logger.error(reason);
        writer.destroy();
        fs.unlink(tempFilePath, () => undefined); // best-effort cleanup
        reject(new Error(reason));
      };

      // NOTE(review): pageError resolves to undefined when statusText is "OK",
      // although the declared type is `string` — kept as-is for callers.
      writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText !== "OK" ? response.statusText : undefined }));
      writer.on("error", () => fail('Failed to write DOCX file to disk'));
      // pipe() does not forward source errors to the destination; without
      // this handler a broken download would leave the promise pending forever.
      response.data.on("error", () => fail('Failed to read DOCX response stream'));

      response.data.pipe(writer);
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to download DOCX: ${message}`);
    return { tempFilePath: "", pageStatusCode: 500, pageError: message };
  }
}
|
||||
|
||||
/**
 * Extracts the raw text of the DOCX file at `filePath`.
 *
 * @param filePath Path of a DOCX file on disk.
 * @returns The extracted text, or an empty string when extraction throws
 *   (the failure is logged).
 */
export async function processDocxToText(filePath: string): Promise<string> {
  let text = "";
  try {
    text = await extractTextFromDocx(filePath);
  } catch (error) {
    Logger.error(`Failed to process DOCX to text: ${error.message}`);
  }
  return text;
}
|
||||
|
||||
/**
 * Runs mammoth's raw-text extraction on the DOCX file at `filePath`.
 *
 * @param filePath Path of a DOCX file on disk.
 * @returns The `value` reported by mammoth, or an empty string when
 *   extraction throws (the failure is logged).
 */
async function extractTextFromDocx(filePath: string): Promise<string> {
  let extracted = "";
  try {
    const { value } = await mammoth.extractRawText({ path: filePath });
    extracted = value;
  } catch (error) {
    Logger.error(`Failed to extract text from DOCX: ${error.message}`);
  }
  return extracted;
}
|
|
@ -1,42 +0,0 @@
|
|||
/**
 * Selectors for page chrome (navigation, footers, sidebars, overlays, ads,
 * language/social widgets, search, sharing and cookie elements).
 */
export const excludeNonMainTags = [
  // Structural elements
  "header", "footer", "nav", "aside",
  // Top / bottom bars
  ".top", ".navbar", ".footer", ".bottom", "#footer",
  // Sidebars
  ".sidebar", ".side", ".aside", "#sidebar",
  // Modals and overlays
  ".modal", ".popup", "#modal", ".overlay",
  // Advertising
  ".ad", ".ads", ".advert", "#ad",
  // Language selectors
  ".lang-selector", ".language", "#language-selector",
  // Social widgets
  ".social", ".social-media", ".social-links", "#social",
  // Menus and navigation
  ".menu", ".navigation", "#nav",
  // Breadcrumbs
  ".breadcrumbs", "#breadcrumbs",
  // Search
  "#search-form", ".search", "#search",
  // Sharing
  ".share", "#share",
  // Cookie notices
  ".cookie", "#cookie"
];
|
|
@ -1,89 +0,0 @@
|
|||
import Anthropic from '@anthropic-ai/sdk';
|
||||
import axios from 'axios';
|
||||
import { Logger } from '../../../lib/logger';
|
||||
|
||||
/**
 * Asks a vision-capable LLM for concise alt text describing an image.
 *
 * @param imageUrl URL of the image to describe.
 * @param backText First piece of surrounding text placed in the prompt.
 * @param frontText Second piece of surrounding text placed in the prompt.
 * @param model `'claude-3-opus'` selects the Anthropic branch; any other value
 *   (including the default) uses OpenAI with the fixed model `gpt-4-turbo`.
 * @returns The generated description, or an empty string when the API key is
 *   missing, a request fails, or the response carries no text.
 */
export async function getImageDescription(
  imageUrl: string,
  backText: string,
  frontText: string,
  model: string = "gpt-4-turbo"
): Promise<string> {
  try {
    const prompt = "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " +
      backText +
      " and the following text: " +
      frontText +
      ". Be super concise."

    switch (model) {
      case 'claude-3-opus': {
        if (!process.env.ANTHROPIC_API_KEY) {
          throw new Error("No Anthropic API key provided");
        }
        const imageRequest = await axios.get(imageUrl, { responseType: 'arraybuffer' });

        // Use the media type the server reported when it is one the API
        // accepts; otherwise keep the previous hard-coded PNG default.
        const supportedMediaTypes = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'] as const;
        const reportedMediaType = String(imageRequest.headers['content-type'] ?? '')
          .split(';')[0]
          .trim()
          .toLowerCase();
        const imageMediaType =
          supportedMediaTypes.find((type) => type === reportedMediaType) ?? 'image/png';
        const imageData = Buffer.from(imageRequest.data, 'binary').toString('base64');

        const anthropic = new Anthropic();
        const response = await anthropic.messages.create({
          model: "claude-3-opus-20240229",
          max_tokens: 1024,
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "image",
                  source: {
                    type: "base64",
                    media_type: imageMediaType,
                    data: imageData,
                  },
                },
                {
                  type: "text",
                  text: prompt
                }
              ],
            }
          ]
        });

        // The message holds an array of content blocks; the previous
        // `response[0].content.text` always threw and yielded "".
        const textBlock = response.content.find((block) => block.type === "text");
        return textBlock && "text" in textBlock ? textBlock.text : "";
      }
      default: {
        if (!process.env.OPENAI_API_KEY) {
          throw new Error("No OpenAI API key provided");
        }

        const { OpenAI } = require("openai");
        const openai = new OpenAI();

        const response = await openai.chat.completions.create({
          model: "gpt-4-turbo",
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: prompt,
                },
                {
                  type: "image_url",
                  image_url: {
                    url: imageUrl,
                  },
                },
              ],
            },
          ],
        });
        // `content` may be null; honour the Promise<string> contract.
        return response.choices[0]?.message?.content ?? "";
      }
    }
  } catch (error) {
    Logger.error(`Error generating image alt text: ${error}`);
    return "";
  }
}
|
|
@ -1,185 +0,0 @@
|
|||
import { CheerioAPI } from "cheerio";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
 * Metadata collected for a scraped page. All named fields are optional; the
 * index signature admits additional keys gathered from arbitrary meta tags.
 */
interface Metadata {
  // Basic document information
  title?: string;
  description?: string;
  language?: string;
  keywords?: string;
  robots?: string;

  // Open Graph
  ogTitle?: string;
  ogDescription?: string;
  ogUrl?: string;
  ogImage?: string;
  ogAudio?: string;
  ogDeterminer?: string;
  ogLocale?: string;
  ogLocaleAlternate?: string[];
  ogSiteName?: string;
  ogVideo?: string;

  // Dublin Core / DC terms
  dctermsCreated?: string;
  dcDateCreated?: string;
  dcDate?: string;
  dctermsType?: string;
  dcType?: string;
  dctermsAudience?: string;
  dctermsSubject?: string;
  dcSubject?: string;
  dcDescription?: string;
  dctermsKeywords?: string;

  // Article information
  modifiedTime?: string;
  publishedTime?: string;
  articleTag?: string;
  articleSection?: string;

  // Scrape bookkeeping
  sourceURL?: string;
  pageStatusCode?: number;
  pageError?: string;

  // Any other meta tag, keyed by its name/property attribute
  [key: string]: string | string[] | number | undefined;
}
|
||||
|
||||
/**
 * Collects page metadata from a parsed document.
 *
 * Named fields are read from `<title>`, `<html lang>` and well-known
 * `<meta>` tags; every `<meta>` with a name/property and content is also
 * copied under its own key (repeated keys become arrays). Empty values are
 * omitted, except `ogLocaleAlternate`, which is an array and is therefore
 * emitted even when empty.
 *
 * @param soup Cheerio handle of the parsed document.
 * @param url Accepted for interface compatibility; not read by the body.
 * @returns The collected metadata. Extraction errors are logged and whatever
 *   was gathered up to that point is returned.
 */
export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
  // `content` attribute of the first element matching `selector`, or null.
  const metaContent = (selector: string): string | null =>
    soup(selector).attr("content") || null;

  // [field, selector] pairs read before og:locale:alternate ...
  const leadingSelectors: Array<[string, string]> = [
    ["keywords", 'meta[name="keywords"]'],
    ["robots", 'meta[name="robots"]'],
    ["ogTitle", 'meta[property="og:title"]'],
    ["ogDescription", 'meta[property="og:description"]'],
    ["ogUrl", 'meta[property="og:url"]'],
    ["ogImage", 'meta[property="og:image"]'],
    ["ogAudio", 'meta[property="og:audio"]'],
    ["ogDeterminer", 'meta[property="og:determiner"]'],
    ["ogLocale", 'meta[property="og:locale"]'],
  ];
  // ... and those read after it.
  const trailingSelectors: Array<[string, string]> = [
    ["ogSiteName", 'meta[property="og:site_name"]'],
    ["ogVideo", 'meta[property="og:video"]'],
    ["articleSection", 'meta[name="article:section"]'],
    ["articleTag", 'meta[name="article:tag"]'],
    ["publishedTime", 'meta[property="article:published_time"]'],
    ["modifiedTime", 'meta[property="article:modified_time"]'],
    ["dctermsKeywords", 'meta[name="dcterms.keywords"]'],
    ["dcDescription", 'meta[name="dc.description"]'],
    ["dcSubject", 'meta[name="dc.subject"]'],
    ["dctermsSubject", 'meta[name="dcterms.subject"]'],
    ["dctermsAudience", 'meta[name="dcterms.audience"]'],
    ["dcType", 'meta[name="dc.type"]'],
    ["dctermsType", 'meta[name="dcterms.type"]'],
    ["dcDate", 'meta[name="dc.date"]'],
    ["dcDateCreated", 'meta[name="dc.date.created"]'],
    ["dctermsCreated", 'meta[name="dcterms.created"]'],
  ];
  // Order in which named fields appear in the returned object.
  const outputOrder = [
    "title", "description", "language", "keywords", "robots",
    "ogTitle", "ogDescription", "ogUrl", "ogImage", "ogAudio",
    "ogDeterminer", "ogLocale", "ogLocaleAlternate", "ogSiteName", "ogVideo",
    "dctermsCreated", "dcDateCreated", "dcDate", "dctermsType", "dcType",
    "dctermsAudience", "dctermsSubject", "dcSubject", "dcDescription",
    "dctermsKeywords", "modifiedTime", "publishedTime", "articleTag",
    "articleSection",
  ];

  const named: Record<string, string | null> = {};
  let ogLocaleAlternate: string[] | null = null;
  const customMetadata: Record<string, string | string[]> = {};

  try {
    // TODO: remove this as it is redundant with the below implementation
    named.title = soup("title").text() || null;
    named.description = metaContent('meta[name="description"]');
    named.language = soup("html").attr("lang") || null;

    for (const [field, selector] of leadingSelectors) {
      named[field] = metaContent(selector);
    }

    ogLocaleAlternate =
      soup('meta[property="og:locale:alternate"]')
        .map((i, el) => soup(el).attr("content"))
        .get() || null;

    for (const [field, selector] of trailingSelectors) {
      named[field] = metaContent(selector);
    }

    try {
      // Extract all meta tags for custom metadata
      soup("meta").each((_, elem) => {
        try {
          const node = soup(elem);
          const name = node.attr("name") || node.attr("property");
          const content = node.attr("content");
          if (!name || !content) {
            return;
          }

          const existing = customMetadata[name];
          if (existing === undefined) {
            customMetadata[name] = content;
          } else if (Array.isArray(existing)) {
            existing.push(content);
          } else {
            // Second occurrence of a key: promote the value to an array.
            customMetadata[name] = [existing, content];
          }
        } catch (error) {
          Logger.error(`Error extracting custom metadata (in): ${error}`);
        }
      });
    } catch (error) {
      Logger.error(`Error extracting custom metadata: ${error}`);
    }
  } catch (error) {
    Logger.error(`Error extracting metadata: ${error}`);
  }

  const result: Metadata = {};
  for (const field of outputOrder) {
    const value = field === "ogLocaleAlternate" ? ogLocaleAlternate : named[field];
    if (value) {
      result[field] = value;
    }
  }

  // Custom keys go last and override a named field of the same key.
  return { ...result, ...customMetadata };
}
|
|
@ -1,74 +0,0 @@
|
|||
import cheerio, { CheerioAPI } from "cheerio";
|
||||
|
||||
/** One pending substitution of a source range by rendered Markdown. */
interface Replacement {
  /** Offset in the source string where the replaced range begins. */
  start: number;
  /** Offset in the source string just past the replaced range. */
  end: number;
  /** Markdown text to insert in place of the range. */
  markdownTable: string;
}
|
||||
|
||||
/**
 * Replaces every top-level `<table>` in `html` with a `<div>` holding its
 * Markdown rendering (an empty `<div>` when the table has no content).
 *
 * @param html Markup to transform.
 * @returns The transformed markup, trimmed.
 */
export const parseTablesToMarkdown = async (html: string): Promise<string> => {
  const soup: CheerioAPI = cheerio.load(html, {
    xmlMode: true,
    withStartIndices: true,
    withEndIndices: true
  });
  const replacements: Replacement[] = [];

  soup("table").each((_, tableElement) => {
    // A nested table lies inside its ancestor's source range. Replacing it
    // separately would shift the text and make the ancestor's recorded
    // offsets stale, so only outermost tables are replaced.
    if (soup(tableElement).parents("table").length > 0) {
      return;
    }

    const { startIndex, endIndex } = tableElement;
    if (startIndex == null || endIndex == null) {
      return; // no source position recorded for this node
    }

    let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement));
    // Only pipes, dashes, spaces and newlines left means the table is empty.
    const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0;
    if (isTableEmpty) {
      markdownTable = '';
    }

    // endIndex points at the last character; +1 includes the closing tag.
    replacements.push({ start: startIndex, end: endIndex + 1, markdownTable });
  });

  // Apply from the end of the document backwards so earlier offsets stay valid.
  replacements.sort((a, b) => b.start - a.start);

  let modifiedHtml: string = html;
  for (const { start, end, markdownTable } of replacements) {
    modifiedHtml = modifiedHtml.slice(0, start) + `<div>${markdownTable}</div>` + modifiedHtml.slice(end);
  }

  return modifiedHtml.trim();
};
|
||||
|
||||
/**
 * Renders the rows of a table as Markdown table lines.
 *
 * Each `<tr>` with at least one cell becomes `| a | b |`. A divider row is
 * added after the first `<tr>` when that row contains a `<th>`.
 *
 * @param tableSoup Cheerio handle loaded with the table.
 * @returns The Markdown lines joined by newlines, trimmed.
 */
export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
  const lines: string[] = [];
  let sawHeaderCell = false;

  tableSoup("tr").each((rowIndex, rowElement) => {
    const cellNodes = tableSoup(rowElement).find("th, td");

    let rendered = "";
    cellNodes.each((_, cellElement) => {
      const cell = tableSoup(cellElement);
      rendered += ` ${cell.text().trim()} |`;
      if (cell.is("th")) {
        sawHeaderCell = true;
      }
    });

    if (rendered) {
      lines.push(`|${rendered}`);
    }
    if (rowIndex === 0 && sawHeaderCell) { // Header row
      lines.push(createMarkdownDividerRow(cellNodes.length));
    }
  });

  return lines.join('\n').trim();
};
|
||||
|
||||
/**
 * Renders the cells of a single table row as one Markdown table line.
 *
 * @param rowSoup Cheerio handle loaded with the row.
 * @param rowNumber Not read by the body.
 * @returns `|` followed by ` text |` for every `td`/`th` cell.
 */
export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
  let line = "|";
  rowSoup("td, th").each((_, cellElement) => {
    line += ` ${rowSoup(cellElement).text().trim()} |`;
  });
  return line;
}
|
||||
|
||||
/**
 * Builds the Markdown header/body divider row for a table.
 *
 * @param cellCount Number of columns.
 * @returns A row such as `| --- | --- |` with one `---` per column.
 */
export function createMarkdownDividerRow(cellCount: number): string {
  const dividers = new Array<string>(cellCount).fill("---");
  return `| ${dividers.join(" | ")} |`;
}
|
|
@ -1,140 +0,0 @@
|
|||
import axios, { AxiosResponse } from "axios";
|
||||
import fs from "fs/promises";
|
||||
import { createReadStream, createWriteStream } from "node:fs";
|
||||
import FormData from "form-data";
|
||||
import dotenv from "dotenv";
|
||||
import pdf from "pdf-parse";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import { axiosTimeout } from "../../../lib/timeout";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
/**
 * Downloads the PDF at `url` to a temporary file, converts it to text and
 * removes the temporary file again.
 *
 * @param url Location of the PDF.
 * @param parsePDF Forwarded to `processPdfToText`.
 * @returns The text plus the status code / status text of the download. On
 *   failure `content` is empty, `pageStatusCode` is 500 and `pageError`
 *   carries the error message. This function never rejects.
 */
export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  let tempFilePath: string | undefined;
  try {
    const download = await downloadPdf(url);
    tempFilePath = download.tempFilePath;
    const content = await processPdfToText(tempFilePath, parsePDF);
    return { content, pageStatusCode: download.pageStatusCode, pageError: download.pageError };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to fetch and process PDF: ${message}`);
    return { content: "", pageStatusCode: 500, pageError: message };
  } finally {
    // Runs on success and on failure, so the temp file is not leaked when
    // processing throws. A failed unlink is logged instead of replacing an
    // otherwise successful result with an error.
    if (tempFilePath) {
      try {
        await fs.unlink(tempFilePath);
      } catch (cleanupError) {
        Logger.debug(`Failed to remove temporary PDF file ${tempFilePath}: ${cleanupError}`);
      }
    }
  }
}
|
||||
|
||||
/**
 * Streams the document at `url` into a uniquely named file in the OS temp
 * directory.
 *
 * @param url Location of the PDF.
 * @returns The temp file path with the HTTP status; `pageError` is the status
 *   text unless it is "OK". Rejects when the request fails or the body cannot
 *   be streamed to disk.
 */
async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
  const response = await axios({
    url,
    method: "GET",
    responseType: "stream",
  });

  // Date.now() alone collides when two downloads start in the same millisecond.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
  const tempFilePath = path.join(os.tmpdir(), `tempPdf-${uniqueSuffix}.pdf`);
  const writer = createWriteStream(tempFilePath);

  return new Promise((resolve, reject) => {
    // Shared failure path: stop writing, drop the partial file, reject.
    const fail = (error: unknown) => {
      writer.destroy();
      fs.unlink(tempFilePath).catch(() => undefined); // best-effort cleanup
      reject(error);
    };

    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText !== "OK" ? response.statusText : undefined }));
    writer.on("error", fail);
    // pipe() does not forward source errors to the destination; without this
    // handler a broken download would leave the promise pending forever.
    response.data.on("error", fail);

    response.data.pipe(writer);
  });
}
|
||||
|
||||
/**
 * Converts the PDF at `filePath` to text.
 *
 * - With `LLAMAPARSE_API_KEY` set and `parsePDF` true, the file is uploaded to
 *   the LlamaIndex parsing API and the result is polled; if no result becomes
 *   available the file is parsed locally with pdf-parse instead.
 * - With `parsePDF` true and no API key, the file is parsed locally.
 * - With `parsePDF` false, the file is read as UTF-8 without parsing.
 *
 * @param filePath Path of the PDF on disk.
 * @param parsePDF Whether the PDF should be parsed at all.
 * @returns The text, or an empty string when local parsing/reading fails.
 *   May reject when the remote flow throws and the local fallback throws too.
 */
export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
  let content = "";

  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
    Logger.debug("Processing pdf document w/ LlamaIndex");
    const apiKey = process.env.LLAMAPARSE_API_KEY;
    const headers = {
      Authorization: `Bearer ${apiKey}`,
    };
    const base_url = "https://api.cloud.llamaindex.ai/api/parsing";
    const fileType2 = "application/pdf";

    try {
      const formData = new FormData();
      formData.append("file", createReadStream(filePath), {
        filename: filePath,
        contentType: fileType2,
      });

      const uploadUrl = `${base_url}/upload`;
      const uploadResponse = await axios.post(uploadUrl, formData, {
        headers: {
          ...headers,
          ...formData.getHeaders(),
        },
      });

      const jobId = uploadResponse.data.id;
      const resultType = "text";
      const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`;

      let resultResponse: AxiosResponse | undefined;
      let attempt = 0;
      const maxAttempts = 10; // Maximum number of attempts
      let resultAvailable = false;
      while (attempt < maxAttempts && !resultAvailable) {
        try {
          resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
          if (resultResponse.status === 200) {
            resultAvailable = true; // Exit condition met
          } else {
            // If the status code is not 200, increment the attempt counter and wait
            attempt++;
            await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
          }
        } catch (error) {
          Logger.debug("Error fetching result w/ LlamaIndex");
          attempt++;
          if (attempt >= maxAttempts) {
            Logger.error("Max attempts reached, unable to fetch result.");
            break; // Exit the loop if max attempts are reached
          }
          await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
          // You may want to handle specific errors differently
        }
      }

      if (resultAvailable && resultResponse) {
        content = resultResponse.data[resultType];
      } else {
        // No remote result: parse locally. Previously the remote payload was
        // read unconditionally afterwards, which overwrote this fallback or
        // threw on an undefined response and caused a second local parse.
        try {
          content = await processPdf(filePath);
        } catch (error) {
          Logger.error(`Failed to process PDF: ${error}`);
          content = "";
        }
      }
    } catch (error) {
      Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
      content = await processPdf(filePath);
    }
  } else if (parsePDF) {
    try {
      content = await processPdf(filePath);
    } catch (error) {
      Logger.error(`Failed to process PDF: ${error}`);
      content = "";
    }
  } else {
    try {
      content = await fs.readFile(filePath, "utf-8");
    } catch (error) {
      Logger.error(`Failed to read PDF file: ${error}`);
      content = "";
    }
  }
  return content;
}
|
||||
|
||||
/**
 * Reads the PDF at `file` and returns the text reported by pdf-parse.
 * Read and parse errors propagate to the caller unchanged.
 *
 * @param file Path of the PDF on disk.
 */
async function processPdf(file: string) {
  const buffer = await fs.readFile(file);
  const { text } = await pdf(buffer);
  return text;
}
|
|
@ -1,82 +0,0 @@
|
|||
import { AnyNode, Cheerio, load } from "cheerio";
|
||||
import { PageOptions } from "../../../lib/entities";
|
||||
import { excludeNonMainTags } from "./excludeTags";
|
||||
|
||||
/**
 * Strips non-content markup from an HTML document.
 *
 * Steps, in order:
 * 1. If `onlyIncludeTags` is set, keep only clones of the matching elements.
 * 2. Remove `script`, `style`, `noscript`, `meta` and `head`.
 * 3. Remove everything matched by `removeTags`. An entry wrapped in `*…*` is
 *    treated as a case-insensitive regular expression tested against tag
 *    names and `attr="value"` pairs; any other entry is a selector.
 * 4. If `onlyMainContent` is set, remove every `excludeNonMainTags` selector.
 *
 * Note: a string-valued `onlyIncludeTags` / `removeTags` is replaced on
 * `pageOptions` by a one-element array.
 *
 * @param html Markup to clean.
 * @param pageOptions Options controlling which elements are kept or removed.
 * @returns The serialized cleaned document.
 */
export const removeUnwantedElements = (
  html: string,
  pageOptions: PageOptions,
) => {
  let soup = load(html);

  if (
    pageOptions.onlyIncludeTags &&
    pageOptions.onlyIncludeTags.length > 0 &&
    pageOptions.onlyIncludeTags[0] !== ""
  ) {
    if (typeof pageOptions.onlyIncludeTags === "string") {
      pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
    }

    // Collect clones of every requested element under a fresh root.
    const container = load("<div></div>")("div");
    for (const selector of pageOptions.onlyIncludeTags) {
      soup(selector).each((_, matched) => {
        container.append(soup(matched).clone());
      });
    }
    soup = load(container.html());
  }

  soup("script, style, noscript, meta, head").remove();

  if (
    pageOptions.removeTags &&
    pageOptions.removeTags.length > 0 &&
    pageOptions.removeTags[0] !== ""
  ) {
    if (typeof pageOptions.removeTags === "string") {
      pageOptions.removeTags = [pageOptions.removeTags];
    }

    if (Array.isArray(pageOptions.removeTags)) {
      for (const tag of pageOptions.removeTags) {
        const isPattern = tag.startsWith("*") && tag.endsWith("*");
        if (!isPattern) {
          soup(tag).remove();
          continue;
        }

        const pattern = new RegExp(tag.slice(1, -1), "i");
        // For `*.name*` entries every attribute value is also tested as if
        // it were a class attribute.
        const testAsClass = tag.startsWith("*.");
        const doomed: Cheerio<AnyNode> = soup("*").filter((_, node) => {
          if (node.type !== "tag") {
            return false;
          }
          const attribs = node.attribs;
          const attrNames = Object.keys(attribs);
          if (pattern.test(node.name)) {
            return true;
          }
          if (attrNames.some((attr) => pattern.test(`${attr}="${attribs[attr]}"`))) {
            return true;
          }
          return (
            testAsClass &&
            attrNames.some((attr) => pattern.test(`class="${attribs[attr]}"`))
          );
        });
        doomed.remove();
      }
    }
  }

  if (pageOptions.onlyMainContent) {
    for (const selector of excludeNonMainTags) {
      soup(selector).remove();
    }
  }

  return soup.html();
};
|
|
@ -1,85 +0,0 @@
|
|||
import { Logger } from "../../../lib/logger";
|
||||
import { Document } from "../../../lib/entities";
|
||||
|
||||
/**
 * Rewrites relative link targets in each document's `content` to absolute
 * URLs, then copies `content` to `markdown`.
 *
 * Handles Markdown links (`[text](target)`) and `href="target"` attributes.
 * Markdown images are left untouched here. Targets starting with `data:` or
 * `http` are kept as they are; everything else is resolved against the
 * document's `metadata.sourceURL`.
 *
 * @param documents Documents to update in place.
 * @returns The same array. Errors are logged and never thrown.
 */
export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  try {
    documents.forEach((document) => {
      // Throws for a missing/invalid source URL, which is reported by the outer catch.
      const pageUrl = new URL(document.metadata.sourceURL);
      const paths =
        document.content.match(
          /!?\[.*?\]\(.*?\)|href=".+?"/g
        ) || [];

      paths.forEach((path: string) => {
        try {
          // Image is handled afterwards
          if (path.startsWith("!")) {
            return;
          }

          // Anchored patterns keep link text that itself contains
          // parentheses or brackets intact.
          const hrefMatch = path.match(/^href="([^"]+)"$/);
          const markdownMatch = hrefMatch ? null : path.match(/^(\[.*?\])\((.*?)\)$/);
          if (!hrefMatch && !markdownMatch) {
            return;
          }

          let url = hrefMatch ? hrefMatch[1] : markdownMatch![2];
          if (!url.startsWith("data:") && !url.startsWith("http")) {
            // Resolving against the page URL covers root-relative,
            // document-relative and protocol-relative targets alike.
            url = new URL(url, pageUrl).toString();
          }

          // href matches previously failed inside an empty catch and were
          // never rewritten; they are now rewritten in their own syntax.
          const replacement = hrefMatch
            ? `href="${url}"`
            : `${markdownMatch![1]}(${url})`;

          // A replacer function keeps `$` sequences in the URL or link text
          // from being read as replacement patterns.
          document.content = document.content.replace(path, () => replacement);
        } catch (error) {
          Logger.debug(`Error making path absolute (${path}): ${error}`);
        }
      });
      document.markdown = document.content;
    });

    return documents;
  } catch (error) {
    Logger.debug(`Error replacing paths with absolute paths: ${error}`);
    return documents;
  }
};
|
||||
|
||||
/**
 * Rewrites relative markdown image targets (`![alt](url)`) in each document's
 * `content` to absolute URLs and mirrors the result into `document.markdown`.
 *
 * Targets starting with `data:image` or `http` are kept as-is. Everything else
 * is resolved against `document.metadata.sourceURL`: root-relative paths end up
 * on the page's origin, other relative paths resolve against the page URL.
 *
 * @param documents Documents to rewrite; they are mutated in place.
 * @returns The same array that was passed in. Errors are logged and never thrown.
 */
export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  try {
    documents.forEach((document) => {
      // Parsed once per document; an invalid sourceURL is reported by the outer catch.
      const pageUrl = new URL(document.metadata.sourceURL);
      const images = document.content.match(/!\[.*?\]\(.*?\)/g) || [];

      images.forEach((image: string) => {
        // Anchored so alt text containing "(" or "]" is not mistaken for the URL.
        const parts = image.match(/^!\[(.*?)\]\((.*?)\)$/);
        if (parts === null) {
          return;
        }
        const altText = parts[1];
        let imageUrl = parts[2];

        if (!imageUrl.startsWith("data:image") && !imageUrl.startsWith("http")) {
          try {
            // Handles "/x", "x" and protocol-relative "//host/x" uniformly.
            imageUrl = new URL(imageUrl, pageUrl).toString();
          } catch (error) {
            // One unresolvable image must not abort the rest of the batch.
            Logger.debug(`Error resolving image path "${imageUrl}" to an absolute URL: ${error}`);
            return;
          }
        }

        // A replacer function keeps `$&`, `$1`, ... inside URLs literal.
        document.content = document.content.replace(image, () => `![${altText}](${imageUrl})`);
      });

      document.markdown = document.content;
    });

    return documents;
  } catch (error) {
    Logger.error(`Error replacing img paths with absolute paths: ${error}`);
    return documents;
  }
};
|
|
@ -1,59 +0,0 @@
|
|||
import axios from "axios";
|
||||
import * as cheerio from "cheerio";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
|
||||
/**
 * Fetches a URL with a plain axios GET (15 second timeout).
 *
 * @param urlToScrap URL to request.
 * @returns The response body, or `null` when the body is empty/falsy or the
 *          request fails. Failures are logged at debug level, never thrown.
 */
export async function attemptScrapWithRequests(
  urlToScrap: string
): Promise<string | null> {
  try {
    const { data } = await axios.get(urlToScrap, { timeout: 15000 });
    if (data) {
      return data;
    }

    Logger.debug("Failed normal requests as well");
    return null;
  } catch (error) {
    Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Removes every NUL character (U+0000) from `text`.
 *
 * A global regex is required: `String.prototype.replace` with a string pattern
 * only replaces the first occurrence.
 *
 * @param text Input text.
 * @returns `text` without any NUL characters.
 */
export function sanitizeText(text: string): string {
  return text.replace(/\u0000/g, "");
}
|
||||
|
||||
/**
 * Collects the targets of all `<a href>` elements in an HTML document.
 *
 * - `http://`, `https://` and `mailto:` targets are kept verbatim.
 * - Fragment-only targets (starting with `#`) are ignored.
 * - Every other target is resolved against `baseUrl`.
 *
 * @param html    HTML source to scan.
 * @param baseUrl Base used to resolve relative targets.
 * @returns De-duplicated links in first-seen order.
 */
export function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  // A Set de-duplicates while preserving insertion order.
  const uniqueLinks = new Set<string>();

  $('a').each((_, anchor) => {
    const href = $(anchor).attr('href');
    if (!href) {
      return;
    }

    try {
      const isHttp = href.startsWith('http://') || href.startsWith('https://');
      if (isHttp || href.startsWith('mailto:')) {
        // Already absolute (or a mail link): keep as is.
        uniqueLinks.add(href);
        return;
      }
      if (href.startsWith('#')) {
        // Fragment-only links are ignored.
        return;
      }
      // Any other form is treated as relative to the base URL.
      uniqueLinks.add(new URL(href, baseUrl).href);
    } catch (error) {
      // Log the error and continue with the next anchor.
      console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
    }
  });

  return [...uniqueLinks];
}
|
25
apps/api/src/scraper/scrapeURL/README.md
Normal file
25
apps/api/src/scraper/scrapeURL/README.md
Normal file
|
@ -0,0 +1,25 @@
|
|||
# `scrapeURL`
|
||||
New URL scraper for Firecrawl
|
||||
|
||||
## Signal flow
|
||||
```mermaid
|
||||
flowchart TD;
|
||||
scrapeURL-.->buildFallbackList;
|
||||
buildFallbackList-.->scrapeURLWithEngine;
|
||||
scrapeURLWithEngine-.->parseMarkdown;
|
||||
parseMarkdown-.->wasScrapeSuccessful{{Was scrape successful?}};
|
||||
wasScrapeSuccessful-."No".->areEnginesLeft{{Are there engines left to try?}};
|
||||
areEnginesLeft-."Yes, try next engine".->scrapeURLWithEngine;
|
||||
areEnginesLeft-."No".->NoEnginesLeftError[/NoEnginesLeftError/]
|
||||
wasScrapeSuccessful-."Yes".->asd;
|
||||
```
|
||||
|
||||
## Differences from `WebScraperDataProvider`
|
||||
- The job of `WebScraperDataProvider.validateInitialUrl` has been delegated to the zod layer above `scrapeURL`.
|
||||
- `WebScraperDataProvider.mode` has no equivalent, only `scrape_url` is supported.
|
||||
- You may no longer specify multiple URLs.
|
||||
- Built on `v1` definitions, instead of `v0`.
|
||||
- PDFs are now converted straight to markdown using LlamaParse, instead of converting to just plaintext.
|
||||
- DOCXs are now converted straight to HTML (and then later to markdown) using mammoth, instead of converting to just plaintext.
|
||||
- Uses the new JSON Schema OpenAI API -- schema failures with LLM Extract should be virtually non-existent.
|
||||
|
15
apps/api/src/scraper/scrapeURL/engines/docx/index.ts
Normal file
15
apps/api/src/scraper/scrapeURL/engines/docx/index.ts
Normal file
|
@ -0,0 +1,15 @@
|
|||
import { Meta } from "../..";
|
||||
import { EngineScrapeResult } from "..";
|
||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||
import mammoth from "mammoth";
|
||||
|
||||
/**
 * Scrape engine for DOCX documents: downloads the file behind `meta.url` and
 * converts it to HTML with mammoth.
 *
 * @param meta Scrape context; `meta.id` and `meta.url` are passed to `downloadFile`.
 * @returns The download response's URL and status code, plus the converted HTML.
 */
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
  const download = await downloadFile(meta.id, meta.url);

  // NOTE(review): the temp file is not removed here -- confirm whether
  // `downloadFile` or a caller is responsible for cleaning it up.
  const conversion = await mammoth.convertToHtml({ path: download.tempFilePath });

  return {
    url: download.response.url,
    statusCode: download.response.status,

    html: conversion.value,
  };
}
|
28
apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
Normal file
28
apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
Normal file
|
@ -0,0 +1,28 @@
|
|||
import { EngineScrapeResult } from "..";
|
||||
import { Meta } from "../..";
|
||||
import { TimeoutError } from "../../error";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
|
||||
/**
 * Scrape engine backed by the global `fetch`.
 *
 * Follows redirects and forwards `meta.options.headers`. The request must
 * produce a response within 20 seconds; the timeout covers the wait for the
 * response, not the subsequent body read.
 *
 * @param meta Scrape context (URL, options, logger).
 * @returns Final URL, body text and HTTP status code of the response.
 * @throws TimeoutError when no response arrives before the timeout.
 *         Whatever `specialtyScrapeCheck` throws also propagates.
 */
export async function scrapeURLWithFetch(meta: Meta): Promise<EngineScrapeResult> {
  const timeout = 20000;

  // Lets us cancel the in-flight request when the timeout wins the race,
  // instead of leaving it running in the background.
  const controller = new AbortController();
  let timer: ReturnType<typeof setTimeout> | undefined;

  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      // Reject first so the race settles with TimeoutError rather than the
      // abort error the cancelled fetch is about to produce.
      reject(new TimeoutError("Fetch was unable to scrape the page before timing out", { cause: { timeout } }));
      controller.abort();
    }, timeout);
  });

  const response = await Promise.race([
    fetch(meta.url, {
      redirect: "follow",
      headers: meta.options.headers,
      signal: controller.signal,
    }),
    timeoutPromise,
  ]).finally(() => {
    // Always release the timer so it cannot keep the event loop alive or
    // abort the body read after a successful response.
    clearTimeout(timer);
  });

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), Object.fromEntries(response.headers as any));

  return {
    url: response.url,
    html: await response.text(),
    statusCode: response.status,
    // TODO: error?
  };
}
|
|
@ -0,0 +1,107 @@
|
|||
import { Logger } from "winston";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { z } from "zod";
|
||||
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
import { EngineError } from "../../error";
|
||||
|
||||
// Shape of a check-status response for a job that completed successfully.
const successSchema = z.object({
  jobId: z.string(),
  state: z.literal("completed"),
  processing: z.literal(false),

  // timeTaken: z.number(),
  content: z.string(),
  url: z.string().optional(),

  pageStatusCode: z.number(),
  pageError: z.string().optional(),

  // TODO: this needs to be non-optional, might need fixes on f-e side to ensure reliability
  responseHeaders: z.record(z.string(), z.string()).optional(),

  // timeTakenCookie: z.number().optional(),
  // timeTakenRequest: z.number().optional(),

  // legacy: playwright only
  screenshot: z.string().optional(),

  // new: actions
  screenshots: z.array(z.string()).optional(),
  actionContent: z
    .array(
      z.object({
        url: z.string(),
        html: z.string(),
      }),
    )
    .optional(),
});

/** Parsed form of a successful check-status response. */
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
||||
|
||||
// Shape of a check-status response for a job that has not finished yet.
const processingSchema = z.object({
  jobId: z.string(),
  state: z.enum([
    "delayed",
    "active",
    "waiting",
    "waiting-children",
    "unknown",
  ]),
  processing: z.boolean(),
});
|
||||
|
||||
// Shape of a check-status response for a job that ended in failure.
const failedSchema = z.object({
  jobId: z.string(),
  state: z.literal("failed"),
  processing: z.literal(false),
  error: z.string(),
});
|
||||
|
||||
/**
 * Thrown by `fireEngineCheckStatus` when the job exists but has not finished.
 * The job id is attached as `cause.jobId`.
 */
export class StillProcessingError extends Error {
  constructor(jobId: string) {
    super("Job is still under processing", { cause: { jobId } });
    // Without this the error reports itself as a plain "Error" in logs and
    // stack traces.
    this.name = "StillProcessingError";
  }
}
|
||||
|
||||
/**
 * Asks fire-engine for the state of a scrape job and classifies the response
 * against the success / processing / failed schemas.
 *
 * @param logger Logger to report through.
 * @param jobId  Id of the fire-engine job to check.
 * @returns The parsed response when the job completed successfully.
 * @throws StillProcessingError when the job has not finished yet.
 * @throws EngineError when the job reports a failed state.
 * @throws Error when the response matches none of the known schemas.
 */
export async function fireEngineCheckStatus(logger: Logger, jobId: string): Promise<FireEngineCheckStatusSuccess> {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  const status = await Sentry.startSpan(
    { name: "fire-engine: Check status", attributes: { jobId } },
    async (span) => {
      // Propagate the trace to fire-engine only when Sentry is active.
      const tracingHeaders = Sentry.isInitialized()
        ? {
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }
        : {};

      return await robustFetch({
        url: `${fireEngineURL}/scrape/${jobId}`,
        method: "GET",
        logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }),
        headers: { ...tracingHeaders },
      });
    },
  );

  const completed = successSchema.safeParse(status);
  if (completed.success) {
    logger.debug("Scrape succeeded!", { jobId });
    return completed.data;
  }

  if (processingSchema.safeParse(status).success) {
    logger.debug("Scrape is still processing", { jobId });
    throw new StillProcessingError(jobId);
  }

  if (failedSchema.safeParse(status).success) {
    logger.debug("Scrape job failed", { status, jobId });
    throw new EngineError("Scrape job failed", {
      cause: { status, jobId },
    });
  }

  logger.debug("Check status returned response not matched by any schema", { status, jobId });
  throw new Error("Check status returned response not matched by any schema", {
    cause: { status, jobId },
  });
}
|
33
apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts
Normal file
33
apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts
Normal file
|
@ -0,0 +1,33 @@
|
|||
import { Logger } from "winston";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
|
||||
/**
 * Asks fire-engine to delete a scrape job. This is best-effort: the request
 * is sent with `ignoreResponse` and `ignoreFailure` set.
 *
 * @param logger Logger to report through.
 * @param jobId  Id of the fire-engine job to delete.
 */
export async function fireEngineDelete(logger: Logger, jobId: string) {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  await Sentry.startSpan(
    { name: "fire-engine: Delete scrape", attributes: { jobId } },
    async (span) => {
      // Propagate the trace to fire-engine only when Sentry is active.
      const tracingHeaders = Sentry.isInitialized()
        ? {
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }
        : {};

      await robustFetch({
        url: `${fireEngineURL}/scrape/${jobId}`,
        method: "DELETE",
        headers: { ...tracingHeaders },
        // We do not care whether this fails or not.
        ignoreResponse: true,
        ignoreFailure: true,
        logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
      });
    },
  );
}
|
198
apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
Normal file
198
apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
Normal file
|
@ -0,0 +1,198 @@
|
|||
import { Logger } from "winston";
|
||||
import { Meta } from "../..";
|
||||
import { fireEngineScrape, FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, FireEngineScrapeRequestPlaywright, FireEngineScrapeRequestTLSClient } from "./scrape";
|
||||
import { EngineScrapeResult } from "..";
|
||||
import { fireEngineCheckStatus, FireEngineCheckStatusSuccess, StillProcessingError } from "./checkStatus";
|
||||
import { EngineError, TimeoutError } from "../../error";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { Action } from "../../../../lib/entities";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
|
||||
const defaultTimeout = 20000;

/**
 * Submits a scrape request to fire-engine and polls its status until the job
 * completes, fails, times out, or too many unexpected errors occur.
 *
 * This function does not take `Meta` on purpose. It may not access any
 * meta values to construct the request -- that must be done by the
 * `scrapeURLWithFireEngine*` functions.
 *
 * @param logger  Logger to report through.
 * @param request Fully built fire-engine request.
 * @param timeout Maximum polling time in milliseconds (default: `defaultTimeout`).
 * @returns The successful check-status response.
 * @throws TimeoutError when the job does not complete within `timeout`.
 * @throws EngineError when fire-engine reports the job as failed.
 * @throws Error when three unexpected status-check errors have accumulated
 *         (they are available on `cause.errors`).
 */
async function performFireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient>(
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
  timeout = defaultTimeout,
): Promise<FireEngineCheckStatusSuccess> {
  const scrape = await fireEngineScrape(logger.child({ method: "fireEngineScrape" }), request);

  const startTime = Date.now();
  const errorLimit = 3;
  const pollIntervalMs = 250;
  const errors: unknown[] = [];

  while (true) {
    if (errors.length >= errorLimit) {
      logger.error("Error limit hit.", { errors });
      throw new Error("Error limit hit. See e.cause.errors for errors.", { cause: { errors } });
    }

    if (Date.now() - startTime > timeout) {
      logger.info("Fire-engine was unable to scrape the page before timing out.", { errors, timeout });
      throw new TimeoutError("Fire-engine was unable to scrape the page before timing out", { cause: { errors, timeout } });
    }

    try {
      // Return immediately on success; sleeping first would add a needless
      // poll interval of latency to every successful scrape.
      return await fireEngineCheckStatus(logger.child({ method: "fireEngineCheckStatus" }), scrape.jobId);
    } catch (error) {
      if (error instanceof StillProcessingError) {
        logger.debug("Scrape is still processing...");
      } else if (error instanceof EngineError) {
        logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId });
        throw error;
      } else {
        // Unexpected failure: report it and count it against the error limit.
        Sentry.captureException(error);
        errors.push(error);
        logger.debug(`An unexpected error occurred while calling checkStatus. Error counter is now at ${errors.length}.`, { error, jobId: scrape.jobId });
      }
    }

    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
}
|
||||
|
||||
/**
 * Scrapes `meta.url` through fire-engine's chrome-cdp engine.
 *
 * The `waitFor` option and the screenshot formats are translated into actions
 * that are placed ahead of the user-specified actions. When a screenshot
 * format was requested, the first returned action screenshot is moved into
 * the result's `screenshot` field.
 *
 * @param meta Scrape context (URL, options, internal options, logger).
 * @returns The engine result; `actions` is included only when at least one
 *          action was sent.
 */
export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<EngineScrapeResult> {
  const wantsFullPageScreenshot = meta.options.formats.includes("screenshot@fullPage");
  const wantsScreenshot = meta.options.formats.includes("screenshot") || wantsFullPageScreenshot;

  const actions: Action[] = [];

  // Transform waitFor option into an action (unsupported by chrome-cdp)
  if (meta.options.waitFor !== 0) {
    actions.push({
      type: "wait" as const,
      milliseconds: meta.options.waitFor,
    });
  }

  // Transform screenshot format into an action (unsupported by chrome-cdp)
  if (wantsScreenshot) {
    actions.push({
      type: "screenshot" as const,
      fullPage: wantsFullPageScreenshot,
    });
  }

  // Include specified actions
  actions.push(...(meta.options.actions ?? []));

  const hasActions = actions.length > 0;

  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = {
    url: meta.url,
    engine: "chrome-cdp",
    instantReturn: true,
    skipTlsVerification: meta.options.skipTlsVerification,
    headers: meta.options.headers,
    ...(hasActions ? { actions } : {}),
    priority: meta.internalOptions.priority,
    geolocation: meta.options.geolocation,
    mobile: meta.options.mobile,
    // TODO: scrollXPaths
  };

  const response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);

  if (wantsScreenshot) {
    meta.logger.debug("Transforming screenshots from actions into screenshot field", { screenshots: response.screenshots });
    // Pop the first action screenshot off the list and promote it.
    response.screenshot = response.screenshots?.shift();
    meta.logger.debug("Screenshot transformation done", { screenshots: response.screenshots, screenshot: response.screenshot });
  }

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  const result: EngineScrapeResult = {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,

    screenshot: response.screenshot,
  };

  if (hasActions) {
    result.actions = {
      screenshots: response.screenshots ?? [],
      scrapes: response.actionContent ?? [],
    };
  }

  return result;
}
|
||||
|
||||
/**
 * Scrapes `meta.url` through fire-engine's playwright engine.
 *
 * The `screenshot` / `screenshot@fullPage` formats and the `waitFor` option
 * are forwarded as native request fields.
 *
 * @param meta Scrape context (URL, options, internal options, logger).
 * @returns The engine result; `screenshot` is set to the first entry of the
 *          response's `screenshots` list when that list is non-empty.
 */
export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<EngineScrapeResult> {
  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = {
    url: meta.url,
    engine: "playwright",
    instantReturn: true,

    headers: meta.options.headers,
    priority: meta.internalOptions.priority,
    screenshot: meta.options.formats.includes("screenshot"),
    fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
    wait: meta.options.waitFor,
    geolocation: meta.options.geolocation,
  };

  const response = await performFireEngineScrape(
    // The label previously named the ChromeCDP function (copy-paste), which
    // made log lines from this engine misleading.
    meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  return {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,

    ...(response.screenshots !== undefined && response.screenshots.length > 0 ? ({
      screenshot: response.screenshots[0],
    }) : {}),
  };
}
|
||||
|
||||
/**
 * Scrapes `meta.url` through fire-engine's tlsclient engine.
 *
 * Besides the common fields, the internal `atsv` and `v0DisableJsDom` options
 * are forwarded as `atsv` and `disableJsDom`.
 *
 * @param meta Scrape context (URL, options, internal options, logger).
 * @returns The engine result (URL, HTML, page error and status code).
 */
export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<EngineScrapeResult> {
  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = {
    url: meta.url,
    engine: "tlsclient",
    instantReturn: true,

    headers: meta.options.headers,
    priority: meta.internalOptions.priority,

    atsv: meta.internalOptions.atsv,
    geolocation: meta.options.geolocation,
    disableJsDom: meta.internalOptions.v0DisableJsDom,
  };

  const response = await performFireEngineScrape(
    // The label previously named the ChromeCDP function (copy-paste), which
    // made log lines from this engine misleading.
    meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  return {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,
  };
}
|
94
apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts
Normal file
94
apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts
Normal file
|
@ -0,0 +1,94 @@
|
|||
import { Logger } from "winston";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { z } from "zod";
|
||||
|
||||
import { Action } from "../../../../lib/entities";
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
|
||||
/** Request fields shared by every fire-engine scrape engine. */
export type FireEngineScrapeRequestCommon = {
  url: string;

  headers?: Record<string, string>;

  /** default: true */
  blockMedia?: boolean;
  /** default: true */
  blockAds?: boolean;
  // pageOptions?: any; // unused, .scrollXPaths is considered on FE side

  // useProxy?: boolean; // unused, default: true
  // customProxy?: string; // unused

  // disableSmartWaitCache?: boolean; // unused, default: false
  // skipDnsCheck?: boolean; // unused, default: false

  /** default: 1 */
  priority?: number;
  // team_id?: string; // unused
  /** default: true */
  logRequest?: boolean;
  /** default: false */
  instantReturn?: boolean;
  geolocation?: {
    country?: string;
    languages?: string[];
  };
};
|
||||
|
||||
/** Request fields specific to the chrome-cdp engine. */
export type FireEngineScrapeRequestChromeCDP = {
  engine: "chrome-cdp";
  skipTlsVerification?: boolean;
  actions?: Action[];
  /** cannot be false */
  blockMedia?: true;
  mobile?: boolean;
};
|
||||
|
||||
/** Request fields specific to the playwright engine. */
export type FireEngineScrapeRequestPlaywright = {
  engine: "playwright";
  /** default: true */
  blockAds?: boolean;

  // `screenshot` and `fullPageScreenshot` are mutually exclusive, default: false
  screenshot?: boolean;
  fullPageScreenshot?: boolean;

  /** default: 0 */
  wait?: number;
};
|
||||
|
||||
/** Request fields specific to the tlsclient engine. */
export type FireEngineScrapeRequestTLSClient = {
  engine: "tlsclient";
  /** v0 only, default: false */
  atsv?: boolean;
  /** v0 only, default: false */
  disableJsDom?: boolean;
  // blockAds?: boolean; // default: true
};
|
||||
|
||||
// Response expected from the scrape submission endpoint: the id of the queued
// job and whether it is still processing.
const schema = z.object({
  jobId: z.string(),
  processing: z.boolean(),
});
|
||||
|
||||
/**
 * Submits a scrape request to fire-engine (`POST /scrape`).
 *
 * @param logger  Logger to report through.
 * @param request Fully built fire-engine request; sent as the request body.
 * @returns The response validated against `schema` (job id and processing flag).
 */
export async function fireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient> (
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
): Promise<z.infer<typeof schema>> {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  // TODO: retries

  return await Sentry.startSpan(
    { name: "fire-engine: Scrape", attributes: { url: request.url } },
    async (span) => {
      // Propagate the trace to fire-engine only when Sentry is active.
      const tracingHeaders = Sentry.isInitialized()
        ? {
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }
        : {};

      return await robustFetch({
        url: `${fireEngineURL}/scrape`,
        method: "POST",
        headers: { ...tracingHeaders },
        body: request,
        logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
        schema,
        tryCount: 3,
      });
    },
  );
}
|
295
apps/api/src/scraper/scrapeURL/engines/index.ts
Normal file
295
apps/api/src/scraper/scrapeURL/engines/index.ts
Normal file
|
@ -0,0 +1,295 @@
|
|||
import { ScrapeActionContent } from "../../../lib/entities";
|
||||
import { Meta } from "..";
|
||||
import { scrapeDOCX } from "./docx";
|
||||
import { scrapeURLWithFireEngineChromeCDP, scrapeURLWithFireEnginePlaywright, scrapeURLWithFireEngineTLSClient } from "./fire-engine";
|
||||
import { scrapePDF } from "./pdf";
|
||||
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
||||
import { scrapeURLWithFetch } from "./fetch";
|
||||
import { scrapeURLWithPlaywright } from "./playwright";
|
||||
|
||||
/** Identifier of every scrape engine known to scrapeURL. */
export type Engine =
  | "fire-engine;chrome-cdp"
  | "fire-engine;playwright"
  | "fire-engine;tlsclient"
  | "scrapingbee"
  | "scrapingbeeLoad"
  | "playwright"
  | "fetch"
  | "pdf"
  | "docx";

// An optional engine family is enabled when its environment variable is set to
// a non-empty value.
const useScrapingBee = Boolean(process.env.SCRAPING_BEE_API_KEY);
const useFireEngine = Boolean(process.env.FIRE_ENGINE_BETA_URL);
const usePlaywright = Boolean(process.env.PLAYWRIGHT_MICROSERVICE_URL);

const fireEngineEngines: Engine[] = useFireEngine
  ? ["fire-engine;chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient"]
  : [];
const scrapingBeeEngines: Engine[] = useScrapingBee ? ["scrapingbee", "scrapingbeeLoad"] : [];
const playwrightEngines: Engine[] = usePlaywright ? ["playwright"] : [];

/** Engines available in this deployment; the last three are always present. */
export const engines: Engine[] = [
  ...fireEngineEngines,
  ...scrapingBeeEngines,
  ...playwrightEngines,
  "fetch",
  "pdf",
  "docx",
];
|
||||
|
||||
/**
 * Every feature flag an engine can declare support for.
 *
 * NOTE(review): the flag is spelled "screenshot@fullScreen" while the request
 * format checked elsewhere is "screenshot@fullPage" -- confirm the code that
 * builds the flag set maps one onto the other.
 */
export const featureFlags = [
  "actions", "waitFor",
  "screenshot", "screenshot@fullScreen",
  "pdf", "docx",
  "atsv",
  "location", "mobile",
  "skipTlsVerification",
  "useFastMode",
] as const;

/** Union of all feature flag names. */
export type FeatureFlag = (typeof featureFlags)[number];
|
||||
|
||||
/**
 * Priority of each feature flag. `buildFallbackList` sums these to score how
 * well an engine covers the flags requested for a scrape.
 */
export const featureFlagOptions: Record<FeatureFlag, { priority: number }> = {
  actions: { priority: 20 },
  waitFor: { priority: 1 },
  screenshot: { priority: 10 },
  "screenshot@fullScreen": { priority: 10 },
  pdf: { priority: 100 },
  docx: { priority: 100 },
  atsv: { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
  useFastMode: { priority: 90 },
  location: { priority: 10 },
  mobile: { priority: 10 },
  skipTlsVerification: { priority: 10 },
};
|
||||
|
||||
/** Result every engine handler resolves with. */
export type EngineScrapeResult = {
  url: string;

  html: string;
  markdown?: string;
  statusCode: number;
  error?: string;

  screenshot?: string;
  actions?: {
    screenshots: string[];
    scrapes: ScrapeActionContent[];
  };
};
|
||||
|
||||
// Dispatch table from engine identifier to the function that runs it. The two
// ScrapingBee engines are the same factory called with different arguments.
const engineHandlers: Record<Engine, (meta: Meta) => Promise<EngineScrapeResult>> = {
  "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
  "fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
  "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
  scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"),
  scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"),
  playwright: scrapeURLWithPlaywright,
  fetch: scrapeURLWithFetch,
  pdf: scrapePDF,
  docx: scrapeDOCX,
};
|
||||
|
||||
/**
 * Capabilities and relative quality of every engine, consumed by
 * `buildFallbackList`.
 */
export const engineOptions: {
  [E in Engine]: {
    // A list of feature flags the engine supports.
    features: { [F in FeatureFlag]: boolean },

    // This defines the order of engines in general. The engine with the highest quality will be used the most.
    // Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
    quality: number,
  }
} = (() => {
  // Baseline: nothing supported. Each engine lists only what it does support.
  const nothingSupported: { [F in FeatureFlag]: boolean } = {
    "actions": false,
    "waitFor": false,
    "screenshot": false,
    "screenshot@fullScreen": false,
    "pdf": false,
    "docx": false,
    "atsv": false,
    "location": false,
    "mobile": false,
    "skipTlsVerification": false,
    "useFastMode": false,
  };

  // Builds a fresh feature map with exactly the given flags switched on.
  const supporting = (...flags: FeatureFlag[]): { [F in FeatureFlag]: boolean } => {
    const features = { ...nothingSupported };
    for (const flag of flags) {
      features[flag] = true;
    }
    return features;
  };

  return {
    "fire-engine;chrome-cdp": {
      // waitFor and both screenshot flags are supported through the actions transform
      features: supporting(
        "actions",
        "waitFor",
        "screenshot",
        "screenshot@fullScreen",
        "location",
        "mobile",
        "skipTlsVerification",
      ),
      quality: 50,
    },
    "fire-engine;playwright": {
      features: supporting("waitFor", "screenshot", "screenshot@fullScreen"),
      quality: 40,
    },
    "scrapingbee": {
      features: supporting("waitFor", "screenshot", "screenshot@fullScreen"),
      quality: 30,
    },
    "scrapingbeeLoad": {
      features: supporting("waitFor", "screenshot", "screenshot@fullScreen"),
      quality: 29,
    },
    "playwright": {
      features: supporting("waitFor"),
      quality: 20,
    },
    "fire-engine;tlsclient": {
      features: supporting("atsv", "location", "useFastMode"),
      quality: 10,
    },
    "fetch": {
      features: supporting("useFastMode"),
      quality: 5,
    },
    "pdf": {
      features: supporting("pdf", "useFastMode"),
      quality: -10,
    },
    "docx": {
      features: supporting("docx", "useFastMode"),
      quality: -10,
    },
  };
})();
|
||||
|
||||
/**
 * Builds the ordered list of engines to try for a scrape.
 *
 * An engine qualifies when the summed priority of the requested feature flags
 * it supports reaches half (rounded down) of the summed priority of all
 * requested flags. If any qualifying engine has positive quality, engines with
 * non-positive quality are dropped. The remainder is sorted by support score,
 * then by engine quality, both descending.
 *
 * When `meta.internalOptions.forceEngine` is set, only that engine is considered.
 *
 * @param meta Scrape context providing `featureFlags`, `internalOptions` and `logger`.
 * @returns Candidate engines in the order they should be tried, each with the
 *          set of requested flags it does not support.
 */
export function buildFallbackList(meta: Meta): {
  engine: Engine,
  unsupportedFeatures: Set<FeatureFlag>,
}[] {
  const requestedFlags: FeatureFlag[] = [...meta.featureFlags];
  const totalPriority = (flags: FeatureFlag[]): number =>
    flags.reduce((sum, flag) => sum + featureFlagOptions[flag].priority, 0);

  const prioritySum = totalPriority(requestedFlags);
  const priorityThreshold = Math.floor(prioritySum / 2);

  const candidates = meta.internalOptions.forceEngine !== undefined
    ? [meta.internalOptions.forceEngine]
    : engines;

  let selectedEngines: {
    engine: Engine,
    supportScore: number,
    unsupportedFeatures: Set<FeatureFlag>,
  }[] = [];

  for (const engine of candidates) {
    const { features } = engineOptions[engine];

    // Split the requested flags by whether this engine supports them.
    const supported = requestedFlags.filter((flag) => features[flag] === true);
    const unsupportedFeatures = new Set(requestedFlags.filter((flag) => features[flag] !== true));
    const supportScore = totalPriority(supported);

    const logContext = { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures };

    if (supportScore < priorityThreshold) {
      meta.logger.debug(`Engine ${engine} does not meet feature priority threshold`, logContext);
      continue;
    }

    selectedEngines.push({ engine, supportScore, unsupportedFeatures });
    meta.logger.debug(`Engine ${engine} meets feature priority threshold`, logContext);
  }

  // Engines with non-positive quality are kept only when nothing better qualified.
  const qualityOf = (entry: { engine: Engine }): number => engineOptions[entry.engine].quality;
  const regularEngines = selectedEngines.filter((entry) => qualityOf(entry) > 0);
  if (regularEngines.length > 0) {
    selectedEngines = regularEngines;
  }

  selectedEngines.sort((a, b) => b.supportScore - a.supportScore || qualityOf(b) - qualityOf(a));

  return selectedEngines;
}
|
||||
|
||||
/**
 * Runs a single engine against the request described by `meta`.
 *
 * The handler registered for `engine` in `engineHandlers` is invoked with a
 * shallow copy of `meta` whose logger is a child logger tagged with the
 * handler's name and the engine identifier.
 *
 * @param meta Scrape context; it is not mutated.
 * @param engine Identifier of the engine whose handler should run.
 * @returns Whatever the engine handler resolves to.
 */
export async function scrapeURLWithEngine(meta: Meta, engine: Engine): Promise<EngineScrapeResult> {
    const fn = engineHandlers[engine];

    // Function#name is never null/undefined: anonymous handlers (such as the
    // closures returned by handler factories) report "". `||` is therefore
    // required for the fallback label to ever apply; `??` would never trigger.
    const method = fn.name || "scrapeURLWithEngine";
    const logger = meta.logger.child({ method, engine });

    return await fn({
        ...meta,
        logger,
    });
}
|
114
apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
Normal file
114
apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
Normal file
|
@ -0,0 +1,114 @@
|
|||
import { createReadStream, promises as fs } from "node:fs";
|
||||
import FormData from "form-data";
|
||||
import { Meta } from "../..";
|
||||
import { EngineScrapeResult } from "..";
|
||||
import * as marked from "marked";
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
import { z } from "zod";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import escapeHtml from "escape-html";
|
||||
import PdfParse from "pdf-parse";
|
||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||
|
||||
// Output shared by the PDF processors in this file: rendered HTML plus an optional markdown rendition.
type PDFProcessorResult = {html: string, markdown?: string};
|
||||
|
||||
/**
 * Converts a downloaded PDF to markdown and HTML via the LlamaIndex cloud
 * parsing API.
 *
 * The file is uploaded as multipart form data, then the markdown result of the
 * returned job is requested. The HTML is produced by rendering that markdown
 * with `marked`.
 *
 * @param meta Scrape context; only its logger is used here.
 * @param tempFilePath Path of the PDF on local disk.
 * @returns The markdown returned by the service and its HTML rendering.
 */
async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
    meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });

    const form = new FormData();
    form.append("file", createReadStream(tempFilePath), {
        filename: tempFilePath,
        contentType: "application/pdf", // NOTE: request.headers["Content-Type"]?
    });

    const uploadSchema = z.object({
        id: z.string(),
    });
    const { id: jobId } = await robustFetch({
        url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
        method: "POST",
        headers: {
            "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
        },
        body: form,
        logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/upload/robustFetch" }),
        schema: uploadSchema,
    });

    // TODO: timeout, retries
    const resultSchema = z.object({
        markdown: z.string(),
    });
    const { markdown } = await robustFetch({
        url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
        method: "GET",
        headers: {
            "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
        },
        logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
        schema: resultSchema,
    });

    const html = await marked.parse(markdown, { async: true });

    return { markdown, html };
}
|
||||
|
||||
/**
 * Local PDF processor: extracts the document text with `pdf-parse`.
 *
 * The extracted text is HTML-escaped and that same escaped string is returned
 * as both the markdown and the HTML representation.
 *
 * @param meta Scrape context; only its logger is used here.
 * @param tempFilePath Path of the PDF on local disk.
 * @returns The escaped text under both `markdown` and `html`.
 */
async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
    meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });

    const pdfBytes = await fs.readFile(tempFilePath);
    const { text } = await PdfParse(pdfBytes);
    const safeText = escapeHtml(text);

    return { markdown: safeText, html: safeText };
}
|
||||
|
||||
/**
 * Engine handler for PDF documents.
 *
 * When `meta.options.parsePDF` is falsy the file is fetched into memory and
 * returned base64-encoded as both `html` and `markdown`.
 *
 * Otherwise the file is downloaded to a temporary path and parsed. LlamaParse
 * is tried first when `LLAMAPARSE_API_KEY` is set; if it is not configured or
 * fails (the failure is logged and reported to Sentry), the local parser is
 * used instead. The temporary file is removed on every exit path, including
 * when the local parser throws.
 *
 * @param meta Scrape context (id, url, options, logger).
 * @returns The final URL and status of the download plus the parsed content.
 */
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
    if (!meta.options.parsePDF) {
        const file = await fetchFileToBuffer(meta.url);
        const content = file.buffer.toString("base64");
        return {
            url: file.response.url,
            statusCode: file.response.status,

            html: content,
            markdown: content,
        };
    }

    const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

    try {
        let result: PDFProcessorResult | null = null;

        if (process.env.LLAMAPARSE_API_KEY) {
            try {
                result = await scrapePDFWithLlamaParse({
                    ...meta,
                    logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
                }, tempFilePath);
            } catch (error) {
                // Deliberate best-effort: a LlamaParse failure only triggers the local fallback.
                meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
                Sentry.captureException(error);
            }
        }

        if (result === null) {
            result = await scrapePDFWithParsePDF({
                ...meta,
                logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }),
            }, tempFilePath);
        }

        return {
            url: response.url,
            statusCode: response.status,

            html: result.html,
            markdown: result.markdown,
        };
    } finally {
        // Runs on success and on failure so a parser error cannot leak the
        // downloaded file in the temp directory.
        await fs.unlink(tempFilePath);
    }
}
|
42
apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
Normal file
42
apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
Normal file
|
@ -0,0 +1,42 @@
|
|||
import { z } from "zod";
|
||||
import { EngineScrapeResult } from "..";
|
||||
import { Meta } from "../..";
|
||||
import { TimeoutError } from "../../error";
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
|
||||
/**
 * Engine handler that delegates scraping to the Playwright microservice at
 * `PLAYWRIGHT_MICROSERVICE_URL`.
 *
 * The request is raced against a local deadline of `20000 + waitFor`
 * milliseconds, the same budget that is sent to the microservice and reported
 * in the error cause. The deadline timer is cleared as soon as the race
 * settles, so no timer or rejection outlives the call.
 *
 * @param meta Scrape context (url, options, logger).
 * @returns The page content, status code and page error reported by the microservice.
 * @throws TimeoutError when the microservice does not answer within the deadline.
 */
export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeResult> {
    const timeout = 20000 + meta.options.waitFor;

    let timer: ReturnType<typeof setTimeout> | undefined;
    const deadline = new Promise<never>((_resolve, reject) => {
        timer = setTimeout(() => {
            reject(new TimeoutError("Playwright was unable to scrape the page before timing out", { cause: { timeout } }));
        }, timeout);
    });

    try {
        // The fetch promise is passed un-awaited: awaiting it inside the array
        // literal would complete the request before the race is even created.
        const response = await Promise.race([
            robustFetch({
                url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
                headers: {
                    "Content-Type": "application/json",
                },
                body: JSON.stringify({
                    url: meta.url,
                    wait_after_load: meta.options.waitFor,
                    timeout,
                    headers: meta.options.headers,
                }),
                method: "POST",
                logger: meta.logger.child({ method: "scrapeURLWithPlaywright/robustFetch" }),
                schema: z.object({
                    content: z.string(),
                    pageStatusCode: z.number(),
                    pageError: z.string().optional(),
                }),
            }),
            deadline,
        ]);

        return {
            url: meta.url, // TODO: improve redirect following
            html: response.content,
            statusCode: response.pageStatusCode,
            error: response.pageError,
        };
    } finally {
        clearTimeout(timer);
    }
}
|
66
apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
Normal file
66
apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
Normal file
|
@ -0,0 +1,66 @@
|
|||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { Meta } from "../..";
|
||||
import { EngineScrapeResult } from "..";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
import { AxiosError, type AxiosResponse } from "axios";
|
||||
import { EngineError } from "../../error";
|
||||
|
||||
// Module-level ScrapingBee client, created once when this module is loaded from SCRAPING_BEE_API_KEY.
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
||||
|
||||
/**
 * Creates a ScrapingBee engine handler bound to a page-load strategy.
 *
 * The returned handler requests the page as a JSON response with transparent
 * status codes. An Axios error that carries a response is treated as a normal
 * response; any other error is rethrown. The handler then:
 * - throws `EngineError` ("Engine error #34") when the payload reports errors
 *   or contains neither a Date nor a Content-Type header;
 * - throws `EngineError` ("Engine error #35") when the page body is not a string;
 * - passes the page headers to `specialtyScrapeCheck`;
 * - returns the page, adding a PNG data URL when a screenshot is present.
 *
 * @param wait_browser Browser event ScrapingBee should wait for.
 * @returns An engine handler taking the scrape context.
 */
export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "networkidle2"): ((meta: Meta) => Promise<EngineScrapeResult>) {
    return async (meta: Meta): Promise<EngineScrapeResult> => {
        const requestParams = {
            timeout: 15000, // TODO: dynamic timeout based on request timeout
            wait_browser: wait_browser,
            wait: Math.min(meta.options.waitFor, 35000),
            transparent_status_code: true,
            json_response: true,
            screenshot: meta.options.formats.includes("screenshot"),
            screenshot_full_page: meta.options.formats.includes("screenshot@fullPage"),
        };

        let response: AxiosResponse<any>;
        try {
            response = await client.get({
                url: meta.url,
                params: requestParams,
                headers: {
                    "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
                },
            });
        } catch (error) {
            if (error instanceof AxiosError && error.response !== undefined) {
                // Non-2xx answers still carry the JSON payload handled below.
                response = error.response;
            } else {
                throw error;
            }
        }

        const raw: Buffer = response.data;
        const body = JSON.parse(new TextDecoder().decode(raw));

        // A payload with neither a Date nor a Content-Type header is treated as an engine failure.
        const headers = body.headers ?? {};
        const probeHeader = ["Date", "date", "Content-Type", "content-type"]
            .map((key) => headers[key])
            .find((value) => value !== undefined && value !== null);
        const isHiddenEngineError = !probeHeader;

        if (body.errors || body.body?.error || isHiddenEngineError) {
            meta.logger.error("ScrapingBee threw an error", { body: body.body?.error ?? body.errors ?? body.body ?? body });
            throw new EngineError("Engine error #34", { cause: { body, statusCode: response.status } });
        }

        if (typeof body.body !== "string") {
            meta.logger.error("ScrapingBee: Body is not string??", { body });
            throw new EngineError("Engine error #35", { cause: { body, statusCode: response.status } });
        }

        specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithScrapingBee/specialtyScrapeCheck" }), body.headers);

        const screenshotPart = body.screenshot
            ? { screenshot: `data:image/png;base64,${body.screenshot}` }
            : {};

        return {
            url: body["resolved-url"] ?? meta.url,

            html: body.body,
            error: response.status >= 300 ? response.statusText : undefined,
            statusCode: response.status,
            ...screenshotPart,
        };
    };
}
|
45
apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
Normal file
45
apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
Normal file
|
@ -0,0 +1,45 @@
|
|||
import path from "path";
|
||||
import os from "os";
|
||||
import { createWriteStream, promises as fs } from "node:fs";
|
||||
import { EngineError } from "../../error";
|
||||
import { Writable } from "stream";
|
||||
import { v4 as uuid } from "uuid";
|
||||
|
||||
/**
 * Fetches `url` and reads the whole response body into memory.
 *
 * @param url Address of the file to fetch.
 * @returns The fetch `Response` together with its body as a `Buffer`.
 */
export async function fetchFileToBuffer(url: string): Promise<{
    response: Response,
    buffer: Buffer
}> {
    // TODO: maybe we could use tlsclient for this? for proxying
    const response = await fetch(url);
    const bytes = await response.arrayBuffer();
    const buffer = Buffer.from(bytes);

    return { response, buffer };
}
|
||||
|
||||
/**
 * Downloads `url` into a uniquely named file in the OS temp directory.
 *
 * The write stream is only opened once the request has produced a body, so a
 * failed request leaves no file or open handle behind. The transfer is awaited
 * through the stream pipeline itself, so download and write failures are both
 * surfaced to the caller instead of rejecting a floating promise. On failure
 * the partially written file is removed.
 *
 * @param id Identifier embedded in the temp file name.
 * @param url Address of the file to download.
 * @returns The fetch `Response` and the path of the written file. The caller
 *          is responsible for deleting the file.
 * @throws EngineError when the response has no body or the transfer fails.
 */
export async function downloadFile(id: string, url: string): Promise<{
    response: Response
    tempFilePath: string
}> {
    const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);

    const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying

    // This should never happen in the current state of JS (2024), but let's check anyways.
    const body = response.body;
    if (body === null) {
        throw new EngineError("Response body was null", { cause: { response } });
    }

    const tempFileWrite = createWriteStream(tempFilePath);
    try {
        // pipeTo settles only after the destination has been closed, i.e.
        // after the write stream has finished flushing.
        await body.pipeTo(Writable.toWeb(tempFileWrite));
    } catch (error) {
        tempFileWrite.destroy();
        try {
            // Best-effort cleanup; the transfer error below is what matters to the caller.
            await fs.rm(tempFilePath, { force: true });
        } catch {
            // Ignored on purpose: nothing useful can be done if cleanup fails.
        }
        throw new EngineError("Failed to write to temp file", { cause: { error } });
    }

    return {
        response,
        tempFilePath,
    };
}
|
|
@ -0,0 +1,14 @@
|
|||
import { Logger } from "winston";
|
||||
import { AddFeatureError } from "../../error";
|
||||
|
||||
/**
 * Inspects response headers for document types that need a dedicated engine.
 *
 * The Content-Type header is looked up case-insensitively by name. Its media
 * type (the part before any `;` parameters) is compared case-insensitively and
 * ignoring surrounding whitespace, since media types are case-insensitive.
 *
 * @param logger Logger used to warn when no Content-Type header is present.
 * @param headers Response headers, or `undefined` when none are available.
 * @throws AddFeatureError with `"pdf"` for PDF responses and `"docx"` for
 *         Word (OOXML) document responses.
 */
export function specialtyScrapeCheck(logger: Logger, headers: Record<string, string> | undefined) {
    const contentType = (Object.entries(headers ?? {}).find(x => x[0].toLowerCase() === "content-type") ?? [])[1];

    if (contentType === undefined) {
        logger.warn("Failed to check contentType -- was not present in headers", { headers });
        return;
    }

    // Strip parameters such as "; charset=..." and normalise before comparing.
    const paramStart = contentType.indexOf(";");
    const mediaType = (paramStart === -1 ? contentType : contentType.slice(0, paramStart)).trim().toLowerCase();

    if (mediaType === "application/pdf") { // .pdf
        throw new AddFeatureError(["pdf"]);
    } else if (mediaType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") { // .docx
        throw new AddFeatureError(["docx"]);
    }
}
|
34
apps/api/src/scraper/scrapeURL/error.ts
Normal file
34
apps/api/src/scraper/scrapeURL/error.ts
Normal file
|
@ -0,0 +1,34 @@
|
|||
import { EngineResultsTracker } from "."
|
||||
import { Engine, FeatureFlag } from "./engines"
|
||||
|
||||
/**
 * Error thrown when a scraping engine fails.
 *
 * `name` is set explicitly so logs and error reports show "EngineError"
 * instead of the generic "Error".
 */
export class EngineError extends Error {
    /**
     * @param message Human-readable description of the failure.
     * @param options Standard error options, e.g. `{ cause }`.
     */
    constructor(message?: string, options?: ErrorOptions) {
        super(message, options);
        this.name = "EngineError";
    }
}
|
||||
|
||||
/**
 * Error thrown when a scrape does not complete within its time budget.
 *
 * `name` is set explicitly so logs and error reports show "TimeoutError"
 * instead of the generic "Error".
 */
export class TimeoutError extends Error {
    /**
     * @param message Human-readable description of the timeout.
     * @param options Standard error options, e.g. `{ cause }`.
     */
    constructor(message?: string, options?: ErrorOptions) {
        super(message, options);
        this.name = "TimeoutError";
    }
}
|
||||
|
||||
/**
 * Error carrying the engines that were tried and the results tracker passed
 * in by the thrower; its message is always "All scraping engines failed!".
 *
 * `name` is set explicitly so logs and error reports show
 * "NoEnginesLeftError" instead of the generic "Error".
 */
export class NoEnginesLeftError extends Error {
    /** Engines passed in by the thrower. */
    public fallbackList: Engine[];
    /** Results tracker passed in by the thrower. */
    public results: EngineResultsTracker;

    /**
     * @param fallbackList Engines to record on the error.
     * @param results Results tracker to record on the error.
     */
    constructor(fallbackList: Engine[], results: EngineResultsTracker) {
        super("All scraping engines failed!");
        this.name = "NoEnginesLeftError";
        this.fallbackList = fallbackList;
        this.results = results;
    }
}
|
||||
|
||||
/**
 * Error carrying a list of newly discovered feature flags (for example the
 * flags for a PDF or DOCX response).
 *
 * `name` is set explicitly so logs and error reports show "AddFeatureError"
 * instead of the generic "Error".
 */
export class AddFeatureError extends Error {
    /** Feature flags discovered by the thrower. */
    public featureFlags: FeatureFlag[];

    /**
     * @param featureFlags Flags to record on the error; they are also listed in the message.
     */
    constructor(featureFlags: FeatureFlag[]) {
        super("New feature flags have been discovered: " + featureFlags.join(", "));
        this.name = "AddFeatureError";
        this.featureFlags = featureFlags;
    }
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user