mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
WebScraper
refactor into scrapeURL
(#714)
* feat: use strictNullChecking * feat: switch logger to Winston * feat(scrapeURL): first batch * fix(scrapeURL): error swallow * fix(scrapeURL): add timeout to EngineResultsTracker * fix(scrapeURL): report unexpected error to sentry * chore: remove unused modules * feat(transfomers/coerce): warn when a format's response is missing * feat(scrapeURL): feature flag priorities, engine quality sorting, PDF and DOCX support * (add note) * feat(scrapeURL): wip readme * feat(scrapeURL): LLM extract * feat(scrapeURL): better warnings * fix(scrapeURL/engines/fire-engine;playwright): fix screenshot * feat(scrapeURL): add forceEngine internal option * feat(scrapeURL/engines): scrapingbee * feat(scrapeURL/transformars): uploadScreenshot * feat(scrapeURL): more intense tests * bunch of stuff * get rid of WebScraper (mostly) * adapt batch scrape * add staging deploy workflow * fix yaml * fix logger issues * fix v1 test schema * feat(scrapeURL/fire-engine/chrome-cdp): remove wait inserts on actions * scrapeURL: v0 backwards compat * logger fixes * feat(scrapeurl): v0 returnOnlyUrls support * fix(scrapeURL/v0): URL leniency * fix(batch-scrape): ts non-nullable * fix(scrapeURL/fire-engine/chromecdp): fix wait action * fix(logger): remove error debug key * feat(requests.http): use dotenv expression * fix(scrapeURL/extractMetadata): extract custom metadata * fix crawl option conversion * feat(scrapeURL): Add retry logic to robustFetch * fix(scrapeURL): crawl stuff * fix(scrapeURL): LLM extract * fix(scrapeURL/v0): search fix * fix(tests/v0): grant larger response size to v0 crawl status * feat(scrapeURL): basic fetch engine * feat(scrapeURL): playwright engine * feat(scrapeURL): add url-specific parameters * Update readme and examples * added e2e tests for most parameters. Still a few actions, location and iframes to be done. * fixed type * Nick: * Update scrape.ts * Update index.ts * added actions and base64 check * Nick: skipTls feature flag? 
* 403 * todo * todo * fixes * yeet headers from url specific params * add warning when final engine has feature deficit * expose engine results tracker for ScrapeEvents implementation * ingest scrape events * fixed some tests * comment * Update index.test.ts * fixed rawHtml * Update index.test.ts * update comments * move geolocation to global f-e option, fix removeBase64Images * Nick: * trim url-specific params * Update index.ts --------- Co-authored-by: Eric Ciarla <ericciarla@yahoo.com> Co-authored-by: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com> Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
parent
ed5a0d3cf2
commit
8d467c8ca7
2
.github/archive/js-sdk.yml
vendored
2
.github/archive/js-sdk.yml
vendored
|
@ -8,7 +8,6 @@ env:
|
||||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||||
HOST: ${{ secrets.HOST }}
|
HOST: ${{ secrets.HOST }}
|
||||||
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
||||||
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
|
|
||||||
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
||||||
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
||||||
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
||||||
|
@ -21,7 +20,6 @@ env:
|
||||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
|
||||||
HDX_NODE_BETA_MODE: 1
|
HDX_NODE_BETA_MODE: 1
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|
2
.github/archive/python-sdk.yml
vendored
2
.github/archive/python-sdk.yml
vendored
|
@ -8,7 +8,6 @@ env:
|
||||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||||
HOST: ${{ secrets.HOST }}
|
HOST: ${{ secrets.HOST }}
|
||||||
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
||||||
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
|
|
||||||
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
||||||
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
||||||
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
||||||
|
@ -21,7 +20,6 @@ env:
|
||||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
|
||||||
HDX_NODE_BETA_MODE: 1
|
HDX_NODE_BETA_MODE: 1
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|
2
.github/archive/rust-sdk.yml
vendored
2
.github/archive/rust-sdk.yml
vendored
|
@ -8,7 +8,6 @@ env:
|
||||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||||
HOST: ${{ secrets.HOST }}
|
HOST: ${{ secrets.HOST }}
|
||||||
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
||||||
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
|
|
||||||
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
||||||
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
||||||
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
||||||
|
@ -21,7 +20,6 @@ env:
|
||||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
|
||||||
HDX_NODE_BETA_MODE: 1
|
HDX_NODE_BETA_MODE: 1
|
||||||
|
|
||||||
|
|
||||||
|
|
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
|
@ -12,7 +12,6 @@ env:
|
||||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||||
HOST: ${{ secrets.HOST }}
|
HOST: ${{ secrets.HOST }}
|
||||||
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
||||||
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
|
|
||||||
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
||||||
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
||||||
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
||||||
|
@ -25,7 +24,6 @@ env:
|
||||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
|
||||||
HDX_NODE_BETA_MODE: 1
|
HDX_NODE_BETA_MODE: 1
|
||||||
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
|
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
|
||||||
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
|
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
|
||||||
|
|
32
.github/workflows/deploy-image-staging.yml
vendored
Normal file
32
.github/workflows/deploy-image-staging.yml
vendored
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
name: STAGING Deploy Images to GHCR
|
||||||
|
|
||||||
|
env:
|
||||||
|
DOTNET_VERSION: '6.0.x'
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- mog/webscraper-refactor
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
push-app-image:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: './apps/api'
|
||||||
|
steps:
|
||||||
|
- name: 'Checkout GitHub Action'
|
||||||
|
uses: actions/checkout@main
|
||||||
|
|
||||||
|
- name: 'Login to GitHub Container Registry'
|
||||||
|
uses: docker/login-action@v1
|
||||||
|
with:
|
||||||
|
registry: ghcr.io
|
||||||
|
username: ${{github.actor}}
|
||||||
|
password: ${{secrets.GITHUB_TOKEN}}
|
||||||
|
|
||||||
|
- name: 'Build Inventory Image'
|
||||||
|
run: |
|
||||||
|
docker build . --tag ghcr.io/mendableai/firecrawl-staging:latest
|
||||||
|
docker push ghcr.io/mendableai/firecrawl-staging:latest
|
|
@ -41,7 +41,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
|
||||||
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
|
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
|
||||||
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
|
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
|
||||||
BULL_AUTH_KEY= @
|
BULL_AUTH_KEY= @
|
||||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
|
||||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||||
|
|
|
@ -62,7 +62,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
|
||||||
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
|
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
|
||||||
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
|
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
|
||||||
BULL_AUTH_KEY= @
|
BULL_AUTH_KEY= @
|
||||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
|
||||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||||
|
|
|
@ -33,8 +33,6 @@ SCRAPING_BEE_API_KEY=
|
||||||
# add for LLM dependednt features (image alt generation, etc.)
|
# add for LLM dependednt features (image alt generation, etc.)
|
||||||
OPENAI_API_KEY=
|
OPENAI_API_KEY=
|
||||||
BULL_AUTH_KEY=@
|
BULL_AUTH_KEY=@
|
||||||
# use if you're configuring basic logging with logtail
|
|
||||||
LOGTAIL_KEY=
|
|
||||||
# set if you have a llamaparse key you'd like to use to parse pdfs
|
# set if you have a llamaparse key you'd like to use to parse pdfs
|
||||||
LLAMAPARSE_API_KEY=
|
LLAMAPARSE_API_KEY=
|
||||||
# set if you'd like to send slack server health status messages
|
# set if you'd like to send slack server health status messages
|
||||||
|
@ -54,9 +52,6 @@ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
|
||||||
STRIPE_PRICE_ID_GROWTH=
|
STRIPE_PRICE_ID_GROWTH=
|
||||||
STRIPE_PRICE_ID_GROWTH_YEARLY=
|
STRIPE_PRICE_ID_GROWTH_YEARLY=
|
||||||
|
|
||||||
HYPERDX_API_KEY=
|
|
||||||
HDX_NODE_BETA_MODE=1
|
|
||||||
|
|
||||||
# set if you'd like to use the fire engine closed beta
|
# set if you'd like to use the fire engine closed beta
|
||||||
FIRE_ENGINE_BETA_URL=
|
FIRE_ENGINE_BETA_URL=
|
||||||
|
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
global.fetch = require('jest-fetch-mock');
|
// global.fetch = require('jest-fetch-mock');
|
||||||
|
|
|
@ -32,9 +32,11 @@
|
||||||
"@tsconfig/recommended": "^1.0.3",
|
"@tsconfig/recommended": "^1.0.3",
|
||||||
"@types/body-parser": "^1.19.2",
|
"@types/body-parser": "^1.19.2",
|
||||||
"@types/cors": "^2.8.13",
|
"@types/cors": "^2.8.13",
|
||||||
|
"@types/escape-html": "^1.0.4",
|
||||||
"@types/express": "^4.17.17",
|
"@types/express": "^4.17.17",
|
||||||
"@types/jest": "^29.5.12",
|
"@types/jest": "^29.5.12",
|
||||||
"@types/node": "^20.14.1",
|
"@types/node": "^20.14.1",
|
||||||
|
"@types/pdf-parse": "^1.1.4",
|
||||||
"body-parser": "^1.20.1",
|
"body-parser": "^1.20.1",
|
||||||
"express": "^4.18.2",
|
"express": "^4.18.2",
|
||||||
"jest": "^29.6.3",
|
"jest": "^29.6.3",
|
||||||
|
@ -53,9 +55,7 @@
|
||||||
"@bull-board/api": "^5.20.5",
|
"@bull-board/api": "^5.20.5",
|
||||||
"@bull-board/express": "^5.20.5",
|
"@bull-board/express": "^5.20.5",
|
||||||
"@devil7softwares/pos": "^1.0.2",
|
"@devil7softwares/pos": "^1.0.2",
|
||||||
"@dqbd/tiktoken": "^1.0.13",
|
"@dqbd/tiktoken": "^1.0.16",
|
||||||
"@hyperdx/node-opentelemetry": "^0.8.1",
|
|
||||||
"@logtail/node": "^0.4.12",
|
|
||||||
"@nangohq/node": "^0.40.8",
|
"@nangohq/node": "^0.40.8",
|
||||||
"@sentry/cli": "^2.33.1",
|
"@sentry/cli": "^2.33.1",
|
||||||
"@sentry/node": "^8.26.0",
|
"@sentry/node": "^8.26.0",
|
||||||
|
@ -78,6 +78,7 @@
|
||||||
"date-fns": "^3.6.0",
|
"date-fns": "^3.6.0",
|
||||||
"dotenv": "^16.3.1",
|
"dotenv": "^16.3.1",
|
||||||
"dotenv-cli": "^7.4.2",
|
"dotenv-cli": "^7.4.2",
|
||||||
|
"escape-html": "^1.0.3",
|
||||||
"express-rate-limit": "^7.3.1",
|
"express-rate-limit": "^7.3.1",
|
||||||
"express-ws": "^5.0.2",
|
"express-ws": "^5.0.2",
|
||||||
"form-data": "^4.0.0",
|
"form-data": "^4.0.0",
|
||||||
|
@ -92,6 +93,7 @@
|
||||||
"languagedetect": "^2.0.0",
|
"languagedetect": "^2.0.0",
|
||||||
"logsnag": "^1.0.0",
|
"logsnag": "^1.0.0",
|
||||||
"luxon": "^3.4.3",
|
"luxon": "^3.4.3",
|
||||||
|
"marked": "^14.1.2",
|
||||||
"md5": "^2.3.0",
|
"md5": "^2.3.0",
|
||||||
"moment": "^2.29.4",
|
"moment": "^2.29.4",
|
||||||
"mongoose": "^8.4.4",
|
"mongoose": "^8.4.4",
|
||||||
|
@ -114,6 +116,8 @@
|
||||||
"typesense": "^1.5.4",
|
"typesense": "^1.5.4",
|
||||||
"unstructured-client": "^0.11.3",
|
"unstructured-client": "^0.11.3",
|
||||||
"uuid": "^10.0.0",
|
"uuid": "^10.0.0",
|
||||||
|
"winston": "^3.14.2",
|
||||||
|
"winston-transport": "^4.8.0",
|
||||||
"wordpos": "^2.1.0",
|
"wordpos": "^2.1.0",
|
||||||
"ws": "^8.18.0",
|
"ws": "^8.18.0",
|
||||||
"xml2js": "^0.6.2",
|
"xml2js": "^0.6.2",
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,15 +1,15 @@
|
||||||
### Crawl Website
|
### Crawl Website
|
||||||
POST http://localhost:3002/v0/scrape HTTP/1.1
|
POST http://localhost:3002/v0/scrape HTTP/1.1
|
||||||
Authorization: Bearer fc-
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
content-type: application/json
|
content-type: application/json
|
||||||
|
|
||||||
{
|
{
|
||||||
"url":"corterix.com"
|
"url":"firecrawl.dev"
|
||||||
}
|
}
|
||||||
|
|
||||||
### Check Job Status
|
### Check Job Status
|
||||||
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
|
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
|
||||||
Authorization: Bearer fc-
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
|
||||||
|
|
||||||
### Check Job Status
|
### Check Job Status
|
||||||
|
@ -18,7 +18,7 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
|
||||||
|
|
||||||
### Scrape Website
|
### Scrape Website
|
||||||
POST http://localhost:3002/v0/crawl HTTP/1.1
|
POST http://localhost:3002/v0/crawl HTTP/1.1
|
||||||
Authorization: Bearer fc-
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
content-type: application/json
|
content-type: application/json
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -45,7 +45,7 @@ content-type: application/json
|
||||||
|
|
||||||
### Scrape Website
|
### Scrape Website
|
||||||
POST http://localhost:3002/v0/scrape HTTP/1.1
|
POST http://localhost:3002/v0/scrape HTTP/1.1
|
||||||
Authorization: Bearer
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
content-type: application/json
|
content-type: application/json
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -56,12 +56,12 @@ content-type: application/json
|
||||||
|
|
||||||
### Check Job Status
|
### Check Job Status
|
||||||
GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
|
GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
|
||||||
Authorization: Bearer
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
|
||||||
### Get Job Result
|
### Get Job Result
|
||||||
|
|
||||||
POST https://api.firecrawl.dev/v0/crawl HTTP/1.1
|
POST https://api.firecrawl.dev/v0/crawl HTTP/1.1
|
||||||
Authorization: Bearer
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
content-type: application/json
|
content-type: application/json
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -70,7 +70,7 @@ content-type: application/json
|
||||||
|
|
||||||
### Check Job Status
|
### Check Job Status
|
||||||
GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
|
GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
|
||||||
Authorization: Bearer
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
|
||||||
### Get Active Jobs Count
|
### Get Active Jobs Count
|
||||||
GET http://localhost:3002/serverHealthCheck
|
GET http://localhost:3002/serverHealthCheck
|
||||||
|
|
2
apps/api/sharedLibs/go-html-to-md/.gitignore
vendored
Normal file
2
apps/api/sharedLibs/go-html-to-md/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
html-to-markdown.so
|
||||||
|
html-to-markdown.h
|
|
@ -844,7 +844,7 @@ describe("E2E Tests for API Routes", () => {
|
||||||
expect(crawlInitResponse.statusCode).toBe(200);
|
expect(crawlInitResponse.statusCode).toBe(200);
|
||||||
expect(crawlInitResponse.body).toHaveProperty("jobId");
|
expect(crawlInitResponse.body).toHaveProperty("jobId");
|
||||||
|
|
||||||
let crawlStatus: string;
|
let crawlStatus: string = "scraping";
|
||||||
let crawlData = [];
|
let crawlData = [];
|
||||||
while (crawlStatus !== "completed") {
|
while (crawlStatus !== "completed") {
|
||||||
const statusResponse = await request(TEST_URL)
|
const statusResponse = await request(TEST_URL)
|
||||||
|
|
|
@ -20,7 +20,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
||||||
process.env.SCRAPING_BEE_API_KEY = "";
|
process.env.SCRAPING_BEE_API_KEY = "";
|
||||||
process.env.OPENAI_API_KEY = "";
|
process.env.OPENAI_API_KEY = "";
|
||||||
process.env.BULL_AUTH_KEY = "";
|
process.env.BULL_AUTH_KEY = "";
|
||||||
process.env.LOGTAIL_KEY = "";
|
|
||||||
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
|
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
|
||||||
process.env.LLAMAPARSE_API_KEY = "";
|
process.env.LLAMAPARSE_API_KEY = "";
|
||||||
process.env.TEST_API_KEY = "";
|
process.env.TEST_API_KEY = "";
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import request from "supertest";
|
import request from "supertest";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import {
|
import {
|
||||||
ScrapeRequest,
|
ScrapeRequestInput,
|
||||||
ScrapeResponseRequestTest,
|
ScrapeResponseRequestTest,
|
||||||
} from "../../controllers/v1/types";
|
} from "../../controllers/v1/types";
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
});
|
});
|
||||||
|
|
||||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://facebook.com/fake-test",
|
url: "https://facebook.com/fake-test",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -73,7 +73,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key",
|
"should return a successful response with a valid API key",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -125,7 +125,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key",
|
"should return a successful response with a valid API key",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://arxiv.org/abs/2410.04840",
|
url: "https://arxiv.org/abs/2410.04840",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -167,7 +167,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key and includeHtml set to true",
|
"should return a successful response with a valid API key and includeHtml set to true",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
formats: ["markdown", "html"],
|
formats: ["markdown", "html"],
|
||||||
};
|
};
|
||||||
|
@ -194,7 +194,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
30000
|
30000
|
||||||
);
|
);
|
||||||
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
||||||
// formats: ["markdown", "html"],
|
// formats: ["markdown", "html"],
|
||||||
};
|
};
|
||||||
|
@ -217,7 +217,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
}, 60000);
|
}, 60000);
|
||||||
|
|
||||||
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://arxiv.org/pdf/astro-ph/9301001"
|
url: "https://arxiv.org/pdf/astro-ph/9301001"
|
||||||
};
|
};
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
|
@ -240,7 +240,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
}, 60000);
|
}, 60000);
|
||||||
|
|
||||||
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://www.scrapethissite.com/",
|
url: "https://www.scrapethissite.com/",
|
||||||
onlyMainContent: false // default is true
|
onlyMainContent: false // default is true
|
||||||
};
|
};
|
||||||
|
@ -261,7 +261,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
|
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
|
||||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
|
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
|
||||||
|
|
||||||
const scrapeRequestWithRemoveTags: ScrapeRequest = {
|
const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
|
||||||
url: "https://www.scrapethissite.com/",
|
url: "https://www.scrapethissite.com/",
|
||||||
excludeTags: ['.nav', '#footer', 'strong'],
|
excludeTags: ['.nav', '#footer', 'strong'],
|
||||||
onlyMainContent: false // default is true
|
onlyMainContent: false // default is true
|
||||||
|
@ -407,7 +407,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key and includeHtml set to true",
|
"should return a successful response with a valid API key and includeHtml set to true",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
formats: ["html","rawHtml"],
|
formats: ["html","rawHtml"],
|
||||||
};
|
};
|
||||||
|
@ -438,7 +438,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with waitFor",
|
"should return a successful response with waitFor",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://ycombinator.com/companies",
|
url: "https://ycombinator.com/companies",
|
||||||
formats: ["markdown"],
|
formats: ["markdown"],
|
||||||
waitFor: 8000
|
waitFor: 8000
|
||||||
|
@ -471,7 +471,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid links on page",
|
"should return a successful response with a valid links on page",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
formats: ["links"],
|
formats: ["links"],
|
||||||
};
|
};
|
||||||
|
@ -672,7 +672,7 @@ describe("POST /v1/crawl", () => {
|
||||||
});
|
});
|
||||||
|
|
||||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||||
const scrapeRequest: ScrapeRequest = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://facebook.com/fake-test",
|
url: "https://facebook.com/fake-test",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
603
apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts
Normal file
603
apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts
Normal file
|
@ -0,0 +1,603 @@
|
||||||
|
import request from "supertest";
|
||||||
|
import { configDotenv } from "dotenv";
|
||||||
|
import {
|
||||||
|
ScrapeRequest,
|
||||||
|
ScrapeResponseRequestTest,
|
||||||
|
} from "../../controllers/v1/types";
|
||||||
|
|
||||||
|
configDotenv();
|
||||||
|
const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
|
||||||
|
const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test
|
||||||
|
|
||||||
|
describe("E2E Tests for v1 API Routes", () => {
|
||||||
|
|
||||||
|
it.concurrent('should return a successful response for a scrape with 403 page', async () => {
|
||||||
|
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
||||||
|
.post('/v1/scrape')
|
||||||
|
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set('Content-Type', 'application/json')
|
||||||
|
.send({ url: 'https://httpstat.us/403' });
|
||||||
|
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty('data');
|
||||||
|
if (!("data" in response.body)) {
|
||||||
|
throw new Error("Expected response body to have 'data' property");
|
||||||
|
}
|
||||||
|
expect(response.body.data).toHaveProperty('markdown');
|
||||||
|
expect(response.body.data).toHaveProperty('metadata');
|
||||||
|
expect(response.body.data.metadata.statusCode).toBe(403);
|
||||||
|
}, 30000);
|
||||||
|
|
||||||
|
it.concurrent("should handle 'formats:markdown (default)' parameter correctly",
|
||||||
|
async () => {
|
||||||
|
const scrapeRequest = {
|
||||||
|
url: E2E_TEST_SERVER_URL
|
||||||
|
} as ScrapeRequest;
|
||||||
|
|
||||||
|
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
||||||
|
.post("/v1/scrape")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send(scrapeRequest);
|
||||||
|
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty("data");
|
||||||
|
if (!("data" in response.body)) {
|
||||||
|
throw new Error("Expected response body to have 'data' property");
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
|
|
||||||
|
expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
|
||||||
|
expect(response.body.data.markdown).toContain("Content with id #content-1");
|
||||||
|
// expect(response.body.data.markdown).toContain("Loading...");
|
||||||
|
expect(response.body.data.markdown).toContain("Click me!");
|
||||||
|
expect(response.body.data.markdown).toContain("Power your AI apps with clean data crawled from any website. It's also open-source."); // firecrawl.dev inside an iframe
|
||||||
|
expect(response.body.data.markdown).toContain("This content loads only when you see it. Don't blink! 👼"); // the browser always scroll to the bottom
|
||||||
|
expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
|
||||||
|
expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
|
||||||
|
expect(response.body.data.markdown).not.toContain("This content is only visible on mobile");
|
||||||
|
},
|
||||||
|
30000);
|
||||||
|
|
||||||
|
it.concurrent("should handle 'formats:html' parameter correctly",
|
||||||
|
async () => {
|
||||||
|
const scrapeRequest = {
|
||||||
|
url: E2E_TEST_SERVER_URL,
|
||||||
|
formats: ["html"]
|
||||||
|
} as ScrapeRequest;
|
||||||
|
|
||||||
|
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
||||||
|
.post("/v1/scrape")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send(scrapeRequest);
|
||||||
|
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty("data");
|
||||||
|
if (!("data" in response.body)) {
|
||||||
|
throw new Error("Expected response body to have 'data' property");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
expect(response.body.data).not.toHaveProperty("markdown");
|
||||||
|
expect(response.body.data).toHaveProperty("html");
|
||||||
|
|
||||||
|
expect(response.body.data.html).not.toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
|
||||||
|
expect(response.body.data.html).toContain("<p style=\"\">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
|
||||||
|
},
|
||||||
|
30000);
|
||||||
|
|
||||||
|
// Requests only the "rawHtml" format and checks that the unfiltered page
// (including the header element) comes back and markdown does not.
it.concurrent(
  "should handle 'rawHtml' in 'formats' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, formats: ["rawHtml"] } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    expect(body).toHaveProperty("data");
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { data } = body;

    expect(data).not.toHaveProperty("markdown");
    expect(data).toHaveProperty("rawHtml");

    expect(data.rawHtml).toContain(">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
    expect(data.rawHtml).toContain(">Header</header>");
  },
  30000
);
|
||||||
|
|
||||||
|
// - TODO: tests for links
|
||||||
|
// - TODO: tests for screenshot
|
||||||
|
// - TODO: tests for screenshot@fullPage
|
||||||
|
|
||||||
|
// Sends a custom request header and expects the scraped markdown to contain
// that header name and value.
it.concurrent(
  "should handle 'headers' parameter correctly",
  async () => {
    // @ts-ignore
    const payload = {
      url: E2E_TEST_SERVER_URL,
      headers: { "e2e-header-test": "firecrawl" }
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    expect(body).toHaveProperty("data");
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    expect(body.data.markdown).toContain("e2e-header-test: firecrawl");
  },
  30000
);
|
||||||
|
|
||||||
|
// Restricts the scrape to the '#content-1' selector and checks that only
// that element's text shows up in the markdown.
it.concurrent(
  "should handle 'includeTags' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, includeTags: ['#content-1'] } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    expect(body).toHaveProperty("data");
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { markdown } = body.data;

    expect(markdown).not.toContain("<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
    expect(markdown).toContain("Content with id #content-1");
  },
  30000
);
|
||||||
|
|
||||||
|
// Excludes the '#content-1' selector and checks that the rest of the page
// is still present while that element's text is gone.
it.concurrent(
  "should handle 'excludeTags' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, excludeTags: ['#content-1'] } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    expect(body).toHaveProperty("data");
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { markdown } = body.data;

    expect(markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
    expect(markdown).not.toContain("Content with id #content-1");
  },
  30000
);
|
||||||
|
|
||||||
|
// With onlyMainContent disabled, the page <header> element must be kept in
// the returned html alongside the main content in the markdown.
it.concurrent(
  "should handle 'onlyMainContent' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["html", "markdown"],
      onlyMainContent: false
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    expect(body).toHaveProperty("data");
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { data } = body;

    expect(data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
    expect(data.html).toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
  },
  30000
);
|
||||||
|
|
||||||
|
// A 500 ms scrape timeout is expected to produce a 408 response carrying
// the "Request timed out" error.
it.concurrent(
  "should handle 'timeout' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, timeout: 500 } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(408);

    const body = res.body;
    if (!("error" in body)) {
      // Also narrows the response union to its error variant.
      throw new Error("Expected response body to have 'error' property");
    }
    expect(body.error).toBe("Request timed out");
    expect(body.success).toBe(false);
  },
  30000
);
|
||||||
|
|
||||||
|
|
||||||
|
// With mobile enabled, the markdown must include the mobile-only text of
// the test page.
it.concurrent(
  "should handle 'mobile' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, mobile: true } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(body.data.markdown).toContain("This content is only visible on mobile");
  },
  30000
);
|
||||||
|
|
||||||
|
// Scrapes the same PDF twice: once with default options (the markdown must
// contain the parsed text and not the raw PDF bytes) and once with
// parsePDF: false (the markdown must contain the raw PDF bytes).
//
// The original slept 6 s after each request. Both requests are awaited and
// the assertions read the awaited response directly, so the sleeps only
// consumed 12 s of the 30 s Jest budget; they have been removed.
it.concurrent(
  "should handle 'parsePDF' parameter correctly",
  async () => {
    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf'});

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty('data');
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    // Parsed text is present; the marker from the raw PDF stream is not.
    expect(response.body.data.markdown).toContain('arXiv:astro-ph/9301001v1 7 Jan 1993');
    expect(response.body.data.markdown).not.toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');

    const responseNoParsePDF: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', parsePDF: false });

    expect(responseNoParsePDF.statusCode).toBe(200);
    expect(responseNoParsePDF.body).toHaveProperty('data');
    if (!("data" in responseNoParsePDF.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    // With parsing disabled the raw PDF content is passed through.
    expect(responseNoParsePDF.body.data.markdown).toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');
  },
  30000
);
|
||||||
|
|
||||||
|
// it.concurrent("should handle 'location' parameter correctly",
|
||||||
|
// async () => {
|
||||||
|
// const scrapeRequest: ScrapeRequest = {
|
||||||
|
// url: "https://roastmywebsite.ai",
|
||||||
|
// location: {
|
||||||
|
// country: "US",
|
||||||
|
// languages: ["en"]
|
||||||
|
// }
|
||||||
|
// };
|
||||||
|
|
||||||
|
// const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
||||||
|
// .post("/v1/scrape")
|
||||||
|
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
// .set("Content-Type", "application/json")
|
||||||
|
// .send(scrapeRequest);
|
||||||
|
|
||||||
|
// expect(response.statusCode).toBe(200);
|
||||||
|
// // Add assertions to verify location is handled correctly
|
||||||
|
// },
|
||||||
|
// 30000);
|
||||||
|
|
||||||
|
// Scrapes a host with an expired TLS certificate twice: without
// skipTlsVerification the scrape succeeds at the API level but reports a
// page status of 500; with skipTlsVerification: true the page content is
// returned.
//
// Leftover debug output (console.log("Error1a") / "Error?" / "Error1b") has
// been removed; it put misleading "Error" lines in the test logs.
//
// NOTE(review): each request carries timeout: 120000 but the Jest timeout
// for the whole test is 60000. Confirm which of the two is intended.
it.concurrent(
  "should handle 'skipTlsVerification' parameter correctly",
  async () => {
    const scrapeRequest = {
      url: "https://expired.badssl.com/",
      timeout: 120000
    } as ScrapeRequest;

    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data.metadata.pageStatusCode).toBe(500);

    const scrapeRequestWithSkipTlsVerification = {
      url: "https://expired.badssl.com/",
      skipTlsVerification: true,
      timeout: 120000
    } as ScrapeRequest;

    const responseWithSkipTlsVerification: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequestWithSkipTlsVerification);

    expect(responseWithSkipTlsVerification.statusCode).toBe(200);
    if (!("data" in responseWithSkipTlsVerification.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(responseWithSkipTlsVerification.body.data.markdown).toContain("badssl.com");
  },
  60000
);
|
||||||
|
|
||||||
|
// Currently only checks that a scrape with removeBase64Images enabled
// succeeds and returns data; the content assertion is still disabled.
it.concurrent(
  "should handle 'removeBase64Images' parameter correctly",
  async () => {
    const payload = { url: E2E_TEST_SERVER_URL, removeBase64Images: true } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    if (!("data" in res.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    // - TODO: not working for every image
    // expect(res.body.data.markdown).toContain("Image-Removed");
  },
  30000
);
|
||||||
|
|
||||||
|
// Runs a 10 s "wait" action and expects the delayed content of the test
// page to have replaced its loading placeholder.
it.concurrent(
  "should handle 'action wait' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [{ type: "wait", milliseconds: 10000 }]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { markdown } = body.data;

    expect(markdown).not.toContain("Loading...");
    expect(markdown).toContain("Content loaded after 5 seconds!");
  },
  30000
);
|
||||||
|
|
||||||
|
// screenshot
|
||||||
|
// Runs a "screenshot" action and checks that the first returned screenshot
// is a non-empty URL pointing at the expected storage location.
it.concurrent(
  "should handle 'action screenshot' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [{ type: "screenshot" }]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const screenshots = body.data.actions?.screenshots;
    if (!screenshots) {
      throw new Error("Expected response body to have screenshots array");
    }
    expect(screenshots[0].length).toBeGreaterThan(0);
    expect(screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");

    // TODO compare screenshot with expected screenshot
  },
  30000
);
|
||||||
|
|
||||||
|
// Runs a full-page "screenshot" action followed by a "scrape" action and
// checks both the screenshot URL and the captured scrape result.
it.concurrent(
  "should handle 'action screenshot@fullPage' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [
        { type: "screenshot", fullPage: true },
        { type: "scrape" }
      ]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const screenshots = body.data.actions?.screenshots;
    if (!screenshots) {
      throw new Error("Expected response body to have screenshots array");
    }
    expect(screenshots[0].length).toBeGreaterThan(0);
    expect(screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");

    const scrapes = body.data.actions?.scrapes;
    if (!scrapes) {
      throw new Error("Expected response body to have scrapes array");
    }
    expect(scrapes[0].url).toBe("https://firecrawl-e2e-test.vercel.app/");
    expect(scrapes[0].html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.</p>");

    // TODO compare screenshot with expected full page screenshot
  },
  30000
);
|
||||||
|
|
||||||
|
// Clicks the '#click-me' element and expects the post-click text to have
// replaced the original label in the markdown.
it.concurrent(
  "should handle 'action click' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [{ type: "click", selector: "#click-me" }]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const body = res.body;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { markdown } = body.data;

    expect(markdown).not.toContain("Click me!");
    expect(markdown).toContain("Text changed after click!");
  },
  30000
);
|
||||||
|
|
||||||
|
// Clicks '#input-1' and writes text into it. Only the success of the
// request is asserted for now; the html assertion is still disabled.
it.concurrent(
  "should handle 'action write' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["html"],
      actions: [
        { type: "click", selector: "#input-1" },
        { type: "write", text: "Hello, world!" }
      ]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);
    if (!("data" in res.body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    // TODO: fix this test (need to fix fire-engine first)
    // uncomment the following line:
    // expect(res.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
  },
  30000
);
|
||||||
|
|
||||||
|
// TODO: fix this test (need to fix fire-engine first)
|
||||||
|
// Sends a "press" action for the ArrowDown key. The request is issued but
// nothing is asserted yet; the checks below stay disabled.
it.concurrent(
  "should handle 'action pressKey' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["markdown"],
      actions: [{ type: "press", key: "ArrowDown" }]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    // // TODO: fix this test (need to fix fire-engine first)
    // // right now res.body is: { success: false, error: '(Internal server error) - null' }
    // expect(res.statusCode).toBe(200);
    // if (!("data" in res.body)) {
    //   throw new Error("Expected response body to have 'data' property");
    // }
    // expect(res.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
  },
  30000
);
|
||||||
|
|
||||||
|
// TODO: fix this test (need to fix fire-engine first)
|
||||||
|
// Clicks '#scroll-bottom-loader' and then scrolls down by 2000. The request
// is issued but nothing is asserted yet; the checks below stay disabled.
it.concurrent(
  "should handle 'action scroll' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["markdown"],
      actions: [
        { type: "click", selector: "#scroll-bottom-loader" },
        { type: "scroll", direction: "down", amount: 2000 }
      ]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    // TODO: uncomment these assertions
    // expect(res.statusCode).toBe(200);
    // if (!("data" in res.body)) {
    //   throw new Error("Expected response body to have 'data' property");
    // }
    //
    // expect(res.body.data.markdown).toContain("You have reached the bottom!")
  },
  30000
);
|
||||||
|
|
||||||
|
// TODO: test scrape action
|
||||||
|
|
||||||
|
});
|
|
@ -776,7 +776,8 @@ describe("E2E Tests for v0 API Routes", () => {
|
||||||
await new Promise((r) => setTimeout(r, 10000));
|
await new Promise((r) => setTimeout(r, 10000));
|
||||||
const completedResponse = await request(TEST_URL)
|
const completedResponse = await request(TEST_URL)
|
||||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.maxResponseSize(4000000000);
|
||||||
|
|
||||||
expect(completedResponse.statusCode).toBe(200);
|
expect(completedResponse.statusCode).toBe(200);
|
||||||
expect(completedResponse.body).toHaveProperty("status");
|
expect(completedResponse.body).toHaveProperty("status");
|
||||||
|
|
|
@ -9,9 +9,8 @@ import {
|
||||||
import { supabase_service } from "../services/supabase";
|
import { supabase_service } from "../services/supabase";
|
||||||
import { withAuth } from "../lib/withAuth";
|
import { withAuth } from "../lib/withAuth";
|
||||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||||
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
|
|
||||||
import { sendNotification } from "../services/notification/email_notification";
|
import { sendNotification } from "../services/notification/email_notification";
|
||||||
import { Logger } from "../lib/logger";
|
import { logger } from "../lib/logger";
|
||||||
import { redlock } from "../services/redlock";
|
import { redlock } from "../services/redlock";
|
||||||
import { deleteKey, getValue } from "../services/redis";
|
import { deleteKey, getValue } from "../services/redis";
|
||||||
import { setValue } from "../services/redis";
|
import { setValue } from "../services/redis";
|
||||||
|
@ -40,8 +39,8 @@ function normalizedApiIsUuid(potentialUuid: string): boolean {
|
||||||
export async function setCachedACUC(
|
export async function setCachedACUC(
|
||||||
api_key: string,
|
api_key: string,
|
||||||
acuc:
|
acuc:
|
||||||
| AuthCreditUsageChunk
|
| AuthCreditUsageChunk | null
|
||||||
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)
|
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
|
||||||
) {
|
) {
|
||||||
const cacheKeyACUC = `acuc_${api_key}`;
|
const cacheKeyACUC = `acuc_${api_key}`;
|
||||||
const redLockKey = `lock_${cacheKeyACUC}`;
|
const redLockKey = `lock_${cacheKeyACUC}`;
|
||||||
|
@ -49,7 +48,7 @@ export async function setCachedACUC(
|
||||||
try {
|
try {
|
||||||
await redlock.using([redLockKey], 10000, {}, async (signal) => {
|
await redlock.using([redLockKey], 10000, {}, async (signal) => {
|
||||||
if (typeof acuc === "function") {
|
if (typeof acuc === "function") {
|
||||||
acuc = acuc(JSON.parse(await getValue(cacheKeyACUC)));
|
acuc = acuc(JSON.parse(await getValue(cacheKeyACUC) ?? "null"));
|
||||||
|
|
||||||
if (acuc === null) {
|
if (acuc === null) {
|
||||||
if (signal.aborted) {
|
if (signal.aborted) {
|
||||||
|
@ -69,7 +68,7 @@ export async function setCachedACUC(
|
||||||
await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
|
await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
|
logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -103,7 +102,7 @@ export async function getACUC(
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
Logger.warn(
|
logger.warn(
|
||||||
`Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
|
`Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
|
||||||
);
|
);
|
||||||
retries++;
|
retries++;
|
||||||
|
@ -146,33 +145,14 @@ export async function authenticateUser(
|
||||||
res,
|
res,
|
||||||
mode?: RateLimiterMode
|
mode?: RateLimiterMode
|
||||||
): Promise<AuthResponse> {
|
): Promise<AuthResponse> {
|
||||||
return withAuth(supaAuthenticateUser)(req, res, mode);
|
return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode);
|
||||||
}
|
|
||||||
|
|
||||||
function setTrace(team_id: string, api_key: string) {
|
|
||||||
try {
|
|
||||||
setTraceAttributes({
|
|
||||||
team_id,
|
|
||||||
api_key,
|
|
||||||
});
|
|
||||||
} catch (error) {
|
|
||||||
Sentry.captureException(error);
|
|
||||||
Logger.error(`Error setting trace attributes: ${error.message}`);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function supaAuthenticateUser(
|
export async function supaAuthenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
mode?: RateLimiterMode
|
mode?: RateLimiterMode
|
||||||
): Promise<{
|
): Promise<AuthResponse> {
|
||||||
success: boolean;
|
|
||||||
team_id?: string;
|
|
||||||
error?: string;
|
|
||||||
status?: number;
|
|
||||||
plan?: PlanType;
|
|
||||||
chunk?: AuthCreditUsageChunk;
|
|
||||||
}> {
|
|
||||||
const authHeader =
|
const authHeader =
|
||||||
req.headers.authorization ??
|
req.headers.authorization ??
|
||||||
(req.headers["sec-websocket-protocol"]
|
(req.headers["sec-websocket-protocol"]
|
||||||
|
@ -200,7 +180,7 @@ export async function supaAuthenticateUser(
|
||||||
|
|
||||||
let teamId: string | null = null;
|
let teamId: string | null = null;
|
||||||
let priceId: string | null = null;
|
let priceId: string | null = null;
|
||||||
let chunk: AuthCreditUsageChunk;
|
let chunk: AuthCreditUsageChunk | null = null;
|
||||||
|
|
||||||
if (token == "this_is_just_a_preview_token") {
|
if (token == "this_is_just_a_preview_token") {
|
||||||
if (mode == RateLimiterMode.CrawlStatus) {
|
if (mode == RateLimiterMode.CrawlStatus) {
|
||||||
|
@ -233,8 +213,6 @@ export async function supaAuthenticateUser(
|
||||||
priceId = chunk.price_id;
|
priceId = chunk.price_id;
|
||||||
|
|
||||||
const plan = getPlanByPriceId(priceId);
|
const plan = getPlanByPriceId(priceId);
|
||||||
// HyperDX Logging
|
|
||||||
setTrace(teamId, normalizedApi);
|
|
||||||
subscriptionData = {
|
subscriptionData = {
|
||||||
team_id: teamId,
|
team_id: teamId,
|
||||||
plan,
|
plan,
|
||||||
|
@ -291,7 +269,7 @@ export async function supaAuthenticateUser(
|
||||||
try {
|
try {
|
||||||
await rateLimiter.consume(team_endpoint_token);
|
await rateLimiter.consume(team_endpoint_token);
|
||||||
} catch (rateLimiterRes) {
|
} catch (rateLimiterRes) {
|
||||||
Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
|
logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
|
||||||
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
||||||
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
||||||
|
|
||||||
|
@ -318,7 +296,7 @@ export async function supaAuthenticateUser(
|
||||||
mode === RateLimiterMode.CrawlStatus ||
|
mode === RateLimiterMode.CrawlStatus ||
|
||||||
mode === RateLimiterMode.Search)
|
mode === RateLimiterMode.Search)
|
||||||
) {
|
) {
|
||||||
return { success: true, team_id: "preview" };
|
return { success: true, team_id: "preview", chunk: null };
|
||||||
// check the origin of the request and make sure its from firecrawl.dev
|
// check the origin of the request and make sure its from firecrawl.dev
|
||||||
// const origin = req.headers.origin;
|
// const origin = req.headers.origin;
|
||||||
// if (origin && origin.includes("firecrawl.dev")){
|
// if (origin && origin.includes("firecrawl.dev")){
|
||||||
|
@ -333,12 +311,12 @@ export async function supaAuthenticateUser(
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
team_id: subscriptionData.team_id,
|
team_id: teamId ?? undefined,
|
||||||
plan: (subscriptionData.plan ?? "") as PlanType,
|
plan: (subscriptionData?.plan ?? "") as PlanType,
|
||||||
chunk,
|
chunk,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
function getPlanByPriceId(price_id: string): PlanType {
|
function getPlanByPriceId(price_id: string | null): PlanType {
|
||||||
switch (price_id) {
|
switch (price_id) {
|
||||||
case process.env.STRIPE_PRICE_ID_STARTER:
|
case process.env.STRIPE_PRICE_ID_STARTER:
|
||||||
return "starter";
|
return "starter";
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { supabase_service } from "../../../services/supabase";
|
import { supabase_service } from "../../../services/supabase";
|
||||||
import { clearACUC } from "../../auth";
|
import { clearACUC } from "../../auth";
|
||||||
import { Logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
|
|
||||||
export async function acucCacheClearController(req: Request, res: Response) {
|
export async function acucCacheClearController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
|
@ -12,11 +12,11 @@ export async function acucCacheClearController(req: Request, res: Response) {
|
||||||
.select("*")
|
.select("*")
|
||||||
.eq("team_id", team_id);
|
.eq("team_id", team_id);
|
||||||
|
|
||||||
await Promise.all(keys.data.map((x) => clearACUC(x.key)));
|
await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key)));
|
||||||
|
|
||||||
res.json({ ok: true });
|
res.json({ ok: true });
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error clearing ACUC cache via API route: ${error}`);
|
logger.error(`Error clearing ACUC cache via API route: ${error}`);
|
||||||
res.status(500).json({ error: "Internal server error" });
|
res.status(500).json({ error: "Internal server error" });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
|
|
||||||
import { Job } from "bullmq";
|
import { Job } from "bullmq";
|
||||||
import { Logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
import { getScrapeQueue } from "../../../services/queue-service";
|
import { getScrapeQueue } from "../../../services/queue-service";
|
||||||
import { checkAlerts } from "../../../services/alerts";
|
import { checkAlerts } from "../../../services/alerts";
|
||||||
import { sendSlackWebhook } from "../../../services/alerts/slack";
|
import { sendSlackWebhook } from "../../../services/alerts/slack";
|
||||||
|
@ -10,7 +10,7 @@ export async function cleanBefore24hCompleteJobsController(
|
||||||
req: Request,
|
req: Request,
|
||||||
res: Response
|
res: Response
|
||||||
) {
|
) {
|
||||||
Logger.info("🐂 Cleaning jobs older than 24h");
|
logger.info("🐂 Cleaning jobs older than 24h");
|
||||||
try {
|
try {
|
||||||
const scrapeQueue = getScrapeQueue();
|
const scrapeQueue = getScrapeQueue();
|
||||||
const batchSize = 10;
|
const batchSize = 10;
|
||||||
|
@ -31,7 +31,7 @@ export async function cleanBefore24hCompleteJobsController(
|
||||||
).flat();
|
).flat();
|
||||||
const before24hJobs =
|
const before24hJobs =
|
||||||
completedJobs.filter(
|
completedJobs.filter(
|
||||||
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
(job) => job.finishedOn !== undefined && job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||||
) || [];
|
) || [];
|
||||||
|
|
||||||
let count = 0;
|
let count = 0;
|
||||||
|
@ -45,12 +45,12 @@ export async function cleanBefore24hCompleteJobsController(
|
||||||
await job.remove();
|
await job.remove();
|
||||||
count++;
|
count++;
|
||||||
} catch (jobError) {
|
} catch (jobError) {
|
||||||
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
|
logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return res.status(200).send(`Removed ${count} completed jobs.`);
|
return res.status(200).send(`Removed ${count} completed jobs.`);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
|
logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
|
||||||
return res.status(500).send("Failed to clean jobs");
|
return res.status(500).send("Failed to clean jobs");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -60,7 +60,7 @@ export async function checkQueuesController(req: Request, res: Response) {
|
||||||
await checkAlerts();
|
await checkAlerts();
|
||||||
return res.status(200).send("Alerts initialized");
|
return res.status(200).send("Alerts initialized");
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.debug(`Failed to initialize alerts: ${error}`);
|
logger.debug(`Failed to initialize alerts: ${error}`);
|
||||||
return res.status(500).send("Failed to initialize alerts");
|
return res.status(500).send("Failed to initialize alerts");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -81,7 +81,7 @@ export async function queuesController(req: Request, res: Response) {
|
||||||
noActiveJobs,
|
noActiveJobs,
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -165,7 +165,7 @@ export async function autoscalerController(req: Request, res: Response) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (targetMachineCount !== activeMachines) {
|
if (targetMachineCount !== activeMachines) {
|
||||||
Logger.info(
|
logger.info(
|
||||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
|
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -193,7 +193,7 @@ export async function autoscalerController(req: Request, res: Response) {
|
||||||
count: activeMachines,
|
count: activeMachines,
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).send("Failed to initialize autoscaler");
|
return res.status(500).send("Failed to initialize autoscaler");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import Redis from "ioredis";
|
import Redis from "ioredis";
|
||||||
import { Logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
import { redisRateLimitClient } from "../../../services/rate-limiter";
|
import { redisRateLimitClient } from "../../../services/rate-limiter";
|
||||||
|
|
||||||
export async function redisHealthController(req: Request, res: Response) {
|
export async function redisHealthController(req: Request, res: Response) {
|
||||||
|
@ -10,14 +10,14 @@ export async function redisHealthController(req: Request, res: Response) {
|
||||||
return await operation();
|
return await operation();
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (attempt === retries) throw error;
|
if (attempt === retries) throw error;
|
||||||
Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
|
logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
|
||||||
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
|
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const queueRedis = new Redis(process.env.REDIS_URL);
|
const queueRedis = new Redis(process.env.REDIS_URL!);
|
||||||
|
|
||||||
const testKey = "test";
|
const testKey = "test";
|
||||||
const testValue = "test";
|
const testValue = "test";
|
||||||
|
@ -29,7 +29,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
||||||
queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
|
queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
|
||||||
await retryOperation(() => queueRedis.del(testKey));
|
await retryOperation(() => queueRedis.del(testKey));
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`queueRedis health check failed: ${error}`);
|
logger.error(`queueRedis health check failed: ${error}`);
|
||||||
queueRedisHealth = null;
|
queueRedisHealth = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,7 +42,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
||||||
);
|
);
|
||||||
await retryOperation(() => redisRateLimitClient.del(testKey));
|
await retryOperation(() => redisRateLimitClient.del(testKey));
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`redisRateLimitClient health check failed: ${error}`);
|
logger.error(`redisRateLimitClient health check failed: ${error}`);
|
||||||
redisRateLimitHealth = null;
|
redisRateLimitHealth = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -56,10 +56,10 @@ export async function redisHealthController(req: Request, res: Response) {
|
||||||
healthStatus.queueRedis === "healthy" &&
|
healthStatus.queueRedis === "healthy" &&
|
||||||
healthStatus.redisRateLimitClient === "healthy"
|
healthStatus.redisRateLimitClient === "healthy"
|
||||||
) {
|
) {
|
||||||
Logger.info("Both Redis instances are healthy");
|
logger.info("Both Redis instances are healthy");
|
||||||
return res.status(200).json({ status: "healthy", details: healthStatus });
|
return res.status(200).json({ status: "healthy", details: healthStatus });
|
||||||
} else {
|
} else {
|
||||||
Logger.info(
|
logger.info(
|
||||||
`Redis instances health check: ${JSON.stringify(healthStatus)}`
|
`Redis instances health check: ${JSON.stringify(healthStatus)}`
|
||||||
);
|
);
|
||||||
// await sendSlackWebhook(
|
// await sendSlackWebhook(
|
||||||
|
@ -73,7 +73,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
||||||
.json({ status: "unhealthy", details: healthStatus });
|
.json({ status: "unhealthy", details: healthStatus });
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Redis health check failed: ${error}`);
|
logger.error(`Redis health check failed: ${error}`);
|
||||||
// await sendSlackWebhook(
|
// await sendSlackWebhook(
|
||||||
// `[REDIS DOWN] Redis instances health check: ${error.message}`,
|
// `[REDIS DOWN] Redis instances health check: ${error.message}`,
|
||||||
// true
|
// true
|
||||||
|
|
|
@ -2,7 +2,7 @@ import { Request, Response } from "express";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { RateLimiterMode } from "../../../src/types";
|
import { RateLimiterMode } from "../../../src/types";
|
||||||
import { supabase_service } from "../../../src/services/supabase";
|
import { supabase_service } from "../../../src/services/supabase";
|
||||||
import { Logger } from "../../../src/lib/logger";
|
import { logger } from "../../../src/lib/logger";
|
||||||
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
|
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
|
@ -12,15 +12,17 @@ export async function crawlCancelController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||||
|
|
||||||
const { success, team_id, error, status } = await authenticateUser(
|
const auth = await authenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
RateLimiterMode.CrawlStatus
|
RateLimiterMode.CrawlStatus
|
||||||
);
|
);
|
||||||
if (!success) {
|
if (!auth.success) {
|
||||||
return res.status(status).json({ error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const { team_id } = auth;
|
||||||
|
|
||||||
const sc = await getCrawl(req.params.jobId);
|
const sc = await getCrawl(req.params.jobId);
|
||||||
if (!sc) {
|
if (!sc) {
|
||||||
return res.status(404).json({ error: "Job not found" });
|
return res.status(404).json({ error: "Job not found" });
|
||||||
|
@ -46,7 +48,7 @@ export async function crawlCancelController(req: Request, res: Response) {
|
||||||
sc.cancelled = true;
|
sc.cancelled = true;
|
||||||
await saveCrawl(req.params.jobId, sc);
|
await saveCrawl(req.params.jobId, sc);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
|
@ -54,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) {
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,15 +2,17 @@ import { Request, Response } from "express";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { RateLimiterMode } from "../../../src/types";
|
import { RateLimiterMode } from "../../../src/types";
|
||||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||||
import { Logger } from "../../../src/lib/logger";
|
import { logger } from "../../../src/lib/logger";
|
||||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||||
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
|
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
|
import { Job } from "bullmq";
|
||||||
|
import { toLegacyDocument } from "../v1/types";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function getJobs(crawlId: string, ids: string[]) {
|
export async function getJobs(crawlId: string, ids: string[]) {
|
||||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as Job[];
|
||||||
|
|
||||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||||
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
|
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
|
||||||
|
@ -32,15 +34,17 @@ export async function getJobs(crawlId: string, ids: string[]) {
|
||||||
|
|
||||||
export async function crawlStatusController(req: Request, res: Response) {
|
export async function crawlStatusController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
const { success, team_id, error, status } = await authenticateUser(
|
const auth = await authenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
RateLimiterMode.CrawlStatus
|
RateLimiterMode.CrawlStatus
|
||||||
);
|
);
|
||||||
if (!success) {
|
if (!auth.success) {
|
||||||
return res.status(status).json({ error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const { team_id } = auth;
|
||||||
|
|
||||||
const sc = await getCrawl(req.params.jobId);
|
const sc = await getCrawl(req.params.jobId);
|
||||||
if (!sc) {
|
if (!sc) {
|
||||||
return res.status(404).json({ error: "Job not found" });
|
return res.status(404).json({ error: "Job not found" });
|
||||||
|
@ -90,12 +94,12 @@ export async function crawlStatusController(req: Request, res: Response) {
|
||||||
status: jobStatus,
|
status: jobStatus,
|
||||||
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
|
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
|
||||||
total: jobs.length,
|
total: jobs.length,
|
||||||
data: jobStatus === "completed" ? data : null,
|
data: jobStatus === "completed" ? data.map(x => toLegacyDocument(x, sc.internalOptions)) : null,
|
||||||
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
|
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null).map(x => toLegacyDocument(x, sc.internalOptions)),
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,24 +9,28 @@ import { validateIdempotencyKey } from "../../../src/services/idempotency/valida
|
||||||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { Logger } from "../../../src/lib/logger";
|
import { logger } from "../../../src/lib/logger";
|
||||||
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
|
import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
|
||||||
|
import { ZodError } from "zod";
|
||||||
|
|
||||||
export async function crawlController(req: Request, res: Response) {
|
export async function crawlController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
|
const auth = await authenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
RateLimiterMode.Crawl
|
RateLimiterMode.Crawl
|
||||||
);
|
);
|
||||||
if (!success) {
|
if (!auth.success) {
|
||||||
return res.status(status).json({ error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const { team_id, plan, chunk } = auth;
|
||||||
|
|
||||||
if (req.headers["x-idempotency-key"]) {
|
if (req.headers["x-idempotency-key"]) {
|
||||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||||
if (!isIdempotencyValid) {
|
if (!isIdempotencyValid) {
|
||||||
|
@ -35,7 +39,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
createIdempotencyKey(req);
|
createIdempotencyKey(req);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -77,7 +81,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
// TODO: need to do this to v1
|
// TODO: need to do this to v1
|
||||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||||
|
|
||||||
let url = req.body.url;
|
let url = urlSchema.parse(req.body.url);
|
||||||
if (!url) {
|
if (!url) {
|
||||||
return res.status(400).json({ error: "Url is required" });
|
return res.status(400).json({ error: "Url is required" });
|
||||||
}
|
}
|
||||||
|
@ -123,7 +127,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
// documents: docs,
|
// documents: docs,
|
||||||
// });
|
// });
|
||||||
// } catch (error) {
|
// } catch (error) {
|
||||||
// Logger.error(error);
|
// logger.error(error);
|
||||||
// return res.status(500).json({ error: error.message });
|
// return res.status(500).json({ error: error.message });
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
@ -132,10 +136,13 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
|
|
||||||
await logCrawl(id, team_id);
|
await logCrawl(id, team_id);
|
||||||
|
|
||||||
|
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
originUrl: url,
|
originUrl: url,
|
||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
pageOptions,
|
scrapeOptions,
|
||||||
|
internalOptions,
|
||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
|
@ -170,10 +177,11 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
data: {
|
data: {
|
||||||
url,
|
url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions,
|
||||||
|
scrapeOptions,
|
||||||
|
internalOptions,
|
||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan,
|
||||||
pageOptions: pageOptions,
|
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true,
|
||||||
|
@ -208,10 +216,11 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
{
|
{
|
||||||
url,
|
url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions,
|
||||||
|
scrapeOptions,
|
||||||
|
internalOptions,
|
||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan: plan!,
|
||||||
pageOptions: pageOptions,
|
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
},
|
},
|
||||||
|
@ -226,7 +235,9 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
res.json({ jobId: id });
|
res.json({ jobId: id });
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error instanceof ZodError
|
||||||
|
? "Invalid URL"
|
||||||
|
: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,15 +3,16 @@ import { authenticateUser } from "../auth";
|
||||||
import { RateLimiterMode } from "../../../src/types";
|
import { RateLimiterMode } from "../../../src/types";
|
||||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { Logger } from "../../../src/lib/logger";
|
import { logger } from "../../../src/lib/logger";
|
||||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
|
import { fromLegacyScrapeOptions } from "../v1/types";
|
||||||
|
|
||||||
export async function crawlPreviewController(req: Request, res: Response) {
|
export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
const { success, error, status, team_id:a, plan } = await authenticateUser(
|
const auth = await authenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
RateLimiterMode.Preview
|
RateLimiterMode.Preview
|
||||||
|
@ -19,10 +20,12 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
|
|
||||||
const team_id = "preview";
|
const team_id = "preview";
|
||||||
|
|
||||||
if (!success) {
|
if (!auth.success) {
|
||||||
return res.status(status).json({ error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const { plan } = auth;
|
||||||
|
|
||||||
let url = req.body.url;
|
let url = req.body.url;
|
||||||
if (!url) {
|
if (!url) {
|
||||||
return res.status(400).json({ error: "Url is required" });
|
return res.status(400).json({ error: "Url is required" });
|
||||||
|
@ -71,7 +74,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
// documents: docs,
|
// documents: docs,
|
||||||
// });
|
// });
|
||||||
// } catch (error) {
|
// } catch (error) {
|
||||||
// Logger.error(error);
|
// logger.error(error);
|
||||||
// return res.status(500).json({ error: error.message });
|
// return res.status(500).json({ error: error.message });
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
@ -84,10 +87,13 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
robots = await this.getRobotsTxt();
|
robots = await this.getRobotsTxt();
|
||||||
} catch (_) {}
|
} catch (_) {}
|
||||||
|
|
||||||
|
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
originUrl: url,
|
originUrl: url,
|
||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
pageOptions,
|
scrapeOptions,
|
||||||
|
internalOptions,
|
||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan,
|
||||||
robots,
|
robots,
|
||||||
|
@ -107,10 +113,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
await addScrapeJob({
|
await addScrapeJob({
|
||||||
url,
|
url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
crawlerOptions: crawlerOptions,
|
|
||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan: plan!,
|
||||||
pageOptions: pageOptions,
|
crawlerOptions,
|
||||||
|
scrapeOptions,
|
||||||
|
internalOptions,
|
||||||
origin: "website-preview",
|
origin: "website-preview",
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true,
|
||||||
|
@ -123,10 +130,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
await addScrapeJob({
|
await addScrapeJob({
|
||||||
url,
|
url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
crawlerOptions: crawlerOptions,
|
|
||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan: plan!,
|
||||||
pageOptions: pageOptions,
|
crawlerOptions,
|
||||||
|
scrapeOptions,
|
||||||
|
internalOptions,
|
||||||
origin: "website-preview",
|
origin: "website-preview",
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
}, {}, jobId);
|
}, {}, jobId);
|
||||||
|
@ -136,7 +144,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
res.json({ jobId: id });
|
res.json({ jobId: id });
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,13 +8,14 @@ import { authenticateUser } from "../auth";
|
||||||
export const keyAuthController = async (req: Request, res: Response) => {
|
export const keyAuthController = async (req: Request, res: Response) => {
|
||||||
try {
|
try {
|
||||||
// make sure to authenticate user first, Bearer <token>
|
// make sure to authenticate user first, Bearer <token>
|
||||||
const { success, team_id, error, status } = await authenticateUser(
|
const auth = await authenticateUser(
|
||||||
req,
|
req,
|
||||||
res
|
res
|
||||||
);
|
);
|
||||||
if (!success) {
|
if (!auth.success) {
|
||||||
return res.status(status).json({ error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
|
|
||||||
// if success, return success: true
|
// if success, return success: true
|
||||||
return res.status(200).json({ success: true });
|
return res.status(200).json({ success: true });
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|
|
@ -7,7 +7,7 @@ import {
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { PlanType, RateLimiterMode } from "../../types";
|
import { PlanType, RateLimiterMode } from "../../types";
|
||||||
import { logJob } from "../../services/logging/log_job";
|
import { logJob } from "../../services/logging/log_job";
|
||||||
import { Document } from "../../lib/entities";
|
import { Document, fromLegacyCombo, toLegacyDocument, url as urlSchema } from "../v1/types";
|
||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||||
import {
|
import {
|
||||||
|
@ -19,9 +19,11 @@ import {
|
||||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { Logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
|
import { fromLegacyScrapeOptions } from "../v1/types";
|
||||||
|
import { ZodError } from "zod";
|
||||||
|
|
||||||
export async function scrapeHelper(
|
export async function scrapeHelper(
|
||||||
jobId: string,
|
jobId: string,
|
||||||
|
@ -35,10 +37,10 @@ export async function scrapeHelper(
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
error?: string;
|
error?: string;
|
||||||
data?: Document;
|
data?: Document | { url: string };
|
||||||
returnCode: number;
|
returnCode: number;
|
||||||
}> {
|
}> {
|
||||||
const url = req.body.url;
|
const url = urlSchema.parse(req.body.url);
|
||||||
if (typeof url !== "string") {
|
if (typeof url !== "string") {
|
||||||
return { success: false, error: "Url is required", returnCode: 400 };
|
return { success: false, error: "Url is required", returnCode: 400 };
|
||||||
}
|
}
|
||||||
|
@ -54,15 +56,16 @@ export async function scrapeHelper(
|
||||||
|
|
||||||
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
|
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
|
||||||
|
|
||||||
|
const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions);
|
||||||
|
|
||||||
await addScrapeJob(
|
await addScrapeJob(
|
||||||
{
|
{
|
||||||
url,
|
url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
crawlerOptions,
|
|
||||||
team_id,
|
team_id,
|
||||||
pageOptions,
|
scrapeOptions,
|
||||||
plan,
|
internalOptions,
|
||||||
extractorOptions,
|
plan: plan!,
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
is_scrape: true,
|
is_scrape: true,
|
||||||
},
|
},
|
||||||
|
@ -81,7 +84,7 @@ export async function scrapeHelper(
|
||||||
},
|
},
|
||||||
async (span) => {
|
async (span) => {
|
||||||
try {
|
try {
|
||||||
doc = (await waitForJob(jobId, timeout))[0];
|
doc = (await waitForJob<Document>(jobId, timeout));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||||
span.setAttribute("timedOut", true);
|
span.setAttribute("timedOut", true);
|
||||||
|
@ -149,7 +152,7 @@ export async function scrapeHelper(
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
data: doc,
|
data: toLegacyDocument(doc, internalOptions),
|
||||||
returnCode: 200,
|
returnCode: 200,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -158,15 +161,17 @@ export async function scrapeController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
let earlyReturn = false;
|
let earlyReturn = false;
|
||||||
// make sure to authenticate user first, Bearer <token>
|
// make sure to authenticate user first, Bearer <token>
|
||||||
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
|
const auth = await authenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
RateLimiterMode.Scrape
|
RateLimiterMode.Scrape
|
||||||
);
|
);
|
||||||
if (!success) {
|
if (!auth.success) {
|
||||||
return res.status(status).json({ error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const { team_id, plan, chunk } = auth;
|
||||||
|
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||||
const extractorOptions = {
|
const extractorOptions = {
|
||||||
|
@ -200,7 +205,7 @@ export async function scrapeController(req: Request, res: Response) {
|
||||||
return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
|
return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
earlyReturn = true;
|
earlyReturn = true;
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
error:
|
error:
|
||||||
|
@ -224,8 +229,8 @@ export async function scrapeController(req: Request, res: Response) {
|
||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
const numTokens =
|
const numTokens =
|
||||||
result.data && result.data.markdown
|
result.data && (result.data as Document).markdown
|
||||||
? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
|
? numTokensFromString((result.data as Document).markdown!, "gpt-3.5-turbo")
|
||||||
: 0;
|
: 0;
|
||||||
|
|
||||||
if (result.success) {
|
if (result.success) {
|
||||||
|
@ -246,7 +251,7 @@ export async function scrapeController(req: Request, res: Response) {
|
||||||
if (creditsToBeBilled > 0) {
|
if (creditsToBeBilled > 0) {
|
||||||
// billing for doc done on queue end, bill only for llm extraction
|
// billing for doc done on queue end, bill only for llm extraction
|
||||||
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => {
|
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => {
|
||||||
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -254,17 +259,19 @@ export async function scrapeController(req: Request, res: Response) {
|
||||||
|
|
||||||
let doc = result.data;
|
let doc = result.data;
|
||||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
if (!pageOptions || !pageOptions.includeRawHtml) {
|
||||||
if (doc && doc.rawHtml) {
|
if (doc && (doc as Document).rawHtml) {
|
||||||
delete doc.rawHtml;
|
delete (doc as Document).rawHtml;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(pageOptions && pageOptions.includeExtract) {
|
if(pageOptions && pageOptions.includeExtract) {
|
||||||
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
|
if(!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
|
||||||
delete doc.markdown;
|
delete (doc as Document).markdown;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const { scrapeOptions } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
|
||||||
|
|
||||||
logJob({
|
logJob({
|
||||||
job_id: jobId,
|
job_id: jobId,
|
||||||
success: result.success,
|
success: result.success,
|
||||||
|
@ -276,19 +283,20 @@ export async function scrapeController(req: Request, res: Response) {
|
||||||
mode: "scrape",
|
mode: "scrape",
|
||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
pageOptions: pageOptions,
|
scrapeOptions,
|
||||||
origin: origin,
|
origin: origin,
|
||||||
extractor_options: extractorOptions,
|
|
||||||
num_tokens: numTokens,
|
num_tokens: numTokens,
|
||||||
});
|
});
|
||||||
|
|
||||||
return res.status(result.returnCode).json(result);
|
return res.status(result.returnCode).json(result);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
error:
|
error:
|
||||||
typeof error === "string"
|
error instanceof ZodError
|
||||||
|
? "Invalid URL"
|
||||||
|
: typeof error === "string"
|
||||||
? error
|
? error
|
||||||
: error?.message ?? "Internal Server Error",
|
: error?.message ?? "Internal Server Error",
|
||||||
});
|
});
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { WebScraperDataProvider } from "../../scraper/WebScraper";
|
|
||||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { PlanType, RateLimiterMode } from "../../types";
|
import { PlanType, RateLimiterMode } from "../../types";
|
||||||
|
@ -8,21 +7,23 @@ import { PageOptions, SearchOptions } from "../../lib/entities";
|
||||||
import { search } from "../../search";
|
import { search } from "../../search";
|
||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { Logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
|
import { Job } from "bullmq";
|
||||||
|
import { Document, fromLegacyCombo, fromLegacyScrapeOptions, toLegacyDocument } from "../v1/types";
|
||||||
|
|
||||||
export async function searchHelper(
|
export async function searchHelper(
|
||||||
jobId: string,
|
jobId: string,
|
||||||
req: Request,
|
req: Request,
|
||||||
team_id: string,
|
team_id: string,
|
||||||
subscription_id: string,
|
subscription_id: string | null | undefined,
|
||||||
crawlerOptions: any,
|
crawlerOptions: any,
|
||||||
pageOptions: PageOptions,
|
pageOptions: PageOptions,
|
||||||
searchOptions: SearchOptions,
|
searchOptions: SearchOptions,
|
||||||
plan: PlanType
|
plan: PlanType | undefined
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
error?: string;
|
error?: string;
|
||||||
|
@ -35,8 +36,8 @@ export async function searchHelper(
|
||||||
return { success: false, error: "Query is required", returnCode: 400 };
|
return { success: false, error: "Query is required", returnCode: 400 };
|
||||||
}
|
}
|
||||||
|
|
||||||
const tbs = searchOptions.tbs ?? null;
|
const tbs = searchOptions.tbs ?? undefined;
|
||||||
const filter = searchOptions.filter ?? null;
|
const filter = searchOptions.filter ?? undefined;
|
||||||
let num_results = Math.min(searchOptions.limit ?? 7, 10);
|
let num_results = Math.min(searchOptions.limit ?? 7, 10);
|
||||||
|
|
||||||
if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
|
if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
|
||||||
|
@ -58,10 +59,11 @@ export async function searchHelper(
|
||||||
|
|
||||||
let justSearch = pageOptions.fetchPageContent === false;
|
let justSearch = pageOptions.fetchPageContent === false;
|
||||||
|
|
||||||
|
const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, undefined, 60000, crawlerOptions);
|
||||||
|
|
||||||
if (justSearch) {
|
if (justSearch) {
|
||||||
billTeam(team_id, subscription_id, res.length).catch(error => {
|
billTeam(team_id, subscription_id, res.length).catch(error => {
|
||||||
Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
|
logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
return { success: true, data: res, returnCode: 200 };
|
return { success: true, data: res, returnCode: 200 };
|
||||||
|
@ -88,9 +90,9 @@ export async function searchHelper(
|
||||||
data: {
|
data: {
|
||||||
url,
|
url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
crawlerOptions: crawlerOptions,
|
|
||||||
team_id: team_id,
|
team_id: team_id,
|
||||||
pageOptions: pageOptions,
|
scrapeOptions,
|
||||||
|
internalOptions,
|
||||||
},
|
},
|
||||||
opts: {
|
opts: {
|
||||||
jobId: uuid,
|
jobId: uuid,
|
||||||
|
@ -104,7 +106,7 @@ export async function searchHelper(
|
||||||
await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority)
|
await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority)
|
||||||
}
|
}
|
||||||
|
|
||||||
const docs = (await Promise.all(jobDatas.map(x => waitForJob(x.opts.jobId, 60000)))).map(x => x[0]);
|
const docs = (await Promise.all(jobDatas.map(x => waitForJob<Document>(x.opts.jobId, 60000)))).map(x => toLegacyDocument(x, internalOptions));
|
||||||
|
|
||||||
if (docs.length === 0) {
|
if (docs.length === 0) {
|
||||||
return { success: true, error: "No search results found", returnCode: 200 };
|
return { success: true, error: "No search results found", returnCode: 200 };
|
||||||
|
@ -115,7 +117,7 @@ export async function searchHelper(
|
||||||
|
|
||||||
// make sure doc.content is not empty
|
// make sure doc.content is not empty
|
||||||
const filteredDocs = docs.filter(
|
const filteredDocs = docs.filter(
|
||||||
(doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0
|
(doc: any) => doc && doc.content && doc.content.trim().length > 0
|
||||||
);
|
);
|
||||||
|
|
||||||
if (filteredDocs.length === 0) {
|
if (filteredDocs.length === 0) {
|
||||||
|
@ -132,14 +134,15 @@ export async function searchHelper(
|
||||||
export async function searchController(req: Request, res: Response) {
|
export async function searchController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
// make sure to authenticate user first, Bearer <token>
|
// make sure to authenticate user first, Bearer <token>
|
||||||
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
|
const auth = await authenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
RateLimiterMode.Search
|
RateLimiterMode.Search
|
||||||
);
|
);
|
||||||
if (!success) {
|
if (!auth.success) {
|
||||||
return res.status(status).json({ error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
|
const { team_id, plan, chunk } = auth;
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? {
|
const pageOptions = req.body.pageOptions ?? {
|
||||||
includeHtml: req.body.pageOptions?.includeHtml ?? false,
|
includeHtml: req.body.pageOptions?.includeHtml ?? false,
|
||||||
|
@ -162,7 +165,7 @@ export async function searchController(req: Request, res: Response) {
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: "Internal server error" });
|
return res.status(500).json({ error: "Internal server error" });
|
||||||
}
|
}
|
||||||
const startTime = new Date().getTime();
|
const startTime = new Date().getTime();
|
||||||
|
@ -189,7 +192,6 @@ export async function searchController(req: Request, res: Response) {
|
||||||
mode: "search",
|
mode: "search",
|
||||||
url: req.body.query,
|
url: req.body.query,
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
pageOptions: pageOptions,
|
|
||||||
origin: origin,
|
origin: origin,
|
||||||
});
|
});
|
||||||
return res.status(result.returnCode).json(result);
|
return res.status(result.returnCode).json(result);
|
||||||
|
@ -199,7 +201,7 @@ export async function searchController(req: Request, res: Response) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error("Unhandled error occurred in search", { error });
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { Logger } from "../../../src/lib/logger";
|
import { logger } from "../../../src/lib/logger";
|
||||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||||
import { getJobs } from "./crawl-status";
|
import { getJobs } from "./crawl-status";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
|
@ -37,7 +37,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,8 +4,6 @@ import {
|
||||||
BatchScrapeRequest,
|
BatchScrapeRequest,
|
||||||
batchScrapeRequestSchema,
|
batchScrapeRequestSchema,
|
||||||
CrawlResponse,
|
CrawlResponse,
|
||||||
legacyExtractorOptions,
|
|
||||||
legacyScrapeOptions,
|
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import {
|
import {
|
||||||
|
@ -29,19 +27,16 @@ export async function batchScrapeController(
|
||||||
|
|
||||||
await logCrawl(id, req.auth.team_id);
|
await logCrawl(id, req.auth.team_id);
|
||||||
|
|
||||||
let { remainingCredits } = req.account;
|
let { remainingCredits } = req.account!;
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||||
if(!useDbAuthentication){
|
if(!useDbAuthentication){
|
||||||
remainingCredits = Infinity;
|
remainingCredits = Infinity;
|
||||||
}
|
}
|
||||||
|
|
||||||
const pageOptions = legacyScrapeOptions(req.body);
|
|
||||||
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
|
|
||||||
|
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
crawlerOptions: null,
|
crawlerOptions: null,
|
||||||
pageOptions,
|
scrapeOptions: req.body,
|
||||||
|
internalOptions: {},
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan,
|
||||||
|
@ -64,10 +59,9 @@ export async function batchScrapeController(
|
||||||
url: x,
|
url: x,
|
||||||
mode: "single_urls" as const,
|
mode: "single_urls" as const,
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan!,
|
||||||
crawlerOptions: null,
|
crawlerOptions: null,
|
||||||
pageOptions,
|
scrapeOptions: req.body,
|
||||||
extractorOptions,
|
|
||||||
origin: "api",
|
origin: "api",
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true,
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
import { supabase_service } from "../../services/supabase";
|
import { supabase_service } from "../../services/supabase";
|
||||||
import { Logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
|
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
|
@ -36,7 +36,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
|
||||||
sc.cancelled = true;
|
sc.cancelled = true;
|
||||||
await saveCrawl(req.params.jobId, sc);
|
await saveCrawl(req.params.jobId, sc);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
|
@ -44,7 +44,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,14 +1,15 @@
|
||||||
import { authMiddleware } from "../../routes/v1";
|
import { authMiddleware } from "../../routes/v1";
|
||||||
import { RateLimiterMode } from "../../types";
|
import { RateLimiterMode } from "../../types";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types";
|
||||||
import { WebSocket } from "ws";
|
import { WebSocket } from "ws";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { Logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
|
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
import { getJob, getJobs } from "./crawl-status";
|
import { getJob, getJobs } from "./crawl-status";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
|
import { Job, JobState } from "bullmq";
|
||||||
|
|
||||||
type ErrorMessage = {
|
type ErrorMessage = {
|
||||||
type: "error",
|
type: "error",
|
||||||
|
@ -56,7 +57,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
||||||
return close(ws, 3003, { type: "error", error: "Forbidden" });
|
return close(ws, 3003, { type: "error", error: "Forbidden" });
|
||||||
}
|
}
|
||||||
|
|
||||||
let doneJobIDs = [];
|
let doneJobIDs: string[] = [];
|
||||||
let finished = false;
|
let finished = false;
|
||||||
|
|
||||||
const loop = async () => {
|
const loop = async () => {
|
||||||
|
@ -70,15 +71,14 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
||||||
|
|
||||||
const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
|
const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
|
||||||
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
|
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
|
||||||
const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
|
const newlyDoneJobIDs: string[] = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
|
||||||
|
const newlyDoneJobs: Job[] = (await Promise.all(newlyDoneJobIDs.map(x => getJob(x)))).filter(x => x !== undefined) as Job[]
|
||||||
for (const jobID of newlyDoneJobIDs) {
|
|
||||||
const job = await getJob(jobID);
|
|
||||||
|
|
||||||
|
for (const job of newlyDoneJobs) {
|
||||||
if (job.returnvalue) {
|
if (job.returnvalue) {
|
||||||
send(ws, {
|
send(ws, {
|
||||||
type: "document",
|
type: "document",
|
||||||
data: legacyDocumentConverter(job.returnvalue),
|
data: job.returnvalue,
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
return close(ws, 3000, { type: "error", error: job.failedReason });
|
return close(ws, 3000, { type: "error", error: job.failedReason });
|
||||||
|
@ -100,8 +100,8 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
||||||
|
|
||||||
const throttledJobsSet = new Set(throttledJobs);
|
const throttledJobsSet = new Set(throttledJobs);
|
||||||
|
|
||||||
const validJobStatuses = [];
|
const validJobStatuses: [string, JobState | "unknown"][] = [];
|
||||||
const validJobIDs = [];
|
const validJobIDs: string[] = [];
|
||||||
|
|
||||||
for (const [id, status] of jobStatuses) {
|
for (const [id, status] of jobStatuses) {
|
||||||
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
||||||
|
@ -126,7 +126,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
||||||
completed: doneJobIDs.length,
|
completed: doneJobIDs.length,
|
||||||
creditsUsed: jobIDs.length,
|
creditsUsed: jobIDs.length,
|
||||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||||
data: data.map(x => legacyDocumentConverter(x)),
|
data: data,
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -139,19 +139,21 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
||||||
// Basically just middleware and error wrapping
|
// Basically just middleware and error wrapping
|
||||||
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
||||||
try {
|
try {
|
||||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
const auth = await authenticateUser(
|
||||||
req,
|
req,
|
||||||
null,
|
null,
|
||||||
RateLimiterMode.CrawlStatus,
|
RateLimiterMode.CrawlStatus,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!success) {
|
if (!auth.success) {
|
||||||
return close(ws, 3000, {
|
return close(ws, 3000, {
|
||||||
type: "error",
|
type: "error",
|
||||||
error,
|
error: auth.error,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const { team_id, plan } = auth;
|
||||||
|
|
||||||
req.auth = { team_id, plan };
|
req.auth = { team_id, plan };
|
||||||
|
|
||||||
await crawlStatusWS(ws, req);
|
await crawlStatusWS(ws, req);
|
||||||
|
@ -170,7 +172,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||||
return close(ws, 1011, {
|
return close(ws, 1011, {
|
||||||
type: "error",
|
type: "error",
|
||||||
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
|
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types";
|
||||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis";
|
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
|
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
|
import { Job, JobState } from "bullmq";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function getJob(id: string) {
|
export async function getJob(id: string) {
|
||||||
|
@ -24,7 +25,7 @@ export async function getJob(id: string) {
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getJobs(ids: string[]) {
|
export async function getJobs(ids: string[]) {
|
||||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
const jobs: (Job & { id: string })[] = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as (Job & {id: string})[];
|
||||||
|
|
||||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||||
const supabaseData = await supabaseGetJobsById(ids);
|
const supabaseData = await supabaseGetJobsById(ids);
|
||||||
|
@ -63,8 +64,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
||||||
|
|
||||||
const throttledJobsSet = new Set(throttledJobs);
|
const throttledJobsSet = new Set(throttledJobs);
|
||||||
|
|
||||||
const validJobStatuses = [];
|
const validJobStatuses: [string, JobState | "unknown"][] = [];
|
||||||
const validJobIDs = [];
|
const validJobIDs: string[] = [];
|
||||||
|
|
||||||
for (const [id, status] of jobStatuses) {
|
for (const [id, status] of jobStatuses) {
|
||||||
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
||||||
|
@ -81,7 +82,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
||||||
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
|
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
|
||||||
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
|
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
|
||||||
|
|
||||||
let doneJobs = [];
|
let doneJobs: Job[] = [];
|
||||||
|
|
||||||
if (end === undefined) { // determine 10 megabyte limit
|
if (end === undefined) { // determine 10 megabyte limit
|
||||||
let bytes = 0;
|
let bytes = 0;
|
||||||
|
@ -98,7 +99,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
||||||
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
|
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
|
||||||
const job = jobs[ii];
|
const job = jobs[ii];
|
||||||
doneJobs.push(job);
|
doneJobs.push(job);
|
||||||
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
|
bytes += JSON.stringify(job.returnvalue).length;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,7 +123,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
||||||
}
|
}
|
||||||
|
|
||||||
if (data.length > 0) {
|
if (data.length > 0) {
|
||||||
if (!doneJobs[0].data.pageOptions.includeRawHtml) {
|
if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) {
|
||||||
for (let ii = 0; ii < doneJobs.length; ii++) {
|
for (let ii = 0; ii < doneJobs.length; ii++) {
|
||||||
if (data[ii]) {
|
if (data[ii]) {
|
||||||
delete data[ii].rawHtml;
|
delete data[ii].rawHtml;
|
||||||
|
@ -142,7 +143,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
||||||
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
|
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
|
||||||
? undefined
|
? undefined
|
||||||
: nextURL.href,
|
: nextURL.href,
|
||||||
data: data.map(x => legacyDocumentConverter(x)),
|
data: data,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4,9 +4,8 @@ import {
|
||||||
CrawlRequest,
|
CrawlRequest,
|
||||||
crawlRequestSchema,
|
crawlRequestSchema,
|
||||||
CrawlResponse,
|
CrawlResponse,
|
||||||
legacyCrawlerOptions,
|
|
||||||
legacyScrapeOptions,
|
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
|
toLegacyCrawlerOptions,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import {
|
import {
|
||||||
addCrawlJob,
|
addCrawlJob,
|
||||||
|
@ -20,9 +19,10 @@ import {
|
||||||
import { logCrawl } from "../../services/logging/crawl_log";
|
import { logCrawl } from "../../services/logging/crawl_log";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
import { addScrapeJob } from "../../services/queue-jobs";
|
import { addScrapeJob } from "../../services/queue-jobs";
|
||||||
import { Logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
import { callWebhook } from "../../services/webhook";
|
import { callWebhook } from "../../services/webhook";
|
||||||
|
import { scrapeOptions as scrapeOptionsSchema } from "./types";
|
||||||
|
|
||||||
export async function crawlController(
|
export async function crawlController(
|
||||||
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
||||||
|
@ -34,18 +34,22 @@ export async function crawlController(
|
||||||
|
|
||||||
await logCrawl(id, req.auth.team_id);
|
await logCrawl(id, req.auth.team_id);
|
||||||
|
|
||||||
let { remainingCredits } = req.account;
|
let { remainingCredits } = req.account!;
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||||
if(!useDbAuthentication){
|
if(!useDbAuthentication){
|
||||||
remainingCredits = Infinity;
|
remainingCredits = Infinity;
|
||||||
}
|
}
|
||||||
|
|
||||||
const crawlerOptions = legacyCrawlerOptions(req.body);
|
const crawlerOptions = {
|
||||||
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
|
...req.body,
|
||||||
|
url: undefined,
|
||||||
|
scrapeOptions: undefined,
|
||||||
|
};
|
||||||
|
const scrapeOptions = req.body.scrapeOptions;
|
||||||
|
|
||||||
// TODO: @rafa, is this right? copied from v0
|
// TODO: @rafa, is this right? copied from v0
|
||||||
if (Array.isArray(crawlerOptions.includes)) {
|
if (Array.isArray(crawlerOptions.includePaths)) {
|
||||||
for (const x of crawlerOptions.includes) {
|
for (const x of crawlerOptions.includePaths) {
|
||||||
try {
|
try {
|
||||||
new RegExp(x);
|
new RegExp(x);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
@ -54,8 +58,8 @@ export async function crawlController(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Array.isArray(crawlerOptions.excludes)) {
|
if (Array.isArray(crawlerOptions.excludePaths)) {
|
||||||
for (const x of crawlerOptions.excludes) {
|
for (const x of crawlerOptions.excludePaths) {
|
||||||
try {
|
try {
|
||||||
new RegExp(x);
|
new RegExp(x);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
@ -68,8 +72,9 @@ export async function crawlController(
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
originUrl: req.body.url,
|
originUrl: req.body.url,
|
||||||
crawlerOptions,
|
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
|
||||||
pageOptions,
|
scrapeOptions,
|
||||||
|
internalOptions: {},
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan,
|
||||||
|
@ -78,9 +83,9 @@ export async function crawlController(
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
|
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
Logger.debug(
|
logger.debug(
|
||||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||||
e
|
e
|
||||||
)}`
|
)}`
|
||||||
|
@ -112,7 +117,7 @@ export async function crawlController(
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan,
|
||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
pageOptions,
|
scrapeOptions,
|
||||||
origin: "api",
|
origin: "api",
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true,
|
||||||
|
@ -142,10 +147,10 @@ export async function crawlController(
|
||||||
{
|
{
|
||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
crawlerOptions: crawlerOptions,
|
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
plan: req.auth.plan,
|
crawlerOptions,
|
||||||
pageOptions: pageOptions,
|
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
|
||||||
|
plan: req.auth.plan!,
|
||||||
origin: "api",
|
origin: "api",
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
webhook: req.body.webhook,
|
webhook: req.body.webhook,
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import {
|
import {
|
||||||
legacyCrawlerOptions,
|
|
||||||
mapRequestSchema,
|
mapRequestSchema,
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
|
scrapeOptions,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
||||||
import { MapResponse, MapRequest } from "./types";
|
import { MapResponse, MapRequest } from "./types";
|
||||||
|
@ -18,11 +18,11 @@ import { fireEngineMap } from "../../search/fireEngine";
|
||||||
import { billTeam } from "../../services/billing/credit_billing";
|
import { billTeam } from "../../services/billing/credit_billing";
|
||||||
import { logJob } from "../../services/logging/log_job";
|
import { logJob } from "../../services/logging/log_job";
|
||||||
import { performCosineSimilarity } from "../../lib/map-cosine";
|
import { performCosineSimilarity } from "../../lib/map-cosine";
|
||||||
import { Logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import Redis from "ioredis";
|
import Redis from "ioredis";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
const redis = new Redis(process.env.REDIS_URL);
|
const redis = new Redis(process.env.REDIS_URL!);
|
||||||
|
|
||||||
// Max Links that /map can return
|
// Max Links that /map can return
|
||||||
const MAX_MAP_LIMIT = 5000;
|
const MAX_MAP_LIMIT = 5000;
|
||||||
|
@ -44,8 +44,12 @@ export async function mapController(
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
originUrl: req.body.url,
|
originUrl: req.body.url,
|
||||||
crawlerOptions: legacyCrawlerOptions(req.body),
|
crawlerOptions: {
|
||||||
pageOptions: {},
|
...req.body,
|
||||||
|
scrapeOptions: undefined,
|
||||||
|
},
|
||||||
|
scrapeOptions: scrapeOptions.parse({}),
|
||||||
|
internalOptions: {},
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan,
|
||||||
|
@ -65,8 +69,8 @@ export async function mapController(
|
||||||
const cacheKey = `fireEngineMap:${mapUrl}`;
|
const cacheKey = `fireEngineMap:${mapUrl}`;
|
||||||
const cachedResult = null;
|
const cachedResult = null;
|
||||||
|
|
||||||
let allResults: any[];
|
let allResults: any[] = [];
|
||||||
let pagePromises: Promise<any>[];
|
let pagePromises: Promise<any>[] = [];
|
||||||
|
|
||||||
if (cachedResult) {
|
if (cachedResult) {
|
||||||
allResults = JSON.parse(cachedResult);
|
allResults = JSON.parse(cachedResult);
|
||||||
|
@ -139,7 +143,7 @@ export async function mapController(
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.filter((x) => x !== null);
|
.filter((x) => x !== null) as string[];
|
||||||
|
|
||||||
// allows for subdomains to be included
|
// allows for subdomains to be included
|
||||||
links = links.filter((x) => isSameDomain(x, req.body.url));
|
links = links.filter((x) => isSameDomain(x, req.body.url));
|
||||||
|
@ -153,7 +157,7 @@ export async function mapController(
|
||||||
links = removeDuplicateUrls(links);
|
links = removeDuplicateUrls(links);
|
||||||
|
|
||||||
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
|
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
|
||||||
Logger.error(
|
logger.error(
|
||||||
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
|
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
|
||||||
);
|
);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
|
@ -175,9 +179,8 @@ export async function mapController(
|
||||||
mode: "map",
|
mode: "map",
|
||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
crawlerOptions: {},
|
crawlerOptions: {},
|
||||||
pageOptions: {},
|
scrapeOptions: {},
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
extractor_options: { mode: "markdown" },
|
|
||||||
num_tokens: 0,
|
num_tokens: 0,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ export async function scrapeStatusController(req: any, res: any) {
|
||||||
|
|
||||||
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
|
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
|
||||||
|
|
||||||
if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
|
if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
|
||||||
return res.status(403).json({
|
return res.status(403).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "You are not allowed to access this resource.",
|
error: "You are not allowed to access this resource.",
|
||||||
|
|
|
@ -1,10 +1,7 @@
|
||||||
import { Request, Response } from "express";
|
import { Response } from "express";
|
||||||
import { Logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import {
|
import {
|
||||||
Document,
|
Document,
|
||||||
legacyDocumentConverter,
|
|
||||||
legacyExtractorOptions,
|
|
||||||
legacyScrapeOptions,
|
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
ScrapeRequest,
|
ScrapeRequest,
|
||||||
scrapeRequestSchema,
|
scrapeRequestSchema,
|
||||||
|
@ -12,7 +9,6 @@ import {
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { billTeam } from "../../services/billing/credit_billing";
|
import { billTeam } from "../../services/billing/credit_billing";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
|
||||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||||
import { logJob } from "../../services/logging/log_job";
|
import { logJob } from "../../services/logging/log_job";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
|
@ -28,8 +24,6 @@ export async function scrapeController(
|
||||||
|
|
||||||
const origin = req.body.origin;
|
const origin = req.body.origin;
|
||||||
const timeout = req.body.timeout;
|
const timeout = req.body.timeout;
|
||||||
const pageOptions = legacyScrapeOptions(req.body);
|
|
||||||
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
|
|
||||||
const jobId = uuidv4();
|
const jobId = uuidv4();
|
||||||
|
|
||||||
const startTime = new Date().getTime();
|
const startTime = new Date().getTime();
|
||||||
|
@ -43,11 +37,10 @@ export async function scrapeController(
|
||||||
{
|
{
|
||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
crawlerOptions: {},
|
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
plan: req.auth.plan,
|
scrapeOptions: req.body,
|
||||||
pageOptions,
|
internalOptions: {},
|
||||||
extractorOptions,
|
plan: req.auth.plan!,
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
is_scrape: true,
|
is_scrape: true,
|
||||||
},
|
},
|
||||||
|
@ -56,13 +49,13 @@ export async function scrapeController(
|
||||||
jobPriority
|
jobPriority
|
||||||
);
|
);
|
||||||
|
|
||||||
const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);
|
const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
|
||||||
|
|
||||||
let doc: any | undefined;
|
let doc: Document;
|
||||||
try {
|
try {
|
||||||
doc = (await waitForJob(jobId, timeout + totalWait))[0];
|
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
Logger.error(`Error in scrapeController: ${e}`);
|
logger.error(`Error in scrapeController: ${e}`);
|
||||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||||
return res.status(408).json({
|
return res.status(408).json({
|
||||||
success: false,
|
success: false,
|
||||||
|
@ -71,34 +64,19 @@ export async function scrapeController(
|
||||||
} else {
|
} else {
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
|
error: `(Internal server error) - ${e && e?.message ? e.message : e}`,
|
||||||
extractorOptions && extractorOptions.mode !== "markdown"
|
|
||||||
? " - Could be due to LLM parsing issues"
|
|
||||||
: ""
|
|
||||||
}`,
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
await getScrapeQueue().remove(jobId);
|
await getScrapeQueue().remove(jobId);
|
||||||
|
|
||||||
if (!doc) {
|
|
||||||
console.error("!!! PANIC DOC IS", doc);
|
|
||||||
return res.status(200).json({
|
|
||||||
success: true,
|
|
||||||
warning: "No page found",
|
|
||||||
data: doc,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
delete doc.index;
|
|
||||||
delete doc.provider;
|
|
||||||
|
|
||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
const numTokens =
|
const numTokens =
|
||||||
doc && doc.markdown
|
doc && doc.extract
|
||||||
? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
|
// ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
|
||||||
|
? 0 // TODO: fix
|
||||||
: 0;
|
: 0;
|
||||||
|
|
||||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||||
|
@ -111,22 +89,16 @@ export async function scrapeController(
|
||||||
}
|
}
|
||||||
|
|
||||||
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
|
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
|
||||||
Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
|
logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
if (!req.body.formats.includes("rawHtml")) {
|
||||||
if (doc && doc.rawHtml) {
|
if (doc && doc.rawHtml) {
|
||||||
delete doc.rawHtml;
|
delete doc.rawHtml;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(pageOptions && pageOptions.includeExtract) {
|
|
||||||
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
|
|
||||||
delete doc.markdown;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
logJob({
|
logJob({
|
||||||
job_id: jobId,
|
job_id: jobId,
|
||||||
success: true,
|
success: true,
|
||||||
|
@ -137,16 +109,14 @@ export async function scrapeController(
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
mode: "scrape",
|
mode: "scrape",
|
||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
crawlerOptions: {},
|
scrapeOptions: req.body,
|
||||||
pageOptions: pageOptions,
|
|
||||||
origin: origin,
|
origin: origin,
|
||||||
extractor_options: extractorOptions,
|
|
||||||
num_tokens: numTokens,
|
num_tokens: numTokens,
|
||||||
});
|
});
|
||||||
|
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
data: legacyDocumentConverter(doc),
|
data: doc,
|
||||||
scrape_id: origin?.includes("website") ? jobId : undefined,
|
scrape_id: origin?.includes("website") ? jobId : undefined,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||||
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
|
|
||||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||||
import { PlanType } from "../../types";
|
import { PlanType } from "../../types";
|
||||||
import { countries } from "../../lib/validate-country";
|
import { countries } from "../../lib/validate-country";
|
||||||
|
import { ExtractorOptions, PageOptions, ScrapeActionContent, Document as V0Document } from "../../lib/entities";
|
||||||
|
import { InternalOptions } from "../../scraper/scrapeURL";
|
||||||
|
|
||||||
export type Format =
|
export type Format =
|
||||||
| "markdown"
|
| "markdown"
|
||||||
|
@ -167,6 +168,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({
|
||||||
});
|
});
|
||||||
|
|
||||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||||
|
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
|
||||||
|
|
||||||
export const batchScrapeRequestSchema = scrapeOptions.extend({
|
export const batchScrapeRequestSchema = scrapeOptions.extend({
|
||||||
urls: url.array(),
|
urls: url.array(),
|
||||||
|
@ -240,7 +242,7 @@ export const mapRequestSchema = crawlerOptions.extend({
|
||||||
includeSubdomains: z.boolean().default(true),
|
includeSubdomains: z.boolean().default(true),
|
||||||
search: z.string().optional(),
|
search: z.string().optional(),
|
||||||
ignoreSitemap: z.boolean().default(false),
|
ignoreSitemap: z.boolean().default(false),
|
||||||
limit: z.number().min(1).max(5000).default(5000).optional(),
|
limit: z.number().min(1).max(5000).default(5000),
|
||||||
}).strict(strictMessage);
|
}).strict(strictMessage);
|
||||||
|
|
||||||
// export type MapRequest = {
|
// export type MapRequest = {
|
||||||
|
@ -252,13 +254,14 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;
|
||||||
|
|
||||||
export type Document = {
|
export type Document = {
|
||||||
markdown?: string;
|
markdown?: string;
|
||||||
extract?: string;
|
extract?: any;
|
||||||
html?: string;
|
html?: string;
|
||||||
rawHtml?: string;
|
rawHtml?: string;
|
||||||
links?: string[];
|
links?: string[];
|
||||||
screenshot?: string;
|
screenshot?: string;
|
||||||
actions?: {
|
actions?: {
|
||||||
screenshots: string[];
|
screenshots?: string[];
|
||||||
|
scrapes?: ScrapeActionContent[];
|
||||||
};
|
};
|
||||||
warning?: string;
|
warning?: string;
|
||||||
metadata: {
|
metadata: {
|
||||||
|
@ -291,11 +294,11 @@ export type Document = {
|
||||||
publishedTime?: string;
|
publishedTime?: string;
|
||||||
articleTag?: string;
|
articleTag?: string;
|
||||||
articleSection?: string;
|
articleSection?: string;
|
||||||
|
url?: string;
|
||||||
sourceURL?: string;
|
sourceURL?: string;
|
||||||
statusCode?: number;
|
statusCode?: number;
|
||||||
error?: string;
|
error?: string;
|
||||||
[key: string]: string | string[] | number | undefined;
|
[key: string]: string | string[] | number | undefined;
|
||||||
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -366,7 +369,7 @@ export type CrawlStatusResponse =
|
||||||
|
|
||||||
type AuthObject = {
|
type AuthObject = {
|
||||||
team_id: string;
|
team_id: string;
|
||||||
plan: PlanType;
|
plan: PlanType | undefined;
|
||||||
};
|
};
|
||||||
|
|
||||||
type Account = {
|
type Account = {
|
||||||
|
@ -439,7 +442,7 @@ export interface ResponseWithSentry<
|
||||||
sentry?: string,
|
sentry?: string,
|
||||||
}
|
}
|
||||||
|
|
||||||
export function legacyCrawlerOptions(x: CrawlerOptions) {
|
export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
||||||
return {
|
return {
|
||||||
includes: x.includePaths,
|
includes: x.includePaths,
|
||||||
excludes: x.excludePaths,
|
excludes: x.excludePaths,
|
||||||
|
@ -453,68 +456,90 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } {
|
||||||
return {
|
return {
|
||||||
includeMarkdown: x.formats.includes("markdown"),
|
crawlOptions: crawlerOptions.parse({
|
||||||
includeHtml: x.formats.includes("html"),
|
includePaths: x.includes,
|
||||||
includeRawHtml: x.formats.includes("rawHtml"),
|
excludePaths: x.excludes,
|
||||||
includeExtract: x.formats.includes("extract"),
|
limit: x.maxCrawledLinks ?? x.limit,
|
||||||
onlyIncludeTags: x.includeTags,
|
maxDepth: x.maxDepth,
|
||||||
removeTags: x.excludeTags,
|
allowBackwardLinks: x.allowBackwardCrawling,
|
||||||
onlyMainContent: x.onlyMainContent,
|
allowExternalLinks: x.allowExternalContentLinks,
|
||||||
waitFor: x.waitFor,
|
ignoreSitemap: x.ignoreSitemap,
|
||||||
headers: x.headers,
|
// TODO: returnOnlyUrls support
|
||||||
includeLinks: x.formats.includes("links"),
|
}),
|
||||||
screenshot: x.formats.includes("screenshot"),
|
internalOptions: {
|
||||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
v0CrawlOnlyUrls: x.returnOnlyUrls,
|
||||||
parsePDF: x.parsePDF,
|
|
||||||
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
|
||||||
geolocation: x.location ?? x.geolocation,
|
|
||||||
skipTlsVerification: x.skipTlsVerification,
|
|
||||||
removeBase64Images: x.removeBase64Images,
|
|
||||||
mobile: x.mobile,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
|
|
||||||
return {
|
|
||||||
mode: x.mode ? "llm-extraction" : "markdown",
|
|
||||||
extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
|
|
||||||
extractionSchema: x.schema,
|
|
||||||
userPrompt: x.prompt ?? "",
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
export function legacyDocumentConverter(doc: any): Document {
|
|
||||||
if (doc === null || doc === undefined) return null;
|
|
||||||
|
|
||||||
if (doc.metadata) {
|
|
||||||
if (doc.metadata.screenshot) {
|
|
||||||
doc.screenshot = doc.metadata.screenshot;
|
|
||||||
delete doc.metadata.screenshot;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (doc.metadata.fullPageScreenshot) {
|
|
||||||
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
|
|
||||||
delete doc.metadata.fullPageScreenshot;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
markdown: doc.markdown,
|
|
||||||
links: doc.linksOnPage,
|
|
||||||
rawHtml: doc.rawHtml,
|
|
||||||
html: doc.html,
|
|
||||||
extract: doc.llm_extraction,
|
|
||||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
|
||||||
actions: doc.actions ?? undefined,
|
|
||||||
warning: doc.warning ?? undefined,
|
|
||||||
metadata: {
|
|
||||||
...doc.metadata,
|
|
||||||
pageError: undefined,
|
|
||||||
pageStatusCode: undefined,
|
|
||||||
error: doc.metadata?.pageError,
|
|
||||||
statusCode: doc.metadata?.pageStatusCode,
|
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
|
||||||
|
return {
|
||||||
|
scrapeOptions: scrapeOptions.parse({
|
||||||
|
formats: [
|
||||||
|
(pageOptions.includeMarkdown ?? true) ? "markdown" as const : null,
|
||||||
|
(pageOptions.includeHtml ?? false) ? "html" as const : null,
|
||||||
|
(pageOptions.includeRawHtml ?? false) ? "rawHtml" as const : null,
|
||||||
|
(pageOptions.screenshot ?? false) ? "screenshot" as const : null,
|
||||||
|
(pageOptions.fullPageScreenshot ?? false) ? "screenshot@fullPage" as const : null,
|
||||||
|
(extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction")) ? "extract" as const : null,
|
||||||
|
"links"
|
||||||
|
].filter(x => x !== null),
|
||||||
|
waitFor: pageOptions.waitFor,
|
||||||
|
headers: pageOptions.headers,
|
||||||
|
includeTags: (typeof pageOptions.onlyIncludeTags === "string" ? [pageOptions.onlyIncludeTags] : pageOptions.onlyIncludeTags),
|
||||||
|
excludeTags: (typeof pageOptions.removeTags === "string" ? [pageOptions.removeTags] : pageOptions.removeTags),
|
||||||
|
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||||
|
timeout: timeout,
|
||||||
|
parsePDF: pageOptions.parsePDF,
|
||||||
|
actions: pageOptions.actions,
|
||||||
|
location: pageOptions.geolocation,
|
||||||
|
skipTlsVerification: pageOptions.skipTlsVerification,
|
||||||
|
removeBase64Images: pageOptions.removeBase64Images,
|
||||||
|
extract: extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction") ? {
|
||||||
|
systemPrompt: extractorOptions.extractionPrompt,
|
||||||
|
prompt: extractorOptions.userPrompt,
|
||||||
|
schema: extractorOptions.extractionSchema,
|
||||||
|
} : undefined,
|
||||||
|
mobile: pageOptions.mobile,
|
||||||
|
}),
|
||||||
|
internalOptions: {
|
||||||
|
atsv: pageOptions.atsv,
|
||||||
|
v0DisableJsDom: pageOptions.disableJsDom,
|
||||||
|
v0UseFastMode: pageOptions.useFastMode,
|
||||||
|
},
|
||||||
|
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} {
|
||||||
|
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
|
||||||
|
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
|
||||||
|
return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
|
||||||
|
}
|
||||||
|
|
||||||
|
export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } {
|
||||||
|
if (internalOptions.v0CrawlOnlyUrls) {
|
||||||
|
return { url: document.metadata.sourceURL! };
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
content: document.markdown!,
|
||||||
|
markdown: document.markdown!,
|
||||||
|
html: document.html,
|
||||||
|
rawHtml: document.rawHtml,
|
||||||
|
linksOnPage: document.links,
|
||||||
|
llm_extraction: document.extract,
|
||||||
|
metadata: {
|
||||||
|
...document.metadata,
|
||||||
|
error: undefined,
|
||||||
|
statusCode: undefined,
|
||||||
|
pageError: document.metadata.error,
|
||||||
|
pageStatusCode: document.metadata.statusCode,
|
||||||
|
screenshot: document.screenshot,
|
||||||
|
},
|
||||||
|
actions: document.actions ,
|
||||||
|
warning: document.warning,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -1,19 +0,0 @@
|
||||||
import { WebScraperDataProvider } from "./scraper/WebScraper";
|
|
||||||
|
|
||||||
async function example() {
|
|
||||||
const example = new WebScraperDataProvider();
|
|
||||||
|
|
||||||
await example.setOptions({
|
|
||||||
jobId: "TEST",
|
|
||||||
mode: "crawl",
|
|
||||||
urls: ["https://mendable.ai"],
|
|
||||||
crawlerOptions: {},
|
|
||||||
});
|
|
||||||
const docs = await example.getDocuments(false);
|
|
||||||
docs.map((doc) => {
|
|
||||||
console.log(doc.metadata.sourceURL);
|
|
||||||
});
|
|
||||||
console.log(docs.length);
|
|
||||||
}
|
|
||||||
|
|
||||||
// example();
|
|
|
@ -6,28 +6,24 @@ import bodyParser from "body-parser";
|
||||||
import cors from "cors";
|
import cors from "cors";
|
||||||
import { getScrapeQueue } from "./services/queue-service";
|
import { getScrapeQueue } from "./services/queue-service";
|
||||||
import { v0Router } from "./routes/v0";
|
import { v0Router } from "./routes/v0";
|
||||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
|
||||||
import os from "os";
|
import os from "os";
|
||||||
import { Logger } from "./lib/logger";
|
import { logger } from "./lib/logger";
|
||||||
import { adminRouter } from "./routes/admin";
|
import { adminRouter } from "./routes/admin";
|
||||||
import { ScrapeEvents } from "./lib/scrape-events";
|
|
||||||
import http from 'node:http';
|
import http from 'node:http';
|
||||||
import https from 'node:https';
|
import https from 'node:https';
|
||||||
import CacheableLookup from 'cacheable-lookup';
|
import CacheableLookup from 'cacheable-lookup';
|
||||||
import { v1Router } from "./routes/v1";
|
import { v1Router } from "./routes/v1";
|
||||||
import expressWs from "express-ws";
|
import expressWs from "express-ws";
|
||||||
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
|
|
||||||
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||||
import { ZodError } from "zod";
|
import { ZodError } from "zod";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import dns from 'node:dns';
|
|
||||||
|
|
||||||
const { createBullBoard } = require("@bull-board/api");
|
const { createBullBoard } = require("@bull-board/api");
|
||||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||||
const { ExpressAdapter } = require("@bull-board/express");
|
const { ExpressAdapter } = require("@bull-board/express");
|
||||||
|
|
||||||
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
||||||
Logger.info(`Number of CPUs: ${numCPUs} available`);
|
logger.info(`Number of CPUs: ${numCPUs} available`);
|
||||||
|
|
||||||
const cacheable = new CacheableLookup()
|
const cacheable = new CacheableLookup()
|
||||||
|
|
||||||
|
@ -55,7 +51,6 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
||||||
serverAdapter: serverAdapter,
|
serverAdapter: serverAdapter,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
app.use(
|
app.use(
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
||||||
serverAdapter.getRouter()
|
serverAdapter.getRouter()
|
||||||
|
@ -78,15 +73,10 @@ app.use(adminRouter);
|
||||||
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||||
const HOST = process.env.HOST ?? "localhost";
|
const HOST = process.env.HOST ?? "localhost";
|
||||||
|
|
||||||
// HyperDX OpenTelemetry
|
|
||||||
if (process.env.ENV === "production") {
|
|
||||||
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
|
|
||||||
}
|
|
||||||
|
|
||||||
function startServer(port = DEFAULT_PORT) {
|
function startServer(port = DEFAULT_PORT) {
|
||||||
const server = app.listen(Number(port), HOST, () => {
|
const server = app.listen(Number(port), HOST, () => {
|
||||||
Logger.info(`Worker ${process.pid} listening on port ${port}`);
|
logger.info(`Worker ${process.pid} listening on port ${port}`);
|
||||||
Logger.info(
|
logger.info(
|
||||||
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
@ -103,7 +93,6 @@ app.get(`/serverHealthCheck`, async (req, res) => {
|
||||||
const [waitingJobs] = await Promise.all([
|
const [waitingJobs] = await Promise.all([
|
||||||
scrapeQueue.getWaitingCount(),
|
scrapeQueue.getWaitingCount(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const noWaitingJobs = waitingJobs === 0;
|
const noWaitingJobs = waitingJobs === 0;
|
||||||
// 200 if no waiting jobs, 500 if there are waiting jobs
|
// 200 if no waiting jobs, 500 if there are waiting jobs
|
||||||
return res.status(noWaitingJobs ? 200 : 500).json({
|
return res.status(noWaitingJobs ? 200 : 500).json({
|
||||||
|
@ -111,7 +100,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -140,7 +129,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
||||||
// Re-check the waiting jobs count after the timeout
|
// Re-check the waiting jobs count after the timeout
|
||||||
waitingJobsCount = await getWaitingJobsCount();
|
waitingJobsCount = await getWaitingJobsCount();
|
||||||
if (waitingJobsCount >= treshold) {
|
if (waitingJobsCount >= treshold) {
|
||||||
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
|
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL!;
|
||||||
const message = {
|
const message = {
|
||||||
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
|
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
|
||||||
timeout / 60000
|
timeout / 60000
|
||||||
|
@ -156,14 +145,14 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
Logger.error("Failed to send Slack notification");
|
logger.error("Failed to send Slack notification");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}, timeout);
|
}, timeout);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.debug(error);
|
logger.debug(error);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -178,7 +167,7 @@ app.get("/is-production", (req, res) => {
|
||||||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
|
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
|
||||||
if (err instanceof ZodError) {
|
if (err instanceof ZodError) {
|
||||||
if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) {
|
if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) {
|
||||||
Logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
|
logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
|
||||||
}
|
}
|
||||||
|
|
||||||
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
|
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
|
||||||
|
@ -206,11 +195,11 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||||
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
|
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
|
||||||
});
|
});
|
||||||
|
|
||||||
Logger.info(`Worker ${process.pid} started`);
|
logger.info(`Worker ${process.pid} started`);
|
||||||
|
|
||||||
// const sq = getScrapeQueue();
|
// const sq = getScrapeQueue();
|
||||||
|
|
||||||
|
|
|
@ -4,19 +4,19 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
||||||
|
|
||||||
import { generateOpenAICompletions } from "./models";
|
import { generateOpenAICompletions } from "./models";
|
||||||
import { Document, ExtractorOptions } from "../entities";
|
import { Document, ExtractorOptions } from "../entities";
|
||||||
import { Logger } from "../logger";
|
import { logger } from "../logger";
|
||||||
|
|
||||||
// Generate completion using OpenAI
|
// Generate completion using OpenAI
|
||||||
export async function generateCompletions(
|
export async function generateCompletions(
|
||||||
documents: Document[],
|
documents: Document[],
|
||||||
extractionOptions: ExtractorOptions,
|
extractionOptions: ExtractorOptions | undefined,
|
||||||
mode: "markdown" | "raw-html"
|
mode: "markdown" | "raw-html"
|
||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
// const schema = zodToJsonSchema(options.schema)
|
// const schema = zodToJsonSchema(options.schema)
|
||||||
|
|
||||||
const schema = extractionOptions.extractionSchema;
|
const schema = extractionOptions?.extractionSchema;
|
||||||
const systemPrompt = extractionOptions.extractionPrompt;
|
const systemPrompt = extractionOptions?.extractionPrompt;
|
||||||
const prompt = extractionOptions.userPrompt;
|
const prompt = extractionOptions?.userPrompt;
|
||||||
|
|
||||||
const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider
|
const switchVariable = "openAI"; // Placeholder, want to think more about how we abstract the model provider
|
||||||
|
|
||||||
|
@ -51,7 +51,7 @@ export async function generateCompletions(
|
||||||
|
|
||||||
return completionResult;
|
return completionResult;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error generating completions: ${error}`);
|
logger.error(`Error generating completions: ${error}`);
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
|
|
@ -95,7 +95,7 @@ export async function generateOpenAICompletions({
|
||||||
|
|
||||||
try {
|
try {
|
||||||
llmExtraction = JSON.parse(
|
llmExtraction = JSON.parse(
|
||||||
jsonCompletion.choices[0].message.content.trim()
|
(jsonCompletion.choices[0].message.content ?? "").trim()
|
||||||
);
|
);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
throw new Error("Invalid JSON");
|
throw new Error("Invalid JSON");
|
||||||
|
|
|
@ -3,7 +3,7 @@ export async function batchProcess<T>(
|
||||||
batchSize: number,
|
batchSize: number,
|
||||||
asyncFunction: (item: T, index: number) => Promise<void>
|
asyncFunction: (item: T, index: number) => Promise<void>
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
const batches = [];
|
const batches: T[][] = [];
|
||||||
for (let i = 0; i < array.length; i += batchSize) {
|
for (let i = 0; i < array.length; i += batchSize) {
|
||||||
const batch = array.slice(i, i + batchSize);
|
const batch = array.slice(i, i + batchSize);
|
||||||
batches.push(batch);
|
batches.push(batch);
|
||||||
|
|
|
@ -1,13 +1,16 @@
|
||||||
|
import { InternalOptions } from "../scraper/scrapeURL";
|
||||||
|
import { ScrapeOptions } from "../controllers/v1/types";
|
||||||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||||
import { redisConnection } from "../services/queue-service";
|
import { redisConnection } from "../services/queue-service";
|
||||||
import { Logger } from "./logger";
|
import { logger } from "./logger";
|
||||||
|
|
||||||
export type StoredCrawl = {
|
export type StoredCrawl = {
|
||||||
originUrl?: string;
|
originUrl?: string;
|
||||||
crawlerOptions: any;
|
crawlerOptions: any;
|
||||||
pageOptions: any;
|
scrapeOptions: Omit<ScrapeOptions, "timeout">;
|
||||||
|
internalOptions: InternalOptions;
|
||||||
team_id: string;
|
team_id: string;
|
||||||
plan: string;
|
plan?: string;
|
||||||
robots?: string;
|
robots?: string;
|
||||||
cancelled?: boolean;
|
cancelled?: boolean;
|
||||||
createdAt: number;
|
createdAt: number;
|
||||||
|
@ -100,7 +103,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
||||||
urlO.hash = "";
|
urlO.hash = "";
|
||||||
url = urlO.href;
|
url = urlO.href;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||||
}
|
}
|
||||||
|
|
||||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||||
|
@ -117,7 +120,7 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
|
||||||
urlO.hash = "";
|
urlO.hash = "";
|
||||||
return urlO.href;
|
return urlO.href;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||||
}
|
}
|
||||||
|
|
||||||
return url;
|
return url;
|
||||||
|
@ -131,7 +134,7 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
|
||||||
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
jobId: id,
|
jobId: id,
|
||||||
initialUrl: sc.originUrl,
|
initialUrl: sc.originUrl!,
|
||||||
includes: sc.crawlerOptions?.includes ?? [],
|
includes: sc.crawlerOptions?.includes ?? [],
|
||||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
import type { Document as V1Document } from "../controllers/v1/types";
|
||||||
|
|
||||||
export interface Progress {
|
export interface Progress {
|
||||||
current: number;
|
current: number;
|
||||||
total: number;
|
total: number;
|
||||||
|
@ -129,7 +131,8 @@ export class Document {
|
||||||
provider?: string;
|
provider?: string;
|
||||||
warning?: string;
|
warning?: string;
|
||||||
actions?: {
|
actions?: {
|
||||||
screenshots: string[];
|
screenshots?: string[];
|
||||||
|
scrapes?: ScrapeActionContent[];
|
||||||
}
|
}
|
||||||
|
|
||||||
index?: number;
|
index?: number;
|
||||||
|
|
|
@ -5,7 +5,7 @@ import "../services/sentry"
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
|
|
||||||
import dotenv from 'dotenv';
|
import dotenv from 'dotenv';
|
||||||
import { Logger } from './logger';
|
import { logger } from './logger';
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
// TODO: add a timeout to the Go parser
|
// TODO: add a timeout to the Go parser
|
||||||
|
@ -40,7 +40,7 @@ class GoMarkdownConverter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function parseMarkdown(html: string): Promise<string> {
|
export async function parseMarkdown(html: string | null | undefined): Promise<string> {
|
||||||
if (!html) {
|
if (!html) {
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
@ -52,12 +52,12 @@ export async function parseMarkdown(html: string): Promise<string> {
|
||||||
|
|
||||||
markdownContent = processMultiLineLinks(markdownContent);
|
markdownContent = processMultiLineLinks(markdownContent);
|
||||||
markdownContent = removeSkipToContentLinks(markdownContent);
|
markdownContent = removeSkipToContentLinks(markdownContent);
|
||||||
Logger.info(`HTML to Markdown conversion using Go parser successful`);
|
logger.info(`HTML to Markdown conversion using Go parser successful`);
|
||||||
return markdownContent;
|
return markdownContent;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
|
logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback to TurndownService if Go parser fails or is not enabled
|
// Fallback to TurndownService if Go parser fails or is not enabled
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { redisConnection } from "../../src/services/queue-service";
|
import { redisConnection } from "../../src/services/queue-service";
|
||||||
import { PlanType } from "../../src/types";
|
import { PlanType } from "../../src/types";
|
||||||
import { Logger } from "./logger";
|
import { logger } from "./logger";
|
||||||
|
|
||||||
const SET_KEY_PREFIX = "limit_team_id:";
|
const SET_KEY_PREFIX = "limit_team_id:";
|
||||||
export async function addJobPriority(team_id, job_id) {
|
export async function addJobPriority(team_id, job_id) {
|
||||||
|
@ -13,7 +13,7 @@ export async function addJobPriority(team_id, job_id) {
|
||||||
// This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
|
// This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
|
||||||
await redisConnection.expire(setKey, 60);
|
await redisConnection.expire(setKey, 60);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
|
logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ export async function deleteJobPriority(team_id, job_id) {
|
||||||
// remove job_id from the set
|
// remove job_id from the set
|
||||||
await redisConnection.srem(setKey, job_id);
|
await redisConnection.srem(setKey, job_id);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
|
logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -33,7 +33,7 @@ export async function getJobPriority({
|
||||||
team_id,
|
team_id,
|
||||||
basePriority = 10,
|
basePriority = 10,
|
||||||
}: {
|
}: {
|
||||||
plan: PlanType;
|
plan: PlanType | undefined;
|
||||||
team_id: string;
|
team_id: string;
|
||||||
basePriority?: number;
|
basePriority?: number;
|
||||||
}): Promise<number> {
|
}): Promise<number> {
|
||||||
|
@ -95,7 +95,7 @@ export async function getJobPriority({
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
Logger.error(
|
logger.error(
|
||||||
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
|
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
|
||||||
);
|
);
|
||||||
return basePriority;
|
return basePriority;
|
||||||
|
|
|
@ -1,42 +0,0 @@
|
||||||
// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
|
|
||||||
|
|
||||||
// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
|
|
||||||
|
|
||||||
// const scrapInBatches = async (
|
|
||||||
// urls: string[],
|
|
||||||
// batchSize: number,
|
|
||||||
// delayMs: number
|
|
||||||
// ) => {
|
|
||||||
// let successCount = 0;
|
|
||||||
// let errorCount = 0;
|
|
||||||
|
|
||||||
// for (let i = 0; i < urls.length; i += batchSize) {
|
|
||||||
// const batch = urls
|
|
||||||
// .slice(i, i + batchSize)
|
|
||||||
// .map((url) => scrapWithFireEngine(url));
|
|
||||||
// try {
|
|
||||||
// const results = await Promise.all(batch);
|
|
||||||
// results.forEach((data, index) => {
|
|
||||||
// if (data.trim() === "") {
|
|
||||||
// errorCount++;
|
|
||||||
// } else {
|
|
||||||
// successCount++;
|
|
||||||
// console.log(
|
|
||||||
// `Scraping result ${i + index + 1}:`,
|
|
||||||
// data.trim().substring(0, 20) + "..."
|
|
||||||
// );
|
|
||||||
// }
|
|
||||||
// });
|
|
||||||
// } catch (error) {
|
|
||||||
// console.error("Error during scraping:", error);
|
|
||||||
// }
|
|
||||||
// await delay(delayMs);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// console.log(`Total successful scrapes: ${successCount}`);
|
|
||||||
// console.log(`Total errored scrapes: ${errorCount}`);
|
|
||||||
// };
|
|
||||||
// function run() {
|
|
||||||
// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
|
|
||||||
// scrapInBatches(urls, 10, 1000);
|
|
||||||
// }
|
|
|
@ -1,57 +1,82 @@
|
||||||
|
import * as winston from "winston";
|
||||||
|
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
|
import Transport from "winston-transport";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
enum LogLevel {
|
const logFormat = winston.format.printf(info =>
|
||||||
NONE = 'NONE', // No logs will be output.
|
`${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${info.level.includes("error") || info.level.includes("warn") ? JSON.stringify(
|
||||||
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
|
info.metadata,
|
||||||
WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors.
|
(_, value) => {
|
||||||
INFO = 'INFO', // For logging informational messages that highlight the progress of the application.
|
if (value instanceof Error) {
|
||||||
DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging.
|
return {
|
||||||
TRACE = 'TRACE' // For logging more detailed information than the DEBUG level.
|
...value,
|
||||||
}
|
name: value.name,
|
||||||
export class Logger {
|
message: value.message,
|
||||||
static colors = {
|
stack: value.stack,
|
||||||
ERROR: '\x1b[31m%s\x1b[0m', // Red
|
cause: value.cause,
|
||||||
WARN: '\x1b[33m%s\x1b[0m', // Yellow
|
}
|
||||||
INFO: '\x1b[34m%s\x1b[0m', // Blue
|
} else {
|
||||||
DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
|
return value;
|
||||||
TRACE: '\x1b[35m%s\x1b[0m' // Magenta
|
|
||||||
};
|
|
||||||
|
|
||||||
static log (message: string, level: LogLevel) {
|
|
||||||
const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE;
|
|
||||||
const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
|
|
||||||
const currentLevelIndex = levels.indexOf(logLevel);
|
|
||||||
const messageLevelIndex = levels.indexOf(level);
|
|
||||||
|
|
||||||
if (currentLevelIndex >= messageLevelIndex) {
|
|
||||||
const color = Logger.colors[level];
|
|
||||||
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
|
|
||||||
|
|
||||||
// const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
|
||||||
// if (useDbAuthentication) {
|
|
||||||
// save to supabase? another place?
|
|
||||||
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
static error(message: string | any) {
|
) : ""}`
|
||||||
Logger.log(message, LogLevel.ERROR);
|
)
|
||||||
|
|
||||||
|
export const logger = winston.createLogger({
|
||||||
|
level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug",
|
||||||
|
format: winston.format.json({
|
||||||
|
replacer(key, value) {
|
||||||
|
if (value instanceof Error) {
|
||||||
|
return {
|
||||||
|
...value,
|
||||||
|
name: value.name,
|
||||||
|
message: value.message,
|
||||||
|
stack: value.stack,
|
||||||
|
cause: value.cause,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
transports: [
|
||||||
|
new winston.transports.Console({
|
||||||
|
format: winston.format.combine(
|
||||||
|
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
||||||
|
winston.format.metadata({ fillExcept: ["message", "level", "timestamp"] }),
|
||||||
|
...(((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || (process.env.ENV !== "production")) ? [winston.format.colorize(), logFormat] : []),
|
||||||
|
),
|
||||||
|
}),
|
||||||
|
],
|
||||||
|
});
|
||||||
|
|
||||||
|
export type ArrayTransportOptions = Transport.TransportStreamOptions & {
|
||||||
|
array: any[];
|
||||||
|
scrapeId?: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
export class ArrayTransport extends Transport {
|
||||||
|
private array: any[];
|
||||||
|
private scrapeId?: string;
|
||||||
|
|
||||||
|
constructor(opts: ArrayTransportOptions) {
|
||||||
|
super(opts);
|
||||||
|
this.array = opts.array;
|
||||||
|
this.scrapeId = opts.scrapeId;
|
||||||
}
|
}
|
||||||
|
|
||||||
static warn(message: string) {
|
log(info, next) {
|
||||||
Logger.log(message, LogLevel.WARN);
|
setImmediate(() => {
|
||||||
|
this.emit("logged", info);
|
||||||
|
});
|
||||||
|
|
||||||
|
if (this.scrapeId !== undefined && info.scrapeId !== this.scrapeId) {
|
||||||
|
return next();
|
||||||
}
|
}
|
||||||
|
|
||||||
static info(message: string) {
|
this.array.push(info);
|
||||||
Logger.log(message, LogLevel.INFO);
|
|
||||||
}
|
|
||||||
|
|
||||||
static debug(message: string) {
|
next();
|
||||||
Logger.log(message, LogLevel.DEBUG);
|
|
||||||
}
|
|
||||||
|
|
||||||
static trace(message: string) {
|
|
||||||
Logger.log(message, LogLevel.TRACE);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -1,4 +1,4 @@
|
||||||
import { Logger } from "./logger";
|
import { logger } from "./logger";
|
||||||
|
|
||||||
export function performCosineSimilarity(links: string[], searchQuery: string) {
|
export function performCosineSimilarity(links: string[], searchQuery: string) {
|
||||||
try {
|
try {
|
||||||
|
@ -40,7 +40,7 @@ export function performCosineSimilarity(links: string[], searchQuery: string) {
|
||||||
links = a.map((item) => item.link);
|
links = a.map((item) => item.link);
|
||||||
return links;
|
return links;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error performing cosine similarity: ${error}`);
|
logger.error(`Error performing cosine similarity: ${error}`);
|
||||||
return links;
|
return links;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
import { Job } from "bullmq";
|
import { Job } from "bullmq";
|
||||||
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
|
||||||
import { supabase_service as supabase } from "../services/supabase";
|
import { supabase_service as supabase } from "../services/supabase";
|
||||||
import { Logger } from "./logger";
|
import { logger } from "./logger";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
|
import { Engine } from "../scraper/scrapeURL/engines";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export type ScrapeErrorEvent = {
|
export type ScrapeErrorEvent = {
|
||||||
|
@ -15,7 +15,7 @@ export type ScrapeScrapeEvent = {
|
||||||
type: "scrape",
|
type: "scrape",
|
||||||
url: string,
|
url: string,
|
||||||
worker?: string,
|
worker?: string,
|
||||||
method: (typeof baseScrapers)[number],
|
method: Engine,
|
||||||
result: null | {
|
result: null | {
|
||||||
success: boolean,
|
success: boolean,
|
||||||
response_code?: number,
|
response_code?: number,
|
||||||
|
@ -49,7 +49,7 @@ export class ScrapeEvents {
|
||||||
}).select().single();
|
}).select().single();
|
||||||
return (result.data as any).id;
|
return (result.data as any).id;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
// Logger.error(`Error inserting scrape event: ${error}`);
|
// logger.error(`Error inserting scrape event: ${error}`);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -69,7 +69,7 @@ export class ScrapeEvents {
|
||||||
}
|
}
|
||||||
}).eq("id", logId);
|
}).eq("id", logId);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error updating scrape result: ${error}`);
|
logger.error(`Error updating scrape result: ${error}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -81,7 +81,7 @@ export class ScrapeEvents {
|
||||||
worker: process.env.FLY_MACHINE_ID,
|
worker: process.env.FLY_MACHINE_ID,
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error logging job event: ${error}`);
|
logger.error(`Error logging job event: ${error}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import { supabase_service } from "../services/supabase";
|
import { supabase_service } from "../services/supabase";
|
||||||
import { Logger } from "./logger";
|
import { logger } from "./logger";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -37,7 +37,7 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
|
||||||
.in("job_id", jobIds);
|
.in("job_id", jobIds);
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
Logger.error(`Error in supabaseGetJobsById: ${error}`);
|
logger.error(`Error in supabaseGetJobsById: ${error}`);
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
@ -61,7 +61,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
|
||||||
.eq("crawl_id", crawlId)
|
.eq("crawl_id", crawlId)
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
|
logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,30 +1,25 @@
|
||||||
import { AuthResponse } from "../../src/types";
|
import { AuthResponse } from "../../src/types";
|
||||||
import { Logger } from "./logger";
|
import { logger } from "./logger";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
let warningCount = 0;
|
let warningCount = 0;
|
||||||
|
|
||||||
export function withAuth<T extends AuthResponse, U extends any[]>(
|
export function withAuth<T, U extends any[]>(
|
||||||
originalFunction: (...args: U) => Promise<T>
|
originalFunction: (...args: U) => Promise<T>,
|
||||||
|
mockSuccess: T,
|
||||||
) {
|
) {
|
||||||
return async function (...args: U): Promise<T> {
|
return async function (...args: U): Promise<T> {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||||
if (!useDbAuthentication) {
|
if (!useDbAuthentication) {
|
||||||
if (warningCount < 5) {
|
if (warningCount < 5) {
|
||||||
Logger.warn("You're bypassing authentication");
|
logger.warn("You're bypassing authentication");
|
||||||
warningCount++;
|
warningCount++;
|
||||||
}
|
}
|
||||||
return { success: true } as T;
|
return { success: true } as T;
|
||||||
} else {
|
} else {
|
||||||
try {
|
|
||||||
return await originalFunction(...args);
|
return await originalFunction(...args);
|
||||||
} catch (error) {
|
|
||||||
Sentry.captureException(error);
|
|
||||||
Logger.error(`Error in withAuth function: ${error}`);
|
|
||||||
return { success: false, error: error.message } as T;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,151 +1,127 @@
|
||||||
import { Job } from "bullmq";
|
import { Job } from "bullmq";
|
||||||
import {
|
import {
|
||||||
CrawlResult,
|
|
||||||
WebScraperOptions,
|
WebScraperOptions,
|
||||||
RunWebScraperParams,
|
RunWebScraperParams,
|
||||||
RunWebScraperResult,
|
RunWebScraperResult,
|
||||||
} from "../types";
|
} from "../types";
|
||||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
|
||||||
import { DocumentUrl, Progress } from "../lib/entities";
|
|
||||||
import { billTeam } from "../services/billing/credit_billing";
|
import { billTeam } from "../services/billing/credit_billing";
|
||||||
import { Document } from "../lib/entities";
|
import { Document } from "../controllers/v1/types";
|
||||||
import { supabase_service } from "../services/supabase";
|
import { supabase_service } from "../services/supabase";
|
||||||
import { Logger } from "../lib/logger";
|
import { logger } from "../lib/logger";
|
||||||
import { ScrapeEvents } from "../lib/scrape-events";
|
import { ScrapeEvents } from "../lib/scrape-events";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
|
import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
|
||||||
|
import { Engine } from "../scraper/scrapeURL/engines";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function startWebScraperPipeline({
|
export async function startWebScraperPipeline({
|
||||||
job,
|
job,
|
||||||
token,
|
token,
|
||||||
}: {
|
}: {
|
||||||
job: Job<WebScraperOptions>;
|
job: Job<WebScraperOptions> & { id: string };
|
||||||
token: string;
|
token: string;
|
||||||
}) {
|
}) {
|
||||||
let partialDocs: Document[] = [];
|
|
||||||
return (await runWebScraper({
|
return (await runWebScraper({
|
||||||
url: job.data.url,
|
url: job.data.url,
|
||||||
mode: job.data.mode,
|
mode: job.data.mode,
|
||||||
crawlerOptions: job.data.crawlerOptions,
|
scrapeOptions: {
|
||||||
extractorOptions: job.data.extractorOptions,
|
...job.data.scrapeOptions,
|
||||||
pageOptions: {
|
|
||||||
...job.data.pageOptions,
|
|
||||||
...(job.data.crawl_id ? ({
|
...(job.data.crawl_id ? ({
|
||||||
includeRawHtml: true,
|
formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
|
||||||
}): {}),
|
}): {}),
|
||||||
},
|
},
|
||||||
inProgress: (progress) => {
|
internalOptions: job.data.internalOptions,
|
||||||
Logger.debug(`🐂 Job in progress ${job.id}`);
|
// onSuccess: (result, mode) => {
|
||||||
if (progress.currentDocument) {
|
// logger.debug(`🐂 Job completed ${job.id}`);
|
||||||
partialDocs.push(progress.currentDocument);
|
// saveJob(job, result, token, mode);
|
||||||
if (partialDocs.length > 50) {
|
// },
|
||||||
partialDocs = partialDocs.slice(-50);
|
// onError: (error) => {
|
||||||
}
|
// logger.error(`🐂 Job failed ${job.id}`);
|
||||||
// job.updateProgress({ ...progress, partialDocs: partialDocs });
|
// ScrapeEvents.logJobEvent(job, "failed");
|
||||||
}
|
// },
|
||||||
},
|
|
||||||
onSuccess: (result, mode) => {
|
|
||||||
Logger.debug(`🐂 Job completed ${job.id}`);
|
|
||||||
saveJob(job, result, token, mode);
|
|
||||||
},
|
|
||||||
onError: (error) => {
|
|
||||||
Logger.error(`🐂 Job failed ${job.id}`);
|
|
||||||
ScrapeEvents.logJobEvent(job, "failed");
|
|
||||||
job.moveToFailed(error, token, false);
|
|
||||||
},
|
|
||||||
team_id: job.data.team_id,
|
team_id: job.data.team_id,
|
||||||
bull_job_id: job.id.toString(),
|
bull_job_id: job.id.toString(),
|
||||||
priority: job.opts.priority,
|
priority: job.opts.priority,
|
||||||
is_scrape: job.data.is_scrape ?? false,
|
is_scrape: job.data.is_scrape ?? false,
|
||||||
})) as { success: boolean; message: string; docs: Document[] };
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function runWebScraper({
|
export async function runWebScraper({
|
||||||
url,
|
url,
|
||||||
mode,
|
mode,
|
||||||
crawlerOptions,
|
scrapeOptions,
|
||||||
pageOptions,
|
internalOptions,
|
||||||
extractorOptions,
|
// onSuccess,
|
||||||
inProgress,
|
// onError,
|
||||||
onSuccess,
|
|
||||||
onError,
|
|
||||||
team_id,
|
team_id,
|
||||||
bull_job_id,
|
bull_job_id,
|
||||||
priority,
|
priority,
|
||||||
is_scrape=false,
|
is_scrape=false,
|
||||||
}: RunWebScraperParams): Promise<RunWebScraperResult> {
|
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
|
||||||
|
let response: ScrapeUrlResponse | undefined = undefined;
|
||||||
|
let engines: EngineResultsTracker = {};
|
||||||
try {
|
try {
|
||||||
const provider = new WebScraperDataProvider();
|
response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
|
||||||
if (mode === "crawl") {
|
if (!response.success) {
|
||||||
await provider.setOptions({
|
if (response.error instanceof Error) {
|
||||||
jobId: bull_job_id,
|
throw response.error;
|
||||||
mode: mode,
|
|
||||||
urls: [url],
|
|
||||||
extractorOptions,
|
|
||||||
crawlerOptions: crawlerOptions,
|
|
||||||
pageOptions: pageOptions,
|
|
||||||
bullJobId: bull_job_id,
|
|
||||||
priority,
|
|
||||||
});
|
|
||||||
} else {
|
} else {
|
||||||
await provider.setOptions({
|
throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error));
|
||||||
jobId: bull_job_id,
|
|
||||||
mode: mode,
|
|
||||||
urls: url.split(","),
|
|
||||||
extractorOptions,
|
|
||||||
crawlerOptions: crawlerOptions,
|
|
||||||
pageOptions: pageOptions,
|
|
||||||
priority,
|
|
||||||
teamId: team_id
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
|
||||||
inProgress(progress);
|
|
||||||
})) as Document[];
|
|
||||||
|
|
||||||
if (docs.length === 0) {
|
|
||||||
return {
|
|
||||||
success: true,
|
|
||||||
message: "No pages found",
|
|
||||||
docs: [],
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove docs with empty content
|
|
||||||
const filteredDocs = crawlerOptions?.returnOnlyUrls
|
|
||||||
? docs.map((doc) => {
|
|
||||||
if (doc.metadata.sourceURL) {
|
|
||||||
return { url: doc.metadata.sourceURL };
|
|
||||||
}
|
|
||||||
})
|
|
||||||
: docs;
|
|
||||||
|
|
||||||
if(is_scrape === false) {
|
if(is_scrape === false) {
|
||||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||||
if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
|
if (scrapeOptions.extract) {
|
||||||
creditsToBeBilled = 5;
|
creditsToBeBilled = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
|
billTeam(team_id, undefined, creditsToBeBilled).catch(error => {
|
||||||
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
|
logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// This is where the returnvalue from the job is set
|
// This is where the returnvalue from the job is set
|
||||||
onSuccess(filteredDocs, mode);
|
// onSuccess(response.document, mode);
|
||||||
|
|
||||||
// this return doesn't matter too much for the job completion result
|
engines = response.engines;
|
||||||
return { success: true, message: "", docs: filteredDocs };
|
return response;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
onError(error);
|
engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));
|
||||||
return { success: false, message: error.message, docs: [] };
|
|
||||||
|
if (response !== undefined) {
|
||||||
|
return {
|
||||||
|
...response,
|
||||||
|
success: false,
|
||||||
|
error,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
|
||||||
|
}
|
||||||
|
// onError(error);
|
||||||
|
} finally {
|
||||||
|
const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];
|
||||||
|
|
||||||
|
for (const engine of engineOrder) {
|
||||||
|
const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
|
||||||
|
ScrapeEvents.insert(bull_job_id, {
|
||||||
|
type: "scrape",
|
||||||
|
url,
|
||||||
|
method: engine,
|
||||||
|
result: {
|
||||||
|
success: result.state === "success",
|
||||||
|
response_code: (result.state === "success" ? result.result.statusCode : undefined),
|
||||||
|
response_size: (result.state === "success" ? result.result.html.length : undefined),
|
||||||
|
error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
|
||||||
|
time_taken: result.finishedAt - result.startedAt,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
|
const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
|
||||||
try {
|
try {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||||
if (useDbAuthentication) {
|
if (useDbAuthentication) {
|
||||||
|
@ -173,6 +149,6 @@ const saveJob = async (job: Job, result: any, token: string, mode: string) => {
|
||||||
}
|
}
|
||||||
ScrapeEvents.logJobEvent(job, "completed");
|
ScrapeEvents.logJobEvent(job, "completed");
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`🐂 Failed to update job status: ${error}`);
|
logger.error(`🐂 Failed to update job status: ${error}`);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -6,8 +6,8 @@ import {
|
||||||
cleanBefore24hCompleteJobsController,
|
cleanBefore24hCompleteJobsController,
|
||||||
queuesController,
|
queuesController,
|
||||||
} from "../controllers/v0/admin/queue";
|
} from "../controllers/v0/admin/queue";
|
||||||
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
|
|
||||||
import { wrap } from "./v1";
|
import { wrap } from "./v1";
|
||||||
|
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
|
||||||
|
|
||||||
export const adminRouter = express.Router();
|
export const adminRouter = express.Router();
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,7 @@ import expressWs from "express-ws";
|
||||||
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
|
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
|
||||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||||
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
|
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
|
||||||
import { Logger } from "../lib/logger";
|
import { logger } from "../lib/logger";
|
||||||
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
||||||
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
|
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
|
||||||
import { batchScrapeController } from "../controllers/v1/batch-scrape";
|
import { batchScrapeController } from "../controllers/v1/batch-scrape";
|
||||||
|
@ -32,10 +32,12 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
|
||||||
if (!minimum && req.body) {
|
if (!minimum && req.body) {
|
||||||
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
|
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
|
||||||
}
|
}
|
||||||
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
|
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum ?? 1);
|
||||||
|
if (chunk) {
|
||||||
req.acuc = chunk;
|
req.acuc = chunk;
|
||||||
|
}
|
||||||
if (!success) {
|
if (!success) {
|
||||||
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||||
if (!res.headersSent) {
|
if (!res.headersSent) {
|
||||||
return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
|
return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
|
||||||
}
|
}
|
||||||
|
@ -50,20 +52,27 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
|
||||||
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
||||||
return (req, res, next) => {
|
return (req, res, next) => {
|
||||||
(async () => {
|
(async () => {
|
||||||
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
|
const auth = await authenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
rateLimiterMode,
|
rateLimiterMode,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!success) {
|
if (!auth.success) {
|
||||||
if (!res.headersSent) {
|
if (!res.headersSent) {
|
||||||
return res.status(status).json({ success: false, error });
|
return res.status(auth.status).json({ success: false, error: auth.error });
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const { team_id, plan, chunk } = auth;
|
||||||
|
|
||||||
req.auth = { team_id, plan };
|
req.auth = { team_id, plan };
|
||||||
req.acuc = chunk;
|
req.acuc = chunk ?? undefined;
|
||||||
|
if (chunk) {
|
||||||
|
req.account = { remainingCredits: chunk.remaining_credits };
|
||||||
|
}
|
||||||
next();
|
next();
|
||||||
})()
|
})()
|
||||||
.catch(err => next(err));
|
.catch(err => next(err));
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
import { WebCrawler } from '../crawler';
|
import { WebCrawler } from '../crawler';
|
||||||
import axios from 'axios';
|
import axios from 'axios';
|
||||||
import robotsParser from 'robots-parser';
|
import robotsParser from 'robots-parser';
|
||||||
import { getAdjustedMaxDepth } from '../utils/maxDepthUtils';
|
|
||||||
|
|
||||||
jest.mock('axios');
|
jest.mock('axios');
|
||||||
jest.mock('robots-parser');
|
jest.mock('robots-parser');
|
||||||
|
@ -35,165 +34,6 @@ describe('WebCrawler', () => {
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => {
|
|
||||||
const initialUrl = 'http://example.com'; // Set initial URL for this test
|
|
||||||
const enteredMaxCrawledDepth = 2;
|
|
||||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
|
||||||
|
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
|
||||||
jobId: "TEST",
|
|
||||||
initialUrl: initialUrl,
|
|
||||||
includes: [],
|
|
||||||
excludes: [],
|
|
||||||
limit: 100,
|
|
||||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
|
||||||
});
|
|
||||||
|
|
||||||
// Mock sitemap fetching function to return controlled links
|
|
||||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
|
||||||
initialUrl, // depth 0
|
|
||||||
initialUrl + '/page1', // depth 1
|
|
||||||
initialUrl + '/page1/page2', // depth 2
|
|
||||||
initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
|
|
||||||
]);
|
|
||||||
|
|
||||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
|
||||||
expect(results).toEqual([
|
|
||||||
{ url: initialUrl, html: '' },
|
|
||||||
{ url: initialUrl + '/page1', html: '' },
|
|
||||||
{ url: initialUrl + '/page1/page2', html: '' }
|
|
||||||
]);
|
|
||||||
|
|
||||||
|
|
||||||
// Ensure that the link with depth 3 is not included
|
|
||||||
expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => {
|
|
||||||
const initialUrl = 'http://example.com'; // Set initial URL for this test
|
|
||||||
const enteredMaxCrawledDepth = 0;
|
|
||||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
|
||||||
|
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
|
||||||
jobId: "TEST",
|
|
||||||
initialUrl: initialUrl,
|
|
||||||
includes: [],
|
|
||||||
excludes: [],
|
|
||||||
limit: 100,
|
|
||||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
|
||||||
});
|
|
||||||
|
|
||||||
// Mock sitemap fetching function to return controlled links
|
|
||||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
|
||||||
initialUrl, // depth 0
|
|
||||||
initialUrl + '/page1', // depth 1
|
|
||||||
initialUrl + '/page1/page2', // depth 2
|
|
||||||
initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
|
|
||||||
]);
|
|
||||||
|
|
||||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
|
||||||
expect(results).toEqual([
|
|
||||||
{ url: initialUrl, html: '' },
|
|
||||||
]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => {
|
|
||||||
const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
|
|
||||||
const enteredMaxCrawledDepth = 1;
|
|
||||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
|
||||||
|
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
|
||||||
jobId: "TEST",
|
|
||||||
initialUrl: initialUrl,
|
|
||||||
includes: [],
|
|
||||||
excludes: [],
|
|
||||||
limit: 100,
|
|
||||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
|
||||||
});
|
|
||||||
|
|
||||||
// Mock sitemap fetching function to return controlled links
|
|
||||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
|
||||||
initialUrl, // depth 0
|
|
||||||
initialUrl + '/page2', // depth 1
|
|
||||||
initialUrl + '/page2/page3', // depth 2
|
|
||||||
initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
|
|
||||||
]);
|
|
||||||
|
|
||||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
|
||||||
expect(results).toEqual([
|
|
||||||
{ url: initialUrl, html: '' },
|
|
||||||
{ url: initialUrl + '/page2', html: '' }
|
|
||||||
]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 2 ', async () => {
|
|
||||||
const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
|
|
||||||
const enteredMaxCrawledDepth = 2;
|
|
||||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
|
||||||
|
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
|
||||||
jobId: "TEST",
|
|
||||||
initialUrl: initialUrl,
|
|
||||||
includes: [],
|
|
||||||
excludes: [],
|
|
||||||
limit: 100,
|
|
||||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
|
||||||
});
|
|
||||||
|
|
||||||
// Mock sitemap fetching function to return controlled links
|
|
||||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
|
||||||
initialUrl, // depth 0
|
|
||||||
initialUrl + '/page2', // depth 1
|
|
||||||
initialUrl + '/page2/page3', // depth 2
|
|
||||||
initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
|
|
||||||
]);
|
|
||||||
|
|
||||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
|
||||||
expect(results).toEqual([
|
|
||||||
{ url: initialUrl, html: '' },
|
|
||||||
{ url: initialUrl + '/page2', html: '' },
|
|
||||||
{ url: initialUrl + '/page2/page3', html: '' }
|
|
||||||
]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should handle allowBackwardCrawling option correctly', async () => {
|
|
||||||
const initialUrl = 'https://mendable.ai/blog';
|
|
||||||
|
|
||||||
// Setup the crawler with the specific test case options
|
|
||||||
const crawler = new WebCrawler({
|
|
||||||
jobId: "TEST",
|
|
||||||
initialUrl: initialUrl,
|
|
||||||
includes: [],
|
|
||||||
excludes: [],
|
|
||||||
limit: 100,
|
|
||||||
maxCrawledDepth: 3, // Example depth
|
|
||||||
allowBackwardCrawling: true
|
|
||||||
});
|
|
||||||
|
|
||||||
// Mock the sitemap fetching function to simulate backward crawling
|
|
||||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
|
||||||
initialUrl,
|
|
||||||
'https://mendable.ai', // backward link
|
|
||||||
initialUrl + '/page1',
|
|
||||||
initialUrl + '/page1/page2'
|
|
||||||
]);
|
|
||||||
|
|
||||||
const results = await crawler.start();
|
|
||||||
expect(results).toEqual([
|
|
||||||
{ url: initialUrl, html: '' },
|
|
||||||
{ url: 'https://mendable.ai', html: '' }, // Expect the backward link to be included
|
|
||||||
{ url: initialUrl + '/page1', html: '' },
|
|
||||||
{ url: initialUrl + '/page1/page2', html: '' }
|
|
||||||
]);
|
|
||||||
|
|
||||||
// Check that the backward link is included if allowBackwardCrawling is true
|
|
||||||
expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should respect the limit parameter by not returning more links than specified', async () => {
|
it('should respect the limit parameter by not returning more links than specified', async () => {
|
||||||
const initialUrl = 'http://example.com';
|
const initialUrl = 'http://example.com';
|
||||||
const limit = 2; // Set a limit for the number of links
|
const limit = 2; // Set a limit for the number of links
|
||||||
|
|
|
@ -1,37 +0,0 @@
|
||||||
import { scrapSingleUrl } from '../single_url';
|
|
||||||
import { PageOptions } from '../../../lib/entities';
|
|
||||||
|
|
||||||
|
|
||||||
jest.mock('../single_url', () => {
|
|
||||||
const originalModule = jest.requireActual('../single_url');
|
|
||||||
originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>');
|
|
||||||
|
|
||||||
return originalModule;
|
|
||||||
});
|
|
||||||
|
|
||||||
describe('scrapSingleUrl', () => {
|
|
||||||
it('should handle includeHtml option correctly', async () => {
|
|
||||||
const url = 'https://roastmywebsite.ai';
|
|
||||||
const pageOptionsWithHtml: PageOptions = { includeHtml: true };
|
|
||||||
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
|
|
||||||
|
|
||||||
const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
|
|
||||||
const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
|
|
||||||
|
|
||||||
expect(resultWithHtml.html).toBeDefined();
|
|
||||||
expect(resultWithoutHtml.html).toBeUndefined();
|
|
||||||
}, 10000);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should return a list of links on the firecrawl.ai page', async () => {
|
|
||||||
const url = 'https://flutterbricks.com';
|
|
||||||
const pageOptions: PageOptions = { includeHtml: true };
|
|
||||||
|
|
||||||
const result = await scrapSingleUrl("TEST", url, pageOptions);
|
|
||||||
|
|
||||||
// Check if the result contains a list of links
|
|
||||||
expect(result.linksOnPage).toBeDefined();
|
|
||||||
expect(Array.isArray(result.linksOnPage)).toBe(true);
|
|
||||||
expect(result.linksOnPage.length).toBeGreaterThan(0);
|
|
||||||
expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
|
|
||||||
}, 15000);
|
|
|
@ -2,13 +2,10 @@ import axios, { AxiosError } from "axios";
|
||||||
import cheerio, { load } from "cheerio";
|
import cheerio, { load } from "cheerio";
|
||||||
import { URL } from "url";
|
import { URL } from "url";
|
||||||
import { getLinksFromSitemap } from "./sitemap";
|
import { getLinksFromSitemap } from "./sitemap";
|
||||||
import async from "async";
|
|
||||||
import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
|
|
||||||
import { scrapSingleUrl } from "./single_url";
|
|
||||||
import robotsParser from "robots-parser";
|
import robotsParser from "robots-parser";
|
||||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||||
import { Logger } from "../../../src/lib/logger";
|
import { logger } from "../../../src/lib/logger";
|
||||||
import https from "https";
|
import https from "https";
|
||||||
export class WebCrawler {
|
export class WebCrawler {
|
||||||
private jobId: string;
|
private jobId: string;
|
||||||
|
@ -73,7 +70,7 @@ export class WebCrawler {
|
||||||
try {
|
try {
|
||||||
url = new URL(link.trim(), this.baseUrl);
|
url = new URL(link.trim(), this.baseUrl);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
|
logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const path = url.pathname;
|
const path = url.pathname;
|
||||||
|
@ -132,7 +129,7 @@ export class WebCrawler {
|
||||||
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
||||||
// Check if the link is disallowed by robots.txt
|
// Check if the link is disallowed by robots.txt
|
||||||
if (!isAllowed) {
|
if (!isAllowed) {
|
||||||
Logger.debug(`Link disallowed by robots.txt: ${link}`);
|
logger.debug(`Link disallowed by robots.txt: ${link}`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -161,7 +158,7 @@ export class WebCrawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
|
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
|
||||||
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||||
if (sitemapLinks.length > 0) {
|
if (sitemapLinks.length > 0) {
|
||||||
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
|
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
|
||||||
|
@ -170,115 +167,6 @@ export class WebCrawler {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public async start(
|
|
||||||
inProgress?: (progress: Progress) => void,
|
|
||||||
pageOptions?: PageOptions,
|
|
||||||
crawlerOptions?: CrawlerOptions,
|
|
||||||
concurrencyLimit: number = 5,
|
|
||||||
limit: number = 10000,
|
|
||||||
maxDepth: number = 10
|
|
||||||
): Promise<{ url: string, html: string }[]> {
|
|
||||||
|
|
||||||
Logger.debug(`Crawler starting with ${this.initialUrl}`);
|
|
||||||
// Fetch and parse robots.txt
|
|
||||||
try {
|
|
||||||
const txt = await this.getRobotsTxt();
|
|
||||||
this.importRobotsTxt(txt);
|
|
||||||
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
|
|
||||||
} catch (error) {
|
|
||||||
Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!crawlerOptions?.ignoreSitemap){
|
|
||||||
const sm = await this.tryGetSitemap();
|
|
||||||
if (sm !== null) {
|
|
||||||
return sm;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const urls = await this.crawlUrls(
|
|
||||||
[this.initialUrl],
|
|
||||||
pageOptions,
|
|
||||||
concurrencyLimit,
|
|
||||||
inProgress
|
|
||||||
);
|
|
||||||
|
|
||||||
if (
|
|
||||||
urls.length === 0 &&
|
|
||||||
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
|
|
||||||
) {
|
|
||||||
return [{ url: this.initialUrl, html: "" }];
|
|
||||||
}
|
|
||||||
|
|
||||||
// make sure to run include exclude here again
|
|
||||||
const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
|
|
||||||
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
|
|
||||||
}
|
|
||||||
|
|
||||||
private async crawlUrls(
|
|
||||||
urls: string[],
|
|
||||||
pageOptions: PageOptions,
|
|
||||||
concurrencyLimit: number,
|
|
||||||
inProgress?: (progress: Progress) => void,
|
|
||||||
): Promise<{ url: string, html: string }[]> {
|
|
||||||
const queue = async.queue(async (task: string, callback) => {
|
|
||||||
Logger.debug(`Crawling ${task}`);
|
|
||||||
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
|
|
||||||
if (callback && typeof callback === "function") {
|
|
||||||
callback();
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const newUrls = await this.crawl(task, pageOptions);
|
|
||||||
// add the initial url if not already added
|
|
||||||
// if (this.visited.size === 1) {
|
|
||||||
// let normalizedInitial = this.initialUrl;
|
|
||||||
// if (!normalizedInitial.endsWith("/")) {
|
|
||||||
// normalizedInitial = normalizedInitial + "/";
|
|
||||||
// }
|
|
||||||
// if (!newUrls.some(page => page.url === this.initialUrl)) {
|
|
||||||
// newUrls.push({ url: this.initialUrl, html: "" });
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
|
|
||||||
|
|
||||||
if (inProgress && newUrls.length > 0) {
|
|
||||||
inProgress({
|
|
||||||
current: this.crawledUrls.size,
|
|
||||||
total: Math.min(this.maxCrawledLinks, this.limit),
|
|
||||||
status: "SCRAPING",
|
|
||||||
currentDocumentUrl: newUrls[newUrls.length - 1].url,
|
|
||||||
});
|
|
||||||
} else if (inProgress) {
|
|
||||||
inProgress({
|
|
||||||
current: this.crawledUrls.size,
|
|
||||||
total: Math.min(this.maxCrawledLinks, this.limit),
|
|
||||||
status: "SCRAPING",
|
|
||||||
currentDocumentUrl: task,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
|
|
||||||
if (callback && typeof callback === "function") {
|
|
||||||
callback();
|
|
||||||
}
|
|
||||||
}, concurrencyLimit);
|
|
||||||
|
|
||||||
Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
|
|
||||||
queue.push(
|
|
||||||
urls.filter(
|
|
||||||
(url) =>
|
|
||||||
!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
|
|
||||||
),
|
|
||||||
(err) => {
|
|
||||||
if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
|
|
||||||
}
|
|
||||||
);
|
|
||||||
await queue.drain();
|
|
||||||
Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
|
|
||||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
|
||||||
}
|
|
||||||
|
|
||||||
public filterURL(href: string, url: string): string | null {
|
public filterURL(href: string, url: string): string | null {
|
||||||
let fullUrl = href;
|
let fullUrl = href;
|
||||||
if (!href.startsWith("http")) {
|
if (!href.startsWith("http")) {
|
||||||
|
@ -346,79 +234,9 @@ export class WebCrawler {
|
||||||
return links;
|
return links;
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
|
|
||||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
this.visited.add(url);
|
|
||||||
|
|
||||||
if (!url.startsWith("http")) {
|
|
||||||
url = "https://" + url;
|
|
||||||
}
|
|
||||||
if (url.endsWith("/")) {
|
|
||||||
url = url.slice(0, -1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
let content: string = "";
|
|
||||||
let pageStatusCode: number;
|
|
||||||
let pageError: string | undefined = undefined;
|
|
||||||
|
|
||||||
// If it is the first link, fetch with single url
|
|
||||||
if (this.visited.size === 1) {
|
|
||||||
const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
|
|
||||||
content = page.html ?? "";
|
|
||||||
pageStatusCode = page.metadata?.pageStatusCode;
|
|
||||||
pageError = page.metadata?.pageError || undefined;
|
|
||||||
} else {
|
|
||||||
const response = await axios.get(url, { timeout: axiosTimeout });
|
|
||||||
content = response.data ?? "";
|
|
||||||
pageStatusCode = response.status;
|
|
||||||
pageError = response.statusText != "OK" ? response.statusText : undefined;
|
|
||||||
}
|
|
||||||
|
|
||||||
const $ = load(content);
|
|
||||||
let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
|
|
||||||
|
|
||||||
// Add the initial URL to the list of links
|
|
||||||
if (this.visited.size === 1) {
|
|
||||||
links.push({ url, html: content, pageStatusCode, pageError });
|
|
||||||
}
|
|
||||||
|
|
||||||
links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
|
|
||||||
|
|
||||||
if (this.visited.size === 1) {
|
|
||||||
return links;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create a new list to return to avoid modifying the visited list
|
|
||||||
return links.filter((link) => !this.visited.has(link.url));
|
|
||||||
} catch (error) {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private isRobotsAllowed(url: string): boolean {
|
private isRobotsAllowed(url: string): boolean {
|
||||||
return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
|
return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
|
||||||
}
|
}
|
||||||
private normalizeCrawlUrl(url: string): string {
|
|
||||||
try{
|
|
||||||
const urlObj = new URL(url);
|
|
||||||
urlObj.searchParams.sort(); // Sort query parameters to normalize
|
|
||||||
return urlObj.toString();
|
|
||||||
} catch (error) {
|
|
||||||
return url;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private matchesIncludes(url: string): boolean {
|
|
||||||
if (this.includes.length === 0 || this.includes[0] == "") return true;
|
|
||||||
return this.includes.some((pattern) => new RegExp(pattern).test(url));
|
|
||||||
}
|
|
||||||
|
|
||||||
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
|
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
|
||||||
return this.excludes.some((pattern) => {
|
return this.excludes.some((pattern) => {
|
||||||
|
@ -503,7 +321,7 @@ export class WebCrawler {
|
||||||
const urlWithoutQuery = url.split('?')[0].toLowerCase();
|
const urlWithoutQuery = url.split('?')[0].toLowerCase();
|
||||||
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
|
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error processing URL in isFile: ${error}`);
|
logger.error(`Error processing URL in isFile: ${error}`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -524,7 +342,6 @@ export class WebCrawler {
|
||||||
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
|
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
|
||||||
const normalizeUrl = (url: string) => {
|
const normalizeUrl = (url: string) => {
|
||||||
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
|
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
|
||||||
|
@ -546,7 +363,7 @@ export class WebCrawler {
|
||||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
|
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
||||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||||
// ignore 404
|
// ignore 404
|
||||||
} else {
|
} else {
|
||||||
|
@ -565,7 +382,7 @@ export class WebCrawler {
|
||||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||||
// ignore 404
|
// ignore 404
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import { Logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
|
|
||||||
export async function handleCustomScraping(
|
export async function handleCustomScraping(
|
||||||
text: string,
|
text: string,
|
||||||
|
@ -6,7 +6,7 @@ export async function handleCustomScraping(
|
||||||
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
||||||
// Check for Readme Docs special case
|
// Check for Readme Docs special case
|
||||||
if (text.includes('<meta name="readme-deploy"') && !url.includes('developers.notion.com')) {
|
if (text.includes('<meta name="readme-deploy"') && !url.includes('developers.notion.com')) {
|
||||||
Logger.debug(
|
logger.debug(
|
||||||
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
||||||
);
|
);
|
||||||
return {
|
return {
|
||||||
|
@ -21,7 +21,7 @@ export async function handleCustomScraping(
|
||||||
|
|
||||||
// Check for Vanta security portals
|
// Check for Vanta security portals
|
||||||
if (text.includes('<link href="https://static.vanta.com')) {
|
if (text.includes('<link href="https://static.vanta.com')) {
|
||||||
Logger.debug(
|
logger.debug(
|
||||||
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
|
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
|
||||||
);
|
);
|
||||||
return {
|
return {
|
||||||
|
@ -36,7 +36,7 @@ export async function handleCustomScraping(
|
||||||
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
||||||
if (googleDriveMetaMatch) {
|
if (googleDriveMetaMatch) {
|
||||||
const url = googleDriveMetaMatch[1];
|
const url = googleDriveMetaMatch[1];
|
||||||
Logger.debug(`Google Drive PDF link detected: ${url}`);
|
logger.debug(`Google Drive PDF link detected: ${url}`);
|
||||||
|
|
||||||
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
|
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
|
||||||
if (fileIdMatch) {
|
if (fileIdMatch) {
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
// Timeout value exported for use by the scraper modules.
// NOTE(review): 15000 is presumably milliseconds (15 s) — confirm against the call sites.
export const universalTimeout = 15000;
|
|
|
@ -1,743 +0,0 @@
|
||||||
import {
|
|
||||||
Document,
|
|
||||||
ExtractorOptions,
|
|
||||||
PageOptions,
|
|
||||||
WebScraperOptions,
|
|
||||||
} from "../../lib/entities";
|
|
||||||
import { Progress } from "../../lib/entities";
|
|
||||||
import { scrapSingleUrl } from "./single_url";
|
|
||||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
|
||||||
import { WebCrawler } from "./crawler";
|
|
||||||
import { getValue, setValue } from "../../services/redis";
|
|
||||||
import { getImageDescription } from "./utils/imageDescription";
|
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
|
||||||
import {
|
|
||||||
replaceImgPathsWithAbsolutePaths,
|
|
||||||
replacePathsWithAbsolutePaths,
|
|
||||||
} from "./utils/replacePaths";
|
|
||||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
|
||||||
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
|
||||||
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
|
|
||||||
import { Logger } from "../../lib/logger";
|
|
||||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
|
||||||
|
|
||||||
export class WebScraperDataProvider {
|
|
||||||
private jobId: string;
|
|
||||||
private bullJobId: string;
|
|
||||||
private urls: string[] = [""];
|
|
||||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
|
||||||
private includes: string | string[];
|
|
||||||
private excludes: string | string[];
|
|
||||||
private maxCrawledLinks: number;
|
|
||||||
private maxCrawledDepth: number = 10;
|
|
||||||
private returnOnlyUrls: boolean;
|
|
||||||
private limit: number = 10000;
|
|
||||||
private concurrentRequests: number = 20;
|
|
||||||
private generateImgAltText: boolean = false;
|
|
||||||
private ignoreSitemap: boolean = false;
|
|
||||||
private pageOptions?: PageOptions;
|
|
||||||
private extractorOptions?: ExtractorOptions;
|
|
||||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
|
||||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
|
||||||
"gpt-4-turbo";
|
|
||||||
private crawlerMode: string = "default";
|
|
||||||
private allowBackwardCrawling: boolean = false;
|
|
||||||
private allowExternalContentLinks: boolean = false;
|
|
||||||
private priority?: number;
|
|
||||||
private teamId?: string;
|
|
||||||
|
|
||||||
authorize(): void {
|
|
||||||
throw new Error("Method not implemented.");
|
|
||||||
}
|
|
||||||
|
|
||||||
authorizeNango(): Promise<void> {
|
|
||||||
throw new Error("Method not implemented.");
|
|
||||||
}
|
|
||||||
|
|
||||||
private async convertUrlsToDocuments(
|
|
||||||
urls: string[],
|
|
||||||
inProgress?: (progress: Progress) => void,
|
|
||||||
allHtmls?: string[]
|
|
||||||
): Promise<Document[]> {
|
|
||||||
const totalUrls = urls.length;
|
|
||||||
let processedUrls = 0;
|
|
||||||
|
|
||||||
const results: (Document | null)[] = new Array(urls.length).fill(null);
|
|
||||||
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
|
|
||||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
|
||||||
await Promise.all(
|
|
||||||
batchUrls.map(async (url, index) => {
|
|
||||||
const existingHTML = allHtmls ? allHtmls[i + index] : "";
|
|
||||||
const result = await scrapSingleUrl(
|
|
||||||
this.jobId,
|
|
||||||
url,
|
|
||||||
this.pageOptions,
|
|
||||||
this.extractorOptions,
|
|
||||||
existingHTML,
|
|
||||||
this.priority,
|
|
||||||
this.teamId,
|
|
||||||
);
|
|
||||||
processedUrls++;
|
|
||||||
if (inProgress) {
|
|
||||||
inProgress({
|
|
||||||
current: processedUrls,
|
|
||||||
total: totalUrls,
|
|
||||||
status: "SCRAPING",
|
|
||||||
currentDocumentUrl: url,
|
|
||||||
currentDocument: { ...result, index: processedUrls },
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
results[i + index] = result;
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
return results.filter((result) => result !== null) as Document[];
|
|
||||||
}
|
|
||||||
|
|
||||||
async getDocuments(
|
|
||||||
useCaching: boolean = false,
|
|
||||||
inProgress?: (progress: Progress) => void
|
|
||||||
): Promise<Document[]> {
|
|
||||||
this.validateInitialUrl();
|
|
||||||
if (!useCaching) {
|
|
||||||
return this.processDocumentsWithoutCache(inProgress);
|
|
||||||
}
|
|
||||||
|
|
||||||
return this.processDocumentsWithCache(inProgress);
|
|
||||||
}
|
|
||||||
|
|
||||||
private validateInitialUrl(): void {
|
|
||||||
if (this.urls[0].trim() === "") {
|
|
||||||
throw new Error("Url is required");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Process documents without cache handling each mode
|
|
||||||
* @param inProgress inProgress
|
|
||||||
* @returns documents
|
|
||||||
*/
|
|
||||||
private async processDocumentsWithoutCache(
|
|
||||||
inProgress?: (progress: Progress) => void
|
|
||||||
): Promise<Document[]> {
|
|
||||||
switch (this.mode) {
|
|
||||||
case "crawl":
|
|
||||||
return this.handleCrawlMode(inProgress);
|
|
||||||
case "single_urls":
|
|
||||||
return this.handleSingleUrlsMode(inProgress);
|
|
||||||
case "sitemap":
|
|
||||||
return this.handleSitemapMode(inProgress);
|
|
||||||
default:
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private async cleanIrrelevantPath(links: string[]) {
|
|
||||||
return links.filter((link) => {
|
|
||||||
const normalizedInitialUrl = new URL(this.urls[0]);
|
|
||||||
const normalizedLink = new URL(link);
|
|
||||||
|
|
||||||
// Normalize the hostname to account for www and non-www versions
|
|
||||||
const initialHostname = normalizedInitialUrl.hostname.replace(
|
|
||||||
/^www\./,
|
|
||||||
""
|
|
||||||
);
|
|
||||||
const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
|
|
||||||
|
|
||||||
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
|
|
||||||
return (
|
|
||||||
linkHostname === initialHostname &&
|
|
||||||
normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
|
|
||||||
);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private async handleCrawlMode(
|
|
||||||
inProgress?: (progress: Progress) => void
|
|
||||||
): Promise<Document[]> {
|
|
||||||
let includes: string[];
|
|
||||||
if (Array.isArray(this.includes)) {
|
|
||||||
if (this.includes[0] != "") {
|
|
||||||
includes = this.includes;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
includes = this.includes.split(',');
|
|
||||||
}
|
|
||||||
|
|
||||||
let excludes: string[];
|
|
||||||
if (Array.isArray(this.excludes)) {
|
|
||||||
if (this.excludes[0] != "") {
|
|
||||||
excludes = this.excludes;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
excludes = this.excludes.split(',');
|
|
||||||
}
|
|
||||||
|
|
||||||
const crawler = new WebCrawler({
|
|
||||||
jobId: this.jobId,
|
|
||||||
initialUrl: this.urls[0],
|
|
||||||
includes,
|
|
||||||
excludes,
|
|
||||||
maxCrawledLinks: this.maxCrawledLinks,
|
|
||||||
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
|
|
||||||
limit: this.limit,
|
|
||||||
generateImgAltText: this.generateImgAltText,
|
|
||||||
allowBackwardCrawling: this.allowBackwardCrawling,
|
|
||||||
allowExternalContentLinks: this.allowExternalContentLinks,
|
|
||||||
});
|
|
||||||
|
|
||||||
let links = await crawler.start(
|
|
||||||
inProgress,
|
|
||||||
this.pageOptions,
|
|
||||||
{
|
|
||||||
ignoreSitemap: this.ignoreSitemap,
|
|
||||||
},
|
|
||||||
5,
|
|
||||||
this.limit,
|
|
||||||
this.maxCrawledDepth
|
|
||||||
);
|
|
||||||
|
|
||||||
let allLinks = links.map((e) => e.url);
|
|
||||||
const allHtmls = links.map((e) => e.html);
|
|
||||||
|
|
||||||
if (this.returnOnlyUrls) {
|
|
||||||
return this.returnOnlyUrlsResponse(allLinks, inProgress);
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = [];
|
|
||||||
// check if fast mode is enabled and there is html inside the links
|
|
||||||
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
|
|
||||||
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
|
||||||
} else {
|
|
||||||
documents = await this.processLinks(allLinks, inProgress);
|
|
||||||
}
|
|
||||||
|
|
||||||
return this.cacheAndFinalizeDocuments(documents, allLinks);
|
|
||||||
}
|
|
||||||
|
|
||||||
private async handleSingleUrlsMode(
|
|
||||||
inProgress?: (progress: Progress) => void
|
|
||||||
): Promise<Document[]> {
|
|
||||||
const links = this.urls;
|
|
||||||
|
|
||||||
let documents = await this.processLinks(links, inProgress);
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
|
|
||||||
private async handleSitemapMode(
|
|
||||||
inProgress?: (progress: Progress) => void
|
|
||||||
): Promise<Document[]> {
|
|
||||||
let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
|
|
||||||
links = await this.cleanIrrelevantPath(links);
|
|
||||||
|
|
||||||
if (this.returnOnlyUrls) {
|
|
||||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = await this.processLinks(links, inProgress);
|
|
||||||
return this.cacheAndFinalizeDocuments(documents, links);
|
|
||||||
}
|
|
||||||
|
|
||||||
private async returnOnlyUrlsResponse(
|
|
||||||
links: string[],
|
|
||||||
inProgress?: (progress: Progress) => void
|
|
||||||
): Promise<Document[]> {
|
|
||||||
inProgress?.({
|
|
||||||
current: links.length,
|
|
||||||
total: links.length,
|
|
||||||
status: "COMPLETED",
|
|
||||||
currentDocumentUrl: this.urls[0],
|
|
||||||
});
|
|
||||||
return links.map((url) => ({
|
|
||||||
content: "",
|
|
||||||
html: this.pageOptions?.includeHtml ? "" : undefined,
|
|
||||||
markdown: "",
|
|
||||||
metadata: { sourceURL: url, pageStatusCode: 200 },
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
private async processLinks(
|
|
||||||
links: string[],
|
|
||||||
inProgress?: (progress: Progress) => void,
|
|
||||||
allHtmls?: string[]
|
|
||||||
): Promise<Document[]> {
|
|
||||||
const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
|
||||||
const docLinks = links.filter(
|
|
||||||
(link) => link.endsWith(".doc") || link.endsWith(".docx")
|
|
||||||
);
|
|
||||||
|
|
||||||
const [pdfDocuments, docxDocuments] = await Promise.all([
|
|
||||||
this.fetchPdfDocuments(pdfLinks),
|
|
||||||
this.fetchDocxDocuments(docLinks),
|
|
||||||
]);
|
|
||||||
|
|
||||||
links = links.filter(
|
|
||||||
(link) => !pdfLinks.includes(link) && !docLinks.includes(link)
|
|
||||||
);
|
|
||||||
|
|
||||||
let [documents, sitemapData] = await Promise.all([
|
|
||||||
this.convertUrlsToDocuments(links, inProgress, allHtmls),
|
|
||||||
this.mode === "single_urls" && links.length > 0
|
|
||||||
? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
|
|
||||||
(error) => {
|
|
||||||
Logger.debug(`Failed to fetch sitemap data: ${error}`);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
: Promise.resolve(null),
|
|
||||||
]);
|
|
||||||
|
|
||||||
if (this.mode === "single_urls" && documents.length > 0) {
|
|
||||||
documents[0].metadata.sitemap = sitemapData ?? undefined;
|
|
||||||
} else {
|
|
||||||
documents = await this.getSitemapData(this.urls[0], documents);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.pageOptions.includeMarkdown) {
|
|
||||||
documents = this.applyPathReplacements(documents);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!this.pageOptions.includeHtml) {
|
|
||||||
for (let document of documents) {
|
|
||||||
delete document.html;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// documents = await this.applyImgAltText(documents);
|
|
||||||
if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
|
|
||||||
const extractionMode = this.extractorOptions?.mode ?? "markdown";
|
|
||||||
const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
|
|
||||||
|
|
||||||
if (
|
|
||||||
extractionMode === "llm-extraction" ||
|
|
||||||
extractionMode === "llm-extraction-from-markdown" ||
|
|
||||||
extractionMode === "llm-extraction-from-raw-html"
|
|
||||||
) {
|
|
||||||
documents = await generateCompletions(
|
|
||||||
documents,
|
|
||||||
this.extractorOptions,
|
|
||||||
completionMode
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
|
||||||
}
|
|
||||||
|
|
||||||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
|
||||||
return Promise.all(
|
|
||||||
pdfLinks.map(async (pdfLink) => {
|
|
||||||
const timer = Date.now();
|
|
||||||
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
|
|
||||||
type: "scrape",
|
|
||||||
url: pdfLink,
|
|
||||||
worker: process.env.FLY_MACHINE_ID,
|
|
||||||
method: "pdf-scrape",
|
|
||||||
result: null,
|
|
||||||
});
|
|
||||||
|
|
||||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
|
|
||||||
pdfLink,
|
|
||||||
this.pageOptions.parsePDF
|
|
||||||
);
|
|
||||||
|
|
||||||
const insertedLogId = await logInsertPromise;
|
|
||||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
|
||||||
response_size: content.length,
|
|
||||||
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
|
|
||||||
error: pageError,
|
|
||||||
response_code: pageStatusCode,
|
|
||||||
time_taken: Date.now() - timer,
|
|
||||||
});
|
|
||||||
return {
|
|
||||||
content: content,
|
|
||||||
markdown: content,
|
|
||||||
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
|
|
||||||
provider: "web-scraper",
|
|
||||||
};
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
|
|
||||||
return Promise.all(
|
|
||||||
docxLinks.map(async (docxLink) => {
|
|
||||||
const timer = Date.now();
|
|
||||||
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
|
|
||||||
type: "scrape",
|
|
||||||
url: docxLink,
|
|
||||||
worker: process.env.FLY_MACHINE_ID,
|
|
||||||
method: "docx-scrape",
|
|
||||||
result: null,
|
|
||||||
});
|
|
||||||
|
|
||||||
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
|
|
||||||
docxLink
|
|
||||||
);
|
|
||||||
|
|
||||||
const insertedLogId = await logInsertPromise;
|
|
||||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
|
||||||
response_size: content.length,
|
|
||||||
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
|
|
||||||
error: pageError,
|
|
||||||
response_code: pageStatusCode,
|
|
||||||
time_taken: Date.now() - timer,
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
content,
|
|
||||||
metadata: { sourceURL: docxLink, pageStatusCode, pageError },
|
|
||||||
provider: "web-scraper",
|
|
||||||
};
|
|
||||||
})
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
private applyPathReplacements(documents: Document[]): Document[] {
|
|
||||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
|
||||||
documents = replacePathsWithAbsolutePaths(documents);
|
|
||||||
}
|
|
||||||
return replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
}
|
|
||||||
|
|
||||||
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
|
||||||
return this.generateImgAltText
|
|
||||||
? this.generatesImgAltText(documents)
|
|
||||||
: documents;
|
|
||||||
}
|
|
||||||
|
|
||||||
private async cacheAndFinalizeDocuments(
|
|
||||||
documents: Document[],
|
|
||||||
links: string[]
|
|
||||||
): Promise<Document[]> {
|
|
||||||
// await this.setCachedDocuments(documents, links);
|
|
||||||
documents = this.removeChildLinks(documents);
|
|
||||||
return documents.splice(0, this.limit);
|
|
||||||
}
|
|
||||||
|
|
||||||
private async processDocumentsWithCache(
|
|
||||||
inProgress?: (progress: Progress) => void
|
|
||||||
): Promise<Document[]> {
|
|
||||||
let documents = await this.getCachedDocuments(
|
|
||||||
this.urls.slice(0, this.limit)
|
|
||||||
);
|
|
||||||
if (documents.length < this.limit) {
|
|
||||||
const newDocuments: Document[] = await this.getDocuments(
|
|
||||||
false,
|
|
||||||
inProgress
|
|
||||||
);
|
|
||||||
documents = this.mergeNewDocuments(documents, newDocuments);
|
|
||||||
}
|
|
||||||
documents = this.filterDocsExcludeInclude(documents);
|
|
||||||
documents = this.filterDepth(documents);
|
|
||||||
documents = this.removeChildLinks(documents);
|
|
||||||
return documents.splice(0, this.limit);
|
|
||||||
}
|
|
||||||
|
|
||||||
private mergeNewDocuments(
|
|
||||||
existingDocuments: Document[],
|
|
||||||
newDocuments: Document[]
|
|
||||||
): Document[] {
|
|
||||||
newDocuments.forEach((doc) => {
|
|
||||||
if (
|
|
||||||
!existingDocuments.some(
|
|
||||||
(d) =>
|
|
||||||
this.normalizeUrl(d.metadata.sourceURL) ===
|
|
||||||
this.normalizeUrl(doc.metadata?.sourceURL)
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
existingDocuments.push(doc);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return existingDocuments;
|
|
||||||
}
|
|
||||||
|
|
||||||
private filterDocsExcludeInclude(documents: Document[]): Document[] {
|
|
||||||
return documents.filter((document) => {
|
|
||||||
const url = new URL(document.metadata.sourceURL);
|
|
||||||
const path = url.pathname;
|
|
||||||
|
|
||||||
if (!Array.isArray(this.excludes)) {
|
|
||||||
this.excludes = this.excludes.split(',');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
|
||||||
// Check if the link should be excluded
|
|
||||||
if (
|
|
||||||
this.excludes.some((excludePattern) =>
|
|
||||||
new RegExp(excludePattern).test(path)
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!Array.isArray(this.includes)) {
|
|
||||||
this.includes = this.includes.split(',');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.includes.length > 0 && this.includes[0] !== "") {
|
|
||||||
// Check if the link matches the include patterns, if any are specified
|
|
||||||
if (this.includes.length > 0) {
|
|
||||||
return this.includes.some((includePattern) =>
|
|
||||||
new RegExp(includePattern).test(path)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private normalizeUrl(url: string): string {
|
|
||||||
if (url.includes("//www.")) {
|
|
||||||
return url.replace("//www.", "//");
|
|
||||||
}
|
|
||||||
return url;
|
|
||||||
}
|
|
||||||
|
|
||||||
private removeChildLinks(documents: Document[]): Document[] {
|
|
||||||
for (let document of documents) {
|
|
||||||
if (document?.childrenLinks) delete document.childrenLinks;
|
|
||||||
}
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
|
|
||||||
async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {
|
|
||||||
for (const document of documents) {
|
|
||||||
if (document.content.trim().length === 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
|
|
||||||
await setValue(
|
|
||||||
"web-scraper-cache:" + normalizedUrl,
|
|
||||||
JSON.stringify({
|
|
||||||
...document,
|
|
||||||
childrenLinks: childrenLinks || [],
|
|
||||||
}),
|
|
||||||
60 * 60
|
|
||||||
); // 10 days
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async getCachedDocuments(urls: string[]): Promise<Document[]> {
|
|
||||||
let documents: Document[] = [];
|
|
||||||
for (const url of urls) {
|
|
||||||
const normalizedUrl = this.normalizeUrl(url);
|
|
||||||
Logger.debug(
|
|
||||||
"Getting cached document for web-scraper-cache:" + normalizedUrl
|
|
||||||
);
|
|
||||||
const cachedDocumentString = await getValue(
|
|
||||||
"web-scraper-cache:" + normalizedUrl
|
|
||||||
);
|
|
||||||
if (cachedDocumentString) {
|
|
||||||
const cachedDocument = JSON.parse(cachedDocumentString);
|
|
||||||
documents.push(cachedDocument);
|
|
||||||
|
|
||||||
// get children documents
|
|
||||||
for (const childUrl of cachedDocument.childrenLinks || []) {
|
|
||||||
const normalizedChildUrl = this.normalizeUrl(childUrl);
|
|
||||||
const childCachedDocumentString = await getValue(
|
|
||||||
"web-scraper-cache:" + normalizedChildUrl
|
|
||||||
);
|
|
||||||
if (childCachedDocumentString) {
|
|
||||||
const childCachedDocument = JSON.parse(childCachedDocumentString);
|
|
||||||
if (
|
|
||||||
!documents.find(
|
|
||||||
(doc) =>
|
|
||||||
doc.metadata.sourceURL ===
|
|
||||||
childCachedDocument.metadata.sourceURL
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
documents.push(childCachedDocument);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
|
|
||||||
setOptions(options: WebScraperOptions): void {
|
|
||||||
if (!options.urls) {
|
|
||||||
throw new Error("Urls are required");
|
|
||||||
}
|
|
||||||
|
|
||||||
this.jobId = options.jobId;
|
|
||||||
this.bullJobId = options.bullJobId;
|
|
||||||
this.urls = options.urls;
|
|
||||||
this.mode = options.mode;
|
|
||||||
this.concurrentRequests = options.concurrentRequests ?? 20;
|
|
||||||
this.includes = options.crawlerOptions?.includes ?? [];
|
|
||||||
this.excludes = options.crawlerOptions?.excludes ?? [];
|
|
||||||
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
|
|
||||||
this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
|
|
||||||
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
|
|
||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
|
||||||
this.generateImgAltText =
|
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
|
||||||
this.pageOptions = {
|
|
||||||
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
|
|
||||||
includeHtml: options.pageOptions?.includeHtml ?? false,
|
|
||||||
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
|
|
||||||
parsePDF: options.pageOptions?.parsePDF ?? true,
|
|
||||||
onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
|
|
||||||
removeTags: options.pageOptions?.removeTags ?? [],
|
|
||||||
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
|
|
||||||
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
|
|
||||||
includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
|
|
||||||
waitFor: options.pageOptions?.waitFor ?? undefined,
|
|
||||||
headers: options.pageOptions?.headers ?? undefined,
|
|
||||||
includeLinks: options.pageOptions?.includeLinks ?? true,
|
|
||||||
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
|
|
||||||
screenshot: options.pageOptions?.screenshot ?? false,
|
|
||||||
useFastMode: options.pageOptions?.useFastMode ?? false,
|
|
||||||
disableJsDom: options.pageOptions?.disableJsDom ?? false,
|
|
||||||
atsv: options.pageOptions?.atsv ?? false,
|
|
||||||
actions: options.pageOptions?.actions ?? undefined,
|
|
||||||
geolocation: options.pageOptions?.geolocation ?? undefined,
|
|
||||||
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
|
|
||||||
removeBase64Images: options.pageOptions?.removeBase64Images ?? true,
|
|
||||||
mobile: options.pageOptions?.mobile ?? false,
|
|
||||||
};
|
|
||||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
|
||||||
this.replaceAllPathsWithAbsolutePaths =
|
|
||||||
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
|
|
||||||
options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
|
|
||||||
false;
|
|
||||||
|
|
||||||
if (typeof options.crawlerOptions?.excludes === 'string') {
|
|
||||||
this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (typeof options.crawlerOptions?.includes === 'string') {
|
|
||||||
this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
|
|
||||||
}
|
|
||||||
|
|
||||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
|
||||||
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
|
|
||||||
this.allowBackwardCrawling =
|
|
||||||
options.crawlerOptions?.allowBackwardCrawling ?? false;
|
|
||||||
this.allowExternalContentLinks =
|
|
||||||
options.crawlerOptions?.allowExternalContentLinks ?? false;
|
|
||||||
this.priority = options.priority;
|
|
||||||
this.teamId = options.teamId ?? null;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// make sure all urls start with https://
|
|
||||||
this.urls = this.urls.map((url) => {
|
|
||||||
if (!url.trim().startsWith("http")) {
|
|
||||||
return `https://${url}`;
|
|
||||||
}
|
|
||||||
return url;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private async getSitemapData(baseUrl: string, documents: Document[]) {
|
|
||||||
const sitemapData = await fetchSitemapData(baseUrl);
|
|
||||||
if (sitemapData) {
|
|
||||||
for (let i = 0; i < documents.length; i++) {
|
|
||||||
const docInSitemapData = sitemapData.find(
|
|
||||||
(data) =>
|
|
||||||
this.normalizeUrl(data.loc) ===
|
|
||||||
this.normalizeUrl(documents[i].metadata.sourceURL)
|
|
||||||
);
|
|
||||||
if (docInSitemapData) {
|
|
||||||
let sitemapDocData: Partial<SitemapEntry> = {};
|
|
||||||
if (docInSitemapData.changefreq) {
|
|
||||||
sitemapDocData.changefreq = docInSitemapData.changefreq;
|
|
||||||
}
|
|
||||||
if (docInSitemapData.priority) {
|
|
||||||
sitemapDocData.priority = Number(docInSitemapData.priority);
|
|
||||||
}
|
|
||||||
if (docInSitemapData.lastmod) {
|
|
||||||
sitemapDocData.lastmod = docInSitemapData.lastmod;
|
|
||||||
}
|
|
||||||
if (Object.keys(sitemapDocData).length !== 0) {
|
|
||||||
documents[i].metadata.sitemap = sitemapDocData;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
private async getSitemapDataForSingleUrl(
|
|
||||||
baseUrl: string,
|
|
||||||
url: string,
|
|
||||||
timeout?: number
|
|
||||||
) {
|
|
||||||
const sitemapData = await fetchSitemapData(baseUrl, timeout);
|
|
||||||
if (sitemapData) {
|
|
||||||
const docInSitemapData = sitemapData.find(
|
|
||||||
(data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
|
|
||||||
);
|
|
||||||
if (docInSitemapData) {
|
|
||||||
let sitemapDocData: Partial<SitemapEntry> = {};
|
|
||||||
if (docInSitemapData.changefreq) {
|
|
||||||
sitemapDocData.changefreq = docInSitemapData.changefreq;
|
|
||||||
}
|
|
||||||
if (docInSitemapData.priority) {
|
|
||||||
sitemapDocData.priority = Number(docInSitemapData.priority);
|
|
||||||
}
|
|
||||||
if (docInSitemapData.lastmod) {
|
|
||||||
sitemapDocData.lastmod = docInSitemapData.lastmod;
|
|
||||||
}
|
|
||||||
if (Object.keys(sitemapDocData).length !== 0) {
|
|
||||||
return sitemapDocData;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
/**
 * Generates alt text for markdown images that have none.
 *
 * Every `![alt](url)` occurrence in each document's `content` is inspected.
 * When the alt text is empty, the URL is not a `data:image` URI and the URL
 * ends in `.png`, `.jpeg`, `.gif` or `.webp`, a description is requested from
 * `getImageDescription` (passing up to 1000 characters of content after and
 * before the image as context) and written back into the content.
 * Images that do not qualify are left untouched.
 *
 * @param documents Documents whose `content` is updated in place.
 * @returns The same `documents` array.
 */
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
  await Promise.all(
    documents.map(async (document) => {
      const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];

      await Promise.all(
        images.map(async (image: string) => {
          // Parse alt text and URL in a single anchored pass. Re-matching the
          // pieces separately picks up parentheses/brackets inside the alt
          // text and throws on an empty URL such as `![]()`.
          const parsed = /^!\[(.*?)\]\((.*?)\)$/.exec(image);
          if (!parsed) {
            return;
          }
          const [, existingAlt, imageUrl] = parsed;

          const needsAltText =
            !existingAlt &&
            !!imageUrl &&
            !imageUrl.startsWith("data:image") &&
            /\.(png|jpeg|gif|webp)$/.test(imageUrl);
          if (!needsAltText) {
            return;
          }

          // Surrounding text (up to 1000 chars each side) is used as context.
          const imageIndex = document.content.indexOf(image);
          const contentLength = document.content.length;
          const backText = document.content.substring(
            imageIndex + image.length,
            Math.min(imageIndex + image.length + 1000, contentLength)
          );
          const frontTextStartIndex = Math.max(imageIndex - 1000, 0);
          const frontText = document.content.substring(
            frontTextStartIndex,
            imageIndex
          );

          const altText = await getImageDescription(
            imageUrl,
            backText,
            frontText,
            this.generateImgAltTextModel
          );

          // A replacer function keeps `$&`, `$1`, ... in the generated text
          // from being interpreted as replacement patterns.
          document.content = document.content.replace(
            image,
            () => `![${altText}](${imageUrl})`
          );
        })
      );
    })
  );

  return documents;
};
|
|
||||||
|
|
||||||
/**
 * Drops documents that lie deeper than the configured crawl depth.
 *
 * @param documents Documents to filter; each must carry `metadata.sourceURL`.
 * @returns The documents whose `getURLDepth` value is at most
 *   `this.maxCrawledDepth`.
 */
filterDepth(documents: Document[]): Document[] {
  const isWithinDepth = (doc: Document): boolean => {
    // Round-trip through URL so the depth is computed on the serialized form.
    const serialized = new URL(doc.metadata.sourceURL).toString();
    return getURLDepth(serialized) <= this.maxCrawledDepth;
  };

  return documents.filter((doc) => isWithinDepth(doc));
}
|
|
||||||
}
|
|
|
@ -1,89 +0,0 @@
|
||||||
import axios from "axios";
|
|
||||||
import { logScrape } from "../../../services/logging/scrape_log";
|
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
|
||||||
import { universalTimeout } from "../global";
|
|
||||||
import { Logger } from "../../../lib/logger";
|
|
||||||
|
|
||||||
/**
 * Scrapes a URL with a plain axios GET request.
 *
 * The raw response body is returned as-is. When the response content type is
 * `application/pdf`, the URL is handed to `fetchAndProcessPdf` instead.
 * Failures are reported through the returned `pageError` rather than thrown,
 * and every call is recorded with `logScrape`.
 *
 * @param url The URL to scrape
 * @param pageOptions The options for the page (`parsePDF` is forwarded to the PDF processor)
 * @returns The scraped content together with the page status code and error, if any
 */
export async function scrapWithFetch(
  url: string,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
  const logParams = {
    url,
    scraper: "fetch",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const response = await axios.get(url, {
      headers: { "Content-Type": "application/json" },
      timeout: universalTimeout,
      // Identity transform: keep the body as raw text instead of parsed JSON.
      transformResponse: [(data) => data],
    });
    const { status, statusText } = response;

    if (status !== 200) {
      Logger.debug(
        `⛏️ Axios: Failed to fetch url: ${url} with status: ${status}`
      );
      logParams.error_message = statusText;
      logParams.response_code = status;
      return { content: "", pageStatusCode: status, pageError: statusText };
    }

    const contentType = response.headers["content-type"];
    const isPdf = contentType && contentType.includes("application/pdf");

    if (!isPdf) {
      const body = response.data;
      logParams.success = true;
      logParams.html = body;
      logParams.response_code = status;
      return { content: body, pageStatusCode: status, pageError: null };
    }

    // PDF: delegate download and parsing to the PDF processor.
    logParams.success = true;
    const pdf = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
    logParams.response_code = pdf.pageStatusCode;
    logParams.error_message = pdf.pageError;
    return {
      content: pdf.content,
      pageStatusCode: status,
      pageError: pdf.pageError,
    };
  } catch (error) {
    const timedOut = error.code === "ECONNABORTED";
    logParams.error_message = timedOut
      ? "Request timed out"
      : error.message || error;
    Logger.debug(
      timedOut
        ? `⛏️ Axios: Request timed out for ${url}`
        : `⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`
    );
    return {
      content: "",
      pageStatusCode: error.response?.status ?? null,
      pageError: logParams.error_message,
    };
  } finally {
    // Runs on every exit path so each attempt is logged with its duration.
    logParams.time_taken_seconds = (Date.now() - logParams.startTime) / 1000;
    await logScrape(logParams);
  }
}
|
|
|
@ -1,230 +0,0 @@
|
||||||
import axios from "axios";
|
|
||||||
import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
|
||||||
import { logScrape } from "../../../services/logging/scrape_log";
|
|
||||||
import { generateRequestParams } from "../single_url";
|
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
|
||||||
import { universalTimeout } from "../global";
|
|
||||||
import { Logger } from "../../../lib/logger";
|
|
||||||
import * as Sentry from "@sentry/node";
|
|
||||||
import axiosRetry from 'axios-retry';
|
|
||||||
|
|
||||||
// Retry failed requests made through the shared default axios instance, up to
// three times with exponential back-off.
// NOTE(review): this is attached to the global `axios` import, so it looks
// like it also affects other modules using that instance, while instances
// made with `axios.create()` would need their own axiosRetry call — confirm.
axiosRetry(axios, {
  retries: 3,
  onRetry: () => {
    // Use the project logger rather than console.log, like the rest of this file.
    Logger.debug("Retrying (fire-engine)...");
  },
  retryDelay: axiosRetry.exponentialDelay,
});
|
|
||||||
/**
 * Scrapes a URL with Fire-Engine.
 *
 * A job is submitted to `FIRE_ENGINE_BETA_URL` with `instantReturn: true`, then
 * its status is polled every 250 ms until it stops reporting `processing` or
 * the deadline (`universalTimeout` + the sum of all "wait" actions + 5 s)
 * passes. On deadline the job is deleted (fire-and-forget) and an empty result
 * is returned. PDF responses are handed to `fetchAndProcessPdf`.
 * Errors are returned in `pageError`, never thrown, and every call is recorded
 * with `logScrape`.
 *
 * Values returned by `generateRequestParams(url)` under `params` (`wait`,
 * `engine`, `screenshot`, `fullPageScreenshot`, `fireEngineOptions`) take
 * precedence over the corresponding arguments.
 *
 * @param url The URL to scrape
 * @param actions Actions forwarded to Fire-Engine; "wait" actions extend the polling deadline
 * @param waitFor The time to wait for the page to load
 * @param screenshot Whether to take a screenshot
 * @param fullPageScreenshot Whether to take a full page screenshot
 * @param pageOptions The options for the page; `useFastMode` forces the "tlsclient" engine,
 *   and `atsv` is reset to `false` on this object unless `teamId` is listed in `BETA_CUSTOMERS`
 * @param fireEngineOptions Extra Fire-Engine options merged into the request (not mutated)
 * @param headers The headers to send with the request
 * @param options The options for the request (`endpoint: "request"` switches to `/request`)
 * @param priority Job priority forwarded to Fire-Engine
 * @param teamId Team used for the `BETA_CUSTOMERS` check
 * @returns The scraped content
 */
export async function scrapWithFireEngine({
  url,
  actions,
  waitFor = 0,
  screenshot = false,
  fullPageScreenshot = false,
  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, removeBase64Images: true, mobile: false },
  fireEngineOptions = {},
  headers,
  options,
  priority,
  teamId,
}: {
  url: string;
  actions?: Action[];
  waitFor?: number;
  screenshot?: boolean;
  fullPageScreenshot?: boolean;
  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, removeBase64Images?: boolean, mobile?: boolean };
  fireEngineOptions?: FireEngineOptions;
  headers?: Record<string, string>;
  options?: any;
  priority?: number;
  teamId?: string;
}): Promise<FireEngineResponse> {
  const logParams = {
    url,
    scraper: "fire-engine",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const reqParams = await generateRequestParams(url);
    const waitParam = reqParams["params"]?.wait ?? waitFor;
    const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
    const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;

    // Work on a shallow copy: the source object belongs either to the caller
    // or to whatever generateRequestParams returned, and the tweaks below
    // (engine / atsv) must not leak into later requests through it.
    const fireEngineOptionsParam: FireEngineOptions = {
      ...(reqParams["params"]?.fireEngineOptions ?? fireEngineOptions),
    };

    const endpoint = options?.endpoint === "request" ? "/request" : "/scrape";

    let engine = engineParam; // do we want fireEngineOptions as first choice?

    if (pageOptions?.useFastMode) {
      fireEngineOptionsParam.engine = "tlsclient";
      engine = "tlsclient";
    }

    Logger.info(
      `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
    );

    // atsv is only available for beta customers
    const betaCustomersString = process.env.BETA_CUSTOMERS;
    const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];

    if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
      fireEngineOptionsParam.atsv = true;
    } else {
      // Kept as an in-place reset: the same pageOptions object is passed to
      // logScrape below and read for the request payload.
      pageOptions.atsv = false;
    }

    const axiosInstance = axios.create({
      headers: { "Content-Type": "application/json" }
    });

    const startTime = Date.now();
    const _response = await Sentry.startSpan({
      name: "Call to fire-engine"
    }, async span => {
      return await axiosInstance.post(
        process.env.FIRE_ENGINE_BETA_URL + endpoint,
        {
          url: url,
          headers: headers,
          wait: waitParam,
          screenshot: screenshotParam,
          fullPageScreenshot: fullPageScreenshotParam,
          disableJsDom: pageOptions?.disableJsDom ?? false,
          priority,
          engine,
          instantReturn: true,
          mobile: pageOptions?.mobile ?? false,
          // Spread after the keys above so explicit Fire-Engine options win
          // over them; the keys below in turn win over the spread.
          ...fireEngineOptionsParam,
          atsv: pageOptions?.atsv ?? false,
          scrollXPaths: pageOptions?.scrollXPaths ?? [],
          geolocation: pageOptions?.geolocation,
          skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
          removeBase64Images: pageOptions?.removeBase64Images ?? true,
          actions: actions,
        },
        {
          headers: {
            "Content-Type": "application/json",
            // Propagate the trace so Fire-Engine spans join this one.
            ...(Sentry.isInitialized() ? ({
              "sentry-trace": Sentry.spanToTraceHeader(span),
              "baggage": Sentry.spanToBaggageHeader(span),
            }) : {}),
          }
        }
      );
    });

    // Explicit "wait" actions extend the polling deadline by their total duration.
    const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);

    const jobUrl = `${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`;
    let checkStatusResponse = await axiosInstance.get(jobUrl);

    // added 5 seconds to the timeout to account for 'smart wait'
    while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal + 5000) {
      await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
      checkStatusResponse = await axiosInstance.get(jobUrl);
    }

    if (checkStatusResponse.data.processing) {
      // Deadline passed: cancel the job without waiting for the outcome.
      Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
      axiosInstance.delete(jobUrl, {
        validateStatus: (status) => true
      }).catch((error) => {
        Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
      });

      Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
      logParams.error_message = "Request timed out";
      return { html: "", pageStatusCode: null, pageError: "" };
    }

    if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
      Logger.debug(
        `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}`
      );

      logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
      logParams.response_code = checkStatusResponse.data?.pageStatusCode;

      if (checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
        Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
      }

      // With no page status, a DNS resolution failure is reported as a 404.
      const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;

      return {
        html: "",
        pageStatusCode,
        pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
      };
    }

    const contentType = checkStatusResponse.data.responseHeaders?.["content-type"];

    if (contentType && contentType.includes("application/pdf")) {
      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
        url,
        pageOptions?.parsePDF
      );
      logParams.success = true;
      logParams.response_code = pageStatusCode;
      logParams.error_message = pageError;
      return { html: content, pageStatusCode, pageError };
    } else {
      const data = checkStatusResponse.data;

      logParams.success =
        (data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
        data.pageStatusCode === 404;
      logParams.html = data.content ?? "";
      logParams.response_code = data.pageStatusCode;
      logParams.error_message = data.pageError ?? data.error;

      return {
        html: data.content ?? "",
        // Fall back to the single `screenshot` field only when it is set, so
        // a missing screenshot yields [] rather than [undefined].
        screenshots: data.screenshots ?? (data.screenshot ? [data.screenshot] : []),
        pageStatusCode: data.pageStatusCode,
        pageError: data.pageError ?? data.error,
        scrapeActionContent: data?.actionContent ?? [],
      };
    }
  } catch (error) {
    if (error.code === "ECONNABORTED") {
      Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`);
      logParams.error_message = "Request timed out";
    } else {
      Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`);
      logParams.error_message = error.message || error;
    }
    return { html: "", pageStatusCode: null, pageError: logParams.error_message };
  } finally {
    // Runs on every exit path so each attempt is logged with its duration.
    const endTime = Date.now();
    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
    await logScrape(logParams, pageOptions);
  }
}
|
|
||||||
|
|
||||||
|
|
|
@ -1,111 +0,0 @@
|
||||||
import axios from "axios";
|
|
||||||
import { logScrape } from "../../../services/logging/scrape_log";
|
|
||||||
import { generateRequestParams } from "../single_url";
|
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
|
||||||
import { universalTimeout } from "../global";
|
|
||||||
import { Logger } from "../../../lib/logger";
|
|
||||||
|
|
||||||
/**
 * Scrapes a URL with the Playwright microservice.
 *
 * Posts the URL to `PLAYWRIGHT_MICROSERVICE_URL` and parses the JSON body it
 * answers with. When the response content type is `application/pdf`, the URL
 * is handed to `fetchAndProcessPdf` instead. Failures are reported through the
 * returned `pageError` rather than thrown, and every call is recorded with
 * `logScrape`.
 *
 * @param url The URL to scrape
 * @param waitFor The time to wait for the page to load; a `wait` value from
 *   `generateRequestParams(url)` takes precedence
 * @param headers The headers to send with the request
 * @param pageOptions The options for the page (`parsePDF` is forwarded to the PDF processor)
 * @returns The scraped content together with the page status code and error, if any
 */
export async function scrapWithPlaywright(
  url: string,
  waitFor: number = 0,
  headers?: Record<string, string>,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
  const logParams = {
    url,
    scraper: "playwright",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const reqParams = await generateRequestParams(url);
    // If the user has passed a wait parameter in the request, use that
    const waitParam = reqParams["params"]?.wait ?? waitFor;

    const response = await axios.post(
      process.env.PLAYWRIGHT_MICROSERVICE_URL,
      {
        url: url,
        wait_after_load: waitParam,
        timeout: universalTimeout + waitParam,
        headers: headers,
      },
      {
        headers: {
          "Content-Type": "application/json",
        },
        timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
        transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
      }
    );

    if (response.status !== 200) {
      // The body is raw text (see transformResponse above), so it has to be
      // parsed before pageError / pageStatusCode can be read from it.
      let errorBody: { pageError?: string; pageStatusCode?: number } = {};
      try {
        errorBody = JSON.parse(response.data) ?? {};
      } catch (_parseError) {
        // Body is not JSON; report the failure without page details.
      }

      Logger.debug(
        `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${errorBody.pageError}`
      );
      logParams.error_message = errorBody.pageError;
      logParams.response_code = errorBody.pageStatusCode;
      return {
        content: "",
        pageStatusCode: errorBody.pageStatusCode,
        pageError: errorBody.pageError,
      };
    }

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
      logParams.success = true;
      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
      logParams.response_code = pageStatusCode;
      logParams.error_message = pageError;
      return { content, pageStatusCode, pageError };
    } else {
      const textData = response.data;
      try {
        const data = JSON.parse(textData);
        const html = data.content;
        logParams.success = true;
        logParams.html = html;
        logParams.response_code = data.pageStatusCode;
        logParams.error_message = data.pageError;
        return {
          content: html ?? "",
          pageStatusCode: data.pageStatusCode,
          pageError: data.pageError,
        };
      } catch (jsonError) {
        logParams.error_message = jsonError.message || jsonError;
        Logger.debug(
          `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
        );
        return { content: "", pageStatusCode: null, pageError: logParams.error_message };
      }
    }
  } catch (error) {
    if (error.code === "ECONNABORTED") {
      logParams.error_message = "Request timed out";
      Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
    } else {
      logParams.error_message = error.message || error;
      Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
    }
    return { content: "", pageStatusCode: null, pageError: logParams.error_message };
  } finally {
    // Runs on every exit path so each attempt is logged with its duration.
    const endTime = Date.now();
    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
    await logScrape(logParams);
  }
}
|
|
|
@ -1,92 +0,0 @@
|
||||||
import { logScrape } from "../../../services/logging/scrape_log";
|
|
||||||
import { generateRequestParams } from "../single_url";
|
|
||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
|
||||||
import { universalTimeout } from "../global";
|
|
||||||
import { ScrapingBeeClient } from "scrapingbee";
|
|
||||||
import { Logger } from "../../../lib/logger";
|
|
||||||
|
|
||||||
/**
 * Scrapes a URL with ScrapingBee.
 *
 * Request parameters come from `generateRequestParams`, with
 * `transparent_status_code` added. The response body is decoded as text; when
 * the response content type is `application/pdf`, the URL is handed to
 * `fetchAndProcessPdf` instead. Failures are reported through the returned
 * `pageError` rather than thrown, and every call is recorded with `logScrape`.
 *
 * @param url The URL to scrape
 * @param wait_browser The browser event to wait for ("networkidle2" is logged as the "scrapingBeeLoad" scraper)
 * @param timeout The timeout for the scrape
 * @param pageOptions The options for the page (`parsePDF` is forwarded to the PDF processor)
 * @returns The scraped content together with the page status code and error, if any
 */
export async function scrapWithScrapingBee(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = universalTimeout,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
  const logParams = {
    url,
    scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };
  try {
    const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
    const clientParams = await generateRequestParams(
      url,
      wait_browser,
      timeout
    );
    const response = await client.get({
      ...clientParams,
      params: {
        ...clientParams.params,
        transparent_status_code: "True",
      },
    });
    Logger.info(
      `⛏️ ScrapingBee: Scraping ${url}`
    );
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
      logParams.success = true;
      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
      logParams.response_code = pageStatusCode;
      logParams.error_message = pageError;
      return { content, pageStatusCode, pageError };
    } else {
      let text = "";
      let decodeErrorMessage: string | undefined;
      try {
        text = new TextDecoder().decode(response.data);
      } catch (decodeError) {
        Logger.debug(
          `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
        );
        decodeErrorMessage = decodeError.message || decodeError;
      }

      const statusOk =
        (response.status >= 200 && response.status < 300) ||
        response.status === 404;
      const statusError =
        response.statusText !== "OK" ? response.statusText : undefined;

      logParams.response_code = response.status;
      logParams.html = text;
      // A body that could not be decoded is not a successful scrape, and its
      // error must not be overwritten by the (possibly empty) status text.
      logParams.success = decodeErrorMessage === undefined && statusOk;
      logParams.error_message = decodeErrorMessage ?? statusError;

      return {
        content: text,
        pageStatusCode: response.status,
        pageError: statusError,
      };
    }
  } catch (error) {
    Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
    logParams.error_message = error.message || error;
    logParams.response_code = error.response?.status;
    return {
      content: "",
      pageStatusCode: error.response?.status,
      pageError: error.response?.statusText,
    };
  } finally {
    // Runs on every exit path so each attempt is logged with its duration.
    const endTime = Date.now();
    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
    await logScrape(logParams);
  }
}
|
|
|
@ -1,506 +0,0 @@
|
||||||
import * as cheerio from "cheerio";
|
|
||||||
import { extractMetadata } from "./utils/metadata";
|
|
||||||
import dotenv from "dotenv";
|
|
||||||
import {
|
|
||||||
Document,
|
|
||||||
PageOptions,
|
|
||||||
FireEngineResponse,
|
|
||||||
ExtractorOptions,
|
|
||||||
Action,
|
|
||||||
} from "../../lib/entities";
|
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
|
||||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
|
||||||
import { handleCustomScraping } from "./custom/handleCustomScraping";
|
|
||||||
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
|
|
||||||
import { scrapWithFetch } from "./scrapers/fetch";
|
|
||||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
|
||||||
import { scrapWithPlaywright } from "./scrapers/playwright";
|
|
||||||
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
|
|
||||||
import { extractLinks } from "./utils/utils";
|
|
||||||
import { Logger } from "../../lib/logger";
|
|
||||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
|
||||||
import { clientSideError } from "../../strings";
|
|
||||||
import { ScrapeActionContent } from "../../lib/entities";
|
|
||||||
import { removeBase64Images } from "./utils/removeBase64Images";
|
|
||||||
|
|
||||||
dotenv.config();
|
|
||||||
|
|
||||||
// A backend is considered enabled when its environment variable is set to a
// non-empty value.
const useScrapingBee = (process.env.SCRAPING_BEE_API_KEY ?? '') !== '';
const useFireEngine = (process.env.FIRE_ENGINE_BETA_URL ?? '') !== '';

/**
 * Scraper identifiers enabled by the environment, in their base order.
 * "playwright" is listed only when Fire-Engine is not enabled; "fetch" is
 * always present and always last.
 */
export const baseScrapers = [
  useFireEngine ? "fire-engine;chrome-cdp" : undefined,
  useFireEngine ? "fire-engine" : undefined,
  useScrapingBee ? "scrapingBee" : undefined,
  !useFireEngine ? "playwright" : undefined,
  useScrapingBee ? "scrapingBeeLoad" : undefined,
  "fetch",
].filter(Boolean); // drop the slots of disabled backends
|
|
||||||
|
|
||||||
/**
 * Builds the request parameters for scraping `url`.
 *
 * Starts from defaults (`url`, `params.timeout`, `params.wait_browser` and a
 * `ScrapingService-Request` header) and, when `urlSpecificParams` has an entry
 * for the URL's hostname (leading `www.` removed), shallow-merges that entry
 * over the defaults, so top-level keys of the entry replace the default ones.
 *
 * @param url The URL to scrape
 * @param wait_browser The browser event to wait for
 * @param timeout The request timeout
 * @returns The merged parameters; the defaults alone when the hostname has no
 *   entry or the URL cannot be parsed (the error is logged)
 */
export async function generateRequestParams(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = 15000
): Promise<any> {
  const defaultParams = {
    url,
    params: { timeout, wait_browser },
    headers: { "ScrapingService-Request": "TRUE" },
  };

  try {
    const hostKey = new URL(url).hostname.replace(/^www\./, "");
    return urlSpecificParams.hasOwnProperty(hostKey)
      ? { ...defaultParams, ...urlSpecificParams[hostKey] }
      : defaultParams;
  } catch (error) {
    Logger.error(`Error generating URL key: ${error}`);
    return defaultParams;
  }
}
|
|
||||||
|
|
||||||
/**
 * Get the order of scrapers to be used for scraping a URL
 * If the user doesn't have envs set for a specific scraper, it will be removed from the order.
 *
 * When actions are requested only "fire-engine;chrome-cdp" is returned (or
 * nothing if Fire-Engine is not enabled). The wait/screenshot/headers flags
 * are accepted but currently do not influence the order; the reordering that
 * used them is disabled.
 *
 * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined;
 *   when given it is placed first, without an availability check
 * @param isWaitPresent Whether a wait was requested (currently unused)
 * @param isScreenshotPresent Whether a screenshot was requested (currently unused)
 * @param isHeadersPresent Whether custom headers were requested (currently unused)
 * @param isActionsPresent Whether actions were requested
 * @returns The order of scrapers to be used for scraping a URL
 */
function getScrapingFallbackOrder(
  defaultScraper?: string,
  isWaitPresent: boolean = false,
  isScreenshotPresent: boolean = false,
  isHeadersPresent: boolean = false,
  isActionsPresent: boolean = false,
) {
  if (isActionsPresent) {
    return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
  }

  // A scraper is usable only when the env var of its backend is set.
  const isConfigured = (scraper: (typeof baseScrapers)[number]): boolean => {
    if (scraper === "scrapingBee" || scraper === "scrapingBeeLoad") {
      return !!process.env.SCRAPING_BEE_API_KEY;
    }
    if (scraper === "fire-engine" || scraper === "fire-engine;chrome-cdp") {
      return !!process.env.FIRE_ENGINE_BETA_URL;
    }
    if (scraper === "playwright") {
      return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
    }
    return true;
  };
  const availableScrapers = baseScrapers.filter((scraper) =>
    isConfigured(scraper)
  );

  const defaultOrder = [
    useFireEngine ? "fire-engine;chrome-cdp" : undefined,
    useFireEngine ? "fire-engine" : undefined,
    useScrapingBee ? "scrapingBee" : undefined,
    useScrapingBee ? "scrapingBeeLoad" : undefined,
    useFireEngine ? undefined : "playwright",
    "fetch",
  ].filter(Boolean);

  const preferredOrder = defaultOrder.filter(
    (scraper: (typeof baseScrapers)[number]) =>
      availableScrapers.includes(scraper)
  );

  // Preferred order first, then any remaining available scrapers; the Set
  // below removes duplicates while keeping first-seen order.
  const candidates = [...preferredOrder, ...availableScrapers];
  if (defaultScraper) {
    candidates.unshift(defaultScraper);
  }

  return Array.from(new Set(candidates)) as (typeof baseScrapers)[number][];
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
export async function scrapSingleUrl(
|
|
||||||
jobId: string,
|
|
||||||
urlToScrap: string,
|
|
||||||
pageOptions: PageOptions,
|
|
||||||
extractorOptions?: ExtractorOptions,
|
|
||||||
existingHtml?: string,
|
|
||||||
priority?: number,
|
|
||||||
teamId?: string
|
|
||||||
): Promise<Document> {
|
|
||||||
pageOptions = {
|
|
||||||
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
|
||||||
includeExtract: pageOptions.includeExtract ?? false,
|
|
||||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
|
||||||
includeHtml: pageOptions.includeHtml ?? false,
|
|
||||||
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
|
||||||
waitFor: pageOptions.waitFor ?? undefined,
|
|
||||||
screenshot: pageOptions.screenshot ?? false,
|
|
||||||
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
|
|
||||||
headers: pageOptions.headers ?? undefined,
|
|
||||||
includeLinks: pageOptions.includeLinks ?? true,
|
|
||||||
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
|
|
||||||
parsePDF: pageOptions.parsePDF ?? true,
|
|
||||||
removeTags: pageOptions.removeTags ?? [],
|
|
||||||
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
|
|
||||||
useFastMode: pageOptions.useFastMode ?? false,
|
|
||||||
disableJsDom: pageOptions.disableJsDom ?? false,
|
|
||||||
atsv: pageOptions.atsv ?? false,
|
|
||||||
actions: pageOptions.actions ?? undefined,
|
|
||||||
geolocation: pageOptions.geolocation ?? undefined,
|
|
||||||
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
|
|
||||||
removeBase64Images: pageOptions.removeBase64Images ?? true,
|
|
||||||
mobile: pageOptions.mobile ?? false,
|
|
||||||
}
|
|
||||||
|
|
||||||
if (extractorOptions) {
|
|
||||||
extractorOptions = {
|
|
||||||
mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!existingHtml) {
|
|
||||||
existingHtml = "";
|
|
||||||
}
|
|
||||||
|
|
||||||
urlToScrap = urlToScrap.trim();
|
|
||||||
|
|
||||||
const attemptScraping = async (
|
|
||||||
url: string,
|
|
||||||
method: (typeof baseScrapers)[number]
|
|
||||||
) => {
|
|
||||||
let scraperResponse: {
|
|
||||||
text: string;
|
|
||||||
screenshot: string;
|
|
||||||
actions?: {
|
|
||||||
screenshots?: string[];
|
|
||||||
scrapes?: ScrapeActionContent[];
|
|
||||||
};
|
|
||||||
metadata: { pageStatusCode?: number; pageError?: string | null };
|
|
||||||
} = { text: "", screenshot: "", metadata: {} };
|
|
||||||
let screenshot = "";
|
|
||||||
|
|
||||||
const timer = Date.now();
|
|
||||||
const logInsertPromise = ScrapeEvents.insert(jobId, {
|
|
||||||
type: "scrape",
|
|
||||||
url,
|
|
||||||
worker: process.env.FLY_MACHINE_ID,
|
|
||||||
method,
|
|
||||||
result: null,
|
|
||||||
});
|
|
||||||
|
|
||||||
switch (method) {
|
|
||||||
case "fire-engine":
|
|
||||||
case "fire-engine;chrome-cdp":
|
|
||||||
|
|
||||||
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
|
|
||||||
if (method === "fire-engine;chrome-cdp") {
|
|
||||||
engine = "chrome-cdp";
|
|
||||||
}
|
|
||||||
|
|
||||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
|
||||||
const processedActions: Action[] = pageOptions.actions?.flatMap((action: Action, index: number, array: Action[]) => {
|
|
||||||
if (action.type === "click" || action.type === "write" || action.type === "press") {
|
|
||||||
const result: Action[] = [];
|
|
||||||
// Don't add a wait if the previous action is a wait
|
|
||||||
// if (index === 0 || array[index - 1].type !== "wait") {
|
|
||||||
// result.push({ type: "wait", milliseconds: 1200 } as Action);
|
|
||||||
// }
|
|
||||||
// Fire-engine now handles wait times automatically, leaving the code here for now
|
|
||||||
result.push(action);
|
|
||||||
// Don't add a wait if the next action is a wait
|
|
||||||
// if (index === array.length - 1 || array[index + 1].type !== "wait") {
|
|
||||||
// result.push({ type: "wait", milliseconds: 1200 } as Action);
|
|
||||||
// }
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
return [action as Action];
|
|
||||||
}) ?? [] as Action[];
|
|
||||||
|
|
||||||
const response = await scrapWithFireEngine({
|
|
||||||
url,
|
|
||||||
...(engine === "chrome-cdp" ? ({
|
|
||||||
actions: [
|
|
||||||
...(pageOptions.waitFor ? [{
|
|
||||||
type: "wait" as const,
|
|
||||||
milliseconds: pageOptions.waitFor,
|
|
||||||
}] : []),
|
|
||||||
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
|
|
||||||
type: "screenshot" as const,
|
|
||||||
fullPage: !!pageOptions.fullPageScreenshot,
|
|
||||||
}] : []),
|
|
||||||
...processedActions,
|
|
||||||
],
|
|
||||||
}) : ({
|
|
||||||
waitFor: pageOptions.waitFor,
|
|
||||||
screenshot: pageOptions.screenshot,
|
|
||||||
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
|
||||||
})),
|
|
||||||
pageOptions: pageOptions,
|
|
||||||
headers: pageOptions.headers,
|
|
||||||
fireEngineOptions: {
|
|
||||||
engine: engine,
|
|
||||||
atsv: pageOptions.atsv,
|
|
||||||
disableJsDom: pageOptions.disableJsDom,
|
|
||||||
},
|
|
||||||
priority,
|
|
||||||
teamId,
|
|
||||||
});
|
|
||||||
scraperResponse.text = response.html;
|
|
||||||
if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
|
|
||||||
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
|
|
||||||
}
|
|
||||||
if (pageOptions.actions) {
|
|
||||||
scraperResponse.actions = {
|
|
||||||
screenshots: response.screenshots ?? [],
|
|
||||||
scrapes: response.scrapeActionContent ?? [],
|
|
||||||
};
|
|
||||||
}
|
|
||||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
|
||||||
scraperResponse.metadata.pageError = response.pageError;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case "scrapingBee":
|
|
||||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
|
||||||
const response = await scrapWithScrapingBee(
|
|
||||||
url,
|
|
||||||
"domcontentloaded",
|
|
||||||
pageOptions.fallback === false ? 7000 : 15000
|
|
||||||
);
|
|
||||||
scraperResponse.text = response.content;
|
|
||||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
|
||||||
scraperResponse.metadata.pageError = response.pageError;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case "playwright":
|
|
||||||
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
|
|
||||||
const response = await scrapWithPlaywright(
|
|
||||||
url,
|
|
||||||
pageOptions.waitFor,
|
|
||||||
pageOptions.headers
|
|
||||||
);
|
|
||||||
scraperResponse.text = response.content;
|
|
||||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
|
||||||
scraperResponse.metadata.pageError = response.pageError;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case "scrapingBeeLoad":
|
|
||||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
|
||||||
const response = await scrapWithScrapingBee(url, "networkidle2");
|
|
||||||
scraperResponse.text = response.content;
|
|
||||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
|
||||||
scraperResponse.metadata.pageError = response.pageError;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case "fetch":
|
|
||||||
const response = await scrapWithFetch(url);
|
|
||||||
scraperResponse.text = response.content;
|
|
||||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
|
||||||
scraperResponse.metadata.pageError = response.pageError;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
let customScrapedContent: FireEngineResponse | null = null;
|
|
||||||
|
|
||||||
// Check for custom scraping conditions
|
|
||||||
const customScraperResult = await handleCustomScraping(
|
|
||||||
scraperResponse.text,
|
|
||||||
url
|
|
||||||
);
|
|
||||||
|
|
||||||
if (customScraperResult) {
|
|
||||||
switch (customScraperResult.scraper) {
|
|
||||||
case "fire-engine":
|
|
||||||
customScrapedContent = await scrapWithFireEngine({
|
|
||||||
url: customScraperResult.url,
|
|
||||||
actions: customScraperResult.waitAfterLoad ? ([
|
|
||||||
{
|
|
||||||
type: "wait",
|
|
||||||
milliseconds: customScraperResult.waitAfterLoad,
|
|
||||||
}
|
|
||||||
]) : ([]),
|
|
||||||
pageOptions: customScraperResult.pageOptions,
|
|
||||||
});
|
|
||||||
break;
|
|
||||||
case "pdf":
|
|
||||||
const { content, pageStatusCode, pageError } =
|
|
||||||
await fetchAndProcessPdf(
|
|
||||||
customScraperResult.url,
|
|
||||||
pageOptions?.parsePDF
|
|
||||||
);
|
|
||||||
customScrapedContent = {
|
|
||||||
html: content,
|
|
||||||
pageStatusCode,
|
|
||||||
pageError,
|
|
||||||
};
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (customScrapedContent) {
|
|
||||||
scraperResponse.text = customScrapedContent.html;
|
|
||||||
}
|
|
||||||
//* TODO: add an optional to return markdown or structured/extracted content
|
|
||||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
|
||||||
let text = await parseMarkdown(cleanedHtml);
|
|
||||||
if (pageOptions.removeBase64Images) {
|
|
||||||
text = await removeBase64Images(text);
|
|
||||||
}
|
|
||||||
|
|
||||||
const insertedLogId = await logInsertPromise;
|
|
||||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
|
||||||
response_size: scraperResponse.text.length,
|
|
||||||
success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
|
|
||||||
error: scraperResponse.metadata.pageError,
|
|
||||||
response_code: scraperResponse.metadata.pageStatusCode,
|
|
||||||
time_taken: Date.now() - timer,
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
text,
|
|
||||||
html: cleanedHtml,
|
|
||||||
rawHtml: scraperResponse.text,
|
|
||||||
screenshot: scraperResponse.screenshot,
|
|
||||||
actions: scraperResponse.actions,
|
|
||||||
pageStatusCode: scraperResponse.metadata.pageStatusCode,
|
|
||||||
pageError: scraperResponse.metadata.pageError || undefined,
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
|
|
||||||
text: "",
|
|
||||||
html: "",
|
|
||||||
rawHtml: "",
|
|
||||||
screenshot: "",
|
|
||||||
actions: undefined,
|
|
||||||
pageStatusCode: 200,
|
|
||||||
pageError: undefined,
|
|
||||||
};
|
|
||||||
try {
|
|
||||||
let urlKey = urlToScrap;
|
|
||||||
try {
|
|
||||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
|
||||||
} catch (error) {
|
|
||||||
Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
|
|
||||||
}
|
|
||||||
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
|
|
||||||
const scrapersInOrder = getScrapingFallbackOrder(
|
|
||||||
defaultScraper,
|
|
||||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
|
||||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
|
||||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
|
|
||||||
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
|
|
||||||
);
|
|
||||||
|
|
||||||
for (const scraper of scrapersInOrder) {
|
|
||||||
// If exists text coming from crawler, use it
|
|
||||||
if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) {
|
|
||||||
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
|
|
||||||
text = await parseMarkdown(cleanedHtml);
|
|
||||||
html = cleanedHtml;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const attempt = await attemptScraping(urlToScrap, scraper);
|
|
||||||
text = attempt.text ?? "";
|
|
||||||
html = attempt.html ?? "";
|
|
||||||
rawHtml = attempt.rawHtml ?? "";
|
|
||||||
screenshot = attempt.screenshot ?? "";
|
|
||||||
actions = attempt.actions ?? undefined;
|
|
||||||
|
|
||||||
if (attempt.pageStatusCode) {
|
|
||||||
pageStatusCode = attempt.pageStatusCode;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (attempt.pageError && (attempt.pageStatusCode >= 400 || scrapersInOrder.indexOf(scraper) === scrapersInOrder.length - 1)) { // force pageError if it's the last scraper and it failed too
|
|
||||||
pageError = attempt.pageError;
|
|
||||||
|
|
||||||
if (attempt.pageStatusCode < 400 || !attempt.pageStatusCode) {
|
|
||||||
pageStatusCode = 500;
|
|
||||||
}
|
|
||||||
} else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
|
|
||||||
pageError = undefined;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
|
|
||||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) {
|
|
||||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
|
||||||
// if (nextScraperIndex < scrapersInOrder.length) {
|
|
||||||
// Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!text) {
|
|
||||||
throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
const soup = cheerio.load(rawHtml);
|
|
||||||
const metadata = extractMetadata(soup, urlToScrap);
|
|
||||||
|
|
||||||
let linksOnPage: string[] | undefined;
|
|
||||||
|
|
||||||
if (pageOptions.includeLinks) {
|
|
||||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
|
||||||
}
|
|
||||||
|
|
||||||
let document: Document = {
|
|
||||||
content: text,
|
|
||||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
|
||||||
rawHtml:
|
|
||||||
pageOptions.includeRawHtml ||
|
|
||||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
|
||||||
? rawHtml
|
|
||||||
: undefined,
|
|
||||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
|
||||||
actions,
|
|
||||||
metadata: {
|
|
||||||
...metadata,
|
|
||||||
...(screenshot && screenshot.length > 0 ? ({
|
|
||||||
screenshot,
|
|
||||||
}) : {}),
|
|
||||||
sourceURL: urlToScrap,
|
|
||||||
pageStatusCode: pageStatusCode,
|
|
||||||
pageError: pageError,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
return document;
|
|
||||||
} catch (error) {
|
|
||||||
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
|
|
||||||
ScrapeEvents.insert(jobId, {
|
|
||||||
type: "error",
|
|
||||||
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
|
|
||||||
stack: error.stack,
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
content: "",
|
|
||||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
|
|
||||||
html: "",
|
|
||||||
linksOnPage: pageOptions.includeLinks ? [] : undefined,
|
|
||||||
metadata: {
|
|
||||||
sourceURL: urlToScrap,
|
|
||||||
pageStatusCode: pageStatusCode,
|
|
||||||
pageError: pageError,
|
|
||||||
},
|
|
||||||
} as Document;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,9 +1,10 @@
|
||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
import { axiosTimeout } from "../../lib/timeout";
|
import { axiosTimeout } from "../../lib/timeout";
|
||||||
import { parseStringPromise } from "xml2js";
|
import { parseStringPromise } from "xml2js";
|
||||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
|
||||||
import { WebCrawler } from "./crawler";
|
import { WebCrawler } from "./crawler";
|
||||||
import { Logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
|
import { scrapeURL } from "../scrapeURL";
|
||||||
|
import { scrapeOptions } from "../../controllers/v1/types";
|
||||||
|
|
||||||
export async function getLinksFromSitemap(
|
export async function getLinksFromSitemap(
|
||||||
{
|
{
|
||||||
|
@ -17,17 +18,20 @@ export async function getLinksFromSitemap(
|
||||||
}
|
}
|
||||||
): Promise<string[]> {
|
): Promise<string[]> {
|
||||||
try {
|
try {
|
||||||
let content: string;
|
let content: string = "";
|
||||||
try {
|
try {
|
||||||
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
|
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
|
||||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||||
content = response.data;
|
content = response.data;
|
||||||
} else if (mode === 'fire-engine') {
|
} else if (mode === 'fire-engine') {
|
||||||
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"playwright" } });
|
const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });;
|
||||||
content = response.html;
|
if (!response.success) {
|
||||||
|
throw response.error;
|
||||||
|
}
|
||||||
|
content = response.document.rawHtml!;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
|
logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
|
||||||
|
|
||||||
return allUrls;
|
return allUrls;
|
||||||
}
|
}
|
||||||
|
@ -47,7 +51,7 @@ export async function getLinksFromSitemap(
|
||||||
allUrls.push(...validUrls);
|
allUrls.push(...validUrls);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
|
logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return allUrls;
|
return allUrls;
|
||||||
|
|
|
@ -1,15 +0,0 @@
|
||||||
import * as docxProcessor from "../docxProcessor";
|
|
||||||
|
|
||||||
// Integration suite for docxProcessor.fetchAndProcessDocx.
// The call below is given a remote https URL, so this presumably needs network access — confirm.
describe("DOCX Processing Module - Integration Test", () => {
|
|
||||||
it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
|
|
||||||
// Remove the key from the environment so the call runs without LLAMAPARSE_API_KEY, as the test title states.
delete process.env.LLAMAPARSE_API_KEY;
|
|
||||||
// The result is destructured into content, pageStatusCode and pageError.
const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
|
|
||||||
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
|
|
||||||
);
|
|
||||||
// The trimmed content must contain the agreement title.
expect(content.trim()).toContain(
|
|
||||||
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
|
|
||||||
);
|
|
||||||
// A successful run reports status 200 and leaves pageError undefined.
expect(pageStatusCode).toBe(200);
|
|
||||||
expect(pageError).toBeUndefined();
|
|
||||||
});
|
|
||||||
});
|
|
|
@ -1,128 +0,0 @@
|
||||||
import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
|
|
||||||
import cheerio from 'cheerio';
|
|
||||||
|
|
||||||
// Suite for parseTablesToMarkdown: each case awaits the function on an HTML string and
// compares the result with an exact expected string via toBe.
// NOTE(review): the multi-line template literals below are whitespace-sensitive; do not reformat them.
describe('parseTablesToMarkdown', () => {
|
|
||||||
// Header row plus two data rows: expects a header line, a `| --- |` divider line and one line per data row, wrapped in <div>.
it('converts a simple HTML table to Markdown', async () => {
|
|
||||||
const html = `
|
|
||||||
<table>
|
|
||||||
<tr><th>Header 1</th><th>Header 2</th></tr>
|
|
||||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
|
||||||
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
|
|
||||||
</table>
|
|
||||||
`;
|
|
||||||
const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Header row plus a single data row.
it('converts a table with a single row to Markdown', async () => {
|
|
||||||
const html = `
|
|
||||||
<table>
|
|
||||||
<tr><th>Header 1</th><th>Header 2</th></tr>
|
|
||||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
|
||||||
</table>
|
|
||||||
`;
|
|
||||||
const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |</div>`;
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
// One column: the header, divider and two data rows each hold a single cell.
it('converts a table with a single column to Markdown', async () => {
|
|
||||||
const html = `
|
|
||||||
<table>
|
|
||||||
<tr><th>Header 1</th></tr>
|
|
||||||
<tr><td>Row 1 Col 1</td></tr>
|
|
||||||
<tr><td>Row 2 Col 1</td></tr>
|
|
||||||
</table>
|
|
||||||
`;
|
|
||||||
const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |</div>`;
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
// One header cell and one data cell.
it('converts a table with a single cell to Markdown', async () => {
|
|
||||||
const html = `
|
|
||||||
<table>
|
|
||||||
<tr><th>Header 1</th></tr>
|
|
||||||
<tr><td>Row 1 Col 1</td></tr>
|
|
||||||
</table>
|
|
||||||
`;
|
|
||||||
const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |</div>`;
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Only <td> rows: the expected output has no `| --- |` divider line.
it('converts a table with no header to Markdown', async () => {
|
|
||||||
const html = `
|
|
||||||
<table>
|
|
||||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
|
||||||
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
|
|
||||||
</table>
|
|
||||||
`;
|
|
||||||
const expectedMarkdown = `<div>| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Empty <table>: the expected output is an empty <div>.
it('converts a table with no rows to Markdown', async () => {
|
|
||||||
const html = `
|
|
||||||
<table>
|
|
||||||
</table>
|
|
||||||
`;
|
|
||||||
const expectedMarkdown = `<div></div>`;
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
// A row with no cells also yields an empty <div>.
it('converts a table with no cells to Markdown', async () => {
|
|
||||||
const html = `
|
|
||||||
<table>
|
|
||||||
<tr></tr>
|
|
||||||
</table>
|
|
||||||
`;
|
|
||||||
const expectedMarkdown = `<div></div>`;
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
// A single empty <th> cell also yields an empty <div>.
it('converts a table with no columns to Markdown', async () => {
|
|
||||||
const html = `
|
|
||||||
<table>
|
|
||||||
<tr><th></th></tr>
|
|
||||||
</table>
|
|
||||||
`;
|
|
||||||
const expectedMarkdown = `<div></div>`;
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Empty input string: the expected output is an empty string.
it('converts a table with no table to Markdown', async () => {
|
|
||||||
const html = ``;
|
|
||||||
const expectedMarkdown = ``;
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Table nested in other markup: the surrounding <div>/<p> elements are expected unchanged,
// with only the <table> replaced by a <div> holding the Markdown rows.
it('converts a table inside of a bunch of html noise', async () => {
|
|
||||||
const html = `
|
|
||||||
<div>
|
|
||||||
<p>Some text before</p>
|
|
||||||
<table>
|
|
||||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
|
||||||
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
|
|
||||||
</table>
|
|
||||||
<p>Some text after</p>
|
|
||||||
</div>
|
|
||||||
`;
|
|
||||||
const expectedMarkdown = `<div>
|
|
||||||
<p>Some text before</p>
|
|
||||||
<div>| Row 1 Col 1 | Row 1 Col 2 |
|
|
||||||
| Row 2 Col 1 | Row 2 Col 2 |</div>
|
|
||||||
<p>Some text after</p>
|
|
||||||
</div>`;
|
|
||||||
|
|
||||||
const markdown = await parseTablesToMarkdown(html);
|
|
||||||
expect(markdown).toBe(expectedMarkdown);
|
|
||||||
});
|
|
||||||
|
|
||||||
});
|
|
|
@ -1,19 +0,0 @@
|
||||||
import * as pdfProcessor from '../pdfProcessor';
|
|
||||||
|
|
||||||
// Integration suite for pdfProcessor.fetchAndProcessPdf.
// Both cases pass remote https URLs, so they presumably need network access — confirm.
describe('PDF Processing Module - Integration Test', () => {
|
|
||||||
it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
|
|
||||||
// Remove the key from the environment so the call runs without LLAMAPARSE_API_KEY, as the test title states.
delete process.env.LLAMAPARSE_API_KEY;
|
|
||||||
// Second argument is true here; the next test's title refers to this flag as parsePDF.
const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
|
|
||||||
// The trimmed content must equal the document's text exactly.
expect(content.trim()).toEqual("Dummy PDF file");
|
|
||||||
expect(pageStatusCode).toEqual(200);
|
|
||||||
expect(pageError).toBeUndefined();
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
|
|
||||||
// With the flag set to false, the assertion below looks for a raw PDF object fragment in content.
const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/astro-ph/9301001.pdf', false);
|
|
||||||
expect(pageStatusCode).toBe(200);
|
|
||||||
expect(pageError).toBeUndefined();
|
|
||||||
expect(content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
|
|
||||||
// The trailing 60000 is this test's timeout in milliseconds.
}, 60000); // 60 seconds
|
|
||||||
|
|
||||||
});
|
|
File diff suppressed because one or more lines are too long
|
@ -1,127 +0,0 @@
|
||||||
import { Document } from "../../../../lib/entities";
|
|
||||||
import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths";
|
|
||||||
|
|
||||||
// Suite for the two helpers imported from ../replacePaths. Every case builds a one-element
// Document[] with metadata.sourceURL 'https://example.com' and identical content/markdown text,
// calls the helper, and compares the returned array with toEqual.
// NOTE(review): several literals below show an empty link target `()` where the test titles
// speak of data URLs; confirm the literals against the original file before relying on them.
describe('replacePaths', () => {
|
|
||||||
// Cases for replacePathsWithAbsolutePaths (Markdown links).
describe('replacePathsWithAbsolutePaths', () => {
|
|
||||||
// A root-relative link target is expected to be prefixed with the sourceURL origin.
it('should replace relative paths with absolute paths', () => {
|
|
||||||
const documents: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'This is a [link](/path/to/resource).',
|
|
||||||
markdown: 'This is a [link](/path/to/resource).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'This is a [link](https://example.com/path/to/resource).',
|
|
||||||
markdown: 'This is a [link](https://example.com/path/to/resource).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
|
|
||||||
// A link that already has a scheme and host must come back unchanged.
it('should not alter absolute URLs', () => {
|
|
||||||
const documents: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'This is an [external link](https://external.com/path).',
|
|
||||||
markdown: 'This is an [external link](https://external.com/path).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(documents); // Expect no change
|
|
||||||
});
|
|
||||||
|
|
||||||
// The image target shown here is empty; the input is expected back unchanged.
it('should not alter data URLs for images', () => {
|
|
||||||
const documents: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'This is an image: ![alt text]().',
|
|
||||||
markdown: 'This is an image: ![alt text]().'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(documents); // Expect no change
|
|
||||||
});
|
|
||||||
|
|
||||||
// Two relative links in one string: both are expected to be rewritten.
it('should handle multiple links and images correctly', () => {
|
|
||||||
const documents: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'Here are two links: [link1](/path1) and [link2](/path2).',
|
|
||||||
markdown: 'Here are two links: [link1](/path1) and [link2](/path2).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).',
|
|
||||||
markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Mixed input: only the relative link changes; the absolute link and the empty target stay as written.
it('should correctly handle a mix of absolute and relative paths', () => {
|
|
||||||
const documents: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().',
|
|
||||||
markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().',
|
|
||||||
markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
|
|
||||||
});
|
|
||||||
|
|
||||||
// Cases for replaceImgPathsWithAbsolutePaths (Markdown image syntax `![alt](target)`).
describe('replaceImgPathsWithAbsolutePaths', () => {
|
|
||||||
// A root-relative image target is expected to be prefixed with the sourceURL origin.
it('should replace relative image paths with absolute paths', () => {
|
|
||||||
const documents: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'Here is an image: ![alt text](/path/to/image.jpg).',
|
|
||||||
markdown: 'Here is an image: ![alt text](/path/to/image.jpg).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).',
|
|
||||||
markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
|
|
||||||
// NOTE(review): content and markdown differ in this fixture, and `base4` in the markdown
// literal looks like a typo for `base64` — confirm. The assertion only requires that the
// input comes back unchanged.
it('should not alter data:image URLs', () => {
|
|
||||||
const documents: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'An image with a data URL: ![alt text]().',
|
|
||||||
markdown: 'An image with a data URL: ![alt text](data:image/png;base4,ABC123==).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(documents); // Expect no change
|
|
||||||
});
|
|
||||||
|
|
||||||
// Three images: img1 and img3 have relative targets and are rewritten; img2's target is left as written.
it('should handle multiple images with a mix of data and relative URLs', () => {
|
|
||||||
const documents: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).',
|
|
||||||
markdown: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
|
||||||
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).',
|
|
||||||
markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).'
|
|
||||||
}];
|
|
||||||
|
|
||||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
|
||||||
expect(result).toEqual(expectedDocuments);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
|
|
@ -1,66 +0,0 @@
|
||||||
import { Logger } from '../../../../lib/logger';
|
|
||||||
import { isUrlBlocked } from '../blocklist';
|
|
||||||
|
|
||||||
// Suite for isUrlBlocked (imported from ../blocklist): each case iterates a list of URLs
// and asserts the boolean the function returns for every entry.
describe('isUrlBlocked', () => {
|
|
||||||
// Social-media hosts, some with a user or subreddit path: every entry must be reported as blocked.
it('should return true for blocked social media URLs', () => {
|
|
||||||
const blockedUrls = [
|
|
||||||
'https://www.facebook.com',
|
|
||||||
'https://twitter.com/someuser',
|
|
||||||
'https://instagram.com/someuser',
|
|
||||||
'https://www.linkedin.com/in/someuser',
|
|
||||||
'https://snapchat.com/someuser',
|
|
||||||
'https://tiktok.com/@someuser',
|
|
||||||
'https://reddit.com/r/somesubreddit',
|
|
||||||
'https://flickr.com/photos/someuser',
|
|
||||||
'https://whatsapp.com/someuser',
|
|
||||||
'https://wechat.com/someuser',
|
|
||||||
'https://telegram.org/someuser',
|
|
||||||
];
|
|
||||||
|
|
||||||
blockedUrls.forEach(url => {
|
|
||||||
// Log the offending URL first so a failing entry can be identified from the debug output.
if (!isUrlBlocked(url)) {
|
|
||||||
Logger.debug(`URL not blocked: ${url}`);
|
|
||||||
}
|
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
// Paths such as /privacy, /terms or /careers on listed hosts must not be blocked.
it('should return false for URLs containing allowed keywords', () => {
|
|
||||||
const allowedUrls = [
|
|
||||||
'https://www.facebook.com/privacy',
|
|
||||||
'https://twitter.com/terms',
|
|
||||||
'https://instagram.com/legal',
|
|
||||||
'https://www.linkedin.com/help',
|
|
||||||
'https://pinterest.com/about',
|
|
||||||
'https://snapchat.com/support',
|
|
||||||
'https://tiktok.com/contact',
|
|
||||||
'https://reddit.com/user-agreement',
|
|
||||||
'https://tumblr.com/policy',
|
|
||||||
'https://flickr.com/blog',
|
|
||||||
'https://whatsapp.com/press',
|
|
||||||
'https://wechat.com/careers',
|
|
||||||
'https://telegram.org/conditions',
|
|
||||||
'https://wix.com/careers',
|
|
||||||
];
|
|
||||||
|
|
||||||
allowedUrls.forEach(url => {
|
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
// Ordinary sites must not be blocked. Three entries have no scheme ('firecrawl.dev', 'amazon.com', 'wix.com').
it('should return false for non-blocked URLs', () => {
|
|
||||||
const nonBlockedUrls = [
|
|
||||||
'https://www.example.com',
|
|
||||||
'https://www.somewebsite.org',
|
|
||||||
'https://subdomain.example.com',
|
|
||||||
'firecrawl.dev',
|
|
||||||
'amazon.com',
|
|
||||||
'wix.com',
|
|
||||||
'https://wix.com'
|
|
||||||
];
|
|
||||||
|
|
||||||
nonBlockedUrls.forEach(url => {
|
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
|
|
@ -1,4 +1,4 @@
|
||||||
import { Logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
|
|
||||||
const socialMediaBlocklist = [
|
const socialMediaBlocklist = [
|
||||||
'facebook.com',
|
'facebook.com',
|
||||||
|
@ -68,7 +68,7 @@ export function isUrlBlocked(url: string): boolean {
|
||||||
return isBlocked;
|
return isBlocked;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
// If an error occurs (e.g., invalid URL), return false
|
// If an error occurs (e.g., invalid URL), return false
|
||||||
Logger.error(`Error parsing the following URL: ${url}`);
|
logger.error(`Error parsing the following URL: ${url}`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,198 +0,0 @@
|
||||||
/**
 * Per-site scraping overrides, keyed by host name.
 *
 * Each entry names the scraper to use (`defaultScraper`) and may add scraper
 * `params` and request `headers`. The repeated literals are produced by small
 * local factories so every entry still owns its own object instances.
 * NOTE(review): how keys are matched against a URL is decided by the consumer
 * of this table, which is not visible here.
 */
export const urlSpecificParams = (() => {
  // Desktop Chrome request headers shared verbatim by several entries.
  const chromeHeaders = () => ({
    "User-Agent":
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "sec-fetch-site": "same-origin",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    referer: "https://www.google.com/",
    "accept-language": "en-US,en;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  });

  // "Wait for network idle" params; only the extra wait (ms) varies per site.
  const networkIdleParams = (wait: number) => ({
    wait_browser: "networkidle2",
    block_resources: false,
    wait,
  });

  // fire-engine entry: network-idle wait plus the shared browser headers.
  const fireEngineWithHeaders = (wait: number) => ({
    defaultScraper: "fire-engine",
    params: networkIdleParams(wait),
    headers: chromeHeaders(),
  });

  // Plain fetch entry that only needs the shared browser headers.
  const fetchWithHeaders = () => ({
    defaultScraper: "fetch",
    headers: chromeHeaders(),
  });

  // fire-engine entry that only pins the underlying engine.
  const fireEnginePinned = (engine: string) => ({
    defaultScraper: "fire-engine",
    params: {
      fireEngineOptions: {
        engine,
      },
    },
  });

  return {
    "support.greenpay.me": fireEngineWithHeaders(2000),
    "docs.pdw.co": fireEngineWithHeaders(3000),
    "developers.notion.com": fireEngineWithHeaders(2000),
    "docs2.hubitat.com": fireEngineWithHeaders(2000),
    "scrapethissite.com": fetchWithHeaders(),
    "rsseau.fr": fetchWithHeaders(),
    "help.salesforce.com": fireEngineWithHeaders(2000),
    "ir.veeva.com": {
      defaultScraper: "fire-engine",
    },
    "eonhealth.com": {
      defaultScraper: "fire-engine",
      params: {
        fireEngineOptions: {
          mobileProxy: true,
          method: "get",
          engine: "request",
        },
      },
    },
    "notion.com": {
      defaultScraper: "fire-engine",
      params: {
        wait_browser: "networkidle2",
        block_resources: false,
        wait: 2000,
        engine: "playwright",
      },
    },
    "developer.apple.com": {
      defaultScraper: "fire-engine",
      params: {
        engine: "playwright",
        wait: 2000,
        fireEngineOptions: {
          blockMedia: false,
        },
      },
    },
    "amazon.com": fireEnginePinned("chrome-cdp"),
    "digikey.com": fireEnginePinned("tlsclient"),
    "zoopla.co.uk": fireEnginePinned("chrome-cdp"),
    "lorealparis.hu": fireEnginePinned("tlsclient"),
  };
})();
|
|
|
@ -1,79 +0,0 @@
|
||||||
import axios from "axios";
|
|
||||||
import fs from "fs";
|
|
||||||
import { createWriteStream } from "node:fs";
|
|
||||||
import path from "path";
|
|
||||||
import os from "os";
|
|
||||||
import mammoth from "mammoth";
|
|
||||||
import { Logger } from "../../../lib/logger";
|
|
||||||
|
|
||||||
/**
 * Downloads a DOCX document and returns its raw text.
 *
 * Never rejects: any failure is reported through `pageStatusCode` (500) and
 * `pageError`, with `content` set to an empty string.
 *
 * @param url - Location of the DOCX file to download.
 * @returns The extracted text plus the HTTP status / error of the download.
 */
export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
  let tempFilePath = '';
  let pageStatusCode = 200;
  let pageError = '';
  let content = '';

  try {
    const downloadResult = await downloadDocx(url);
    tempFilePath = downloadResult.tempFilePath;
    pageStatusCode = downloadResult.pageStatusCode;
    pageError = downloadResult.pageError;

    // downloadDocx reports a failed request with an empty path; there is
    // nothing on disk to convert in that case.
    if (tempFilePath) {
      content = await processDocxToText(tempFilePath);
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to fetch and process DOCX: ${message}`);
    pageStatusCode = 500;
    pageError = message;
    content = '';
  } finally {
    if (tempFilePath) {
      // Cleanup is best-effort: a failed unlink must not escape the finally
      // block and throw away the result computed above.
      try {
        fs.unlinkSync(tempFilePath);
      } catch (cleanupError) {
        Logger.error(`Failed to remove temporary DOCX file ${tempFilePath}: ${cleanupError}`);
      }
    }
  }

  return { content, pageStatusCode, pageError };
}
|
|
||||||
|
|
||||||
/**
 * Streams the document at `url` into a uniquely named file in the OS temp
 * directory.
 *
 * @param url - Location of the DOCX file.
 * @returns The temp file path with the response status. When the HTTP request
 *   itself fails, resolves with an empty path and status 500.
 * @throws Rejects with "Failed to write DOCX file to disk" when the body
 *   cannot be streamed to disk; the partial file is removed first.
 */
async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
  try {
    const response = await axios({
      url,
      method: "GET",
      responseType: "stream",
    });

    // Date.now() alone collides when two downloads start within the same
    // millisecond; add the pid and a random component.
    const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
    const tempFilePath = path.join(os.tmpdir(), `tempDocx-${uniqueSuffix}.docx`);
    const writer = createWriteStream(tempFilePath);

    response.data.pipe(writer);

    return new Promise((resolve, reject) => {
      let failed = false;
      const fail = () => {
        if (failed) {
          return;
        }
        failed = true;
        Logger.error('Failed to write DOCX file to disk');
        writer.destroy();
        // The caller never learns the path on failure, so remove the partial
        // file here (errors ignored: it may not exist yet).
        fs.unlink(tempFilePath, () => {});
        reject(new Error('Failed to write DOCX file to disk'));
      };

      // pageError stays undefined on an "OK" response, as before, so callers
      // observe the same value they always did.
      writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText !== "OK" ? response.statusText : undefined }));
      writer.on("error", fail);
      // pipe() does not forward source errors to the destination; without this
      // listener an aborted response would leave the promise pending forever.
      response.data.on("error", fail);
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to download DOCX: ${message}`);
    return { tempFilePath: "", pageStatusCode: 500, pageError: message };
  }
}
|
|
||||||
|
|
||||||
/**
 * Converts the DOCX file at `filePath` to plain text.
 *
 * @param filePath - Path of a DOCX file on disk.
 * @returns The extracted text, or "" when extraction throws (the error is logged).
 */
export async function processDocxToText(filePath: string): Promise<string> {
  let text = "";
  try {
    text = await extractTextFromDocx(filePath);
  } catch (err) {
    Logger.error(`Failed to process DOCX to text: ${err.message}`);
  }
  return text;
}
|
|
||||||
|
|
||||||
/**
 * Reads the raw text of a DOCX file through mammoth.
 *
 * @param filePath - Path of a DOCX file on disk.
 * @returns mammoth's raw-text value, or "" when mammoth throws (the error is logged).
 */
async function extractTextFromDocx(filePath: string): Promise<string> {
  try {
    const { value } = await mammoth.extractRawText({ path: filePath });
    return value;
  } catch (err) {
    Logger.error(`Failed to extract text from DOCX: ${err.message}`);
    return "";
  }
}
|
|
|
@ -1,42 +0,0 @@
|
||||||
/**
 * CSS selectors for page chrome that is not main content (navigation, footers,
 * sidebars, modals, ads, language pickers, social widgets, search, cookies).
 * Order is kept exactly as originally declared.
 */
export const excludeNonMainTags = [
  // structural elements and top/bottom bars
  "header", "footer", "nav", "aside",
  ".top", ".navbar", ".footer", ".bottom", "#footer",
  // sidebars
  ".sidebar", ".side", ".aside", "#sidebar",
  // modals and overlays
  ".modal", ".popup", "#modal", ".overlay",
  // advertising
  ".ad", ".ads", ".advert", "#ad",
  // language selectors
  ".lang-selector", ".language", "#language-selector",
  // social widgets
  ".social", ".social-media", ".social-links", "#social",
  // menus and breadcrumbs
  ".menu", ".navigation", "#nav", ".breadcrumbs", "#breadcrumbs",
  // search
  "#search-form", ".search", "#search",
  // share widgets
  ".share", "#share",
  // cookie banners
  ".cookie", "#cookie"
];
|
|
|
@ -1,89 +0,0 @@
|
||||||
import Anthropic from '@anthropic-ai/sdk';
|
|
||||||
import axios from 'axios';
|
|
||||||
import { Logger } from '../../../lib/logger';
|
|
||||||
|
|
||||||
/**
 * Asks a vision model for alt text describing an image, using the text around
 * the image as context.
 *
 * @param imageUrl - URL of the image to describe.
 * @param backText - First piece of surrounding text placed in the prompt.
 * @param frontText - Second piece of surrounding text placed in the prompt.
 * @param model - "claude-3-opus" selects Anthropic; any other value uses
 *   OpenAI's "gpt-4-turbo".
 * @returns The generated description, or "" when anything fails (missing API
 *   key, download error, API error); the failure is logged, never thrown.
 */
export async function getImageDescription(
  imageUrl: string,
  backText: string,
  frontText: string,
  model: string = "gpt-4-turbo"
): Promise<string> {
  try {
    const prompt = "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " +
      backText +
      " and the following text: " +
      frontText +
      ". Be super concise."

    switch (model) {
      case 'claude-3-opus': {
        if (!process.env.ANTHROPIC_API_KEY) {
          throw new Error("No Anthropic API key provided");
        }
        // Anthropic takes the image inline as base64, so download it first.
        const imageRequest = await axios.get(imageUrl, { responseType: 'arraybuffer' });
        // NOTE(review): the media type is sent as PNG regardless of what was
        // downloaded — confirm whether non-PNG images need their real type.
        const imageMediaType = 'image/png';
        const imageData = Buffer.from(imageRequest.data, 'binary').toString('base64');

        const anthropic = new Anthropic();
        const response = await anthropic.messages.create({
          model: "claude-3-opus-20240229",
          max_tokens: 1024,
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "image",
                  source: {
                    type: "base64",
                    media_type: imageMediaType,
                    data: imageData,
                  },
                },
                {
                  type: "text",
                  text: prompt
                }
              ],
            }
          ]
        });

        // The reply is a single message whose `content` is an array of blocks;
        // indexing the message itself (`response[0]`) always threw and made
        // this branch return "".
        const firstBlock = response.content[0];
        return firstBlock?.type === "text" ? firstBlock.text : "";
      }
      default: {
        if (!process.env.OPENAI_API_KEY) {
          throw new Error("No OpenAI API key provided");
        }

        // Loaded lazily so the OpenAI client is only required on this path.
        const { OpenAI } = require("openai");
        const openai = new OpenAI();

        const response = await openai.chat.completions.create({
          model: "gpt-4-turbo",
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: prompt,
                },
                {
                  type: "image_url",
                  image_url: {
                    url: imageUrl,
                  },
                },
              ],
            },
          ],
        });
        // `content` may be null; keep the declared string return type.
        return response.choices[0].message.content ?? "";
      }
    }
  } catch (error) {
    Logger.error(`Error generating image alt text: ${error}`);
    return "";
  }
}
|
|
|
@ -1,185 +0,0 @@
|
||||||
import { CheerioAPI } from "cheerio";
|
|
||||||
import { Logger } from "../../../lib/logger";
|
|
||||||
|
|
||||||
/**
 * Page metadata collected from `<title>`, `<html lang>` and `<meta>` tags.
 * Every field is optional; keys outside the named set are allowed through the
 * index signature.
 */
interface Metadata {
  // --- basic document metadata ---
  title?: string;
  description?: string;
  language?: string;
  keywords?: string;
  robots?: string;

  // --- Open Graph (og:*) ---
  ogTitle?: string;
  ogDescription?: string;
  ogUrl?: string;
  ogImage?: string;
  ogAudio?: string;
  ogDeterminer?: string;
  ogLocale?: string;
  /** One entry per `og:locale:alternate` tag. */
  ogLocaleAlternate?: string[];
  ogSiteName?: string;
  ogVideo?: string;

  // --- Dublin Core (dc.* / dcterms.*) ---
  dctermsCreated?: string;
  dcDateCreated?: string;
  dcDate?: string;
  dctermsType?: string;
  dcType?: string;
  dctermsAudience?: string;
  dctermsSubject?: string;
  dcSubject?: string;
  dcDescription?: string;
  dctermsKeywords?: string;

  // --- article:* ---
  modifiedTime?: string;
  publishedTime?: string;
  articleTag?: string;
  articleSection?: string;

  // --- fetch details ---
  sourceURL?: string;
  pageStatusCode?: number;
  pageError?: string;

  /** Any other meta tag, keyed by its `name`/`property`; repeated tags become arrays. */
  [key: string]: string | string[] | number | undefined;
}
|
|
||||||
|
|
||||||
/**
 * Collects page metadata from a parsed document.
 *
 * Named fields are read one after another; if a read throws, the error is
 * logged and the fields read so far are still returned. Every `<meta>` tag
 * with a `name`/`property` and a `content` is also copied under its raw key,
 * with repeated keys collapsed into an array. Raw keys are applied last, so
 * they win over a named field of the same key.
 *
 * @param soup - Cheerio handle for the loaded document.
 * @param url - Currently unused by this function.
 * @returns Only the fields that were found (truthy values).
 */
export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
  const metaContent = (selector: string): string | null =>
    soup(selector).attr("content") || null;

  // Field readers, listed in the order they are evaluated.
  const readers: Array<[string, () => string | string[] | null]> = [
    // TODO: remove this as it is redundant with the below implementation
    ["title", () => soup("title").text() || null],
    ["description", () => metaContent('meta[name="description"]')],
    ["language", () => soup("html").attr("lang") || null],
    ["keywords", () => metaContent('meta[name="keywords"]')],
    ["robots", () => metaContent('meta[name="robots"]')],
    ["ogTitle", () => metaContent('meta[property="og:title"]')],
    ["ogDescription", () => metaContent('meta[property="og:description"]')],
    ["ogUrl", () => metaContent('meta[property="og:url"]')],
    ["ogImage", () => metaContent('meta[property="og:image"]')],
    ["ogAudio", () => metaContent('meta[property="og:audio"]')],
    ["ogDeterminer", () => metaContent('meta[property="og:determiner"]')],
    ["ogLocale", () => metaContent('meta[property="og:locale"]')],
    [
      "ogLocaleAlternate",
      // .get() yields an array (possibly empty), which is truthy, so this
      // field is present whenever this reader is reached.
      () =>
        soup('meta[property="og:locale:alternate"]')
          .map((i, el) => soup(el).attr("content"))
          .get() || null,
    ],
    ["ogSiteName", () => metaContent('meta[property="og:site_name"]')],
    ["ogVideo", () => metaContent('meta[property="og:video"]')],
    ["articleSection", () => metaContent('meta[name="article:section"]')],
    ["articleTag", () => metaContent('meta[name="article:tag"]')],
    ["publishedTime", () => metaContent('meta[property="article:published_time"]')],
    ["modifiedTime", () => metaContent('meta[property="article:modified_time"]')],
    ["dctermsKeywords", () => metaContent('meta[name="dcterms.keywords"]')],
    ["dcDescription", () => metaContent('meta[name="dc.description"]')],
    ["dcSubject", () => metaContent('meta[name="dc.subject"]')],
    ["dctermsSubject", () => metaContent('meta[name="dcterms.subject"]')],
    ["dctermsAudience", () => metaContent('meta[name="dcterms.audience"]')],
    ["dcType", () => metaContent('meta[name="dc.type"]')],
    ["dctermsType", () => metaContent('meta[name="dcterms.type"]')],
    ["dcDate", () => metaContent('meta[name="dc.date"]')],
    ["dcDateCreated", () => metaContent('meta[name="dc.date.created"]')],
    ["dctermsCreated", () => metaContent('meta[name="dcterms.created"]')],
  ];

  // Key order of the returned object (differs from the read order above).
  // sourceURL, pageStatusCode and pageError are never populated here, so they
  // never appear in the result.
  const outputOrder = [
    "title", "description", "language", "keywords", "robots",
    "ogTitle", "ogDescription", "ogUrl", "ogImage", "ogAudio",
    "ogDeterminer", "ogLocale", "ogLocaleAlternate", "ogSiteName", "ogVideo",
    "dctermsCreated", "dcDateCreated", "dcDate", "dctermsType", "dcType",
    "dctermsAudience", "dctermsSubject", "dcSubject", "dcDescription",
    "dctermsKeywords", "modifiedTime", "publishedTime", "articleTag",
    "articleSection",
  ];

  const found: Record<string, string | string[] | null> = {};
  const customMetadata: Record<string, string | string[]> = {};

  try {
    for (const [field, read] of readers) {
      found[field] = read();
    }

    try {
      // Extract all meta tags for custom metadata
      soup("meta").each((i, elem) => {
        try {
          const tag = soup(elem);
          const name = tag.attr("name") || tag.attr("property");
          const content = tag.attr("content");
          if (!name || !content) {
            return;
          }

          const existing = customMetadata[name];
          if (existing === undefined) {
            customMetadata[name] = content;
          } else if (Array.isArray(existing)) {
            existing.push(content);
          } else {
            // second occurrence: promote the single value to a list
            customMetadata[name] = [existing as string, content];
          }
        } catch (error) {
          Logger.error(`Error extracting custom metadata (in): ${error}`);
        }
      });
    } catch (error) {
      Logger.error(`Error extracting custom metadata: ${error}`);
    }
  } catch (error) {
    Logger.error(`Error extracting metadata: ${error}`);
  }

  const named: Metadata = {};
  for (const field of outputOrder) {
    const value = found[field];
    if (value) {
      named[field] = value;
    }
  }

  return { ...named, ...customMetadata };
}
|
|
|
@ -1,74 +0,0 @@
|
||||||
import cheerio, { CheerioAPI } from "cheerio";
|
|
||||||
|
|
||||||
/** One pending substitution of a source range of the HTML by a markdown table. */
interface Replacement {
  /** Offset of the first character of the range. */
  start: number;
  /** Offset just past the last character of the range. */
  end: number;
  /** Markdown that takes the place of the range ('' for an empty table). */
  markdownTable: string;
}
|
|
||||||
|
|
||||||
/**
 * Replaces every top-level `<table>` in `html` with
 * `<div>…markdown table…</div>`, leaving the rest of the markup untouched.
 *
 * Tables that render to nothing but separators become an empty `<div></div>`.
 * Tables nested inside another table are not replaced on their own: their
 * range lies inside the outer table's range, and splicing both would corrupt
 * the offsets. Their rows are still rendered as part of the outer table.
 *
 * @param html - Source markup.
 * @returns The rewritten markup, trimmed.
 */
export const parseTablesToMarkdown = async (html: string): Promise<string> => {
  const soup: CheerioAPI = cheerio.load(html, {
    xmlMode: true,
    withStartIndices: true,
    withEndIndices: true
  });
  const replacements: Replacement[] = [];

  soup("table").each((_, tableElement) => {
    // Only outermost tables get their own replacement (see above).
    if (soup(tableElement).parents("table").length > 0) {
      return;
    }

    const { startIndex, endIndex } = tableElement;
    // The parser may leave positions unset; without them the range is unknown.
    if (startIndex == null || endIndex == null) {
      return;
    }

    let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement));
    const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0;
    if (isTableEmpty) {
      markdownTable = '';
    }

    // endIndex points at the last character of the closing tag; make it exclusive.
    replacements.push({ start: startIndex, end: endIndex + 1, markdownTable });
  });

  // Apply back-to-front so earlier offsets stay valid while splicing.
  replacements.sort((a, b) => b.start - a.start);

  let modifiedHtml: string = html;
  for (const { start, end, markdownTable } of replacements) {
    modifiedHtml = modifiedHtml.slice(0, start) + `<div>${markdownTable}</div>` + modifiedHtml.slice(end);
  }

  return modifiedHtml.trim();
};
|
|
||||||
|
|
||||||
/**
 * Renders the rows of a loaded table as markdown table lines.
 *
 * Each row with at least one cell becomes `| a | b |`. A `| --- |` divider is
 * added after the first row only when that row contains a `<th>` cell.
 *
 * @param tableSoup - Cheerio handle loaded with the table.
 * @returns The lines joined with newlines, trimmed.
 */
export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
  const lines: string[] = [];
  let sawHeaderCell = false;

  tableSoup("tr").each((rowIndex, rowEl) => {
    const cellEls = tableSoup(rowEl).find("th, td");

    const rendered: string = cellEls
      .map((_, cellEl) => {
        const cell = tableSoup(cellEl);
        const text = cell.text().trim();
        if (cell.is("th")) {
          sawHeaderCell = true;
        }
        return ` ${text} |`;
      })
      .get()
      .join("");

    if (rendered) {
      lines.push(`|${rendered}`);
    }
    if (rowIndex === 0 && sawHeaderCell) { // Header row
      lines.push(createMarkdownDividerRow(cellEls.length));
    }
  });

  return lines.join('\n').trim();
};
|
|
||||||
|
|
||||||
/**
 * Renders the cells of a loaded row as one markdown table line.
 *
 * @param rowSoup - Cheerio handle loaded with the row.
 * @param rowNumber - Not used by this function.
 * @returns `| a | b |` style line; just `|` when the row has no cells.
 */
export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
  const renderedCells: string[] = rowSoup("td, th")
    .map((_, cellEl) => ` ${rowSoup(cellEl).text().trim()} |`)
    .get();

  return `|${renderedCells.join("")}`;
};
|
|
||||||
|
|
||||||
/**
 * Builds the markdown header divider line for a table.
 *
 * @param cellCount - Number of columns.
 * @returns e.g. `| --- | --- |` for two columns.
 */
export function createMarkdownDividerRow(cellCount: number): string {
  const dashes: string[] = new Array(cellCount).fill('---');
  return `| ${dashes.join(' | ')} |`;
}
|
|
|
@ -1,140 +0,0 @@
|
||||||
import axios, { AxiosResponse } from "axios";
|
|
||||||
import fs from "fs/promises";
|
|
||||||
import { createReadStream, createWriteStream } from "node:fs";
|
|
||||||
import FormData from "form-data";
|
|
||||||
import dotenv from "dotenv";
|
|
||||||
import pdf from "pdf-parse";
|
|
||||||
import path from "path";
|
|
||||||
import os from "os";
|
|
||||||
import { axiosTimeout } from "../../../lib/timeout";
|
|
||||||
import { Logger } from "../../../lib/logger";
|
|
||||||
|
|
||||||
dotenv.config();
|
|
||||||
|
|
||||||
/**
 * Downloads a PDF and returns its text.
 *
 * Never rejects: on failure it resolves with empty content, status 500 and the
 * error message.
 *
 * @param url - Location of the PDF.
 * @param parsePDF - Forwarded to processPdfToText to choose parsing or a raw read.
 * @returns The extracted content plus the HTTP status / error of the download.
 */
export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  let tempFilePath: string | undefined;

  try {
    const download = await downloadPdf(url);
    tempFilePath = download.tempFilePath;
    const content = await processPdfToText(tempFilePath, parsePDF);
    return { content, pageStatusCode: download.pageStatusCode, pageError: download.pageError };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to fetch and process PDF: ${message}`);
    return { content: "", pageStatusCode: 500, pageError: message };
  } finally {
    // Clean up on every path: previously the file was leaked whenever
    // processing threw, and an unlink failure discarded the extracted content.
    if (tempFilePath) {
      try {
        await fs.unlink(tempFilePath);
      } catch (cleanupError) {
        Logger.error(`Failed to remove temporary PDF file ${tempFilePath}: ${cleanupError}`);
      }
    }
  }
}
|
|
||||||
|
|
||||||
/**
 * Streams the PDF at `url` into a uniquely named file in the OS temp directory.
 *
 * @param url - Location of the PDF.
 * @returns The temp file path, the HTTP status, and the status text when it is
 *   not "OK".
 * @throws Rejects when the request fails or the body cannot be streamed to
 *   disk; in the latter case the partial file is removed first.
 */
async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
  const response = await axios({
    url,
    method: "GET",
    responseType: "stream",
  });

  // Date.now() alone collides when two downloads start within the same
  // millisecond; add the pid and a random component.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
  const tempFilePath = path.join(os.tmpdir(), `tempPdf-${uniqueSuffix}.pdf`);
  const writer = createWriteStream(tempFilePath);

  response.data.pipe(writer);

  return new Promise((resolve, reject) => {
    const fail = (error: Error) => {
      writer.destroy();
      // The caller never learns the path on failure, so remove the partial
      // file here (errors ignored: it may not exist yet).
      fs.unlink(tempFilePath).catch(() => undefined);
      reject(error);
    };

    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText !== "OK" ? response.statusText : undefined }));
    writer.on("error", fail);
    // pipe() does not forward source errors to the destination; without this
    // listener an aborted response would leave the promise pending forever.
    response.data.on("error", fail);
  });
}
|
|
||||||
|
|
||||||
/**
 * Extracts text from the PDF at `filePath`.
 *
 * - `parsePDF` with `LLAMAPARSE_API_KEY` set: upload to LlamaParse and poll
 *   for the text result, falling back to local parsing when no result arrives.
 * - `parsePDF` without the key: local parsing only.
 * - `parsePDF` false: the file is read as UTF-8 without parsing.
 *
 * @param filePath - Path of the PDF on disk.
 * @param parsePDF - Whether to parse the document or return the raw read.
 * @returns The extracted text; "" when a handled failure occurs.
 * @throws Rejects only if local parsing fails after the LlamaParse
 *   upload/poll itself threw.
 */
export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
  let content = "";

  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
    Logger.debug("Processing pdf document w/ LlamaIndex");
    const apiKey = process.env.LLAMAPARSE_API_KEY;
    const headers = {
      Authorization: `Bearer ${apiKey}`,
    };
    const base_url = "https://api.cloud.llamaindex.ai/api/parsing";
    const fileType2 = "application/pdf";

    try {
      const formData = new FormData();
      formData.append("file", createReadStream(filePath), {
        filename: filePath,
        contentType: fileType2,
      });

      const uploadUrl = `${base_url}/upload`;
      const uploadResponse = await axios.post(uploadUrl, formData, {
        headers: {
          ...headers,
          ...formData.getHeaders(),
        },
      });

      const jobId = uploadResponse.data.id;
      const resultType = "text";
      const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`;

      let resultResponse: AxiosResponse | undefined;
      let attempt = 0;
      const maxAttempts = 10; // Maximum number of attempts
      let resultAvailable = false;
      while (attempt < maxAttempts && !resultAvailable) {
        try {
          resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
          if (resultResponse.status === 200) {
            resultAvailable = true; // Exit condition met
          } else {
            // If the status code is not 200, increment the attempt counter and wait
            attempt++;
            await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
          }
        } catch (error) {
          Logger.debug("Error fetching result w/ LlamaIndex");
          attempt++;
          if (attempt >= maxAttempts) {
            Logger.error("Max attempts reached, unable to fetch result.");
            break; // Exit the loop if max attempts are reached
          }
          await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
          // You may want to handle specific errors differently
        }
      }

      if (resultAvailable && resultResponse) {
        content = resultResponse.data[resultType];
      } else {
        // No usable remote result: parse locally. The remote read used to run
        // unconditionally after this, overwriting the fallback text and
        // dereferencing an undefined response when every attempt had thrown.
        try {
          content = await processPdf(filePath);
        } catch (error) {
          Logger.error(`Failed to process PDF: ${error}`);
          content = "";
        }
      }
    } catch (error) {
      Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
      content = await processPdf(filePath);
    }
  } else if (parsePDF) {
    try {
      content = await processPdf(filePath);
    } catch (error) {
      Logger.error(`Failed to process PDF: ${error}`);
      content = "";
    }
  } else {
    try {
      content = await fs.readFile(filePath, "utf-8");
    } catch (error) {
      Logger.error(`Failed to read PDF file: ${error}`);
      content = "";
    }
  }
  return content;
}
|
|
||||||
|
|
||||||
/**
 * Parses a PDF from disk with pdf-parse and returns its text.
 * Read and parse errors propagate to the caller unchanged.
 *
 * @param file - Path of the PDF on disk.
 */
async function processPdf(file: string) {
  const buffer = await fs.readFile(file);
  const { text } = await pdf(buffer);
  return text;
}
|
|
|
@ -1,82 +0,0 @@
|
||||||
import { AnyNode, Cheerio, load } from "cheerio";
|
|
||||||
import { PageOptions } from "../../../lib/entities";
|
|
||||||
import { excludeNonMainTags } from "./excludeTags";
|
|
||||||
|
|
||||||
/**
 * Cleans an HTML string according to `pageOptions` and returns the resulting markup.
 *
 * Steps, in order:
 *  1. If `onlyIncludeTags` is set, the document is replaced by clones of the
 *     elements matching those selectors.
 *  2. `script`, `style`, `noscript`, `meta` and `head` elements are removed.
 *  3. If `removeTags` is set, matching elements are removed. An entry wrapped in
 *     `*…*` is treated as a case-insensitive regular expression that is tested
 *     against each element's tag name and its `attr="value"` pairs; any other
 *     entry is used as a selector.
 *  4. If `onlyMainContent` is set, every selector in `excludeNonMainTags` is removed.
 *
 * Side effect: a string-valued `onlyIncludeTags` / `removeTags` on `pageOptions`
 * is replaced in place by a one-element array.
 */
export const removeUnwantedElements = (
  html: string,
  pageOptions: PageOptions,
) => {
  let $ = load(html);

  if (
    pageOptions.onlyIncludeTags &&
    pageOptions.onlyIncludeTags.length > 0 &&
    pageOptions.onlyIncludeTags[0] !== ""
  ) {
    // Normalise a single selector string into an array.
    if (typeof pageOptions.onlyIncludeTags === "string") {
      pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
    }

    if (pageOptions.onlyIncludeTags.length !== 0) {
      // Collect clones of every matching element under a fresh root.
      const container = load("<div></div>")("div");
      for (const selector of pageOptions.onlyIncludeTags) {
        $(selector).each((_, node) => {
          container.append($(node).clone());
        });
      }

      $ = load(container.html());
    }
  }

  $("script, style, noscript, meta, head").remove();

  if (
    pageOptions.removeTags &&
    pageOptions.removeTags.length > 0 &&
    pageOptions.removeTags[0] !== ""
  ) {
    // Normalise a single selector string into an array.
    if (typeof pageOptions.removeTags === "string") {
      pageOptions.removeTags = [pageOptions.removeTags];
    }

    if (Array.isArray(pageOptions.removeTags)) {
      for (const tag of pageOptions.removeTags) {
        const isWildcard = tag.startsWith("*") && tag.endsWith("*");
        if (!isWildcard) {
          $(tag).remove();
          continue;
        }

        // Everything between the surrounding asterisks is the pattern.
        const pattern = new RegExp(tag.slice(1, -1), "i");
        const alsoTestAsClass = tag.startsWith("*.");

        const matches: Cheerio<AnyNode> = $("*").filter((_, node) => {
          if (node.type !== "tag") {
            return false;
          }
          const attrs = node.attribs;
          const attrNames = Object.keys(attrs);
          return (
            pattern.test(node.name) ||
            attrNames.some((attr) => pattern.test(`${attr}="${attrs[attr]}"`)) ||
            // For "*." patterns every attribute value is additionally tested
            // as if it were a class attribute.
            (alsoTestAsClass &&
              attrNames.some((attr) => pattern.test(`class="${attrs[attr]}"`)))
          );
        });
        matches.remove();
      }
    }
  }

  if (pageOptions.onlyMainContent) {
    for (const selector of excludeNonMainTags) {
      $(selector).remove();
    }
  }

  return $.html();
};
|
|
|
@ -1,85 +0,0 @@
|
||||||
import { Logger } from "../../../lib/logger";
|
|
||||||
import { Document } from "../../../lib/entities";
|
|
||||||
|
|
||||||
/**
 * Rewrites relative link targets in each document's `content` to absolute URLs
 * and mirrors the result into `document.markdown`.
 *
 * Handles markdown links (`[text](url)`) and `href="url"` attributes. Markdown
 * images (`![alt](url)`) are matched but left untouched here.
 * URLs starting with `data:` or `http` are kept as-is; everything else is
 * resolved against the origin of `document.metadata.sourceURL`.
 *
 * @param documents Documents to rewrite; they are mutated in place.
 * @returns The same `documents` array. On failure the error is logged and the
 *          documents are returned in whatever state they reached.
 */
export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  try {
    documents.forEach((document) => {
      const baseUrl = new URL(document.metadata.sourceURL).origin;
      const paths =
        document.content.match(
          /!?\[.*?\]\(.*?\)|href=".+?"/g
        ) || [];

      paths.forEach((path: string) => {
        try {
          // Image is handled afterwards
          if (path.startsWith("!")) {
            return;
          }

          // Pull link text and URL out of the SAME match so that parentheses or
          // brackets inside the link text cannot be mistaken for the URL.
          const markdownMatch = path.match(/^(\[.*?\])\((.*?)\)$/);
          const hrefMatch = markdownMatch ? null : path.match(/^href="([^"]+)"$/);
          const rawUrl = markdownMatch ? markdownMatch[2] : hrefMatch ? hrefMatch[1] : undefined;
          if (rawUrl === undefined) {
            return;
          }

          let url = rawUrl;
          if (!url.startsWith("data:") && !url.startsWith("http")) {
            if (url.startsWith("/")) {
              url = url.substring(1);
            }
            url = new URL(url, baseUrl).toString();
          }

          const replacement = markdownMatch
            ? `${markdownMatch[1]}(${url})`
            : `href="${url}"`;

          // A replacer function keeps `$&`, `$1`, … inside the URL literal;
          // a plain replacement string would expand them.
          document.content = document.content.replace(path, () => replacement);
        } catch (error) {
          // Best effort: one bad link must not stop the others.
          Logger.debug(`Error replacing path with absolute path: ${error}`);
        }
      });
      document.markdown = document.content;
    });

    return documents;
  } catch (error) {
    Logger.debug(`Error replacing paths with absolute paths: ${error}`);
    return documents;
  }
};
|
|
||||||
|
|
||||||
/**
 * Rewrites relative markdown image URLs (`![alt](url)`) in each document's
 * `content` to absolute URLs and mirrors the result into `document.markdown`.
 *
 * URLs starting with `data:image` or `http` are kept as-is. A URL starting with
 * `/` is resolved against the origin of `document.metadata.sourceURL`; any other
 * relative URL is resolved against the full `sourceURL`.
 *
 * @param documents Documents to rewrite; they are mutated in place.
 * @returns The same `documents` array. On failure the error is logged and the
 *          documents are returned in whatever state they reached.
 */
export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  try {
    documents.forEach((document) => {
      const baseUrl = new URL(document.metadata.sourceURL).origin;
      const images =
        document.content.match(
          /!\[.*?\]\(.*?\)/g
        ) || [];

      images.forEach((image: string) => {
        // Extract alt text and URL from the SAME match so that parentheses or
        // brackets inside the alt text cannot be mistaken for the URL.
        const parts = image.match(/^!\[(.*?)\]\((.*?)\)$/);
        if (!parts) {
          return;
        }
        const altText = parts[1];
        let imageUrl = parts[2];

        if (!imageUrl.startsWith("data:image") && !imageUrl.startsWith("http")) {
          imageUrl = imageUrl.startsWith("/")
            ? new URL(imageUrl.substring(1), baseUrl).toString()
            : new URL(imageUrl, document.metadata.sourceURL).toString();
        }

        // A replacer function keeps `$&`, `$1`, … inside the URL or alt text
        // literal; a plain replacement string would expand them.
        document.content = document.content.replace(
          image,
          () => `![${altText}](${imageUrl})`
        );
      });
      document.markdown = document.content;
    });

    return documents;
  } catch (error) {
    Logger.error(`Error replacing img paths with absolute paths: ${error}`);
    return documents;
  }
};
|
|
|
@ -1,59 +0,0 @@
|
||||||
import axios from "axios";
|
|
||||||
import * as cheerio from "cheerio";
|
|
||||||
import { Logger } from "../../../lib/logger";
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Fetches `urlToScrap` with a plain axios GET (15 s timeout).
 *
 * @returns The response body, or `null` when the body is empty/falsy or the
 *          request throws. Failures are logged at debug level, never thrown.
 */
export async function attemptScrapWithRequests(
  urlToScrap: string
): Promise<string | null> {
  try {
    const { data } = await axios.get(urlToScrap, { timeout: 15000 });
    if (data) {
      return data;
    }

    Logger.debug("Failed normal requests as well");
    return null;
  } catch (error) {
    Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
    return null;
  }
}
|
|
||||||
|
|
||||||
/**
 * Removes every NUL (`\u0000`) character from `text`.
 *
 * The previous implementation used `String.prototype.replace` with a string
 * pattern, which only removes the first occurrence.
 */
export function sanitizeText(text: string): string {
  return text.split("\u0000").join("");
}
|
|
||||||
|
|
||||||
/**
 * Collects the `href` of every `<a>` element in `html`.
 *
 * - `http://` / `https://` and `mailto:` hrefs are kept verbatim.
 * - Fragment-only hrefs (starting with `#`) are dropped.
 * - Everything else is resolved against `baseUrl`.
 *
 * An href that cannot be resolved is reported via `console.error` and skipped.
 *
 * @returns The links in document order, without duplicates.
 */
export function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  // A Set keeps first-seen order while dropping duplicates.
  const found = new Set<string>();

  $('a').each((_, anchor) => {
    const href = $(anchor).attr('href');
    if (!href) {
      return;
    }

    try {
      const isAbsoluteHttp = href.startsWith('http://') || href.startsWith('https://');
      if (isAbsoluteHttp || href.startsWith('mailto:')) {
        // Absolute URLs and mailto: links are added as is
        found.add(href);
      } else if (!href.startsWith('#')) {
        // Relative URL (with or without a leading '/'), resolve against the base URL
        found.add(new URL(href, baseUrl).href);
      }
      // Fragment-only links (#) are ignored
    } catch (error) {
      // Log the error and continue
      console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
    }
  });

  return [...found];
}
|
|
25
apps/api/src/scraper/scrapeURL/README.md
Normal file
25
apps/api/src/scraper/scrapeURL/README.md
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
# `scrapeURL`
|
||||||
|
New URL scraper for Firecrawl
|
||||||
|
|
||||||
|
## Signal flow
|
||||||
|
```mermaid
|
||||||
|
flowchart TD;
|
||||||
|
scrapeURL-.->buildFallbackList;
|
||||||
|
buildFallbackList-.->scrapeURLWithEngine;
|
||||||
|
scrapeURLWithEngine-.->parseMarkdown;
|
||||||
|
parseMarkdown-.->wasScrapeSuccessful{{Was scrape successful?}};
|
||||||
|
wasScrapeSuccessful-."No".->areEnginesLeft{{Are there engines left to try?}};
|
||||||
|
areEnginesLeft-."Yes, try next engine".->scrapeURLWithEngine;
|
||||||
|
areEnginesLeft-."No".->NoEnginesLeftError[/NoEnginesLeftError/]
|
||||||
|
wasScrapeSuccessful-."Yes".->asd;
|
||||||
|
```
|
||||||
|
|
||||||
|
## Differences from `WebScraperDataProvider`
|
||||||
|
- The job of `WebScraperDataProvider.validateInitialUrl` has been delegated to the zod layer above `scrapeUrl`.
|
||||||
|
- `WebScraperDataProvider.mode` has no equivalent, only `scrape_url` is supported.
|
||||||
|
- You may no longer specify multiple URLs.
|
||||||
|
- Built on `v1` definitions, instead of `v0`.
|
||||||
|
- PDFs are now converted straight to markdown using LlamaParse, instead of converting to just plaintext.
|
||||||
|
- DOCXs are now converted straight to HTML (and then later to markdown) using mammoth, instead of converting to just plaintext.
|
||||||
|
- Uses the new JSON Schema OpenAI API -- schema failures with LLM Extract will be practically non-existent.
|
||||||
|
|
15
apps/api/src/scraper/scrapeURL/engines/docx/index.ts
Normal file
15
apps/api/src/scraper/scrapeURL/engines/docx/index.ts
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
import { Meta } from "../..";
|
||||||
|
import { EngineScrapeResult } from "..";
|
||||||
|
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||||
|
import mammoth from "mammoth";
|
||||||
|
|
||||||
|
/**
 * Engine handler for DOCX documents.
 *
 * Downloads `meta.url` via `downloadFile`, then converts the downloaded temp
 * file to HTML with mammoth. The result's `url` and `statusCode` come from the
 * download response.
 */
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
  const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
  const converted = await mammoth.convertToHtml({ path: tempFilePath });

  return {
    url: response.url,
    statusCode: response.status,

    html: converted.value,
  };
}
|
28
apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
Normal file
28
apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
import { EngineScrapeResult } from "..";
|
||||||
|
import { Meta } from "../..";
|
||||||
|
import { TimeoutError } from "../../error";
|
||||||
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||||
|
|
||||||
|
/**
 * Engine handler that scrapes `meta.url` with the global `fetch`.
 *
 * The request (up to the point where response headers are available) is raced
 * against a 20 s timer. If the timer wins, a `TimeoutError` is thrown and the
 * in-flight request is aborted. If the request wins, the timer is cleared so it
 * does not linger for the rest of the timeout period.
 *
 * After the response arrives, its headers are handed to `specialtyScrapeCheck`
 * before the body is read.
 *
 * @throws TimeoutError when no response arrives within the timeout.
 */
export async function scrapeURLWithFetch(meta: Meta): Promise<EngineScrapeResult> {
  const timeout = 20000;

  const controller = new AbortController();
  let timer: ReturnType<typeof setTimeout> | undefined;

  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      // Reject first so the race settles with TimeoutError rather than the
      // AbortError produced by aborting the request.
      reject(new TimeoutError("Fetch was unable to scrape the page before timing out", { cause: { timeout } }));
      controller.abort();
    }, timeout);
  });

  let response: Response;
  try {
    response = await Promise.race([
      fetch(meta.url, {
        redirect: "follow",
        headers: meta.options.headers,
        signal: controller.signal,
      }),
      timeoutPromise,
    ]);
  } finally {
    // Always release the timer, whichever side won the race.
    clearTimeout(timer);
  }

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), Object.fromEntries(response.headers as any));

  return {
    url: response.url,
    html: await response.text(),
    statusCode: response.status,
    // TODO: error?
  };
}
|
|
@ -0,0 +1,107 @@
|
||||||
|
import { Logger } from "winston";
|
||||||
|
import * as Sentry from "@sentry/node";
|
||||||
|
import { z } from "zod";
|
||||||
|
|
||||||
|
import { robustFetch } from "../../lib/fetch";
|
||||||
|
import { EngineError } from "../../error";
|
||||||
|
|
||||||
|
/** Shape of a check-status response for a job that finished successfully. */
const successSchema = z.object({
  jobId: z.string(),
  state: z.literal("completed"),
  processing: z.literal(false),

  // timeTaken: z.number(),
  content: z.string(),
  url: z.string().optional(),

  pageStatusCode: z.number(),
  pageError: z.string().optional(),

  // TODO: this needs to be non-optional, might need fixes on f-e side to ensure reliability
  responseHeaders: z.record(z.string(), z.string()).optional(),

  // timeTakenCookie: z.number().optional(),
  // timeTakenRequest: z.number().optional(),

  // legacy: playwright only
  screenshot: z.string().optional(),

  // new: actions
  screenshots: z.array(z.string()).optional(),
  actionContent: z
    .array(
      z.object({
        url: z.string(),
        html: z.string(),
      }),
    )
    .optional(),
});

/** Parsed payload of a successfully completed fire-engine job. */
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;

/** Shape of a check-status response for a job that has not finished yet. */
const processingSchema = z.object({
  jobId: z.string(),
  state: z.enum(["delayed", "active", "waiting", "waiting-children", "unknown"]),
  processing: z.boolean(),
});

/** Shape of a check-status response for a job that failed. */
const failedSchema = z.object({
  jobId: z.string(),
  state: z.literal("failed"),
  processing: z.literal(false),
  error: z.string(),
});
|
||||||
|
|
||||||
|
/**
 * Thrown by `fireEngineCheckStatus` when the job exists but has not finished.
 * The job id is available as `error.cause.jobId`.
 */
export class StillProcessingError extends Error {
  constructor(jobId: string) {
    super("Job is still under processing", { cause: { jobId } });
    // Give the error a distinguishable name in logs and stack traces.
    this.name = "StillProcessingError";
  }
}
|
||||||
|
|
||||||
|
/**
 * Queries fire-engine for the state of a scrape job
 * (`GET ${FIRE_ENGINE_BETA_URL}/scrape/${jobId}`) inside a Sentry span.
 *
 * The response is matched against the success, processing and failed schemas,
 * in that order.
 *
 * @returns The parsed success payload when the job has completed.
 * @throws StillProcessingError when the response matches the processing schema.
 * @throws EngineError when the response matches the failed schema.
 * @throws Error when the response matches none of the schemas.
 */
export async function fireEngineCheckStatus(logger: Logger, jobId: string): Promise<FireEngineCheckStatusSuccess> {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  const status = await Sentry.startSpan(
    {
      name: "fire-engine: Check status",
      attributes: {
        jobId,
      },
    },
    async (span) => {
      // Propagate the trace to fire-engine only when Sentry is active.
      const tracingHeaders = Sentry.isInitialized()
        ? {
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }
        : {};

      return await robustFetch({
        url: `${fireEngineURL}/scrape/${jobId}`,
        method: "GET",
        logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }),
        headers: {
          ...tracingHeaders,
        },
      });
    },
  );

  const asSuccess = successSchema.safeParse(status);
  if (asSuccess.success) {
    logger.debug("Scrape succeeded!", { jobId });
    return asSuccess.data;
  }

  if (processingSchema.safeParse(status).success) {
    logger.debug("Scrape is still processing", { jobId });
    throw new StillProcessingError(jobId);
  }

  if (failedSchema.safeParse(status).success) {
    logger.debug("Scrape job failed", { status, jobId });
    throw new EngineError("Scrape job failed", {
      cause: {
        status, jobId
      }
    });
  }

  logger.debug("Check status returned response not matched by any schema", { status, jobId });
  throw new Error("Check status returned response not matched by any schema", {
    cause: {
      status, jobId
    }
  });
}
|
33
apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts
Normal file
33
apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import { Logger } from "winston";
|
||||||
|
import * as Sentry from "@sentry/node";
|
||||||
|
|
||||||
|
import { robustFetch } from "../../lib/fetch";
|
||||||
|
|
||||||
|
/**
 * Asks fire-engine to delete a scrape job
 * (`DELETE ${FIRE_ENGINE_BETA_URL}/scrape/${jobId}`) inside a Sentry span.
 *
 * The request is sent with `ignoreResponse` and `ignoreFailure` set; the
 * outcome of the deletion is not inspected.
 */
export async function fireEngineDelete(logger: Logger, jobId: string) {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  await Sentry.startSpan(
    {
      name: "fire-engine: Delete scrape",
      attributes: {
        jobId,
      },
    },
    async (span) => {
      // Propagate the trace to fire-engine only when Sentry is active.
      const tracingHeaders = Sentry.isInitialized()
        ? {
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }
        : {};

      await robustFetch({
        url: `${fireEngineURL}/scrape/${jobId}`,
        method: "DELETE",
        headers: {
          ...tracingHeaders,
        },
        ignoreResponse: true,
        ignoreFailure: true,
        logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
      });
    },
  );

  // We do not care whether this fails or not.
}
|
198
apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
Normal file
198
apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
Normal file
|
@ -0,0 +1,198 @@
|
||||||
|
import { Logger } from "winston";
|
||||||
|
import { Meta } from "../..";
|
||||||
|
import { fireEngineScrape, FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, FireEngineScrapeRequestPlaywright, FireEngineScrapeRequestTLSClient } from "./scrape";
|
||||||
|
import { EngineScrapeResult } from "..";
|
||||||
|
import { fireEngineCheckStatus, FireEngineCheckStatusSuccess, StillProcessingError } from "./checkStatus";
|
||||||
|
import { EngineError, TimeoutError } from "../../error";
|
||||||
|
import * as Sentry from "@sentry/node";
|
||||||
|
import { Action } from "../../../../lib/entities";
|
||||||
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||||
|
|
||||||
|
/** Default polling budget for a fire-engine job, in milliseconds. */
const defaultTimeout = 20000;

/**
 * Submits a scrape request to fire-engine and polls its status until the job
 * completes, fails, or the budget runs out.
 *
 * This function does not take `Meta` on purpose. It may not access any
 * meta values to construct the request -- that must be done by the
 * `scrapeURLWithFireEngine*` functions.
 *
 * Polling waits 250 ms between attempts. A successful status check returns
 * immediately, without waiting out another polling interval.
 *
 * @param logger  Logger used for this call and passed (as children) to the helpers.
 * @param request Fully built fire-engine request.
 * @param timeout Maximum time in milliseconds to keep polling after submission.
 * @returns The success payload reported by `fireEngineCheckStatus`.
 * @throws EngineError  when fire-engine reports the job as failed.
 * @throws TimeoutError when the job has not completed within `timeout`.
 * @throws Error        after 3 unexpected status-check errors (see `cause.errors`).
 */
async function performFireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient>(
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
  timeout = defaultTimeout,
): Promise<FireEngineCheckStatusSuccess> {
  const scrape = await fireEngineScrape(logger.child({ method: "fireEngineScrape" }), request);

  const startTime = Date.now();
  const errorLimit = 3;
  const errors: any[] = [];

  while (true) {
    if (errors.length >= errorLimit) {
      logger.error("Error limit hit.", { errors });
      throw new Error("Error limit hit. See e.cause.errors for errors.", { cause: { errors } });
    }

    if (Date.now() - startTime > timeout) {
      logger.info("Fire-engine was unable to scrape the page before timing out.", { errors, timeout });
      throw new TimeoutError("Fire-engine was unable to scrape the page before timing out", { cause: { errors, timeout } });
    }

    try {
      // Return as soon as the job is done instead of sleeping once more.
      return await fireEngineCheckStatus(logger.child({ method: "fireEngineCheckStatus" }), scrape.jobId);
    } catch (error) {
      if (error instanceof StillProcessingError) {
        logger.debug("Scrape is still processing...");
      } else if (error instanceof EngineError) {
        logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId });
        throw error;
      } else {
        // Unexpected failure: report it and count it towards the error limit.
        Sentry.captureException(error);
        errors.push(error);
        logger.debug(`An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`, { error, jobId: scrape.jobId });
      }
    }

    await new Promise((resolve) => setTimeout(resolve, 250));
  }
}
|
||||||
|
|
||||||
|
/**
 * Engine handler for fire-engine's `chrome-cdp` engine.
 *
 * The `waitFor` option and the screenshot formats are translated into actions
 * placed ahead of the caller's own actions. After the scrape, when a screenshot
 * format was requested, the first entry of `response.screenshots` is moved into
 * `response.screenshot`.
 *
 * Falls back to `meta.url` (with a warning) when fire-engine does not report
 * the final URL.
 */
export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<EngineScrapeResult> {
  const screenshotRequested = () =>
    meta.options.formats.includes("screenshot") || meta.options.formats.includes("screenshot@fullPage");

  const actions: Action[] = [];

  // Transform waitFor option into an action (unsupported by chrome-cdp)
  if (meta.options.waitFor !== 0) {
    actions.push({
      type: "wait" as const,
      milliseconds: meta.options.waitFor,
    });
  }

  // Transform screenshot format into an action (unsupported by chrome-cdp)
  if (screenshotRequested()) {
    actions.push({
      type: "screenshot" as const,
      fullPage: meta.options.formats.includes("screenshot@fullPage"),
    });
  }

  // Include specified actions
  actions.push(...(meta.options.actions ?? []));

  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = {
    url: meta.url,
    engine: "chrome-cdp",
    instantReturn: true,
    skipTlsVerification: meta.options.skipTlsVerification,
    headers: meta.options.headers,
    ...(actions.length > 0 ? { actions } : {}),
    priority: meta.internalOptions.priority,
    geolocation: meta.options.geolocation,
    mobile: meta.options.mobile,
    // TODO: scrollXPaths
  };

  const response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);

  if (screenshotRequested()) {
    meta.logger.debug("Transforming screenshots from actions into screenshot field", { screenshots: response.screenshots });
    // Move the first screenshot into the `screenshot` field; the remaining
    // entries stay in `screenshots`.
    const shots = response.screenshots ?? [];
    response.screenshot = shots.shift();
    meta.logger.debug("Screenshot transformation done", { screenshots: response.screenshots, screenshot: response.screenshot });
  }

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  const actionResults =
    actions.length > 0
      ? {
          actions: {
            screenshots: response.screenshots ?? [],
            scrapes: response.actionContent ?? [],
          },
        }
      : {};

  return {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,

    screenshot: response.screenshot,
    ...actionResults,
  };
}
|
||||||
|
|
||||||
|
/**
 * Engine handler for fire-engine's `playwright` engine.
 *
 * Screenshot formats and `waitFor` are passed through as native request fields.
 * When the response carries at least one entry in `screenshots`, the first one
 * is returned as `screenshot`.
 *
 * Falls back to `meta.url` (with a warning) when fire-engine does not report
 * the final URL.
 */
export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<EngineScrapeResult> {
  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = {
    url: meta.url,
    engine: "playwright",
    instantReturn: true,

    headers: meta.options.headers,
    priority: meta.internalOptions.priority,
    screenshot: meta.options.formats.includes("screenshot"),
    fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
    wait: meta.options.waitFor,
    geolocation: meta.options.geolocation,
  };

  const response = await performFireEngineScrape(
    // Label the child logger with this handler's own name (was copy-pasted from the chrome-cdp handler).
    meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  return {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,

    ...(response.screenshots !== undefined && response.screenshots.length > 0 ? ({
      screenshot: response.screenshots[0],
    }) : {}),
  };
}
|
||||||
|
|
||||||
|
/**
 * Engine handler for fire-engine's `tlsclient` engine.
 *
 * Passes the `atsv` and `v0DisableJsDom` internal options through to the
 * request (as `atsv` and `disableJsDom`).
 *
 * Falls back to `meta.url` (with a warning) when fire-engine does not report
 * the final URL.
 */
export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<EngineScrapeResult> {
  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = {
    url: meta.url,
    engine: "tlsclient",
    instantReturn: true,

    headers: meta.options.headers,
    priority: meta.internalOptions.priority,

    atsv: meta.internalOptions.atsv,
    geolocation: meta.options.geolocation,
    disableJsDom: meta.internalOptions.v0DisableJsDom,
  };

  const response = await performFireEngineScrape(
    // Label the child logger with this handler's own name (was copy-pasted from the chrome-cdp handler).
    meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  return {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,
  };
}
|
94
apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts
Normal file
94
apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
import { Logger } from "winston";
|
||||||
|
import * as Sentry from "@sentry/node";
|
||||||
|
import { z } from "zod";
|
||||||
|
|
||||||
|
import { Action } from "../../../../lib/entities";
|
||||||
|
import { robustFetch } from "../../lib/fetch";
|
||||||
|
|
||||||
|
/** Request fields shared by every fire-engine engine. */
export type FireEngineScrapeRequestCommon = {
  /** URL to scrape. */
  url: string;

  /** Extra request headers, keyed by header name. */
  headers?: Record<string, string>;

  blockMedia?: boolean; // default: true
  blockAds?: boolean; // default: true
  // pageOptions?: any; // unused, .scrollXPaths is considered on FE side

  // useProxy?: boolean; // unused, default: true
  // customProxy?: string; // unused

  // disableSmartWaitCache?: boolean; // unused, default: false
  // skipDnsCheck?: boolean; // unused, default: false

  priority?: number; // default: 1
  // team_id?: string; // unused
  logRequest?: boolean; // default: true
  instantReturn?: boolean; // default: false
  geolocation?: {
    country?: string;
    languages?: string[];
  };
};
|
||||||
|
|
||||||
|
/** Request fields specific to the `chrome-cdp` engine. */
export type FireEngineScrapeRequestChromeCDP = {
  engine: "chrome-cdp";
  skipTlsVerification?: boolean;
  /** Actions to run on the page. */
  actions?: Action[];
  blockMedia?: true; // cannot be false
  mobile?: boolean;
};
|
||||||
|
|
||||||
|
/** Request fields specific to the `playwright` engine. */
export type FireEngineScrapeRequestPlaywright = {
  engine: "playwright";
  blockAds?: boolean; // default: true

  // mutually exclusive, default: false
  screenshot?: boolean;
  fullPageScreenshot?: boolean;

  wait?: number; // default: 0
};

/** Request fields specific to the `tlsclient` engine. */
export type FireEngineScrapeRequestTLSClient = {
  engine: "tlsclient";
  atsv?: boolean; // v0 only, default: false
  disableJsDom?: boolean; // v0 only, default: false
  // blockAds?: boolean; // default: true
};
|
||||||
|
|
||||||
|
/** Shape of fire-engine's response to a scrape submission. */
const schema = z.object({
  jobId: z.string(),
  processing: z.boolean(),
});

/**
 * Submits a scrape request to fire-engine
 * (`POST ${FIRE_ENGINE_BETA_URL}/scrape`) inside a Sentry span.
 *
 * The request is sent through `robustFetch` with `tryCount: 3` and the
 * response schema above.
 *
 * @param logger  Logger; a child logger is handed to `robustFetch`.
 * @param request Request body, sent as-is.
 * @returns The response as returned by `robustFetch` (job id and processing flag).
 */
export async function fireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient> (
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
): Promise<z.infer<typeof schema>> {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  // TODO: retries

  return await Sentry.startSpan(
    {
      name: "fire-engine: Scrape",
      attributes: {
        url: request.url,
      },
    },
    async (span) => {
      // Propagate the trace to fire-engine only when Sentry is active.
      const tracingHeaders = Sentry.isInitialized()
        ? {
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }
        : {};

      return await robustFetch({
        url: `${fireEngineURL}/scrape`,
        method: "POST",
        headers: {
          ...tracingHeaders,
        },
        body: request,
        logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
        schema,
        tryCount: 3,
      });
    },
  );
}
|
295
apps/api/src/scraper/scrapeURL/engines/index.ts
Normal file
295
apps/api/src/scraper/scrapeURL/engines/index.ts
Normal file
|
@ -0,0 +1,295 @@
|
||||||
|
import { ScrapeActionContent } from "../../../lib/entities";
|
||||||
|
import { Meta } from "..";
|
||||||
|
import { scrapeDOCX } from "./docx";
|
||||||
|
import { scrapeURLWithFireEngineChromeCDP, scrapeURLWithFireEnginePlaywright, scrapeURLWithFireEngineTLSClient } from "./fire-engine";
|
||||||
|
import { scrapePDF } from "./pdf";
|
||||||
|
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
||||||
|
import { scrapeURLWithFetch } from "./fetch";
|
||||||
|
import { scrapeURLWithPlaywright } from "./playwright";
|
||||||
|
|
||||||
|
/** Identifier of every scraping engine known to scrapeURL. */
export type Engine =
  | "fire-engine;chrome-cdp"
  | "fire-engine;playwright"
  | "fire-engine;tlsclient"
  | "scrapingbee"
  | "scrapingbeeLoad"
  | "playwright"
  | "fetch"
  | "pdf"
  | "docx";

// An optional engine family is enabled when its environment variable is set to a non-empty value.
const useScrapingBee = (process.env.SCRAPING_BEE_API_KEY ?? '') !== '';
const useFireEngine = (process.env.FIRE_ENGINE_BETA_URL ?? '') !== '';
const usePlaywright = (process.env.PLAYWRIGHT_MICROSERVICE_URL ?? '') !== '';

/**
 * Engines available in this process: the optional families enabled above,
 * followed by the always-available `fetch`, `pdf` and `docx` engines.
 */
export const engines: Engine[] = [];
if (useFireEngine) {
  engines.push("fire-engine;chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient");
}
if (useScrapingBee) {
  engines.push("scrapingbee", "scrapingbeeLoad");
}
if (usePlaywright) {
  engines.push("playwright");
}
engines.push("fetch", "pdf", "docx");
|
||||||
|
|
||||||
|
export const featureFlags = [
|
||||||
|
"actions",
|
||||||
|
"waitFor",
|
||||||
|
"screenshot",
|
||||||
|
"screenshot@fullScreen",
|
||||||
|
"pdf",
|
||||||
|
"docx",
|
||||||
|
"atsv",
|
||||||
|
"location",
|
||||||
|
"mobile",
|
||||||
|
"skipTlsVerification",
|
||||||
|
"useFastMode",
|
||||||
|
] as const;
|
||||||
|
|
||||||
|
export type FeatureFlag = typeof featureFlags[number];
|
||||||
|
|
||||||
|
/**
 * Per-flag settings. `priority` is the weight `buildFallbackList` adds up when
 * it scores how much of a request's feature set an engine supports.
 */
export const featureFlagOptions: Record<FeatureFlag, { priority: number }> = {
    actions: { priority: 20 },
    waitFor: { priority: 1 },
    screenshot: { priority: 10 },
    "screenshot@fullScreen": { priority: 10 },
    pdf: { priority: 100 },
    docx: { priority: 100 },
    atsv: { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
    useFastMode: { priority: 90 },
    location: { priority: 10 },
    mobile: { priority: 10 },
    skipTlsVerification: { priority: 10 },
} as const;
|
||||||
|
|
||||||
|
/** Shape every engine handler resolves with. */
export type EngineScrapeResult = {
    /** URL the result is attributed to. */
    url: string;

    /** Content returned by the engine. */
    html: string;
    /** Markdown rendition, for engines that produce one themselves. */
    markdown?: string;
    /** Status code reported for the page. */
    statusCode: number;
    /** Error text reported for the page, if any. */
    error?: string;

    /** Screenshot of the page, when the engine captured one. */
    screenshot?: string;
    /** Outputs collected while running actions (presumably only set by engines that support them; confirm). */
    actions?: {
        screenshots: string[];
        scrapes: ScrapeActionContent[];
    };
};
|
||||||
|
|
||||||
|
/** Signature shared by all engine entry points. */
type EngineHandler = (meta: Meta) => Promise<EngineScrapeResult>;

/**
 * Maps each engine identifier to the function that performs the scrape.
 * Both ScrapingBee variants come from the same factory and differ only in
 * the browser wait condition passed to it.
 */
const engineHandlers: Record<Engine, EngineHandler> = {
    "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
    "fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
    "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
    scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"),
    scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"),
    playwright: scrapeURLWithPlaywright,
    fetch: scrapeURLWithFetch,
    pdf: scrapePDF,
    docx: scrapeDOCX,
};
|
||||||
|
|
||||||
|
// Baseline feature map with every flag switched off. Every key is spelled out so that
// adding a new FeatureFlag without listing it here is a compile-time error.
const allFeaturesOff: { [F in FeatureFlag]: boolean } = {
    "actions": false,
    "waitFor": false,
    "screenshot": false,
    "screenshot@fullScreen": false,
    "pdf": false,
    "docx": false,
    "atsv": false,
    "location": false,
    "mobile": false,
    "skipTlsVerification": false,
    "useFastMode": false,
};

/** Builds a fresh feature map in which only the listed flags are `true`. */
function enableFeatures(...enabled: FeatureFlag[]): { [F in FeatureFlag]: boolean } {
    const features = { ...allFeaturesOff };
    for (const flag of enabled) {
        features[flag] = true;
    }
    return features;
}

export const engineOptions: {
    [E in Engine]: {
        // A list of feature flags the engine supports.
        features: { [F in FeatureFlag]: boolean },

        // This defines the order of engines in general. The engine with the highest quality will be used the most.
        // Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
        quality: number,
    }
} = {
    "fire-engine;chrome-cdp": {
        // waitFor, screenshot and screenshot@fullScreen are provided through the actions transform
        features: enableFeatures(
            "actions",
            "waitFor",
            "screenshot",
            "screenshot@fullScreen",
            "location",
            "mobile",
            "skipTlsVerification",
        ),
        quality: 50,
    },
    "fire-engine;playwright": {
        features: enableFeatures("waitFor", "screenshot", "screenshot@fullScreen"),
        quality: 40,
    },
    "scrapingbee": {
        features: enableFeatures("waitFor", "screenshot", "screenshot@fullScreen"),
        quality: 30,
    },
    "scrapingbeeLoad": {
        features: enableFeatures("waitFor", "screenshot", "screenshot@fullScreen"),
        quality: 29,
    },
    "playwright": {
        features: enableFeatures("waitFor"),
        quality: 20,
    },
    "fire-engine;tlsclient": {
        features: enableFeatures("atsv", "location", "useFastMode"),
        quality: 10,
    },
    "fetch": {
        features: enableFeatures("useFastMode"),
        quality: 5,
    },
    "pdf": {
        features: enableFeatures("pdf", "useFastMode"),
        quality: -10,
    },
    "docx": {
        features: enableFeatures("docx", "useFastMode"),
        quality: -10,
    },
};
|
||||||
|
|
||||||
|
/**
 * Ranks the engines that can serve a request.
 *
 * The priorities of all requested feature flags are summed; an engine qualifies
 * when the priorities of the requested flags it supports add up to at least half
 * of that sum (rounded down). If any qualifying engine has a positive quality,
 * engines with non-positive quality are dropped. The remainder is ordered by
 * support score, then by quality, both descending.
 *
 * @param meta Request context; reads `featureFlags`, `internalOptions.forceEngine`
 *             (which restricts the candidates to that single engine) and `logger`.
 * @returns The ordered candidates, each with the requested flags it cannot honour.
 */
export function buildFallbackList(meta: Meta): {
    engine: Engine,
    unsupportedFeatures: Set<FeatureFlag>,
}[] {
    const sumPriorities = (flags: Iterable<FeatureFlag>): number => {
        let total = 0;
        for (const flag of flags) {
            total += featureFlagOptions[flag].priority;
        }
        return total;
    };

    const prioritySum = sumPriorities(meta.featureFlags);
    const priorityThreshold = Math.floor(prioritySum / 2);

    const forced = meta.internalOptions.forceEngine;
    const candidates = forced !== undefined ? [forced] : engines;

    let selectedEngines: {
        engine: Engine,
        supportScore: number,
        unsupportedFeatures: Set<FeatureFlag>,
    }[] = [];

    for (const engine of candidates) {
        const { features } = engineOptions[engine];

        // Split the requested flags into those the engine supports and those it does not.
        const supported: FeatureFlag[] = [];
        const unsupportedFeatures = new Set<FeatureFlag>();
        for (const flag of meta.featureFlags) {
            if (features[flag] === true) {
                supported.push(flag);
            } else {
                unsupportedFeatures.add(flag);
            }
        }

        const supportScore = sumPriorities(supported);
        const logContext = { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures };

        if (supportScore < priorityThreshold) {
            meta.logger.debug(`Engine ${engine} does not meet feature priority threshold`, logContext);
            continue;
        }

        selectedEngines.push({ engine, supportScore, unsupportedFeatures });
        meta.logger.debug(`Engine ${engine} meets feature priority threshold`, logContext);
    }

    // Specialty engines (non-positive quality) are only kept when nothing else qualified.
    const isGeneralPurpose = (x: { engine: Engine }): boolean => engineOptions[x.engine].quality > 0;
    if (selectedEngines.some(isGeneralPurpose)) {
        selectedEngines = selectedEngines.filter(isGeneralPurpose);
    }

    selectedEngines.sort((a, b) => {
        const byScore = b.supportScore - a.supportScore;
        const byQuality = engineOptions[b.engine].quality - engineOptions[a.engine].quality;
        return byScore || byQuality;
    });

    return selectedEngines;
}
|
||||||
|
|
||||||
|
/**
 * Runs a single engine against the request described by `meta`.
 *
 * The handler receives a shallow copy of `meta` whose logger is a child logger
 * tagged with the handler's function name and the engine identifier.
 *
 * @param meta   Request context handed to the engine.
 * @param engine Identifier of the engine to run.
 * @returns Whatever the engine handler resolves with; handler errors propagate.
 */
export async function scrapeURLWithEngine(meta: Meta, engine: Engine): Promise<EngineScrapeResult> {
    const fn = engineHandlers[engine];

    // A function's `name` is never null/undefined -- anonymous functions (such as the
    // closures returned by the ScrapingBee factory) report "" -- so the fallback has to
    // be `||`; with `??` it could never apply.
    const method = fn.name || "scrapeURLWithEngine";
    const logger = meta.logger.child({ method, engine });

    return await fn({ ...meta, logger });
}
|
114
apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
Normal file
114
apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
Normal file
|
@ -0,0 +1,114 @@
|
||||||
|
import { createReadStream, promises as fs } from "node:fs";
|
||||||
|
import FormData from "form-data";
|
||||||
|
import { Meta } from "../..";
|
||||||
|
import { EngineScrapeResult } from "..";
|
||||||
|
import * as marked from "marked";
|
||||||
|
import { robustFetch } from "../../lib/fetch";
|
||||||
|
import { z } from "zod";
|
||||||
|
import * as Sentry from "@sentry/node";
|
||||||
|
import escapeHtml from "escape-html";
|
||||||
|
import PdfParse from "pdf-parse";
|
||||||
|
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||||
|
|
||||||
|
/** Output of a PDF processor: HTML is always present, markdown is optional. */
type PDFProcessorResult = {
    html: string;
    markdown?: string;
};
|
||||||
|
|
||||||
|
/**
 * Converts a downloaded PDF through the LlamaIndex cloud parsing API.
 *
 * Uploads the file as a multipart form, then requests the markdown result of the
 * returned job. The HTML is produced by rendering that markdown with `marked`.
 *
 * @param meta         Request context; only its logger is used here.
 * @param tempFilePath Path of the PDF on local disk.
 * @returns The markdown from the service and the HTML rendered from it.
 */
async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
    meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });

    const apiBase = "https://api.cloud.llamaindex.ai/api/parsing";
    const authHeaders = () => ({
        "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
    });

    const uploadForm = new FormData();
    uploadForm.append("file", createReadStream(tempFilePath), {
        filename: tempFilePath,
        contentType: "application/pdf", // NOTE: request.headers["Content-Type"]?
    });

    const { id: jobId } = await robustFetch({
        url: `${apiBase}/upload`,
        method: "POST",
        headers: authHeaders(),
        body: uploadForm,
        logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/upload/robustFetch" }),
        schema: z.object({
            id: z.string(),
        }),
    });

    // TODO: timeout, retries
    const { markdown } = await robustFetch({
        url: `${apiBase}/job/${jobId}/result/markdown`,
        method: "GET",
        headers: authHeaders(),
        logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
        schema: z.object({
            markdown: z.string(),
        }),
    });

    const html = await marked.parse(markdown, { async: true });

    return { markdown, html };
}
|
||||||
|
|
||||||
|
/**
 * Local PDF processor: extracts the text with `pdf-parse` and HTML-escapes it.
 * The same escaped text is returned as both markdown and HTML.
 *
 * @param meta         Request context; only its logger is used here.
 * @param tempFilePath Path of the PDF on local disk.
 */
async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
    meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });

    const pdfBytes = await fs.readFile(tempFilePath);
    const { text } = await PdfParse(pdfBytes);
    const escaped = escapeHtml(text);

    return { markdown: escaped, html: escaped };
}
|
||||||
|
|
||||||
|
/**
 * PDF engine entry point.
 *
 * When `meta.options.parsePDF` is falsy the file is fetched and returned
 * base64-encoded as both `html` and `markdown`. Otherwise the file is downloaded
 * to a temporary path and processed: LlamaParse first when `LLAMAPARSE_API_KEY`
 * is set, with the local parser as a fallback when LlamaParse is not configured
 * or fails.
 *
 * The temporary file is removed in a `finally` block so it is not left behind
 * when processing throws.
 *
 * @param meta Request context (`id`, `url`, `options.parsePDF`, `logger`).
 * @returns The response URL and status together with the produced content.
 */
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
    if (!meta.options.parsePDF) {
        const file = await fetchFileToBuffer(meta.url);
        const content = file.buffer.toString("base64");
        return {
            url: file.response.url,
            statusCode: file.response.status,

            html: content,
            markdown: content,
        };
    }

    const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

    try {
        let result: PDFProcessorResult | null = null;

        if (process.env.LLAMAPARSE_API_KEY) {
            try {
                result = await scrapePDFWithLlamaParse({
                    ...meta,
                    logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
                }, tempFilePath);
            } catch (error) {
                meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
                Sentry.captureException(error);
            }
        }

        if (result === null) {
            result = await scrapePDFWithParsePDF({
                ...meta,
                logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }),
            }, tempFilePath);
        }

        return {
            url: response.url,
            statusCode: response.status,

            html: result.html,
            markdown: result.markdown,
        };
    } finally {
        // Cleanup is best-effort: a failed delete must neither mask a processing
        // error nor fail a scrape that already produced its result.
        try {
            await fs.unlink(tempFilePath);
        } catch (error) {
            meta.logger.warn("Failed to delete temporary PDF file", { tempFilePath, error });
        }
    }
}
|
42
apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
Normal file
42
apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
import { z } from "zod";
|
||||||
|
import { EngineScrapeResult } from "..";
|
||||||
|
import { Meta } from "../..";
|
||||||
|
import { TimeoutError } from "../../error";
|
||||||
|
import { robustFetch } from "../../lib/fetch";
|
||||||
|
|
||||||
|
/**
 * Scrapes a page through the Playwright microservice configured in
 * `PLAYWRIGHT_MICROSERVICE_URL`.
 *
 * The request is raced against a local deadline of `20000 + waitFor`
 * milliseconds, the same budget that is sent to the service.
 *
 * @param meta Request context (`url`, `options.waitFor`, `options.headers`, `logger`).
 * @returns The page content, status code and page error reported by the service.
 * @throws TimeoutError when the service does not answer before the deadline.
 */
export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeResult> {
    const timeout = 20000 + meta.options.waitFor;

    let timer: ReturnType<typeof setTimeout> | undefined;
    const deadline = new Promise<never>((_resolve, reject) => {
        timer = setTimeout(() => {
            reject(new TimeoutError("Playwright was unable to scrape the page before timing out", { cause: { timeout } }));
        }, timeout);
    });

    try {
        // The fetch promise goes into the race un-awaited: awaiting it inside the
        // array would settle it before the race starts, so the deadline could never win.
        const response = await Promise.race([
            robustFetch({
                url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
                headers: {
                    "Content-Type": "application/json",
                },
                body: JSON.stringify({
                    url: meta.url,
                    wait_after_load: meta.options.waitFor,
                    timeout,
                    headers: meta.options.headers,
                }),
                method: "POST",
                logger: meta.logger.child({ method: "scrapeURLWithPlaywright/robustFetch" }),
                schema: z.object({
                    content: z.string(),
                    pageStatusCode: z.number(),
                    pageError: z.string().optional(),
                }),
            }),
            deadline,
        ]);

        return {
            url: meta.url, // TODO: improve redirect following
            html: response.content,
            statusCode: response.pageStatusCode,
            error: response.pageError,
        };
    } finally {
        // Do not leave the deadline timer pending once the race is settled.
        clearTimeout(timer);
    }
}
|
66
apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
Normal file
66
apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
import { ScrapingBeeClient } from "scrapingbee";
|
||||||
|
import { Meta } from "../..";
|
||||||
|
import { EngineScrapeResult } from "..";
|
||||||
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||||
|
import { AxiosError, type AxiosResponse } from "axios";
|
||||||
|
import { EngineError } from "../../error";
|
||||||
|
|
||||||
|
// Shared ScrapingBee client, created once at module load.
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);

/**
 * Creates a ScrapingBee engine handler.
 *
 * @param wait_browser Browser event ScrapingBee waits for before returning.
 * @returns A handler that scrapes `meta.url` and throws `EngineError` when the
 *          response carries errors, lacks both Date and Content-Type headers,
 *          or has a non-string body.
 */
export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "networkidle2"): ((meta: Meta) => Promise<EngineScrapeResult>) {
    return async (meta: Meta): Promise<EngineScrapeResult> => {
        const { formats, waitFor } = meta.options;

        let response: AxiosResponse<any>;
        try {
            response = await client.get({
                url: meta.url,
                params: {
                    timeout: 15000, // TODO: dynamic timeout based on request timeout
                    wait_browser,
                    wait: Math.min(waitFor, 35000),
                    transparent_status_code: true,
                    json_response: true,
                    screenshot: formats.includes("screenshot"),
                    screenshot_full_page: formats.includes("screenshot@fullPage"),
                },
                headers: {
                    "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
                },
            });
        } catch (error) {
            // An HTTP error that still carries a response is processed like a normal
            // response; anything else is rethrown untouched.
            if (!(error instanceof AxiosError) || error.response === undefined) {
                throw error;
            }
            response = error.response;
        }

        const data: Buffer = response.data;
        const body = JSON.parse(new TextDecoder().decode(data));

        const headers = body.headers ?? {};
        const identifyingHeader = headers["Date"] ?? headers["date"] ?? headers["Content-Type"] ?? headers["content-type"];
        const isHiddenEngineError = !identifyingHeader;

        if (body.errors || body.body?.error || isHiddenEngineError) {
            meta.logger.error("ScrapingBee threw an error", { body: body.body?.error ?? body.errors ?? body.body ?? body });
            throw new EngineError("Engine error #34", { cause: { body, statusCode: response.status } });
        }

        if (typeof body.body !== "string") {
            meta.logger.error("ScrapingBee: Body is not string??", { body });
            throw new EngineError("Engine error #35", { cause: { body, statusCode: response.status } });
        }

        specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithScrapingBee/specialtyScrapeCheck" }), body.headers);

        const result: EngineScrapeResult = {
            url: body["resolved-url"] ?? meta.url,

            html: body.body,
            error: response.status >= 300 ? response.statusText : undefined,
            statusCode: response.status,
        };

        if (body.screenshot) {
            result.screenshot = `data:image/png;base64,${body.screenshot}`;
        }

        return result;
    };
}
|
45
apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
Normal file
45
apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
import path from "path";
|
||||||
|
import os from "os";
|
||||||
|
import { createWriteStream, promises as fs } from "node:fs";
|
||||||
|
import { EngineError } from "../../error";
|
||||||
|
import { Writable } from "stream";
|
||||||
|
import { v4 as uuid } from "uuid";
|
||||||
|
|
||||||
|
/**
 * Fetches `url` and reads the whole body into memory.
 *
 * @param url Address of the file to fetch.
 * @returns The fetch response together with the body as a Buffer.
 */
export async function fetchFileToBuffer(url: string): Promise<{
    response: Response,
    buffer: Buffer
}> {
    const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
    const bytes = await response.arrayBuffer();

    return { response, buffer: Buffer.from(bytes) };
}
|
||||||
|
|
||||||
|
/**
 * Streams `url` into a uniquely named file in the OS temp directory.
 *
 * The write stream is opened only after the request has produced a body, so a
 * failed request leaves nothing behind. If the transfer fails, the partial file
 * is removed before the error is reported. On success the caller owns the file
 * and is responsible for deleting it.
 *
 * @param id  Identifier embedded in the temp file name.
 * @param url Address of the file to download.
 * @returns The fetch response and the path of the written file.
 * @throws EngineError when the response has no body or the transfer fails.
 */
export async function downloadFile(id: string, url: string): Promise<{
    response: Response
    tempFilePath: string
}> {
    const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);

    const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying

    // This should never happen in the current state of JS (2024), but let's check anyways.
    if (response.body === null) {
        throw new EngineError("Response body was null", { cause: { response } });
    }

    const tempFileWrite = createWriteStream(tempFilePath);

    // Settles once the file stream has flushed everything, or failed.
    const written = new Promise<void>((resolve, reject) => {
        tempFileWrite.on("finish", () => resolve());
        tempFileWrite.on("error", reject);
    });

    try {
        // Await the pipe as well, so a failure while reading the body surfaces here
        // rather than as an unhandled rejection.
        await Promise.all([
            response.body.pipeTo(Writable.toWeb(tempFileWrite)),
            written,
        ]);
    } catch (error) {
        tempFileWrite.destroy();
        // Best-effort removal of the partial file; the transfer error is the one worth reporting.
        await fs.rm(tempFilePath, { force: true }).catch(() => undefined);
        throw new EngineError("Failed to write to temp file", { cause: { error } });
    }

    return {
        response,
        tempFilePath,
    };
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
import { Logger } from "winston";
|
||||||
|
import { AddFeatureError } from "../../error";
|
||||||
|
|
||||||
|
/**
 * Inspects the response headers for content that needs a specialty engine.
 *
 * The Content-Type header is looked up case-insensitively. Its media type is
 * compared without parameters, surrounding whitespace or regard to letter case,
 * because media types are case-insensitive and may be followed by `; param=value`.
 *
 * @param logger  Logger used to warn when no Content-Type header is present.
 * @param headers Response headers, if the engine exposed any.
 * @throws AddFeatureError with `"pdf"` or `"docx"` when the response is such a document.
 */
export function specialtyScrapeCheck(logger: Logger, headers: Record<string, string> | undefined) {
    const contentType = (Object.entries(headers ?? {}).find(x => x[0].toLowerCase() === "content-type") ?? [])[1];

    if (contentType === undefined) {
        logger.warn("Failed to check contentType -- was not present in headers", { headers });
        return;
    }

    // Keep only the media type itself, e.g. "Application/PDF ; charset=x" -> "application/pdf".
    const [rawMediaType = ""] = contentType.split(";");
    const mediaType = rawMediaType.trim().toLowerCase();

    if (mediaType === "application/pdf") { // .pdf
        throw new AddFeatureError(["pdf"]);
    }

    if (mediaType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") { // .docx
        throw new AddFeatureError(["docx"]);
    }
}
|
34
apps/api/src/scraper/scrapeURL/error.ts
Normal file
34
apps/api/src/scraper/scrapeURL/error.ts
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
import { EngineResultsTracker } from "."
|
||||||
|
import { Engine, FeatureFlag } from "./engines"
|
||||||
|
|
||||||
|
/**
 * Error raised when an engine fails. It adds nothing to `Error`: the inherited
 * constructor already accepts the optional message and options (e.g. `cause`).
 */
export class EngineError extends Error {}
|
||||||
|
|
||||||
|
/**
 * Error raised when an operation runs out of time. It adds nothing to `Error`:
 * the inherited constructor already accepts the optional message and options.
 */
export class TimeoutError extends Error {}
|
||||||
|
|
||||||
|
/**
 * Error carrying the list of engines that were tried and the tracker holding
 * their results, raised with the message "All scraping engines failed!".
 */
export class NoEnginesLeftError extends Error {
    constructor(
        public fallbackList: Engine[],
        public results: EngineResultsTracker,
    ) {
        super("All scraping engines failed!");
    }
}
|
||||||
|
|
||||||
|
/**
 * Error signalling that additional feature flags were discovered. The flags are
 * exposed on `featureFlags` and listed, comma-separated, in the message.
 */
export class AddFeatureError extends Error {
    constructor(public featureFlags: FeatureFlag[]) {
        super(`New feature flags have been discovered: ${featureFlags.join(", ")}`);
    }
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user