mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Merge branch 'main' into pr/516
This commit is contained in:
commit
08a9cb8db4
42
.github/archive/publish-rust-sdk.yml
vendored
Normal file
42
.github/archive/publish-rust-sdk.yml
vendored
Normal file
|
@ -0,0 +1,42 @@
|
|||
name: Publish Rust SDK
|
||||
|
||||
on: []
|
||||
|
||||
env:
|
||||
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
|
||||
|
||||
jobs:
|
||||
build-and-publish:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Rust
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
profile: minimal
|
||||
|
||||
- name: Install dependencies
|
||||
run: cargo build --release
|
||||
|
||||
- name: Run version check script
|
||||
id: version_check_script
|
||||
run: |
|
||||
VERSION_INCREMENTED=$(cargo search --limit 1 my_crate_name | grep my_crate_name)
|
||||
echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV
|
||||
|
||||
- name: Build the package
|
||||
if: ${{ env.VERSION_INCREMENTED == 'true' }}
|
||||
run: cargo package
|
||||
working-directory: ./apps/rust-sdk
|
||||
|
||||
- name: Publish to crates.io
|
||||
if: ${{ env.VERSION_INCREMENTED == 'true' }}
|
||||
env:
|
||||
CARGO_REG_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
|
||||
run: cargo publish
|
||||
working-directory: ./apps/rust-sdk
|
61
.github/archive/rust-sdk.yml
vendored
Normal file
61
.github/archive/rust-sdk.yml
vendored
Normal file
|
@ -0,0 +1,61 @@
|
|||
name: Run Rust SDK E2E Tests
|
||||
|
||||
on: []
|
||||
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
|
||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||
HOST: ${{ secrets.HOST }}
|
||||
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
|
||||
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
|
||||
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
|
||||
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
|
||||
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
||||
PORT: ${{ secrets.PORT }}
|
||||
REDIS_URL: ${{ secrets.REDIS_URL }}
|
||||
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
|
||||
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
|
||||
HDX_NODE_BETA_MODE: 1
|
||||
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
redis:
|
||||
image: redis
|
||||
ports:
|
||||
- 6379:6379
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
- name: Install pnpm
|
||||
run: npm install -g pnpm
|
||||
- name: Install dependencies for API
|
||||
run: pnpm install
|
||||
working-directory: ./apps/api
|
||||
- name: Start the application
|
||||
run: npm start &
|
||||
working-directory: ./apps/api
|
||||
id: start_app
|
||||
- name: Start workers
|
||||
run: npm run workers &
|
||||
working-directory: ./apps/api
|
||||
id: start_workers
|
||||
- name: Set up Rust
|
||||
uses: actions/setup-rust@v1
|
||||
with:
|
||||
rust-version: stable
|
||||
- name: Try the lib build
|
||||
working-directory: ./apps/rust-sdk
|
||||
run: cargo build
|
||||
- name: Run E2E tests for Rust SDK
|
||||
run: cargo test --test e2e_with_auth
|
20
.github/scripts/check_version_has_incremented.py
vendored
20
.github/scripts/check_version_has_incremented.py
vendored
|
@ -15,6 +15,7 @@ false
|
|||
|
||||
"""
|
||||
import json
|
||||
import toml
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
@ -53,6 +54,19 @@ def get_npm_version(package_name: str) -> str:
|
|||
version = response.json()['version']
|
||||
return version.strip()
|
||||
|
||||
def get_rust_version(file_path: str) -> str:
|
||||
"""Extract version string from Cargo.toml."""
|
||||
cargo_toml = toml.load(file_path)
|
||||
if 'package' in cargo_toml and 'version' in cargo_toml['package']:
|
||||
return cargo_toml['package']['version'].strip()
|
||||
raise RuntimeError("Unable to find version string in Cargo.toml.")
|
||||
|
||||
def get_crates_version(package_name: str) -> str:
|
||||
"""Get latest version of Rust package from crates.io."""
|
||||
response = requests.get(f"https://crates.io/api/v1/crates/{package_name}")
|
||||
version = response.json()['crate']['newest_version']
|
||||
return version.strip()
|
||||
|
||||
def is_version_incremented(local_version: str, published_version: str) -> bool:
|
||||
"""Compare local and published versions."""
|
||||
local_version_parsed: Version = parse_version(local_version)
|
||||
|
@ -74,6 +88,12 @@ if __name__ == "__main__":
|
|||
current_version = get_js_version(os.path.join(package_path, 'package.json'))
|
||||
# Get published version from npm
|
||||
published_version = get_npm_version(package_name)
|
||||
if package_type == "rust":
|
||||
# Get current version from Cargo.toml
|
||||
current_version = get_rust_version(os.path.join(package_path, 'Cargo.toml'))
|
||||
# Get published version from crates.io
|
||||
published_version = get_crates_version(package_name)
|
||||
|
||||
else:
|
||||
raise ValueError("Invalid package type. Use 'python' or 'js'.")
|
||||
|
||||
|
|
3
.github/scripts/requirements.txt
vendored
3
.github/scripts/requirements.txt
vendored
|
@ -1,2 +1,3 @@
|
|||
requests
|
||||
packaging
|
||||
packaging
|
||||
toml
|
7
.github/workflows/fly-direct.yml
vendored
7
.github/workflows/fly-direct.yml
vendored
|
@ -1,7 +1,7 @@
|
|||
name: Fly Deploy Direct
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 */6 * * *'
|
||||
- cron: '0 */2 * * *'
|
||||
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
|
@ -22,16 +22,19 @@ env:
|
|||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
|
||||
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
name: Deploy app
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: superfly/flyctl-actions/setup-flyctl@master
|
||||
- run: flyctl deploy --remote-only -a firecrawl-scraper-js
|
||||
- run: flyctl deploy --remote-only -a firecrawl-scraper-js --build-secret SENTRY_AUTH_TOKEN=$SENTRY_AUTH_TOKEN
|
||||
working-directory: ./apps/api
|
||||
env:
|
||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
|
||||
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
|
||||
|
|
84
.github/workflows/fly.yml
vendored
84
.github/workflows/fly.yml
vendored
|
@ -26,6 +26,8 @@ env:
|
|||
PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
||||
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
|
||||
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
|
||||
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
|
||||
|
||||
jobs:
|
||||
pre-deploy-e2e-tests:
|
||||
|
@ -131,7 +133,7 @@ jobs:
|
|||
working-directory: ./apps/python-sdk
|
||||
- name: Run E2E tests for Python SDK
|
||||
run: |
|
||||
pytest firecrawl/__tests__/e2e_withAuth/test.py
|
||||
pytest firecrawl/__tests__/v1/e2e_withAuth/test.py
|
||||
working-directory: ./apps/python-sdk
|
||||
|
||||
js-sdk-tests:
|
||||
|
@ -168,7 +170,7 @@ jobs:
|
|||
- name: Run E2E tests for JavaScript SDK
|
||||
run: npm run test
|
||||
working-directory: ./apps/js-sdk/firecrawl
|
||||
|
||||
|
||||
go-sdk-tests:
|
||||
name: Go SDK Tests
|
||||
needs: pre-deploy-e2e-tests
|
||||
|
@ -204,18 +206,54 @@ jobs:
|
|||
run: go test -v ./... -timeout 180s
|
||||
working-directory: ./apps/go-sdk/firecrawl
|
||||
|
||||
rust-sdk-tests:
|
||||
name: Rust SDK Tests
|
||||
needs: pre-deploy-e2e-tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
redis:
|
||||
image: redis
|
||||
ports:
|
||||
- 6379:6379
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
- name: Install pnpm
|
||||
run: npm install -g pnpm
|
||||
- name: Install dependencies for API
|
||||
run: pnpm install
|
||||
working-directory: ./apps/api
|
||||
- name: Start the application
|
||||
run: npm start &
|
||||
working-directory: ./apps/api
|
||||
id: start_app
|
||||
- name: Start workers
|
||||
run: npm run workers &
|
||||
working-directory: ./apps/api
|
||||
id: start_workers
|
||||
- name: Set up Rust
|
||||
uses: actions/setup-rust@v1
|
||||
with:
|
||||
rust-version: stable
|
||||
- name: Try the lib build
|
||||
working-directory: ./apps/rust-sdk
|
||||
run: cargo build
|
||||
- name: Run E2E tests for Rust SDK
|
||||
run: cargo test --test e2e_with_auth
|
||||
|
||||
deploy:
|
||||
name: Deploy app
|
||||
runs-on: ubuntu-latest
|
||||
needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests]
|
||||
needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests, rust-sdk-tests]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: superfly/flyctl-actions/setup-flyctl@master
|
||||
- run: flyctl deploy --remote-only -a firecrawl-scraper-js
|
||||
- run: flyctl deploy --remote-only -a firecrawl-scraper-js --build-secret SENTRY_AUTH_TOKEN=$SENTRY_AUTH_TOKEN
|
||||
working-directory: ./apps/api
|
||||
env:
|
||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
|
||||
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
|
||||
|
||||
build-and-publish-python-sdk:
|
||||
name: Build and publish Python SDK
|
||||
|
@ -283,7 +321,7 @@ jobs:
|
|||
- name: Install dependencies for JavaScript SDK
|
||||
run: pnpm install
|
||||
working-directory: ./apps/js-sdk/firecrawl
|
||||
|
||||
|
||||
- name: Run version check script
|
||||
id: version_check_script
|
||||
run: |
|
||||
|
@ -297,4 +335,38 @@ jobs:
|
|||
run: |
|
||||
npm run build-and-publish
|
||||
working-directory: ./apps/js-sdk/firecrawl
|
||||
|
||||
build-and-publish-rust-sdk:
|
||||
name: Build and publish Rust SDK
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Rust
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
profile: minimal
|
||||
|
||||
- name: Install dependencies
|
||||
run: cargo build --release
|
||||
|
||||
- name: Run version check script
|
||||
id: version_check_script
|
||||
run: |
|
||||
VERSION_INCREMENTED=$(cargo search --limit 1 my_crate_name | grep my_crate_name)
|
||||
echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV
|
||||
|
||||
- name: Build the package
|
||||
if: ${{ env.VERSION_INCREMENTED == 'true' }}
|
||||
run: cargo package
|
||||
working-directory: ./apps/rust-sdk
|
||||
|
||||
- name: Publish to crates.io
|
||||
if: ${{ env.VERSION_INCREMENTED == 'true' }}
|
||||
env:
|
||||
CARGO_REG_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
|
||||
run: cargo publish
|
||||
working-directory: ./apps/rust-sdk
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -19,3 +19,5 @@ apps/test-suite/load-test-results/test-run-report.json
|
|||
apps/playwright-service-ts/node_modules/
|
||||
apps/playwright-service-ts/package-lock.json
|
||||
|
||||
*.pyc
|
||||
.rdb
|
||||
|
|
6
.gitmodules
vendored
Normal file
6
.gitmodules
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
[submodule "apps/go-sdk/firecrawl-go"]
|
||||
path = apps/go-sdk/firecrawl-go
|
||||
url = https://github.com/mendableai/firecrawl-go
|
||||
[submodule "apps/go-sdk/firecrawl-go-examples"]
|
||||
path = apps/go-sdk/firecrawl-go-examples
|
||||
url = https://github.com/mendableai/firecrawl-go-examples
|
|
@ -44,7 +44,6 @@ BULL_AUTH_KEY= @
|
|||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
|
|
365
README.md
365
README.md
|
@ -1,3 +1,38 @@
|
|||
<h3 align="center">
|
||||
<img
|
||||
src="https://raw.githubusercontent.com/mendableai/firecrawl/main/img/firecrawl_logo.png"
|
||||
height="200"
|
||||
>
|
||||
</h3>
|
||||
<div align="center">
|
||||
<a href="https://github.com/mendableai/firecrawl/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/github/license/mendableai/firecrawl" alt="License">
|
||||
</a>
|
||||
<a href="https://pepy.tech/project/firecrawl-py">
|
||||
<img src="https://static.pepy.tech/badge/firecrawl-py" alt="Downloads">
|
||||
</a>
|
||||
<a href="https://GitHub.com/mendableai/firecrawl/graphs/contributors">
|
||||
<img src="https://img.shields.io/github/contributors/mendableai/firecrawl.svg" alt="GitHub Contributors">
|
||||
</a>
|
||||
<a href="https://github.com/mendableai/firecrawl">
|
||||
<img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source">
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div>
|
||||
<p align="center">
|
||||
<a href="https://twitter.com/firecrawl_dev">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20X-000000?style=for-the-badge&logo=x&logoColor=white" alt="Follow on X" />
|
||||
</a>
|
||||
<a href="https://www.linkedin.com/company/104100957">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white" alt="Follow on LinkedIn" />
|
||||
</a>
|
||||
<a href="https://discord.com/invite/gSmWdAkdwd">
|
||||
<img src="https://img.shields.io/badge/Join%20our%20Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Join our Discord" />
|
||||
</a>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
# 🔥 Firecrawl
|
||||
|
||||
Crawl and convert any website into LLM-ready markdown or structured data. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) and the Firecrawl community. Includes powerful scraping, crawling and data extraction capabilities.
|
||||
|
@ -6,11 +41,13 @@ _This repository is in its early development stages. We are still merging custom
|
|||
|
||||
## What is Firecrawl?
|
||||
|
||||
[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required.
|
||||
[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. Check out our [documentation](https://docs.firecrawl.dev).
|
||||
|
||||
_Pst. hey, you, join our stargazers :)_
|
||||
|
||||
<img src="https://github.com/mendableai/firecrawl/assets/44934913/53c4483a-0f0e-40c6-bd84-153a07f94d29" width="200">
|
||||
<a href="https://github.com/mendableai/firecrawl">
|
||||
<img src="https://img.shields.io/github/stars/mendableai/firecrawl.svg?style=social&label=Star&maxAge=2592000" alt="GitHub stars">
|
||||
</a>
|
||||
|
||||
## How to use it?
|
||||
|
||||
|
@ -41,18 +78,26 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
|
|||
Used to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.firecrawl.dev/v0/crawl \
|
||||
curl -X POST https://api.firecrawl.dev/v1/crawl \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-H 'Authorization: Bearer fc-YOUR_API_KEY' \
|
||||
-d '{
|
||||
"url": "https://mendable.ai"
|
||||
"url": "https://docs.firecrawl.dev",
|
||||
"limit": 100,
|
||||
"scrapeOptions": {
|
||||
"formats": ["markdown", "html"]
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
Returns a jobId
|
||||
Returns a crawl job id and the url to check the status of the crawl.
|
||||
|
||||
```json
|
||||
{ "jobId": "1234-5678-9101" }
|
||||
{
|
||||
"success": true,
|
||||
"id": "123-456-789",
|
||||
"url": "https://api.firecrawl.dev/v1/crawl/123-456-789"
|
||||
}
|
||||
```
|
||||
|
||||
### Check Crawl Job
|
||||
|
@ -60,7 +105,7 @@ Returns a jobId
|
|||
Used to check the status of a crawl job and get its result.
|
||||
|
||||
```bash
|
||||
curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
|
||||
curl -X GET https://api.firecrawl.dev/v1/crawl/123-456-789 \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY'
|
||||
```
|
||||
|
@ -68,18 +113,20 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
|
|||
```json
|
||||
{
|
||||
"status": "completed",
|
||||
"current": 22,
|
||||
"total": 22,
|
||||
"total": 36,
|
||||
"creditsUsed": 36,
|
||||
"expiresAt": "2024-00-00T00:00:00.000Z",
|
||||
"data": [
|
||||
{
|
||||
"content": "Raw Content ",
|
||||
"markdown": "# Markdown Content",
|
||||
"provider": "web-scraper",
|
||||
"markdown": "[Firecrawl Docs home page![light logo](https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/logo/light.svg)!...",
|
||||
"html": "<!DOCTYPE html><html lang=\"en\" class=\"js-focus-visible lg:[--scroll-mt:9.5rem]\" data-js-focus-visible=\"\">...",
|
||||
"metadata": {
|
||||
"title": "Mendable | AI for CX and Sales",
|
||||
"description": "AI for CX and Sales",
|
||||
"language": null,
|
||||
"sourceURL": "https://www.mendable.ai/"
|
||||
"title": "Build a 'Chat with website' using Groq Llama 3 | Firecrawl",
|
||||
"language": "en",
|
||||
"sourceURL": "https://docs.firecrawl.dev/learn/rag-llama3",
|
||||
"description": "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot.",
|
||||
"ogLocaleAlternate": [],
|
||||
"statusCode": 200
|
||||
}
|
||||
}
|
||||
]
|
||||
|
@ -88,14 +135,15 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
|
|||
|
||||
### Scraping
|
||||
|
||||
Used to scrape a URL and get its content.
|
||||
Used to scrape a URL and get its content in the specified formats.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.firecrawl.dev/v0/scrape \
|
||||
curl -X POST https://api.firecrawl.dev/v1/scrape \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{
|
||||
"url": "https://mendable.ai"
|
||||
"url": "https://docs.firecrawl.dev",
|
||||
"formats" : ["markdown", "html"]
|
||||
}'
|
||||
```
|
||||
|
||||
|
@ -105,68 +153,95 @@ Response:
|
|||
{
|
||||
"success": true,
|
||||
"data": {
|
||||
"content": "Raw Content ",
|
||||
"markdown": "# Markdown Content",
|
||||
"provider": "web-scraper",
|
||||
"markdown": "Launch Week I is here! [See our Day 2 Release 🚀](https://www.firecrawl.dev/blog/launch-week-i-day-2-doubled-rate-limits)[💥 Get 2 months free...",
|
||||
"html": "<!DOCTYPE html><html lang=\"en\" class=\"light\" style=\"color-scheme: light;\"><body class=\"__variable_36bd41 __variable_d7dc5d font-inter ...",
|
||||
"metadata": {
|
||||
"title": "Mendable | AI for CX and Sales",
|
||||
"description": "AI for CX and Sales",
|
||||
"language": null,
|
||||
"sourceURL": "https://www.mendable.ai/"
|
||||
"title": "Home - Firecrawl",
|
||||
"description": "Firecrawl crawls and converts any website into clean markdown.",
|
||||
"language": "en",
|
||||
"keywords": "Firecrawl,Markdown,Data,Mendable,Langchain",
|
||||
"robots": "follow, index",
|
||||
"ogTitle": "Firecrawl",
|
||||
"ogDescription": "Turn any website into LLM-ready data.",
|
||||
"ogUrl": "https://www.firecrawl.dev/",
|
||||
"ogImage": "https://www.firecrawl.dev/og.png?123",
|
||||
"ogLocaleAlternate": [],
|
||||
"ogSiteName": "Firecrawl",
|
||||
"sourceURL": "https://firecrawl.dev",
|
||||
"statusCode": 200
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Search (Beta)
|
||||
### Map (Alpha)
|
||||
|
||||
Used to search the web, get the most relevant results, scrape each page and return the markdown.
|
||||
Used to map a URL and get urls of the website. This returns most links present on the website.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.firecrawl.dev/v0/search \
|
||||
```bash cURL
|
||||
curl -X POST https://api.firecrawl.dev/v1/map \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{
|
||||
"query": "firecrawl",
|
||||
"pageOptions": {
|
||||
"fetchPageContent": true // false for a fast serp api
|
||||
}
|
||||
"url": "https://firecrawl.dev"
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"data": [
|
||||
{
|
||||
"url": "https://mendable.ai",
|
||||
"markdown": "# Markdown Content",
|
||||
"provider": "web-scraper",
|
||||
"metadata": {
|
||||
"title": "Mendable | AI for CX and Sales",
|
||||
"description": "AI for CX and Sales",
|
||||
"language": null,
|
||||
"sourceURL": "https://www.mendable.ai/"
|
||||
}
|
||||
}
|
||||
"status": "success",
|
||||
"links": [
|
||||
"https://firecrawl.dev",
|
||||
"https://www.firecrawl.dev/pricing",
|
||||
"https://www.firecrawl.dev/blog",
|
||||
"https://www.firecrawl.dev/playground",
|
||||
"https://www.firecrawl.dev/smart-crawl",
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Intelligent Extraction (Beta)
|
||||
#### Map with search
|
||||
|
||||
Map with `search` param allows you to search for specific urls inside a website.
|
||||
|
||||
```bash cURL
|
||||
curl -X POST https://api.firecrawl.dev/v1/map \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{
|
||||
"url": "https://firecrawl.dev",
|
||||
"search": "docs"
|
||||
}'
|
||||
```
|
||||
|
||||
Response will be an ordered list from the most relevant to the least relevant.
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "success",
|
||||
"links": [
|
||||
"https://docs.firecrawl.dev",
|
||||
"https://docs.firecrawl.dev/sdks/python",
|
||||
"https://docs.firecrawl.dev/learn/rag-llama3",
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### LLM Extraction (Beta)
|
||||
|
||||
Used to extract structured data from scraped pages.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.firecrawl.dev/v0/scrape \
|
||||
curl -X POST https://api.firecrawl.dev/v1/scrape \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{
|
||||
"url": "https://www.mendable.ai/",
|
||||
"extractorOptions": {
|
||||
"mode": "llm-extraction",
|
||||
"extractionPrompt": "Based on the information on the page, extract the information from the schema. ",
|
||||
"extractionSchema": {
|
||||
"formats": ["extract"],
|
||||
"extract": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"company_mission": {
|
||||
|
@ -220,6 +295,59 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \
|
|||
}
|
||||
```
|
||||
|
||||
### Extracting without a schema (New)
|
||||
|
||||
You can now extract without a schema by just passing a `prompt` to the endpoint. The llm chooses the structure of the data.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.firecrawl.dev/v1/scrape \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{
|
||||
"url": "https://docs.firecrawl.dev/",
|
||||
"formats": ["extract"],
|
||||
"extract": {
|
||||
"prompt": "Extract the company mission from the page."
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
### Search (v0) (Beta)
|
||||
|
||||
Used to search the web, get the most relevant results, scrape each page and return the markdown.
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.firecrawl.dev/v0/search \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer YOUR_API_KEY' \
|
||||
-d '{
|
||||
"query": "firecrawl",
|
||||
"pageOptions": {
|
||||
"fetchPageContent": true // false for a fast serp api
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"data": [
|
||||
{
|
||||
"url": "https://mendable.ai",
|
||||
"markdown": "# Markdown Content",
|
||||
"provider": "web-scraper",
|
||||
"metadata": {
|
||||
"title": "Mendable | AI for CX and Sales",
|
||||
"description": "AI for CX and Sales",
|
||||
"language": null,
|
||||
"sourceURL": "https://www.mendable.ai/"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Using Python SDK
|
||||
|
||||
### Installing Python SDK
|
||||
|
@ -231,24 +359,28 @@ pip install firecrawl-py
|
|||
### Crawl a website
|
||||
|
||||
```python
|
||||
from firecrawl import FirecrawlApp
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
|
||||
app = FirecrawlApp(api_key="YOUR_API_KEY")
|
||||
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
|
||||
|
||||
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
|
||||
# Scrape a website:
|
||||
scrape_status = app.scrape_url(
|
||||
'https://firecrawl.dev',
|
||||
params={'formats': ['markdown', 'html']}
|
||||
)
|
||||
print(scrape_status)
|
||||
|
||||
# Get the markdown
|
||||
for result in crawl_result:
|
||||
print(result['markdown'])
|
||||
```
|
||||
|
||||
### Scraping a URL
|
||||
|
||||
To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
|
||||
|
||||
```python
|
||||
url = 'https://example.com'
|
||||
scraped_data = app.scrape_url(url)
|
||||
# Crawl a website:
|
||||
crawl_status = app.crawl_url(
|
||||
'https://firecrawl.dev',
|
||||
params={
|
||||
'limit': 100,
|
||||
'scrapeOptions': {'formats': ['markdown', 'html']}
|
||||
},
|
||||
wait_until_done=True,
|
||||
poll_interval=30
|
||||
)
|
||||
print(crawl_status)
|
||||
```
|
||||
|
||||
### Extracting structured data from a URL
|
||||
|
@ -256,6 +388,11 @@ scraped_data = app.scrape_url(url)
|
|||
With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how you to use it:
|
||||
|
||||
```python
|
||||
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
|
||||
app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")
|
||||
|
||||
class ArticleSchema(BaseModel):
|
||||
title: str
|
||||
points: int
|
||||
|
@ -277,15 +414,6 @@ data = app.scrape_url('https://news.ycombinator.com', {
|
|||
print(data["llm_extraction"])
|
||||
```
|
||||
|
||||
### Search for a query
|
||||
|
||||
Performs a web search, retrieve the top results, extract data from each page, and returns their markdown.
|
||||
|
||||
```python
|
||||
query = 'What is Mendable?'
|
||||
search_result = app.search(query)
|
||||
```
|
||||
|
||||
## Using the Node SDK
|
||||
|
||||
### Installation
|
||||
|
@ -301,54 +429,33 @@ npm install @mendable/firecrawl-js
|
|||
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
|
||||
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
|
||||
|
||||
### Scraping a URL
|
||||
|
||||
To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
|
||||
|
||||
```js
|
||||
try {
|
||||
const url = "https://example.com";
|
||||
const scrapedData = await app.scrapeUrl(url);
|
||||
console.log(scrapedData);
|
||||
} catch (error) {
|
||||
console.error("Error occurred while scraping:", error.message);
|
||||
import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
|
||||
|
||||
// Scrape a website
|
||||
const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
|
||||
formats: ['markdown', 'html'],
|
||||
});
|
||||
|
||||
if (scrapeResponse) {
|
||||
console.log(scrapeResponse)
|
||||
}
|
||||
|
||||
// Crawl a website
|
||||
const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
|
||||
limit: 100,
|
||||
scrapeOptions: {
|
||||
formats: ['markdown', 'html'],
|
||||
}
|
||||
} as CrawlParams, true, 30) as CrawlStatusResponse;
|
||||
|
||||
if (crawlResponse) {
|
||||
console.log(crawlResponse)
|
||||
}
|
||||
```
|
||||
|
||||
### Crawling a Website
|
||||
|
||||
To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
|
||||
|
||||
```js
|
||||
const crawlUrl = "https://example.com";
|
||||
const params = {
|
||||
crawlerOptions: {
|
||||
excludes: ["blog/"],
|
||||
includes: [], // leave empty for all pages
|
||||
limit: 1000,
|
||||
},
|
||||
pageOptions: {
|
||||
onlyMainContent: true,
|
||||
},
|
||||
};
|
||||
const waitUntilDone = true;
|
||||
const timeout = 5;
|
||||
const crawlResult = await app.crawlUrl(
|
||||
crawlUrl,
|
||||
params,
|
||||
waitUntilDone,
|
||||
timeout
|
||||
);
|
||||
```
|
||||
|
||||
### Checking Crawl Status
|
||||
|
||||
To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
|
||||
|
||||
```js
|
||||
const status = await app.checkCrawlStatus(jobId);
|
||||
console.log(status);
|
||||
```
|
||||
|
||||
### Extracting structured data from a URL
|
||||
|
||||
|
@ -360,6 +467,7 @@ import { z } from "zod";
|
|||
|
||||
const app = new FirecrawlApp({
|
||||
apiKey: "fc-YOUR_API_KEY",
|
||||
version: "v0"
|
||||
});
|
||||
|
||||
// Define schema to extract contents into
|
||||
|
@ -384,19 +492,6 @@ const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
|||
console.log(scrapeResult.data["llm_extraction"]);
|
||||
```
|
||||
|
||||
### Search for a query
|
||||
|
||||
With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.
|
||||
|
||||
```js
|
||||
const query = "what is mendable?";
|
||||
const searchResults = await app.search(query, {
|
||||
pageOptions: {
|
||||
fetchPageContent: true, // Fetch the page content for each search result
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.
|
||||
|
|
|
@ -65,7 +65,6 @@ BULL_AUTH_KEY= @
|
|||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
|
@ -160,7 +159,7 @@ Errors related to connecting to Redis, such as timeouts or "Connection refused".
|
|||
|
||||
**Solution:**
|
||||
- Ensure that the Redis service is up and running in your Docker environment.
|
||||
- Verify that the REDIS_URL and REDIS_RATE_LIMIT_URL in your .env file point to the correct Redis instance.
|
||||
- Verify that the REDIS_URL and REDIS_RATE_LIMIT_URL in your .env file point to the correct Redis instance, ensure that it points to the same URL in the `docker-compose.yaml` file (`redis://redis:6379`)
|
||||
- Check network settings and firewall rules that may block the connection to the Redis port.
|
||||
|
||||
### API endpoint does not respond
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
NUM_WORKERS_PER_QUEUE=8
|
||||
PORT=3002
|
||||
HOST=0.0.0.0
|
||||
REDIS_URL=redis://localhost:6379
|
||||
REDIS_RATE_LIMIT_URL=redis://localhost:6379
|
||||
REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
|
||||
REDIS_RATE_LIMIT_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
|
||||
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
|
||||
|
||||
## To turn on DB authentication, you need to set up supabase.
|
||||
|
@ -17,18 +17,27 @@ SUPABASE_URL=
|
|||
SUPABASE_SERVICE_TOKEN=
|
||||
|
||||
# Other Optionals
|
||||
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
|
||||
RATE_LIMIT_TEST_API_KEY_SCRAPE= # set if you'd like to test the scraping rate limit
|
||||
RATE_LIMIT_TEST_API_KEY_CRAWL= # set if you'd like to test the crawling rate limit
|
||||
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
|
||||
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
|
||||
BULL_AUTH_KEY= @
|
||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
# use if you've set up authentication and want to test with a real API key
|
||||
TEST_API_KEY=
|
||||
# set if you'd like to test the scraping rate limit
|
||||
RATE_LIMIT_TEST_API_KEY_SCRAPE=
|
||||
# set if you'd like to test the crawling rate limit
|
||||
RATE_LIMIT_TEST_API_KEY_CRAWL=
|
||||
# set if you'd like to use scraping Be to handle JS blocking
|
||||
SCRAPING_BEE_API_KEY=
|
||||
# add for LLM dependednt features (image alt generation, etc.)
|
||||
OPENAI_API_KEY=
|
||||
BULL_AUTH_KEY=@
|
||||
# use if you're configuring basic logging with logtail
|
||||
LOGTAIL_KEY=
|
||||
# set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
LLAMAPARSE_API_KEY=
|
||||
# set if you'd like to send slack server health status messages
|
||||
SLACK_WEBHOOK_URL=
|
||||
# set if you'd like to send posthog events like job logs
|
||||
POSTHOG_API_KEY=
|
||||
# set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST=
|
||||
|
||||
STRIPE_PRICE_ID_STANDARD=
|
||||
STRIPE_PRICE_ID_SCALE=
|
||||
|
@ -43,7 +52,8 @@ STRIPE_PRICE_ID_GROWTH_YEARLY=
|
|||
HYPERDX_API_KEY=
|
||||
HDX_NODE_BETA_MODE=1
|
||||
|
||||
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
|
||||
# set if you'd like to use the fire engine closed beta
|
||||
FIRE_ENGINE_BETA_URL=
|
||||
|
||||
# Proxy Settings for Playwright (Alternative you can can use a proxy service like oxylabs, which rotates IPs for you on every request)
|
||||
PROXY_SERVER=
|
||||
|
|
3
apps/api/.gitignore
vendored
3
apps/api/.gitignore
vendored
|
@ -6,3 +6,6 @@ dump.rdb
|
|||
/mongo-data
|
||||
|
||||
/.next/
|
||||
|
||||
.rdb
|
||||
.sentryclirc
|
||||
|
|
|
@ -12,8 +12,10 @@ RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --prod --frozen-l
|
|||
FROM base AS build
|
||||
RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --frozen-lockfile
|
||||
|
||||
RUN apt-get update -qq && apt-get install -y ca-certificates && update-ca-certificates
|
||||
RUN pnpm install
|
||||
RUN pnpm run build
|
||||
RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \
|
||||
bash -c 'export SENTRY_AUTH_TOKEN="$(cat /run/secrets/SENTRY_AUTH_TOKEN)"; if [ -z $SENTRY_AUTH_TOKEN ]; then pnpm run build:nosentry; else pnpm run build; fi'
|
||||
|
||||
# Install packages needed for deployment
|
||||
|
||||
|
|
|
@ -24,8 +24,8 @@ kill_timeout = '30s'
|
|||
|
||||
[http_service.concurrency]
|
||||
type = "requests"
|
||||
hard_limit = 100
|
||||
soft_limit = 50
|
||||
# hard_limit = 100
|
||||
soft_limit = 100
|
||||
|
||||
[[http_service.checks]]
|
||||
grace_period = "10s"
|
||||
|
@ -51,12 +51,13 @@ kill_timeout = '30s'
|
|||
|
||||
[services.concurrency]
|
||||
type = 'connections'
|
||||
hard_limit = 25
|
||||
soft_limit = 20
|
||||
# hard_limit = 25
|
||||
soft_limit = 100
|
||||
|
||||
[[vm]]
|
||||
size = 'performance-1x'
|
||||
size = 'performance-2x'
|
||||
processes = ['app','worker']
|
||||
memory = 8192
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -24,8 +24,8 @@ kill_timeout = '30s'
|
|||
|
||||
[http_service.concurrency]
|
||||
type = "requests"
|
||||
hard_limit = 200
|
||||
soft_limit = 75
|
||||
# hard_limit = 200
|
||||
soft_limit = 200
|
||||
|
||||
[[http_service.checks]]
|
||||
grace_period = "20s"
|
||||
|
@ -50,8 +50,8 @@ kill_timeout = '30s'
|
|||
|
||||
[services.concurrency]
|
||||
type = 'connections'
|
||||
hard_limit = 30
|
||||
soft_limit = 12
|
||||
# hard_limit = 30
|
||||
soft_limit = 200
|
||||
|
||||
[[vm]]
|
||||
size = 'performance-4x'
|
||||
|
|
924
apps/api/openapi-v0.json
Normal file
924
apps/api/openapi-v0.json
Normal file
|
@ -0,0 +1,924 @@
|
|||
{
|
||||
"openapi": "3.0.0",
|
||||
"info": {
|
||||
"title": "Firecrawl API",
|
||||
"version": "0.0.0",
|
||||
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
|
||||
"contact": {
|
||||
"name": "Firecrawl Support",
|
||||
"url": "https://firecrawl.dev/support",
|
||||
"email": "support@firecrawl.dev"
|
||||
}
|
||||
},
|
||||
"servers": [
|
||||
{
|
||||
"url": "https://api.firecrawl.dev/v0"
|
||||
}
|
||||
],
|
||||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"extractorOptions": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ScrapeResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl": {
|
||||
"post": {
|
||||
"summary": "Crawl multiple URLs based on options",
|
||||
"operationId": "crawlUrls",
|
||||
"tags": ["Crawling"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The base URL to start crawling from"
|
||||
},
|
||||
"crawlerOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"includes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to include"
|
||||
},
|
||||
"excludes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to exclude"
|
||||
},
|
||||
"generateImgAltText": {
|
||||
"type": "boolean",
|
||||
"description": "Generate alt text for images using LLMs (must have a paid plan)",
|
||||
"default": false
|
||||
},
|
||||
"returnOnlyUrls": {
|
||||
"type": "boolean",
|
||||
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
|
||||
"default": false
|
||||
},
|
||||
"maxDepth": {
|
||||
"type": "integer",
|
||||
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["default", "fast"],
|
||||
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
|
||||
"default": "default"
|
||||
},
|
||||
"ignoreSitemap": {
|
||||
"type": "boolean",
|
||||
"description": "Ignore the website sitemap when crawling",
|
||||
"default": false
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of pages to crawl",
|
||||
"default": 10000
|
||||
},
|
||||
"allowBackwardCrawling": {
|
||||
"type": "boolean",
|
||||
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
|
||||
"default": false
|
||||
},
|
||||
"allowExternalContentLinks": {
|
||||
"type": "boolean",
|
||||
"description": "Allows the crawler to follow links to external websites.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/CrawlResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search": {
|
||||
"post": {
|
||||
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
|
||||
"operationId": "searchGoogle",
|
||||
"tags": ["Search"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The query to search for"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"fetchPageContent": {
|
||||
"type": "boolean",
|
||||
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
|
||||
"default": true
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"searchOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of results. Max is 20 during beta."
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/SearchResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/status/{jobId}": {
|
||||
"get": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Get the status of a crawl job",
|
||||
"operationId": "getCrawlStatus",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Status of the job (completed, active, failed, paused)"
|
||||
},
|
||||
"current": {
|
||||
"type": "integer",
|
||||
"description": "Current page number"
|
||||
},
|
||||
"total": {
|
||||
"type": "integer",
|
||||
"description": "Total number of pages"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Data returned from the job (null when it is in progress)"
|
||||
},
|
||||
"partial_data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/cancel/{jobId}": {
|
||||
"delete": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Cancel a crawl job",
|
||||
"operationId": "cancelCrawlJob",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Returns cancelled."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"components": {
|
||||
"securitySchemes": {
|
||||
"bearerAuth": {
|
||||
"type": "http",
|
||||
"scheme": "bearer"
|
||||
}
|
||||
},
|
||||
"schemas": {
|
||||
"ScrapeResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlStatusResponseObj": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"SearchResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
]
|
||||
}
|
|
@ -18,8 +18,8 @@
|
|||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"summary": "Scrape a single URL",
|
||||
"operationId": "scrape",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
|
@ -38,94 +38,47 @@
|
|||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
"formats": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
|
||||
},
|
||||
"description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
|
||||
"default": ["markdown"]
|
||||
},
|
||||
"extractorOptions": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"excludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": true
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
|
@ -741,24 +694,42 @@
|
|||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Warning message to let you know of any issues."
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
|
@ -780,27 +751,16 @@
|
|||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -810,24 +770,33 @@
|
|||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
|
@ -849,11 +818,11 @@
|
|||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
|
@ -871,34 +840,63 @@
|
|||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
"markdown": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -909,8 +907,15 @@
|
|||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,7 +9,8 @@
|
|||
"format": "prettier --write \"src/**/*.(js|ts)\"",
|
||||
"flyio": "node dist/src/index.js",
|
||||
"start:dev": "nodemon --exec ts-node src/index.ts",
|
||||
"build": "tsc",
|
||||
"build": "tsc && pnpm sentry:sourcemaps",
|
||||
"build:nosentry": "tsc",
|
||||
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
|
||||
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
|
||||
"test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
|
||||
|
@ -19,8 +20,9 @@
|
|||
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
|
||||
"mongo-docker-console": "docker exec -it mongodb mongosh",
|
||||
"run-example": "npx ts-node src/example.ts",
|
||||
"deploy:fly": "flyctl deploy",
|
||||
"deploy:fly:staging": "fly deploy -c fly.staging.toml"
|
||||
"deploy:fly": "flyctl deploy --build-secret SENTRY_AUTH_TOKEN=$(dotenv -p SENTRY_AUTH_TOKEN)",
|
||||
"deploy:fly:staging": "fly deploy -c fly.staging.toml",
|
||||
"sentry:sourcemaps": "sentry-cli sourcemaps inject --org caleb-peffer --project firecrawl-scraper-js ./dist && sentry-cli sourcemaps upload --org caleb-peffer --project firecrawl-scraper-js ./dist"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
|
@ -29,7 +31,6 @@
|
|||
"@jest/globals": "^29.7.0",
|
||||
"@tsconfig/recommended": "^1.0.3",
|
||||
"@types/body-parser": "^1.19.2",
|
||||
"@types/bull": "^4.10.0",
|
||||
"@types/cors": "^2.8.13",
|
||||
"@types/express": "^4.17.17",
|
||||
"@types/jest": "^29.5.12",
|
||||
|
@ -53,17 +54,21 @@
|
|||
"@bull-board/express": "^5.20.5",
|
||||
"@devil7softwares/pos": "^1.0.2",
|
||||
"@dqbd/tiktoken": "^1.0.13",
|
||||
"@hyperdx/node-opentelemetry": "^0.8.0",
|
||||
"@hyperdx/node-opentelemetry": "^0.8.1",
|
||||
"@logtail/node": "^0.4.12",
|
||||
"@nangohq/node": "^0.40.8",
|
||||
"@sentry/node": "^8.13.0",
|
||||
"@sentry/cli": "^2.33.1",
|
||||
"@sentry/node": "^8.26.0",
|
||||
"@sentry/profiling-node": "^8.26.0",
|
||||
"@supabase/supabase-js": "^2.44.2",
|
||||
"@types/express-ws": "^3.0.4",
|
||||
"@types/ws": "^8.5.12",
|
||||
"ajv": "^8.16.0",
|
||||
"async": "^3.2.5",
|
||||
"async-mutex": "^0.5.0",
|
||||
"axios": "^1.3.4",
|
||||
"bottleneck": "^2.19.5",
|
||||
"bull": "^4.15.0",
|
||||
"bullmq": "^5.11.0",
|
||||
"cacheable-lookup": "^6.1.0",
|
||||
"cheerio": "^1.0.0-rc.12",
|
||||
"cohere": "^1.1.1",
|
||||
|
@ -71,7 +76,9 @@
|
|||
"cron-parser": "^4.9.0",
|
||||
"date-fns": "^3.6.0",
|
||||
"dotenv": "^16.3.1",
|
||||
"dotenv-cli": "^7.4.2",
|
||||
"express-rate-limit": "^7.3.1",
|
||||
"express-ws": "^5.0.2",
|
||||
"form-data": "^4.0.0",
|
||||
"glob": "^10.4.2",
|
||||
"gpt3-tokenizer": "^1.1.5",
|
||||
|
@ -87,7 +94,7 @@
|
|||
"moment": "^2.29.4",
|
||||
"mongoose": "^8.4.4",
|
||||
"natural": "^7.0.7",
|
||||
"openai": "^4.52.2",
|
||||
"openai": "^4.57.0",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"pos": "^0.4.2",
|
||||
"posthog-node": "^4.0.1",
|
||||
|
@ -99,14 +106,16 @@
|
|||
"robots-parser": "^3.0.1",
|
||||
"scrapingbee": "^1.7.4",
|
||||
"stripe": "^16.1.0",
|
||||
"systeminformation": "^5.22.11",
|
||||
"turndown": "^7.1.3",
|
||||
"turndown-plugin-gfm": "^1.0.2",
|
||||
"typesense": "^1.5.4",
|
||||
"unstructured-client": "^0.11.3",
|
||||
"uuid": "^10.0.0",
|
||||
"wordpos": "^2.1.0",
|
||||
"ws": "^8.18.0",
|
||||
"xml2js": "^0.6.2",
|
||||
"zod": "^3.23.4",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.1"
|
||||
},
|
||||
"nodemonConfig": {
|
||||
|
@ -116,4 +125,4 @@
|
|||
"temp"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,12 +1,16 @@
|
|||
### Crawl Website
|
||||
POST http://localhost:3002/v0/scrape HTTP/1.1
|
||||
Authorization: Bearer fc
|
||||
Authorization: Bearer fc-
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"url":"firecrawl.dev"
|
||||
"url":"corterix.com"
|
||||
}
|
||||
|
||||
### Check Job Status
|
||||
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
|
||||
Authorization: Bearer fc-
|
||||
|
||||
|
||||
### Check Job Status
|
||||
GET http://localhost:3002/v0/jobs/active HTTP/1.1
|
||||
|
|
|
@ -404,7 +404,7 @@ describe("E2E Tests for API Routes", () => {
|
|||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.set("x-idempotency-key", uniqueIdempotencyKey)
|
||||
.send({ url: 'https://mendable.ai' });
|
||||
.send({ url: 'https://docs.firecrawl.dev' });
|
||||
|
||||
expect(firstResponse.statusCode).toBe(200);
|
||||
|
||||
|
@ -414,7 +414,7 @@ describe("E2E Tests for API Routes", () => {
|
|||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.set("x-idempotency-key", uniqueIdempotencyKey)
|
||||
.send({ url: 'https://mendable.ai' });
|
||||
.send({ url: 'https://docs.firecrawl.dev' });
|
||||
|
||||
expect(secondResponse.statusCode).toBe(409);
|
||||
expect(secondResponse.body.error).toBe('Idempotency key already used');
|
||||
|
|
951
apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
Normal file
951
apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
Normal file
|
@ -0,0 +1,951 @@
|
|||
import request from "supertest";
|
||||
import dotenv from "dotenv";
|
||||
import {
|
||||
ScrapeRequest,
|
||||
ScrapeResponseRequestTest,
|
||||
} from "../../controllers/v1/types";
|
||||
|
||||
dotenv.config();
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe("E2E Tests for v1 API Routes", () => {
|
||||
beforeAll(() => {
|
||||
process.env.USE_DB_AUTHENTICATION = "true";
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
delete process.env.USE_DB_AUTHENTICATION;
|
||||
});
|
||||
|
||||
describe("GET /is-production", () => {
|
||||
it.concurrent("should return the production status", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
|
||||
"/is-production"
|
||||
);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("isProduction");
|
||||
});
|
||||
});
|
||||
|
||||
describe("POST /v1/scrape", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
|
||||
"/v1/scrape"
|
||||
);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://facebook.com/fake-test",
|
||||
};
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(403);
|
||||
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).not.toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data.markdown).toContain("_Roast_");
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.description).toBe(
|
||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
);
|
||||
expect(response.body.data.metadata.keywords).toBe(
|
||||
"Roast My Website,Roast,Website,GitHub,Firecrawl"
|
||||
);
|
||||
expect(response.body.data.metadata.robots).toBe("follow, index");
|
||||
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.ogDescription).toBe(
|
||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
);
|
||||
expect(response.body.data.metadata.ogUrl).toBe(
|
||||
"https://www.roastmywebsite.ai"
|
||||
);
|
||||
expect(response.body.data.metadata.ogImage).toBe(
|
||||
"https://www.roastmywebsite.ai/og.png"
|
||||
);
|
||||
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.sourceURL).toBe(
|
||||
"https://roastmywebsite.ai"
|
||||
);
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and includeHtml set to true",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["markdown", "html"],
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("html");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.markdown).toContain("_Roast_");
|
||||
expect(response.body.data.html).toContain("<h1");
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
},
|
||||
30000
|
||||
);
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
||||
// formats: ["markdown", "html"],
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send(scrapeRequest);
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://arxiv.org/pdf/astro-ph/9301001"
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send(scrapeRequest);
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
}, 60000);
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://www.scrapethissite.com/",
|
||||
onlyMainContent: false // default is true
|
||||
};
|
||||
const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
expect(responseWithoutRemoveTags.statusCode).toBe(200);
|
||||
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
|
||||
|
||||
if (!("data" in responseWithoutRemoveTags.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
|
||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
|
||||
|
||||
const scrapeRequestWithRemoveTags: ScrapeRequest = {
|
||||
url: "https://www.scrapethissite.com/",
|
||||
excludeTags: ['.nav', '#footer', 'strong'],
|
||||
onlyMainContent: false // default is true
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequestWithRemoveTags);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
|
||||
expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
|
||||
}, 30000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/400' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(400);
|
||||
}, 60000);
|
||||
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/401' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(401);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 403 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/403' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(403);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/404' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(404);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 405 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/405' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(405);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 500 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/500' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(500);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev", timeout: 1000 });
|
||||
|
||||
expect(response.statusCode).toBe(408);
|
||||
}, 3000);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and includeHtml set to true",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["html","rawHtml"],
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).not.toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("html");
|
||||
expect(response.body.data).toHaveProperty("rawHtml");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.html).toContain("<h1");
|
||||
expect(response.body.data.rawHtml).toContain("<html");
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
},
|
||||
30000
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with waitFor",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://ycombinator.com/companies",
|
||||
formats: ["markdown"],
|
||||
waitFor: 5000
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data).not.toHaveProperty("links");
|
||||
expect(response.body.data).not.toHaveProperty("rawHtml");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.markdown).toContain("PagerDuty");
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
|
||||
},
|
||||
30000
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid links on page",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["links"],
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data).not.toHaveProperty("rawHtml");
|
||||
expect(response.body.data).toHaveProperty("links");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.links).toContain("https://firecrawl.dev");
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
},
|
||||
30000
|
||||
);
|
||||
|
||||
|
||||
});
|
||||
|
||||
describe("POST /v1/map", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
|
||||
"/v1/map"
|
||||
);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://roastmywebsite.ai"
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://usemotion.com",
|
||||
search: "pricing"
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("usemotion.com/pricing");
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: true
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("docs.firecrawl.dev");
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://www.firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: true
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("docs.firecrawl.dev");
|
||||
}, 10000)
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://www.firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: false
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).not.toContain("docs.firecrawl.dev");
|
||||
})
|
||||
|
||||
it.concurrent("should return an error for invalid URL", async () => {
|
||||
const mapRequest = {
|
||||
url: "invalid-url",
|
||||
includeSubdomains: true,
|
||||
search: "test",
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(400);
|
||||
expect(response.body).toHaveProperty("success", false);
|
||||
expect(response.body).toHaveProperty("error");
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
describe("POST /v1/crawl", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
|
||||
"/v1/crawl"
|
||||
);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://facebook.com/fake-test",
|
||||
};
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(403);
|
||||
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return a successful response", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("id");
|
||||
expect(response.body.id).toMatch(
|
||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
||||
);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("url");
|
||||
expect(response.body.url).toContain("/v1/crawl/");
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and valid includes option",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://firecrawl.dev",
|
||||
limit: 10,
|
||||
includePaths: ["blog/*"],
|
||||
});
|
||||
|
||||
let response;
|
||||
let isFinished = false;
|
||||
|
||||
while (!isFinished) {
|
||||
response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
isFinished = response.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(5);
|
||||
urls.forEach((url: string) => {
|
||||
expect(url).toContain("firecrawl.dev/blog");
|
||||
});
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||
},
|
||||
180000
|
||||
); // 180 seconds
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and valid excludes option",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://firecrawl.dev",
|
||||
limit: 10,
|
||||
excludePaths: ["blog/*"],
|
||||
});
|
||||
|
||||
let isFinished = false;
|
||||
let response;
|
||||
|
||||
while (!isFinished) {
|
||||
response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
isFinished = response.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(
|
||||
TEST_URL
|
||||
)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(3);
|
||||
urls.forEach((url: string) => {
|
||||
expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy();
|
||||
});
|
||||
},
|
||||
90000
|
||||
); // 90 seconds
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with max depth option for a valid crawl job",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://www.scrapethissite.com",
|
||||
maxDepth: 1,
|
||||
});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status);
|
||||
// wait for 60 seconds
|
||||
let isCompleted = false;
|
||||
while (!isCompleted) {
|
||||
const statusCheckResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(statusCheckResponse.statusCode).toBe(200);
|
||||
isCompleted = statusCheckResponse.body.status === "completed";
|
||||
if (!isCompleted) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
const completedResponse = await request(
|
||||
TEST_URL
|
||||
)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThanOrEqual(1);
|
||||
|
||||
// Check if all URLs have a maximum depth of 1
|
||||
urls.forEach((url: string) => {
|
||||
const pathSplits = new URL(url).pathname.split("/");
|
||||
const depth =
|
||||
pathSplits.length -
|
||||
(pathSplits[0].length === 0 &&
|
||||
pathSplits[pathSplits.length - 1].length === 0
|
||||
? 1
|
||||
: 0);
|
||||
expect(depth).toBeLessThanOrEqual(2);
|
||||
});
|
||||
},
|
||||
180000
|
||||
);
|
||||
})
|
||||
|
||||
describe("GET /v1/crawl/:jobId", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response = await request(TEST_URL).get("/v1/crawl/123");
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.get("/v1/crawl/123")
|
||||
.set("Authorization", `Bearer invalid-api-key`);
|
||||
expect(response.statusCode).toBe(401);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return Job not found for invalid job ID",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.get("/v1/crawl/invalidJobId")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(404);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful crawl status response for a valid crawl job",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://docs.mendable.ai" });
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
|
||||
while (!isCompleted) {
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
|
||||
if (response.body.status === "completed") {
|
||||
isCompleted = true;
|
||||
} else {
|
||||
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(
|
||||
200
|
||||
);
|
||||
expect(
|
||||
completedResponse.body.data[0].metadata.error
|
||||
).toBeUndefined();
|
||||
|
||||
const childrenLinks = completedResponse.body.data.filter(
|
||||
(doc) =>
|
||||
doc.metadata &&
|
||||
doc.metadata.sourceURL
|
||||
);
|
||||
|
||||
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||
},
|
||||
180000
|
||||
); // 120 seconds
|
||||
|
||||
it.concurrent(
|
||||
"If someone cancels a crawl job, it should turn into failed status",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://docs.tatum.io", limit: 200 });
|
||||
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
|
||||
const responseCancel = await request(TEST_URL)
|
||||
.delete(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(responseCancel.statusCode).toBe(200);
|
||||
expect(responseCancel.body).toHaveProperty("status");
|
||||
expect(responseCancel.body.status).toBe("cancelled");
|
||||
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("cancelled");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
})
|
||||
});
|
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
import { crawlController } from '../crawl'
|
||||
import { crawlController } from '../v0/crawl'
|
||||
import { Request, Response } from 'express';
|
||||
import { authenticateUser } from '../auth'; // Ensure this import is correct
|
||||
import { createIdempotencyKey } from '../../services/idempotency/create';
|
||||
|
|
|
@ -1,87 +0,0 @@
|
|||
import { Request, Response } from "express";
|
||||
|
||||
import { Job } from "bull";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getWebScraperQueue } from "../../services/queue-service";
|
||||
import { checkAlerts } from "../../services/alerts";
|
||||
|
||||
export async function cleanBefore24hCompleteJobsController(
|
||||
req: Request,
|
||||
res: Response
|
||||
) {
|
||||
Logger.info("🐂 Cleaning jobs older than 24h");
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const batchSize = 10;
|
||||
const numberOfBatches = 9; // Adjust based on your needs
|
||||
const completedJobsPromises: Promise<Job[]>[] = [];
|
||||
for (let i = 0; i < numberOfBatches; i++) {
|
||||
completedJobsPromises.push(
|
||||
webScraperQueue.getJobs(
|
||||
["completed"],
|
||||
i * batchSize,
|
||||
i * batchSize + batchSize,
|
||||
true
|
||||
)
|
||||
);
|
||||
}
|
||||
const completedJobs: Job[] = (
|
||||
await Promise.all(completedJobsPromises)
|
||||
).flat();
|
||||
const before24hJobs =
|
||||
completedJobs.filter(
|
||||
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||
) || [];
|
||||
|
||||
let count = 0;
|
||||
|
||||
if (!before24hJobs) {
|
||||
return res.status(200).send(`No jobs to remove.`);
|
||||
}
|
||||
|
||||
for (const job of before24hJobs) {
|
||||
try {
|
||||
await job.remove();
|
||||
count++;
|
||||
} catch (jobError) {
|
||||
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
|
||||
}
|
||||
}
|
||||
return res.status(200).send(`Removed ${count} completed jobs.`);
|
||||
} catch (error) {
|
||||
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
|
||||
return res.status(500).send("Failed to clean jobs");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export async function checkQueuesController(req: Request, res: Response) {
|
||||
try {
|
||||
await checkAlerts();
|
||||
return res.status(200).send("Alerts initialized");
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to initialize alerts: ${error}`);
|
||||
return res.status(500).send("Failed to initialize alerts");
|
||||
}
|
||||
}
|
||||
|
||||
// Use this as a "health check" that way we dont destroy the server
|
||||
export async function queuesController(req: Request, res: Response) {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
|
||||
const [webScraperActive] = await Promise.all([
|
||||
webScraperQueue.getActiveCount(),
|
||||
]);
|
||||
|
||||
const noActiveJobs = webScraperActive === 0;
|
||||
// 200 if no active jobs, 503 if there are active jobs
|
||||
return res.status(noActiveJobs ? 200 : 500).json({
|
||||
webScraperActive,
|
||||
noActiveJobs,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
|
@ -1,26 +1,97 @@
|
|||
import { parseApi } from "../../src/lib/parseApi";
|
||||
import { getRateLimiter, } from "../../src/services/rate-limiter";
|
||||
import { AuthResponse, NotificationType, RateLimiterMode } from "../../src/types";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { withAuth } from "../../src/lib/withAuth";
|
||||
import { parseApi } from "../lib/parseApi";
|
||||
import { getRateLimiter } from "../services/rate-limiter";
|
||||
import {
|
||||
AuthResponse,
|
||||
NotificationType,
|
||||
PlanType,
|
||||
RateLimiterMode,
|
||||
} from "../types";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { withAuth } from "../lib/withAuth";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
|
||||
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
|
||||
import { sendNotification } from "../services/notification/email_notification";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
|
||||
import { redlock } from "../services/redlock";
|
||||
import { getValue } from "../services/redis";
|
||||
import { setValue } from "../services/redis";
|
||||
import { validate } from "uuid";
|
||||
import * as Sentry from "@sentry/node";
|
||||
// const { data, error } = await supabase_service
|
||||
// .from('api_keys')
|
||||
// .select(`
|
||||
// key,
|
||||
// team_id,
|
||||
// teams (
|
||||
// subscriptions (
|
||||
// price_id
|
||||
// )
|
||||
// )
|
||||
// `)
|
||||
// .eq('key', normalizedApi)
|
||||
// .limit(1)
|
||||
// .single();
|
||||
function normalizedApiIsUuid(potentialUuid: string): boolean {
|
||||
// Check if the string is a valid UUID
|
||||
return validate(potentialUuid);
|
||||
}
|
||||
export async function authenticateUser(
|
||||
req,
|
||||
res,
|
||||
mode?: RateLimiterMode
|
||||
): Promise<AuthResponse> {
|
||||
return withAuth(supaAuthenticateUser)(req, res, mode);
|
||||
}
|
||||
function setTrace(team_id: string, api_key: string) {
|
||||
try {
|
||||
setTraceAttributes({
|
||||
team_id,
|
||||
api_key
|
||||
api_key,
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error setting trace attributes: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
async function getKeyAndPriceId(normalizedApi: string): Promise<{
|
||||
success: boolean;
|
||||
teamId?: string;
|
||||
priceId?: string;
|
||||
error?: string;
|
||||
status?: number;
|
||||
}> {
|
||||
const { data, error } = await supabase_service.rpc("get_key_and_price_id_2", {
|
||||
api_key: normalizedApi,
|
||||
});
|
||||
if (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`RPC ERROR (get_key_and_price_id_2): ${error.message}`);
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"The server seems overloaded. Please contact hello@firecrawl.com if you aren't sending too many requests at once.",
|
||||
status: 500,
|
||||
};
|
||||
}
|
||||
if (!data || data.length === 0) {
|
||||
if (error) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
Sentry.captureException(error);
|
||||
}
|
||||
// TODO: change this error code ?
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
} else {
|
||||
return {
|
||||
success: true,
|
||||
teamId: data[0].team_id,
|
||||
priceId: data[0].price_id,
|
||||
};
|
||||
}
|
||||
}
|
||||
export async function supaAuthenticateUser(
|
||||
req,
|
||||
|
@ -31,9 +102,10 @@ export async function supaAuthenticateUser(
|
|||
team_id?: string;
|
||||
error?: string;
|
||||
status?: number;
|
||||
plan?: string;
|
||||
plan?: PlanType;
|
||||
}> {
|
||||
const authHeader = req.headers.authorization;
|
||||
|
||||
const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] ? `Bearer ${req.headers["sec-websocket-protocol"]}` : null);
|
||||
if (!authHeader) {
|
||||
return { success: false, error: "Unauthorized", status: 401 };
|
||||
}
|
||||
|
@ -51,20 +123,88 @@ export async function supaAuthenticateUser(
|
|||
const iptoken = incomingIP + token;
|
||||
|
||||
let rateLimiter: RateLimiterRedis;
|
||||
let subscriptionData: { team_id: string, plan: string } | null = null;
|
||||
let subscriptionData: { team_id: string; plan: string } | null = null;
|
||||
let normalizedApi: string;
|
||||
|
||||
let team_id: string;
|
||||
let cacheKey = "";
|
||||
let redLockKey = "";
|
||||
const lockTTL = 15000; // 10 seconds
|
||||
let teamId: string | null = null;
|
||||
let priceId: string | null = null;
|
||||
|
||||
if (token == "this_is_just_a_preview_token") {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
team_id = "preview";
|
||||
if (mode == RateLimiterMode.CrawlStatus) {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
} else {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
}
|
||||
teamId = "preview";
|
||||
} else {
|
||||
normalizedApi = parseApi(token);
|
||||
if (!normalizedApiIsUuid(normalizedApi)) {
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
}
|
||||
|
||||
cacheKey = `api_key:${normalizedApi}`;
|
||||
|
||||
try {
|
||||
const teamIdPriceId = await getValue(cacheKey);
|
||||
if (teamIdPriceId) {
|
||||
const { team_id, price_id } = JSON.parse(teamIdPriceId);
|
||||
teamId = team_id;
|
||||
priceId = price_id;
|
||||
} else {
|
||||
const {
|
||||
success,
|
||||
teamId: tId,
|
||||
priceId: pId,
|
||||
error,
|
||||
status,
|
||||
} = await getKeyAndPriceId(normalizedApi);
|
||||
if (!success) {
|
||||
return { success, error, status };
|
||||
}
|
||||
teamId = tId;
|
||||
priceId = pId;
|
||||
await setValue(
|
||||
cacheKey,
|
||||
JSON.stringify({ team_id: teamId, price_id: priceId }),
|
||||
60
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error with auth function: ${error}`);
|
||||
// const {
|
||||
// success,
|
||||
// teamId: tId,
|
||||
// priceId: pId,
|
||||
// error: e,
|
||||
// status,
|
||||
// } = await getKeyAndPriceId(normalizedApi);
|
||||
// if (!success) {
|
||||
// return { success, error: e, status };
|
||||
// }
|
||||
// teamId = tId;
|
||||
// priceId = pId;
|
||||
// const {
|
||||
// success,
|
||||
// teamId: tId,
|
||||
// priceId: pId,
|
||||
// error: e,
|
||||
// status,
|
||||
// } = await getKeyAndPriceId(normalizedApi);
|
||||
// if (!success) {
|
||||
// return { success, error: e, status };
|
||||
// }
|
||||
// teamId = tId;
|
||||
// priceId = pId;
|
||||
}
|
||||
|
||||
const { data, error } = await supabase_service.rpc(
|
||||
'get_key_and_price_id_2', { api_key: normalizedApi }
|
||||
);
|
||||
// get_key_and_price_id_2 rpc definition:
|
||||
// create or replace function get_key_and_price_id_2(api_key uuid)
|
||||
// returns table(key uuid, team_id uuid, price_id text) as $$
|
||||
|
@ -82,46 +222,47 @@ export async function supaAuthenticateUser(
|
|||
// end;
|
||||
// $$ language plpgsql;
|
||||
|
||||
if (error) {
|
||||
Logger.warn(`Error fetching key and price_id: ${error.message}`);
|
||||
} else {
|
||||
// console.log('Key and Price ID:', data);
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
}
|
||||
const internal_team_id = data[0].team_id;
|
||||
team_id = internal_team_id;
|
||||
|
||||
const plan = getPlanByPriceId(data[0].price_id);
|
||||
const plan = getPlanByPriceId(priceId);
|
||||
// HyperDX Logging
|
||||
setTrace(team_id, normalizedApi);
|
||||
setTrace(teamId, normalizedApi);
|
||||
subscriptionData = {
|
||||
team_id: team_id,
|
||||
plan: plan
|
||||
}
|
||||
team_id: teamId,
|
||||
plan: plan,
|
||||
};
|
||||
switch (mode) {
|
||||
case RateLimiterMode.Crawl:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Crawl,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.Scrape:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Scrape,
|
||||
token,
|
||||
subscriptionData.plan,
|
||||
teamId
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.Search:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan);
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Search,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.Map:
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Map,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.CrawlStatus:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
break;
|
||||
|
||||
|
||||
case RateLimiterMode.Preview:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
break;
|
||||
|
@ -134,7 +275,8 @@ export async function supaAuthenticateUser(
|
|||
}
|
||||
}
|
||||
|
||||
const team_endpoint_token = token === "this_is_just_a_preview_token" ? iptoken : team_id;
|
||||
const team_endpoint_token =
|
||||
token === "this_is_just_a_preview_token" ? iptoken : teamId;
|
||||
|
||||
try {
|
||||
await rateLimiter.consume(team_endpoint_token);
|
||||
|
@ -147,17 +289,32 @@ export async function supaAuthenticateUser(
|
|||
const startDate = new Date();
|
||||
const endDate = new Date();
|
||||
endDate.setDate(endDate.getDate() + 7);
|
||||
|
||||
// await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
|
||||
// Cache longer for 429s
|
||||
if (teamId && priceId && mode !== RateLimiterMode.Preview) {
|
||||
await setValue(
|
||||
cacheKey,
|
||||
JSON.stringify({ team_id: teamId, price_id: priceId }),
|
||||
60 // 10 seconds, cache for everything
|
||||
);
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
||||
error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
||||
status: 429,
|
||||
};
|
||||
}
|
||||
|
||||
if (
|
||||
token === "this_is_just_a_preview_token" &&
|
||||
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
|
||||
(mode === RateLimiterMode.Scrape ||
|
||||
mode === RateLimiterMode.Preview ||
|
||||
mode === RateLimiterMode.Map ||
|
||||
mode === RateLimiterMode.Crawl ||
|
||||
mode === RateLimiterMode.CrawlStatus ||
|
||||
mode === RateLimiterMode.Search)
|
||||
) {
|
||||
return { success: true, team_id: "preview" };
|
||||
// check the origin of the request and make sure its from firecrawl.dev
|
||||
|
@ -181,10 +338,11 @@ export async function supaAuthenticateUser(
|
|||
.select("*")
|
||||
.eq("key", normalizedApi);
|
||||
|
||||
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
if (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
}
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
|
@ -195,26 +353,32 @@ export async function supaAuthenticateUser(
|
|||
subscriptionData = data[0];
|
||||
}
|
||||
|
||||
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
|
||||
return {
|
||||
success: true,
|
||||
team_id: subscriptionData.team_id,
|
||||
plan: (subscriptionData.plan ?? "") as PlanType,
|
||||
};
|
||||
}
|
||||
function getPlanByPriceId(price_id: string) {
|
||||
function getPlanByPriceId(price_id: string): PlanType {
|
||||
switch (price_id) {
|
||||
case process.env.STRIPE_PRICE_ID_STARTER:
|
||||
return 'starter';
|
||||
return "starter";
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD:
|
||||
return 'standard';
|
||||
return "standard";
|
||||
case process.env.STRIPE_PRICE_ID_SCALE:
|
||||
return 'scale';
|
||||
return "scale";
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY:
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
|
||||
return 'hobby';
|
||||
return "hobby";
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
|
||||
return 'standardnew';
|
||||
return "standardnew";
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH:
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
|
||||
return 'growth';
|
||||
return "growth";
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH_DOUBLE_MONTHLY:
|
||||
return "growthdouble";
|
||||
default:
|
||||
return 'free';
|
||||
return "free";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,52 +0,0 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlStatusController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.CrawlStatus
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
const job = await getWebScraperQueue().getJob(req.params.jobId);
|
||||
if (!job) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
}
|
||||
|
||||
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||
|
||||
let data = job.returnvalue;
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
|
||||
if (supabaseData) {
|
||||
data = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
const jobStatus = await job.getState();
|
||||
|
||||
res.json({
|
||||
status: jobStatus,
|
||||
// progress: job.progress(),
|
||||
current,
|
||||
current_url,
|
||||
current_step,
|
||||
total,
|
||||
data: data ? data : null,
|
||||
partial_data: jobStatus == 'completed' ? [] : partialDocs,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
|
@ -1,110 +0,0 @@
|
|||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../../src/scraper/WebScraper";
|
||||
import { billTeam } from "../../src/services/billing/credit_billing";
|
||||
import { checkTeamCredits } from "../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Crawl
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
return res.status(409).json({ error: "Idempotency key already used" });
|
||||
}
|
||||
try {
|
||||
createIdempotencyKey(req);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
|
||||
const url = req.body.url;
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res
|
||||
.status(403)
|
||||
.json({
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
});
|
||||
}
|
||||
|
||||
const mode = req.body.mode ?? "crawl";
|
||||
|
||||
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
try {
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
jobId: uuidv4(),
|
||||
mode: "single_urls",
|
||||
urls: [url],
|
||||
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
pageOptions: pageOptions,
|
||||
});
|
||||
|
||||
const docs = await a.getDocuments(false, (progress) => {
|
||||
job.progress({
|
||||
current: progress.current,
|
||||
total: progress.total,
|
||||
current_step: "SCRAPING",
|
||||
current_url: progress.currentDocumentUrl,
|
||||
});
|
||||
});
|
||||
return res.json({
|
||||
success: true,
|
||||
documents: docs,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
||||
const job = await addWebScraperJob({
|
||||
url: url,
|
||||
mode: mode ?? "crawl", // fix for single urls not working
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
});
|
||||
|
||||
await logCrawl(job.id.toString(), team_id);
|
||||
|
||||
res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
|
@ -1,46 +0,0 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Preview
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
// authenticate on supabase
|
||||
const url = req.body.url;
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
|
||||
}
|
||||
|
||||
const mode = req.body.mode ?? "crawl";
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
|
||||
|
||||
const job = await addWebScraperJob({
|
||||
url: url,
|
||||
mode: mode ?? "crawl", // fix for single urls not working
|
||||
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
|
||||
team_id: "preview",
|
||||
pageOptions: pageOptions,
|
||||
origin: "website-preview",
|
||||
});
|
||||
|
||||
res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
|
@ -1,196 +0,0 @@
|
|||
import { ExtractorOptions, PageOptions } from './../lib/entities';
|
||||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { Document } from "../lib/entities";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from '../lib/logger';
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions: ExtractorOptions,
|
||||
timeout: number,
|
||||
plan?: string
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
data?: Document;
|
||||
returnCode: number;
|
||||
}> {
|
||||
const url = req.body.url;
|
||||
if (!url) {
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||
}
|
||||
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
jobId,
|
||||
mode: "single_urls",
|
||||
urls: [url],
|
||||
crawlerOptions: {
|
||||
...crawlerOptions,
|
||||
},
|
||||
pageOptions: pageOptions,
|
||||
extractorOptions: extractorOptions,
|
||||
});
|
||||
|
||||
const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
|
||||
setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
|
||||
);
|
||||
|
||||
const docsPromise = a.getDocuments(false);
|
||||
|
||||
let docs;
|
||||
try {
|
||||
docs = await Promise.race([docsPromise, timeoutPromise]);
|
||||
} catch (error) {
|
||||
return error;
|
||||
}
|
||||
|
||||
// make sure doc.content is not empty
|
||||
let filteredDocs = docs.filter(
|
||||
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
||||
);
|
||||
if (filteredDocs.length === 0) {
|
||||
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
|
||||
}
|
||||
|
||||
|
||||
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
||||
filteredDocs.forEach(doc => {
|
||||
delete doc.rawHtml;
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: filteredDocs[0],
|
||||
returnCode: 200,
|
||||
};
|
||||
}
|
||||
|
||||
export async function scrapeController(req: Request, res: Response) {
|
||||
try {
|
||||
let earlyReturn = false;
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Scrape
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
|
||||
const origin = req.body.origin ?? defaultOrigin;
|
||||
let timeout = req.body.timeout ?? defaultTimeout;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
pageOptions.onlyMainContent = true;
|
||||
timeout = req.body.timeout ?? 90000;
|
||||
}
|
||||
|
||||
const checkCredits = async () => {
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
earlyReturn = true;
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
earlyReturn = true;
|
||||
return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
await checkCredits();
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const result = await scrapeHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout,
|
||||
plan
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
|
||||
|
||||
if (result.success) {
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
const creditsPerLLMExtract = 50;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
creditsToBeBilled += creditsPerLLMExtract;
|
||||
}
|
||||
|
||||
let startTimeBilling = new Date().getTime();
|
||||
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
const billingResult = await billTeam(
|
||||
team_id,
|
||||
creditsToBeBilled
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error: "Failed to bill team. Insufficient credits or subscription not found.",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: 1,
|
||||
docs: [result.data],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
|
@ -1,43 +0,0 @@
|
|||
import { Request, Response } from "express";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
const job = await getWebScraperQueue().getJob(req.params.jobId);
|
||||
if (!job) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
}
|
||||
|
||||
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||
let data = job.returnvalue;
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
|
||||
if (supabaseData) {
|
||||
data = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
let jobStatus = await job.getState();
|
||||
if (jobStatus === 'waiting' || jobStatus === 'stuck') {
|
||||
jobStatus = 'active';
|
||||
}
|
||||
|
||||
res.json({
|
||||
status: jobStatus,
|
||||
// progress: job.progress(),
|
||||
current,
|
||||
current_url,
|
||||
current_step,
|
||||
total,
|
||||
data: data ? data : null,
|
||||
partial_data: jobStatus == 'completed' ? [] : partialDocs,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
199
apps/api/src/controllers/v0/admin/queue.ts
Normal file
199
apps/api/src/controllers/v0/admin/queue.ts
Normal file
|
@ -0,0 +1,199 @@
|
|||
import { Request, Response } from "express";
|
||||
|
||||
import { Job } from "bullmq";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { getScrapeQueue } from "../../../services/queue-service";
|
||||
import { checkAlerts } from "../../../services/alerts";
|
||||
import { sendSlackWebhook } from "../../../services/alerts/slack";
|
||||
|
||||
export async function cleanBefore24hCompleteJobsController(
|
||||
req: Request,
|
||||
res: Response
|
||||
) {
|
||||
Logger.info("🐂 Cleaning jobs older than 24h");
|
||||
try {
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const batchSize = 10;
|
||||
const numberOfBatches = 9; // Adjust based on your needs
|
||||
const completedJobsPromises: Promise<Job[]>[] = [];
|
||||
for (let i = 0; i < numberOfBatches; i++) {
|
||||
completedJobsPromises.push(
|
||||
scrapeQueue.getJobs(
|
||||
["completed"],
|
||||
i * batchSize,
|
||||
i * batchSize + batchSize,
|
||||
true
|
||||
)
|
||||
);
|
||||
}
|
||||
const completedJobs: Job[] = (
|
||||
await Promise.all(completedJobsPromises)
|
||||
).flat();
|
||||
const before24hJobs =
|
||||
completedJobs.filter(
|
||||
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||
) || [];
|
||||
|
||||
let count = 0;
|
||||
|
||||
if (!before24hJobs) {
|
||||
return res.status(200).send(`No jobs to remove.`);
|
||||
}
|
||||
|
||||
for (const job of before24hJobs) {
|
||||
try {
|
||||
await job.remove();
|
||||
count++;
|
||||
} catch (jobError) {
|
||||
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
|
||||
}
|
||||
}
|
||||
return res.status(200).send(`Removed ${count} completed jobs.`);
|
||||
} catch (error) {
|
||||
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
|
||||
return res.status(500).send("Failed to clean jobs");
|
||||
}
|
||||
}
|
||||
|
||||
export async function checkQueuesController(req: Request, res: Response) {
|
||||
try {
|
||||
await checkAlerts();
|
||||
return res.status(200).send("Alerts initialized");
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to initialize alerts: ${error}`);
|
||||
return res.status(500).send("Failed to initialize alerts");
|
||||
}
|
||||
}
|
||||
|
||||
// Use this as a "health check" that way we dont destroy the server
|
||||
export async function queuesController(req: Request, res: Response) {
|
||||
try {
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
|
||||
const [webScraperActive] = await Promise.all([
|
||||
scrapeQueue.getActiveCount(),
|
||||
]);
|
||||
|
||||
const noActiveJobs = webScraperActive === 0;
|
||||
// 200 if no active jobs, 503 if there are active jobs
|
||||
return res.status(noActiveJobs ? 200 : 500).json({
|
||||
webScraperActive,
|
||||
noActiveJobs,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
||||
export async function autoscalerController(req: Request, res: Response) {
|
||||
try {
|
||||
const maxNumberOfMachines = 80;
|
||||
const minNumberOfMachines = 20;
|
||||
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
|
||||
const [webScraperActive, webScraperWaiting, webScraperPriority] =
|
||||
await Promise.all([
|
||||
scrapeQueue.getActiveCount(),
|
||||
scrapeQueue.getWaitingCount(),
|
||||
scrapeQueue.getPrioritizedCount(),
|
||||
]);
|
||||
|
||||
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
|
||||
|
||||
// get number of machines active
|
||||
const request = await fetch(
|
||||
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
|
||||
{
|
||||
headers: {
|
||||
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
|
||||
},
|
||||
}
|
||||
);
|
||||
const machines = await request.json();
|
||||
|
||||
// Only worker machines
|
||||
const activeMachines = machines.filter(
|
||||
(machine) =>
|
||||
(machine.state === "started" ||
|
||||
machine.state === "starting" ||
|
||||
machine.state === "replacing") &&
|
||||
machine.config.env["FLY_PROCESS_GROUP"] === "worker"
|
||||
).length;
|
||||
|
||||
let targetMachineCount = activeMachines;
|
||||
|
||||
const baseScaleUp = 10;
|
||||
// Slow scale down
|
||||
const baseScaleDown = 2;
|
||||
|
||||
// Scale up logic
|
||||
if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
|
||||
targetMachineCount = Math.min(
|
||||
maxNumberOfMachines,
|
||||
activeMachines + baseScaleUp * 3
|
||||
);
|
||||
} else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
|
||||
targetMachineCount = Math.min(
|
||||
maxNumberOfMachines,
|
||||
activeMachines + baseScaleUp * 2
|
||||
);
|
||||
} else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
|
||||
targetMachineCount = Math.min(
|
||||
maxNumberOfMachines,
|
||||
activeMachines + baseScaleUp
|
||||
);
|
||||
}
|
||||
|
||||
// Scale down logic
|
||||
if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
|
||||
targetMachineCount = Math.max(
|
||||
minNumberOfMachines,
|
||||
activeMachines - baseScaleDown * 3
|
||||
);
|
||||
} else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
|
||||
targetMachineCount = Math.max(
|
||||
minNumberOfMachines,
|
||||
activeMachines - baseScaleDown * 2
|
||||
);
|
||||
} else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
|
||||
targetMachineCount = Math.max(
|
||||
minNumberOfMachines,
|
||||
activeMachines - baseScaleDown
|
||||
);
|
||||
}
|
||||
|
||||
if (targetMachineCount !== activeMachines) {
|
||||
Logger.info(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
|
||||
);
|
||||
|
||||
if (targetMachineCount > activeMachines) {
|
||||
sendSlackWebhook(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
||||
false,
|
||||
process.env.SLACK_AUTOSCALER ?? ""
|
||||
);
|
||||
} else {
|
||||
sendSlackWebhook(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
||||
false,
|
||||
process.env.SLACK_AUTOSCALER ?? ""
|
||||
);
|
||||
}
|
||||
return res.status(200).json({
|
||||
mode: "scale-descale",
|
||||
count: targetMachineCount,
|
||||
});
|
||||
}
|
||||
|
||||
return res.status(200).json({
|
||||
mode: "normal",
|
||||
count: activeMachines,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).send("Failed to initialize autoscaler");
|
||||
}
|
||||
}
|
|
@ -1,7 +1,7 @@
|
|||
import { Request, Response } from "express";
|
||||
import Redis from "ioredis";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { redisRateLimitClient } from "../../services/rate-limiter";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { redisRateLimitClient } from "../../../services/rate-limiter";
|
||||
|
||||
export async function redisHealthController(req: Request, res: Response) {
|
||||
const retryOperation = async (operation, retries = 3) => {
|
|
@ -1,11 +1,10 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { billTeam } from "../../src/services/billing/credit_billing";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { supabase_service } from "../../../src/services/supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
||||
|
@ -19,8 +18,9 @@ export async function crawlCancelController(req: Request, res: Response) {
|
|||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
const job = await getWebScraperQueue().getJob(req.params.jobId);
|
||||
if (!job) {
|
||||
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
}
|
||||
|
||||
|
@ -40,31 +40,18 @@ export async function crawlCancelController(req: Request, res: Response) {
|
|||
}
|
||||
}
|
||||
|
||||
const jobState = await job.getState();
|
||||
const { partialDocs } = await job.progress();
|
||||
|
||||
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
|
||||
Logger.info("Billing team for partial docs...");
|
||||
// Note: the credits that we will bill them here might be lower than the actual
|
||||
// due to promises that are not yet resolved
|
||||
await billTeam(team_id, partialDocs.length);
|
||||
}
|
||||
|
||||
try {
|
||||
await getWebScraperQueue().client.del(job.lockKey());
|
||||
await job.takeLock();
|
||||
await job.discard();
|
||||
await job.moveToFailed(Error("Job cancelled by user"), true);
|
||||
sc.cancelled = true;
|
||||
await saveCrawl(req.params.jobId, sc);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
}
|
||||
|
||||
const newJobState = await job.getState();
|
||||
|
||||
res.json({
|
||||
status: "cancelled"
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
84
apps/api/src/controllers/v0/crawl-status.ts
Normal file
84
apps/api/src/controllers/v0/crawl-status.ts
Normal file
|
@ -0,0 +1,84 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function getJobs(ids: string[]) {
|
||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobsById(ids);
|
||||
|
||||
supabaseData.forEach(x => {
|
||||
const job = jobs.find(y => y.id === x.job_id);
|
||||
if (job) {
|
||||
job.returnvalue = x.docs;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
jobs.forEach(job => {
|
||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
||||
});
|
||||
|
||||
return jobs;
|
||||
}
|
||||
|
||||
export async function crawlStatusController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.CrawlStatus
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
}
|
||||
|
||||
if (sc.team_id !== team_id) {
|
||||
return res.status(403).json({ error: "Forbidden" });
|
||||
}
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
|
||||
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
|
||||
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
|
||||
|
||||
const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
|
||||
|
||||
if (
|
||||
jobs.length > 0 &&
|
||||
jobs[0].data &&
|
||||
jobs[0].data.pageOptions &&
|
||||
!jobs[0].data.pageOptions.includeRawHtml
|
||||
) {
|
||||
data.forEach(item => {
|
||||
if (item) {
|
||||
delete item.rawHtml;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
res.json({
|
||||
status: jobStatus,
|
||||
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
|
||||
total: jobs.length,
|
||||
data: jobStatus === "completed" ? data : null,
|
||||
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
232
apps/api/src/controllers/v0/crawl.ts
Normal file
232
apps/api/src/controllers/v0/crawl.ts
Normal file
|
@ -0,0 +1,232 @@
|
|||
import { Request, Response } from "express";
|
||||
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Crawl
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
return res.status(409).json({ error: "Idempotency key already used" });
|
||||
}
|
||||
try {
|
||||
createIdempotencyKey(req);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
||||
const crawlerOptions = {
|
||||
...defaultCrawlerOptions,
|
||||
...req.body.crawlerOptions,
|
||||
};
|
||||
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
if (Array.isArray(crawlerOptions.includes)) {
|
||||
for (const x of crawlerOptions.includes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(crawlerOptions.excludes)) {
|
||||
for (const x of crawlerOptions.excludes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const limitCheck = req.body?.crawlerOptions?.limit ?? 1;
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage, remainingCredits } =
|
||||
await checkTeamCredits(team_id, limitCheck);
|
||||
|
||||
if (!creditsCheckSuccess) {
|
||||
return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" });
|
||||
}
|
||||
|
||||
// TODO: need to do this to v1
|
||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||
|
||||
let url = req.body.url;
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
if (typeof url !== "string") {
|
||||
return res.status(400).json({ error: "URL must be a string" });
|
||||
}
|
||||
try {
|
||||
url = checkAndUpdateURL(url).url;
|
||||
} catch (e) {
|
||||
return res
|
||||
.status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
|
||||
.json({ error: e.message ?? e });
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res.status(403).json({
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
});
|
||||
}
|
||||
|
||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
// try {
|
||||
// const a = new WebScraperDataProvider();
|
||||
// await a.setOptions({
|
||||
// jobId: uuidv4(),
|
||||
// mode: "single_urls",
|
||||
// urls: [url],
|
||||
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
// pageOptions: pageOptions,
|
||||
// });
|
||||
|
||||
// const docs = await a.getDocuments(false, (progress) => {
|
||||
// job.updateProgress({
|
||||
// current: progress.current,
|
||||
// total: progress.total,
|
||||
// current_step: "SCRAPING",
|
||||
// current_url: progress.currentDocumentUrl,
|
||||
// });
|
||||
// });
|
||||
// return res.json({
|
||||
// success: true,
|
||||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
|
||||
const id = uuidv4();
|
||||
|
||||
await logCrawl(id, team_id);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
team_id,
|
||||
plan,
|
||||
createdAt: Date.now(),
|
||||
};
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
try {
|
||||
sc.robots = await crawler.getRobotsTxt();
|
||||
} catch (_) {}
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
const sitemap = sc.crawlerOptions?.ignoreSitemap
|
||||
? null
|
||||
: await crawler.tryGetSitemap();
|
||||
|
||||
|
||||
if (sitemap !== null && sitemap.length > 0) {
|
||||
let jobPriority = 20;
|
||||
// If it is over 1000, we need to get the job priority,
|
||||
// otherwise we can use the default priority of 20
|
||||
if(sitemap.length > 1000){
|
||||
// set base to 21
|
||||
jobPriority = await getJobPriority({plan, team_id, basePriority: 21})
|
||||
}
|
||||
const jobs = sitemap.map((x) => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: jobPriority,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
await lockURLs(
|
||||
id,
|
||||
jobs.map((x) => x.data.url)
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId)
|
||||
);
|
||||
if (Sentry.isInitialized()) {
|
||||
for (const job of jobs) {
|
||||
// add with sentry instrumentation
|
||||
await addScrapeJob(job.data as any, {}, job.opts.jobId);
|
||||
}
|
||||
} else {
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
}
|
||||
} else {
|
||||
await lockURL(id, sc, url);
|
||||
|
||||
// Not needed, first one should be 15.
|
||||
// const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
|
||||
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
crawl_id: id,
|
||||
},
|
||||
{
|
||||
priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
|
||||
}
|
||||
);
|
||||
await addCrawlJob(id, job.id);
|
||||
}
|
||||
|
||||
res.json({ jobId: id });
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
138
apps/api/src/controllers/v0/crawlPreview.ts
Normal file
138
apps/api/src/controllers/v0/crawlPreview.ts
Normal file
|
@ -0,0 +1,138 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function crawlPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, error, status, team_id:a, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Preview
|
||||
);
|
||||
|
||||
const team_id = "preview";
|
||||
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
let url = req.body.url;
|
||||
if (!url) {
|
||||
return res.status(400).json({ error: "Url is required" });
|
||||
}
|
||||
try {
|
||||
url = checkAndUpdateURL(url).url;
|
||||
} catch (e) {
|
||||
return res
|
||||
.status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
|
||||
.json({ error: e.message ?? e });
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res
|
||||
.status(403)
|
||||
.json({
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
});
|
||||
}
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
|
||||
|
||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
// try {
|
||||
// const a = new WebScraperDataProvider();
|
||||
// await a.setOptions({
|
||||
// jobId: uuidv4(),
|
||||
// mode: "single_urls",
|
||||
// urls: [url],
|
||||
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
// pageOptions: pageOptions,
|
||||
// });
|
||||
|
||||
// const docs = await a.getDocuments(false, (progress) => {
|
||||
// job.updateProgress({
|
||||
// current: progress.current,
|
||||
// total: progress.total,
|
||||
// current_step: "SCRAPING",
|
||||
// current_url: progress.currentDocumentUrl,
|
||||
// });
|
||||
// });
|
||||
// return res.json({
|
||||
// success: true,
|
||||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
|
||||
const id = uuidv4();
|
||||
|
||||
let robots;
|
||||
|
||||
try {
|
||||
robots = await this.getRobotsTxt();
|
||||
} catch (_) {}
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
team_id,
|
||||
plan,
|
||||
robots,
|
||||
createdAt: Date.now(),
|
||||
};
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
|
||||
|
||||
if (sitemap !== null) {
|
||||
for (const url of sitemap.map(x => x.url)) {
|
||||
await lockURL(id, sc, url);
|
||||
const job = await addScrapeJob({
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: "website-preview",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
});
|
||||
await addCrawlJob(id, job.id);
|
||||
}
|
||||
} else {
|
||||
await lockURL(id, sc, url);
|
||||
const job = await addScrapeJob({
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: "website-preview",
|
||||
crawl_id: id,
|
||||
});
|
||||
await addCrawlJob(id, job.id);
|
||||
}
|
||||
|
||||
res.json({ jobId: id });
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
|
@ -1,8 +1,8 @@
|
|||
|
||||
import { AuthResponse, RateLimiterMode } from "../types";
|
||||
import { AuthResponse, RateLimiterMode } from "../../types";
|
||||
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
|
||||
|
||||
export const keyAuthController = async (req: Request, res: Response) => {
|
299
apps/api/src/controllers/v0/scrape.ts
Normal file
299
apps/api/src/controllers/v0/scrape.ts
Normal file
|
@ -0,0 +1,299 @@
|
|||
import { ExtractorOptions, PageOptions } from "./../../lib/entities";
|
||||
import { Request, Response } from "express";
|
||||
import {
|
||||
billTeam,
|
||||
checkTeamCredits,
|
||||
} from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { PlanType, RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { Document } from "../../lib/entities";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||
import {
|
||||
defaultPageOptions,
|
||||
defaultExtractorOptions,
|
||||
defaultTimeout,
|
||||
defaultOrigin,
|
||||
} from "../../lib/default-values";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions: ExtractorOptions,
|
||||
timeout: number,
|
||||
plan?: PlanType
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
data?: Document;
|
||||
returnCode: number;
|
||||
}> {
|
||||
const url = req.body.url;
|
||||
if (!url) {
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
returnCode: 403,
|
||||
};
|
||||
}
|
||||
|
||||
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
|
||||
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
team_id,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
is_scrape: true,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
jobPriority
|
||||
);
|
||||
|
||||
let doc;
|
||||
|
||||
const err = await Sentry.startSpan(
|
||||
{
|
||||
name: "Wait for job to finish",
|
||||
op: "bullmq.wait",
|
||||
attributes: { job: jobId },
|
||||
},
|
||||
async (span) => {
|
||||
try {
|
||||
doc = (await waitForJob(job.id, timeout))[0];
|
||||
} catch (e) {
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
span.setAttribute("timedOut", true);
|
||||
return {
|
||||
success: false,
|
||||
error: "Request timed out",
|
||||
returnCode: 408,
|
||||
};
|
||||
} else if (
|
||||
typeof e === "string" &&
|
||||
(e.includes("Error generating completions: ") ||
|
||||
e.includes("Invalid schema for function") ||
|
||||
e.includes(
|
||||
"LLM extraction did not match the extraction schema you provided."
|
||||
))
|
||||
) {
|
||||
return {
|
||||
success: false,
|
||||
error: e,
|
||||
returnCode: 500,
|
||||
};
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
span.setAttribute("result", JSON.stringify(doc));
|
||||
return null;
|
||||
}
|
||||
);
|
||||
|
||||
if (err !== null) {
|
||||
return err;
|
||||
}
|
||||
|
||||
await job.remove();
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return {
|
||||
success: true,
|
||||
error: "No page found",
|
||||
returnCode: 200,
|
||||
data: doc,
|
||||
};
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||
if (
|
||||
!pageOptions.includeRawHtml &&
|
||||
extractorOptions.mode == "llm-extraction-from-raw-html"
|
||||
) {
|
||||
if (doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if (!pageOptions.includeHtml) {
|
||||
if (doc.html) {
|
||||
delete doc.html;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: doc,
|
||||
returnCode: 200,
|
||||
};
|
||||
}
|
||||
|
||||
export async function scrapeController(req: Request, res: Response) {
|
||||
try {
|
||||
let earlyReturn = false;
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Scrape
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
const extractorOptions = {
|
||||
...defaultExtractorOptions,
|
||||
...req.body.extractorOptions,
|
||||
};
|
||||
const origin = req.body.origin ?? defaultOrigin;
|
||||
let timeout = req.body.timeout ?? defaultTimeout;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
if (
|
||||
typeof extractorOptions.extractionSchema !== "object" ||
|
||||
extractorOptions.extractionSchema === null
|
||||
) {
|
||||
return res.status(400).json({
|
||||
error:
|
||||
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
|
||||
});
|
||||
}
|
||||
|
||||
pageOptions.onlyMainContent = true;
|
||||
timeout = req.body.timeout ?? 90000;
|
||||
}
|
||||
|
||||
// checkCredits
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
earlyReturn = true;
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
earlyReturn = true;
|
||||
return res.status(500).json({
|
||||
error:
|
||||
"Error checking team credits. Please contact hello@firecrawl.com for help.",
|
||||
});
|
||||
}
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const result = await scrapeHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout,
|
||||
plan
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens =
|
||||
result.data && result.data.markdown
|
||||
? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
|
||||
: 0;
|
||||
|
||||
if (result.success) {
|
||||
let creditsToBeBilled = 1;
|
||||
const creditsPerLLMExtract = 49;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
creditsToBeBilled += creditsPerLLMExtract;
|
||||
}
|
||||
|
||||
let startTimeBilling = new Date().getTime();
|
||||
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
if (creditsToBeBilled > 0) {
|
||||
// billing for doc done on queue end, bill only for llm extraction
|
||||
const billingResult = await billTeam(team_id, creditsToBeBilled);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let doc = result.data;
|
||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
||||
if (doc && doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if(pageOptions && pageOptions.includeExtract) {
|
||||
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
|
||||
delete doc.markdown;
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: 1,
|
||||
docs: [doc],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({
|
||||
error:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error?.message ?? "Internal Server Error",
|
||||
});
|
||||
}
|
||||
}
|
|
@ -1,14 +1,18 @@
|
|||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../lib/entities";
|
||||
import { search } from "../search";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { WebScraperDataProvider } from "../../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { PlanType, RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../../lib/entities";
|
||||
import { search } from "../../search";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function searchHelper(
|
||||
jobId: string,
|
||||
|
@ -17,6 +21,7 @@ export async function searchHelper(
|
|||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
searchOptions: SearchOptions,
|
||||
plan: PlanType
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
|
@ -73,55 +78,57 @@ export async function searchHelper(
|
|||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
}
|
||||
|
||||
const jobPriority = await getJobPriority({plan, team_id, basePriority: 20});
|
||||
|
||||
// filter out social media links
|
||||
|
||||
const jobDatas = res.map(x => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: jobPriority,
|
||||
}
|
||||
};
|
||||
})
|
||||
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
jobId,
|
||||
mode: "single_urls",
|
||||
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
|
||||
crawlerOptions: {
|
||||
...crawlerOptions,
|
||||
},
|
||||
pageOptions: {
|
||||
...pageOptions,
|
||||
onlyMainContent: pageOptions?.onlyMainContent ?? true,
|
||||
fetchPageContent: pageOptions?.fetchPageContent ?? true,
|
||||
includeHtml: pageOptions?.includeHtml ?? false,
|
||||
removeTags: pageOptions?.removeTags ?? [],
|
||||
fallback: false,
|
||||
},
|
||||
});
|
||||
let jobs = [];
|
||||
if (Sentry.isInitialized()) {
|
||||
for (const job of jobDatas) {
|
||||
// add with sentry instrumentation
|
||||
jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId));
|
||||
}
|
||||
} else {
|
||||
jobs = await getScrapeQueue().addBulk(jobDatas);
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
}
|
||||
|
||||
const docs = await a.getDocuments(false);
|
||||
const docs = (await Promise.all(jobs.map(x => waitForJob(x.id, 60000)))).map(x => x[0]);
|
||||
|
||||
if (docs.length === 0) {
|
||||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
}
|
||||
|
||||
await Promise.all(jobs.map(x => x.remove()));
|
||||
|
||||
// make sure doc.content is not empty
|
||||
const filteredDocs = docs.filter(
|
||||
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
|
||||
(doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0
|
||||
);
|
||||
|
||||
if (filteredDocs.length === 0) {
|
||||
return { success: true, error: "No page found", returnCode: 200, data: docs };
|
||||
}
|
||||
|
||||
const billingResult = await billTeam(
|
||||
team_id,
|
||||
filteredDocs.length
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
returnCode: 402,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: filteredDocs,
|
||||
|
@ -132,7 +139,7 @@ export async function searchHelper(
|
|||
export async function searchController(req: Request, res: Response) {
|
||||
try {
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Search
|
||||
|
@ -142,16 +149,16 @@ export async function searchController(req: Request, res: Response) {
|
|||
}
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? {
|
||||
includeHtml: false,
|
||||
onlyMainContent: true,
|
||||
fetchPageContent: true,
|
||||
removeTags: [],
|
||||
fallback: false,
|
||||
includeHtml: req.body.pageOptions?.includeHtml ?? false,
|
||||
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
|
||||
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
|
||||
removeTags: req.body.pageOptions?.removeTags ?? [],
|
||||
fallback: req.body.pageOptions?.fallback ?? false,
|
||||
};
|
||||
const origin = req.body.origin ?? "api";
|
||||
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 7 };
|
||||
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 5 };
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
try {
|
||||
|
@ -161,6 +168,7 @@ export async function searchController(req: Request, res: Response) {
|
|||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: "Internal server error" });
|
||||
}
|
||||
|
@ -172,6 +180,7 @@ export async function searchController(req: Request, res: Response) {
|
|||
crawlerOptions,
|
||||
pageOptions,
|
||||
searchOptions,
|
||||
plan
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
|
@ -191,6 +200,11 @@ export async function searchController(req: Request, res: Response) {
|
|||
});
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.message.startsWith("Job wait")) {
|
||||
return res.status(408).json({ error: "Request timed out" });
|
||||
}
|
||||
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
43
apps/api/src/controllers/v0/status.ts
Normal file
43
apps/api/src/controllers/v0/status.ts
Normal file
|
@ -0,0 +1,43 @@
|
|||
import { Request, Response } from "express";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { getJobs } from "./crawl-status";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
}
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
|
||||
// let data = job.returnvalue;
|
||||
// if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
// const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
|
||||
// if (supabaseData) {
|
||||
// data = supabaseData.docs;
|
||||
// }
|
||||
// }
|
||||
|
||||
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
|
||||
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
|
||||
|
||||
const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
|
||||
|
||||
res.json({
|
||||
status: jobStatus,
|
||||
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
|
||||
total: jobs.length,
|
||||
data: jobStatus === "completed" ? data : null,
|
||||
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
47
apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP
Normal file
47
apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP
Normal file
|
@ -0,0 +1,47 @@
|
|||
import { crawlController } from '../crawl'
|
||||
import { Request, Response } from 'express';
|
||||
import { authenticateUser } from '../auth'; // Ensure this import is correct
|
||||
import { createIdempotencyKey } from '../../services/idempotency/create';
|
||||
import { validateIdempotencyKey } from '../../services/idempotency/validate';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
|
||||
jest.mock('../auth', () => ({
|
||||
authenticateUser: jest.fn().mockResolvedValue({
|
||||
success: true,
|
||||
team_id: 'team123',
|
||||
error: null,
|
||||
status: 200
|
||||
}),
|
||||
reduce: jest.fn()
|
||||
}));
|
||||
jest.mock('../../services/idempotency/validate');
|
||||
|
||||
describe('crawlController', () => {
|
||||
it('should prevent duplicate requests using the same idempotency key', async () => {
|
||||
const req = {
|
||||
headers: {
|
||||
'x-idempotency-key': await uuidv4(),
|
||||
'Authorization': `Bearer ${process.env.TEST_API_KEY}`
|
||||
},
|
||||
body: {
|
||||
url: 'https://mendable.ai'
|
||||
}
|
||||
} as unknown as Request;
|
||||
const res = {
|
||||
status: jest.fn().mockReturnThis(),
|
||||
json: jest.fn()
|
||||
} as unknown as Response;
|
||||
|
||||
// Mock the idempotency key validation to return false for the second call
|
||||
(validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false);
|
||||
|
||||
// First request should succeed
|
||||
await crawlController(req, res);
|
||||
expect(res.status).not.toHaveBeenCalledWith(409);
|
||||
|
||||
// Second request with the same key should fail
|
||||
await crawlController(req, res);
|
||||
expect(res.status).toHaveBeenCalledWith(409);
|
||||
expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' });
|
||||
});
|
||||
});
|
64
apps/api/src/controllers/v1/__tests__/urlValidation.test.ts
Normal file
64
apps/api/src/controllers/v1/__tests__/urlValidation.test.ts
Normal file
|
@ -0,0 +1,64 @@
|
|||
import { url } from "../types";
|
||||
|
||||
describe("URL Schema Validation", () => {
|
||||
beforeEach(() => {
|
||||
jest.resetAllMocks();
|
||||
});
|
||||
|
||||
it("should prepend http:// to URLs without a protocol", () => {
|
||||
const result = url.parse("example.com");
|
||||
expect(result).toBe("http://example.com");
|
||||
});
|
||||
|
||||
it("should allow valid URLs with http or https", () => {
|
||||
expect(() => url.parse("http://example.com")).not.toThrow();
|
||||
expect(() => url.parse("https://example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should allow valid URLs with http or https", () => {
|
||||
expect(() => url.parse("example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should reject URLs with unsupported protocols", () => {
|
||||
expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL");
|
||||
});
|
||||
|
||||
it("should reject URLs without a valid top-level domain", () => {
|
||||
expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path");
|
||||
});
|
||||
|
||||
it("should reject blocked URLs", () => {
|
||||
expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
});
|
||||
|
||||
it("should handle URLs with subdomains correctly", () => {
|
||||
expect(() => url.parse("http://sub.example.com")).not.toThrow();
|
||||
expect(() => url.parse("https://blog.example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should handle URLs with paths correctly", () => {
|
||||
expect(() => url.parse("http://example.com/path")).not.toThrow();
|
||||
expect(() => url.parse("https://example.com/another/path")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should handle URLs with subdomains that are blocked", () => {
|
||||
expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
});
|
||||
|
||||
it("should handle URLs with paths that are blocked", () => {
|
||||
expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
});
|
||||
|
||||
it("should reject malformed URLs starting with 'http://http'", () => {
|
||||
expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. Invalid protocol.");
|
||||
});
|
||||
|
||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||
expect(() => url.parse("http://example.com/http://example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||
expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
|
||||
});
|
||||
})
|
58
apps/api/src/controllers/v1/crawl-cancel.ts
Normal file
58
apps/api/src/controllers/v1/crawl-cancel.ts
Normal file
|
@ -0,0 +1,58 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { supabase_service } from "../../services/supabase";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.CrawlStatus
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
}
|
||||
|
||||
// check if the job belongs to the team
|
||||
if (useDbAuthentication) {
|
||||
const { data, error: supaError } = await supabase_service
|
||||
.from("bulljobs_teams")
|
||||
.select("*")
|
||||
.eq("job_id", req.params.jobId)
|
||||
.eq("team_id", team_id);
|
||||
if (supaError) {
|
||||
return res.status(500).json({ error: supaError.message });
|
||||
}
|
||||
|
||||
if (data.length === 0) {
|
||||
return res.status(403).json({ error: "Unauthorized" });
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
sc.cancelled = true;
|
||||
await saveCrawl(req.params.jobId, sc);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
}
|
||||
|
||||
res.json({
|
||||
status: "cancelled"
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
161
apps/api/src/controllers/v1/crawl-status-ws.ts
Normal file
161
apps/api/src/controllers/v1/crawl-status-ws.ts
Normal file
|
@ -0,0 +1,161 @@
|
|||
import { authMiddleware } from "../../routes/v1";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { WebSocket } from "ws";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { getJob, getJobs } from "./crawl-status";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
type ErrorMessage = {
|
||||
type: "error",
|
||||
error: string,
|
||||
}
|
||||
|
||||
type CatchupMessage = {
|
||||
type: "catchup",
|
||||
data: CrawlStatusResponse,
|
||||
}
|
||||
|
||||
type DocumentMessage = {
|
||||
type: "document",
|
||||
data: Document,
|
||||
}
|
||||
|
||||
type DoneMessage = { type: "done" }
|
||||
|
||||
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
|
||||
|
||||
function send(ws: WebSocket, msg: Message) {
|
||||
if (ws.readyState === 1) {
|
||||
return new Promise((resolve, reject) => {
|
||||
ws.send(JSON.stringify(msg), (err) => {
|
||||
if (err) reject(err);
|
||||
else resolve(null);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function close(ws: WebSocket, code: number, msg: Message) {
|
||||
if (ws.readyState <= 1) {
|
||||
ws.close(code, JSON.stringify(msg));
|
||||
}
|
||||
}
|
||||
|
||||
async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return close(ws, 1008, { type: "error", error: "Job not found" });
|
||||
}
|
||||
|
||||
if (sc.team_id !== req.auth.team_id) {
|
||||
return close(ws, 3003, { type: "error", error: "Forbidden" });
|
||||
}
|
||||
|
||||
let doneJobIDs = [];
|
||||
let finished = false;
|
||||
|
||||
const loop = async () => {
|
||||
if (finished) return;
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
|
||||
if (jobIDs.length === doneJobIDs.length) {
|
||||
return close(ws, 1000, { type: "done" });
|
||||
}
|
||||
|
||||
const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
|
||||
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
|
||||
const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
|
||||
|
||||
for (const jobID of newlyDoneJobIDs) {
|
||||
const job = await getJob(jobID);
|
||||
|
||||
if (job.returnvalue) {
|
||||
send(ws, {
|
||||
type: "document",
|
||||
data: legacyDocumentConverter(job.returnvalue),
|
||||
})
|
||||
} else {
|
||||
return close(ws, 3000, { type: "error", error: job.failedReason });
|
||||
}
|
||||
}
|
||||
|
||||
doneJobIDs.push(...newlyDoneJobIDs);
|
||||
|
||||
setTimeout(loop, 1000);
|
||||
};
|
||||
|
||||
setTimeout(loop, 1000);
|
||||
|
||||
doneJobIDs = await getDoneJobsOrdered(req.params.jobId);
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
|
||||
const doneJobs = await getJobs(doneJobIDs);
|
||||
const data = doneJobs.map(x => x.returnvalue);
|
||||
|
||||
send(ws, {
|
||||
type: "catchup",
|
||||
data: {
|
||||
status,
|
||||
total: jobIDs.length,
|
||||
completed: doneJobIDs.length,
|
||||
creditsUsed: jobIDs.length,
|
||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||
data: data.map(x => legacyDocumentConverter(x)),
|
||||
}
|
||||
});
|
||||
|
||||
if (status !== "scraping") {
|
||||
finished = true;
|
||||
return close(ws, 1000, { type: "done" });
|
||||
}
|
||||
}
|
||||
|
||||
// Basically just middleware and error wrapping
|
||||
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
||||
try {
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
null,
|
||||
RateLimiterMode.CrawlStatus,
|
||||
);
|
||||
|
||||
if (!success) {
|
||||
return close(ws, 3000, {
|
||||
type: "error",
|
||||
error,
|
||||
});
|
||||
}
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
|
||||
await crawlStatusWS(ws, req);
|
||||
} catch (err) {
|
||||
Sentry.captureException(err);
|
||||
|
||||
const id = uuidv4();
|
||||
let verbose = JSON.stringify(err);
|
||||
if (verbose === "{}") {
|
||||
if (err instanceof Error) {
|
||||
verbose = JSON.stringify({
|
||||
message: err.message,
|
||||
name: err.name,
|
||||
stack: err.stack,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
return close(ws, 1011, {
|
||||
type: "error",
|
||||
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
|
||||
});
|
||||
}
|
||||
}
|
126
apps/api/src/controllers/v1/crawl-status.ts
Normal file
126
apps/api/src/controllers/v1/crawl-status.ts
Normal file
|
@ -0,0 +1,126 @@
|
|||
import { Response } from "express";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
|
||||
|
||||
export async function getJob(id: string) {
|
||||
const job = await getScrapeQueue().getJob(id);
|
||||
if (!job) return job;
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobById(id);
|
||||
|
||||
if (supabaseData) {
|
||||
job.returnvalue = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
export async function getJobs(ids: string[]) {
|
||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobsById(ids);
|
||||
|
||||
supabaseData.forEach(x => {
|
||||
const job = jobs.find(y => y.id === x.job_id);
|
||||
if (job) {
|
||||
job.returnvalue = x.docs;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
jobs.forEach(job => {
|
||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
||||
});
|
||||
|
||||
return jobs;
|
||||
}
|
||||
|
||||
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ success: false, error: "Job not found" });
|
||||
}
|
||||
|
||||
if (sc.team_id !== req.auth.team_id) {
|
||||
return res.status(403).json({ success: false, error: "Forbidden" });
|
||||
}
|
||||
|
||||
const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
|
||||
const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined;
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
|
||||
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
|
||||
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
|
||||
|
||||
let doneJobs = [];
|
||||
|
||||
if (end === undefined) { // determine 10 megabyte limit
|
||||
let bytes = 0;
|
||||
const bytesLimit = 10485760; // 10 MiB in bytes
|
||||
const factor = 100; // chunking for faster retrieval
|
||||
|
||||
for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) {
|
||||
// get current chunk and retrieve jobs
|
||||
const currentIDs = doneJobsOrder.slice(i, i+factor);
|
||||
const jobs = await getJobs(currentIDs);
|
||||
|
||||
// iterate through jobs and add them one them one to the byte counter
|
||||
// both loops will break once we cross the byte counter
|
||||
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
|
||||
const job = jobs[ii];
|
||||
doneJobs.push(job);
|
||||
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
|
||||
}
|
||||
}
|
||||
|
||||
// if we ran over the bytes limit, remove the last document
|
||||
if (bytes > bytesLimit) {
|
||||
doneJobs.splice(doneJobs.length - 1, 1);
|
||||
}
|
||||
} else {
|
||||
doneJobs = await getJobs(doneJobsOrder);
|
||||
}
|
||||
|
||||
const data = doneJobs.map(x => x.returnvalue);
|
||||
|
||||
const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
|
||||
|
||||
nextURL.searchParams.set("skip", (start + data.length).toString());
|
||||
|
||||
if (typeof req.query.limit === "string") {
|
||||
nextURL.searchParams.set("limit", req.query.limit);
|
||||
}
|
||||
|
||||
if (data.length > 0) {
|
||||
if (!doneJobs[0].data.pageOptions.includeRawHtml) {
|
||||
for (let ii = 0; ii < doneJobs.length; ii++) {
|
||||
if (data[ii]) {
|
||||
delete data[ii].rawHtml;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
res.status(200).json({
|
||||
status,
|
||||
completed: doneJobsLength,
|
||||
total: jobIDs.length,
|
||||
creditsUsed: jobIDs.length,
|
||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||
next:
|
||||
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
|
||||
? undefined
|
||||
: nextURL.href,
|
||||
data: data.map(x => legacyDocumentConverter(x)),
|
||||
});
|
||||
}
|
||||
|
165
apps/api/src/controllers/v1/crawl.ts
Normal file
165
apps/api/src/controllers/v1/crawl.ts
Normal file
|
@ -0,0 +1,165 @@
|
|||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import {
|
||||
CrawlRequest,
|
||||
crawlRequestSchema,
|
||||
CrawlResponse,
|
||||
legacyCrawlerOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import {
|
||||
addCrawlJob,
|
||||
addCrawlJobs,
|
||||
crawlToCrawler,
|
||||
lockURL,
|
||||
lockURLs,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
} from "../../lib/crawl-redis";
|
||||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob } from "../../services/queue-jobs";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
import { callWebhook } from "../../services/webhook";
|
||||
|
||||
export async function crawlController(
|
||||
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
||||
res: Response<CrawlResponse>
|
||||
) {
|
||||
req.body = crawlRequestSchema.parse(req.body);
|
||||
|
||||
const id = uuidv4();
|
||||
|
||||
await logCrawl(id, req.auth.team_id);
|
||||
|
||||
const { remainingCredits } = req.account;
|
||||
|
||||
const crawlerOptions = legacyCrawlerOptions(req.body);
|
||||
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
|
||||
|
||||
// TODO: @rafa, is this right? copied from v0
|
||||
if (Array.isArray(crawlerOptions.includes)) {
|
||||
for (const x of crawlerOptions.includes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ success: false, error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(crawlerOptions.excludes)) {
|
||||
for (const x of crawlerOptions.excludes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ success: false, error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
};
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
try {
|
||||
sc.robots = await crawler.getRobotsTxt();
|
||||
} catch (e) {
|
||||
Logger.debug(
|
||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||
e
|
||||
)}`
|
||||
);
|
||||
}
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||
? null
|
||||
: await crawler.tryGetSitemap();
|
||||
|
||||
if (sitemap !== null && sitemap.length > 0) {
|
||||
let jobPriority = 20;
|
||||
// If it is over 1000, we need to get the job priority,
|
||||
// otherwise we can use the default priority of 20
|
||||
if(sitemap.length > 1000){
|
||||
// set base to 21
|
||||
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
|
||||
}
|
||||
const jobs = sitemap.map((x) => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
team_id: req.auth.team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
webhook: req.body.webhook,
|
||||
v1: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 20,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
await lockURLs(
|
||||
id,
|
||||
jobs.map((x) => x.data.url)
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId)
|
||||
);
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
} else {
|
||||
await lockURL(id, sc, req.body.url);
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: req.auth.team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
webhook: req.body.webhook,
|
||||
v1: true,
|
||||
},
|
||||
{
|
||||
priority: 15,
|
||||
}
|
||||
);
|
||||
await addCrawlJob(id, job.id);
|
||||
}
|
||||
|
||||
if(req.body.webhook) {
|
||||
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started");
|
||||
}
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
id,
|
||||
url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
|
||||
});
|
||||
}
|
||||
|
||||
|
6
apps/api/src/controllers/v1/liveness.ts
Normal file
6
apps/api/src/controllers/v1/liveness.ts
Normal file
|
@ -0,0 +1,6 @@
|
|||
import { Request, Response } from "express";
|
||||
|
||||
export async function livenessController(req: Request, res: Response) {
|
||||
//TODO: add checks if the application is live and healthy like checking the redis connection
|
||||
res.status(200).json({ status: "ok" });
|
||||
}
|
132
apps/api/src/controllers/v1/map.ts
Normal file
132
apps/api/src/controllers/v1/map.ts
Normal file
|
@ -0,0 +1,132 @@
|
|||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import {
|
||||
legacyCrawlerOptions,
|
||||
mapRequestSchema,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
||||
import { MapResponse, MapRequest } from "./types";
|
||||
import { configDotenv } from "dotenv";
|
||||
import {
|
||||
checkAndUpdateURLForMap,
|
||||
isSameDomain,
|
||||
isSameSubdomain,
|
||||
removeDuplicateUrls,
|
||||
} from "../../lib/validateUrl";
|
||||
import { fireEngineMap } from "../../search/fireEngine";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { performCosineSimilarity } from "../../lib/map-cosine";
|
||||
|
||||
configDotenv();
|
||||
|
||||
export async function mapController(
|
||||
req: RequestWithAuth<{}, MapResponse, MapRequest>,
|
||||
res: Response<MapResponse>
|
||||
) {
|
||||
const startTime = new Date().getTime();
|
||||
|
||||
req.body = mapRequestSchema.parse(req.body);
|
||||
|
||||
|
||||
const limit : number = req.body.limit ?? 5000;
|
||||
|
||||
const id = uuidv4();
|
||||
let links: string[] = [req.body.url];
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions: legacyCrawlerOptions(req.body),
|
||||
pageOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
};
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();
|
||||
|
||||
if (sitemap !== null) {
|
||||
sitemap.map((x) => {
|
||||
links.push(x.url);
|
||||
});
|
||||
}
|
||||
|
||||
let urlWithoutWww = req.body.url.replace("www.", "");
|
||||
|
||||
let mapUrl = req.body.search
|
||||
? `"${req.body.search}" site:${urlWithoutWww}`
|
||||
: `site:${req.body.url}`;
|
||||
// www. seems to exclude subdomains in some cases
|
||||
const mapResults = await fireEngineMap(mapUrl, {
|
||||
// limit to 50 results (beta)
|
||||
numResults: Math.min(limit, 50),
|
||||
});
|
||||
|
||||
if (mapResults.length > 0) {
|
||||
if (req.body.search) {
|
||||
// Ensure all map results are first, maintaining their order
|
||||
links = [
|
||||
mapResults[0].url,
|
||||
...mapResults.slice(1).map((x) => x.url),
|
||||
...links,
|
||||
];
|
||||
} else {
|
||||
mapResults.map((x) => {
|
||||
links.push(x.url);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Perform cosine similarity between the search query and the list of links
|
||||
if (req.body.search) {
|
||||
const searchQuery = req.body.search.toLowerCase();
|
||||
|
||||
links = performCosineSimilarity(links, searchQuery);
|
||||
}
|
||||
|
||||
links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
|
||||
|
||||
// allows for subdomains to be included
|
||||
links = links.filter((x) => isSameDomain(x, req.body.url));
|
||||
|
||||
// if includeSubdomains is false, filter out subdomains
|
||||
if (!req.body.includeSubdomains) {
|
||||
links = links.filter((x) => isSameSubdomain(x, req.body.url));
|
||||
}
|
||||
|
||||
// remove duplicates that could be due to http/https or www
|
||||
links = removeDuplicateUrls(links);
|
||||
|
||||
await billTeam(req.auth.team_id, 1);
|
||||
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
|
||||
const linksToReturn = links.slice(0, limit);
|
||||
|
||||
logJob({
|
||||
job_id: id,
|
||||
success: links.length > 0,
|
||||
message: "Map completed",
|
||||
num_docs: linksToReturn.length,
|
||||
docs: linksToReturn,
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: req.auth.team_id,
|
||||
mode: "map",
|
||||
url: req.body.url,
|
||||
crawlerOptions: {},
|
||||
pageOptions: {},
|
||||
origin: req.body.origin,
|
||||
extractor_options: { mode: "markdown" },
|
||||
num_tokens: 0,
|
||||
});
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
links: linksToReturn,
|
||||
scrape_id: req.body.origin?.includes("website") ? id : undefined,
|
||||
});
|
||||
}
|
6
apps/api/src/controllers/v1/readiness.ts
Normal file
6
apps/api/src/controllers/v1/readiness.ts
Normal file
|
@ -0,0 +1,6 @@
|
|||
import { Request, Response } from "express";
|
||||
|
||||
export async function readinessController(req: Request, res: Response) {
|
||||
// TODO: add checks when the application is ready to serve traffic
|
||||
res.status(200).json({ status: "ok" });
|
||||
}
|
38
apps/api/src/controllers/v1/scrape-status.ts
Normal file
38
apps/api/src/controllers/v1/scrape-status.ts
Normal file
|
@ -0,0 +1,38 @@
|
|||
import { Response } from "express";
|
||||
import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs";
|
||||
import { scrapeStatusRateLimiter } from "../../services/rate-limiter";
|
||||
|
||||
export async function scrapeStatusController(req: any, res: any) {
|
||||
try {
|
||||
const rateLimiter = scrapeStatusRateLimiter;
|
||||
const incomingIP = (req.headers["x-forwarded-for"] ||
|
||||
req.socket.remoteAddress) as string;
|
||||
const iptoken = incomingIP;
|
||||
await rateLimiter.consume(iptoken);
|
||||
|
||||
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
|
||||
|
||||
if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
|
||||
return res.status(403).json({
|
||||
success: false,
|
||||
error: "You are not allowed to access this resource.",
|
||||
});
|
||||
}
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
data: job?.docs[0],
|
||||
});
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.message == "Too Many Requests") {
|
||||
return res.status(429).json({
|
||||
success: false,
|
||||
error: "Rate limit exceeded. Please try again later.",
|
||||
});
|
||||
} else {
|
||||
return res.status(500).json({
|
||||
success: false,
|
||||
error: "An unexpected error occurred.",
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
152
apps/api/src/controllers/v1/scrape.ts
Normal file
152
apps/api/src/controllers/v1/scrape.ts
Normal file
|
@ -0,0 +1,152 @@
|
|||
import { Request, Response } from "express";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import {
|
||||
Document,
|
||||
legacyDocumentConverter,
|
||||
legacyExtractorOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
ScrapeRequest,
|
||||
scrapeRequestSchema,
|
||||
ScrapeResponse,
|
||||
} from "./types";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
import { PlanType } from "../../types";
|
||||
|
||||
export async function scrapeController(
|
||||
req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
|
||||
res: Response<ScrapeResponse>
|
||||
) {
|
||||
req.body = scrapeRequestSchema.parse(req.body);
|
||||
let earlyReturn = false;
|
||||
|
||||
const origin = req.body.origin;
|
||||
const timeout = req.body.timeout;
|
||||
const pageOptions = legacyScrapeOptions(req.body);
|
||||
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const jobPriority = await getJobPriority({
|
||||
plan: req.auth.plan as PlanType,
|
||||
team_id: req.auth.team_id,
|
||||
basePriority: 10,
|
||||
});
|
||||
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
origin: req.body.origin,
|
||||
is_scrape: true,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
jobPriority
|
||||
);
|
||||
|
||||
let doc: any | undefined;
|
||||
try {
|
||||
doc = (await waitForJob(job.id, timeout))[0];
|
||||
} catch (e) {
|
||||
Logger.error(`Error in scrapeController: ${e}`);
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
return res.status(408).json({
|
||||
success: false,
|
||||
error: "Request timed out",
|
||||
});
|
||||
} else {
|
||||
return res.status(500).json({
|
||||
success: false,
|
||||
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
|
||||
extractorOptions && extractorOptions.mode !== "markdown"
|
||||
? " - Could be due to LLM parsing issues"
|
||||
: ""
|
||||
}`,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
await job.remove();
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
warning: "No page found",
|
||||
data: doc,
|
||||
});
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens =
|
||||
doc && doc.markdown
|
||||
? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
|
||||
: 0;
|
||||
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
if(req.body.extract && req.body.formats.includes("extract")) {
|
||||
creditsToBeBilled = 50;
|
||||
}
|
||||
|
||||
const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
});
|
||||
}
|
||||
|
||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
||||
if (doc && doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if(pageOptions && pageOptions.includeExtract) {
|
||||
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
|
||||
delete doc.markdown;
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: true,
|
||||
message: "Scrape completed",
|
||||
num_docs: 1,
|
||||
docs: [doc],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: req.auth.team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: {},
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: { mode: "markdown" },
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
data: legacyDocumentConverter(doc),
|
||||
scrape_id: origin?.includes("website") ? jobId : undefined,
|
||||
});
|
||||
}
|
371
apps/api/src/controllers/v1/types.ts
Normal file
371
apps/api/src/controllers/v1/types.ts
Normal file
|
@ -0,0 +1,371 @@
|
|||
import { Request, Response } from "express";
|
||||
import { z } from "zod";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { ExtractorOptions, PageOptions } from "../../lib/entities";
|
||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||
import { PlanType } from "../../types";
|
||||
|
||||
export type Format =
|
||||
| "markdown"
|
||||
| "html"
|
||||
| "rawHtml"
|
||||
| "links"
|
||||
| "screenshot"
|
||||
| "screenshot@fullPage"
|
||||
| "extract";
|
||||
|
||||
export const url = z.preprocess(
|
||||
(x) => {
|
||||
if (!protocolIncluded(x as string)) {
|
||||
return `http://${x}`;
|
||||
}
|
||||
return x;
|
||||
},
|
||||
z
|
||||
.string()
|
||||
.url()
|
||||
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
||||
.refine(
|
||||
(x) => /\.[a-z]{2,}(\/|$)/i.test(x),
|
||||
"URL must have a valid top-level domain or be a valid path"
|
||||
)
|
||||
.refine(
|
||||
(x) => checkUrl(x as string),
|
||||
"Invalid URL"
|
||||
)
|
||||
.refine(
|
||||
(x) => !isUrlBlocked(x as string),
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||
)
|
||||
);
|
||||
|
||||
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
|
||||
|
||||
export const extractOptions = z.object({
|
||||
mode: z.enum(["llm"]).default("llm"),
|
||||
schema: z.any().optional(),
|
||||
systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."),
|
||||
prompt: z.string().optional()
|
||||
}).strict(strictMessage);
|
||||
|
||||
export type ExtractOptions = z.infer<typeof extractOptions>;
|
||||
|
||||
export const scrapeOptions = z.object({
|
||||
formats: z
|
||||
.enum([
|
||||
"markdown",
|
||||
"html",
|
||||
"rawHtml",
|
||||
"links",
|
||||
"screenshot",
|
||||
"screenshot@fullPage",
|
||||
"extract"
|
||||
])
|
||||
.array()
|
||||
.optional()
|
||||
.default(["markdown"]),
|
||||
headers: z.record(z.string(), z.string()).optional(),
|
||||
includeTags: z.string().array().optional(),
|
||||
excludeTags: z.string().array().optional(),
|
||||
onlyMainContent: z.boolean().default(true),
|
||||
timeout: z.number().int().positive().finite().safe().default(30000),
|
||||
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
||||
extract: extractOptions.optional(),
|
||||
parsePDF: z.boolean().default(true),
|
||||
}).strict(strictMessage)
|
||||
|
||||
|
||||
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
||||
|
||||
export const scrapeRequestSchema = scrapeOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
}).strict(strictMessage).refine(
|
||||
(obj) => {
|
||||
const hasExtractFormat = obj.formats?.includes("extract");
|
||||
const hasExtractOptions = obj.extract !== undefined;
|
||||
return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
|
||||
},
|
||||
{
|
||||
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
|
||||
}
|
||||
).transform((obj) => {
|
||||
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
|
||||
return { ...obj, timeout: 60000 };
|
||||
}
|
||||
return obj;
|
||||
});
|
||||
|
||||
// export type ScrapeRequest = {
|
||||
// url: string;
|
||||
// formats?: Format[];
|
||||
// headers?: { [K: string]: string };
|
||||
// includeTags?: string[];
|
||||
// excludeTags?: string[];
|
||||
// onlyMainContent?: boolean;
|
||||
// timeout?: number;
|
||||
// waitFor?: number;
|
||||
// }
|
||||
|
||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||
|
||||
const crawlerOptions = z.object({
|
||||
includePaths: z.string().array().default([]),
|
||||
excludePaths: z.string().array().default([]),
|
||||
maxDepth: z.number().default(10), // default?
|
||||
limit: z.number().default(10000), // default?
|
||||
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
|
||||
allowExternalLinks: z.boolean().default(false),
|
||||
ignoreSitemap: z.boolean().default(true),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type CrawlerOptions = {
|
||||
// includePaths?: string[];
|
||||
// excludePaths?: string[];
|
||||
// maxDepth?: number;
|
||||
// limit?: number;
|
||||
// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
|
||||
// allowExternalLinks?: boolean;
|
||||
// ignoreSitemap?: boolean;
|
||||
// };
|
||||
|
||||
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
|
||||
|
||||
export const crawlRequestSchema = crawlerOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
|
||||
webhook: z.string().url().optional(),
|
||||
limit: z.number().default(10000),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type CrawlRequest = {
|
||||
// url: string;
|
||||
// crawlerOptions?: CrawlerOptions;
|
||||
// scrapeOptions?: Exclude<ScrapeRequest, "url">;
|
||||
// };
|
||||
|
||||
// export type ExtractorOptions = {
|
||||
// mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
|
||||
// extractionPrompt?: string;
|
||||
// extractionSchema?: Record<string, any>;
|
||||
// }
|
||||
|
||||
|
||||
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
|
||||
|
||||
export const mapRequestSchema = crawlerOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
includeSubdomains: z.boolean().default(true),
|
||||
search: z.string().optional(),
|
||||
ignoreSitemap: z.boolean().default(false),
|
||||
limit: z.number().min(1).max(5000).default(5000).optional(),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type MapRequest = {
|
||||
// url: string;
|
||||
// crawlerOptions?: CrawlerOptions;
|
||||
// };
|
||||
|
||||
export type MapRequest = z.infer<typeof mapRequestSchema>;
|
||||
|
||||
export type Document = {
|
||||
markdown?: string;
|
||||
extract?: string;
|
||||
html?: string;
|
||||
rawHtml?: string;
|
||||
links?: string[];
|
||||
screenshot?: string;
|
||||
metadata: {
|
||||
title?: string;
|
||||
description?: string;
|
||||
language?: string;
|
||||
keywords?: string;
|
||||
robots?: string;
|
||||
ogTitle?: string;
|
||||
ogDescription?: string;
|
||||
ogUrl?: string;
|
||||
ogImage?: string;
|
||||
ogAudio?: string;
|
||||
ogDeterminer?: string;
|
||||
ogLocale?: string;
|
||||
ogLocaleAlternate?: string[];
|
||||
ogSiteName?: string;
|
||||
ogVideo?: string;
|
||||
dcTermsCreated?: string;
|
||||
dcDateCreated?: string;
|
||||
dcDate?: string;
|
||||
dcTermsType?: string;
|
||||
dcType?: string;
|
||||
dcTermsAudience?: string;
|
||||
dcTermsSubject?: string;
|
||||
dcSubject?: string;
|
||||
dcDescription?: string;
|
||||
dcTermsKeywords?: string;
|
||||
modifiedTime?: string;
|
||||
publishedTime?: string;
|
||||
articleTag?: string;
|
||||
articleSection?: string;
|
||||
sourceURL?: string;
|
||||
statusCode?: number;
|
||||
error?: string;
|
||||
};
|
||||
};
|
||||
|
||||
export type ErrorResponse = {
|
||||
success: false;
|
||||
error: string;
|
||||
details?: any;
|
||||
};
|
||||
|
||||
export type ScrapeResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
warning?: string;
|
||||
data: Document;
|
||||
scrape_id?: string;
|
||||
};
|
||||
|
||||
export interface ScrapeResponseRequestTest {
|
||||
statusCode: number;
|
||||
body: ScrapeResponse;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
export type CrawlResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
id: string;
|
||||
url: string;
|
||||
};
|
||||
|
||||
export type MapResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
links: string[];
|
||||
scrape_id?: string;
|
||||
};
|
||||
|
||||
export type CrawlStatusParams = {
|
||||
jobId: string;
|
||||
};
|
||||
|
||||
export type CrawlStatusResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
status: "scraping" | "completed" | "failed" | "cancelled";
|
||||
completed: number;
|
||||
total: number;
|
||||
creditsUsed: number;
|
||||
expiresAt: string;
|
||||
next?: string;
|
||||
data: Document[];
|
||||
};
|
||||
|
||||
type AuthObject = {
|
||||
team_id: string;
|
||||
plan: PlanType;
|
||||
};
|
||||
|
||||
type Account = {
|
||||
remainingCredits: number;
|
||||
};
|
||||
|
||||
export interface RequestWithMaybeAuth<
|
||||
ReqParams = {},
|
||||
ReqBody = undefined,
|
||||
ResBody = undefined
|
||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||
auth?: AuthObject;
|
||||
account?: Account;
|
||||
}
|
||||
|
||||
export interface RequestWithAuth<
|
||||
ReqParams = {},
|
||||
ReqBody = undefined,
|
||||
ResBody = undefined,
|
||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||
auth: AuthObject;
|
||||
account?: Account;
|
||||
}
|
||||
|
||||
export interface ResponseWithSentry<
|
||||
ResBody = undefined,
|
||||
> extends Response<ResBody> {
|
||||
sentry?: string,
|
||||
}
|
||||
|
||||
export function legacyCrawlerOptions(x: CrawlerOptions) {
|
||||
return {
|
||||
includes: x.includePaths,
|
||||
excludes: x.excludePaths,
|
||||
maxCrawledLinks: x.limit,
|
||||
maxDepth: x.maxDepth,
|
||||
limit: x.limit,
|
||||
generateImgAltText: false,
|
||||
allowBackwardCrawling: x.allowBackwardLinks,
|
||||
allowExternalContentLinks: x.allowExternalLinks,
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
||||
return {
|
||||
includeMarkdown: x.formats.includes("markdown"),
|
||||
includeHtml: x.formats.includes("html"),
|
||||
includeRawHtml: x.formats.includes("rawHtml"),
|
||||
includeExtract: x.formats.includes("extract"),
|
||||
onlyIncludeTags: x.includeTags,
|
||||
removeTags: x.excludeTags,
|
||||
onlyMainContent: x.onlyMainContent,
|
||||
waitFor: x.waitFor,
|
||||
includeLinks: x.formats.includes("links"),
|
||||
screenshot: x.formats.includes("screenshot"),
|
||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||
parsePDF: x.parsePDF,
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
|
||||
return {
|
||||
mode: x.mode ? "llm-extraction" : "markdown",
|
||||
extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
|
||||
extractionSchema: x.schema,
|
||||
userPrompt: x.prompt ?? "",
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyDocumentConverter(doc: any): Document {
|
||||
if (doc === null || doc === undefined) return doc;
|
||||
|
||||
if (doc.metadata) {
|
||||
if (doc.metadata.screenshot) {
|
||||
doc.screenshot = doc.metadata.screenshot;
|
||||
delete doc.metadata.screenshot;
|
||||
}
|
||||
|
||||
if (doc.metadata.fullPageScreenshot) {
|
||||
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
|
||||
delete doc.metadata.fullPageScreenshot;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
markdown: doc.markdown,
|
||||
links: doc.linksOnPage,
|
||||
rawHtml: doc.rawHtml,
|
||||
html: doc.html,
|
||||
extract: doc.llm_extraction,
|
||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
||||
metadata: {
|
||||
...doc.metadata,
|
||||
pageError: undefined,
|
||||
pageStatusCode: undefined,
|
||||
error: doc.metadata.pageError,
|
||||
statusCode: doc.metadata.pageStatusCode,
|
||||
},
|
||||
};
|
||||
}
|
|
@ -1,8 +1,10 @@
|
|||
import express from "express";
|
||||
import "dotenv/config";
|
||||
import "./services/sentry"
|
||||
import * as Sentry from "@sentry/node";
|
||||
import express, { NextFunction, Request, Response } from "express";
|
||||
import bodyParser from "body-parser";
|
||||
import cors from "cors";
|
||||
import "dotenv/config";
|
||||
import { getWebScraperQueue } from "./services/queue-service";
|
||||
import { getScrapeQueue } from "./services/queue-service";
|
||||
import { v0Router } from "./routes/v0";
|
||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||
import cluster from "cluster";
|
||||
|
@ -13,6 +15,12 @@ import { ScrapeEvents } from "./lib/scrape-events";
|
|||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import CacheableLookup from 'cacheable-lookup';
|
||||
import { v1Router } from "./routes/v1";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
|
||||
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||
import { ZodError } from "zod";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
|
@ -45,7 +53,8 @@ if (cluster.isMaster) {
|
|||
}
|
||||
});
|
||||
} else {
|
||||
const app = express();
|
||||
const ws = expressWs(express());
|
||||
const app = ws.app;
|
||||
|
||||
global.isProduction = process.env.IS_PRODUCTION === "true";
|
||||
|
||||
|
@ -58,7 +67,7 @@ if (cluster.isMaster) {
|
|||
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
|
||||
|
||||
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
||||
queues: [new BullAdapter(getWebScraperQueue())],
|
||||
queues: [new BullAdapter(getScrapeQueue())],
|
||||
serverAdapter: serverAdapter,
|
||||
});
|
||||
|
||||
|
@ -78,6 +87,7 @@ if (cluster.isMaster) {
|
|||
|
||||
// register router
|
||||
app.use(v0Router);
|
||||
app.use("/v1", v1Router);
|
||||
app.use(adminRouter);
|
||||
|
||||
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
|
@ -104,9 +114,9 @@ if (cluster.isMaster) {
|
|||
|
||||
app.get(`/serverHealthCheck`, async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const [waitingJobs] = await Promise.all([
|
||||
webScraperQueue.getWaitingCount(),
|
||||
scrapeQueue.getWaitingCount(),
|
||||
]);
|
||||
|
||||
const noWaitingJobs = waitingJobs === 0;
|
||||
|
@ -115,6 +125,7 @@ if (cluster.isMaster) {
|
|||
waitingJobs,
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
|
@ -126,9 +137,9 @@ if (cluster.isMaster) {
|
|||
const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
|
||||
|
||||
const getWaitingJobsCount = async () => {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const [waitingJobsCount] = await Promise.all([
|
||||
webScraperQueue.getWaitingCount(),
|
||||
scrapeQueue.getWaitingCount(),
|
||||
]);
|
||||
|
||||
return waitingJobsCount;
|
||||
|
@ -166,6 +177,7 @@ if (cluster.isMaster) {
|
|||
}, timeout);
|
||||
}
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.debug(error);
|
||||
}
|
||||
};
|
||||
|
@ -178,14 +190,46 @@ if (cluster.isMaster) {
|
|||
res.send({ isProduction: global.isProduction });
|
||||
});
|
||||
|
||||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
|
||||
if (err instanceof ZodError) {
|
||||
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
|
||||
} else {
|
||||
next(err);
|
||||
}
|
||||
});
|
||||
|
||||
Sentry.setupExpressErrorHandler(app);
|
||||
|
||||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry<ErrorResponse>, next: NextFunction) => {
|
||||
const id = res.sentry ?? uuidv4();
|
||||
let verbose = JSON.stringify(err);
|
||||
if (verbose === "{}") {
|
||||
if (err instanceof Error) {
|
||||
verbose = JSON.stringify({
|
||||
message: err.message,
|
||||
name: err.name,
|
||||
stack: err.stack,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
|
||||
});
|
||||
|
||||
Logger.info(`Worker ${process.pid} started`);
|
||||
}
|
||||
|
||||
const wsq = getWebScraperQueue();
|
||||
|
||||
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
|
||||
// const sq = getScrapeQueue();
|
||||
|
||||
// sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
// sq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
// sq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -15,7 +15,8 @@ export async function generateCompletions(
|
|||
// const schema = zodToJsonSchema(options.schema)
|
||||
|
||||
const schema = extractionOptions.extractionSchema;
|
||||
const prompt = extractionOptions.extractionPrompt;
|
||||
const systemPrompt = extractionOptions.extractionPrompt;
|
||||
const prompt = extractionOptions.userPrompt;
|
||||
|
||||
const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider
|
||||
|
||||
|
@ -24,30 +25,35 @@ export async function generateCompletions(
|
|||
switch (switchVariable) {
|
||||
case "openAI":
|
||||
const llm = new OpenAI();
|
||||
try{
|
||||
const completionResult = await generateOpenAICompletions({
|
||||
client: llm,
|
||||
document: document,
|
||||
schema: schema,
|
||||
prompt: prompt,
|
||||
mode: mode,
|
||||
});
|
||||
// Validate the JSON output against the schema using AJV
|
||||
const validate = ajv.compile(schema);
|
||||
if (!validate(completionResult.llm_extraction)) {
|
||||
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
|
||||
throw new Error(
|
||||
`JSON parsing error(s): ${validate.errors
|
||||
?.map((err) => err.message)
|
||||
.join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
|
||||
);
|
||||
}
|
||||
try {
|
||||
const completionResult = await generateOpenAICompletions({
|
||||
client: llm,
|
||||
document: document,
|
||||
schema: schema,
|
||||
prompt: prompt,
|
||||
systemPrompt: systemPrompt,
|
||||
mode: mode,
|
||||
});
|
||||
// Validate the JSON output against the schema using AJV
|
||||
if (schema) {
|
||||
const validate = ajv.compile(schema);
|
||||
if (!validate(completionResult.llm_extraction)) {
|
||||
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
|
||||
throw new Error(
|
||||
`JSON parsing error(s): ${validate.errors
|
||||
?.map((err) => err.message)
|
||||
.join(
|
||||
", "
|
||||
)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return completionResult;
|
||||
} catch (error) {
|
||||
Logger.error(`Error generating completions: ${error}`);
|
||||
throw new Error(`Error generating completions: ${error.message}`);
|
||||
}
|
||||
return completionResult;
|
||||
} catch (error) {
|
||||
Logger.error(`Error generating completions: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
default:
|
||||
throw new Error("Invalid client");
|
||||
}
|
||||
|
|
|
@ -15,8 +15,7 @@ const defaultPrompt =
|
|||
function prepareOpenAIDoc(
|
||||
document: Document,
|
||||
mode: "markdown" | "raw-html"
|
||||
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
|
||||
|
||||
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
|
||||
let markdown = document.markdown;
|
||||
|
||||
let extractionTarget = document.markdown;
|
||||
|
@ -27,78 +26,120 @@ function prepareOpenAIDoc(
|
|||
|
||||
// Check if the markdown content exists in the document
|
||||
if (!extractionTarget) {
|
||||
throw new Error(
|
||||
`${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
|
||||
);
|
||||
return null;
|
||||
// throw new Error(
|
||||
// `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
|
||||
// );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// count number of tokens
|
||||
const numTokens = numTokensFromString(extractionTarget, "gpt-4");
|
||||
|
||||
if (numTokens > maxTokens) {
|
||||
// trim the document to the maximum number of tokens, tokens != characters
|
||||
extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
|
||||
extractionTarget = extractionTarget.slice(0, maxTokens * modifier);
|
||||
}
|
||||
|
||||
return [[{ type: "text", text: extractionTarget }], numTokens];
|
||||
}
|
||||
|
||||
export async function generateOpenAICompletions({
|
||||
client,
|
||||
model = process.env.MODEL_NAME || "gpt-4o",
|
||||
model = process.env.MODEL_NAME || "gpt-4o-mini",
|
||||
document,
|
||||
schema, //TODO - add zod dynamic type checking
|
||||
prompt = defaultPrompt,
|
||||
systemPrompt = defaultPrompt,
|
||||
prompt,
|
||||
temperature,
|
||||
mode
|
||||
mode,
|
||||
}: {
|
||||
client: OpenAI;
|
||||
model?: string;
|
||||
document: Document;
|
||||
schema: any; // This should be replaced with a proper Zod schema type when available
|
||||
prompt?: string;
|
||||
systemPrompt?: string;
|
||||
temperature?: number;
|
||||
mode: "markdown" | "raw-html";
|
||||
}): Promise<Document> {
|
||||
const openai = client as OpenAI;
|
||||
const [content, numTokens] = prepareOpenAIDoc(document, mode);
|
||||
const preparedDoc = prepareOpenAIDoc(document, mode);
|
||||
|
||||
const completion = await openai.chat.completions.create({
|
||||
model,
|
||||
messages: [
|
||||
{
|
||||
role: "system",
|
||||
content: prompt,
|
||||
},
|
||||
{ role: "user", content },
|
||||
],
|
||||
tools: [
|
||||
{
|
||||
type: "function",
|
||||
function: {
|
||||
name: "extract_content",
|
||||
description: "Extracts the content from the given webpage(s)",
|
||||
parameters: schema,
|
||||
if (preparedDoc === null) {
|
||||
return {
|
||||
...document,
|
||||
warning:
|
||||
"LLM extraction was not performed since the document's content is empty or missing.",
|
||||
};
|
||||
}
|
||||
const [content, numTokens] = preparedDoc;
|
||||
|
||||
let completion;
|
||||
let llmExtraction;
|
||||
if (prompt && !schema) {
|
||||
const jsonCompletion = await openai.chat.completions.create({
|
||||
model,
|
||||
messages: [
|
||||
{
|
||||
role: "system",
|
||||
content: systemPrompt,
|
||||
},
|
||||
},
|
||||
],
|
||||
tool_choice: { "type": "function", "function": {"name": "extract_content"}},
|
||||
temperature,
|
||||
});
|
||||
{ role: "user", content },
|
||||
{
|
||||
role: "user",
|
||||
content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
|
||||
},
|
||||
],
|
||||
response_format: { type: "json_object" },
|
||||
temperature,
|
||||
});
|
||||
|
||||
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
||||
try {
|
||||
llmExtraction = JSON.parse(
|
||||
jsonCompletion.choices[0].message.content.trim()
|
||||
);
|
||||
} catch (e) {
|
||||
throw new Error("Invalid JSON");
|
||||
}
|
||||
} else {
|
||||
completion = await openai.chat.completions.create({
|
||||
model,
|
||||
messages: [
|
||||
{
|
||||
role: "system",
|
||||
content: systemPrompt,
|
||||
},
|
||||
{ role: "user", content },
|
||||
],
|
||||
tools: [
|
||||
{
|
||||
type: "function",
|
||||
function: {
|
||||
name: "extract_content",
|
||||
description: "Extracts the content from the given webpage(s)",
|
||||
parameters: schema,
|
||||
},
|
||||
},
|
||||
],
|
||||
tool_choice: { type: "function", function: { name: "extract_content" } },
|
||||
temperature,
|
||||
});
|
||||
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
||||
|
||||
// Extract the LLM extraction content from the completion response
|
||||
const llmExtraction = JSON.parse(c);
|
||||
// Extract the LLM extraction content from the completion response
|
||||
try {
|
||||
llmExtraction = JSON.parse(c);
|
||||
} catch (e) {
|
||||
throw new Error("Invalid JSON");
|
||||
}
|
||||
}
|
||||
|
||||
// Return the document with the LLM extraction content added
|
||||
return {
|
||||
...document,
|
||||
llm_extraction: llmExtraction,
|
||||
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
|
||||
warning:
|
||||
numTokens > maxTokens
|
||||
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
|
||||
: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
134
apps/api/src/lib/__tests__/job-priority.test.ts
Normal file
134
apps/api/src/lib/__tests__/job-priority.test.ts
Normal file
|
@ -0,0 +1,134 @@
|
|||
import {
|
||||
getJobPriority,
|
||||
addJobPriority,
|
||||
deleteJobPriority,
|
||||
} from "../job-priority";
|
||||
import { redisConnection } from "../../services/queue-service";
|
||||
import { PlanType } from "../../types";
|
||||
|
||||
jest.mock("../../services/queue-service", () => ({
|
||||
redisConnection: {
|
||||
sadd: jest.fn(),
|
||||
srem: jest.fn(),
|
||||
scard: jest.fn(),
|
||||
expire: jest.fn(),
|
||||
},
|
||||
}));
|
||||
|
||||
describe("Job Priority Tests", () => {
|
||||
afterEach(() => {
|
||||
jest.clearAllMocks();
|
||||
});
|
||||
|
||||
test("addJobPriority should add job_id to the set and set expiration", async () => {
|
||||
const team_id = "team1";
|
||||
const job_id = "job1";
|
||||
await addJobPriority(team_id, job_id);
|
||||
expect(redisConnection.sadd).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
job_id
|
||||
);
|
||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
60
|
||||
);
|
||||
});
|
||||
|
||||
test("deleteJobPriority should remove job_id from the set", async () => {
|
||||
const team_id = "team1";
|
||||
const job_id = "job1";
|
||||
await deleteJobPriority(team_id, job_id);
|
||||
expect(redisConnection.srem).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
job_id
|
||||
);
|
||||
});
|
||||
|
||||
test("getJobPriority should return correct priority based on plan and set length", async () => {
|
||||
const team_id = "team1";
|
||||
const plan: PlanType = "standard";
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(150);
|
||||
|
||||
const priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(10);
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(250);
|
||||
const priorityExceeded = await getJobPriority({ plan, team_id });
|
||||
expect(priorityExceeded).toBe(20); // basePriority + Math.ceil((250 - 200) * 0.4)
|
||||
});
|
||||
|
||||
test("getJobPriority should handle different plans correctly", async () => {
|
||||
const team_id = "team1";
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(50);
|
||||
let plan: PlanType = "hobby";
|
||||
let priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(10);
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(150);
|
||||
plan = "hobby";
|
||||
priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(25); // basePriority + Math.ceil((150 - 50) * 0.3)
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(25);
|
||||
plan = "free";
|
||||
priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(10);
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(60);
|
||||
plan = "free";
|
||||
priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(28); // basePriority + Math.ceil((60 - 25) * 0.5)
|
||||
});
|
||||
|
||||
test("addJobPriority should reset expiration time when adding new job", async () => {
|
||||
const team_id = "team1";
|
||||
const job_id1 = "job1";
|
||||
const job_id2 = "job2";
|
||||
|
||||
await addJobPriority(team_id, job_id1);
|
||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
60
|
||||
);
|
||||
|
||||
// Clear the mock calls
|
||||
(redisConnection.expire as jest.Mock).mockClear();
|
||||
|
||||
// Add another job
|
||||
await addJobPriority(team_id, job_id2);
|
||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
60
|
||||
);
|
||||
});
|
||||
|
||||
test("Set should expire after 60 seconds", async () => {
|
||||
const team_id = "team1";
|
||||
const job_id = "job1";
|
||||
|
||||
jest.useFakeTimers();
|
||||
|
||||
await addJobPriority(team_id, job_id);
|
||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
60
|
||||
);
|
||||
|
||||
// Fast-forward time by 59 seconds
|
||||
jest.advanceTimersByTime(59000);
|
||||
|
||||
// The set should still exist
|
||||
expect(redisConnection.scard).not.toHaveBeenCalled();
|
||||
|
||||
// Fast-forward time by 2 more seconds (total 61 seconds)
|
||||
jest.advanceTimersByTime(2000);
|
||||
|
||||
// Check if the set has been removed (scard should return 0)
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(0);
|
||||
const setSize = await redisConnection.scard(`limit_team_id:${team_id}`);
|
||||
expect(setSize).toBe(0);
|
||||
|
||||
jest.useRealTimers();
|
||||
});
|
||||
});
|
32
apps/api/src/lib/checkCredits.ts
Normal file
32
apps/api/src/lib/checkCredits.ts
Normal file
|
@ -0,0 +1,32 @@
|
|||
import { checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
type checkCreditsResponse = {
|
||||
status: number;
|
||||
error: string | null;
|
||||
}
|
||||
|
||||
export const checkCredits = async (team_id: string): Promise<checkCreditsResponse> => {
|
||||
try {
|
||||
const {
|
||||
success: creditsCheckSuccess,
|
||||
message: creditsCheckMessage
|
||||
} = await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
return {
|
||||
status: 402,
|
||||
error: "Insufficient credits"
|
||||
};
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return {
|
||||
status: 500,
|
||||
error: "Error checking team credits. Please contact hello@firecrawl.com for help."
|
||||
};
|
||||
}
|
||||
return {
|
||||
status: 200,
|
||||
error: null
|
||||
}
|
||||
};
|
124
apps/api/src/lib/crawl-redis.ts
Normal file
124
apps/api/src/lib/crawl-redis.ts
Normal file
|
@ -0,0 +1,124 @@
|
|||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||
import { redisConnection } from "../services/queue-service";
|
||||
|
||||
export type StoredCrawl = {
|
||||
originUrl: string;
|
||||
crawlerOptions: any;
|
||||
pageOptions: any;
|
||||
team_id: string;
|
||||
plan: string;
|
||||
robots?: string;
|
||||
cancelled?: boolean;
|
||||
createdAt: number;
|
||||
};
|
||||
|
||||
export async function saveCrawl(id: string, crawl: StoredCrawl) {
|
||||
await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
|
||||
await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
|
||||
}
|
||||
|
||||
export async function getCrawl(id: string): Promise<StoredCrawl | null> {
|
||||
const x = await redisConnection.get("crawl:" + id);
|
||||
|
||||
if (x === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return JSON.parse(x);
|
||||
}
|
||||
|
||||
export async function getCrawlExpiry(id: string): Promise<Date> {
|
||||
const d = new Date();
|
||||
const ttl = await redisConnection.pttl("crawl:" + id);
|
||||
d.setMilliseconds(d.getMilliseconds() + ttl);
|
||||
d.setMilliseconds(0);
|
||||
return d;
|
||||
}
|
||||
|
||||
export async function addCrawlJob(id: string, job_id: string) {
|
||||
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
|
||||
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
||||
}
|
||||
|
||||
export async function addCrawlJobs(id: string, job_ids: string[]) {
|
||||
await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
|
||||
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
||||
}
|
||||
|
||||
export async function addCrawlJobDone(id: string, job_id: string) {
|
||||
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
|
||||
await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id);
|
||||
await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
|
||||
await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
|
||||
}
|
||||
|
||||
export async function getDoneJobsOrderedLength(id: string): Promise<number> {
|
||||
return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
|
||||
}
|
||||
|
||||
export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
|
||||
return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end);
|
||||
}
|
||||
|
||||
export async function isCrawlFinished(id: string) {
|
||||
return (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs"));
|
||||
}
|
||||
|
||||
export async function isCrawlFinishedLocked(id: string) {
|
||||
return (await redisConnection.exists("crawl:" + id + ":finish"));
|
||||
}
|
||||
|
||||
export async function finishCrawl(id: string) {
|
||||
if (await isCrawlFinished(id)) {
|
||||
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
|
||||
if (set === 1) {
|
||||
await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
|
||||
}
|
||||
return set === 1
|
||||
}
|
||||
}
|
||||
|
||||
export async function getCrawlJobs(id: string): Promise<string[]> {
|
||||
return await redisConnection.smembers("crawl:" + id + ":jobs");
|
||||
}
|
||||
|
||||
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
|
||||
if (typeof sc.crawlerOptions?.limit === "number") {
|
||||
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||
return res;
|
||||
}
|
||||
|
||||
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
|
||||
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
|
||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
|
||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||
return res;
|
||||
}
|
||||
|
||||
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
||||
const crawler = new WebCrawler({
|
||||
jobId: id,
|
||||
initialUrl: sc.originUrl,
|
||||
includes: sc.crawlerOptions?.includes ?? [],
|
||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||
maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10,
|
||||
limit: sc.crawlerOptions?.limit ?? 10000,
|
||||
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
||||
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
||||
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
|
||||
});
|
||||
|
||||
if (sc.robots !== undefined) {
|
||||
try {
|
||||
crawler.importRobotsTxt(sc.robots);
|
||||
} catch (_) {}
|
||||
}
|
||||
|
||||
return crawler;
|
||||
}
|
|
@ -19,3 +19,4 @@ export class CustomError extends Error {
|
|||
Object.setPrototypeOf(this, CustomError.prototype);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
export const defaultOrigin = "api";
|
||||
|
||||
export const defaultTimeout = 45000; // 45 seconds
|
||||
export const defaultTimeout = 60000; // 60 seconds
|
||||
|
||||
export const defaultPageOptions = {
|
||||
onlyMainContent: false,
|
||||
|
@ -12,7 +12,8 @@ export const defaultPageOptions = {
|
|||
};
|
||||
|
||||
export const defaultCrawlerOptions = {
|
||||
allowBackwardCrawling: false
|
||||
allowBackwardCrawling: false,
|
||||
limit: 10000
|
||||
}
|
||||
|
||||
export const defaultCrawlPageOptions = {
|
||||
|
|
|
@ -11,6 +11,8 @@ export interface Progress {
|
|||
}
|
||||
|
||||
export type PageOptions = {
|
||||
includeMarkdown?: boolean;
|
||||
includeExtract?: boolean;
|
||||
onlyMainContent?: boolean;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
|
@ -24,12 +26,17 @@ export type PageOptions = {
|
|||
parsePDF?: boolean;
|
||||
removeTags?: string | string[];
|
||||
onlyIncludeTags?: string | string[];
|
||||
includeLinks?: boolean;
|
||||
useFastMode?: boolean; // beta
|
||||
disableJSDom?: boolean; // beta
|
||||
atsv?: boolean; // beta
|
||||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
|
||||
extractionPrompt?: string;
|
||||
extractionSchema?: Record<string, any>;
|
||||
userPrompt?: string;
|
||||
}
|
||||
|
||||
export type SearchOptions = {
|
||||
|
@ -65,6 +72,8 @@ export type WebScraperOptions = {
|
|||
extractorOptions?: ExtractorOptions;
|
||||
concurrentRequests?: number;
|
||||
bullJobId?: string;
|
||||
priority?: number;
|
||||
teamId?: string;
|
||||
};
|
||||
|
||||
export interface DocumentUrl {
|
||||
|
@ -141,4 +150,5 @@ export interface FireEngineOptions{
|
|||
blockMedia?: boolean;
|
||||
blockAds?: boolean;
|
||||
disableJsDom?: boolean;
|
||||
atsv?: boolean; // beta
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
export function parseMarkdown(html: string) {
|
||||
export async function parseMarkdown(html: string) {
|
||||
var TurndownService = require("turndown");
|
||||
var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
|
||||
|
||||
|
@ -21,7 +21,27 @@ export function parseMarkdown(html: string) {
|
|||
});
|
||||
var gfm = turndownPluginGfm.gfm;
|
||||
turndownService.use(gfm);
|
||||
let markdownContent = turndownService.turndown(html);
|
||||
let markdownContent = "";
|
||||
const turndownPromise = new Promise<string>((resolve, reject) => {
|
||||
try {
|
||||
const result = turndownService.turndown(html);
|
||||
resolve(result);
|
||||
} catch (error) {
|
||||
reject("Error converting HTML to Markdown: " + error);
|
||||
}
|
||||
});
|
||||
|
||||
const timeoutPromise = new Promise<string>((resolve, reject) => {
|
||||
const timeout = 5000; // Timeout in milliseconds
|
||||
setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout);
|
||||
});
|
||||
|
||||
try {
|
||||
markdownContent = await Promise.race([turndownPromise, timeoutPromise]);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
return ""; // Optionally return an empty string or handle the error as needed
|
||||
}
|
||||
|
||||
// multiple line links
|
||||
let insideLinkContent = false;
|
||||
|
|
91
apps/api/src/lib/job-priority.ts
Normal file
91
apps/api/src/lib/job-priority.ts
Normal file
|
@ -0,0 +1,91 @@
|
|||
import { redisConnection } from "../../src/services/queue-service";
|
||||
import { PlanType } from "../../src/types";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
const SET_KEY_PREFIX = "limit_team_id:";
|
||||
export async function addJobPriority(team_id, job_id) {
|
||||
try {
|
||||
const setKey = SET_KEY_PREFIX + team_id;
|
||||
|
||||
// Add scrape job id to the set
|
||||
await redisConnection.sadd(setKey, job_id);
|
||||
|
||||
// This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
|
||||
await redisConnection.expire(setKey, 60);
|
||||
} catch (e) {
|
||||
Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
|
||||
}
|
||||
}
|
||||
|
||||
export async function deleteJobPriority(team_id, job_id) {
|
||||
try {
|
||||
const setKey = SET_KEY_PREFIX + team_id;
|
||||
|
||||
// remove job_id from the set
|
||||
await redisConnection.srem(setKey, job_id);
|
||||
} catch (e) {
|
||||
Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
|
||||
}
|
||||
}
|
||||
|
||||
export async function getJobPriority({
|
||||
plan,
|
||||
team_id,
|
||||
basePriority = 10,
|
||||
}: {
|
||||
plan: PlanType;
|
||||
team_id: string;
|
||||
basePriority?: number;
|
||||
}): Promise<number> {
|
||||
try {
|
||||
const setKey = SET_KEY_PREFIX + team_id;
|
||||
|
||||
// Get the length of the set
|
||||
const setLength = await redisConnection.scard(setKey);
|
||||
|
||||
// Determine the priority based on the plan and set length
|
||||
let planModifier = 1;
|
||||
let bucketLimit = 0;
|
||||
|
||||
switch (plan) {
|
||||
case "free":
|
||||
bucketLimit = 25;
|
||||
planModifier = 0.5;
|
||||
break;
|
||||
case "hobby":
|
||||
bucketLimit = 100;
|
||||
planModifier = 0.3;
|
||||
break;
|
||||
case "standard":
|
||||
case "standardnew":
|
||||
bucketLimit = 200;
|
||||
planModifier = 0.2;
|
||||
break;
|
||||
case "growth":
|
||||
case "growthdouble":
|
||||
bucketLimit = 400;
|
||||
planModifier = 0.1;
|
||||
break;
|
||||
|
||||
default:
|
||||
bucketLimit = 25;
|
||||
planModifier = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
// if length set is smaller than set, just return base priority
|
||||
if (setLength <= bucketLimit) {
|
||||
return basePriority;
|
||||
} else {
|
||||
// If not, we keep base priority + planModifier
|
||||
return Math.ceil(
|
||||
basePriority + Math.ceil((setLength - bucketLimit) * planModifier)
|
||||
);
|
||||
}
|
||||
} catch (e) {
|
||||
Logger.error(
|
||||
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
|
||||
);
|
||||
return basePriority;
|
||||
}
|
||||
}
|
46
apps/api/src/lib/map-cosine.ts
Normal file
46
apps/api/src/lib/map-cosine.ts
Normal file
|
@ -0,0 +1,46 @@
|
|||
import { Logger } from "./logger";
|
||||
|
||||
export function performCosineSimilarity(links: string[], searchQuery: string) {
|
||||
try {
|
||||
// Function to calculate cosine similarity
|
||||
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
|
||||
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
|
||||
const magnitude1 = Math.sqrt(
|
||||
vec1.reduce((sum, val) => sum + val * val, 0)
|
||||
);
|
||||
const magnitude2 = Math.sqrt(
|
||||
vec2.reduce((sum, val) => sum + val * val, 0)
|
||||
);
|
||||
if (magnitude1 === 0 || magnitude2 === 0) return 0;
|
||||
return dotProduct / (magnitude1 * magnitude2);
|
||||
};
|
||||
|
||||
// Function to convert text to vector
|
||||
const textToVector = (text: string): number[] => {
|
||||
const words = searchQuery.toLowerCase().split(/\W+/);
|
||||
return words.map((word) => {
|
||||
const count = (text.toLowerCase().match(new RegExp(word, "g")) || [])
|
||||
.length;
|
||||
return count / text.length;
|
||||
});
|
||||
};
|
||||
|
||||
// Calculate similarity scores
|
||||
const similarityScores = links.map((link) => {
|
||||
const linkVector = textToVector(link);
|
||||
const searchVector = textToVector(searchQuery);
|
||||
return cosineSimilarity(linkVector, searchVector);
|
||||
});
|
||||
|
||||
// Sort links based on similarity scores and print scores
|
||||
const a = links
|
||||
.map((link, index) => ({ link, score: similarityScores[index] }))
|
||||
.sort((a, b) => b.score - a.score);
|
||||
|
||||
links = a.map((item) => item.link);
|
||||
return links;
|
||||
} catch (error) {
|
||||
Logger.error(`Error performing cosine similarity: ${error}`);
|
||||
return links;
|
||||
}
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
import { Job, JobId } from "bull";
|
||||
import { Job } from "bullmq";
|
||||
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
||||
import { supabase_service as supabase } from "../services/supabase";
|
||||
import { Logger } from "./logger";
|
||||
|
@ -71,7 +71,7 @@ export class ScrapeEvents {
|
|||
}
|
||||
}
|
||||
|
||||
static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) {
|
||||
static async logJobEvent(job: Job | any, event: ScrapeQueueEvent["event"]) {
|
||||
try {
|
||||
await this.insert(((job as any).id ? (job as any).id : job) as string, {
|
||||
type: "queue",
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
import { supabase_service } from "../services/supabase";
|
||||
import { Logger } from "./logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export const supabaseGetJobById = async (jobId: string) => {
|
||||
const { data, error } = await supabase_service
|
||||
.from('firecrawl_jobs')
|
||||
.select('*')
|
||||
.eq('job_id', jobId)
|
||||
.from("firecrawl_jobs")
|
||||
.select("*")
|
||||
.eq("job_id", jobId)
|
||||
.single();
|
||||
|
||||
if (error) {
|
||||
|
@ -16,4 +18,41 @@ export const supabaseGetJobById = async (jobId: string) => {
|
|||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
};
|
||||
|
||||
export const supabaseGetJobsById = async (jobIds: string[]) => {
|
||||
const { data, error } = await supabase_service.rpc("get_jobs_by_ids", {
|
||||
job_ids: jobIds,
|
||||
});
|
||||
|
||||
if (error) {
|
||||
Logger.error(`Error in get_jobs_by_ids: ${error}`);
|
||||
Sentry.captureException(error);
|
||||
return [];
|
||||
}
|
||||
|
||||
if (!data) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return data;
|
||||
};
|
||||
|
||||
|
||||
export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
|
||||
const { data, error } = await supabase_service
|
||||
.from("firecrawl_jobs")
|
||||
.select("docs, team_id")
|
||||
.eq("job_id", jobId)
|
||||
.single();
|
||||
|
||||
if (error) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!data) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return data;
|
||||
};
|
159
apps/api/src/lib/validateUrl.test.ts
Normal file
159
apps/api/src/lib/validateUrl.test.ts
Normal file
|
@ -0,0 +1,159 @@
|
|||
import { isSameDomain, removeDuplicateUrls } from "./validateUrl";
|
||||
import { isSameSubdomain } from "./validateUrl";
|
||||
|
||||
describe("isSameDomain", () => {
|
||||
it("should return true for a subdomain", () => {
|
||||
const result = isSameDomain("http://sub.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return true for the same domain", () => {
|
||||
const result = isSameDomain("http://example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for different domains", () => {
|
||||
const result = isSameDomain("http://example.com", "http://another.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for a subdomain with different protocols", () => {
|
||||
const result = isSameDomain("https://sub.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for invalid URLs", () => {
|
||||
const result = isSameDomain("invalid-url", "http://example.com");
|
||||
expect(result).toBe(false);
|
||||
const result2 = isSameDomain("http://example.com", "invalid-url");
|
||||
expect(result2).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for a subdomain with www prefix", () => {
|
||||
const result = isSameDomain("http://www.sub.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return true for the same domain with www prefix", () => {
|
||||
const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
|
||||
|
||||
describe("isSameSubdomain", () => {
|
||||
it("should return false for a subdomain", () => {
|
||||
const result = isSameSubdomain("http://example.com", "http://docs.example.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for the same subdomain", () => {
|
||||
const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for different subdomains", () => {
|
||||
const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return false for different domains", () => {
|
||||
const result = isSameSubdomain("http://example.com", "http://another.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return false for invalid URLs", () => {
|
||||
const result = isSameSubdomain("invalid-url", "http://example.com");
|
||||
expect(result).toBe(false);
|
||||
const result2 = isSameSubdomain("http://example.com", "invalid-url");
|
||||
expect(result2).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for the same subdomain with different protocols", () => {
|
||||
const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return true for the same subdomain with www prefix", () => {
|
||||
const result = isSameSubdomain("http://www.docs.example.com", "http://docs.example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for a subdomain with www prefix and different subdomain", () => {
|
||||
const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("removeDuplicateUrls", () => {
|
||||
it("should remove duplicate URLs with different protocols", () => {
|
||||
const urls = [
|
||||
"http://example.com",
|
||||
"https://example.com",
|
||||
"http://www.example.com",
|
||||
"https://www.example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
|
||||
it("should keep URLs with different paths", () => {
|
||||
const urls = [
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page1?param=1",
|
||||
"https://example.com/page1#section1"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual([
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page1?param=1",
|
||||
"https://example.com/page1#section1"
|
||||
]);
|
||||
});
|
||||
|
||||
it("should prefer https over http", () => {
|
||||
const urls = [
|
||||
"http://example.com",
|
||||
"https://example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
|
||||
it("should prefer non-www over www", () => {
|
||||
const urls = [
|
||||
"https://www.example.com",
|
||||
"https://example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
|
||||
it("should handle empty input", () => {
|
||||
const urls: string[] = [];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual([]);
|
||||
});
|
||||
|
||||
it("should handle URLs with different cases", () => {
|
||||
const urls = [
|
||||
"https://EXAMPLE.com",
|
||||
"https://example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://EXAMPLE.com"]);
|
||||
});
|
||||
|
||||
it("should handle URLs with trailing slashes", () => {
|
||||
const urls = [
|
||||
"https://example.com",
|
||||
"https://example.com/"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
});
|
170
apps/api/src/lib/validateUrl.ts
Normal file
170
apps/api/src/lib/validateUrl.ts
Normal file
|
@ -0,0 +1,170 @@
|
|||
export const protocolIncluded = (url: string) => {
|
||||
// if :// not in the start of the url assume http (maybe https?)
|
||||
// regex checks if :// appears before any .
|
||||
return /^([^.:]+:\/\/)/.test(url);
|
||||
};
|
||||
|
||||
const getURLobj = (s: string) => {
|
||||
// URL fails if we dont include the protocol ie google.com
|
||||
let error = false;
|
||||
let urlObj = {};
|
||||
try {
|
||||
urlObj = new URL(s);
|
||||
} catch (err) {
|
||||
error = true;
|
||||
}
|
||||
return { error, urlObj };
|
||||
};
|
||||
|
||||
export const checkAndUpdateURL = (url: string) => {
|
||||
if (!protocolIncluded(url)) {
|
||||
url = `http://${url}`;
|
||||
}
|
||||
|
||||
const { error, urlObj } = getURLobj(url);
|
||||
if (error) {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
const typedUrlObj = urlObj as URL;
|
||||
|
||||
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
return { urlObj: typedUrlObj, url: url };
|
||||
};
|
||||
|
||||
export const checkUrl = (url: string) => {
|
||||
const { error, urlObj } = getURLobj(url);
|
||||
if (error) {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
const typedUrlObj = urlObj as URL;
|
||||
|
||||
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
if ((url.split(".")[0].match(/:/g) || []).length !== 1) {
|
||||
throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com
|
||||
}
|
||||
|
||||
return url;
|
||||
};
|
||||
|
||||
/**
|
||||
* Same domain check
|
||||
* It checks if the domain of the url is the same as the base url
|
||||
* It accounts true for subdomains and www.subdomains
|
||||
* @param url
|
||||
* @param baseUrl
|
||||
* @returns
|
||||
*/
|
||||
export function isSameDomain(url: string, baseUrl: string) {
|
||||
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
|
||||
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
|
||||
|
||||
if (error1 || error2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const typedUrlObj1 = urlObj1 as URL;
|
||||
const typedUrlObj2 = urlObj2 as URL;
|
||||
|
||||
const cleanHostname = (hostname: string) => {
|
||||
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
|
||||
};
|
||||
|
||||
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
|
||||
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
|
||||
|
||||
return domain1 === domain2;
|
||||
}
|
||||
|
||||
|
||||
export function isSameSubdomain(url: string, baseUrl: string) {
|
||||
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
|
||||
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
|
||||
|
||||
if (error1 || error2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const typedUrlObj1 = urlObj1 as URL;
|
||||
const typedUrlObj2 = urlObj2 as URL;
|
||||
|
||||
const cleanHostname = (hostname: string) => {
|
||||
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
|
||||
};
|
||||
|
||||
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
|
||||
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
|
||||
|
||||
const subdomain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(0, -2).join('.');
|
||||
const subdomain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(0, -2).join('.');
|
||||
|
||||
// Check if the domains are the same and the subdomains are the same
|
||||
return domain1 === domain2 && subdomain1 === subdomain2;
|
||||
}
|
||||
|
||||
|
||||
export const checkAndUpdateURLForMap = (url: string) => {
|
||||
if (!protocolIncluded(url)) {
|
||||
url = `http://${url}`;
|
||||
}
|
||||
// remove last slash if present
|
||||
if (url.endsWith("/")) {
|
||||
url = url.slice(0, -1);
|
||||
}
|
||||
|
||||
|
||||
const { error, urlObj } = getURLobj(url);
|
||||
if (error) {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
const typedUrlObj = urlObj as URL;
|
||||
|
||||
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
// remove any query params
|
||||
url = url.split("?")[0].trim();
|
||||
|
||||
return { urlObj: typedUrlObj, url: url };
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
export function removeDuplicateUrls(urls: string[]): string[] {
|
||||
const urlMap = new Map<string, string>();
|
||||
|
||||
for (const url of urls) {
|
||||
const parsedUrl = new URL(url);
|
||||
const protocol = parsedUrl.protocol;
|
||||
const hostname = parsedUrl.hostname.replace(/^www\./, '');
|
||||
const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
|
||||
|
||||
const key = `${hostname}${path}`;
|
||||
|
||||
if (!urlMap.has(key)) {
|
||||
urlMap.set(key, url);
|
||||
} else {
|
||||
const existingUrl = new URL(urlMap.get(key)!);
|
||||
const existingProtocol = existingUrl.protocol;
|
||||
|
||||
if (protocol === 'https:' && existingProtocol === 'http:') {
|
||||
urlMap.set(key, url);
|
||||
} else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) {
|
||||
urlMap.set(key, url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [...new Set(Array.from(urlMap.values()))];
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
import { Job } from "bull";
|
||||
import { Job } from "bullmq";
|
||||
import {
|
||||
CrawlResult,
|
||||
WebScraperOptions,
|
||||
|
@ -15,15 +15,23 @@ import { ScrapeEvents } from "../lib/scrape-events";
|
|||
|
||||
export async function startWebScraperPipeline({
|
||||
job,
|
||||
token,
|
||||
}: {
|
||||
job: Job<WebScraperOptions>;
|
||||
token: string;
|
||||
}) {
|
||||
let partialDocs: Document[] = [];
|
||||
return (await runWebScraper({
|
||||
url: job.data.url,
|
||||
mode: job.data.mode,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
extractorOptions: job.data.extractorOptions,
|
||||
pageOptions: {
|
||||
...job.data.pageOptions,
|
||||
...(job.data.crawl_id ? ({
|
||||
includeRawHtml: true,
|
||||
}): {}),
|
||||
},
|
||||
inProgress: (progress) => {
|
||||
Logger.debug(`🐂 Job in progress ${job.id}`);
|
||||
if (progress.currentDocument) {
|
||||
|
@ -31,20 +39,22 @@ export async function startWebScraperPipeline({
|
|||
if (partialDocs.length > 50) {
|
||||
partialDocs = partialDocs.slice(-50);
|
||||
}
|
||||
job.progress({ ...progress, partialDocs: partialDocs });
|
||||
// job.updateProgress({ ...progress, partialDocs: partialDocs });
|
||||
}
|
||||
},
|
||||
onSuccess: (result) => {
|
||||
onSuccess: (result, mode) => {
|
||||
Logger.debug(`🐂 Job completed ${job.id}`);
|
||||
saveJob(job, result);
|
||||
saveJob(job, result, token, mode);
|
||||
},
|
||||
onError: (error) => {
|
||||
Logger.error(`🐂 Job failed ${job.id}`);
|
||||
ScrapeEvents.logJobEvent(job, "failed");
|
||||
job.moveToFailed(error);
|
||||
job.moveToFailed(error, token, false);
|
||||
},
|
||||
team_id: job.data.team_id,
|
||||
bull_job_id: job.id.toString(),
|
||||
priority: job.opts.priority,
|
||||
is_scrape: job.data.is_scrape ?? false,
|
||||
})) as { success: boolean; message: string; docs: Document[] };
|
||||
}
|
||||
export async function runWebScraper({
|
||||
|
@ -52,11 +62,14 @@ export async function runWebScraper({
|
|||
mode,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
inProgress,
|
||||
onSuccess,
|
||||
onError,
|
||||
team_id,
|
||||
bull_job_id,
|
||||
priority,
|
||||
is_scrape=false,
|
||||
}: RunWebScraperParams): Promise<RunWebScraperResult> {
|
||||
try {
|
||||
const provider = new WebScraperDataProvider();
|
||||
|
@ -65,17 +78,22 @@ export async function runWebScraper({
|
|||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: [url],
|
||||
extractorOptions,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
bullJobId: bull_job_id,
|
||||
priority,
|
||||
});
|
||||
} else {
|
||||
await provider.setOptions({
|
||||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: url.split(","),
|
||||
extractorOptions,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
priority,
|
||||
teamId: team_id
|
||||
});
|
||||
}
|
||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||
|
@ -97,21 +115,24 @@ export async function runWebScraper({
|
|||
return { url: doc.metadata.sourceURL };
|
||||
}
|
||||
})
|
||||
: docs.filter((doc) => doc.content.trim().length > 0);
|
||||
: docs;
|
||||
|
||||
const billingResult = await billTeam(team_id, filteredDocs.length);
|
||||
|
||||
if (!billingResult.success) {
|
||||
// throw new Error("Failed to bill team, no subscription was found");
|
||||
return {
|
||||
success: false,
|
||||
message: "Failed to bill team, no subscription was found",
|
||||
docs: [],
|
||||
};
|
||||
if(is_scrape === false) {
|
||||
const billingResult = await billTeam(team_id, filteredDocs.length);
|
||||
if (!billingResult.success) {
|
||||
// throw new Error("Failed to bill team, no subscription was found");
|
||||
return {
|
||||
success: false,
|
||||
message: "Failed to bill team, no subscription was found",
|
||||
docs: [],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// This is where the returnvalue from the job is set
|
||||
onSuccess(filteredDocs);
|
||||
onSuccess(filteredDocs, mode);
|
||||
|
||||
// this return doesn't matter too much for the job completion result
|
||||
return { success: true, message: "", docs: filteredDocs };
|
||||
|
@ -121,7 +142,7 @@ export async function runWebScraper({
|
|||
}
|
||||
}
|
||||
|
||||
const saveJob = async (job: Job, result: any) => {
|
||||
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
|
||||
try {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
|
@ -131,17 +152,21 @@ const saveJob = async (job: Job, result: any) => {
|
|||
.eq("job_id", job.id);
|
||||
|
||||
if (error) throw new Error(error.message);
|
||||
try {
|
||||
await job.moveToCompleted(null, false, false);
|
||||
} catch (error) {
|
||||
// I think the job won't exist here anymore
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
await job.moveToCompleted(result, false, false);
|
||||
} catch (error) {
|
||||
// I think the job won't exist here anymore
|
||||
}
|
||||
// try {
|
||||
// if (mode === "crawl") {
|
||||
// await job.moveToCompleted(null, token, false);
|
||||
// } else {
|
||||
// await job.moveToCompleted(result, token, false);
|
||||
// }
|
||||
// } catch (error) {
|
||||
// // I think the job won't exist here anymore
|
||||
// }
|
||||
// } else {
|
||||
// try {
|
||||
// await job.moveToCompleted(result, token, false);
|
||||
// } catch (error) {
|
||||
// // I think the job won't exist here anymore
|
||||
// }
|
||||
}
|
||||
ScrapeEvents.logJobEvent(job, "completed");
|
||||
} catch (error) {
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
import express from "express";
|
||||
import { redisHealthController } from "../controllers/admin/redis-health";
|
||||
import { redisHealthController } from "../controllers/v0/admin/redis-health";
|
||||
import {
|
||||
autoscalerController,
|
||||
checkQueuesController,
|
||||
cleanBefore24hCompleteJobsController,
|
||||
queuesController,
|
||||
} from "../controllers/admin/queue";
|
||||
} from "../controllers/v0/admin/queue";
|
||||
|
||||
export const adminRouter = express.Router();
|
||||
|
||||
|
@ -27,3 +28,8 @@ adminRouter.get(
|
|||
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
||||
queuesController
|
||||
);
|
||||
|
||||
adminRouter.get(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
|
||||
autoscalerController
|
||||
);
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
import express from "express";
|
||||
import { crawlController } from "../../src/controllers/crawl";
|
||||
import { crawlStatusController } from "../../src/controllers/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/scrape";
|
||||
import { crawlPreviewController } from "../../src/controllers/crawlPreview";
|
||||
import { crawlJobStatusPreviewController } from "../../src/controllers/status";
|
||||
import { searchController } from "../../src/controllers/search";
|
||||
import { crawlCancelController } from "../../src/controllers/crawl-cancel";
|
||||
import { keyAuthController } from "../../src/controllers/keyAuth";
|
||||
import { livenessController } from "../controllers/liveness";
|
||||
import { readinessController } from "../controllers/readiness";
|
||||
import { crawlController } from "../../src/controllers/v0/crawl";
|
||||
import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/v0/scrape";
|
||||
import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
|
||||
import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
|
||||
import { searchController } from "../../src/controllers/v0/search";
|
||||
import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
|
||||
import { keyAuthController } from "../../src/controllers/v0/keyAuth";
|
||||
import { livenessController } from "../controllers/v0/liveness";
|
||||
import { readinessController } from "../controllers/v0/readiness";
|
||||
|
||||
export const v0Router = express.Router();
|
||||
|
||||
|
|
156
apps/api/src/routes/v1.ts
Normal file
156
apps/api/src/routes/v1.ts
Normal file
|
@ -0,0 +1,156 @@
|
|||
import express, { NextFunction, Request, Response } from "express";
|
||||
import { crawlController } from "../controllers/v1/crawl";
|
||||
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/v1/scrape";
|
||||
import { crawlStatusController } from "../controllers/v1/crawl-status";
|
||||
import { mapController } from "../controllers/v1/map";
|
||||
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { authenticateUser } from "../controllers/auth";
|
||||
import { createIdempotencyKey } from "../services/idempotency/create";
|
||||
import { validateIdempotencyKey } from "../services/idempotency/validate";
|
||||
import { checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
||||
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
|
||||
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
|
||||
// import { searchController } from "../../src/controllers/v1/search";
|
||||
// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel";
|
||||
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
|
||||
// import { livenessController } from "../controllers/v1/liveness";
|
||||
// import { readinessController } from "../controllers/v1/readiness";
|
||||
|
||||
function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
|
||||
return (req, res, next) => {
|
||||
(async () => {
|
||||
if (!minimum && req.body) {
|
||||
minimum = (req.body as any)?.limit ?? 1;
|
||||
}
|
||||
const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
|
||||
if (!success) {
|
||||
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||
return res.status(402).json({ success: false, error: "Insufficient credits" });
|
||||
}
|
||||
req.account = { remainingCredits }
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
};
|
||||
}
|
||||
|
||||
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
||||
return (req, res, next) => {
|
||||
(async () => {
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
rateLimiterMode,
|
||||
);
|
||||
|
||||
if (!success) {
|
||||
return res.status(status).json({ success: false, error });
|
||||
}
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
}
|
||||
}
|
||||
|
||||
function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
|
||||
(async () => {
|
||||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
return res.status(409).json({ success: false, error: "Idempotency key already used" });
|
||||
}
|
||||
createIdempotencyKey(req);
|
||||
}
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
}
|
||||
|
||||
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
|
||||
if (req.body.url && isUrlBlocked(req.body.url)) {
|
||||
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
|
||||
}
|
||||
next();
|
||||
}
|
||||
|
||||
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
|
||||
return (req, res, next) => {
|
||||
controller(req, res)
|
||||
.catch(err => next(err))
|
||||
}
|
||||
}
|
||||
|
||||
expressWs(express());
|
||||
|
||||
export const v1Router = express.Router();
|
||||
|
||||
v1Router.post(
|
||||
"/scrape",
|
||||
blocklistMiddleware,
|
||||
authMiddleware(RateLimiterMode.Scrape),
|
||||
checkCreditsMiddleware(1),
|
||||
wrap(scrapeController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/crawl",
|
||||
blocklistMiddleware,
|
||||
authMiddleware(RateLimiterMode.Crawl),
|
||||
idempotencyMiddleware,
|
||||
checkCreditsMiddleware(),
|
||||
wrap(crawlController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/map",
|
||||
blocklistMiddleware,
|
||||
authMiddleware(RateLimiterMode.Map),
|
||||
checkCreditsMiddleware(1),
|
||||
wrap(mapController)
|
||||
);
|
||||
|
||||
v1Router.get(
|
||||
"/crawl/:jobId",
|
||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||
wrap(crawlStatusController)
|
||||
);
|
||||
|
||||
v1Router.get(
|
||||
"/scrape/:jobId",
|
||||
wrap(scrapeStatusController)
|
||||
);
|
||||
|
||||
v1Router.ws(
|
||||
"/crawl/:jobId",
|
||||
crawlStatusWSController
|
||||
);
|
||||
|
||||
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
||||
|
||||
|
||||
v1Router.delete(
|
||||
"/crawl/:jobId",
|
||||
authMiddleware(RateLimiterMode.Crawl),
|
||||
crawlCancelController
|
||||
);
|
||||
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
||||
|
||||
// // Auth route for key based authentication
|
||||
// v1Router.get("/keyAuth", keyAuthController);
|
||||
|
||||
// // Search routes
|
||||
// v0Router.post("/search", searchController);
|
||||
|
||||
// Health/Probe routes
|
||||
// v1Router.get("/health/liveness", livenessController);
|
||||
// v1Router.get("/health/readiness", readinessController);
|
175
apps/api/src/run-req.ts
Normal file
175
apps/api/src/run-req.ts
Normal file
|
@ -0,0 +1,175 @@
|
|||
import axios from "axios";
|
||||
import { promises as fs } from "fs";
|
||||
import { v4 as uuidV4 } from "uuid";
|
||||
|
||||
interface Result {
|
||||
start_url: string;
|
||||
job_id?: string;
|
||||
idempotency_key?: string;
|
||||
result_data_jsonb?: any;
|
||||
}
|
||||
|
||||
async function sendCrawl(result: Result): Promise<string | undefined> {
|
||||
const idempotencyKey = uuidV4();
|
||||
const url = result.start_url;
|
||||
try {
|
||||
const response = await axios.post(
|
||||
"https://staging-firecrawl-scraper-js.fly.dev/v0/crawl",
|
||||
{
|
||||
url: url,
|
||||
crawlerOptions: {
|
||||
limit: 75,
|
||||
},
|
||||
pageOptions: {
|
||||
includeHtml: true,
|
||||
replaceAllPathsWithAbsolutePaths: true,
|
||||
waitFor: 1000,
|
||||
},
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer `,
|
||||
},
|
||||
}
|
||||
);
|
||||
result.idempotency_key = idempotencyKey;
|
||||
return response.data.jobId;
|
||||
} catch (error) {
|
||||
console.error("Error sending crawl:", error);
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
async function getContent(result: Result): Promise<boolean> {
|
||||
let attempts = 0;
|
||||
while (attempts < 120) {
|
||||
// Reduce the number of attempts to speed up
|
||||
try {
|
||||
const response = await axios.get(
|
||||
`https://staging-firecrawl-scraper-js.fly.dev/v0/crawl/status/${result.job_id}`,
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer `,
|
||||
},
|
||||
}
|
||||
);
|
||||
if (response.data.status === "completed") {
|
||||
result.result_data_jsonb = response.data.data;
|
||||
// Job actually completed
|
||||
return true;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error getting content:", error);
|
||||
}
|
||||
const randomSleep = Math.floor(Math.random() * 15000) + 5000;
|
||||
await new Promise((resolve) => setTimeout(resolve, randomSleep)); // Reduce sleep time to 1.5 seconds
|
||||
attempts++;
|
||||
}
|
||||
// Set result as null if timed out
|
||||
result.result_data_jsonb = null;
|
||||
return false;
|
||||
}
|
||||
|
||||
async function processResults(results: Result[]): Promise<void> {
|
||||
let processedCount = 0;
|
||||
let starterCount = 0;
|
||||
const queue: Result[] = [];
|
||||
const processedUrls = new Set<string>();
|
||||
|
||||
// Initialize the queue with the first 1000 results
|
||||
for (let i = 0; i < Math.min(100, results.length); i++) {
|
||||
queue.push(results[i]);
|
||||
processedUrls.add(results[i].start_url);
|
||||
}
|
||||
|
||||
// Function to process a single result
|
||||
const processSingleResult = async (result: Result) => {
|
||||
const jobId = await sendCrawl(result);
|
||||
if (jobId) {
|
||||
console.log(`Job requested count: ${starterCount}`);
|
||||
starterCount++;
|
||||
result.job_id = jobId;
|
||||
processedCount++;
|
||||
// Save the result to the file
|
||||
try {
|
||||
// Save job id along with the start_url
|
||||
const resultWithJobId = results.map(r => ({
|
||||
start_url: r.start_url,
|
||||
job_id: r.job_id,
|
||||
}));
|
||||
await fs.writeFile(
|
||||
"results_with_job_id_4000_6000.json",
|
||||
JSON.stringify(resultWithJobId, null, 4)
|
||||
);
|
||||
} catch (error) {
|
||||
console.error("Error writing to results_with_content.json:", error);
|
||||
}
|
||||
|
||||
// Add a new result to the queue if there are more results to process
|
||||
// if (processedCount < results.length) {
|
||||
// for (let i = queue.length; i < results.length; i++) {
|
||||
// if (!processedUrls.has(results[i].start_url)) {
|
||||
// const nextResult = results[i];
|
||||
// console.log("Next result:", nextResult.start_url);
|
||||
// queue.push(nextResult);
|
||||
// processedUrls.add(nextResult.start_url);
|
||||
// console.log(`Queue length: ${queue.length}`);
|
||||
// processSingleResult(nextResult);
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
||||
};
|
||||
|
||||
// Start processing the initial queue concurrently
|
||||
// for (let i = 0; i < queue.length; i++) {
|
||||
// processSingleResult(queue[i]);
|
||||
// if ((i + 1) % 500 === 0) {
|
||||
// console.log(`Processed ${i + 1} results, waiting for 1 minute before adding the next batch...`);
|
||||
// await new Promise(resolve => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
|
||||
// }
|
||||
// }
|
||||
// Start processing the initial queue concurrently
|
||||
// await Promise.all(queue.map(result => processSingleResult(result)));
|
||||
for (let i = 0; i < results.length; i += 100) {
|
||||
const batch = results.slice(i, i + 100);
|
||||
Promise.all(batch.map((result) => processSingleResult(result)))
|
||||
.then(() => {
|
||||
console.log(`Processed ${i + 100} results.`);
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(`Error processing batch starting at index ${i}:`, error);
|
||||
});
|
||||
await new Promise((resolve) => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
|
||||
}
|
||||
}
|
||||
|
||||
// Example call
|
||||
|
||||
async function getStartUrls(): Promise<Result[]> {
|
||||
try {
|
||||
const data = await fs.readFile("starturls.json", "utf-8");
|
||||
return JSON.parse(data);
|
||||
} catch (error) {
|
||||
console.error("Error reading starturls.json:", error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const results: Result[] = (await getStartUrls()).slice(3999, 6000);
|
||||
// console.log(results.map((r) => r.start_url).slice(0, 3));
|
||||
|
||||
processResults(results)
|
||||
.then(() => {
|
||||
console.log("All results processed.");
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error("Error processing results:", error);
|
||||
});
|
||||
}
|
||||
|
||||
main();
|
|
@ -24,7 +24,7 @@ describe('scrapSingleUrl', () => {
|
|||
});
|
||||
|
||||
it('should return a list of links on the firecrawl.ai page', async () => {
|
||||
const url = 'https://example.com';
|
||||
const url = 'https://flutterbricks.com';
|
||||
const pageOptions: PageOptions = { includeHtml: true };
|
||||
|
||||
const result = await scrapSingleUrl("TEST", url, pageOptions);
|
||||
|
@ -33,5 +33,5 @@ it('should return a list of links on the firecrawl.ai page', async () => {
|
|||
expect(result.linksOnPage).toBeDefined();
|
||||
expect(Array.isArray(result.linksOnPage)).toBe(true);
|
||||
expect(result.linksOnPage.length).toBeGreaterThan(0);
|
||||
expect(result.linksOnPage).toContain('https://www.iana.org/domains/example')
|
||||
}, 10000);
|
||||
expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
|
||||
}, 15000);
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import axios from "axios";
|
||||
import axios, { AxiosError } from "axios";
|
||||
import cheerio, { load } from "cheerio";
|
||||
import { URL } from "url";
|
||||
import { getLinksFromSitemap } from "./sitemap";
|
||||
|
@ -22,7 +22,7 @@ export class WebCrawler {
|
|||
private crawledUrls: Map<string, string> = new Map();
|
||||
private limit: number;
|
||||
private robotsTxtUrl: string;
|
||||
private robots: any;
|
||||
public robots: any;
|
||||
private generateImgAltText: boolean;
|
||||
private allowBackwardCrawling: boolean;
|
||||
private allowExternalContentLinks: boolean;
|
||||
|
@ -53,8 +53,8 @@ export class WebCrawler {
|
|||
this.jobId = jobId;
|
||||
this.initialUrl = initialUrl;
|
||||
this.baseUrl = new URL(initialUrl).origin;
|
||||
this.includes = includes ?? [];
|
||||
this.excludes = excludes ?? [];
|
||||
this.includes = Array.isArray(includes) ? includes : [];
|
||||
this.excludes = Array.isArray(excludes) ? excludes : [];
|
||||
this.limit = limit;
|
||||
this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
|
||||
this.robots = robotsParser(this.robotsTxtUrl, "");
|
||||
|
@ -66,10 +66,16 @@ export class WebCrawler {
|
|||
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
|
||||
}
|
||||
|
||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
return sitemapLinks
|
||||
.filter((link) => {
|
||||
const url = new URL(link.trim(), this.baseUrl);
|
||||
let url: URL;
|
||||
try {
|
||||
url = new URL(link.trim(), this.baseUrl);
|
||||
} catch (error) {
|
||||
Logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
|
||||
return false;
|
||||
}
|
||||
const path = url.pathname;
|
||||
|
||||
const depth = getURLDepth(url.toString());
|
||||
|
@ -102,7 +108,12 @@ export class WebCrawler {
|
|||
|
||||
// Normalize the initial URL and the link to account for www and non-www versions
|
||||
const normalizedInitialUrl = new URL(this.initialUrl);
|
||||
const normalizedLink = new URL(link);
|
||||
let normalizedLink;
|
||||
try {
|
||||
normalizedLink = new URL(link);
|
||||
} catch (_) {
|
||||
return false;
|
||||
}
|
||||
const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
|
||||
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
|
||||
|
||||
|
@ -130,6 +141,25 @@ export class WebCrawler {
|
|||
.slice(0, limit);
|
||||
}
|
||||
|
||||
public async getRobotsTxt(): Promise<string> {
|
||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
||||
return response.data;
|
||||
}
|
||||
|
||||
public importRobotsTxt(txt: string) {
|
||||
this.robots = robotsParser(this.robotsTxtUrl, txt);
|
||||
}
|
||||
|
||||
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
|
||||
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||
if (sitemapLinks.length > 0) {
|
||||
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
|
||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public async start(
|
||||
inProgress?: (progress: Progress) => void,
|
||||
pageOptions?: PageOptions,
|
||||
|
@ -142,19 +172,17 @@ export class WebCrawler {
|
|||
Logger.debug(`Crawler starting with ${this.initialUrl}`);
|
||||
// Fetch and parse robots.txt
|
||||
try {
|
||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
||||
this.robots = robotsParser(this.robotsTxtUrl, response.data);
|
||||
const txt = await this.getRobotsTxt();
|
||||
this.importRobotsTxt(txt);
|
||||
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
||||
}
|
||||
|
||||
if (!crawlerOptions?.ignoreSitemap){
|
||||
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||
if (sitemapLinks.length > 0) {
|
||||
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
||||
const sm = await this.tryGetSitemap();
|
||||
if (sm !== null) {
|
||||
return sm;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -241,6 +269,63 @@ export class WebCrawler {
|
|||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||
}
|
||||
|
||||
public filterURL(href: string, url: string): string | null {
|
||||
let fullUrl = href;
|
||||
if (!href.startsWith("http")) {
|
||||
try {
|
||||
fullUrl = new URL(href, this.baseUrl).toString();
|
||||
} catch (_) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
let urlObj;
|
||||
try {
|
||||
urlObj = new URL(fullUrl);
|
||||
} catch (_) {
|
||||
return null;
|
||||
}
|
||||
const path = urlObj.pathname;
|
||||
|
||||
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
||||
if (this.isInternalLink(fullUrl) &&
|
||||
this.noSections(fullUrl) &&
|
||||
!this.matchesExcludes(path) &&
|
||||
this.isRobotsAllowed(fullUrl)
|
||||
) {
|
||||
return fullUrl;
|
||||
}
|
||||
} else { // EXTERNAL LINKS
|
||||
if (
|
||||
this.isInternalLink(url) &&
|
||||
this.allowExternalContentLinks &&
|
||||
!this.isSocialMediaOrEmail(fullUrl) &&
|
||||
!this.matchesExcludes(fullUrl, true) &&
|
||||
!this.isExternalMainPage(fullUrl)
|
||||
) {
|
||||
return fullUrl;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public extractLinksFromHTML(html: string, url: string) {
|
||||
let links: string[] = [];
|
||||
|
||||
const $ = load(html);
|
||||
$("a").each((_, element) => {
|
||||
const href = $(element).attr("href");
|
||||
if (href) {
|
||||
const u = this.filterURL(href, url);
|
||||
if (u !== null) {
|
||||
links.push(u);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return links;
|
||||
}
|
||||
|
||||
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
|
||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
||||
return [];
|
||||
|
@ -284,37 +369,7 @@ export class WebCrawler {
|
|||
links.push({ url, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
|
||||
$("a").each((_, element) => {
|
||||
const href = $(element).attr("href");
|
||||
if (href) {
|
||||
let fullUrl = href;
|
||||
if (!href.startsWith("http")) {
|
||||
fullUrl = new URL(href, this.baseUrl).toString();
|
||||
}
|
||||
const urlObj = new URL(fullUrl);
|
||||
const path = urlObj.pathname;
|
||||
|
||||
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
||||
if (this.isInternalLink(fullUrl) &&
|
||||
this.noSections(fullUrl) &&
|
||||
!this.matchesExcludes(path) &&
|
||||
this.isRobotsAllowed(fullUrl)
|
||||
) {
|
||||
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
} else { // EXTERNAL LINKS
|
||||
if (
|
||||
this.isInternalLink(url) &&
|
||||
this.allowExternalContentLinks &&
|
||||
!this.isSocialMediaOrEmail(fullUrl) &&
|
||||
!this.matchesExcludes(fullUrl, true) &&
|
||||
!this.isExternalMainPage(fullUrl)
|
||||
) {
|
||||
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
|
||||
|
||||
if (this.visited.size === 1) {
|
||||
return links;
|
||||
|
@ -465,9 +520,13 @@ export class WebCrawler {
|
|||
}
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
||||
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
|
||||
if (response) {
|
||||
sitemapLinks = response;
|
||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||
// ignore 404
|
||||
} else {
|
||||
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
|
||||
if (response) {
|
||||
sitemapLinks = response;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -480,7 +539,11 @@ export class WebCrawler {
|
|||
}
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||
// ignore 404
|
||||
} else {
|
||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -16,7 +16,6 @@ import {
|
|||
replacePathsWithAbsolutePaths,
|
||||
} from "./utils/replacePaths";
|
||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
||||
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
@ -44,6 +43,8 @@ export class WebScraperDataProvider {
|
|||
private crawlerMode: string = "default";
|
||||
private allowBackwardCrawling: boolean = false;
|
||||
private allowExternalContentLinks: boolean = false;
|
||||
private priority?: number;
|
||||
private teamId?: string;
|
||||
|
||||
authorize(): void {
|
||||
throw new Error("Method not implemented.");
|
||||
|
@ -72,7 +73,9 @@ export class WebScraperDataProvider {
|
|||
url,
|
||||
this.pageOptions,
|
||||
this.extractorOptions,
|
||||
existingHTML
|
||||
existingHTML,
|
||||
this.priority,
|
||||
this.teamId,
|
||||
);
|
||||
processedUrls++;
|
||||
if (inProgress) {
|
||||
|
@ -88,21 +91,6 @@ export class WebScraperDataProvider {
|
|||
results[i + index] = result;
|
||||
})
|
||||
);
|
||||
try {
|
||||
if (this.mode === "crawl" && this.bullJobId) {
|
||||
const job = await getWebScraperQueue().getJob(this.bullJobId);
|
||||
const jobStatus = await job.getState();
|
||||
if (jobStatus === "failed") {
|
||||
Logger.info(
|
||||
"Job has failed or has been cancelled by the user. Stopping the job..."
|
||||
);
|
||||
return [] as Document[];
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error.message);
|
||||
return [] as Document[];
|
||||
}
|
||||
}
|
||||
return results.filter((result) => result !== null) as Document[];
|
||||
}
|
||||
|
@ -306,28 +294,32 @@ export class WebScraperDataProvider {
|
|||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
}
|
||||
|
||||
documents = this.applyPathReplacements(documents);
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
if (
|
||||
(this.extractorOptions.mode === "llm-extraction" ||
|
||||
this.extractorOptions.mode === "llm-extraction-from-markdown") &&
|
||||
this.mode === "single_urls"
|
||||
) {
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions,
|
||||
"markdown"
|
||||
);
|
||||
if (this.pageOptions.includeMarkdown) {
|
||||
documents = this.applyPathReplacements(documents);
|
||||
}
|
||||
if (
|
||||
this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
|
||||
this.mode === "single_urls"
|
||||
) {
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions,
|
||||
"raw-html"
|
||||
);
|
||||
|
||||
if (!this.pageOptions.includeHtml) {
|
||||
for (let document of documents) {
|
||||
delete document.html;
|
||||
}
|
||||
}
|
||||
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
|
||||
const extractionMode = this.extractorOptions?.mode ?? "markdown";
|
||||
const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
|
||||
|
||||
if (
|
||||
extractionMode === "llm-extraction" ||
|
||||
extractionMode === "llm-extraction-from-markdown" ||
|
||||
extractionMode === "llm-extraction-from-raw-html"
|
||||
) {
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions,
|
||||
completionMode
|
||||
);
|
||||
}
|
||||
}
|
||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
||||
}
|
||||
|
@ -359,6 +351,7 @@ export class WebScraperDataProvider {
|
|||
});
|
||||
return {
|
||||
content: content,
|
||||
markdown: content,
|
||||
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
|
@ -581,12 +574,21 @@ export class WebScraperDataProvider {
|
|||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||
this.generateImgAltText =
|
||||
options.crawlerOptions?.generateImgAltText ?? false;
|
||||
this.pageOptions = options.pageOptions ?? {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
replaceAllPathsWithAbsolutePaths: false,
|
||||
parsePDF: true,
|
||||
removeTags: [],
|
||||
this.pageOptions = {
|
||||
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
|
||||
includeHtml: options.pageOptions?.includeHtml ?? false,
|
||||
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
parsePDF: options.pageOptions?.parsePDF ?? true,
|
||||
onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
|
||||
removeTags: options.pageOptions?.removeTags ?? [],
|
||||
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
|
||||
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
|
||||
includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
|
||||
waitFor: options.pageOptions?.waitFor ?? undefined,
|
||||
headers: options.pageOptions?.headers ?? undefined,
|
||||
includeLinks: options.pageOptions?.includeLinks ?? true,
|
||||
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
|
||||
screenshot: options.pageOptions?.screenshot ?? false,
|
||||
};
|
||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||
this.replaceAllPathsWithAbsolutePaths =
|
||||
|
@ -608,6 +610,10 @@ export class WebScraperDataProvider {
|
|||
options.crawlerOptions?.allowBackwardCrawling ?? false;
|
||||
this.allowExternalContentLinks =
|
||||
options.crawlerOptions?.allowExternalContentLinks ?? false;
|
||||
this.priority = options.priority;
|
||||
this.teamId = options.teamId ?? null;
|
||||
|
||||
|
||||
|
||||
// make sure all urls start with https://
|
||||
this.urls = this.urls.map((url) => {
|
||||
|
|
|
@ -5,6 +5,7 @@ import { generateRequestParams } from "../single_url";
|
|||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
/**
|
||||
* Scrapes a URL with Fire-Engine
|
||||
|
@ -22,19 +23,23 @@ export async function scrapWithFireEngine({
|
|||
waitFor = 0,
|
||||
screenshot = false,
|
||||
fullPageScreenshot = false,
|
||||
pageOptions = { parsePDF: true },
|
||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
|
||||
fireEngineOptions = {},
|
||||
headers,
|
||||
options,
|
||||
priority,
|
||||
teamId,
|
||||
}: {
|
||||
url: string;
|
||||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
|
||||
fireEngineOptions?: FireEngineOptions;
|
||||
headers?: Record<string, string>;
|
||||
options?: any;
|
||||
priority?: number;
|
||||
teamId?: string;
|
||||
}): Promise<FireEngineResponse> {
|
||||
const logParams = {
|
||||
url,
|
||||
|
@ -49,11 +54,11 @@ export async function scrapWithFireEngine({
|
|||
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||
let waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||
|
||||
|
||||
let endpoint = "/scrape";
|
||||
|
@ -68,47 +73,101 @@ export async function scrapWithFireEngine({
|
|||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||
);
|
||||
|
||||
if (pageOptions?.useFastMode) {
|
||||
fireEngineOptionsParam.engine = "tlsclient";
|
||||
engine = "tlsclient";
|
||||
}
|
||||
|
||||
const response = await axios.post(
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
{
|
||||
url: url,
|
||||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
headers: headers,
|
||||
pageOptions: pageOptions,
|
||||
...fireEngineOptionsParam,
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
// atsv is only available for beta customers
|
||||
const betaCustomersString = process.env.BETA_CUSTOMERS;
|
||||
const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
|
||||
|
||||
if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
|
||||
fireEngineOptionsParam.atsv = true;
|
||||
} else {
|
||||
pageOptions.atsv = false;
|
||||
}
|
||||
|
||||
const axiosInstance = axios.create({
|
||||
headers: { "Content-Type": "application/json" }
|
||||
});
|
||||
|
||||
const startTime = Date.now();
|
||||
const _response = await Sentry.startSpan({
|
||||
name: "Call to fire-engine"
|
||||
}, async span => {
|
||||
return await axiosInstance.post(
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
{
|
||||
url: url,
|
||||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
headers: headers,
|
||||
pageOptions: pageOptions,
|
||||
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||
priority,
|
||||
engine,
|
||||
instantReturn: true,
|
||||
...fireEngineOptionsParam,
|
||||
},
|
||||
timeout: universalTimeout + waitParam,
|
||||
}
|
||||
);
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
...(Sentry.isInitialized() ? ({
|
||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||
"baggage": Sentry.spanToBaggageHeader(span),
|
||||
}) : {}),
|
||||
}
|
||||
}
|
||||
);
|
||||
});
|
||||
|
||||
if (response.status !== 200) {
|
||||
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
}
|
||||
|
||||
if (checkStatusResponse.data.processing) {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
|
||||
axiosInstance.delete(
|
||||
process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`, {
|
||||
validateStatus: (status) => true
|
||||
}
|
||||
).catch((error) => {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
|
||||
});
|
||||
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
|
||||
logParams.error_message = "Request timed out";
|
||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" };
|
||||
}
|
||||
|
||||
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
|
||||
Logger.debug(
|
||||
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`
|
||||
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}`
|
||||
);
|
||||
|
||||
logParams.error_message = response.data?.pageError;
|
||||
logParams.response_code = response.data?.pageStatusCode;
|
||||
logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
|
||||
logParams.response_code = checkStatusResponse.data?.pageStatusCode;
|
||||
|
||||
if(response.data && response.data?.pageStatusCode !== 200) {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`);
|
||||
if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
|
||||
}
|
||||
|
||||
const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;
|
||||
|
||||
return {
|
||||
html: "",
|
||||
screenshot: "",
|
||||
pageStatusCode: response.data?.pageStatusCode,
|
||||
pageError: response.data?.pageError,
|
||||
pageStatusCode,
|
||||
pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
|
||||
};
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
const contentType = checkStatusResponse.data.responseHeaders?.["content-type"];
|
||||
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
|
||||
url,
|
||||
|
@ -119,18 +178,19 @@ export async function scrapWithFireEngine({
|
|||
logParams.error_message = pageError;
|
||||
return { html: content, screenshot: "", pageStatusCode, pageError };
|
||||
} else {
|
||||
const data = response.data;
|
||||
const data = checkStatusResponse.data;
|
||||
|
||||
logParams.success =
|
||||
(data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
|
||||
data.pageStatusCode === 404;
|
||||
logParams.html = data.content ?? "";
|
||||
logParams.response_code = data.pageStatusCode;
|
||||
logParams.error_message = data.pageError;
|
||||
logParams.error_message = data.pageError ?? data.error;
|
||||
return {
|
||||
html: data.content ?? "",
|
||||
screenshot: data.screenshot ?? "",
|
||||
pageStatusCode: data.pageStatusCode,
|
||||
pageError: data.pageError,
|
||||
pageError: data.pageError ?? data.error,
|
||||
};
|
||||
}
|
||||
} catch (error) {
|
||||
|
|
|
@ -43,6 +43,9 @@ export async function scrapWithScrapingBee(
|
|||
transparent_status_code: "True",
|
||||
},
|
||||
});
|
||||
Logger.info(
|
||||
`⛏️ ScrapingBee: Scraping ${url}`
|
||||
);
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
logParams.success = true;
|
||||
|
|
|
@ -27,8 +27,8 @@ const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SC
|
|||
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||
|
||||
export const baseScrapers = [
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useScrapingBee ? "scrapingBee" : undefined,
|
||||
useFireEngine ? undefined : "playwright",
|
||||
useScrapingBee ? "scrapingBeeLoad" : undefined,
|
||||
|
@ -88,8 +88,8 @@ function getScrapingFallbackOrder(
|
|||
});
|
||||
|
||||
let defaultOrder = [
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useScrapingBee ? "scrapingBee" : undefined,
|
||||
useScrapingBee ? "scrapingBeeLoad" : undefined,
|
||||
useFireEngine ? undefined : "playwright",
|
||||
|
@ -125,20 +125,39 @@ function getScrapingFallbackOrder(
|
|||
export async function scrapSingleUrl(
|
||||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = {
|
||||
onlyMainContent: true,
|
||||
includeHtml: false,
|
||||
includeRawHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
headers: undefined,
|
||||
},
|
||||
extractorOptions: ExtractorOptions = {
|
||||
mode: "llm-extraction-from-markdown",
|
||||
},
|
||||
existingHtml: string = ""
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions?: ExtractorOptions,
|
||||
existingHtml?: string,
|
||||
priority?: number,
|
||||
teamId?: string
|
||||
): Promise<Document> {
|
||||
pageOptions = {
|
||||
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
||||
includeExtract: pageOptions.includeExtract ?? false,
|
||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||
includeHtml: pageOptions.includeHtml ?? false,
|
||||
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
||||
waitFor: pageOptions.waitFor ?? undefined,
|
||||
screenshot: pageOptions.screenshot ?? false,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
|
||||
headers: pageOptions.headers ?? undefined,
|
||||
includeLinks: pageOptions.includeLinks ?? true,
|
||||
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
parsePDF: pageOptions.parsePDF ?? true,
|
||||
removeTags: pageOptions.removeTags ?? [],
|
||||
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
|
||||
}
|
||||
|
||||
if (extractorOptions) {
|
||||
extractorOptions = {
|
||||
mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
|
||||
}
|
||||
}
|
||||
|
||||
if (!existingHtml) {
|
||||
existingHtml = "";
|
||||
}
|
||||
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
const attemptScraping = async (
|
||||
|
@ -166,7 +185,7 @@ export async function scrapSingleUrl(
|
|||
case "fire-engine;chrome-cdp":
|
||||
|
||||
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
|
||||
if(method === "fire-engine;chrome-cdp"){
|
||||
if (method === "fire-engine;chrome-cdp") {
|
||||
engine = "chrome-cdp";
|
||||
}
|
||||
|
||||
|
@ -180,7 +199,10 @@ export async function scrapSingleUrl(
|
|||
headers: pageOptions.headers,
|
||||
fireEngineOptions: {
|
||||
engine: engine,
|
||||
}
|
||||
atsv: pageOptions.atsv,
|
||||
},
|
||||
priority,
|
||||
teamId,
|
||||
});
|
||||
scraperResponse.text = response.html;
|
||||
scraperResponse.screenshot = response.screenshot;
|
||||
|
@ -339,11 +361,11 @@ export async function scrapSingleUrl(
|
|||
pageError = undefined;
|
||||
}
|
||||
|
||||
if (text && text.trim().length >= 100) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
|
||||
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
|
||||
break;
|
||||
}
|
||||
if (pageStatusCode && pageStatusCode == 404) {
|
||||
if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
|
||||
break;
|
||||
}
|
||||
|
@ -362,20 +384,22 @@ export async function scrapSingleUrl(
|
|||
|
||||
let linksOnPage: string[] | undefined;
|
||||
|
||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||
if (pageOptions.includeLinks) {
|
||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||
}
|
||||
|
||||
let document: Document;
|
||||
if (screenshot && screenshot.length > 0) {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||
? rawHtml
|
||||
: undefined,
|
||||
linksOnPage,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
screenshot: screenshot,
|
||||
|
@ -387,11 +411,11 @@ export async function scrapSingleUrl(
|
|||
} else {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||
? rawHtml
|
||||
: undefined,
|
||||
metadata: {
|
||||
|
@ -400,7 +424,7 @@ export async function scrapSingleUrl(
|
|||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
linksOnPage,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -414,9 +438,9 @@ export async function scrapSingleUrl(
|
|||
});
|
||||
return {
|
||||
content: "",
|
||||
markdown: "",
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
|
||||
html: "",
|
||||
linksOnPage: [],
|
||||
linksOnPage: pageOptions.includeLinks ? [] : undefined,
|
||||
metadata: {
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
|
|
|
@ -8,7 +8,6 @@ describe('Blocklist Functionality', () => {
|
|||
'https://twitter.com/home',
|
||||
'https://instagram.com/explore',
|
||||
'https://linkedin.com/in/johndoe',
|
||||
'https://pinterest.com/pin/create',
|
||||
'https://snapchat.com/add/johndoe',
|
||||
'https://tiktok.com/@johndoe',
|
||||
'https://reddit.com/r/funny',
|
||||
|
|
|
@ -8,7 +8,6 @@ describe('isUrlBlocked', () => {
|
|||
'https://twitter.com/someuser',
|
||||
'https://instagram.com/someuser',
|
||||
'https://www.linkedin.com/in/someuser',
|
||||
'https://pinterest.com/someuser',
|
||||
'https://snapchat.com/someuser',
|
||||
'https://tiktok.com/@someuser',
|
||||
'https://reddit.com/r/somesubreddit',
|
||||
|
|
|
@ -6,7 +6,6 @@ const socialMediaBlocklist = [
|
|||
'twitter.com',
|
||||
'instagram.com',
|
||||
'linkedin.com',
|
||||
'pinterest.com',
|
||||
'snapchat.com',
|
||||
'tiktok.com',
|
||||
'reddit.com',
|
||||
|
@ -15,6 +14,11 @@ const socialMediaBlocklist = [
|
|||
'whatsapp.com',
|
||||
'wechat.com',
|
||||
'telegram.org',
|
||||
'researchhub.com',
|
||||
'youtube.com',
|
||||
'corterix.com',
|
||||
'southwest.com',
|
||||
'ryanair.com'
|
||||
];
|
||||
|
||||
const allowedKeywords = [
|
||||
|
|
|
@ -1,24 +1,11 @@
|
|||
export const urlSpecificParams = {
|
||||
"platform.openai.com": {
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
cookies: {
|
||||
__cf_bm:
|
||||
"mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
wait: 3000,
|
||||
fireEngineOptions:{
|
||||
engine: "chrome-cdp"
|
||||
},
|
||||
},
|
||||
},
|
||||
"support.greenpay.me":{
|
||||
|
@ -247,5 +234,13 @@ export const urlSpecificParams = {
|
|||
engine: "tlsclient",
|
||||
},
|
||||
},
|
||||
},
|
||||
"zoopla.co.uk":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "chrome-cdp",
|
||||
},
|
||||
},
|
||||
}
|
||||
};
|
||||
|
|
|
@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
|||
description = soup('meta[name="description"]').attr("content") || null;
|
||||
|
||||
// Assuming the language is part of the URL as per the regex pattern
|
||||
const pattern = /([a-zA-Z]+-[A-Z]{2})/;
|
||||
const match = pattern.exec(url);
|
||||
language = match ? match[1] : null;
|
||||
language = soup('html').attr('lang') || null;
|
||||
|
||||
keywords = soup('meta[name="keywords"]').attr("content") || null;
|
||||
robots = soup('meta[name="robots"]').attr("content") || null;
|
||||
|
|
|
@ -41,10 +41,10 @@ export function extractLinks(html: string, baseUrl: string): string[] {
|
|||
links.push(href);
|
||||
} else if (href.startsWith('/')) {
|
||||
// Relative URL starting with '/', append to origin
|
||||
links.push(`${origin}${href}`);
|
||||
links.push(new URL(href, baseUrl).href);
|
||||
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
|
||||
// Relative URL not starting with '/', append to base URL
|
||||
links.push(`${baseUrl}/${href}`);
|
||||
links.push(new URL(href, baseUrl).href);
|
||||
} else if (href.startsWith('mailto:')) {
|
||||
// mailto: links, add as is
|
||||
links.push(href);
|
||||
|
|
45
apps/api/src/search/fireEngine.ts
Normal file
45
apps/api/src/search/fireEngine.ts
Normal file
|
@ -0,0 +1,45 @@
|
|||
import axios from "axios";
|
||||
import dotenv from "dotenv";
|
||||
import { SearchResult } from "../../src/lib/entities";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
export async function fireEngineMap(q: string, options: {
|
||||
tbs?: string;
|
||||
filter?: string;
|
||||
lang?: string;
|
||||
country?: string;
|
||||
location?: string;
|
||||
numResults: number;
|
||||
page?: number;
|
||||
}): Promise<SearchResult[]> {
|
||||
let data = JSON.stringify({
|
||||
query: q,
|
||||
lang: options.lang,
|
||||
country: options.country,
|
||||
location: options.location,
|
||||
tbs: options.tbs,
|
||||
numResults: options.numResults,
|
||||
page: options.page ?? 1,
|
||||
});
|
||||
|
||||
if (!process.env.FIRE_ENGINE_BETA_URL) {
|
||||
console.warn("(v1/map Beta) Results might differ from cloud offering currently.");
|
||||
return [];
|
||||
}
|
||||
|
||||
let config = {
|
||||
method: "POST",
|
||||
url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
data: data,
|
||||
};
|
||||
const response = await axios(config);
|
||||
if (response && response) {
|
||||
return response.data
|
||||
} else {
|
||||
return [];
|
||||
}
|
||||
}
|
|
@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
|
|||
|
||||
|
||||
|
||||
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
|
||||
export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
|
||||
let proxies = null;
|
||||
if (proxy) {
|
||||
if (proxy.startsWith("https")) {
|
||||
|
|
|
@ -1,11 +1,9 @@
|
|||
import { Logger } from "../../src/lib/logger";
|
||||
import { SearchResult } from "../../src/lib/entities";
|
||||
import { google_search } from "./googlesearch";
|
||||
import { googleSearch } from "./googlesearch";
|
||||
import { fireEngineMap } from "./fireEngine";
|
||||
import { serper_search } from "./serper";
|
||||
|
||||
|
||||
|
||||
|
||||
export async function search({
|
||||
query,
|
||||
advanced = false,
|
||||
|
@ -30,12 +28,20 @@ export async function search({
|
|||
proxy?: string;
|
||||
sleep_interval?: number;
|
||||
timeout?: number;
|
||||
}) : Promise<SearchResult[]> {
|
||||
}): Promise<SearchResult[]> {
|
||||
try {
|
||||
if (process.env.SERPER_API_KEY ) {
|
||||
return await serper_search(query, {num_results, tbs, filter, lang, country, location});
|
||||
|
||||
if (process.env.SERPER_API_KEY) {
|
||||
return await serper_search(query, {
|
||||
num_results,
|
||||
tbs,
|
||||
filter,
|
||||
lang,
|
||||
country,
|
||||
location,
|
||||
});
|
||||
}
|
||||
return await google_search(
|
||||
return await googleSearch(
|
||||
query,
|
||||
advanced,
|
||||
num_results,
|
||||
|
@ -49,7 +55,6 @@ export async function search({
|
|||
);
|
||||
} catch (error) {
|
||||
Logger.error(`Error in search function: ${error}`);
|
||||
return []
|
||||
return [];
|
||||
}
|
||||
// if process.env.SERPER_API_KEY is set, use serper
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getWebScraperQueue } from "../queue-service";
|
||||
import { getScrapeQueue } from "../queue-service";
|
||||
import { sendSlackWebhook } from "./slack";
|
||||
|
||||
export async function checkAlerts() {
|
||||
|
@ -13,8 +13,8 @@ export async function checkAlerts() {
|
|||
Logger.info("Initializing alerts");
|
||||
const checkActiveJobs = async () => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const activeJobs = await webScraperQueue.getActiveCount();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const activeJobs = await scrapeQueue.getActiveCount();
|
||||
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
|
||||
Logger.warn(
|
||||
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
|
||||
|
@ -34,11 +34,10 @@ export async function checkAlerts() {
|
|||
};
|
||||
|
||||
const checkWaitingQueue = async () => {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const waitingJobs = await webScraperQueue.getWaitingCount();
|
||||
const paused = await webScraperQueue.getPausedCount();
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
const waitingJobs = await scrapeQueue.getWaitingCount();
|
||||
|
||||
if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
|
||||
if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
|
||||
Logger.warn(
|
||||
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
|
||||
);
|
||||
|
|
|
@ -3,9 +3,9 @@ import { Logger } from "../../../src/lib/logger";
|
|||
|
||||
export async function sendSlackWebhook(
|
||||
message: string,
|
||||
alertEveryone: boolean = false
|
||||
alertEveryone: boolean = false,
|
||||
webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? ""
|
||||
) {
|
||||
const webhookUrl = process.env.SLACK_WEBHOOK_URL;
|
||||
const messagePrefix = alertEveryone ? "<!channel> " : "";
|
||||
const payload = {
|
||||
text: `${messagePrefix} ${message}`,
|
||||
|
|
|
@ -4,37 +4,12 @@ import { sendNotification } from "../notification/email_notification";
|
|||
import { supabase_service } from "../supabase";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getValue, setValue } from "../redis";
|
||||
import Redlock from "redlock";
|
||||
import Client from "ioredis";
|
||||
import { redlock } from "../redlock";
|
||||
|
||||
|
||||
const FREE_CREDITS = 500;
|
||||
|
||||
const redlock = new Redlock(
|
||||
// You should have one client for each independent redis node
|
||||
// or cluster.
|
||||
[new Client(process.env.REDIS_RATE_LIMIT_URL)],
|
||||
{
|
||||
// The expected clock drift; for more details see:
|
||||
// http://redis.io/topics/distlock
|
||||
driftFactor: 0.01, // multiplied by lock ttl to determine drift time
|
||||
|
||||
// The max number of times Redlock will attempt to lock a resource
|
||||
// before erroring.
|
||||
retryCount: 5,
|
||||
|
||||
// the time in ms between attempts
|
||||
retryDelay: 100, // time in ms
|
||||
|
||||
// the max time in ms randomly added to retries
|
||||
// to improve performance under high contention
|
||||
// see https://www.awsarchitectureblog.com/2015/03/backoff.html
|
||||
retryJitter: 200, // time in ms
|
||||
|
||||
// The minimum remaining time on a lock before an extension is automatically
|
||||
// attempted with the `using` API.
|
||||
automaticExtensionThreshold: 500, // time in ms
|
||||
}
|
||||
);
|
||||
export async function billTeam(team_id: string, credits: number) {
|
||||
return withAuth(supaBillTeam)(team_id, credits);
|
||||
}
|
||||
|
@ -193,10 +168,11 @@ export async function supaBillTeam(team_id: string, credits: number) {
|
|||
export async function checkTeamCredits(team_id: string, credits: number) {
|
||||
return withAuth(supaCheckTeamCredits)(team_id, credits);
|
||||
}
|
||||
|
||||
// if team has enough credits for the operation, return true, else return false
|
||||
export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
||||
if (team_id === "preview") {
|
||||
return { success: true, message: "Preview team, no credits used" };
|
||||
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
|
||||
}
|
||||
|
||||
// Retrieve the team's active subscription and check for available coupons concurrently
|
||||
|
@ -223,21 +199,44 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Free credits, no coupons
|
||||
if (subscriptionError || !subscription) {
|
||||
if (!subscription || subscriptionError) {
|
||||
|
||||
// If there is no active subscription but there are available coupons
|
||||
if (couponCredits >= credits) {
|
||||
return { success: true, message: "Sufficient credits available" };
|
||||
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
|
||||
}
|
||||
|
||||
const { data: creditUsages, error: creditUsageError } =
|
||||
await supabase_service
|
||||
let creditUsages;
|
||||
let creditUsageError;
|
||||
let retries = 0;
|
||||
const maxRetries = 3;
|
||||
const retryInterval = 2000; // 2 seconds
|
||||
|
||||
while (retries < maxRetries) {
|
||||
const result = await supabase_service
|
||||
.from("credit_usage")
|
||||
.select("credits_used")
|
||||
.is("subscription_id", null)
|
||||
.eq("team_id", team_id);
|
||||
|
||||
creditUsages = result.data;
|
||||
creditUsageError = result.error;
|
||||
|
||||
if (!creditUsageError) {
|
||||
break;
|
||||
}
|
||||
|
||||
retries++;
|
||||
if (retries < maxRetries) {
|
||||
await new Promise(resolve => setTimeout(resolve, retryInterval));
|
||||
}
|
||||
}
|
||||
|
||||
if (creditUsageError) {
|
||||
Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
|
||||
throw new Error(
|
||||
`Failed to retrieve credit usage for team_id: ${team_id}`
|
||||
);
|
||||
|
@ -277,9 +276,10 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
return {
|
||||
success: false,
|
||||
message: "Insufficient credits, please upgrade!",
|
||||
remainingCredits: FREE_CREDITS - totalCreditsUsed
|
||||
};
|
||||
}
|
||||
return { success: true, message: "Sufficient credits available" };
|
||||
return { success: true, message: "Sufficient credits available", remainingCredits: FREE_CREDITS - totalCreditsUsed };
|
||||
}
|
||||
|
||||
let totalCreditsUsed = 0;
|
||||
|
@ -340,24 +340,24 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
|
||||
// Compare the adjusted total credits used with the credits allowed by the plan
|
||||
if (adjustedCreditsUsed + credits > price.credits) {
|
||||
await sendNotification(
|
||||
team_id,
|
||||
NotificationType.LIMIT_REACHED,
|
||||
subscription.current_period_start,
|
||||
subscription.current_period_end
|
||||
);
|
||||
return { success: false, message: "Insufficient credits, please upgrade!" };
|
||||
// await sendNotification(
|
||||
// team_id,
|
||||
// NotificationType.LIMIT_REACHED,
|
||||
// subscription.current_period_start,
|
||||
// subscription.current_period_end
|
||||
// );
|
||||
return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
|
||||
} else if (creditUsagePercentage >= 0.8) {
|
||||
// Send email notification for approaching credit limit
|
||||
await sendNotification(
|
||||
team_id,
|
||||
NotificationType.APPROACHING_LIMIT,
|
||||
subscription.current_period_start,
|
||||
subscription.current_period_end
|
||||
);
|
||||
// await sendNotification(
|
||||
// team_id,
|
||||
// NotificationType.APPROACHING_LIMIT,
|
||||
// subscription.current_period_start,
|
||||
// subscription.current_period_end
|
||||
// );
|
||||
}
|
||||
|
||||
return { success: true, message: "Sufficient credits available" };
|
||||
return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };
|
||||
}
|
||||
|
||||
// Count the total credits used by a team within the current billing period and return the remaining credits.
|
||||
|
|
|
@ -41,10 +41,11 @@ export async function logJob(job: FirecrawlJob) {
|
|||
extractor_options: job.extractor_options,
|
||||
num_tokens: job.num_tokens,
|
||||
retry: !!job.retry,
|
||||
crawl_id: job.crawl_id,
|
||||
},
|
||||
]);
|
||||
|
||||
if (process.env.POSTHOG_API_KEY) {
|
||||
if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
|
||||
let phLog = {
|
||||
distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
|
||||
...(job.team_id !== "preview" && {
|
||||
|
|
|
@ -1,17 +1,72 @@
|
|||
import { Job, Queue } from "bull";
|
||||
import {
|
||||
getWebScraperQueue,
|
||||
} from "./queue-service";
|
||||
import { Job, Queue } from "bullmq";
|
||||
import { getScrapeQueue } from "./queue-service";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { WebScraperOptions } from "../types";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function addWebScraperJob(
|
||||
webScraperOptions: WebScraperOptions,
|
||||
options: any = {}
|
||||
async function addScrapeJobRaw(
|
||||
webScraperOptions: any,
|
||||
options: any,
|
||||
jobId: string,
|
||||
jobPriority: number = 10
|
||||
): Promise<Job> {
|
||||
return await getWebScraperQueue().add(webScraperOptions, {
|
||||
return await getScrapeQueue().add(jobId, webScraperOptions, {
|
||||
...options,
|
||||
jobId: uuidv4(),
|
||||
priority: jobPriority,
|
||||
jobId,
|
||||
});
|
||||
}
|
||||
|
||||
export async function addScrapeJob(
|
||||
webScraperOptions: WebScraperOptions,
|
||||
options: any = {},
|
||||
jobId: string = uuidv4(),
|
||||
jobPriority: number = 10
|
||||
): Promise<Job> {
|
||||
|
||||
if (Sentry.isInitialized()) {
|
||||
const size = JSON.stringify(webScraperOptions).length;
|
||||
return await Sentry.startSpan({
|
||||
name: "Add scrape job",
|
||||
op: "queue.publish",
|
||||
attributes: {
|
||||
"messaging.message.id": jobId,
|
||||
"messaging.destination.name": getScrapeQueue().name,
|
||||
"messaging.message.body.size": size,
|
||||
},
|
||||
}, async (span) => {
|
||||
return await addScrapeJobRaw({
|
||||
...webScraperOptions,
|
||||
sentry: {
|
||||
trace: Sentry.spanToTraceHeader(span),
|
||||
baggage: Sentry.spanToBaggageHeader(span),
|
||||
size,
|
||||
},
|
||||
}, options, jobId, jobPriority);
|
||||
});
|
||||
} else {
|
||||
return await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority);
|
||||
}
|
||||
}
|
||||
|
||||
export function waitForJob(jobId: string, timeout: number) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const start = Date.now();
|
||||
const int = setInterval(async () => {
|
||||
if (Date.now() >= start + timeout) {
|
||||
clearInterval(int);
|
||||
reject(new Error("Job wait "));
|
||||
} else {
|
||||
const state = await getScrapeQueue().getJobState(jobId);
|
||||
if (state === "completed") {
|
||||
clearInterval(int);
|
||||
resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
|
||||
} else if (state === "failed") {
|
||||
// console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason);
|
||||
clearInterval(int);
|
||||
reject((await getScrapeQueue().getJob(jobId)).failedReason);
|
||||
}
|
||||
}
|
||||
}, 1000);
|
||||
})
|
||||
}
|
||||
|
|
|
@ -1,23 +1,40 @@
|
|||
import Queue from "bull";
|
||||
import { Queue as BullQueue } from "bull";
|
||||
import { Queue } from "bullmq";
|
||||
import { Logger } from "../lib/logger";
|
||||
import IORedis from "ioredis";
|
||||
|
||||
let webScraperQueue: BullQueue;
|
||||
let scrapeQueue: Queue;
|
||||
|
||||
export function getWebScraperQueue() {
|
||||
if (!webScraperQueue) {
|
||||
webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
|
||||
settings: {
|
||||
lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
|
||||
lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
|
||||
stalledInterval: 30 * 1000,
|
||||
maxStalledCount: 10,
|
||||
},
|
||||
defaultJobOptions:{
|
||||
attempts: 2
|
||||
export const redisConnection = new IORedis(process.env.REDIS_URL, {
|
||||
maxRetriesPerRequest: null,
|
||||
});
|
||||
|
||||
export const scrapeQueueName = "{scrapeQueue}";
|
||||
|
||||
export function getScrapeQueue() {
|
||||
if (!scrapeQueue) {
|
||||
scrapeQueue = new Queue(
|
||||
scrapeQueueName,
|
||||
{
|
||||
connection: redisConnection,
|
||||
}
|
||||
});
|
||||
// {
|
||||
// settings: {
|
||||
// lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
|
||||
// lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
|
||||
// stalledInterval: 30 * 1000,
|
||||
// maxStalledCount: 10,
|
||||
// },
|
||||
// defaultJobOptions:{
|
||||
// attempts: 5
|
||||
// }
|
||||
// }
|
||||
);
|
||||
Logger.info("Web scraper queue created");
|
||||
}
|
||||
return webScraperQueue;
|
||||
return scrapeQueue;
|
||||
}
|
||||
|
||||
|
||||
// === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE
|
||||
// import { QueueEvents } from 'bullmq';
|
||||
// export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() });
|
|
@ -1,74 +1,456 @@
|
|||
import { CustomError } from "../lib/custom-error";
|
||||
import { getWebScraperQueue } from "./queue-service";
|
||||
import "dotenv/config";
|
||||
import "./sentry";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { CustomError } from "../lib/custom-error";
|
||||
import {
|
||||
getScrapeQueue,
|
||||
redisConnection,
|
||||
scrapeQueueName,
|
||||
} from "./queue-service";
|
||||
import { logtail } from "./logtail";
|
||||
import { startWebScraperPipeline } from "../main/runWebScraper";
|
||||
import { callWebhook } from "./webhook";
|
||||
import { logJob } from "./logging/log_job";
|
||||
import { initSDK } from '@hyperdx/node-opentelemetry';
|
||||
import { Job } from "bull";
|
||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||
import { Job } from "bullmq";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
import { Worker } from "bullmq";
|
||||
import systemMonitor from "./system-monitor";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import {
|
||||
addCrawlJob,
|
||||
addCrawlJobDone,
|
||||
crawlToCrawler,
|
||||
finishCrawl,
|
||||
getCrawl,
|
||||
getCrawlJobs,
|
||||
lockURL,
|
||||
} from "../lib/crawl-redis";
|
||||
import { StoredCrawl } from "../lib/crawl-redis";
|
||||
import { addScrapeJob } from "./queue-jobs";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import {
|
||||
addJobPriority,
|
||||
deleteJobPriority,
|
||||
getJobPriority,
|
||||
} from "../../src/lib/job-priority";
|
||||
import { PlanType } from "../types";
|
||||
import { getJobs } from "../../src/controllers/v1/crawl-status";
|
||||
|
||||
if (process.env.ENV === 'production') {
|
||||
if (process.env.ENV === "production") {
|
||||
initSDK({
|
||||
consoleCapture: true,
|
||||
additionalInstrumentations: [],
|
||||
});
|
||||
}
|
||||
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
|
||||
|
||||
const wsq = getWebScraperQueue();
|
||||
const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
|
||||
const workerStalledCheckInterval =
|
||||
Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
|
||||
const jobLockExtendInterval =
|
||||
Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
|
||||
const jobLockExtensionTime =
|
||||
Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
|
||||
|
||||
async function processJob(job: Job, done) {
|
||||
const cantAcceptConnectionInterval =
|
||||
Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
|
||||
const connectionMonitorInterval =
|
||||
Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
|
||||
const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
|
||||
|
||||
const processJobInternal = async (token: string, job: Job) => {
|
||||
const extendLockInterval = setInterval(async () => {
|
||||
Logger.info(`🐂 Worker extending lock on job ${job.id}`);
|
||||
await job.extendLock(token, jobLockExtensionTime);
|
||||
}, jobLockExtendInterval);
|
||||
|
||||
await addJobPriority(job.data.team_id, job.id);
|
||||
let err = null;
|
||||
try {
|
||||
const result = await processJob(job, token);
|
||||
try {
|
||||
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
await job.moveToCompleted(null, token, false);
|
||||
} else {
|
||||
await job.moveToCompleted(result.docs, token, false);
|
||||
}
|
||||
} catch (e) {}
|
||||
} catch (error) {
|
||||
console.log("Job failed, error:", error);
|
||||
Sentry.captureException(error);
|
||||
err = error;
|
||||
await job.moveToFailed(error, token, false);
|
||||
} finally {
|
||||
await deleteJobPriority(job.data.team_id, job.id);
|
||||
clearInterval(extendLockInterval);
|
||||
}
|
||||
|
||||
return err;
|
||||
};
|
||||
|
||||
let isShuttingDown = false;
|
||||
|
||||
process.on("SIGINT", () => {
|
||||
console.log("Received SIGINT. Shutting down gracefully...");
|
||||
isShuttingDown = true;
|
||||
});
|
||||
|
||||
const workerFun = async (
|
||||
queueName: string,
|
||||
processJobInternal: (token: string, job: Job) => Promise<any>
|
||||
) => {
|
||||
const worker = new Worker(queueName, null, {
|
||||
connection: redisConnection,
|
||||
lockDuration: 1 * 60 * 1000, // 1 minute
|
||||
// lockRenewTime: 15 * 1000, // 15 seconds
|
||||
stalledInterval: 30 * 1000, // 30 seconds
|
||||
maxStalledCount: 10, // 10 times
|
||||
});
|
||||
|
||||
worker.startStalledCheckTimer();
|
||||
|
||||
const monitor = await systemMonitor;
|
||||
|
||||
while (true) {
|
||||
if (isShuttingDown) {
|
||||
console.log("No longer accepting new jobs. SIGINT");
|
||||
break;
|
||||
}
|
||||
const token = uuidv4();
|
||||
const canAcceptConnection = await monitor.acceptConnection();
|
||||
if (!canAcceptConnection) {
|
||||
console.log("Cant accept connection");
|
||||
await sleep(cantAcceptConnectionInterval); // more sleep
|
||||
continue;
|
||||
}
|
||||
|
||||
const job = await worker.getNextJob(token);
|
||||
if (job) {
|
||||
if (job.data && job.data.sentry && Sentry.isInitialized()) {
|
||||
Sentry.continueTrace(
|
||||
{
|
||||
sentryTrace: job.data.sentry.trace,
|
||||
baggage: job.data.sentry.baggage,
|
||||
},
|
||||
() => {
|
||||
Sentry.startSpan(
|
||||
{
|
||||
name: "Scrape job",
|
||||
attributes: {
|
||||
job: job.id,
|
||||
worker: process.env.FLY_MACHINE_ID ?? worker.id,
|
||||
},
|
||||
},
|
||||
async (span) => {
|
||||
await Sentry.startSpan(
|
||||
{
|
||||
name: "Process scrape job",
|
||||
op: "queue.process",
|
||||
attributes: {
|
||||
"messaging.message.id": job.id,
|
||||
"messaging.destination.name": getScrapeQueue().name,
|
||||
"messaging.message.body.size": job.data.sentry.size,
|
||||
"messaging.message.receive.latency":
|
||||
Date.now() - (job.processedOn ?? job.timestamp),
|
||||
"messaging.message.retry.count": job.attemptsMade,
|
||||
},
|
||||
},
|
||||
async () => {
|
||||
const res = await processJobInternal(token, job);
|
||||
if (res !== null) {
|
||||
span.setStatus({ code: 2 }); // ERROR
|
||||
} else {
|
||||
span.setStatus({ code: 1 }); // OK
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
);
|
||||
}
|
||||
);
|
||||
} else {
|
||||
Sentry.startSpan(
|
||||
{
|
||||
name: "Scrape job",
|
||||
attributes: {
|
||||
job: job.id,
|
||||
worker: process.env.FLY_MACHINE_ID ?? worker.id,
|
||||
},
|
||||
},
|
||||
() => {
|
||||
processJobInternal(token, job);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
await sleep(gotJobInterval);
|
||||
} else {
|
||||
await sleep(connectionMonitorInterval);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
workerFun(scrapeQueueName, processJobInternal);
|
||||
|
||||
async function processJob(job: Job, token: string) {
|
||||
Logger.info(`🐂 Worker taking job ${job.id}`);
|
||||
|
||||
// Check if the job URL is researchhub and block it immediately
|
||||
// TODO: remove this once solve the root issue
|
||||
if (
|
||||
job.data.url &&
|
||||
(job.data.url.includes("researchhub.com") ||
|
||||
job.data.url.includes("ebay.com") ||
|
||||
job.data.url.includes("youtube.com") ||
|
||||
job.data.url.includes("microsoft.com"))
|
||||
) {
|
||||
Logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
|
||||
const data = {
|
||||
success: false,
|
||||
docs: [],
|
||||
project_id: job.data.project_id,
|
||||
error:
|
||||
"URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
|
||||
};
|
||||
await job.moveToCompleted(data.docs, token, false);
|
||||
return data;
|
||||
}
|
||||
|
||||
try {
|
||||
job.progress({
|
||||
job.updateProgress({
|
||||
current: 1,
|
||||
total: 100,
|
||||
current_step: "SCRAPING",
|
||||
current_url: "",
|
||||
});
|
||||
const start = Date.now();
|
||||
const { success, message, docs } = await startWebScraperPipeline({ job });
|
||||
|
||||
const { success, message, docs } = await startWebScraperPipeline({
|
||||
job,
|
||||
token,
|
||||
});
|
||||
|
||||
// Better if we throw here so we capture with the correct error
|
||||
if (!success) {
|
||||
throw new Error(message);
|
||||
}
|
||||
const end = Date.now();
|
||||
const timeTakenInSeconds = (end - start) / 1000;
|
||||
|
||||
const rawHtml = docs[0] ? docs[0].rawHtml : "";
|
||||
|
||||
const data = {
|
||||
success: success,
|
||||
success,
|
||||
result: {
|
||||
links: docs.map((doc) => {
|
||||
return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
|
||||
return {
|
||||
content: doc,
|
||||
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
|
||||
};
|
||||
}),
|
||||
},
|
||||
project_id: job.data.project_id,
|
||||
error: message /* etc... */,
|
||||
docs,
|
||||
};
|
||||
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
// No idea what this does and when it is called.
|
||||
if (job.data.mode === "crawl" && !job.data.v1) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.id as string,
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1
|
||||
);
|
||||
}
|
||||
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
|
||||
await callWebhook(
|
||||
job.data.team_id,
|
||||
job.data.crawl_id,
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.page",
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
if (job.data.crawl_id) {
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: success,
|
||||
message: message,
|
||||
num_docs: docs.length,
|
||||
docs: docs,
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: job.data.team_id,
|
||||
mode: job.data.mode,
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
crawl_id: job.data.crawl_id,
|
||||
});
|
||||
|
||||
await addCrawlJobDone(job.data.crawl_id, job.id);
|
||||
|
||||
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
|
||||
|
||||
if (!job.data.sitemapped) {
|
||||
if (!sc.cancelled) {
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||
|
||||
const links = crawler.filterLinks(
|
||||
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
|
||||
Infinity,
|
||||
sc.crawlerOptions?.maxDepth ?? 10
|
||||
);
|
||||
|
||||
for (const link of links) {
|
||||
if (await lockURL(job.data.crawl_id, sc, link)) {
|
||||
// This seems to work really welel
|
||||
const jobPriority = await getJobPriority({
|
||||
plan: sc.plan as PlanType,
|
||||
team_id: sc.team_id,
|
||||
basePriority: job.data.crawl_id ? 20 : 10,
|
||||
});
|
||||
const jobId = uuidv4();
|
||||
|
||||
// console.log("plan: ", sc.plan);
|
||||
// console.log("team_id: ", sc.team_id)
|
||||
// console.log("base priority: ", job.data.crawl_id ? 20 : 10)
|
||||
// console.log("job priority: " , jobPriority, "\n\n\n")
|
||||
|
||||
const newJob = await addScrapeJob(
|
||||
{
|
||||
url: link,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
team_id: sc.team_id,
|
||||
pageOptions: sc.pageOptions,
|
||||
origin: job.data.origin,
|
||||
crawl_id: job.data.crawl_id,
|
||||
v1: job.data.v1,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
jobPriority
|
||||
);
|
||||
|
||||
await addCrawlJob(job.data.crawl_id, newJob.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (await finishCrawl(job.data.crawl_id)) {
|
||||
|
||||
|
||||
if (!job.data.v1) {
|
||||
const jobIDs = await getCrawlJobs(job.data.crawl_id);
|
||||
|
||||
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobStatuses = await Promise.all(jobs.map((x) => x.getState()));
|
||||
const jobStatus =
|
||||
sc.cancelled || jobStatuses.some((x) => x === "failed")
|
||||
? "failed"
|
||||
: "completed";
|
||||
|
||||
const fullDocs = jobs.map((x) =>
|
||||
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
|
||||
);
|
||||
|
||||
await logJob({
|
||||
job_id: job.data.crawl_id,
|
||||
success: jobStatus === "completed",
|
||||
message: sc.cancelled ? "Cancelled" : message,
|
||||
num_docs: fullDocs.length,
|
||||
docs: [],
|
||||
time_taken: (Date.now() - sc.createdAt) / 1000,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: sc.originUrl,
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
pageOptions: sc.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
|
||||
const data = {
|
||||
success: jobStatus !== "failed",
|
||||
result: {
|
||||
links: fullDocs.map((doc) => {
|
||||
return {
|
||||
content: doc,
|
||||
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
|
||||
};
|
||||
}),
|
||||
},
|
||||
project_id: job.data.project_id,
|
||||
error: message /* etc... */,
|
||||
docs: fullDocs,
|
||||
};
|
||||
|
||||
// v0 web hooks, call when done with all the data
|
||||
if (!job.data.v1) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.data.crawl_id,
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.completed"
|
||||
);
|
||||
}
|
||||
} else {
|
||||
const jobIDs = await getCrawlJobs(job.data.crawl_id);
|
||||
const jobStatuses = await Promise.all(jobIDs.map((x) => getScrapeQueue().getJobState(x)));
|
||||
const jobStatus =
|
||||
sc.cancelled || jobStatuses.some((x) => x === "failed")
|
||||
? "failed"
|
||||
: "completed";
|
||||
|
||||
// v1 web hooks, call when done with no data, but with event completed
|
||||
if (job.data.v1 && job.data.webhook) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.data.crawl_id,
|
||||
[],
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.completed"
|
||||
);
|
||||
}
|
||||
|
||||
await logJob({
|
||||
job_id: job.data.crawl_id,
|
||||
success: jobStatus === "completed",
|
||||
message: sc.cancelled ? "Cancelled" : message,
|
||||
num_docs: jobIDs.length,
|
||||
docs: [],
|
||||
time_taken: (Date.now() - sc.createdAt) / 1000,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: sc.originUrl,
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
pageOptions: sc.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: success,
|
||||
message: message,
|
||||
num_docs: docs.length,
|
||||
docs: docs,
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
Logger.info(`🐂 Job done ${job.id}`);
|
||||
done(null, data);
|
||||
return data;
|
||||
} catch (error) {
|
||||
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
|
||||
if (await getWebScraperQueue().isPaused(false)) {
|
||||
Logger.debug("🐂Queue is paused, ignoring");
|
||||
return;
|
||||
}
|
||||
|
||||
Sentry.captureException(error, {
|
||||
data: {
|
||||
job: job.id,
|
||||
},
|
||||
});
|
||||
|
||||
if (error instanceof CustomError) {
|
||||
// Here we handle the error, then save the failed job
|
||||
|
@ -81,6 +463,9 @@ async function processJob(job: Job, done) {
|
|||
});
|
||||
}
|
||||
Logger.error(error);
|
||||
if (error.stack) {
|
||||
Logger.error(error.stack);
|
||||
}
|
||||
|
||||
logtail.error("Overall error ingesting", {
|
||||
job_id: job.id,
|
||||
|
@ -89,37 +474,87 @@ async function processJob(job: Job, done) {
|
|||
|
||||
const data = {
|
||||
success: false,
|
||||
docs: [],
|
||||
project_id: job.data.project_id,
|
||||
error:
|
||||
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
|
||||
};
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: false,
|
||||
message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
done(null, data);
|
||||
|
||||
if (!job.data.v1 && (job.data.mode === "crawl" || job.data.crawl_id)) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.data.crawl_id ?? (job.id as string),
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1
|
||||
);
|
||||
}
|
||||
if (job.data.v1) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.id as string,
|
||||
[],
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.failed"
|
||||
);
|
||||
}
|
||||
|
||||
if (job.data.crawl_id) {
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: false,
|
||||
message:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error.message ??
|
||||
"Something went wrong... Contact help@mendable.ai",
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
team_id: job.data.team_id,
|
||||
mode: job.data.mode,
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
crawl_id: job.data.crawl_id,
|
||||
});
|
||||
|
||||
const sc = await getCrawl(job.data.crawl_id);
|
||||
|
||||
await logJob({
|
||||
job_id: job.data.crawl_id,
|
||||
success: false,
|
||||
message:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error.message ??
|
||||
"Something went wrong... Contact help@mendable.ai",
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: sc ? sc.originUrl : job.data.url,
|
||||
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
|
||||
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
}
|
||||
// done(null, data);
|
||||
return data;
|
||||
}
|
||||
}
|
||||
|
||||
wsq.process(
|
||||
Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
|
||||
processJob
|
||||
);
|
||||
// wsq.process(
|
||||
// Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
|
||||
// processJob
|
||||
// );
|
||||
|
||||
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
// wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
// wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
// wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
// wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
// wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
// wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
|
|
|
@ -65,7 +65,7 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someToken",
|
||||
"standard"
|
||||
);
|
||||
expect(limiter2.points).toBe(50);
|
||||
expect(limiter2.points).toBe(100);
|
||||
|
||||
const limiter3 = getRateLimiter(
|
||||
"search" as RateLimiterMode,
|
||||
|
@ -79,7 +79,7 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someToken",
|
||||
"growth"
|
||||
);
|
||||
expect(limiter4.points).toBe(150);
|
||||
expect(limiter4.points).toBe(250);
|
||||
});
|
||||
|
||||
it("should return the default rate limiter if plan is not provided", () => {
|
||||
|
@ -153,7 +153,7 @@ describe("Rate Limiter Service", () => {
|
|||
"crawlStatus" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(150);
|
||||
expect(limiter2.points).toBe(250);
|
||||
});
|
||||
|
||||
it("should consume points correctly for 'crawl' mode", async () => {
|
||||
|
@ -188,14 +188,13 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someTokenXY",
|
||||
"hobby"
|
||||
);
|
||||
// expect hobby to have 100 points
|
||||
expect(limiter.points).toBe(10);
|
||||
expect(limiter.points).toBe(20);
|
||||
|
||||
const consumePoints = 5;
|
||||
|
||||
const res = await limiter.consume("test-prefix:someTokenXY", consumePoints);
|
||||
expect(res.consumedPoints).toBe(5);
|
||||
expect(res.remainingPoints).toBe(5);
|
||||
expect(res.remainingPoints).toBe(15);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'crawl' mode", () => {
|
||||
|
@ -227,7 +226,7 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(5);
|
||||
expect(limiter.points).toBe(10);
|
||||
|
||||
const limiter2 = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
|
@ -241,7 +240,14 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someToken",
|
||||
"standard"
|
||||
);
|
||||
expect(limiter3.points).toBe(50);
|
||||
expect(limiter3.points).toBe(100);
|
||||
|
||||
const limiter4 = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"growth"
|
||||
);
|
||||
expect(limiter4.points).toBe(1000);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'search' mode", () => {
|
||||
|
@ -309,7 +315,7 @@ describe("Rate Limiter Service", () => {
|
|||
"crawlStatus" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(150);
|
||||
expect(limiter2.points).toBe(250);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'testSuite' mode", () => {
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user