Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)

Merge branch 'main' into o1-crawler
Commit 2619522fe7
.github/ISSUE_TEMPLATE/bug_report.md (vendored, 2 changes)

@@ -1,7 +1,7 @@
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
title: "[Bug] "
labels: bug
assignees: ''

.github/ISSUE_TEMPLATE/self_host_issue.md (vendored, new file, 40 lines)

@@ -0,0 +1,40 @@
---
name: Self-host issue
about: Report an issue with self-hosting Firecrawl
title: "[Self-Host] "
labels: self-host
assignees: ''

---

**Describe the Issue**
Provide a clear and concise description of the self-hosting issue you're experiencing.

**To Reproduce**
Steps to reproduce the issue:
1. Configure the environment or settings with '...'
2. Run the command '...'
3. Observe the error or unexpected output at '...'
4. Log output/error message

**Expected Behavior**
A clear and concise description of what you expected to happen when self-hosting.

**Screenshots**
If applicable, add screenshots or copies of the command line output to help explain the self-hosting issue.

**Environment (please complete the following information):**
- OS: [e.g. macOS, Linux, Windows]
- Firecrawl Version: [e.g. 1.2.3]
- Node.js Version: [e.g. 14.x]
- Docker Version (if applicable): [e.g. 20.10.14]
- Database Type and Version: [e.g. PostgreSQL 13.4]

**Logs**
If applicable, include detailed logs to help understand the self-hosting problem.

**Configuration**
Provide relevant parts of your configuration files (with sensitive information redacted).

**Additional Context**
Add any other context about the self-hosting issue here, such as specific infrastructure details, network setup, or any modifications made to the original Firecrawl setup.
.github/archive/publish-rust-sdk.yml (vendored, new file, 42 lines)

@@ -0,0 +1,42 @@
name: Publish Rust SDK

on: []

env:
  CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}

jobs:
  build-and-publish:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Rust
        uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          default: true
          profile: minimal

      - name: Install dependencies
        run: cargo build --release

      - name: Run version check script
        id: version_check_script
        run: |
          VERSION_INCREMENTED=$(cargo search --limit 1 my_crate_name | grep my_crate_name)
          echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV

      - name: Build the package
        if: ${{ env.VERSION_INCREMENTED == 'true' }}
        run: cargo package
        working-directory: ./apps/rust-sdk

      - name: Publish to crates.io
        if: ${{ env.VERSION_INCREMENTED == 'true' }}
        env:
          CARGO_REG_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
        run: cargo publish
        working-directory: ./apps/rust-sdk
.github/archive/rust-sdk.yml (vendored, new file, 61 lines)

@@ -0,0 +1,61 @@
name: Run Rust SDK E2E Tests

on: []

env:
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
  PORT: ${{ secrets.PORT }}
  REDIS_URL: ${{ secrets.REDIS_URL }}
  SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
  SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1


jobs:
  build:
    runs-on: ubuntu-latest
    services:
      redis:
        image: redis
        ports:
          - 6379:6379

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Install pnpm
        run: npm install -g pnpm
      - name: Install dependencies for API
        run: pnpm install
        working-directory: ./apps/api
      - name: Start the application
        run: npm start &
        working-directory: ./apps/api
        id: start_app
      - name: Start workers
        run: npm run workers &
        working-directory: ./apps/api
        id: start_workers
      - name: Set up Rust
        uses: actions/setup-rust@v1
        with:
          rust-version: stable
      - name: Try the lib build
        working-directory: ./apps/rust-sdk
        run: cargo build
      - name: Run E2E tests for Rust SDK
        run: cargo test --test e2e_with_auth
.github/scripts/check_version_has_incremented.py (vendored, 20 changes)

@@ -15,6 +15,7 @@ false
"""
import json
import toml
import os
import re
import sys

@@ -53,6 +54,19 @@ def get_npm_version(package_name: str) -> str:
    version = response.json()['version']
    return version.strip()

def get_rust_version(file_path: str) -> str:
    """Extract version string from Cargo.toml."""
    cargo_toml = toml.load(file_path)
    if 'package' in cargo_toml and 'version' in cargo_toml['package']:
        return cargo_toml['package']['version'].strip()
    raise RuntimeError("Unable to find version string in Cargo.toml.")

def get_crates_version(package_name: str) -> str:
    """Get latest version of Rust package from crates.io."""
    response = requests.get(f"https://crates.io/api/v1/crates/{package_name}")
    version = response.json()['crate']['newest_version']
    return version.strip()

def is_version_incremented(local_version: str, published_version: str) -> bool:
    """Compare local and published versions."""
    local_version_parsed: Version = parse_version(local_version)

@@ -74,6 +88,12 @@ if __name__ == "__main__":
        current_version = get_js_version(os.path.join(package_path, 'package.json'))
        # Get published version from npm
        published_version = get_npm_version(package_name)
    if package_type == "rust":
        # Get current version from Cargo.toml
        current_version = get_rust_version(os.path.join(package_path, 'Cargo.toml'))
        # Get published version from crates.io
        published_version = get_crates_version(package_name)

    else:
        raise ValueError("Invalid package type. Use 'python' or 'js'.")
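Both the npm and crates.io branches feed into the `packaging`-based ordering hinted at by `parse_version` above. A minimal sketch of that comparison, assuming the same `Version`/`parse_version` imports the script already uses (the body of `is_version_incremented` is cut off in this hunk, so this is illustrative rather than the exact implementation):

```python
from packaging.version import Version, parse as parse_version

def is_version_incremented(local_version: str, published_version: str) -> bool:
    """Return True only when the local version is strictly newer than
    the version already published on npm or crates.io."""
    local_parsed: Version = parse_version(local_version)
    published_parsed: Version = parse_version(published_version)
    return local_parsed > published_parsed

# e.g. a local Cargo.toml at 1.2.3 vs. a crates.io newest_version of 1.2.2
print(is_version_incremented("1.2.3", "1.2.2"))  # True  -> safe to publish
print(is_version_incremented("1.2.2", "1.2.2"))  # False -> skip publishing
```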
.github/scripts/requirements.txt (vendored, 1 change)

@@ -1,2 +1,3 @@
requests
packaging
toml
.github/workflows/ci.yml (vendored, 3 changes)

@@ -28,7 +28,8 @@ env:
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1
  FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}

  USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
  ENV: ${{ secrets.ENV }}

jobs:
  pre-deploy:
.github/workflows/fly-direct.yml (vendored, 7 changes)

@@ -22,12 +22,19 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
  PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
  NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
  CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
  SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
  USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
  ENV: ${{ secrets.ENV }}

jobs:
  deploy:
    name: Deploy app
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v3
      - uses: superfly/flyctl-actions/setup-flyctl@master
.github/workflows/fly.yml (vendored, 80 changes)

@@ -26,7 +26,10 @@ env:
  PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
  PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
  NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
  CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
  SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
  USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
  ENV: ${{ secrets.ENV }}

jobs:
  pre-deploy-e2e-tests:

@@ -56,6 +59,9 @@ jobs:
        run: npm run workers &
        working-directory: ./apps/api
        id: start_workers
      - name: Wait for the application to be ready
        run: |
          sleep 10
      - name: Run E2E tests
        run: |
          npm run test:prod

@@ -132,7 +138,7 @@ jobs:
        working-directory: ./apps/python-sdk
      - name: Run E2E tests for Python SDK
        run: |
          pytest firecrawl/__tests__/e2e_withAuth/test.py
          pytest firecrawl/__tests__/v1/e2e_withAuth/test.py
        working-directory: ./apps/python-sdk

  js-sdk-tests:

@@ -205,10 +211,45 @@ jobs:
        run: go test -v ./... -timeout 180s
        working-directory: ./apps/go-sdk/firecrawl

  rust-sdk-tests:
    name: Rust SDK Tests
    needs: pre-deploy-e2e-tests
    runs-on: ubuntu-latest
    services:
      redis:
        image: redis
        ports:
          - 6379:6379
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Install pnpm
        run: npm install -g pnpm
      - name: Install dependencies for API
        run: pnpm install
        working-directory: ./apps/api
      - name: Start the application
        run: npm start &
        working-directory: ./apps/api
        id: start_app
      - name: Start workers
        run: npm run workers &
        working-directory: ./apps/api
        id: start_workers
      - name: Set up Rust
        uses: actions/setup-rust@v1
        with:
          rust-version: stable
      - name: Try the lib build
        working-directory: ./apps/rust-sdk
        run: cargo build
      - name: Run E2E tests for Rust SDK
        run: cargo test --test e2e_with_auth

  deploy:
    name: Deploy app
    runs-on: ubuntu-latest
    needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests]
    needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests, rust-sdk-tests]
    steps:
      - uses: actions/checkout@v3
      - uses: superfly/flyctl-actions/setup-flyctl@master

@@ -299,4 +340,39 @@ jobs:
        run: |
          npm run build-and-publish
        working-directory: ./apps/js-sdk/firecrawl
  build-and-publish-rust-sdk:
    name: Build and publish Rust SDK
    runs-on: ubuntu-latest
    needs: deploy

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Rust
        uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          default: true
          profile: minimal

      - name: Install dependencies
        run: cargo build --release

      - name: Run version check script
        id: version_check_script
        run: |
          VERSION_INCREMENTED=$(cargo search --limit 1 my_crate_name | grep my_crate_name)
          echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV

      - name: Build the package
        if: ${{ env.VERSION_INCREMENTED == 'true' }}
        run: cargo package
        working-directory: ./apps/rust-sdk

      - name: Publish to crates.io
        if: ${{ env.VERSION_INCREMENTED == 'true' }}
        env:
          CARGO_REG_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
        run: cargo publish
        working-directory: ./apps/rust-sdk
.gitignore (vendored, 6 changes)

@@ -19,4 +19,10 @@ apps/test-suite/load-test-results/test-run-report.json
apps/playwright-service-ts/node_modules/
apps/playwright-service-ts/package-lock.json


/examples/o1_web_crawler/venv
*.pyc
.rdb

apps/js-sdk/firecrawl/dist
.gitmodules (vendored, 8 changes)

@@ -1,6 +1,6 @@
[submodule "apps/go-sdk/firecrawl"]
    path = apps/go-sdk/firecrawl
[submodule "apps/go-sdk/firecrawl-go"]
    path = apps/go-sdk/firecrawl-go
    url = https://github.com/mendableai/firecrawl-go
[submodule "apps/go-sdk/examples"]
    path = apps/go-sdk/examples
[submodule "apps/go-sdk/firecrawl-go-examples"]
    path = apps/go-sdk/firecrawl-go-examples
    url = https://github.com/mendableai/firecrawl-go-examples
|
@@ -44,7 +44,6 @@ BULL_AUTH_KEY=
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
README.md (376 changes)

@@ -1,3 +1,37 @@
<h3 align="center">
  <img
    src="https://raw.githubusercontent.com/mendableai/firecrawl/main/img/firecrawl_logo.png"
    height="200"
  >
</h3>
<div align="center">
  <a href="https://github.com/mendableai/firecrawl/blob/main/LICENSE">
    <img src="https://img.shields.io/github/license/mendableai/firecrawl" alt="License">
  </a>
  <a href="https://pepy.tech/project/firecrawl-py">
    <img src="https://static.pepy.tech/badge/firecrawl-py" alt="Downloads">
  </a>
  <a href="https://GitHub.com/mendableai/firecrawl/graphs/contributors">
    <img src="https://img.shields.io/github/contributors/mendableai/firecrawl.svg" alt="GitHub Contributors">
  </a>
  <a href="https://firecrawl.dev">
    <img src="https://img.shields.io/badge/Visit-firecrawl.dev-orange" alt="Visit firecrawl.dev">
  </a>
</div>
<div>
  <p align="center">
    <a href="https://twitter.com/firecrawl_dev">
      <img src="https://img.shields.io/badge/Follow%20on%20X-000000?style=for-the-badge&logo=x&logoColor=white" alt="Follow on X" />
    </a>
    <a href="https://www.linkedin.com/company/104100957">
      <img src="https://img.shields.io/badge/Follow%20on%20LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white" alt="Follow on LinkedIn" />
    </a>
    <a href="https://discord.com/invite/gSmWdAkdwd">
      <img src="https://img.shields.io/badge/Join%20our%20Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Join our Discord" />
    </a>
  </p>
</div>

# 🔥 Firecrawl

Crawl and convert any website into LLM-ready markdown or structured data. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) and the Firecrawl community. Includes powerful scraping, crawling and data extraction capabilities.

@@ -6,11 +40,13 @@ _This repository is in its early development stages. We are still merging custom
## What is Firecrawl?

[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required.
[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. Check out our [documentation](https://docs.firecrawl.dev).

_Pst. hey, you, join our stargazers :)_

<img src="https://github.com/mendableai/firecrawl/assets/44934913/53c4483a-0f0e-40c6-bd84-153a07f94d29" width="200">
<a href="https://github.com/mendableai/firecrawl">
  <img src="https://img.shields.io/github/stars/mendableai/firecrawl.svg?style=social&label=Star&maxAge=2592000" alt="GitHub stars">
</a>

## How to use it?

@@ -41,18 +77,26 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
Used to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl.

```bash
curl -X POST https://api.firecrawl.dev/v0/crawl \
curl -X POST https://api.firecrawl.dev/v1/crawl \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -H 'Authorization: Bearer fc-YOUR_API_KEY' \
    -d '{
      "url": "https://mendable.ai"
      "url": "https://docs.firecrawl.dev",
      "limit": 100,
      "scrapeOptions": {
        "formats": ["markdown", "html"]
      }
    }'
```

Returns a jobId
Returns a crawl job id and the url to check the status of the crawl.

```json
{ "jobId": "1234-5678-9101" }
{
  "success": true,
  "id": "123-456-789",
  "url": "https://api.firecrawl.dev/v1/crawl/123-456-789"
}
```

### Check Crawl Job

@@ -60,7 +104,7 @@ Returns a jobId
Used to check the status of a crawl job and get its result.

```bash
curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
curl -X GET https://api.firecrawl.dev/v1/crawl/123-456-789 \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY'
```
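Submitting a crawl and checking it are meant to be chained: POST the crawl, then poll the returned status URL until it reports `completed`. A minimal Python sketch of that loop, using only the v1 endpoints and response fields shown in this README (the `requests` usage, the placeholder API key, and the 5-second interval are illustrative assumptions, not part of the diff):

```python
import time
import requests

API_KEY = "fc-YOUR_API_KEY"  # placeholder key, as in the examples above
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

# Submit the crawl job; the v1 response includes an "id" and a status "url"
job = requests.post(
    "https://api.firecrawl.dev/v1/crawl",
    json={
        "url": "https://docs.firecrawl.dev",
        "limit": 100,
        "scrapeOptions": {"formats": ["markdown", "html"]},
    },
    headers=HEADERS,
).json()

# Poll the status URL until the crawl reports completion
status = requests.get(job["url"], headers=HEADERS).json()
while status.get("status") != "completed":
    time.sleep(5)  # illustrative polling interval
    status = requests.get(job["url"], headers=HEADERS).json()

print(f"crawled {status.get('total')} pages, {status.get('creditsUsed')} credits used")
```

The completed status payload has the same shape as the example response that follows.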
@@ -68,18 +112,20 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
```json
{
  "status": "completed",
  "current": 22,
  "total": 22,
  "total": 36,
  "creditsUsed": 36,
  "expiresAt": "2024-00-00T00:00:00.000Z",
  "data": [
    {
      "content": "Raw Content ",
      "markdown": "# Markdown Content",
      "provider": "web-scraper",
      "markdown": "[Firecrawl Docs home page![light logo](https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/logo/light.svg)!...",
      "html": "<!DOCTYPE html><html lang=\"en\" class=\"js-focus-visible lg:[--scroll-mt:9.5rem]\" data-js-focus-visible=\"\">...",
      "metadata": {
        "title": "Mendable | AI for CX and Sales",
        "description": "AI for CX and Sales",
        "language": null,
        "sourceURL": "https://www.mendable.ai/"
        "title": "Build a 'Chat with website' using Groq Llama 3 | Firecrawl",
        "language": "en",
        "sourceURL": "https://docs.firecrawl.dev/learn/rag-llama3",
        "description": "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot.",
        "ogLocaleAlternate": [],
        "statusCode": 200
      }
    }
  ]

@@ -88,14 +134,15 @@
### Scraping

Used to scrape a URL and get its content.
Used to scrape a URL and get its content in the specified formats.

```bash
curl -X POST https://api.firecrawl.dev/v0/scrape \
curl -X POST https://api.firecrawl.dev/v1/scrape \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -d '{
      "url": "https://mendable.ai"
      "url": "https://docs.firecrawl.dev",
      "formats" : ["markdown", "html"]
    }'
```

@@ -105,68 +152,95 @@ Response:
{
  "success": true,
  "data": {
    "content": "Raw Content ",
    "markdown": "# Markdown Content",
    "provider": "web-scraper",
    "markdown": "Launch Week I is here! [See our Day 2 Release 🚀](https://www.firecrawl.dev/blog/launch-week-i-day-2-doubled-rate-limits)[💥 Get 2 months free...",
    "html": "<!DOCTYPE html><html lang=\"en\" class=\"light\" style=\"color-scheme: light;\"><body class=\"__variable_36bd41 __variable_d7dc5d font-inter ...",
    "metadata": {
      "title": "Mendable | AI for CX and Sales",
      "description": "AI for CX and Sales",
      "language": null,
      "sourceURL": "https://www.mendable.ai/"
      "title": "Home - Firecrawl",
      "description": "Firecrawl crawls and converts any website into clean markdown.",
      "language": "en",
      "keywords": "Firecrawl,Markdown,Data,Mendable,Langchain",
      "robots": "follow, index",
      "ogTitle": "Firecrawl",
      "ogDescription": "Turn any website into LLM-ready data.",
      "ogUrl": "https://www.firecrawl.dev/",
      "ogImage": "https://www.firecrawl.dev/og.png?123",
      "ogLocaleAlternate": [],
      "ogSiteName": "Firecrawl",
      "sourceURL": "https://firecrawl.dev",
      "statusCode": 200
    }
  }
}
```

### Search (Beta)
### Map (Alpha)

Used to search the web, get the most relevant results, scrape each page and return the markdown.
Used to map a URL and get urls of the website. This returns most links present on the website.

```bash
curl -X POST https://api.firecrawl.dev/v0/search \
```bash cURL
curl -X POST https://api.firecrawl.dev/v1/map \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -d '{
      "query": "firecrawl",
      "pageOptions": {
        "fetchPageContent": true // false for a fast serp api
      }
      "url": "https://firecrawl.dev"
    }'
```

Response:

```json
{
  "success": true,
  "data": [
    {
      "url": "https://mendable.ai",
      "markdown": "# Markdown Content",
      "provider": "web-scraper",
      "metadata": {
        "title": "Mendable | AI for CX and Sales",
        "description": "AI for CX and Sales",
        "language": null,
        "sourceURL": "https://www.mendable.ai/"
      }
    }
  "status": "success",
  "links": [
    "https://firecrawl.dev",
    "https://www.firecrawl.dev/pricing",
    "https://www.firecrawl.dev/blog",
    "https://www.firecrawl.dev/playground",
    "https://www.firecrawl.dev/smart-crawl",
  ]
}
```

### Intelligent Extraction (Beta)
#### Map with search

Map with `search` param allows you to search for specific urls inside a website.

```bash cURL
curl -X POST https://api.firecrawl.dev/v1/map \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -d '{
      "url": "https://firecrawl.dev",
      "search": "docs"
    }'
```

Response will be an ordered list from the most relevant to the least relevant.

```json
{
  "status": "success",
  "links": [
    "https://docs.firecrawl.dev",
    "https://docs.firecrawl.dev/sdks/python",
    "https://docs.firecrawl.dev/learn/rag-llama3",
  ]
}
```

### LLM Extraction (Beta)

Used to extract structured data from scraped pages.

```bash
curl -X POST https://api.firecrawl.dev/v0/scrape \
curl -X POST https://api.firecrawl.dev/v1/scrape \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -d '{
      "url": "https://www.mendable.ai/",
      "extractorOptions": {
        "mode": "llm-extraction",
        "extractionPrompt": "Based on the information on the page, extract the information from the schema. ",
        "extractionSchema": {
      "formats": ["extract"],
      "extract": {
        "schema": {
          "type": "object",
          "properties": {
            "company_mission": {

@@ -220,6 +294,59 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \
}
```

### Extracting without a schema (New)

You can now extract without a schema by just passing a `prompt` to the endpoint. The llm chooses the structure of the data.

```bash
curl -X POST https://api.firecrawl.dev/v1/scrape \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -d '{
      "url": "https://docs.firecrawl.dev/",
      "formats": ["extract"],
      "extract": {
        "prompt": "Extract the company mission from the page."
      }
    }'
```


### Search (v0) (Beta)

Used to search the web, get the most relevant results, scrape each page and return the markdown.

```bash
curl -X POST https://api.firecrawl.dev/v0/search \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -d '{
      "query": "firecrawl",
      "pageOptions": {
        "fetchPageContent": true // false for a fast serp api
      }
    }'
```

```json
{
  "success": true,
  "data": [
    {
      "url": "https://mendable.ai",
      "markdown": "# Markdown Content",
      "provider": "web-scraper",
      "metadata": {
        "title": "Mendable | AI for CX and Sales",
        "description": "AI for CX and Sales",
        "language": null,
        "sourceURL": "https://www.mendable.ai/"
      }
    }
  ]
}
```

## Using Python SDK

### Installing Python SDK

@@ -231,24 +358,28 @@ pip install firecrawl-py
### Crawl a website

```python
from firecrawl import FirecrawlApp
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="YOUR_API_KEY")
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
# Scrape a website:
scrape_status = app.scrape_url(
  'https://firecrawl.dev',
  params={'formats': ['markdown', 'html']}
)
print(scrape_status)

# Get the markdown
for result in crawl_result:
    print(result['markdown'])
```

### Scraping a URL

To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

```python
url = 'https://example.com'
scraped_data = app.scrape_url(url)
# Crawl a website:
crawl_status = app.crawl_url(
  'https://firecrawl.dev',
  params={
    'limit': 100,
    'scrapeOptions': {'formats': ['markdown', 'html']}
  },
  wait_until_done=True,
  poll_interval=30
)
print(crawl_status)
```

### Extracting structured data from a URL

@@ -256,6 +387,11 @@ scraped_data = app.scrape_url(url)
With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how to use it:

```python

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

class ArticleSchema(BaseModel):
    title: str
    points: int

@@ -266,24 +402,12 @@ class TopArticlesSchema(BaseModel):
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

data = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions':{
        'onlyMainContent': True
    'formats': ['extract'],
    'extract': {
        'schema': TopArticlesSchema.model_json_schema()
    }
})
print(data["llm_extraction"])
```

### Search for a query

Performs a web search, retrieve the top results, extract data from each page, and returns their markdown.

```python
query = 'What is Mendable?'
search_result = app.search(query)
print(data["extract"])
```

## Using the Node SDK

@@ -301,54 +425,33 @@ npm install @mendable/firecrawl-js
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.

### Scraping a URL

To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

```js
try {
  const url = "https://example.com";
  const scrapedData = await app.scrapeUrl(url);
  console.log(scrapedData);
} catch (error) {
  console.error("Error occurred while scraping:", error.message);
import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

// Scrape a website
const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
  formats: ['markdown', 'html'],
});

if (scrapeResponse) {
  console.log(scrapeResponse)
}

// Crawl a website
const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
  limit: 100,
  scrapeOptions: {
    formats: ['markdown', 'html'],
  }
} as CrawlParams, true, 30) as CrawlStatusResponse;

if (crawlResponse) {
  console.log(crawlResponse)
}
```

### Crawling a Website

To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.

```js
const crawlUrl = "https://example.com";
const params = {
  crawlerOptions: {
    excludes: ["blog/"],
    includes: [], // leave empty for all pages
    limit: 1000,
  },
  pageOptions: {
    onlyMainContent: true,
  },
};
const waitUntilDone = true;
const timeout = 5;
const crawlResult = await app.crawlUrl(
  crawlUrl,
  params,
  waitUntilDone,
  timeout
);
```

### Checking Crawl Status

To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.

```js
const status = await app.checkCrawlStatus(jobId);
console.log(status);
```

### Extracting structured data from a URL

@@ -359,7 +462,7 @@ import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({
  apiKey: "fc-YOUR_API_KEY",
  apiKey: "fc-YOUR_API_KEY"
});

// Define schema to extract contents into

@@ -384,19 +487,6 @@ const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
console.log(scrapeResult.data["llm_extraction"]);
```

### Search for a query

With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.

```js
const query = "what is mendable?";
const searchResults = await app.search(query, {
  pageOptions: {
    fetchPageContent: true, // Fetch the page content for each search result
  },
});
```

## Contributing

We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.
|
@@ -65,7 +65,6 @@ BULL_AUTH_KEY=
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs

@@ -107,7 +106,7 @@ You should be able to see the Bull Queue Manager UI on `http://localhost:3002/ad
If you'd like to test the crawl endpoint, you can run this:

```bash
curl -X POST http://localhost:3002/v0/crawl \
curl -X POST http://localhost:3002/v1/crawl \
    -H 'Content-Type: application/json' \
    -d '{
      "url": "https://mendable.ai"
|
@@ -32,8 +32,6 @@ BULL_AUTH_KEY=
LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you have a serper key you'd like to use as a search api
SERPER_API_KEY=
# set if you'd like to send slack server health status messages
SLACK_WEBHOOK_URL=
# set if you'd like to send posthog events like job logs
apps/api/.gitignore (vendored, 2 changes)

@@ -7,5 +7,5 @@ dump.rdb

/.next/

# Sentry Config File
.rdb
.sentryclirc
|
@@ -17,8 +17,15 @@ RUN pnpm install
RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \
    bash -c 'export SENTRY_AUTH_TOKEN="$(cat /run/secrets/SENTRY_AUTH_TOKEN)"; if [ -z $SENTRY_AUTH_TOKEN ]; then pnpm run build:nosentry; else pnpm run build; fi'

# Install packages needed for deployment
# Install Go
FROM golang:1.19 AS go-base
COPY src/lib/go-html-to-md /app/src/lib/go-html-to-md

# Install Go dependencies and build parser lib
RUN cd /app/src/lib/go-html-to-md && \
    go mod tidy && \
    go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \
    chmod +x html-to-markdown.so

FROM base
RUN apt-get update -qq && \

@@ -26,9 +33,7 @@ RUN apt-get update -qq && \
    rm -rf /var/lib/apt/lists /var/cache/apt/archives
COPY --from=prod-deps /app/node_modules /app/node_modules
COPY --from=build /app /app


COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/dist/src/lib/go-html-to-md/html-to-markdown.so

# Start the server by default, this can be overwritten at runtime
EXPOSE 8080
apps/api/openapi-v0.json (new file, 924 lines)

@@ -0,0 +1,924 @@
{
|
||||
"openapi": "3.0.0",
|
||||
"info": {
|
||||
"title": "Firecrawl API",
|
||||
"version": "0.0.0",
|
||||
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
|
||||
"contact": {
|
||||
"name": "Firecrawl Support",
|
||||
"url": "https://firecrawl.dev/support",
|
||||
"email": "support@firecrawl.dev"
|
||||
}
|
||||
},
|
||||
"servers": [
|
||||
{
|
||||
"url": "https://api.firecrawl.dev/v0"
|
||||
}
|
||||
],
|
||||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"extractorOptions": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ScrapeResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl": {
|
||||
"post": {
|
||||
"summary": "Crawl multiple URLs based on options",
|
||||
"operationId": "crawlUrls",
|
||||
"tags": ["Crawling"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The base URL to start crawling from"
|
||||
},
|
||||
"crawlerOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"includes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to include"
|
||||
},
|
||||
"excludes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to exclude"
|
||||
},
|
||||
"generateImgAltText": {
|
||||
"type": "boolean",
|
||||
"description": "Generate alt text for images using LLMs (must have a paid plan)",
|
||||
"default": false
|
||||
},
|
||||
"returnOnlyUrls": {
|
||||
"type": "boolean",
|
||||
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
|
||||
"default": false
|
||||
},
|
||||
"maxDepth": {
|
||||
"type": "integer",
|
||||
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["default", "fast"],
|
||||
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
|
||||
"default": "default"
|
||||
},
|
||||
"ignoreSitemap": {
|
||||
"type": "boolean",
|
||||
"description": "Ignore the website sitemap when crawling",
|
||||
"default": false
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of pages to crawl",
|
||||
"default": 10000
|
||||
},
|
||||
"allowBackwardCrawling": {
|
||||
"type": "boolean",
|
||||
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
|
||||
"default": false
|
||||
},
|
||||
"allowExternalContentLinks": {
|
||||
"type": "boolean",
|
||||
"description": "Allows the crawler to follow links to external websites.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/CrawlResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search": {
|
||||
"post": {
|
||||
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
|
||||
"operationId": "searchGoogle",
|
||||
"tags": ["Search"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The query to search for"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"fetchPageContent": {
|
||||
"type": "boolean",
|
||||
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
|
||||
"default": true
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"searchOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of results. Max is 20 during beta."
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/SearchResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/status/{jobId}": {
|
||||
"get": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Get the status of a crawl job",
|
||||
"operationId": "getCrawlStatus",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Status of the job (completed, active, failed, paused)"
|
||||
},
|
||||
"current": {
|
||||
"type": "integer",
|
||||
"description": "Current page number"
|
||||
},
|
||||
"total": {
|
||||
"type": "integer",
|
||||
"description": "Total number of pages"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Data returned from the job (null when it is in progress)"
|
||||
},
|
||||
"partial_data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/cancel/{jobId}": {
|
||||
"delete": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Cancel a crawl job",
|
||||
"operationId": "cancelCrawlJob",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Returns cancelled."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"components": {
|
||||
"securitySchemes": {
|
||||
"bearerAuth": {
|
||||
"type": "http",
|
||||
"scheme": "bearer"
|
||||
}
|
||||
},
|
||||
"schemas": {
|
||||
"ScrapeResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlStatusResponseObj": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"SearchResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
]
|
||||
}
|
|
@ -18,8 +18,8 @@
|
|||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"summary": "Scrape a single URL",
|
||||
"operationId": "scrape",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
|
@ -38,94 +38,47 @@
|
|||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"formats": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
|
||||
},
|
||||
"description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
|
||||
"default": ["markdown"]
|
||||
},
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"includeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"excludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"extractorOptions": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": true
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
|
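To make the reworked v1 request body above concrete, here is a hedged sketch of a POST /v1/scrape call. The field names follow the schema in this hunk; the base URL, target site, and environment variable are illustrative placeholders.

// Sketch of a v1 scrape request using the options defined above.
const scrapeBody = {
  url: "https://example.com",
  formats: ["markdown", "links"],            // defaults to ["markdown"]
  includeTags: ["article", ".content"],      // only keep these tags/classes/ids
  excludeTags: ["script", ".ad", "#footer"], // strip these before conversion
  onlyMainContent: true,                     // default is true in v1
  timeout: 30000,                            // request timeout in milliseconds
  waitFor: 0,                                // extra wait before fetching content
};

const res = await fetch("http://localhost:3002/v1/scrape", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify(scrapeBody),
});
const { data } = await res.json();
console.log(data.markdown, data.links);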
@ -741,24 +694,42 @@
|
|||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Warning message to let you know of any issues."
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
|
@ -780,27 +751,16 @@
|
|||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
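With the rename of `pageStatusCode`/`pageError` to `statusCode`/`error` shown above, a v1 response consumer would check page metadata along these lines (a small sketch; the field names come from this schema, the helper itself is illustrative):

// v1 metadata carries statusCode/error instead of v0's pageStatusCode/pageError.
function assertPageOk(metadata: { statusCode: number; error?: string | null }): void {
  if (metadata.statusCode >= 400 || metadata.error) {
    throw new Error(`Scrape failed (${metadata.statusCode}): ${metadata.error ?? "unknown error"}`);
  }
}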
@ -810,24 +770,33 @@
|
|||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
|
@ -849,11 +818,11 @@
|
|||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
|
@ -871,16 +840,34 @@
|
|||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
|
@ -898,7 +885,18 @@
|
|||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -909,8 +907,15 @@
|
|||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
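The v1 `CrawlResponse` above replaces the bare `jobId` with `success`, `id`, and a status `url`. A hedged sketch of starting a crawl and following that URL (the POST /v1/crawl path is inferred from the status request later in this diff; everything else is a placeholder):

// Start a v1 crawl, then poll the status URL returned in CrawlResponse.
const start = await fetch("http://localhost:3002/v1/crawl", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({ url: "https://docs.firecrawl.dev" }),
});
const { success, id, url } = await start.json();
if (success) {
  // url points at GET /v1/crawl/{id}, which reports crawl progress.
  const statusRes = await fetch(url, {
    headers: { Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}` },
  });
  console.log(id, (await statusRes.json()).status);
}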
@ -61,6 +61,8 @@
|
|||
"@sentry/node": "^8.26.0",
|
||||
"@sentry/profiling-node": "^8.26.0",
|
||||
"@supabase/supabase-js": "^2.44.2",
|
||||
"@types/express-ws": "^3.0.4",
|
||||
"@types/ws": "^8.5.12",
|
||||
"ajv": "^8.16.0",
|
||||
"async": "^3.2.5",
|
||||
"async-mutex": "^0.5.0",
|
||||
|
@ -76,6 +78,7 @@
|
|||
"dotenv": "^16.3.1",
|
||||
"dotenv-cli": "^7.4.2",
|
||||
"express-rate-limit": "^7.3.1",
|
||||
"express-ws": "^5.0.2",
|
||||
"form-data": "^4.0.0",
|
||||
"glob": "^10.4.2",
|
||||
"gpt3-tokenizer": "^1.1.5",
|
||||
|
@ -83,6 +86,7 @@
|
|||
"joplin-turndown-plugin-gfm": "^1.0.12",
|
||||
"json-schema-to-zod": "^2.3.0",
|
||||
"keyword-extractor": "^0.0.28",
|
||||
"koffi": "^2.9.0",
|
||||
"langchain": "^0.2.8",
|
||||
"languagedetect": "^2.0.0",
|
||||
"logsnag": "^1.0.0",
|
||||
|
@ -91,7 +95,7 @@
|
|||
"moment": "^2.29.4",
|
||||
"mongoose": "^8.4.4",
|
||||
"natural": "^7.0.7",
|
||||
"openai": "^4.52.2",
|
||||
"openai": "^4.57.0",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"pos": "^0.4.2",
|
||||
"posthog-node": "^4.0.1",
|
||||
|
@ -110,8 +114,9 @@
|
|||
"unstructured-client": "^0.11.3",
|
||||
"uuid": "^10.0.0",
|
||||
"wordpos": "^2.1.0",
|
||||
"ws": "^8.18.0",
|
||||
"xml2js": "^0.6.2",
|
||||
"zod": "^3.23.4",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.1"
|
||||
},
|
||||
"nodemonConfig": {
|
||||
|
|
|
@ -47,6 +47,12 @@ importers:
|
|||
'@supabase/supabase-js':
|
||||
specifier: ^2.44.2
|
||||
version: 2.44.2
|
||||
'@types/express-ws':
|
||||
specifier: ^3.0.4
|
||||
version: 3.0.4
|
||||
'@types/ws':
|
||||
specifier: ^8.5.12
|
||||
version: 8.5.12
|
||||
ajv:
|
||||
specifier: ^8.16.0
|
||||
version: 8.16.0
|
||||
|
@ -92,6 +98,9 @@ importers:
|
|||
express-rate-limit:
|
||||
specifier: ^7.3.1
|
||||
version: 7.3.1(express@4.19.2)
|
||||
express-ws:
|
||||
specifier: ^5.0.2
|
||||
version: 5.0.2(express@4.19.2)
|
||||
form-data:
|
||||
specifier: ^4.0.0
|
||||
version: 4.0.0
|
||||
|
@ -113,9 +122,12 @@ importers:
|
|||
keyword-extractor:
|
||||
specifier: ^0.0.28
|
||||
version: 0.0.28
|
||||
koffi:
|
||||
specifier: ^2.9.0
|
||||
version: 2.9.0
|
||||
langchain:
|
||||
specifier: ^0.2.8
|
||||
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)
|
||||
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
||||
languagedetect:
|
||||
specifier: ^2.0.0
|
||||
version: 2.0.0
|
||||
|
@ -138,8 +150,8 @@ importers:
|
|||
specifier: ^7.0.7
|
||||
version: 7.0.7(socks@2.8.3)
|
||||
openai:
|
||||
specifier: ^4.52.2
|
||||
version: 4.52.2
|
||||
specifier: ^4.57.0
|
||||
version: 4.57.0(zod@3.23.8)
|
||||
pdf-parse:
|
||||
specifier: ^1.1.1
|
||||
version: 1.1.1
|
||||
|
@ -194,11 +206,14 @@ importers:
|
|||
wordpos:
|
||||
specifier: ^2.1.0
|
||||
version: 2.1.0
|
||||
ws:
|
||||
specifier: ^8.18.0
|
||||
version: 8.18.0
|
||||
xml2js:
|
||||
specifier: ^0.6.2
|
||||
version: 0.6.2
|
||||
zod:
|
||||
specifier: ^3.23.4
|
||||
specifier: ^3.23.8
|
||||
version: 3.23.8
|
||||
zod-to-json-schema:
|
||||
specifier: ^3.23.1
|
||||
|
@ -1637,6 +1652,9 @@ packages:
|
|||
'@types/express-serve-static-core@4.19.3':
|
||||
resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==}
|
||||
|
||||
'@types/express-ws@3.0.4':
|
||||
resolution: {integrity: sha512-Yjj18CaivG5KndgcvzttWe8mPFinPCHJC2wvyQqVzA7hqeufM8EtWMj6mpp5omg3s8XALUexhOu8aXAyi/DyJQ==}
|
||||
|
||||
'@types/express@4.17.21':
|
||||
resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==}
|
||||
|
||||
|
@ -1739,8 +1757,8 @@ packages:
|
|||
'@types/whatwg-url@11.0.5':
|
||||
resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==}
|
||||
|
||||
'@types/ws@8.5.10':
|
||||
resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==}
|
||||
'@types/ws@8.5.12':
|
||||
resolution: {integrity: sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==}
|
||||
|
||||
'@types/yargs-parser@21.0.3':
|
||||
resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==}
|
||||
|
@ -2506,6 +2524,12 @@ packages:
|
|||
peerDependencies:
|
||||
express: 4 || 5 || ^5.0.0-beta.1
|
||||
|
||||
express-ws@5.0.2:
|
||||
resolution: {integrity: sha512-0uvmuk61O9HXgLhGl3QhNSEtRsQevtmbL94/eILaliEADZBHZOQUAiHFrGPrgsjikohyrmSG5g+sCfASTt0lkQ==}
|
||||
engines: {node: '>=4.5.0'}
|
||||
peerDependencies:
|
||||
express: ^4.0.0 || ^5.0.0-alpha.1
|
||||
|
||||
express@4.19.2:
|
||||
resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==}
|
||||
engines: {node: '>= 0.10.0'}
|
||||
|
@ -3149,6 +3173,9 @@ packages:
|
|||
resolution: {integrity: sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==}
|
||||
engines: {node: '>=6'}
|
||||
|
||||
koffi@2.9.0:
|
||||
resolution: {integrity: sha512-KCsuJ2gM58n6bNdR2Z7gqsh/3TchxxQFbVgax2/UvAjRTgwNSYAJDx9E3jrkBP4jEDHWRCfE47Y2OG+/fiSvEw==}
|
||||
|
||||
langchain@0.2.8:
|
||||
resolution: {integrity: sha512-kb2IOMA71xH8e6EXFg0l4S+QSMC/c796pj1+7mPBkR91HHwoyHZhFRrBaZv4tV+Td+Ba91J2uEDBmySklZLpNQ==}
|
||||
engines: {node: '>=18'}
|
||||
|
@ -3712,9 +3739,14 @@ packages:
|
|||
openai@3.3.0:
|
||||
resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==}
|
||||
|
||||
openai@4.52.2:
|
||||
resolution: {integrity: sha512-mMc0XgFuVSkcm0lRIi8zaw++otC82ZlfkCur1qguXYWPETr/+ZwL9A/vvp3YahX+shpaT6j03dwsmUyLAfmEfg==}
|
||||
openai@4.57.0:
|
||||
resolution: {integrity: sha512-JnwBSIYqiZ3jYjB5f2in8hQ0PRA092c6m+/6dYB0MzK0BEbn+0dioxZsPLBm5idJbg9xzLNOiGVm2OSuhZ+BdQ==}
|
||||
hasBin: true
|
||||
peerDependencies:
|
||||
zod: ^3.23.8
|
||||
peerDependenciesMeta:
|
||||
zod:
|
||||
optional: true
|
||||
|
||||
openapi-types@12.1.3:
|
||||
resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==}
|
||||
|
@ -4647,8 +4679,20 @@ packages:
|
|||
resolution: {integrity: sha512-+QU2zd6OTD8XWIJCbffaiQeH9U73qIqafo1x6V1snCWYGJf6cVE0cDR4D8xRzcEnfI21IFrUPzPGtcPf8AC+Rw==}
|
||||
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
|
||||
|
||||
ws@8.17.1:
|
||||
resolution: {integrity: sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==}
|
||||
ws@7.5.10:
|
||||
resolution: {integrity: sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==}
|
||||
engines: {node: '>=8.3.0'}
|
||||
peerDependencies:
|
||||
bufferutil: ^4.0.1
|
||||
utf-8-validate: ^5.0.2
|
||||
peerDependenciesMeta:
|
||||
bufferutil:
|
||||
optional: true
|
||||
utf-8-validate:
|
||||
optional: true
|
||||
|
||||
ws@8.18.0:
|
||||
resolution: {integrity: sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==}
|
||||
engines: {node: '>=10.0.0'}
|
||||
peerDependencies:
|
||||
bufferutil: ^4.0.1
|
||||
|
@ -5286,13 +5330,13 @@ snapshots:
|
|||
|
||||
'@js-sdsl/ordered-map@4.4.2': {}
|
||||
|
||||
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)':
|
||||
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
|
||||
dependencies:
|
||||
ansi-styles: 5.2.0
|
||||
camelcase: 6.3.0
|
||||
decamelize: 1.2.0
|
||||
js-tiktoken: 1.0.12
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||
ml-distance: 4.0.1
|
||||
mustache: 4.2.0
|
||||
p-queue: 6.6.2
|
||||
|
@ -5304,20 +5348,20 @@ snapshots:
|
|||
- langchain
|
||||
- openai
|
||||
|
||||
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))':
|
||||
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
|
||||
dependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||
js-tiktoken: 1.0.12
|
||||
openai: 4.52.2
|
||||
openai: 4.57.0(zod@3.23.8)
|
||||
zod: 3.23.8
|
||||
zod-to-json-schema: 3.23.1(zod@3.23.8)
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
- langchain
|
||||
|
||||
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)':
|
||||
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
|
||||
dependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||
js-tiktoken: 1.0.12
|
||||
transitivePeerDependencies:
|
||||
- langchain
|
||||
|
@ -6545,8 +6589,8 @@ snapshots:
|
|||
dependencies:
|
||||
'@supabase/node-fetch': 2.6.15
|
||||
'@types/phoenix': 1.6.5
|
||||
'@types/ws': 8.5.10
|
||||
ws: 8.17.1
|
||||
'@types/ws': 8.5.12
|
||||
ws: 8.18.0
|
||||
transitivePeerDependencies:
|
||||
- bufferutil
|
||||
- utf-8-validate
|
||||
|
@ -6643,6 +6687,12 @@ snapshots:
|
|||
'@types/range-parser': 1.2.7
|
||||
'@types/send': 0.17.4
|
||||
|
||||
'@types/express-ws@3.0.4':
|
||||
dependencies:
|
||||
'@types/express': 4.17.21
|
||||
'@types/express-serve-static-core': 4.19.3
|
||||
'@types/ws': 8.5.12
|
||||
|
||||
'@types/express@4.17.21':
|
||||
dependencies:
|
||||
'@types/body-parser': 1.19.5
|
||||
|
@ -6766,7 +6816,7 @@ snapshots:
|
|||
dependencies:
|
||||
'@types/webidl-conversions': 7.0.3
|
||||
|
||||
'@types/ws@8.5.10':
|
||||
'@types/ws@8.5.12':
|
||||
dependencies:
|
||||
'@types/node': 20.14.1
|
||||
|
||||
|
@ -7521,6 +7571,14 @@ snapshots:
|
|||
dependencies:
|
||||
express: 4.19.2
|
||||
|
||||
express-ws@5.0.2(express@4.19.2):
|
||||
dependencies:
|
||||
express: 4.19.2
|
||||
ws: 7.5.10
|
||||
transitivePeerDependencies:
|
||||
- bufferutil
|
||||
- utf-8-validate
|
||||
|
||||
express@4.19.2:
|
||||
dependencies:
|
||||
accepts: 1.3.8
|
||||
|
@ -8440,17 +8498,19 @@ snapshots:
|
|||
|
||||
kleur@3.0.3: {}
|
||||
|
||||
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1):
|
||||
koffi@2.9.0: {}
|
||||
|
||||
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
|
||||
dependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))
|
||||
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
|
||||
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||
binary-extensions: 2.3.0
|
||||
js-tiktoken: 1.0.12
|
||||
js-yaml: 4.1.0
|
||||
jsonpointer: 5.0.1
|
||||
langchainhub: 0.0.11
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||
ml-distance: 4.0.1
|
||||
openapi-types: 12.1.3
|
||||
p-retry: 4.6.2
|
||||
|
@ -8470,14 +8530,14 @@ snapshots:
|
|||
pdf-parse: 1.1.1
|
||||
puppeteer: 22.12.1(typescript@5.4.5)
|
||||
redis: 4.6.14
|
||||
ws: 8.17.1
|
||||
ws: 8.18.0
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
- openai
|
||||
|
||||
langchainhub@0.0.11: {}
|
||||
|
||||
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2):
|
||||
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)):
|
||||
dependencies:
|
||||
'@types/uuid': 9.0.8
|
||||
commander: 10.0.1
|
||||
|
@ -8486,9 +8546,9 @@ snapshots:
|
|||
p-retry: 4.6.2
|
||||
uuid: 9.0.1
|
||||
optionalDependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)
|
||||
openai: 4.52.2
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
||||
openai: 4.57.0(zod@3.23.8)
|
||||
|
||||
languagedetect@2.0.0: {}
|
||||
|
||||
|
@ -8881,16 +8941,19 @@ snapshots:
|
|||
transitivePeerDependencies:
|
||||
- debug
|
||||
|
||||
openai@4.52.2:
|
||||
openai@4.57.0(zod@3.23.8):
|
||||
dependencies:
|
||||
'@types/node': 18.19.39
|
||||
'@types/node-fetch': 2.6.11
|
||||
'@types/qs': 6.9.15
|
||||
abort-controller: 3.0.0
|
||||
agentkeepalive: 4.5.0
|
||||
form-data-encoder: 1.7.2
|
||||
formdata-node: 4.4.1
|
||||
node-fetch: 2.7.0
|
||||
web-streams-polyfill: 3.3.3
|
||||
qs: 6.12.2
|
||||
optionalDependencies:
|
||||
zod: 3.23.8
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
|
||||
|
@ -9195,7 +9258,7 @@ snapshots:
|
|||
chromium-bidi: 0.5.24(devtools-protocol@0.0.1299070)
|
||||
debug: 4.3.5
|
||||
devtools-protocol: 0.0.1299070
|
||||
ws: 8.17.1
|
||||
ws: 8.18.0
|
||||
transitivePeerDependencies:
|
||||
- bufferutil
|
||||
- supports-color
|
||||
|
@ -9877,7 +9940,9 @@ snapshots:
|
|||
imurmurhash: 0.1.4
|
||||
signal-exit: 4.1.0
|
||||
|
||||
ws@8.17.1: {}
|
||||
ws@7.5.10: {}
|
||||
|
||||
ws@8.18.0: {}
|
||||
|
||||
xml2js@0.6.2:
|
||||
dependencies:
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
### Scrape Website
|
||||
POST http://localhost:3002/v0/scrape HTTP/1.1
|
||||
Authorization: Bearer fc
|
||||
Authorization: Bearer fc-
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"url":"firecrawl.dev"
|
||||
"url":"corterix.com"
|
||||
}
|
||||
|
||||
### Check Job Status
|
||||
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
|
||||
Authorization: Bearer fc-
|
||||
|
||||
|
||||
### List Active Jobs
|
||||
GET http://localhost:3002/v0/jobs/active HTTP/1.1
|
||||
|
|
|
@ -404,7 +404,7 @@ describe("E2E Tests for API Routes", () => {
|
|||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.set("x-idempotency-key", uniqueIdempotencyKey)
|
||||
.send({ url: 'https://mendable.ai' });
|
||||
.send({ url: 'https://docs.firecrawl.dev' });
|
||||
|
||||
expect(firstResponse.statusCode).toBe(200);
|
||||
|
||||
|
@ -414,7 +414,7 @@ describe("E2E Tests for API Routes", () => {
|
|||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.set("x-idempotency-key", uniqueIdempotencyKey)
|
||||
.send({ url: 'https://mendable.ai' });
|
||||
.send({ url: 'https://docs.firecrawl.dev' });
|
||||
|
||||
expect(secondResponse.statusCode).toBe(409);
|
||||
expect(secondResponse.body.error).toBe('Idempotency key already used');
|
||||
|
|
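The idempotency test above shows the contract: reusing the same x-idempotency-key on a second crawl request is rejected with 409 and the error 'Idempotency key already used'. A minimal client-side sketch of that behavior (endpoint and key generation are illustrative):

import { randomUUID } from "node:crypto";

// First request with a fresh key succeeds; repeating the key yields 409.
const idempotencyKey = randomUUID();
const res = await fetch("http://localhost:3002/v0/crawl", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.TEST_API_KEY}`,
    "Content-Type": "application/json",
    "x-idempotency-key": idempotencyKey,
  },
  body: JSON.stringify({ url: "https://docs.firecrawl.dev" }),
});
console.log(res.status); // 200 on first use, 409 if the key was already used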
961
apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
Normal file
|
@ -0,0 +1,961 @@
|
|||
import request from "supertest";
|
||||
import { configDotenv } from "dotenv";
|
||||
import {
|
||||
ScrapeRequest,
|
||||
ScrapeResponseRequestTest,
|
||||
} from "../../controllers/v1/types";
|
||||
|
||||
configDotenv();
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe("E2E Tests for v1 API Routes", () => {
|
||||
beforeAll(() => {
|
||||
process.env.USE_DB_AUTHENTICATION = "true";
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
delete process.env.USE_DB_AUTHENTICATION;
|
||||
});
|
||||
|
||||
describe("GET /is-production", () => {
|
||||
it.concurrent("should return the production status", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
|
||||
"/is-production"
|
||||
);
|
||||
|
||||
console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION);
|
||||
console.log('?', process.env.USE_DB_AUTHENTICATION === 'true');
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
console.log('!!useDbAuthentication', !!useDbAuthentication);
|
||||
console.log('!useDbAuthentication', !useDbAuthentication);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("isProduction");
|
||||
});
|
||||
});
|
||||
|
||||
describe("POST /v1/scrape", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.send({ url: "https://firecrawl.dev"})
|
||||
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://facebook.com/fake-test",
|
||||
};
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(403);
|
||||
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).not.toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data.markdown).toContain("_Roast_");
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.description).toBe(
|
||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
);
|
||||
expect(response.body.data.metadata.keywords).toBe(
|
||||
"Roast My Website,Roast,Website,GitHub,Firecrawl"
|
||||
);
|
||||
expect(response.body.data.metadata.robots).toBe("follow, index");
|
||||
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.ogDescription).toBe(
|
||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
);
|
||||
expect(response.body.data.metadata.ogUrl).toBe(
|
||||
"https://www.roastmywebsite.ai"
|
||||
);
|
||||
expect(response.body.data.metadata.ogImage).toBe(
|
||||
"https://www.roastmywebsite.ai/og.png"
|
||||
);
|
||||
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.sourceURL).toBe(
|
||||
"https://roastmywebsite.ai"
|
||||
);
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and includeHtml set to true",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["markdown", "html"],
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("html");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.markdown).toContain("_Roast_");
|
||||
expect(response.body.data.html).toContain("<h1");
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
},
|
||||
30000
|
||||
);
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
||||
// formats: ["markdown", "html"],
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send(scrapeRequest);
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://arxiv.org/pdf/astro-ph/9301001"
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send(scrapeRequest);
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
}, 60000);
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://www.scrapethissite.com/",
|
||||
onlyMainContent: false // default is true
|
||||
};
|
||||
const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
expect(responseWithoutRemoveTags.statusCode).toBe(200);
|
||||
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
|
||||
|
||||
if (!("data" in responseWithoutRemoveTags.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
|
||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
|
||||
|
||||
const scrapeRequestWithRemoveTags: ScrapeRequest = {
|
||||
url: "https://www.scrapethissite.com/",
|
||||
excludeTags: ['.nav', '#footer', 'strong'],
|
||||
onlyMainContent: false // default is true
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequestWithRemoveTags);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
|
||||
expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
|
||||
}, 30000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/400' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(400);
|
||||
}, 60000);
|
||||
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/401' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(401);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 403 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/403' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(403);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
||||
      const response: ScrapeResponseRequestTest = await request(TEST_URL)
        .post('/v1/scrape')
        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
        .set('Content-Type', 'application/json')
        .send({ url: 'https://httpstat.us/404' });
      await new Promise((r) => setTimeout(r, 5000));

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty('data');
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      expect(response.body.data).toHaveProperty('markdown');
      expect(response.body.data).toHaveProperty('metadata');
      expect(response.body.data.metadata.statusCode).toBe(404);
    }, 60000);

    it.concurrent('should return a successful response for a scrape with 405 page', async () => {
      const response: ScrapeResponseRequestTest = await request(TEST_URL)
        .post('/v1/scrape')
        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
        .set('Content-Type', 'application/json')
        .send({ url: 'https://httpstat.us/405' });
      await new Promise((r) => setTimeout(r, 5000));

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty('data');
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      expect(response.body.data).toHaveProperty('markdown');
      expect(response.body.data).toHaveProperty('metadata');
      expect(response.body.data.metadata.statusCode).toBe(405);
    }, 60000);

    it.concurrent('should return a successful response for a scrape with 500 page', async () => {
      const response: ScrapeResponseRequestTest = await request(TEST_URL)
        .post('/v1/scrape')
        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
        .set('Content-Type', 'application/json')
        .send({ url: 'https://httpstat.us/500' });
      await new Promise((r) => setTimeout(r, 5000));

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty('data');
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      expect(response.body.data).toHaveProperty('markdown');
      expect(response.body.data).toHaveProperty('metadata');
      expect(response.body.data.metadata.statusCode).toBe(500);
    }, 60000);

    it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
      const response: ScrapeResponseRequestTest = await request(TEST_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({ url: "https://firecrawl.dev", timeout: 1000 });

      expect(response.statusCode).toBe(408);
    }, 3000);

    it.concurrent(
      "should return a successful response with a valid API key and includeHtml set to true",
      async () => {
        const scrapeRequest: ScrapeRequest = {
          url: "https://roastmywebsite.ai",
          formats: ["html","rawHtml"],
        };

        const response: ScrapeResponseRequestTest = await request(TEST_URL)
          .post("/v1/scrape")
          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
          .set("Content-Type", "application/json")
          .send(scrapeRequest);

        expect(response.statusCode).toBe(200);
        expect(response.body).toHaveProperty("data");
        if (!("data" in response.body)) {
          throw new Error("Expected response body to have 'data' property");
        }
        expect(response.body.data).not.toHaveProperty("markdown");
        expect(response.body.data).toHaveProperty("html");
        expect(response.body.data).toHaveProperty("rawHtml");
        expect(response.body.data).toHaveProperty("metadata");
        expect(response.body.data.html).toContain("<h1");
        expect(response.body.data.rawHtml).toContain("<html");
        expect(response.body.data.metadata.statusCode).toBe(200);
        expect(response.body.data.metadata.error).toBeUndefined();
      },
      30000
    );

    it.concurrent(
      "should return a successful response with waitFor",
      async () => {
        const scrapeRequest: ScrapeRequest = {
          url: "https://ycombinator.com/companies",
          formats: ["markdown"],
          waitFor: 8000
        };

        const response: ScrapeResponseRequestTest = await request(TEST_URL)
          .post("/v1/scrape")
          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
          .set("Content-Type", "application/json")
          .send(scrapeRequest);

        expect(response.statusCode).toBe(200);
        expect(response.body).toHaveProperty("data");
        if (!("data" in response.body)) {
          throw new Error("Expected response body to have 'data' property");
        }
        expect(response.body.data).toHaveProperty("markdown");
        expect(response.body.data).not.toHaveProperty("html");
        expect(response.body.data).not.toHaveProperty("links");
        expect(response.body.data).not.toHaveProperty("rawHtml");
        expect(response.body.data).toHaveProperty("metadata");
        expect(response.body.data.markdown).toContain("PagerDuty");
        expect(response.body.data.metadata.statusCode).toBe(200);
        expect(response.body.data.metadata.error).toBeUndefined();

      },
      30000
    );

    it.concurrent(
      "should return a successful response with a valid links on page",
      async () => {
        const scrapeRequest: ScrapeRequest = {
          url: "https://roastmywebsite.ai",
          formats: ["links"],
        };

        const response: ScrapeResponseRequestTest = await request(TEST_URL)
          .post("/v1/scrape")
          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
          .set("Content-Type", "application/json")
          .send(scrapeRequest);

        expect(response.statusCode).toBe(200);
        expect(response.body).toHaveProperty("data");
        if (!("data" in response.body)) {
          throw new Error("Expected response body to have 'data' property");
        }
        expect(response.body.data).not.toHaveProperty("html");
        expect(response.body.data).not.toHaveProperty("rawHtml");
        expect(response.body.data).toHaveProperty("links");
        expect(response.body.data).toHaveProperty("metadata");
        expect(response.body.data.links).toContain("https://firecrawl.dev");
        expect(response.body.data.metadata.statusCode).toBe(200);
        expect(response.body.data.metadata.error).toBeUndefined();
      },
      30000
    );

  });

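The 4xx/5xx scrape tests above pin down a useful contract: POST /v1/scrape itself answers 200, and the status of the target page is surfaced in data.metadata.statusCode. The following is an illustrative sketch rather than code from this diff, showing the same call made outside the supertest harness; it assumes only what the tests assume (an API instance on http://127.0.0.1:3002 and a TEST_API_KEY environment variable), and the helper name scrapeOnce is invented for illustration.

// Sketch: exercise POST /v1/scrape the same way the tests do, using Node's global fetch.
const API_BASE = "http://127.0.0.1:3002"; // same address the tests use as TEST_URL

async function scrapeOnce(url: string) {
  const res = await fetch(`${API_BASE}/v1/scrape`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ url, formats: ["markdown"] }),
  });
  const body = await res.json();
  // As the 404/405/500 tests assert, the API call itself returns 200 and reports the
  // upstream page status in data.metadata.statusCode.
  return {
    apiStatus: res.status,
    pageStatus: body.data?.metadata?.statusCode,
    markdown: body.data?.markdown,
  };
}

scrapeOnce("https://httpstat.us/404").then(console.log).catch(console.error);
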
describe("POST /v1/map", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://roastmywebsite.ai"
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://usemotion.com",
|
||||
search: "pricing"
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("usemotion.com/pricing");
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: true
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
|
||||
const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
|
||||
expect(containsDocsFirecrawlDev).toBe(true);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://www.firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: true
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
|
||||
const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
|
||||
expect(containsDocsFirecrawlDev).toBe(true);
|
||||
}, 10000)
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://www.firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: false
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).not.toContain("docs.firecrawl.dev");
|
||||
})
|
||||
|
||||
it.concurrent("should return an error for invalid URL", async () => {
|
||||
const mapRequest = {
|
||||
url: "invalid-url",
|
||||
includeSubdomains: true,
|
||||
search: "test",
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(400);
|
||||
expect(response.body).toHaveProperty("success", false);
|
||||
expect(response.body).toHaveProperty("error");
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
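The map tests above exercise POST /v1/map with a url plus optional search and includeSubdomains fields and assert a { success, links } response. The sketch below is an illustrative client for that shape, under the same local-API and TEST_API_KEY assumptions as before; the MapResult interface and mapSite helper are invented names, not part of the repository.

// Sketch: map a site and optionally filter by a search term, mirroring the
// request/response shape asserted in the tests above.
const MAP_BASE = "http://127.0.0.1:3002";

interface MapResult {
  success: boolean;
  links: string[];
}

async function mapSite(url: string, search?: string, includeSubdomains = false): Promise<string[]> {
  const res = await fetch(`${MAP_BASE}/v1/map`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ url, search, includeSubdomains }),
  });
  if (!res.ok) throw new Error(`map failed with status ${res.status}`);
  const body = (await res.json()) as MapResult;
  return body.links;
}

mapSite("https://firecrawl.dev", "docs", true).then((links) => console.log(links.length, "links"));
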
describe("POST /v1/crawl", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://facebook.com/fake-test",
|
||||
};
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(403);
|
||||
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return a successful response", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("id");
|
||||
expect(response.body.id).toMatch(
|
||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
||||
);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("url");
|
||||
expect(response.body.url).toContain("/v1/crawl/");
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and valid includes option",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://firecrawl.dev",
|
||||
limit: 10,
|
||||
includePaths: ["blog/*"],
|
||||
});
|
||||
|
||||
let response;
|
||||
let isFinished = false;
|
||||
|
||||
while (!isFinished) {
|
||||
response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
isFinished = response.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(5);
|
||||
urls.forEach((url: string) => {
|
||||
expect(url).toContain("firecrawl.dev/blog");
|
||||
});
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||
},
|
||||
180000
|
||||
); // 180 seconds
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and valid excludes option",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://firecrawl.dev",
|
||||
limit: 10,
|
||||
excludePaths: ["blog/*"],
|
||||
});
|
||||
|
||||
let isFinished = false;
|
||||
let response;
|
||||
|
||||
while (!isFinished) {
|
||||
response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
isFinished = response.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(
|
||||
TEST_URL
|
||||
)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(3);
|
||||
urls.forEach((url: string) => {
|
||||
expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy();
|
||||
});
|
||||
},
|
||||
90000
|
||||
); // 90 seconds
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with max depth option for a valid crawl job",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://www.scrapethissite.com",
|
||||
maxDepth: 1,
|
||||
});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status);
|
||||
// wait for 60 seconds
|
||||
let isCompleted = false;
|
||||
while (!isCompleted) {
|
||||
const statusCheckResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(statusCheckResponse.statusCode).toBe(200);
|
||||
isCompleted = statusCheckResponse.body.status === "completed";
|
||||
if (!isCompleted) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
const completedResponse = await request(
|
||||
TEST_URL
|
||||
)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThanOrEqual(1);
|
||||
|
||||
// Check if all URLs have a maximum depth of 1
|
||||
urls.forEach((url: string) => {
|
||||
const pathSplits = new URL(url).pathname.split("/");
|
||||
const depth =
|
||||
pathSplits.length -
|
||||
(pathSplits[0].length === 0 &&
|
||||
pathSplits[pathSplits.length - 1].length === 0
|
||||
? 1
|
||||
: 0);
|
||||
expect(depth).toBeLessThanOrEqual(2);
|
||||
});
|
||||
},
|
||||
180000
|
||||
);
|
||||
})
|
||||
|
||||
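The crawl tests above all follow the same pattern: POST /v1/crawl returns a job id, and the caller polls GET /v1/crawl/:id once per second until status becomes "completed", after which data holds the scraped documents. A compact sketch of that loop is shown below, assuming the same local API and TEST_API_KEY the tests use; the crawlAndWait helper is an illustrative name, not repository code.

// Sketch of the start-then-poll pattern the crawl tests exercise.
const CRAWL_BASE = "http://127.0.0.1:3002";
const AUTH = { Authorization: `Bearer ${process.env.TEST_API_KEY}` };

async function crawlAndWait(url: string, limit = 10): Promise<any[]> {
  const start = await fetch(`${CRAWL_BASE}/v1/crawl`, {
    method: "POST",
    headers: { ...AUTH, "Content-Type": "application/json" },
    body: JSON.stringify({ url, limit }),
  });
  const { id } = await start.json(); // the tests assert this id is a UUID

  // Poll the status endpoint until the job reports "completed".
  while (true) {
    const statusRes = await fetch(`${CRAWL_BASE}/v1/crawl/${id}`, { headers: AUTH });
    const status = await statusRes.json();
    if (status.status === "completed") {
      return status.data; // array of documents with markdown + metadata, per the tests
    }
    await new Promise((r) => setTimeout(r, 1000)); // same 1-second backoff as the tests
  }
}

crawlAndWait("https://docs.firecrawl.dev").then((docs) => console.log(docs.length, "pages"));
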
describe("GET /v1/crawl/:jobId", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response = await request(TEST_URL).get("/v1/crawl/123");
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.get("/v1/crawl/123")
|
||||
.set("Authorization", `Bearer invalid-api-key`);
|
||||
expect(response.statusCode).toBe(401);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return Job not found for invalid job ID",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.get("/v1/crawl/invalidJobId")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(404);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful crawl status response for a valid crawl job",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://docs.firecrawl.dev" });
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
|
||||
while (!isCompleted) {
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
|
||||
if (response.body.status === "completed") {
|
||||
isCompleted = true;
|
||||
} else {
|
||||
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(
|
||||
completedResponse.body.data[0].metadata.error
|
||||
).toBeUndefined();
|
||||
|
||||
const childrenLinks = completedResponse.body.data.filter(
|
||||
(doc) =>
|
||||
doc.metadata &&
|
||||
doc.metadata.sourceURL
|
||||
);
|
||||
|
||||
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||
},
|
||||
180000
|
||||
); // 120 seconds
|
||||
|
||||
it.concurrent(
|
||||
"If someone cancels a crawl job, it should turn into failed status",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://docs.tatum.io", limit: 200 });
|
||||
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
|
||||
const responseCancel = await request(TEST_URL)
|
||||
.delete(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(responseCancel.statusCode).toBe(200);
|
||||
expect(responseCancel.body).toHaveProperty("status");
|
||||
expect(responseCancel.body.status).toBe("cancelled");
|
||||
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("cancelled");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
})
|
||||
});
|
|
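The cancellation test above shows that DELETE /v1/crawl/:id stops a running job and that both the delete response and later status checks report "cancelled". A minimal sketch of that call, under the same assumptions as the previous sketches (local API, TEST_API_KEY, invented helper name):

// Sketch: cancel a running crawl job by id.
async function cancelCrawl(id: string): Promise<string> {
  const res = await fetch(`http://127.0.0.1:3002/v1/crawl/${id}`, {
    method: "DELETE",
    headers: { Authorization: `Bearer ${process.env.TEST_API_KEY}` },
  });
  const body = await res.json();
  return body.status; // expected to be "cancelled", per the test above
}
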
@ -1,11 +1,15 @@
import request from "supertest";
import dotenv from "dotenv";
import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, FirecrawlScrapeResponse } from "../../types";
import {
  FirecrawlCrawlResponse,
  FirecrawlCrawlStatusResponse,
  FirecrawlScrapeResponse,
} from "../../types";

dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for API Routes", () => {
describe("E2E Tests for v0 API Routes", () => {
  beforeAll(() => {
    process.env.USE_DB_AUTHENTICATION = "true";
  });
@ -24,20 +28,27 @@ describe("E2E Tests for API Routes", () => {
|
|||
|
||||
describe("POST /v0/scrape", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL).post("/v0/scrape");
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL).post(
|
||||
"/v0/scrape"
|
||||
);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key", async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
|
@ -52,21 +63,36 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.body.data.content).toContain("_Roast_");
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
||||
expect(response.body.data.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
|
||||
expect(response.body.data.metadata.description).toBe(
|
||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
);
|
||||
expect(response.body.data.metadata.keywords).toBe(
|
||||
"Roast My Website,Roast,Website,GitHub,Firecrawl"
|
||||
);
|
||||
expect(response.body.data.metadata.robots).toBe("follow, index");
|
||||
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
||||
expect(response.body.data.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
|
||||
expect(response.body.data.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
|
||||
expect(response.body.data.metadata.ogDescription).toBe(
|
||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
);
|
||||
expect(response.body.data.metadata.ogUrl).toBe(
|
||||
"https://www.roastmywebsite.ai"
|
||||
);
|
||||
expect(response.body.data.metadata.ogImage).toBe(
|
||||
"https://www.roastmywebsite.ai/og.png"
|
||||
);
|
||||
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.sourceURL).toBe("https://roastmywebsite.ai");
|
||||
expect(response.body.data.metadata.sourceURL).toBe(
|
||||
"https://roastmywebsite.ai"
|
||||
);
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
}, 30000); // 30 seconds timeout
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and includeHtml set to true",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
|
@ -86,44 +112,61 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.body.data.html).toContain("<h1");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 30000); // 30 seconds timeout
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response for a valid scrape with PDF file",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.content).toContain(
|
||||
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
||||
);
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://arxiv.org/pdf/astro-ph/9301001" });
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.content).toContain(
|
||||
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
||||
);
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
||||
const responseWithoutRemoveTags: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key with removeTags option",
|
||||
async () => {
|
||||
const responseWithoutRemoveTags: FirecrawlScrapeResponse =
|
||||
await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
|
@ -134,16 +177,27 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||
"Scrape This Site"
|
||||
);
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||
"Lessons and Videos"
|
||||
); // #footer
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||
"[Sandbox]("
|
||||
); // .nav
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||
"web scraping"
|
||||
); // strong
|
||||
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
|
||||
.send({
|
||||
url: "https://www.scrapethissite.com/",
|
||||
pageOptions: { removeTags: [".nav", "#footer", "strong"] },
|
||||
});
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
|
@ -154,118 +208,157 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
|
||||
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
|
||||
expect(response.body.data.content).not.toContain("web scraping"); // strong
|
||||
}, 30000); // 30 seconds timeout
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response for a scrape with 400 page",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/400' });
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://httpstat.us/400" });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(400);
|
||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
|
||||
}, 60000); // 60 seconds
|
||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||
"bad request"
|
||||
);
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response for a scrape with 401 page",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/401' });
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://httpstat.us/401" });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(401);
|
||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
|
||||
}, 60000); // 60 seconds
|
||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||
"unauthorized"
|
||||
);
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
|
||||
it.concurrent("should return a successful response for a scrape with 403 page", async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response for a scrape with 403 page",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/403' });
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://httpstat.us/403" });
|
||||
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(403);
|
||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
|
||||
}, 60000); // 60 seconds
|
||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||
"forbidden"
|
||||
);
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response for a scrape with 404 page",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/404' });
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://httpstat.us/404" });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(404);
|
||||
}, 60000); // 60 seconds
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 405 page', async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response for a scrape with 405 page",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/405' });
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://httpstat.us/405" });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(405);
|
||||
}, 60000); // 60 seconds
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 500 page', async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response for a scrape with 500 page",
|
||||
async () => {
|
||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/500' });
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://httpstat.us/500" });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.metadata.pageStatusCode).toBe(500);
|
||||
}, 60000); // 60 seconds
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
});
|
||||
|
||||
describe("POST /v0/crawl", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawl");
|
||||
const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
|
||||
"/v0/crawl"
|
||||
);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response: FirecrawlCrawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key for crawl", async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key for crawl",
|
||||
async () => {
|
||||
const response: FirecrawlCrawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
|
@ -276,9 +369,12 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.body.jobId).toMatch(
|
||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
||||
);
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and valid includes option", async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and valid includes option",
|
||||
async () => {
|
||||
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
|
@ -329,11 +425,19 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
}, 180000); // 180 seconds
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||
200
|
||||
);
|
||||
expect(
|
||||
completedResponse.body.data[0].metadata.pageError
|
||||
).toBeUndefined();
|
||||
},
|
||||
180000
|
||||
); // 180 seconds
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and valid excludes option",
|
||||
async () => {
|
||||
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
|
@ -364,7 +468,9 @@ describe("E2E Tests for API Routes", () => {
|
|||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL)
|
||||
const completedResponse: FirecrawlCrawlStatusResponse = await request(
|
||||
TEST_URL
|
||||
)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
|
@ -375,9 +481,13 @@ describe("E2E Tests for API Routes", () => {
|
|||
urls.forEach((url: string) => {
|
||||
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
|
||||
});
|
||||
}, 90000); // 90 seconds
|
||||
},
|
||||
90000
|
||||
); // 90 seconds
|
||||
|
||||
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response with max depth option for a valid crawl job",
|
||||
async () => {
|
||||
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
|
@ -406,7 +516,9 @@ describe("E2E Tests for API Routes", () => {
|
|||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL)
|
||||
const completedResponse: FirecrawlCrawlStatusResponse = await request(
|
||||
TEST_URL
|
||||
)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
|
@ -417,38 +529,56 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||
200
|
||||
);
|
||||
expect(
|
||||
completedResponse.body.data[0].metadata.pageError
|
||||
).toBeUndefined();
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(1);
|
||||
expect(urls.length).toBeGreaterThanOrEqual(1);
|
||||
|
||||
// Check if all URLs have a maximum depth of 1
|
||||
urls.forEach((url: string) => {
|
||||
const pathSplits = new URL(url).pathname.split('/');
|
||||
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
|
||||
const pathSplits = new URL(url).pathname.split("/");
|
||||
const depth =
|
||||
pathSplits.length -
|
||||
(pathSplits[0].length === 0 &&
|
||||
pathSplits[pathSplits.length - 1].length === 0
|
||||
? 1
|
||||
: 0);
|
||||
expect(depth).toBeLessThanOrEqual(2);
|
||||
});
|
||||
}, 180000);
|
||||
},
|
||||
180000
|
||||
);
|
||||
});
|
||||
|
||||
describe("POST /v0/crawlWebsitePreview", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawlWebsitePreview");
|
||||
const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
|
||||
"/v0/crawlWebsitePreview"
|
||||
);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response: FirecrawlCrawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawlWebsitePreview")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
|
||||
it.concurrent(
|
||||
"should return a timeout error when scraping takes longer than the specified timeout",
|
||||
async () => {
|
||||
const response: FirecrawlCrawlResponse = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
|
@ -456,7 +586,9 @@ describe("E2E Tests for API Routes", () => {
|
|||
.send({ url: "https://firecrawl.dev", timeout: 1000 });
|
||||
|
||||
expect(response.statusCode).toBe(408);
|
||||
}, 3000);
|
||||
},
|
||||
3000
|
||||
);
|
||||
});
|
||||
|
||||
describe("POST /v0/search", () => {
|
||||
|
@ -465,16 +597,21 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/search")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ query: "test" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key for search", async () => {
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key for search",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/search")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
|
@ -484,7 +621,9 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.body).toHaveProperty("success");
|
||||
expect(response.body.success).toBe(true);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
}, 60000); // 60 seconds timeout
|
||||
},
|
||||
60000
|
||||
); // 60 seconds timeout
|
||||
});
|
||||
|
||||
describe("GET /v0/crawl/status/:jobId", () => {
|
||||
|
@ -493,26 +632,34 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.get("/v0/crawl/status/123")
|
||||
.set("Authorization", `Bearer invalid-api-key`);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return Job not found for invalid job ID", async () => {
|
||||
it.concurrent(
|
||||
"should return Job not found for invalid job ID",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.get("/v0/crawl/status/invalidJobId")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(404);
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return a successful crawl status response for a valid crawl job", async () => {
|
||||
it.concurrent(
|
||||
"should return a successful crawl status response for a valid crawl job",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://mendable.ai/blog" });
|
||||
.send({ url: "https://firecrawl.dev/blog" });
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
|
@ -542,16 +689,23 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].content).toContain("Firecrawl");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
expect(
|
||||
completedResponse.body.data[0].metadata.pageError
|
||||
).toBeUndefined();
|
||||
|
||||
const childrenLinks = completedResponse.body.data.filter(doc =>
|
||||
doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
|
||||
const childrenLinks = completedResponse.body.data.filter(
|
||||
(doc) =>
|
||||
doc.metadata &&
|
||||
doc.metadata.sourceURL &&
|
||||
doc.metadata.sourceURL.includes("firecrawl.dev/blog")
|
||||
);
|
||||
|
||||
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||
}, 180000); // 120 seconds
|
||||
},
|
||||
180000
|
||||
); // 120 seconds
|
||||
|
||||
// TODO: review the test below
|
||||
// it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => {
|
||||
|
@ -599,16 +753,18 @@ describe("E2E Tests for API Routes", () => {
|
|||
// expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
// }, 180000); // 120 seconds
|
||||
|
||||
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
|
||||
it.concurrent(
|
||||
"If someone cancels a crawl job, it should turn into failed status",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://jestjs.io" });
|
||||
.send({ url: "https://docs.tatum.io", crawlerOptions: { limit: 200 } });
|
||||
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
await new Promise((r) => setTimeout(r, 20000));
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
|
||||
const responseCancel = await request(TEST_URL)
|
||||
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
|
||||
|
@ -628,22 +784,39 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(completedResponse.body).toHaveProperty("data");
|
||||
|
||||
let isNullOrEmptyArray = false;
|
||||
if (completedResponse.body.data === null || completedResponse.body.data.length === 0) {
|
||||
if (
|
||||
completedResponse.body.data === null ||
|
||||
completedResponse.body.data.length === 0
|
||||
) {
|
||||
isNullOrEmptyArray = true;
|
||||
}
|
||||
expect(isNullOrEmptyArray).toBe(true);
|
||||
expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
|
||||
expect(completedResponse.body).toHaveProperty("partial_data");
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty(
|
||||
"content"
|
||||
);
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty(
|
||||
"markdown"
|
||||
);
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty(
|
||||
"metadata"
|
||||
);
|
||||
expect(
|
||||
completedResponse.body.partial_data[0].metadata.pageStatusCode
|
||||
).toBe(200);
|
||||
expect(
|
||||
completedResponse.body.partial_data[0].metadata.pageError
|
||||
).toBeUndefined();
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
});
|
||||
|
||||
describe("POST /v0/scrape with LLM Extraction", () => {
|
||||
it.concurrent("should extract data using LLM extraction mode", async () => {
|
||||
it.concurrent(
|
||||
"should extract data using LLM extraction mode",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
|
@ -690,6 +863,8 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(llmExtraction).toHaveProperty("is_open_source");
|
||||
expect(llmExtraction.is_open_source).toBe(false);
|
||||
expect(typeof llmExtraction.is_open_source).toBe("boolean");
|
||||
}, 60000); // 60 secs
|
||||
},
|
||||
60000
|
||||
); // 60 secs
|
||||
});
|
||||
});
|
||||
|
|
|
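The status checks in the tests above all poll GET /v0/crawl/status/:jobId until the crawl reports completion. A minimal sketch of that polling loop, assuming the same supertest request helper, TEST_URL, and TEST_API_KEY already used by the suite (not part of the diff):

import request from "supertest";

// TEST_URL is assumed to be in scope, as it is in the test file above.
async function waitForCrawlCompletion(jobId: string, timeoutMs = 180000) {
  const start = Date.now();
  while (Date.now() - start < timeoutMs) {
    const response = await request(TEST_URL)
      .get(`/v0/crawl/status/${jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    if (response.body.status === "completed") return response;
    await new Promise((r) => setTimeout(r, 1000)); // poll once per second
  }
  throw new Error(`Crawl ${jobId} did not complete within ${timeoutMs}ms`);
}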
@ -1,4 +1,4 @@
|
|||
import { crawlController } from '../crawl'
|
||||
import { crawlController } from '../v0/crawl'
|
||||
import { Request, Response } from 'express';
|
||||
import { authenticateUser } from '../auth'; // Ensure this import is correct
|
||||
import { createIdempotencyKey } from '../../services/idempotency/create';
|
||||
|
|
|
@ -1,22 +1,36 @@
|
|||
import { parseApi } from "../../src/lib/parseApi";
|
||||
import { getRateLimiter } from "../../src/services/rate-limiter";
|
||||
import { parseApi } from "../lib/parseApi";
|
||||
import { getRateLimiter } from "../services/rate-limiter";
|
||||
import {
|
||||
AuthResponse,
|
||||
NotificationType,
|
||||
PlanType,
|
||||
RateLimiterMode,
|
||||
} from "../../src/types";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { withAuth } from "../../src/lib/withAuth";
|
||||
} from "../types";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { withAuth } from "../lib/withAuth";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
|
||||
import { sendNotification } from "../services/notification/email_notification";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { redlock } from "../../src/services/redlock";
|
||||
import { getValue } from "../../src/services/redis";
|
||||
import { setValue } from "../../src/services/redis";
|
||||
import { redlock } from "../services/redlock";
|
||||
import { getValue } from "../services/redis";
|
||||
import { setValue } from "../services/redis";
|
||||
import { validate } from "uuid";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
// const { data, error } = await supabase_service
|
||||
// .from('api_keys')
|
||||
// .select(`
|
||||
// key,
|
||||
// team_id,
|
||||
// teams (
|
||||
// subscriptions (
|
||||
// price_id
|
||||
// )
|
||||
// )
|
||||
// `)
|
||||
// .eq('key', normalizedApi)
|
||||
// .limit(1)
|
||||
// .single();
|
||||
function normalizedApiIsUuid(potentialUuid: string): boolean {
|
||||
// Check if the string is a valid UUID
|
||||
return validate(potentialUuid);
|
||||
|
@ -88,9 +102,10 @@ export async function supaAuthenticateUser(
|
|||
team_id?: string;
|
||||
error?: string;
|
||||
status?: number;
|
||||
plan?: string;
|
||||
plan?: PlanType;
|
||||
}> {
|
||||
const authHeader = req.headers.authorization;
|
||||
|
||||
const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] ? `Bearer ${req.headers["sec-websocket-protocol"]}` : null);
|
||||
if (!authHeader) {
|
||||
return { success: false, error: "Unauthorized", status: 401 };
|
||||
}
|
||||
|
@ -118,7 +133,11 @@ export async function supaAuthenticateUser(
|
|||
let priceId: string | null = null;
|
||||
|
||||
if (token == "this_is_just_a_preview_token") {
|
||||
if (mode == RateLimiterMode.CrawlStatus) {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
} else {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
}
|
||||
teamId = "preview";
|
||||
} else {
|
||||
normalizedApi = parseApi(token);
|
||||
|
@ -154,7 +173,7 @@ export async function supaAuthenticateUser(
|
|||
await setValue(
|
||||
cacheKey,
|
||||
JSON.stringify({ team_id: teamId, price_id: priceId }),
|
||||
10
|
||||
60
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
|
@ -233,6 +252,13 @@ export async function supaAuthenticateUser(
|
|||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.Map:
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Map,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.CrawlStatus:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
break;
|
||||
|
@ -285,6 +311,9 @@ export async function supaAuthenticateUser(
|
|||
token === "this_is_just_a_preview_token" &&
|
||||
(mode === RateLimiterMode.Scrape ||
|
||||
mode === RateLimiterMode.Preview ||
|
||||
mode === RateLimiterMode.Map ||
|
||||
mode === RateLimiterMode.Crawl ||
|
||||
mode === RateLimiterMode.CrawlStatus ||
|
||||
mode === RateLimiterMode.Search)
|
||||
) {
|
||||
return { success: true, team_id: "preview" };
|
||||
|
@ -327,10 +356,10 @@ export async function supaAuthenticateUser(
|
|||
return {
|
||||
success: true,
|
||||
team_id: subscriptionData.team_id,
|
||||
plan: subscriptionData.plan ?? "",
|
||||
plan: (subscriptionData.plan ?? "") as PlanType,
|
||||
};
|
||||
}
|
||||
function getPlanByPriceId(price_id: string) {
|
||||
function getPlanByPriceId(price_id: string): PlanType {
|
||||
switch (price_id) {
|
||||
case process.env.STRIPE_PRICE_ID_STARTER:
|
||||
return "starter";
|
||||
|
|
|
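One behavioral change in the auth hunk above is the team/price cache TTL going from 10 to 60 seconds. A hedged sketch of that cache read/write, using the getValue/setValue helpers imported at the top of auth.ts; the key construction is simplified here and getValue returning a string-or-null is an assumption:

import { getValue, setValue } from "../services/redis";

async function readCachedTeam(cacheKey: string): Promise<{ team_id: string; price_id: string | null } | null> {
  const cached = await getValue(cacheKey); // assumed to return the stored string or null
  return cached ? JSON.parse(cached) : null;
}

async function writeCachedTeam(cacheKey: string, teamId: string, priceId: string | null): Promise<void> {
  // 60-second TTL, matching the value this diff introduces (previously 10 seconds)
  await setValue(cacheKey, JSON.stringify({ team_id: teamId, price_id: priceId }), 60);
}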
@ -1,231 +0,0 @@
|
|||
import { ExtractorOptions, PageOptions } from './../lib/entities';
|
||||
import { Request, Response } from "express";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { Document } from "../lib/entities";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
||||
import { addScrapeJob } from '../services/queue-jobs';
|
||||
import { getScrapeQueue } from '../services/queue-service';
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from '../lib/logger';
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions: ExtractorOptions,
|
||||
timeout: number,
|
||||
plan?: string
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
data?: Document;
|
||||
returnCode: number;
|
||||
}> {
|
||||
const url = req.body.url;
|
||||
if (!url) {
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||
}
|
||||
|
||||
const job = await addScrapeJob({
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
team_id,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
}, {}, jobId);
|
||||
|
||||
let doc;
|
||||
|
||||
const err = await Sentry.startSpan({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => {
|
||||
try {
|
||||
doc = (await new Promise((resolve, reject) => {
|
||||
const start = Date.now();
|
||||
const int = setInterval(async () => {
|
||||
if (Date.now() >= start + timeout) {
|
||||
clearInterval(int);
|
||||
reject(new Error("Job wait "));
|
||||
} else {
|
||||
const state = await job.getState();
|
||||
if (state === "completed") {
|
||||
clearInterval(int);
|
||||
resolve((await getScrapeQueue().getJob(job.id)).returnvalue);
|
||||
} else if (state === "failed") {
|
||||
clearInterval(int);
|
||||
reject((await getScrapeQueue().getJob(job.id)).failedReason);
|
||||
}
|
||||
}
|
||||
}, 1000);
|
||||
}))[0]
|
||||
} catch (e) {
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
span.setAttribute("timedOut", true);
|
||||
return {
|
||||
success: false,
|
||||
error: "Request timed out",
|
||||
returnCode: 408,
|
||||
}
|
||||
} else if (typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function") || e.includes("LLM extraction did not match the extraction schema you provided."))) {
|
||||
return {
|
||||
success: false,
|
||||
error: e,
|
||||
returnCode: 500,
|
||||
};
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
span.setAttribute("result", JSON.stringify(doc));
|
||||
return null;
|
||||
});
|
||||
|
||||
if (err !== null) {
|
||||
return err;
|
||||
}
|
||||
|
||||
await job.remove();
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return { success: true, error: "No page found", returnCode: 200, data: doc };
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: doc,
|
||||
returnCode: 200,
|
||||
};
|
||||
}
|
||||
|
||||
export async function scrapeController(req: Request, res: Response) {
|
||||
try {
|
||||
let earlyReturn = false;
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Scrape
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
|
||||
const origin = req.body.origin ?? defaultOrigin;
|
||||
let timeout = req.body.timeout ?? defaultTimeout;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
if (typeof extractorOptions.extractionSchema !== "object" || extractorOptions.extractionSchema === null) {
|
||||
return res.status(400).json({ error: "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified" });
|
||||
}
|
||||
|
||||
pageOptions.onlyMainContent = true;
|
||||
timeout = req.body.timeout ?? 90000;
|
||||
}
|
||||
|
||||
// checkCredits
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
earlyReturn = true;
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
earlyReturn = true;
|
||||
return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
|
||||
}
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const result = await scrapeHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout,
|
||||
plan
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
|
||||
|
||||
if (result.success) {
|
||||
let creditsToBeBilled = 0; // billing for doc done on queue end
|
||||
const creditsPerLLMExtract = 50;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
creditsToBeBilled += creditsPerLLMExtract;
|
||||
}
|
||||
|
||||
let startTimeBilling = new Date().getTime();
|
||||
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
const billingResult = await billTeam(
|
||||
team_id,
|
||||
creditsToBeBilled
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error: "Failed to bill team. Insufficient credits or subscription not found.",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: 1,
|
||||
docs: [result.data],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: typeof error === "string" ? error : (error?.message ?? "Internal Server Error") });
|
||||
}
|
||||
}
|
|
@ -1,11 +1,10 @@
|
|||
import { Request, Response } from "express";
|
||||
|
||||
import { Job } from "bullmq";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { checkAlerts } from "../../services/alerts";
|
||||
import { exec } from "node:child_process";
|
||||
import { sendSlackWebhook } from "../../services/alerts/slack";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { getScrapeQueue } from "../../../services/queue-service";
|
||||
import { checkAlerts } from "../../../services/alerts";
|
||||
import { sendSlackWebhook } from "../../../services/alerts/slack";
|
||||
|
||||
export async function cleanBefore24hCompleteJobsController(
|
||||
req: Request,
|
||||
|
@ -94,7 +93,8 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
|
||||
const [webScraperActive, webScraperWaiting, webScraperPriority] = await Promise.all([
|
||||
const [webScraperActive, webScraperWaiting, webScraperPriority] =
|
||||
await Promise.all([
|
||||
scrapeQueue.getActiveCount(),
|
||||
scrapeQueue.getWaitingCount(),
|
||||
scrapeQueue.getPrioritizedCount(),
|
||||
|
@ -103,17 +103,24 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
|
||||
|
||||
// get number of machines active
|
||||
const request = await fetch('https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines',
|
||||
const request = await fetch(
|
||||
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
|
||||
{
|
||||
headers: {
|
||||
'Authorization': `Bearer ${process.env.FLY_API_TOKEN}`
|
||||
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
|
||||
},
|
||||
}
|
||||
}
|
||||
)
|
||||
);
|
||||
const machines = await request.json();
|
||||
|
||||
// Only worker machines
|
||||
const activeMachines = machines.filter(machine => (machine.state === 'started' || machine.state === "starting" || machine.state === "replacing") && machine.config.env["FLY_PROCESS_GROUP"] === "worker").length;
|
||||
const activeMachines = machines.filter(
|
||||
(machine) =>
|
||||
(machine.state === "started" ||
|
||||
machine.state === "starting" ||
|
||||
machine.state === "replacing") &&
|
||||
machine.config.env["FLY_PROCESS_GROUP"] === "worker"
|
||||
).length;
|
||||
|
||||
let targetMachineCount = activeMachines;
|
||||
|
||||
|
@ -123,29 +130,57 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||
|
||||
// Scale up logic
|
||||
if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
|
||||
targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 3));
|
||||
targetMachineCount = Math.min(
|
||||
maxNumberOfMachines,
|
||||
activeMachines + baseScaleUp * 3
|
||||
);
|
||||
} else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
|
||||
targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 2));
|
||||
targetMachineCount = Math.min(
|
||||
maxNumberOfMachines,
|
||||
activeMachines + baseScaleUp * 2
|
||||
);
|
||||
} else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
|
||||
targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + baseScaleUp);
|
||||
targetMachineCount = Math.min(
|
||||
maxNumberOfMachines,
|
||||
activeMachines + baseScaleUp
|
||||
);
|
||||
}
|
||||
|
||||
// Scale down logic
|
||||
if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
|
||||
targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 3));
|
||||
targetMachineCount = Math.max(
|
||||
minNumberOfMachines,
|
||||
activeMachines - baseScaleDown * 3
|
||||
);
|
||||
} else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
|
||||
targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 2));
|
||||
targetMachineCount = Math.max(
|
||||
minNumberOfMachines,
|
||||
activeMachines - baseScaleDown * 2
|
||||
);
|
||||
} else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
|
||||
targetMachineCount = Math.max(minNumberOfMachines, activeMachines - baseScaleDown);
|
||||
targetMachineCount = Math.max(
|
||||
minNumberOfMachines,
|
||||
activeMachines - baseScaleDown
|
||||
);
|
||||
}
|
||||
|
||||
if (targetMachineCount !== activeMachines) {
|
||||
Logger.info(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`);
|
||||
Logger.info(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
|
||||
);
|
||||
|
||||
if (targetMachineCount > activeMachines) {
|
||||
sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? "");
|
||||
sendSlackWebhook(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
||||
false,
|
||||
process.env.SLACK_AUTOSCALER ?? ""
|
||||
);
|
||||
} else {
|
||||
sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? "");
|
||||
sendSlackWebhook(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
||||
false,
|
||||
process.env.SLACK_AUTOSCALER ?? ""
|
||||
);
|
||||
}
|
||||
return res.status(200).json({
|
||||
mode: "scale-descale",
|
|
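The scale-up and scale-down thresholds above are easier to read side by side. This is only an illustrative restatement of the same logic as a pure function; the controller keeps it inline:

function computeTargetMachineCount(
  activeJobs: number,          // webScraperActive
  waitingAndPriority: number,  // webScraperWaiting + webScraperPriority
  activeMachines: number,
  minMachines: number,
  maxMachines: number,
  baseScaleUp: number,
  baseScaleDown: number
): number {
  // Scale up
  if (activeJobs > 9000 || waitingAndPriority > 2000) return Math.min(maxMachines, activeMachines + baseScaleUp * 3);
  if (activeJobs > 5000 || waitingAndPriority > 1000) return Math.min(maxMachines, activeMachines + baseScaleUp * 2);
  if (activeJobs > 1000 || waitingAndPriority > 500) return Math.min(maxMachines, activeMachines + baseScaleUp);
  // Scale down
  if (activeJobs < 100 && waitingAndPriority < 50) return Math.max(minMachines, activeMachines - baseScaleDown * 3);
  if (activeJobs < 500 && waitingAndPriority < 200) return Math.max(minMachines, activeMachines - baseScaleDown * 2);
  if (activeJobs < 1000 && waitingAndPriority < 500) return Math.max(minMachines, activeMachines - baseScaleDown);
  return activeMachines; // no change
}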
@ -1,7 +1,7 @@
|
|||
import { Request, Response } from "express";
|
||||
import Redis from "ioredis";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { redisRateLimitClient } from "../../services/rate-limiter";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { redisRateLimitClient } from "../../../services/rate-limiter";
|
||||
|
||||
export async function redisHealthController(req: Request, res: Response) {
|
||||
const retryOperation = async (operation, retries = 3) => {
|
60
apps/api/src/controllers/v0/crawl-cancel.ts
Normal file
|
@ -0,0 +1,60 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { supabase_service } from "../../../src/services/supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.CrawlStatus
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
}
|
||||
|
||||
// check if the job belongs to the team
|
||||
if (useDbAuthentication) {
|
||||
const { data, error: supaError } = await supabase_service
|
||||
.from("bulljobs_teams")
|
||||
.select("*")
|
||||
.eq("job_id", req.params.jobId)
|
||||
.eq("team_id", team_id);
|
||||
if (supaError) {
|
||||
return res.status(500).json({ error: supaError.message });
|
||||
}
|
||||
|
||||
if (data.length === 0) {
|
||||
return res.status(403).json({ error: "Unauthorized" });
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
sc.cancelled = true;
|
||||
await saveCrawl(req.params.jobId, sc);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
}
|
||||
|
||||
res.json({
|
||||
status: "cancelled"
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
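For reference, the new v0 cancel controller above is exercised by the E2E suite earlier in this diff roughly as follows (hedged sketch; the jobId comes from the POST /v0/crawl response):

import request from "supertest";

async function cancelCrawl(baseUrl: string, jobId: string, apiKey: string) {
  const response = await request(baseUrl)
    .delete(`/v0/crawl/cancel/${jobId}`)
    .set("Authorization", `Bearer ${apiKey}`);
  // On success the controller marks the stored crawl as cancelled in Redis and returns { status: "cancelled" }.
  return response.body;
}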
|
@ -1,17 +1,19 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { getScrapeQueue } from "../../src/services/queue-service";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
|
||||
import { supabaseGetJobsById } from "../../src/lib/supabase-jobs";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function getJobs(ids: string[]) {
|
||||
export async function getJobs(crawlId: string, ids: string[]) {
|
||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobsById(ids);
|
||||
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
|
||||
|
||||
supabaseData.forEach(x => {
|
||||
const job = jobs.find(y => y.id === x.job_id);
|
||||
|
@ -50,12 +52,25 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
|
||||
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
|
||||
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
|
||||
|
||||
const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
|
||||
|
||||
if (
|
||||
jobs.length > 0 &&
|
||||
jobs[0].data &&
|
||||
jobs[0].data.pageOptions &&
|
||||
!jobs[0].data.pageOptions.includeRawHtml
|
||||
) {
|
||||
data.forEach(item => {
|
||||
if (item) {
|
||||
delete item.rawHtml;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
res.json({
|
||||
status: jobStatus,
|
||||
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
|
|
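The crawl status above is derived from the per-job states with a nested ternary. The same rule, spelled out as a small helper purely for readability (not part of the diff):

function deriveCrawlStatus(
  cancelled: boolean,
  jobStates: string[]
): "failed" | "completed" | "active" {
  if (cancelled) return "failed";
  if (jobStates.every((state) => state === "completed")) return "completed";
  if (jobStates.some((state) => state === "failed")) return "failed";
  return "active";
}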
@ -1,35 +1,24 @@
|
|||
import { Request, Response } from "express";
|
||||
import { checkTeamCredits } from "../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addScrapeJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||
import {
|
||||
defaultCrawlPageOptions,
|
||||
defaultCrawlerOptions,
|
||||
defaultOrigin,
|
||||
} from "../../src/lib/default-values";
|
||||
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import {
|
||||
addCrawlJob,
|
||||
addCrawlJobs,
|
||||
crawlToCrawler,
|
||||
lockURL,
|
||||
lockURLs,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
} from "../../src/lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../src/services/queue-service";
|
||||
import { checkAndUpdateURL } from "../../src/lib/validateUrl";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Crawl
|
||||
|
@ -148,6 +137,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||
crawlerOptions,
|
||||
pageOptions,
|
||||
team_id,
|
||||
plan,
|
||||
createdAt: Date.now(),
|
||||
};
|
||||
|
||||
|
@ -163,7 +153,15 @@ export async function crawlController(req: Request, res: Response) {
|
|||
? null
|
||||
: await crawler.tryGetSitemap();
|
||||
|
||||
|
||||
if (sitemap !== null && sitemap.length > 0) {
|
||||
let jobPriority = 20;
|
||||
// If it is over 1000, we need to get the job priority,
|
||||
// otherwise we can use the default priority of 20
|
||||
if(sitemap.length > 1000){
|
||||
// set base to 21
|
||||
jobPriority = await getJobPriority({plan, team_id, basePriority: 21})
|
||||
}
|
||||
const jobs = sitemap.map((x) => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
|
@ -181,7 +179,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 20,
|
||||
priority: jobPriority,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
@ -204,6 +202,10 @@ export async function crawlController(req: Request, res: Response) {
|
|||
}
|
||||
} else {
|
||||
await lockURL(id, sc, url);
|
||||
|
||||
// Not needed, first one should be 15.
|
||||
// const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
|
||||
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url,
|
|
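The crawl controller hunk above only calls getJobPriority when a sitemap has more than 1000 URLs; smaller sitemaps keep the default priority of 20. A hedged restatement of that decision (getJobPriority's internals are not part of this diff):

import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";

async function priorityForSitemapJobs(
  sitemapLength: number,
  plan: PlanType,
  team_id: string
): Promise<number> {
  if (sitemapLength > 1000) {
    // Large sitemaps get a plan-aware priority with a base of 21
    return await getJobPriority({ plan, team_id, basePriority: 21 });
  }
  return 20; // default priority used for smaller sitemaps
}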
@ -1,17 +1,17 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
|
||||
import { addScrapeJob } from "../../src/services/queue-jobs";
|
||||
import { checkAndUpdateURL } from "../../src/lib/validateUrl";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function crawlPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
const { success, error, status } = await authenticateUser(
|
||||
const { success, error, status, team_id:a, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Preview
|
||||
|
@ -89,6 +89,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||
crawlerOptions,
|
||||
pageOptions,
|
||||
team_id,
|
||||
plan,
|
||||
robots,
|
||||
createdAt: Date.now(),
|
||||
};
|
|
@ -1,8 +1,8 @@
|
|||
|
||||
import { AuthResponse, RateLimiterMode } from "../types";
|
||||
import { AuthResponse, RateLimiterMode } from "../../types";
|
||||
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
|
||||
|
||||
export const keyAuthController = async (req: Request, res: Response) => {
|
295
apps/api/src/controllers/v0/scrape.ts
Normal file
|
@ -0,0 +1,295 @@
|
|||
import { ExtractorOptions, PageOptions } from "./../../lib/entities";
|
||||
import { Request, Response } from "express";
|
||||
import {
|
||||
billTeam,
|
||||
checkTeamCredits,
|
||||
} from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { PlanType, RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { Document } from "../../lib/entities";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||
import {
|
||||
defaultPageOptions,
|
||||
defaultExtractorOptions,
|
||||
defaultTimeout,
|
||||
defaultOrigin,
|
||||
} from "../../lib/default-values";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions: ExtractorOptions,
|
||||
timeout: number,
|
||||
plan?: PlanType
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
data?: Document;
|
||||
returnCode: number;
|
||||
}> {
|
||||
const url = req.body.url;
|
||||
if (typeof url !== "string") {
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
returnCode: 403,
|
||||
};
|
||||
}
|
||||
|
||||
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
|
||||
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
team_id,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
is_scrape: true,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
jobPriority
|
||||
);
|
||||
|
||||
let doc;
|
||||
|
||||
const err = await Sentry.startSpan(
|
||||
{
|
||||
name: "Wait for job to finish",
|
||||
op: "bullmq.wait",
|
||||
attributes: { job: jobId },
|
||||
},
|
||||
async (span) => {
|
||||
try {
|
||||
doc = (await waitForJob(job.id, timeout))[0];
|
||||
} catch (e) {
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
span.setAttribute("timedOut", true);
|
||||
return {
|
||||
success: false,
|
||||
error: "Request timed out",
|
||||
returnCode: 408,
|
||||
};
|
||||
} else if (
|
||||
typeof e === "string" &&
|
||||
(e.includes("Error generating completions: ") ||
|
||||
e.includes("Invalid schema for function") ||
|
||||
e.includes(
|
||||
"LLM extraction did not match the extraction schema you provided."
|
||||
))
|
||||
) {
|
||||
return {
|
||||
success: false,
|
||||
error: e,
|
||||
returnCode: 500,
|
||||
};
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
span.setAttribute("result", JSON.stringify(doc));
|
||||
return null;
|
||||
}
|
||||
);
|
||||
|
||||
if (err !== null) {
|
||||
return err;
|
||||
}
|
||||
|
||||
await job.remove();
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return {
|
||||
success: true,
|
||||
error: "No page found",
|
||||
returnCode: 200,
|
||||
data: doc,
|
||||
};
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||
if (
|
||||
!pageOptions.includeRawHtml &&
|
||||
extractorOptions.mode == "llm-extraction-from-raw-html"
|
||||
) {
|
||||
if (doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if (!pageOptions.includeHtml) {
|
||||
if (doc.html) {
|
||||
delete doc.html;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: doc,
|
||||
returnCode: 200,
|
||||
};
|
||||
}
|
||||
|
||||
export async function scrapeController(req: Request, res: Response) {
|
||||
try {
|
||||
let earlyReturn = false;
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Scrape
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
const extractorOptions = {
|
||||
...defaultExtractorOptions,
|
||||
...req.body.extractorOptions,
|
||||
};
|
||||
const origin = req.body.origin ?? defaultOrigin;
|
||||
let timeout = req.body.timeout ?? defaultTimeout;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
if (
|
||||
typeof extractorOptions.extractionSchema !== "object" ||
|
||||
extractorOptions.extractionSchema === null
|
||||
) {
|
||||
return res.status(400).json({
|
||||
error:
|
||||
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
|
||||
});
|
||||
}
|
||||
|
||||
pageOptions.onlyMainContent = true;
|
||||
timeout = req.body.timeout ?? 90000;
|
||||
}
|
||||
|
||||
// checkCredits
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
earlyReturn = true;
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
earlyReturn = true;
|
||||
return res.status(500).json({
|
||||
error:
|
||||
"Error checking team credits. Please contact hello@firecrawl.com for help.",
|
||||
});
|
||||
}
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const result = await scrapeHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout,
|
||||
plan
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens =
|
||||
result.data && result.data.markdown
|
||||
? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
|
||||
: 0;
|
||||
|
||||
if (result.success) {
|
||||
let creditsToBeBilled = 1;
|
||||
const creditsPerLLMExtract = 4;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
creditsToBeBilled += creditsPerLLMExtract;
|
||||
}
|
||||
|
||||
let startTimeBilling = new Date().getTime();
|
||||
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
if (creditsToBeBilled > 0) {
|
||||
// billing for doc done on queue end, bill only for llm extraction
|
||||
billTeam(team_id, creditsToBeBilled).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let doc = result.data;
|
||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
||||
if (doc && doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if(pageOptions && pageOptions.includeExtract) {
|
||||
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
|
||||
delete doc.markdown;
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: 1,
|
||||
docs: [doc],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({
|
||||
error:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error?.message ?? "Internal Server Error",
|
||||
});
|
||||
}
|
||||
}
|
|
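The rewritten v0 scrape controller above delegates waiting on the BullMQ job to waitForJob from services/queue-jobs, replacing the inline setInterval loop the deleted controller used. A hedged sketch of what such a helper can look like, reconstructed from that removed loop (the real implementation may differ):

import { getScrapeQueue } from "../../services/queue-service";

export async function waitForJobSketch(jobId: string, timeout: number): Promise<any> {
  return new Promise((resolve, reject) => {
    const start = Date.now();
    const int = setInterval(async () => {
      if (Date.now() >= start + timeout) {
        clearInterval(int);
        reject(new Error("Job wait ")); // callers match on the "Job wait" prefix to return a 408
      } else {
        const job = await getScrapeQueue().getJob(jobId);
        const state = await job.getState();
        if (state === "completed") {
          clearInterval(int);
          resolve(job.returnvalue);
        } else if (state === "failed") {
          clearInterval(int);
          reject(job.failedReason);
        }
      }
    }, 1000); // poll the queue once per second
  });
}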
@ -1,17 +1,18 @@
|
|||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../lib/entities";
|
||||
import { search } from "../search";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { WebScraperDataProvider } from "../../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { PlanType, RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../../lib/entities";
|
||||
import { search } from "../../search";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { getScrapeQueue } from "../services/queue-service";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { addScrapeJob } from "../services/queue-jobs";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function searchHelper(
|
||||
jobId: string,
|
||||
|
@ -20,6 +21,7 @@ export async function searchHelper(
|
|||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
searchOptions: SearchOptions,
|
||||
plan: PlanType
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
|
@ -52,18 +54,10 @@ export async function searchHelper(
|
|||
|
||||
|
||||
if (justSearch) {
|
||||
const billingResult = await billTeam(
|
||||
team_id,
|
||||
res.length
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
returnCode: 402,
|
||||
};
|
||||
}
|
||||
billTeam(team_id, res.length).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
return { success: true, data: res, returnCode: 200 };
|
||||
}
|
||||
|
||||
|
@ -76,6 +70,8 @@ export async function searchHelper(
|
|||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
}
|
||||
|
||||
const jobPriority = await getJobPriority({plan, team_id, basePriority: 20});
|
||||
|
||||
// filter out social media links
|
||||
|
||||
const jobDatas = res.map(x => {
|
||||
|
@ -92,7 +88,7 @@ export async function searchHelper(
|
|||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 20,
|
||||
priority: jobPriority,
|
||||
}
|
||||
};
|
||||
})
|
||||
|
@ -108,24 +104,7 @@ export async function searchHelper(
|
|||
await getScrapeQueue().addBulk(jobs);
|
||||
}
|
||||
|
||||
const docs = (await Promise.all(jobs.map(x => new Promise((resolve, reject) => {
|
||||
const start = Date.now();
|
||||
const int = setInterval(async () => {
|
||||
if (Date.now() >= start + 60000) {
|
||||
clearInterval(int);
|
||||
reject(new Error("Job wait "));
|
||||
} else {
|
||||
const state = await x.getState();
|
||||
if (state === "completed") {
|
||||
clearInterval(int);
|
||||
resolve((await getScrapeQueue().getJob(x.id)).returnvalue);
|
||||
} else if (state === "failed") {
|
||||
clearInterval(int);
|
||||
reject((await getScrapeQueue().getJob(x.id)).failedReason);
|
||||
}
|
||||
}
|
||||
}, 1000);
|
||||
})))).map(x => x[0]);
|
||||
const docs = (await Promise.all(jobs.map(x => waitForJob(x.id, 60000)))).map(x => x[0]);
|
||||
|
||||
if (docs.length === 0) {
|
||||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
|
@ -152,7 +131,7 @@ export async function searchHelper(
|
|||
export async function searchController(req: Request, res: Response) {
|
||||
try {
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Search
|
||||
|
@ -162,17 +141,16 @@ export async function searchController(req: Request, res: Response) {
|
|||
}
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? {
|
||||
includeHtml: false,
|
||||
onlyMainContent: true,
|
||||
fetchPageContent: true,
|
||||
removeTags: [],
|
||||
fallback: false,
|
||||
includeHtml: req.body.pageOptions?.includeHtml ?? false,
|
||||
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
|
||||
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
|
||||
removeTags: req.body.pageOptions?.removeTags ?? [],
|
||||
fallback: req.body.pageOptions?.fallback ?? false,
|
||||
};
|
||||
const origin = req.body.origin ?? "api";
|
||||
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 5 };
|
||||
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
try {
|
||||
|
@ -194,6 +172,7 @@ export async function searchController(req: Request, res: Response) {
|
|||
crawlerOptions,
|
||||
pageOptions,
|
||||
searchOptions,
|
||||
plan
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
|
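Both the scrape and search controllers above switch from awaiting billTeam (and failing the request with a 402 on billing errors) to a fire-and-forget call that only logs failures. That shared pattern, restated as a hedged sketch:

import { billTeam } from "../../services/billing/credit_billing";
import { Logger } from "../../lib/logger";

function billTeamNonBlocking(team_id: string, credits: number): void {
  // The response is no longer delayed or rejected by billing; failures are logged
  // (and could be pushed to a retry queue, as the inline comment in the diff suggests).
  billTeam(team_id, credits).catch((error) => {
    Logger.error(`Failed to bill team ${team_id} for ${credits} credits: ${error}`);
  });
}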
@ -1,6 +1,6 @@
|
|||
import { Request, Response } from "express";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { getJobs } from "./crawl-status";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
|
@ -22,7 +22,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
|||
// }
|
||||
// }
|
||||
|
||||
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
|
||||
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
|
||||
|
47
apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP
Normal file
|
@ -0,0 +1,47 @@
|
|||
import { crawlController } from '../crawl'
|
||||
import { Request, Response } from 'express';
|
||||
import { authenticateUser } from '../auth'; // Ensure this import is correct
|
||||
import { createIdempotencyKey } from '../../services/idempotency/create';
|
||||
import { validateIdempotencyKey } from '../../services/idempotency/validate';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
|
||||
jest.mock('../auth', () => ({
|
||||
authenticateUser: jest.fn().mockResolvedValue({
|
||||
success: true,
|
||||
team_id: 'team123',
|
||||
error: null,
|
||||
status: 200
|
||||
}),
|
||||
reduce: jest.fn()
|
||||
}));
|
||||
jest.mock('../../services/idempotency/validate');
|
||||
|
||||
describe('crawlController', () => {
|
||||
it('should prevent duplicate requests using the same idempotency key', async () => {
|
||||
const req = {
|
||||
headers: {
|
||||
'x-idempotency-key': await uuidv4(),
|
||||
'Authorization': `Bearer ${process.env.TEST_API_KEY}`
|
||||
},
|
||||
body: {
|
||||
url: 'https://mendable.ai'
|
||||
}
|
||||
} as unknown as Request;
|
||||
const res = {
|
||||
status: jest.fn().mockReturnThis(),
|
||||
json: jest.fn()
|
||||
} as unknown as Response;
|
||||
|
||||
// Mock the idempotency key validation to return false for the second call
|
||||
(validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false);
|
||||
|
||||
// First request should succeed
|
||||
await crawlController(req, res);
|
||||
expect(res.status).not.toHaveBeenCalledWith(409);
|
||||
|
||||
// Second request with the same key should fail
|
||||
await crawlController(req, res);
|
||||
expect(res.status).toHaveBeenCalledWith(409);
|
||||
expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' });
|
||||
});
|
||||
});
|
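The WIP test above expects a second request with the same x-idempotency-key to be rejected with a 409. A hedged sketch of the controller-side flow it is testing, using the validate/create helpers imported by the crawl controller (the exact signatures are an assumption):

import { Request, Response } from "express";
import { validateIdempotencyKey } from "../../services/idempotency/validate";
import { createIdempotencyKey } from "../../services/idempotency/create";

async function enforceIdempotency(req: Request, res: Response): Promise<boolean> {
  if (req.headers["x-idempotency-key"]) {
    const isKeyUnused = await validateIdempotencyKey(req);
    if (!isKeyUnused) {
      res.status(409).json({ error: "Idempotency key already used" });
      return false; // caller should stop handling the request
    }
    await createIdempotencyKey(req);
  }
  return true;
}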
64
apps/api/src/controllers/v1/__tests__/urlValidation.test.ts
Normal file
|
@ -0,0 +1,64 @@
|
|||
import { url } from "../types";
|
||||
|
||||
describe("URL Schema Validation", () => {
|
||||
beforeEach(() => {
|
||||
jest.resetAllMocks();
|
||||
});
|
||||
|
||||
it("should prepend http:// to URLs without a protocol", () => {
|
||||
const result = url.parse("example.com");
|
||||
expect(result).toBe("http://example.com");
|
||||
});
|
||||
|
||||
it("should allow valid URLs with http or https", () => {
|
||||
expect(() => url.parse("http://example.com")).not.toThrow();
|
||||
expect(() => url.parse("https://example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should allow valid URLs with http or https", () => {
|
||||
expect(() => url.parse("example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should reject URLs with unsupported protocols", () => {
|
||||
expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL");
|
||||
});
|
||||
|
||||
it("should reject URLs without a valid top-level domain", () => {
|
||||
expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path");
|
||||
});
|
||||
|
||||
it("should reject blocked URLs", () => {
|
||||
expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
});
|
||||
|
||||
it("should handle URLs with subdomains correctly", () => {
|
||||
expect(() => url.parse("http://sub.example.com")).not.toThrow();
|
||||
expect(() => url.parse("https://blog.example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should handle URLs with paths correctly", () => {
|
||||
expect(() => url.parse("http://example.com/path")).not.toThrow();
|
||||
expect(() => url.parse("https://example.com/another/path")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should handle URLs with subdomains that are blocked", () => {
|
||||
expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
});
|
||||
|
||||
it("should handle URLs with paths that are blocked", () => {
|
||||
expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
});
|
||||
|
||||
it("should reject malformed URLs starting with 'http://http'", () => {
|
||||
expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. Invalid protocol.");
|
||||
});
|
||||
|
||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||
expect(() => url.parse("http://example.com/http://example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||
expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
|
||||
});
|
||||
})
|
|
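Taken together, the assertions above pin down the behavior of the url schema from ../types: URLs without a protocol are normalized to http://, only http/https are accepted, the social-media blocklist is applied, and malformed hosts are rejected. A brief usage sketch based only on those tests:

import { url } from "../types";

const normalized = url.parse("example.com"); // -> "http://example.com"

try {
  url.parse("ftp://example.com"); // throws; the error message contains "Invalid URL"
} catch (e) {
  console.error((e as Error).message);
}

try {
  url.parse("https://facebook.com/some/page"); // throws the social-media blocklist message
} catch (e) {
  console.error((e as Error).message);
}

console.log(normalized);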
@ -1,10 +1,12 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../src/lib/crawl-redis";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { supabase_service } from "../../services/supabase";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
162
apps/api/src/controllers/v1/crawl-status-ws.ts
Normal file
|
@ -0,0 +1,162 @@
|
|||
import { authMiddleware } from "../../routes/v1";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { WebSocket } from "ws";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { getJob, getJobs } from "./crawl-status";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
type ErrorMessage = {
|
||||
type: "error",
|
||||
error: string,
|
||||
}
|
||||
|
||||
type CatchupMessage = {
|
||||
type: "catchup",
|
||||
data: CrawlStatusResponse,
|
||||
}
|
||||
|
||||
type DocumentMessage = {
|
||||
type: "document",
|
||||
data: Document,
|
||||
}
|
||||
|
||||
type DoneMessage = { type: "done" }
|
||||
|
||||
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
|
||||
|
||||
function send(ws: WebSocket, msg: Message) {
|
||||
if (ws.readyState === 1) {
|
||||
return new Promise((resolve, reject) => {
|
||||
ws.send(JSON.stringify(msg), (err) => {
|
||||
if (err) reject(err);
|
||||
else resolve(null);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function close(ws: WebSocket, code: number, msg: Message) {
|
||||
if (ws.readyState <= 1) {
|
||||
ws.close(code, JSON.stringify(msg));
|
||||
}
|
||||
}
|
||||
|
||||
async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return close(ws, 1008, { type: "error", error: "Job not found" });
|
||||
}
|
||||
|
||||
if (sc.team_id !== req.auth.team_id) {
|
||||
return close(ws, 3003, { type: "error", error: "Forbidden" });
|
||||
}
|
||||
|
||||
let doneJobIDs = [];
|
||||
let finished = false;
|
||||
|
||||
const loop = async () => {
|
||||
if (finished) return;
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
|
||||
if (jobIDs.length === doneJobIDs.length) {
|
||||
return close(ws, 1000, { type: "done" });
|
||||
}
|
||||
|
||||
const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
|
||||
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
|
||||
const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
|
||||
|
||||
for (const jobID of newlyDoneJobIDs) {
|
||||
const job = await getJob(jobID);
|
||||
|
||||
if (job.returnvalue) {
|
||||
send(ws, {
|
||||
type: "document",
|
||||
data: legacyDocumentConverter(job.returnvalue),
|
||||
})
|
||||
} else {
|
||||
return close(ws, 3000, { type: "error", error: job.failedReason });
|
||||
}
|
||||
}
|
||||
|
||||
doneJobIDs.push(...newlyDoneJobIDs);
|
||||
|
||||
setTimeout(loop, 1000);
|
||||
};
|
||||
|
||||
setTimeout(loop, 1000);
|
||||
|
||||
doneJobIDs = await getDoneJobsOrdered(req.params.jobId);
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
|
||||
const doneJobs = await getJobs(doneJobIDs);
|
||||
const data = doneJobs.map(x => x.returnvalue);
|
||||
|
||||
send(ws, {
|
||||
type: "catchup",
|
||||
data: {
|
||||
success: true,
|
||||
status,
|
||||
total: jobIDs.length,
|
||||
completed: doneJobIDs.length,
|
||||
creditsUsed: jobIDs.length,
|
||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||
data: data.map(x => legacyDocumentConverter(x)),
|
||||
}
|
||||
});
|
||||
|
||||
if (status !== "scraping") {
|
||||
finished = true;
|
||||
return close(ws, 1000, { type: "done" });
|
||||
}
|
||||
}
|
||||
|
||||
// Basically just middleware and error wrapping
|
||||
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
||||
try {
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
null,
|
||||
RateLimiterMode.CrawlStatus,
|
||||
);
|
||||
|
||||
if (!success) {
|
||||
return close(ws, 3000, {
|
||||
type: "error",
|
||||
error,
|
||||
});
|
||||
}
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
|
||||
await crawlStatusWS(ws, req);
|
||||
} catch (err) {
|
||||
Sentry.captureException(err);
|
||||
|
||||
const id = uuidv4();
|
||||
let verbose = JSON.stringify(err);
|
||||
if (verbose === "{}") {
|
||||
if (err instanceof Error) {
|
||||
verbose = JSON.stringify({
|
||||
message: err.message,
|
||||
name: err.name,
|
||||
stack: err.stack,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
return close(ws, 1011, {
|
||||
type: "error",
|
||||
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
|
||||
});
|
||||
}
|
||||
}
|
130
apps/api/src/controllers/v1/crawl-status.ts
Normal file
@@ -0,0 +1,130 @@
|
|||
import { Response } from "express";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function getJob(id: string) {
|
||||
const job = await getScrapeQueue().getJob(id);
|
||||
if (!job) return job;
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobById(id);
|
||||
|
||||
if (supabaseData) {
|
||||
job.returnvalue = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
export async function getJobs(ids: string[]) {
|
||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobsById(ids);
|
||||
|
||||
supabaseData.forEach(x => {
|
||||
const job = jobs.find(y => y.id === x.job_id);
|
||||
if (job) {
|
||||
job.returnvalue = x.docs;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
jobs.forEach(job => {
|
||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
||||
});
|
||||
|
||||
return jobs;
|
||||
}
|
||||
|
||||
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ success: false, error: "Job not found" });
|
||||
}
|
||||
|
||||
if (sc.team_id !== req.auth.team_id) {
|
||||
return res.status(403).json({ success: false, error: "Forbidden" });
|
||||
}
|
||||
|
||||
const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
|
||||
const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined;
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
|
||||
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
|
||||
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
|
||||
|
||||
let doneJobs = [];
|
||||
|
||||
if (end === undefined) { // determine 10 megabyte limit
|
||||
let bytes = 0;
|
||||
const bytesLimit = 10485760; // 10 MiB in bytes
|
||||
const factor = 100; // chunking for faster retrieval
|
||||
|
||||
for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) {
|
||||
// get current chunk and retrieve jobs
|
||||
const currentIDs = doneJobsOrder.slice(i, i+factor);
|
||||
const jobs = await getJobs(currentIDs);
|
||||
|
||||
// iterate through jobs and add them one by one to the byte counter
|
||||
// both loops will break once we cross the byte counter
|
||||
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
|
||||
const job = jobs[ii];
|
||||
doneJobs.push(job);
|
||||
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
|
||||
}
|
||||
}
|
||||
|
||||
// if we ran over the bytes limit, remove the last document
|
||||
if (bytes > bytesLimit) {
|
||||
doneJobs.splice(doneJobs.length - 1, 1);
|
||||
}
|
||||
} else {
|
||||
doneJobs = await getJobs(doneJobsOrder);
|
||||
}
|
||||
|
||||
const data = doneJobs.map(x => x.returnvalue);
|
||||
|
||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
|
||||
|
||||
nextURL.searchParams.set("skip", (start + data.length).toString());
|
||||
|
||||
if (typeof req.query.limit === "string") {
|
||||
nextURL.searchParams.set("limit", req.query.limit);
|
||||
}
|
||||
|
||||
if (data.length > 0) {
|
||||
if (!doneJobs[0].data.pageOptions.includeRawHtml) {
|
||||
for (let ii = 0; ii < doneJobs.length; ii++) {
|
||||
if (data[ii]) {
|
||||
delete data[ii].rawHtml;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
res.status(200).json({
|
||||
success: true,
|
||||
status,
|
||||
completed: doneJobsLength,
|
||||
total: jobIDs.length,
|
||||
creditsUsed: jobIDs.length,
|
||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||
next:
|
||||
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
|
||||
? undefined
|
||||
: nextURL.href,
|
||||
data: data.map(x => legacyDocumentConverter(x)),
|
||||
});
|
||||
}
|
||||
|
167
apps/api/src/controllers/v1/crawl.ts
Normal file
@@ -0,0 +1,167 @@
|
|||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import {
|
||||
CrawlRequest,
|
||||
crawlRequestSchema,
|
||||
CrawlResponse,
|
||||
legacyCrawlerOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import {
|
||||
addCrawlJob,
|
||||
addCrawlJobs,
|
||||
crawlToCrawler,
|
||||
lockURL,
|
||||
lockURLs,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
} from "../../lib/crawl-redis";
|
||||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob } from "../../services/queue-jobs";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
import { callWebhook } from "../../services/webhook";
|
||||
|
||||
export async function crawlController(
|
||||
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
||||
res: Response<CrawlResponse>
|
||||
) {
|
||||
req.body = crawlRequestSchema.parse(req.body);
|
||||
|
||||
const id = uuidv4();
|
||||
|
||||
await logCrawl(id, req.auth.team_id);
|
||||
|
||||
const { remainingCredits } = req.account;
|
||||
|
||||
const crawlerOptions = legacyCrawlerOptions(req.body);
|
||||
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
|
||||
|
||||
// TODO: @rafa, is this right? copied from v0
|
||||
if (Array.isArray(crawlerOptions.includes)) {
|
||||
for (const x of crawlerOptions.includes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ success: false, error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(crawlerOptions.excludes)) {
|
||||
for (const x of crawlerOptions.excludes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ success: false, error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
};
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
try {
|
||||
sc.robots = await crawler.getRobotsTxt();
|
||||
} catch (e) {
|
||||
Logger.debug(
|
||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||
e
|
||||
)}`
|
||||
);
|
||||
}
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||
? null
|
||||
: await crawler.tryGetSitemap();
|
||||
|
||||
if (sitemap !== null && sitemap.length > 0) {
|
||||
let jobPriority = 20;
|
||||
// If it is over 1000, we need to get the job priority,
|
||||
// otherwise we can use the default priority of 20
|
||||
if(sitemap.length > 1000){
|
||||
// set base to 21
|
||||
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
|
||||
}
|
||||
const jobs = sitemap.map((x) => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
team_id: req.auth.team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
webhook: req.body.webhook,
|
||||
v1: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 20,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
await lockURLs(
|
||||
id,
|
||||
jobs.map((x) => x.data.url)
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId)
|
||||
);
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
} else {
|
||||
await lockURL(id, sc, req.body.url);
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: req.auth.team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
webhook: req.body.webhook,
|
||||
v1: true,
|
||||
},
|
||||
{
|
||||
priority: 15,
|
||||
}
|
||||
);
|
||||
await addCrawlJob(id, job.id);
|
||||
}
|
||||
|
||||
if(req.body.webhook) {
|
||||
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started");
|
||||
}
|
||||
|
||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
id,
|
||||
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
|
||||
});
|
||||
}
|
||||
|
||||
|
6
apps/api/src/controllers/v1/liveness.ts
Normal file
@@ -0,0 +1,6 @@
import { Request, Response } from "express";

export async function livenessController(req: Request, res: Response) {
  // TODO: add checks to verify the application is live and healthy, e.g. by checking the Redis connection
  res.status(200).json({ status: "ok" });
}
142
apps/api/src/controllers/v1/map.ts
Normal file
@@ -0,0 +1,142 @@
|
|||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import {
|
||||
legacyCrawlerOptions,
|
||||
mapRequestSchema,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
||||
import { MapResponse, MapRequest } from "./types";
|
||||
import { configDotenv } from "dotenv";
|
||||
import {
|
||||
checkAndUpdateURLForMap,
|
||||
isSameDomain,
|
||||
isSameSubdomain,
|
||||
removeDuplicateUrls,
|
||||
} from "../../lib/validateUrl";
|
||||
import { fireEngineMap } from "../../search/fireEngine";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { performCosineSimilarity } from "../../lib/map-cosine";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
configDotenv();
|
||||
|
||||
export async function mapController(
|
||||
req: RequestWithAuth<{}, MapResponse, MapRequest>,
|
||||
res: Response<MapResponse>
|
||||
) {
|
||||
const startTime = new Date().getTime();
|
||||
|
||||
req.body = mapRequestSchema.parse(req.body);
|
||||
|
||||
|
||||
const limit : number = req.body.limit ?? 5000;
|
||||
|
||||
const id = uuidv4();
|
||||
let links: string[] = [req.body.url];
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions: legacyCrawlerOptions(req.body),
|
||||
pageOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
};
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();
|
||||
|
||||
if (sitemap !== null) {
|
||||
sitemap.map((x) => {
|
||||
links.push(x.url);
|
||||
});
|
||||
}
|
||||
|
||||
let urlWithoutWww = req.body.url.replace("www.", "");
|
||||
|
||||
let mapUrl = req.body.search
|
||||
? `"${req.body.search}" site:${urlWithoutWww}`
|
||||
: `site:${req.body.url}`;
|
||||
// www. seems to exclude subdomains in some cases
|
||||
const mapResults = await fireEngineMap(mapUrl, {
|
||||
// limit to 100 results (beta)
|
||||
numResults: Math.min(limit, 100),
|
||||
});
|
||||
|
||||
if (mapResults.length > 0) {
|
||||
if (req.body.search) {
|
||||
// Ensure all map results are first, maintaining their order
|
||||
links = [
|
||||
mapResults[0].url,
|
||||
...mapResults.slice(1).map((x) => x.url),
|
||||
...links,
|
||||
];
|
||||
} else {
|
||||
mapResults.map((x) => {
|
||||
links.push(x.url);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Perform cosine similarity between the search query and the list of links
|
||||
if (req.body.search) {
|
||||
const searchQuery = req.body.search.toLowerCase();
|
||||
|
||||
links = performCosineSimilarity(links, searchQuery);
|
||||
}
|
||||
|
||||
links = links.map((x) => {
|
||||
try {
|
||||
return checkAndUpdateURLForMap(x).url.trim()
|
||||
} catch (_) {
|
||||
return null;
|
||||
}
|
||||
}).filter(x => x !== null);
|
||||
|
||||
// allows for subdomains to be included
|
||||
links = links.filter((x) => isSameDomain(x, req.body.url));
|
||||
|
||||
// if includeSubdomains is false, filter out subdomains
|
||||
if (!req.body.includeSubdomains) {
|
||||
links = links.filter((x) => isSameSubdomain(x, req.body.url));
|
||||
}
|
||||
|
||||
// remove duplicates that could be due to http/https or www
|
||||
links = removeDuplicateUrls(links);
|
||||
|
||||
billTeam(req.auth.team_id, 1).catch(error => {
|
||||
Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
|
||||
const linksToReturn = links.slice(0, limit);
|
||||
|
||||
logJob({
|
||||
job_id: id,
|
||||
success: links.length > 0,
|
||||
message: "Map completed",
|
||||
num_docs: linksToReturn.length,
|
||||
docs: linksToReturn,
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: req.auth.team_id,
|
||||
mode: "map",
|
||||
url: req.body.url,
|
||||
crawlerOptions: {},
|
||||
pageOptions: {},
|
||||
origin: req.body.origin,
|
||||
extractor_options: { mode: "markdown" },
|
||||
num_tokens: 0,
|
||||
});
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
links: linksToReturn,
|
||||
scrape_id: req.body.origin?.includes("website") ? id : undefined,
|
||||
});
|
||||
}
|
6
apps/api/src/controllers/v1/readiness.ts
Normal file
@@ -0,0 +1,6 @@
import { Request, Response } from "express";

export async function readinessController(req: Request, res: Response) {
  // TODO: add checks when the application is ready to serve traffic
  res.status(200).json({ status: "ok" });
}
38
apps/api/src/controllers/v1/scrape-status.ts
Normal file
@@ -0,0 +1,38 @@
import { Response } from "express";
import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs";
import { scrapeStatusRateLimiter } from "../../services/rate-limiter";

export async function scrapeStatusController(req: any, res: any) {
  try {
    const rateLimiter = scrapeStatusRateLimiter;
    const incomingIP = (req.headers["x-forwarded-for"] ||
      req.socket.remoteAddress) as string;
    const iptoken = incomingIP;
    await rateLimiter.consume(iptoken);

    const job = await supabaseGetJobByIdOnlyData(req.params.jobId);

    if (job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5") {
      return res.status(403).json({
        success: false,
        error: "You are not allowed to access this resource.",
      });
    }
    return res.status(200).json({
      success: true,
      data: job?.docs[0],
    });
  } catch (error) {
    if (error instanceof Error && error.message == "Too Many Requests") {
      return res.status(429).json({
        success: false,
        error: "Rate limit exceeded. Please try again later.",
      });
    } else {
      return res.status(500).json({
        success: false,
        error: "An unexpected error occurred.",
      });
    }
  }
}
148
apps/api/src/controllers/v1/scrape.ts
Normal file
@@ -0,0 +1,148 @@
|
|||
import { Request, Response } from "express";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import {
|
||||
Document,
|
||||
legacyDocumentConverter,
|
||||
legacyExtractorOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
ScrapeRequest,
|
||||
scrapeRequestSchema,
|
||||
ScrapeResponse,
|
||||
} from "./types";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
import { PlanType } from "../../types";
|
||||
|
||||
export async function scrapeController(
|
||||
req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
|
||||
res: Response<ScrapeResponse>
|
||||
) {
|
||||
req.body = scrapeRequestSchema.parse(req.body);
|
||||
let earlyReturn = false;
|
||||
|
||||
const origin = req.body.origin;
|
||||
const timeout = req.body.timeout;
|
||||
const pageOptions = legacyScrapeOptions(req.body);
|
||||
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const jobPriority = await getJobPriority({
|
||||
plan: req.auth.plan as PlanType,
|
||||
team_id: req.auth.team_id,
|
||||
basePriority: 10,
|
||||
});
|
||||
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
origin: req.body.origin,
|
||||
is_scrape: true,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
jobPriority
|
||||
);
|
||||
|
||||
let doc: any | undefined;
|
||||
try {
|
||||
doc = (await waitForJob(job.id, timeout))[0];
|
||||
} catch (e) {
|
||||
Logger.error(`Error in scrapeController: ${e}`);
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
return res.status(408).json({
|
||||
success: false,
|
||||
error: "Request timed out",
|
||||
});
|
||||
} else {
|
||||
return res.status(500).json({
|
||||
success: false,
|
||||
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
|
||||
extractorOptions && extractorOptions.mode !== "markdown"
|
||||
? " - Could be due to LLM parsing issues"
|
||||
: ""
|
||||
}`,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
await job.remove();
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
warning: "No page found",
|
||||
data: doc,
|
||||
});
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens =
|
||||
doc && doc.markdown
|
||||
? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
|
||||
: 0;
|
||||
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
if(req.body.extract && req.body.formats.includes("extract")) {
|
||||
creditsToBeBilled = 5;
|
||||
}
|
||||
|
||||
billTeam(req.auth.team_id, creditsToBeBilled).catch(error => {
|
||||
Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
|
||||
if (!pageOptions || !pageOptions.includeRawHtml) {
|
||||
if (doc && doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if(pageOptions && pageOptions.includeExtract) {
|
||||
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
|
||||
delete doc.markdown;
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: true,
|
||||
message: "Scrape completed",
|
||||
num_docs: 1,
|
||||
docs: [doc],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: req.auth.team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: {},
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: { mode: "markdown" },
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
data: legacyDocumentConverter(doc),
|
||||
scrape_id: origin?.includes("website") ? jobId : undefined,
|
||||
});
|
||||
}
|
380
apps/api/src/controllers/v1/types.ts
Normal file
@@ -0,0 +1,380 @@
|
|||
import { Request, Response } from "express";
|
||||
import { z } from "zod";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { ExtractorOptions, PageOptions } from "../../lib/entities";
|
||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||
import { PlanType } from "../../types";
|
||||
|
||||
export type Format =
|
||||
| "markdown"
|
||||
| "html"
|
||||
| "rawHtml"
|
||||
| "links"
|
||||
| "screenshot"
|
||||
| "screenshot@fullPage"
|
||||
| "extract";
|
||||
|
||||
export const url = z.preprocess(
|
||||
(x) => {
|
||||
if (!protocolIncluded(x as string)) {
|
||||
return `http://${x}`;
|
||||
}
|
||||
return x;
|
||||
},
|
||||
z
|
||||
.string()
|
||||
.url()
|
||||
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
||||
.refine(
|
||||
(x) => /\.[a-z]{2,}(\/|$)/i.test(x),
|
||||
"URL must have a valid top-level domain or be a valid path"
|
||||
)
|
||||
.refine(
|
||||
(x) => {
|
||||
try {
|
||||
checkUrl(x as string)
|
||||
return true;
|
||||
} catch (_) {
|
||||
return false;
|
||||
}
|
||||
},
|
||||
"Invalid URL"
|
||||
)
|
||||
.refine(
|
||||
(x) => !isUrlBlocked(x as string),
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||
)
|
||||
);
|
||||
|
||||
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
|
||||
|
||||
export const extractOptions = z.object({
|
||||
mode: z.enum(["llm"]).default("llm"),
|
||||
schema: z.any().optional(),
|
||||
systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."),
|
||||
prompt: z.string().optional()
|
||||
}).strict(strictMessage);
|
||||
|
||||
export type ExtractOptions = z.infer<typeof extractOptions>;
|
||||
|
||||
export const scrapeOptions = z.object({
|
||||
formats: z
|
||||
.enum([
|
||||
"markdown",
|
||||
"html",
|
||||
"rawHtml",
|
||||
"links",
|
||||
"screenshot",
|
||||
"screenshot@fullPage",
|
||||
"extract"
|
||||
])
|
||||
.array()
|
||||
.optional()
|
||||
.default(["markdown"]),
|
||||
headers: z.record(z.string(), z.string()).optional(),
|
||||
includeTags: z.string().array().optional(),
|
||||
excludeTags: z.string().array().optional(),
|
||||
onlyMainContent: z.boolean().default(true),
|
||||
timeout: z.number().int().positive().finite().safe().default(30000),
|
||||
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
||||
extract: extractOptions.optional(),
|
||||
parsePDF: z.boolean().default(true),
|
||||
}).strict(strictMessage)
|
||||
|
||||
|
||||
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
||||
|
||||
export const scrapeRequestSchema = scrapeOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
}).strict(strictMessage).refine(
|
||||
(obj) => {
|
||||
const hasExtractFormat = obj.formats?.includes("extract");
|
||||
const hasExtractOptions = obj.extract !== undefined;
|
||||
return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
|
||||
},
|
||||
{
|
||||
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
|
||||
}
|
||||
).transform((obj) => {
|
||||
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
|
||||
return { ...obj, timeout: 60000 };
|
||||
}
|
||||
return obj;
|
||||
});
|
||||
|
||||
// export type ScrapeRequest = {
|
||||
// url: string;
|
||||
// formats?: Format[];
|
||||
// headers?: { [K: string]: string };
|
||||
// includeTags?: string[];
|
||||
// excludeTags?: string[];
|
||||
// onlyMainContent?: boolean;
|
||||
// timeout?: number;
|
||||
// waitFor?: number;
|
||||
// }
|
||||
|
||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||
|
||||
const crawlerOptions = z.object({
|
||||
includePaths: z.string().array().default([]),
|
||||
excludePaths: z.string().array().default([]),
|
||||
maxDepth: z.number().default(10), // default?
|
||||
limit: z.number().default(10000), // default?
|
||||
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
|
||||
allowExternalLinks: z.boolean().default(false),
|
||||
ignoreSitemap: z.boolean().default(true),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type CrawlerOptions = {
|
||||
// includePaths?: string[];
|
||||
// excludePaths?: string[];
|
||||
// maxDepth?: number;
|
||||
// limit?: number;
|
||||
// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
|
||||
// allowExternalLinks?: boolean;
|
||||
// ignoreSitemap?: boolean;
|
||||
// };
|
||||
|
||||
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
|
||||
|
||||
export const crawlRequestSchema = crawlerOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
|
||||
webhook: z.string().url().optional(),
|
||||
limit: z.number().default(10000),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type CrawlRequest = {
|
||||
// url: string;
|
||||
// crawlerOptions?: CrawlerOptions;
|
||||
// scrapeOptions?: Exclude<ScrapeRequest, "url">;
|
||||
// };
|
||||
|
||||
// export type ExtractorOptions = {
|
||||
// mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
|
||||
// extractionPrompt?: string;
|
||||
// extractionSchema?: Record<string, any>;
|
||||
// }
|
||||
|
||||
|
||||
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
|
||||
|
||||
export const mapRequestSchema = crawlerOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
includeSubdomains: z.boolean().default(true),
|
||||
search: z.string().optional(),
|
||||
ignoreSitemap: z.boolean().default(false),
|
||||
limit: z.number().min(1).max(5000).default(5000).optional(),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type MapRequest = {
|
||||
// url: string;
|
||||
// crawlerOptions?: CrawlerOptions;
|
||||
// };
|
||||
|
||||
export type MapRequest = z.infer<typeof mapRequestSchema>;
|
||||
|
||||
export type Document = {
|
||||
markdown?: string;
|
||||
extract?: string;
|
||||
html?: string;
|
||||
rawHtml?: string;
|
||||
links?: string[];
|
||||
screenshot?: string;
|
||||
metadata: {
|
||||
title?: string;
|
||||
description?: string;
|
||||
language?: string;
|
||||
keywords?: string;
|
||||
robots?: string;
|
||||
ogTitle?: string;
|
||||
ogDescription?: string;
|
||||
ogUrl?: string;
|
||||
ogImage?: string;
|
||||
ogAudio?: string;
|
||||
ogDeterminer?: string;
|
||||
ogLocale?: string;
|
||||
ogLocaleAlternate?: string[];
|
||||
ogSiteName?: string;
|
||||
ogVideo?: string;
|
||||
dcTermsCreated?: string;
|
||||
dcDateCreated?: string;
|
||||
dcDate?: string;
|
||||
dcTermsType?: string;
|
||||
dcType?: string;
|
||||
dcTermsAudience?: string;
|
||||
dcTermsSubject?: string;
|
||||
dcSubject?: string;
|
||||
dcDescription?: string;
|
||||
dcTermsKeywords?: string;
|
||||
modifiedTime?: string;
|
||||
publishedTime?: string;
|
||||
articleTag?: string;
|
||||
articleSection?: string;
|
||||
sourceURL?: string;
|
||||
statusCode?: number;
|
||||
error?: string;
|
||||
};
|
||||
};
|
||||
|
||||
export type ErrorResponse = {
|
||||
success: false;
|
||||
error: string;
|
||||
details?: any;
|
||||
};
|
||||
|
||||
export type ScrapeResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
warning?: string;
|
||||
data: Document;
|
||||
scrape_id?: string;
|
||||
};
|
||||
|
||||
export interface ScrapeResponseRequestTest {
|
||||
statusCode: number;
|
||||
body: ScrapeResponse;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
export type CrawlResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
id: string;
|
||||
url: string;
|
||||
};
|
||||
|
||||
export type MapResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
links: string[];
|
||||
scrape_id?: string;
|
||||
};
|
||||
|
||||
export type CrawlStatusParams = {
|
||||
jobId: string;
|
||||
};
|
||||
|
||||
export type CrawlStatusResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
status: "scraping" | "completed" | "failed" | "cancelled";
|
||||
completed: number;
|
||||
total: number;
|
||||
creditsUsed: number;
|
||||
expiresAt: string;
|
||||
next?: string;
|
||||
data: Document[];
|
||||
};
|
||||
|
||||
type AuthObject = {
|
||||
team_id: string;
|
||||
plan: PlanType;
|
||||
};
|
||||
|
||||
type Account = {
|
||||
remainingCredits: number;
|
||||
};
|
||||
|
||||
export interface RequestWithMaybeAuth<
|
||||
ReqParams = {},
|
||||
ReqBody = undefined,
|
||||
ResBody = undefined
|
||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||
auth?: AuthObject;
|
||||
account?: Account;
|
||||
}
|
||||
|
||||
export interface RequestWithAuth<
|
||||
ReqParams = {},
|
||||
ReqBody = undefined,
|
||||
ResBody = undefined,
|
||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||
auth: AuthObject;
|
||||
account?: Account;
|
||||
}
|
||||
|
||||
export interface ResponseWithSentry<
|
||||
ResBody = undefined,
|
||||
> extends Response<ResBody> {
|
||||
sentry?: string,
|
||||
}
|
||||
|
||||
export function legacyCrawlerOptions(x: CrawlerOptions) {
|
||||
return {
|
||||
includes: x.includePaths,
|
||||
excludes: x.excludePaths,
|
||||
maxCrawledLinks: x.limit,
|
||||
maxDepth: x.maxDepth,
|
||||
limit: x.limit,
|
||||
generateImgAltText: false,
|
||||
allowBackwardCrawling: x.allowBackwardLinks,
|
||||
allowExternalContentLinks: x.allowExternalLinks,
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
||||
return {
|
||||
includeMarkdown: x.formats.includes("markdown"),
|
||||
includeHtml: x.formats.includes("html"),
|
||||
includeRawHtml: x.formats.includes("rawHtml"),
|
||||
includeExtract: x.formats.includes("extract"),
|
||||
onlyIncludeTags: x.includeTags,
|
||||
removeTags: x.excludeTags,
|
||||
onlyMainContent: x.onlyMainContent,
|
||||
waitFor: x.waitFor,
|
||||
headers: x.headers,
|
||||
includeLinks: x.formats.includes("links"),
|
||||
screenshot: x.formats.includes("screenshot"),
|
||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||
parsePDF: x.parsePDF,
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
|
||||
return {
|
||||
mode: x.mode ? "llm-extraction" : "markdown",
|
||||
extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
|
||||
extractionSchema: x.schema,
|
||||
userPrompt: x.prompt ?? "",
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyDocumentConverter(doc: any): Document {
|
||||
if (doc === null || doc === undefined) return null;
|
||||
|
||||
if (doc.metadata) {
|
||||
if (doc.metadata.screenshot) {
|
||||
doc.screenshot = doc.metadata.screenshot;
|
||||
delete doc.metadata.screenshot;
|
||||
}
|
||||
|
||||
if (doc.metadata.fullPageScreenshot) {
|
||||
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
|
||||
delete doc.metadata.fullPageScreenshot;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
markdown: doc.markdown,
|
||||
links: doc.linksOnPage,
|
||||
rawHtml: doc.rawHtml,
|
||||
html: doc.html,
|
||||
extract: doc.llm_extraction,
|
||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
||||
metadata: {
|
||||
...doc.metadata,
|
||||
pageError: undefined,
|
||||
pageStatusCode: undefined,
|
||||
error: doc.metadata.pageError,
|
||||
statusCode: doc.metadata.pageStatusCode,
|
||||
},
|
||||
};
|
||||
}
|
|
@@ -1,7 +1,7 @@
|
|||
import "dotenv/config";
|
||||
import "./services/sentry"
|
||||
import * as Sentry from "@sentry/node";
|
||||
import express from "express";
|
||||
import express, { NextFunction, Request, Response } from "express";
|
||||
import bodyParser from "body-parser";
|
||||
import cors from "cors";
|
||||
import { getScrapeQueue } from "./services/queue-service";
|
||||
|
@@ -15,8 +15,12 @@ import { ScrapeEvents } from "./lib/scrape-events";
|
|||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import CacheableLookup from 'cacheable-lookup';
|
||||
|
||||
|
||||
import { v1Router } from "./routes/v1";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
|
||||
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||
import { ZodError } from "zod";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
|
@@ -49,7 +53,8 @@ if (cluster.isMaster) {
|
|||
}
|
||||
});
|
||||
} else {
|
||||
const app = express();
|
||||
const ws = expressWs(express());
|
||||
const app = ws.app;
|
||||
|
||||
global.isProduction = process.env.IS_PRODUCTION === "true";
|
||||
|
||||
|
@@ -82,6 +87,7 @@ if (cluster.isMaster) {
|
|||
|
||||
// register router
|
||||
app.use(v0Router);
|
||||
app.use("/v1", v1Router);
|
||||
app.use(adminRouter);
|
||||
|
||||
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
|
@@ -184,11 +190,42 @@ if (cluster.isMaster) {
|
|||
res.send({ isProduction: global.isProduction });
|
||||
});
|
||||
|
||||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
|
||||
if (err instanceof ZodError) {
|
||||
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
|
||||
} else {
|
||||
next(err);
|
||||
}
|
||||
});
|
||||
|
||||
Sentry.setupExpressErrorHandler(app);
|
||||
|
||||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry<ErrorResponse>, next: NextFunction) => {
|
||||
if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) {
|
||||
return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' });
|
||||
}
|
||||
|
||||
const id = res.sentry ?? uuidv4();
|
||||
let verbose = JSON.stringify(err);
|
||||
if (verbose === "{}") {
|
||||
if (err instanceof Error) {
|
||||
verbose = JSON.stringify({
|
||||
message: err.message,
|
||||
name: err.name,
|
||||
stack: err.stack,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
|
||||
});
|
||||
|
||||
Logger.info(`Worker ${process.pid} started`);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// const sq = getScrapeQueue();
|
||||
|
||||
// sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
|
|
|
@@ -15,7 +15,8 @@ export async function generateCompletions(
|
|||
// const schema = zodToJsonSchema(options.schema)
|
||||
|
||||
const schema = extractionOptions.extractionSchema;
|
||||
const prompt = extractionOptions.extractionPrompt;
|
||||
const systemPrompt = extractionOptions.extractionPrompt;
|
||||
const prompt = extractionOptions.userPrompt;
|
||||
|
||||
const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider
|
||||
|
||||
|
@@ -30,18 +31,23 @@ export async function generateCompletions(
|
|||
document: document,
|
||||
schema: schema,
|
||||
prompt: prompt,
|
||||
systemPrompt: systemPrompt,
|
||||
mode: mode,
|
||||
});
|
||||
// Validate the JSON output against the schema using AJV
|
||||
if (schema) {
|
||||
const validate = ajv.compile(schema);
|
||||
if (!validate(completionResult.llm_extraction)) {
|
||||
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
|
||||
throw new Error(
|
||||
`JSON parsing error(s): ${validate.errors
|
||||
?.map((err) => err.message)
|
||||
.join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
|
||||
.join(
|
||||
", "
|
||||
)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return completionResult;
|
||||
} catch (error) {
|
||||
|
|
|
@@ -16,7 +16,6 @@ function prepareOpenAIDoc(
|
|||
document: Document,
|
||||
mode: "markdown" | "raw-html"
|
||||
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
|
||||
|
||||
let markdown = document.markdown;
|
||||
|
||||
let extractionTarget = document.markdown;
|
||||
|
@@ -33,34 +32,32 @@ function prepareOpenAIDoc(
|
|||
// );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// count number of tokens
|
||||
const numTokens = numTokensFromString(extractionTarget, "gpt-4");
|
||||
|
||||
if (numTokens > maxTokens) {
|
||||
// trim the document to the maximum number of tokens, tokens != characters
|
||||
extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
|
||||
extractionTarget = extractionTarget.slice(0, maxTokens * modifier);
|
||||
}
|
||||
|
||||
return [[{ type: "text", text: extractionTarget }], numTokens];
|
||||
}
|
||||
|
||||
export async function generateOpenAICompletions({
|
||||
client,
|
||||
model = process.env.MODEL_NAME || "gpt-4o",
|
||||
model = process.env.MODEL_NAME || "gpt-4o-mini",
|
||||
document,
|
||||
schema, //TODO - add zod dynamic type checking
|
||||
prompt = defaultPrompt,
|
||||
systemPrompt = defaultPrompt,
|
||||
prompt,
|
||||
temperature,
|
||||
mode
|
||||
mode,
|
||||
}: {
|
||||
client: OpenAI;
|
||||
model?: string;
|
||||
document: Document;
|
||||
schema: any; // This should be replaced with a proper Zod schema type when available
|
||||
prompt?: string;
|
||||
systemPrompt?: string;
|
||||
temperature?: number;
|
||||
mode: "markdown" | "raw-html";
|
||||
}): Promise<Document> {
|
||||
|
@@ -70,18 +67,46 @@ export async function generateOpenAICompletions({
|
|||
if (preparedDoc === null) {
|
||||
return {
|
||||
...document,
|
||||
warning: "LLM extraction was not performed since the document's content is empty or missing.",
|
||||
warning:
|
||||
"LLM extraction was not performed since the document's content is empty or missing.",
|
||||
};
|
||||
}
|
||||
|
||||
const [content, numTokens] = preparedDoc;
|
||||
|
||||
const completion = await openai.chat.completions.create({
|
||||
let completion;
|
||||
let llmExtraction;
|
||||
if (prompt && !schema) {
|
||||
const jsonCompletion = await openai.chat.completions.create({
|
||||
model,
|
||||
messages: [
|
||||
{
|
||||
role: "system",
|
||||
content: prompt,
|
||||
content: systemPrompt,
|
||||
},
|
||||
{ role: "user", content },
|
||||
{
|
||||
role: "user",
|
||||
content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
|
||||
},
|
||||
],
|
||||
response_format: { type: "json_object" },
|
||||
temperature,
|
||||
});
|
||||
|
||||
try {
|
||||
llmExtraction = JSON.parse(
|
||||
jsonCompletion.choices[0].message.content.trim()
|
||||
);
|
||||
} catch (e) {
|
||||
throw new Error("Invalid JSON");
|
||||
}
|
||||
} else {
|
||||
completion = await openai.chat.completions.create({
|
||||
model,
|
||||
messages: [
|
||||
{
|
||||
role: "system",
|
||||
content: systemPrompt,
|
||||
},
|
||||
{ role: "user", content },
|
||||
],
|
||||
|
@@ -95,20 +120,26 @@ export async function generateOpenAICompletions({
|
|||
},
|
||||
},
|
||||
],
|
||||
tool_choice: { "type": "function", "function": {"name": "extract_content"}},
|
||||
tool_choice: { type: "function", function: { name: "extract_content" } },
|
||||
temperature,
|
||||
});
|
||||
|
||||
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
||||
|
||||
// Extract the LLM extraction content from the completion response
|
||||
const llmExtraction = JSON.parse(c);
|
||||
try {
|
||||
llmExtraction = JSON.parse(c);
|
||||
} catch (e) {
|
||||
throw new Error("Invalid JSON");
|
||||
}
|
||||
}
|
||||
|
||||
// Return the document with the LLM extraction content added
|
||||
return {
|
||||
...document,
|
||||
llm_extraction: llmExtraction,
|
||||
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attempted: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
|
||||
warning:
|
||||
numTokens > maxTokens
|
||||
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attempted: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
|
||||
: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
40
apps/api/src/lib/__tests__/html-to-markdown.test.ts
Normal file
@@ -0,0 +1,40 @@
import { parseMarkdown } from '../html-to-markdown';

describe('parseMarkdown', () => {
  it('should correctly convert simple HTML to Markdown', async () => {
    const html = '<p>Hello, world!</p>';
    const expectedMarkdown = 'Hello, world!';
    await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
  });

  it('should convert complex HTML with nested elements to Markdown', async () => {
    const html = '<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>';
    const expectedMarkdown = 'Hello **bold** world!\n\n- List item';
    await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
  });

  it('should return empty string when input is empty', async () => {
    const html = '';
    const expectedMarkdown = '';
    await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
  });

  it('should handle null input gracefully', async () => {
    const html = null;
    const expectedMarkdown = '';
    await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
  });

  it('should handle various types of invalid HTML gracefully', async () => {
    const invalidHtmls = [
      { html: '<html><p>Unclosed tag', expected: 'Unclosed tag' },
      { html: '<div><span>Missing closing div', expected: 'Missing closing div' },
      { html: '<p><strong>Wrong nesting</em></strong></p>', expected: '**Wrong nesting**' },
      { html: '<a href="http://example.com">Link without closing tag', expected: '[Link without closing tag](http://example.com)' }
    ];

    for (const { html, expected } of invalidHtmls) {
      await expect(parseMarkdown(html)).resolves.toBe(expected);
    }
  });
});
134
apps/api/src/lib/__tests__/job-priority.test.ts
Normal file
@@ -0,0 +1,134 @@
|
|||
import {
|
||||
getJobPriority,
|
||||
addJobPriority,
|
||||
deleteJobPriority,
|
||||
} from "../job-priority";
|
||||
import { redisConnection } from "../../services/queue-service";
|
||||
import { PlanType } from "../../types";
|
||||
|
||||
jest.mock("../../services/queue-service", () => ({
|
||||
redisConnection: {
|
||||
sadd: jest.fn(),
|
||||
srem: jest.fn(),
|
||||
scard: jest.fn(),
|
||||
expire: jest.fn(),
|
||||
},
|
||||
}));
|
||||
|
||||
describe("Job Priority Tests", () => {
|
||||
afterEach(() => {
|
||||
jest.clearAllMocks();
|
||||
});
|
||||
|
||||
test("addJobPriority should add job_id to the set and set expiration", async () => {
|
||||
const team_id = "team1";
|
||||
const job_id = "job1";
|
||||
await addJobPriority(team_id, job_id);
|
||||
expect(redisConnection.sadd).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
job_id
|
||||
);
|
||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
60
|
||||
);
|
||||
});
|
||||
|
||||
test("deleteJobPriority should remove job_id from the set", async () => {
|
||||
const team_id = "team1";
|
||||
const job_id = "job1";
|
||||
await deleteJobPriority(team_id, job_id);
|
||||
expect(redisConnection.srem).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
job_id
|
||||
);
|
||||
});
|
||||
|
||||
test("getJobPriority should return correct priority based on plan and set length", async () => {
|
||||
const team_id = "team1";
|
||||
const plan: PlanType = "standard";
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(150);
|
||||
|
||||
const priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(10);
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(250);
|
||||
const priorityExceeded = await getJobPriority({ plan, team_id });
|
||||
expect(priorityExceeded).toBe(20); // basePriority + Math.ceil((250 - 200) * 0.4)
|
||||
});
|
||||
|
||||
test("getJobPriority should handle different plans correctly", async () => {
|
||||
const team_id = "team1";
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(50);
|
||||
let plan: PlanType = "hobby";
|
||||
let priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(10);
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(150);
|
||||
plan = "hobby";
|
||||
priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(25); // basePriority + Math.ceil((150 - 50) * 0.3)
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(25);
|
||||
plan = "free";
|
||||
priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(10);
|
||||
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(60);
|
||||
plan = "free";
|
||||
priority = await getJobPriority({ plan, team_id });
|
||||
expect(priority).toBe(28); // basePriority + Math.ceil((60 - 25) * 0.5)
|
||||
});
|
||||
|
||||
test("addJobPriority should reset expiration time when adding new job", async () => {
|
||||
const team_id = "team1";
|
||||
const job_id1 = "job1";
|
||||
const job_id2 = "job2";
|
||||
|
||||
await addJobPriority(team_id, job_id1);
|
||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
60
|
||||
);
|
||||
|
||||
// Clear the mock calls
|
||||
(redisConnection.expire as jest.Mock).mockClear();
|
||||
|
||||
// Add another job
|
||||
await addJobPriority(team_id, job_id2);
|
||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
60
|
||||
);
|
||||
});
|
||||
|
||||
test("Set should expire after 60 seconds", async () => {
|
||||
const team_id = "team1";
|
||||
const job_id = "job1";
|
||||
|
||||
jest.useFakeTimers();
|
||||
|
||||
await addJobPriority(team_id, job_id);
|
||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||
`limit_team_id:${team_id}`,
|
||||
60
|
||||
);
|
||||
|
||||
// Fast-forward time by 59 seconds
|
||||
jest.advanceTimersByTime(59000);
|
||||
|
||||
// The set should still exist
|
||||
expect(redisConnection.scard).not.toHaveBeenCalled();
|
||||
|
||||
// Fast-forward time by 2 more seconds (total 61 seconds)
|
||||
jest.advanceTimersByTime(2000);
|
||||
|
||||
// Check if the set has been removed (scard should return 0)
|
||||
(redisConnection.scard as jest.Mock).mockResolvedValue(0);
|
||||
const setSize = await redisConnection.scard(`limit_team_id:${team_id}`);
|
||||
expect(setSize).toBe(0);
|
||||
|
||||
jest.useRealTimers();
|
||||
});
|
||||
});
|
32
apps/api/src/lib/checkCredits.ts
Normal file
@@ -0,0 +1,32 @@
import { checkTeamCredits } from "../services/billing/credit_billing";
import { Logger } from "./logger";

type checkCreditsResponse = {
  status: number;
  error: string | null;
}

export const checkCredits = async (team_id: string): Promise<checkCreditsResponse> => {
  try {
    const {
      success: creditsCheckSuccess,
      message: creditsCheckMessage
    } = await checkTeamCredits(team_id, 1);
    if (!creditsCheckSuccess) {
      return {
        status: 402,
        error: "Insufficient credits"
      };
    }
  } catch (error) {
    Logger.error(error);
    return {
      status: 500,
      error: "Error checking team credits. Please contact hello@firecrawl.com for help."
    };
  }
  return {
    status: 200,
    error: null
  };
};
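A minimal usage sketch for `checkCredits` (not part of this diff; the controller wiring shown here is an assumption, only `checkCredits` and `RequestWithAuth` come from the files above): a route handler would call it before enqueueing work and surface the returned status and error directly.

```typescript
// Hedged sketch: how a v1 controller could gate a request on checkCredits.
// `checkCredits` and `RequestWithAuth` exist in this diff; the handler itself is illustrative.
import { Response } from "express";
import { checkCredits } from "../../lib/checkCredits";
import { RequestWithAuth } from "./types";

export async function exampleCreditGate(req: RequestWithAuth, res: Response) {
  const { status, error } = await checkCredits(req.auth.team_id);
  if (status !== 200) {
    // 402 (insufficient credits) or 500 (billing check failed)
    return res.status(status).json({ success: false, error });
  }
  // ...proceed to enqueue the scrape/crawl job here
  return res.status(200).json({ success: true });
}
```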
@@ -6,6 +6,7 @@ export type StoredCrawl = {
|
|||
crawlerOptions: any;
|
||||
pageOptions: any;
|
||||
team_id: string;
|
||||
plan: string;
|
||||
robots?: string;
|
||||
cancelled?: boolean;
|
||||
createdAt: number;
|
||||
|
@@ -26,6 +27,14 @@ export async function getCrawl(id: string): Promise<StoredCrawl | null> {
|
|||
return JSON.parse(x);
|
||||
}
|
||||
|
||||
export async function getCrawlExpiry(id: string): Promise<Date> {
|
||||
const d = new Date();
|
||||
const ttl = await redisConnection.pttl("crawl:" + id);
|
||||
d.setMilliseconds(d.getMilliseconds() + ttl);
|
||||
d.setMilliseconds(0);
|
||||
return d;
|
||||
}
|
||||
|
||||
export async function addCrawlJob(id: string, job_id: string) {
|
||||
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
|
||||
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
||||
|
@@ -38,13 +47,27 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
|
|||
|
||||
export async function addCrawlJobDone(id: string, job_id: string) {
|
||||
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
|
||||
await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id);
|
||||
await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
|
||||
await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
|
||||
}
|
||||
|
||||
export async function getDoneJobsOrderedLength(id: string): Promise<number> {
|
||||
return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
|
||||
}
|
||||
|
||||
export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
|
||||
return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end);
|
||||
}
|
||||
|
||||
export async function isCrawlFinished(id: string) {
|
||||
return (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs"));
|
||||
}
|
||||
|
||||
export async function isCrawlFinishedLocked(id: string) {
|
||||
return (await redisConnection.exists("crawl:" + id + ":finish"));
|
||||
}
|
||||
|
||||
export async function finishCrawl(id: string) {
|
||||
if (await isCrawlFinished(id)) {
|
||||
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
|
||||
|
|
|
@@ -19,3 +19,4 @@ export class CustomError extends Error {
|
|||
Object.setPrototypeOf(this, CustomError.prototype);
|
||||
}
|
||||
}
|
||||
|
||||
|
|

@ -11,6 +11,8 @@ export interface Progress {
}

export type PageOptions = {
  includeMarkdown?: boolean;
  includeExtract?: boolean;
  onlyMainContent?: boolean;
  includeHtml?: boolean;
  includeRawHtml?: boolean;

@ -24,8 +26,9 @@ export type PageOptions = {
  parsePDF?: boolean;
  removeTags?: string | string[];
  onlyIncludeTags?: string | string[];
  includeLinks?: boolean;
  useFastMode?: boolean; // beta
  disableJSDom?: boolean; // beta
  disableJsDom?: boolean; // beta
  atsv?: boolean; // beta
};

@ -33,6 +36,7 @@ export type ExtractorOptions = {
  mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
  extractionPrompt?: string;
  extractionSchema?: Record<string, any>;
  userPrompt?: string;
}

export type SearchOptions = {
7
apps/api/src/lib/go-html-to-md/README.md
Normal file

@ -0,0 +1,7 @@
To build the go-html-to-md library, run the following command:

```bash
cd apps/api/src/lib/go-html-to-md
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
chmod +x html-to-markdown.so
```
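The shared object built above is consumed from Node via koffi elsewhere in this diff. A minimal standalone sketch of that call, assuming the .so sits next to the calling file, looks like:

```typescript
import koffi from 'koffi';
import { join } from 'path';

// Load the c-shared library produced by `go build -buildmode=c-shared`.
const lib = koffi.load(join(__dirname, 'html-to-markdown.so')); // path assumed

// The Go side exports: func ConvertHTMLToMarkdown(html *C.char) *C.char
const convertHTMLToMarkdown = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);

console.log(convertHTMLToMarkdown('<h1>Hello</h1>')); // e.g. "# Hello"
```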
14
apps/api/src/lib/go-html-to-md/go.mod
Normal file

@ -0,0 +1,14 @@
module html-to-markdown.go

go 1.19

require github.com/JohannesKaufmann/html-to-markdown v1.6.0

require (
	github.com/PuerkitoBio/goquery v1.9.2 // indirect
	github.com/andybalholm/cascadia v1.3.2 // indirect
	github.com/kr/pretty v0.3.0 // indirect
	golang.org/x/net v0.25.0 // indirect
	gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
	gopkg.in/yaml.v2 v2.4.0 // indirect
)
93
apps/api/src/lib/go-html-to-md/go.sum
Normal file
93
apps/api/src/lib/go-html-to-md/go.sum
Normal file
|
@ -0,0 +1,93 @@
|
|||
github.com/JohannesKaufmann/html-to-markdown v1.6.0 h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k=
|
||||
github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ=
|
||||
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
|
||||
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
|
||||
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
|
||||
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
|
||||
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
||||
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
|
||||
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=
|
||||
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
|
||||
github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
|
||||
github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
|
||||
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
|
||||
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
|
||||
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U=
|
||||
github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
|
||||
golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
|
||||
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
|
||||
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
||||
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
|
||||
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
|
||||
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
|
||||
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
|
||||
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
|
||||
golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk=
|
||||
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
|
||||
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
|
||||
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
|
25
apps/api/src/lib/go-html-to-md/html-to-markdown.go
Normal file

@ -0,0 +1,25 @@
package main

import (
	"C"
	"log"

	md "github.com/JohannesKaufmann/html-to-markdown"
	"github.com/JohannesKaufmann/html-to-markdown/plugin"
)

//export ConvertHTMLToMarkdown
func ConvertHTMLToMarkdown(html *C.char) *C.char {
	converter := md.NewConverter("", true, nil)
	converter.Use(plugin.GitHubFlavored())

	markdown, err := converter.ConvertString(C.GoString(html))
	if err != nil {
		log.Fatal(err)
	}
	return C.CString(markdown)
}

func main() {
	// This function is required for the main package
}

@ -1,8 +1,68 @@
export async function parseMarkdown(html: string) {
import koffi from 'koffi';
import { join } from 'path';
import "../services/sentry"
import * as Sentry from "@sentry/node";

import dotenv from 'dotenv';
import { Logger } from './logger';
dotenv.config();

// TODO: add a timeout to the Go parser

class GoMarkdownConverter {
  private static instance: GoMarkdownConverter;
  private convert: any;

  private constructor() {
    const goExecutablePath = join(__dirname, 'go-html-to-md/html-to-markdown.so');
    const lib = koffi.load(goExecutablePath);
    this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
  }

  public static getInstance(): GoMarkdownConverter {
    if (!GoMarkdownConverter.instance) {
      GoMarkdownConverter.instance = new GoMarkdownConverter();
    }
    return GoMarkdownConverter.instance;
  }

  public async convertHTMLToMarkdown(html: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.convert.async(html, (err: Error, res: string) => {
        if (err) {
          reject(err);
        } else {
          resolve(res);
        }
      });
    });
  }
}

export async function parseMarkdown(html: string): Promise<string> {
  if (!html) {
    return '';
  }

  try {
    if (process.env.USE_GO_MARKDOWN_PARSER == "true") {
      const converter = GoMarkdownConverter.getInstance();
      let markdownContent = await converter.convertHTMLToMarkdown(html);

      markdownContent = processMultiLineLinks(markdownContent);
      markdownContent = removeSkipToContentLinks(markdownContent);
      Logger.info(`HTML to Markdown conversion using Go parser successful`);
      return markdownContent;
    }
  } catch (error) {
    Sentry.captureException(error);
    Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
  }

  // Fallback to TurndownService if Go parser fails or is not enabled
  var TurndownService = require("turndown");
  var turndownPluginGfm = require('joplin-turndown-plugin-gfm')

  var turndownPluginGfm = require('joplin-turndown-plugin-gfm');

  const turndownService = new TurndownService();
  turndownService.addRule("inlineLink", {

@ -21,29 +81,20 @@ export async function parseMarkdown(html: string) {
  });
  var gfm = turndownPluginGfm.gfm;
  turndownService.use(gfm);
  let markdownContent = "";
  const turndownPromise = new Promise<string>((resolve, reject) => {
    try {
      const result = turndownService.turndown(html);
      resolve(result);
    } catch (error) {
      reject("Error converting HTML to Markdown: " + error);
    }
  });

  const timeoutPromise = new Promise<string>((resolve, reject) => {
    const timeout = 5000; // Timeout in milliseconds
    setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout);
  });

  try {
    markdownContent = await Promise.race([turndownPromise, timeoutPromise]);
    let markdownContent = await turndownService.turndown(html);
    markdownContent = processMultiLineLinks(markdownContent);
    markdownContent = removeSkipToContentLinks(markdownContent);

    return markdownContent;
  } catch (error) {
    console.error(error);
    console.error("Error converting HTML to Markdown: ", error);
    return ""; // Optionally return an empty string or handle the error as needed
  }
}

// multiple line links
function processMultiLineLinks(markdownContent: string): string {
  let insideLinkContent = false;
  let newMarkdownContent = "";
  let linkOpenCount = 0;

@ -63,12 +114,14 @@ export async function parseMarkdown(html: string) {
      newMarkdownContent += char;
    }
  }
  markdownContent = newMarkdownContent;
  return newMarkdownContent;
}

function removeSkipToContentLinks(markdownContent: string): string {
  // Remove [Skip to Content](#page) and [Skip to content](#skip)
  markdownContent = markdownContent.replace(
  const newMarkdownContent = markdownContent.replace(
    /\[Skip to Content\]\(#[^\)]*\)/gi,
    ""
  );
  return markdownContent;
  return newMarkdownContent;
}
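A short usage sketch for the rewritten parseMarkdown. The import path and sample HTML are assumptions; the USE_GO_MARKDOWN_PARSER switch comes from the diff above:

```typescript
import { parseMarkdown } from "./lib/html-to-markdown"; // path assumed

async function demo() {
  // With USE_GO_MARKDOWN_PARSER=true the Go shared library is used;
  // otherwise Turndown remains the fallback converter.
  const markdown = await parseMarkdown(
    "<article><h1>Title</h1><p>Some <a href='/x'>link</a></p></article>"
  );
  console.log(markdown);
}

demo().catch(console.error);
```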
91
apps/api/src/lib/job-priority.ts
Normal file

@ -0,0 +1,91 @@
import { redisConnection } from "../../src/services/queue-service";
import { PlanType } from "../../src/types";
import { Logger } from "./logger";

const SET_KEY_PREFIX = "limit_team_id:";
export async function addJobPriority(team_id, job_id) {
  try {
    const setKey = SET_KEY_PREFIX + team_id;

    // Add scrape job id to the set
    await redisConnection.sadd(setKey, job_id);

    // This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
    await redisConnection.expire(setKey, 60);
  } catch (e) {
    Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
  }
}

export async function deleteJobPriority(team_id, job_id) {
  try {
    const setKey = SET_KEY_PREFIX + team_id;

    // remove job_id from the set
    await redisConnection.srem(setKey, job_id);
  } catch (e) {
    Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
  }
}

export async function getJobPriority({
  plan,
  team_id,
  basePriority = 10,
}: {
  plan: PlanType;
  team_id: string;
  basePriority?: number;
}): Promise<number> {
  try {
    const setKey = SET_KEY_PREFIX + team_id;

    // Get the length of the set
    const setLength = await redisConnection.scard(setKey);

    // Determine the priority based on the plan and set length
    let planModifier = 1;
    let bucketLimit = 0;

    switch (plan) {
      case "free":
        bucketLimit = 25;
        planModifier = 0.5;
        break;
      case "hobby":
        bucketLimit = 100;
        planModifier = 0.3;
        break;
      case "standard":
      case "standardnew":
        bucketLimit = 200;
        planModifier = 0.2;
        break;
      case "growth":
      case "growthdouble":
        bucketLimit = 400;
        planModifier = 0.1;
        break;

      default:
        bucketLimit = 25;
        planModifier = 1;
        break;
    }

    // if length set is smaller than set, just return base priority
    if (setLength <= bucketLimit) {
      return basePriority;
    } else {
      // If not, we keep base priority + planModifier
      return Math.ceil(
        basePriority + Math.ceil((setLength - bucketLimit) * planModifier)
      );
    }
  } catch (e) {
    Logger.error(
      `Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
    );
    return basePriority;
  }
}
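A hedged sketch of how these helpers could be used when enqueuing a scrape job. The BullMQ queue name, job payload, and import paths are assumptions for illustration; only the three helper functions come from the diff:

```typescript
import { Queue } from "bullmq";
import { addJobPriority, getJobPriority } from "./lib/job-priority"; // path assumed
import { PlanType } from "./types"; // path assumed

const scrapeQueue = new Queue("scrapeQueue"); // assumed queue name, default Redis connection

async function enqueueScrape(team_id: string, plan: PlanType, jobId: string, url: string) {
  // More concurrent jobs for a team yields a larger priority number,
  // which BullMQ treats as a lower priority.
  const priority = await getJobPriority({ plan, team_id, basePriority: 10 });
  await addJobPriority(team_id, jobId);
  await scrapeQueue.add("scrape", { url, team_id }, { jobId, priority });
  // deleteJobPriority(team_id, jobId) would be called by the worker once the job finishes.
}
```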

@ -1,3 +1,6 @@
import { configDotenv } from "dotenv";
configDotenv();

enum LogLevel {
  NONE = 'NONE', // No logs will be output.
  ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.

@ -25,7 +28,8 @@ export class Logger {
    const color = Logger.colors[level];
    console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);

    // if (process.env.USE_DB_AUTH) {
    // const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
    // if (useDbAuthentication) {
    // save to supabase? another place?
    // supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
    // }
46
apps/api/src/lib/map-cosine.ts
Normal file

@ -0,0 +1,46 @@
import { Logger } from "./logger";

export function performCosineSimilarity(links: string[], searchQuery: string) {
  try {
    // Function to calculate cosine similarity
    const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
      const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
      const magnitude1 = Math.sqrt(
        vec1.reduce((sum, val) => sum + val * val, 0)
      );
      const magnitude2 = Math.sqrt(
        vec2.reduce((sum, val) => sum + val * val, 0)
      );
      if (magnitude1 === 0 || magnitude2 === 0) return 0;
      return dotProduct / (magnitude1 * magnitude2);
    };

    // Function to convert text to vector
    const textToVector = (text: string): number[] => {
      const words = searchQuery.toLowerCase().split(/\W+/);
      return words.map((word) => {
        const count = (text.toLowerCase().match(new RegExp(word, "g")) || [])
          .length;
        return count / text.length;
      });
    };

    // Calculate similarity scores
    const similarityScores = links.map((link) => {
      const linkVector = textToVector(link);
      const searchVector = textToVector(searchQuery);
      return cosineSimilarity(linkVector, searchVector);
    });

    // Sort links based on similarity scores and print scores
    const a = links
      .map((link, index) => ({ link, score: similarityScores[index] }))
      .sort((a, b) => b.score - a.score);

    links = a.map((item) => item.link);
    return links;
  } catch (error) {
    Logger.error(`Error performing cosine similarity: ${error}`);
    return links;
  }
}
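A quick usage sketch for the new ranking helper; the sample links and query are invented for illustration:

```typescript
import { performCosineSimilarity } from "./lib/map-cosine"; // path assumed

const links = [
  "https://example.com/docs/getting-started",
  "https://example.com/blog/company-news",
  "https://example.com/docs/api-reference",
];

// Links most similar to the query are moved to the front of the list.
const ranked = performCosineSimilarity(links, "docs api");
console.log(ranked[0]); // likely "https://example.com/docs/api-reference"
```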

@ -2,6 +2,8 @@ import { Job } from "bullmq";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
import { Logger } from "./logger";
import { configDotenv } from "dotenv";
configDotenv();

export type ScrapeErrorEvent = {
  type: "error",

@ -36,7 +38,8 @@ export class ScrapeEvents {
  static async insert(jobId: string, content: ScrapeEvent) {
    if (jobId === "TEST") return null;

    if (process.env.USE_DB_AUTHENTICATION) {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
    if (useDbAuthentication) {
      try {
        const result = await supabase.from("scrape_events").insert({
          job_id: jobId,
@ -1,10 +1,17 @@
import { supabase_service } from "../services/supabase";
import { Logger } from "./logger";
import * as Sentry from "@sentry/node";

/**
 * Get a single firecrawl_job by ID
 * @param jobId ID of Job
 * @returns {any | null} Job
 */
export const supabaseGetJobById = async (jobId: string) => {
  const { data, error } = await supabase_service
    .from('firecrawl_jobs')
    .select('*')
    .eq('job_id', jobId)
    .from("firecrawl_jobs")
    .select("*")
    .eq("job_id", jobId)
    .single();

  if (error) {

@ -16,15 +23,22 @@ export const supabaseGetJobById = async (jobId: string) => {
  }

  return data;
}
};

/**
 * Get multiple firecrawl_jobs by ID. Use this if you're not requesting a lot (50+) of jobs at once.
 * @param jobIds IDs of Jobs
 * @returns {any[]} Jobs
 */
export const supabaseGetJobsById = async (jobIds: string[]) => {
  const { data, error } = await supabase_service
    .from('firecrawl_jobs')
    .select('*')
    .in('job_id', jobIds);
    .from("firecrawl_jobs")
    .select()
    .in("job_id", jobIds);

  if (error) {
    Logger.error(`Error in supabaseGetJobsById: ${error}`);
    Sentry.captureException(error);
    return [];
  }

@ -33,5 +47,47 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
  }

  return data;
};

/**
 * Get multiple firecrawl_jobs by crawl ID. Use this if you need a lot of jobs at once.
 * @param crawlId ID of crawl
 * @returns {any[]} Jobs
 */
export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
  const { data, error } = await supabase_service
    .from("firecrawl_jobs")
    .select()
    .eq("crawl_id", crawlId)

  if (error) {
    Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
    Sentry.captureException(error);
    return [];
  }

  if (!data) {
    return [];
  }

  return data;
};

export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
  const { data, error } = await supabase_service
    .from("firecrawl_jobs")
    .select("docs, team_id")
    .eq("job_id", jobId)
    .single();

  if (error) {
    return null;
  }

  if (!data) {
    return null;
  }

  return data;
};
159
apps/api/src/lib/validateUrl.test.ts
Normal file
159
apps/api/src/lib/validateUrl.test.ts
Normal file
|
@ -0,0 +1,159 @@
|
|||
import { isSameDomain, removeDuplicateUrls } from "./validateUrl";
|
||||
import { isSameSubdomain } from "./validateUrl";
|
||||
|
||||
describe("isSameDomain", () => {
|
||||
it("should return true for a subdomain", () => {
|
||||
const result = isSameDomain("http://sub.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return true for the same domain", () => {
|
||||
const result = isSameDomain("http://example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for different domains", () => {
|
||||
const result = isSameDomain("http://example.com", "http://another.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for a subdomain with different protocols", () => {
|
||||
const result = isSameDomain("https://sub.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for invalid URLs", () => {
|
||||
const result = isSameDomain("invalid-url", "http://example.com");
|
||||
expect(result).toBe(false);
|
||||
const result2 = isSameDomain("http://example.com", "invalid-url");
|
||||
expect(result2).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for a subdomain with www prefix", () => {
|
||||
const result = isSameDomain("http://www.sub.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return true for the same domain with www prefix", () => {
|
||||
const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
|
||||
|
||||
describe("isSameSubdomain", () => {
|
||||
it("should return false for a subdomain", () => {
|
||||
const result = isSameSubdomain("http://example.com", "http://docs.example.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for the same subdomain", () => {
|
||||
const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for different subdomains", () => {
|
||||
const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return false for different domains", () => {
|
||||
const result = isSameSubdomain("http://example.com", "http://another.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return false for invalid URLs", () => {
|
||||
const result = isSameSubdomain("invalid-url", "http://example.com");
|
||||
expect(result).toBe(false);
|
||||
const result2 = isSameSubdomain("http://example.com", "invalid-url");
|
||||
expect(result2).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for the same subdomain with different protocols", () => {
|
||||
const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return true for the same subdomain with www prefix", () => {
|
||||
const result = isSameSubdomain("http://www.docs.example.com", "http://docs.example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for a subdomain with www prefix and different subdomain", () => {
|
||||
const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("removeDuplicateUrls", () => {
|
||||
it("should remove duplicate URLs with different protocols", () => {
|
||||
const urls = [
|
||||
"http://example.com",
|
||||
"https://example.com",
|
||||
"http://www.example.com",
|
||||
"https://www.example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
|
||||
it("should keep URLs with different paths", () => {
|
||||
const urls = [
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page1?param=1",
|
||||
"https://example.com/page1#section1"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual([
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page1?param=1",
|
||||
"https://example.com/page1#section1"
|
||||
]);
|
||||
});
|
||||
|
||||
it("should prefer https over http", () => {
|
||||
const urls = [
|
||||
"http://example.com",
|
||||
"https://example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
|
||||
it("should prefer non-www over www", () => {
|
||||
const urls = [
|
||||
"https://www.example.com",
|
||||
"https://example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
|
||||
it("should handle empty input", () => {
|
||||
const urls: string[] = [];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual([]);
|
||||
});
|
||||
|
||||
it("should handle URLs with different cases", () => {
|
||||
const urls = [
|
||||
"https://EXAMPLE.com",
|
||||
"https://example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://EXAMPLE.com"]);
|
||||
});
|
||||
|
||||
it("should handle URLs with trailing slashes", () => {
|
||||
const urls = [
|
||||
"https://example.com",
|
||||
"https://example.com/"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
});
|
|
@ -1,9 +1,8 @@
|
|||
|
||||
const protocolIncluded = (url: string) => {
|
||||
export const protocolIncluded = (url: string) => {
|
||||
// if :// not in the start of the url assume http (maybe https?)
|
||||
// regex checks if :// appears before any .
|
||||
return(/^([^.:]+:\/\/)/.test(url));
|
||||
}
|
||||
return /^([^.:]+:\/\/)/.test(url);
|
||||
};
|
||||
|
||||
const getURLobj = (s: string) => {
|
||||
// URL fails if we dont include the protocol ie google.com
|
||||
|
@ -18,7 +17,6 @@ const getURLobj = (s: string) => {
|
|||
};
|
||||
|
||||
export const checkAndUpdateURL = (url: string) => {
|
||||
|
||||
if (!protocolIncluded(url)) {
|
||||
url = `http://${url}`;
|
||||
}
|
||||
|
@ -35,4 +33,138 @@ export const checkAndUpdateURL = (url: string) => {
|
|||
}
|
||||
|
||||
return { urlObj: typedUrlObj, url: url };
|
||||
};
|
||||
|
||||
export const checkUrl = (url: string) => {
|
||||
const { error, urlObj } = getURLobj(url);
|
||||
if (error) {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
const typedUrlObj = urlObj as URL;
|
||||
|
||||
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
if ((url.split(".")[0].match(/:/g) || []).length !== 1) {
|
||||
throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com
|
||||
}
|
||||
|
||||
return url;
|
||||
};
|
||||
|
||||
/**
|
||||
* Same domain check
|
||||
* It checks if the domain of the url is the same as the base url
|
||||
* It accounts true for subdomains and www.subdomains
|
||||
* @param url
|
||||
* @param baseUrl
|
||||
* @returns
|
||||
*/
|
||||
export function isSameDomain(url: string, baseUrl: string) {
|
||||
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
|
||||
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
|
||||
|
||||
if (error1 || error2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const typedUrlObj1 = urlObj1 as URL;
|
||||
const typedUrlObj2 = urlObj2 as URL;
|
||||
|
||||
const cleanHostname = (hostname: string) => {
|
||||
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
|
||||
};
|
||||
|
||||
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
|
||||
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
|
||||
|
||||
return domain1 === domain2;
|
||||
}
|
||||
|
||||
|
||||
export function isSameSubdomain(url: string, baseUrl: string) {
|
||||
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
|
||||
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
|
||||
|
||||
if (error1 || error2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const typedUrlObj1 = urlObj1 as URL;
|
||||
const typedUrlObj2 = urlObj2 as URL;
|
||||
|
||||
const cleanHostname = (hostname: string) => {
|
||||
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
|
||||
};
|
||||
|
||||
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
|
||||
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
|
||||
|
||||
const subdomain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(0, -2).join('.');
|
||||
const subdomain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(0, -2).join('.');
|
||||
|
||||
// Check if the domains are the same and the subdomains are the same
|
||||
return domain1 === domain2 && subdomain1 === subdomain2;
|
||||
}
|
||||
|
||||
|
||||
export const checkAndUpdateURLForMap = (url: string) => {
|
||||
if (!protocolIncluded(url)) {
|
||||
url = `http://${url}`;
|
||||
}
|
||||
// remove last slash if present
|
||||
if (url.endsWith("/")) {
|
||||
url = url.slice(0, -1);
|
||||
}
|
||||
|
||||
|
||||
const { error, urlObj } = getURLobj(url);
|
||||
if (error) {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
const typedUrlObj = urlObj as URL;
|
||||
|
||||
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
// remove any query params
|
||||
url = url.split("?")[0].trim();
|
||||
|
||||
return { urlObj: typedUrlObj, url: url };
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
export function removeDuplicateUrls(urls: string[]): string[] {
|
||||
const urlMap = new Map<string, string>();
|
||||
|
||||
for (const url of urls) {
|
||||
const parsedUrl = new URL(url);
|
||||
const protocol = parsedUrl.protocol;
|
||||
const hostname = parsedUrl.hostname.replace(/^www\./, '');
|
||||
const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
|
||||
|
||||
const key = `${hostname}${path}`;
|
||||
|
||||
if (!urlMap.has(key)) {
|
||||
urlMap.set(key, url);
|
||||
} else {
|
||||
const existingUrl = new URL(urlMap.get(key)!);
|
||||
const existingProtocol = existingUrl.protocol;
|
||||
|
||||
if (protocol === 'https:' && existingProtocol === 'http:') {
|
||||
urlMap.set(key, url);
|
||||
} else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) {
|
||||
urlMap.set(key, url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [...new Set(Array.from(urlMap.values()))];
|
||||
}
|
|
@ -1,5 +1,8 @@
|
|||
import { AuthResponse } from "../../src/types";
|
||||
import { Logger } from "./logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
let warningCount = 0;
|
||||
|
||||
|
@ -7,7 +10,8 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
|||
originalFunction: (...args: U) => Promise<T>
|
||||
) {
|
||||
return async function (...args: U): Promise<T> {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
if (warningCount < 5) {
|
||||
Logger.warn("You're bypassing authentication");
|
||||
warningCount++;
|
||||
|
@ -17,6 +21,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
|||
try {
|
||||
return await originalFunction(...args);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error in withAuth function: ${error}`);
|
||||
return { success: false, error: error.message } as T;
|
||||
}
|
||||
|
|
|
@ -12,6 +12,8 @@ import { Document } from "../lib/entities";
|
|||
import { supabase_service } from "../services/supabase";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function startWebScraperPipeline({
|
||||
job,
|
||||
|
@ -26,7 +28,12 @@ export async function startWebScraperPipeline({
|
|||
mode: job.data.mode,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
extractorOptions: job.data.extractorOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
pageOptions: {
|
||||
...job.data.pageOptions,
|
||||
...(job.data.crawl_id ? ({
|
||||
includeRawHtml: true,
|
||||
}): {}),
|
||||
},
|
||||
inProgress: (progress) => {
|
||||
Logger.debug(`🐂 Job in progress ${job.id}`);
|
||||
if (progress.currentDocument) {
|
||||
|
@ -49,6 +56,7 @@ export async function startWebScraperPipeline({
|
|||
team_id: job.data.team_id,
|
||||
bull_job_id: job.id.toString(),
|
||||
priority: job.opts.priority,
|
||||
is_scrape: job.data.is_scrape ?? false,
|
||||
})) as { success: boolean; message: string; docs: Document[] };
|
||||
}
|
||||
export async function runWebScraper({
|
||||
|
@ -63,6 +71,7 @@ export async function runWebScraper({
|
|||
team_id,
|
||||
bull_job_id,
|
||||
priority,
|
||||
is_scrape=false,
|
||||
}: RunWebScraperParams): Promise<RunWebScraperResult> {
|
||||
try {
|
||||
const provider = new WebScraperDataProvider();
|
||||
|
@ -110,17 +119,15 @@ export async function runWebScraper({
|
|||
})
|
||||
: docs;
|
||||
|
||||
const billingResult = await billTeam(team_id, filteredDocs.length);
|
||||
|
||||
if (!billingResult.success) {
|
||||
// throw new Error("Failed to bill team, no subscription was found");
|
||||
return {
|
||||
success: false,
|
||||
message: "Failed to bill team, no subscription was found",
|
||||
docs: [],
|
||||
};
|
||||
if(is_scrape === false) {
|
||||
billTeam(team_id, filteredDocs.length).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
// This is where the returnvalue from the job is set
|
||||
onSuccess(filteredDocs, mode);
|
||||
|
||||
|
@ -134,7 +141,8 @@ export async function runWebScraper({
|
|||
|
||||
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
|
||||
try {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
const { data, error } = await supabase_service
|
||||
.from("firecrawl_jobs")
|
||||
.update({ docs: result })
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import express from "express";
|
||||
import { redisHealthController } from "../controllers/admin/redis-health";
|
||||
import { redisHealthController } from "../controllers/v0/admin/redis-health";
|
||||
import {
|
||||
autoscalerController,
|
||||
checkQueuesController,
|
||||
cleanBefore24hCompleteJobsController,
|
||||
queuesController,
|
||||
} from "../controllers/admin/queue";
|
||||
} from "../controllers/v0/admin/queue";
|
||||
|
||||
export const adminRouter = express.Router();
|
||||
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
import express from "express";
|
||||
import { crawlController } from "../../src/controllers/crawl";
|
||||
import { crawlStatusController } from "../../src/controllers/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/scrape";
|
||||
import { crawlPreviewController } from "../../src/controllers/crawlPreview";
|
||||
import { crawlJobStatusPreviewController } from "../../src/controllers/status";
|
||||
import { searchController } from "../../src/controllers/search";
|
||||
import { crawlCancelController } from "../../src/controllers/crawl-cancel";
|
||||
import { keyAuthController } from "../../src/controllers/keyAuth";
|
||||
import { livenessController } from "../controllers/liveness";
|
||||
import { readinessController } from "../controllers/readiness";
|
||||
import { crawlController } from "../../src/controllers/v0/crawl";
|
||||
import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/v0/scrape";
|
||||
import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
|
||||
import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
|
||||
import { searchController } from "../../src/controllers/v0/search";
|
||||
import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
|
||||
import { keyAuthController } from "../../src/controllers/v0/keyAuth";
|
||||
import { livenessController } from "../controllers/v0/liveness";
|
||||
import { readinessController } from "../controllers/v0/readiness";
|
||||
|
||||
export const v0Router = express.Router();
|
||||
|
||||
|
|
164
apps/api/src/routes/v1.ts
Normal file
164
apps/api/src/routes/v1.ts
Normal file
|
@ -0,0 +1,164 @@
|
|||
import express, { NextFunction, Request, Response } from "express";
|
||||
import { crawlController } from "../controllers/v1/crawl";
|
||||
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/v1/scrape";
|
||||
import { crawlStatusController } from "../controllers/v1/crawl-status";
|
||||
import { mapController } from "../controllers/v1/map";
|
||||
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { authenticateUser } from "../controllers/auth";
|
||||
import { createIdempotencyKey } from "../services/idempotency/create";
|
||||
import { validateIdempotencyKey } from "../services/idempotency/validate";
|
||||
import { checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
||||
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
|
||||
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
|
||||
// import { searchController } from "../../src/controllers/v1/search";
|
||||
// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel";
|
||||
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
|
||||
// import { livenessController } from "../controllers/v1/liveness";
|
||||
// import { readinessController } from "../controllers/v1/readiness";
|
||||
|
||||
function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
|
||||
return (req, res, next) => {
|
||||
(async () => {
|
||||
if (!minimum && req.body) {
|
||||
minimum = (req.body as any)?.limit ?? 1;
|
||||
}
|
||||
const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
|
||||
if (!success) {
|
||||
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||
if (!res.headersSent) {
|
||||
return res.status(402).json({ success: false, error: "Insufficient credits" });
|
||||
}
|
||||
}
|
||||
req.account = { remainingCredits }
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
};
|
||||
}
|
||||
|
||||
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
||||
return (req, res, next) => {
|
||||
(async () => {
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
rateLimiterMode,
|
||||
);
|
||||
|
||||
if (!success) {
|
||||
if (!res.headersSent) {
|
||||
return res.status(status).json({ success: false, error });
|
||||
}
|
||||
}
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
}
|
||||
}
|
||||
|
||||
function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
|
||||
(async () => {
|
||||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
if (!res.headersSent) {
|
||||
return res.status(409).json({ success: false, error: "Idempotency key already used" });
|
||||
}
|
||||
}
|
||||
createIdempotencyKey(req);
|
||||
}
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
}
|
||||
|
||||
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
|
||||
if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
|
||||
if (!res.headersSent) {
|
||||
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
|
||||
}
|
||||
}
|
||||
next();
|
||||
}
|
||||
|
||||
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
|
||||
return (req, res, next) => {
|
||||
controller(req, res)
|
||||
.catch(err => next(err))
|
||||
}
|
||||
}
|
||||
|
||||
expressWs(express());
|
||||
|
||||
export const v1Router = express.Router();
|
||||
|
||||
v1Router.post(
|
||||
"/scrape",
|
||||
authMiddleware(RateLimiterMode.Scrape),
|
||||
checkCreditsMiddleware(1),
|
||||
blocklistMiddleware,
|
||||
wrap(scrapeController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/crawl",
|
||||
authMiddleware(RateLimiterMode.Crawl),
|
||||
checkCreditsMiddleware(),
|
||||
blocklistMiddleware,
|
||||
idempotencyMiddleware,
|
||||
wrap(crawlController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/map",
|
||||
authMiddleware(RateLimiterMode.Map),
|
||||
checkCreditsMiddleware(1),
|
||||
blocklistMiddleware,
|
||||
wrap(mapController)
|
||||
);
|
||||
|
||||
v1Router.get(
|
||||
"/crawl/:jobId",
|
||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||
wrap(crawlStatusController)
|
||||
);
|
||||
|
||||
v1Router.get(
|
||||
"/scrape/:jobId",
|
||||
wrap(scrapeStatusController)
|
||||
);
|
||||
|
||||
v1Router.ws(
|
||||
"/crawl/:jobId",
|
||||
crawlStatusWSController
|
||||
);
|
||||
|
||||
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
||||
|
||||
|
||||
v1Router.delete(
|
||||
"/crawl/:jobId",
|
||||
authMiddleware(RateLimiterMode.Crawl),
|
||||
crawlCancelController
|
||||
);
|
||||
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
||||
|
||||
// // Auth route for key based authentication
|
||||
// v1Router.get("/keyAuth", keyAuthController);
|
||||
|
||||
// // Search routes
|
||||
// v0Router.post("/search", searchController);
|
||||
|
||||
// Health/Probe routes
|
||||
// v1Router.get("/health/liveness", livenessController);
|
||||
// v1Router.get("/health/readiness", readinessController);
|
|
@ -34,4 +34,4 @@ it('should return a list of links on the firecrawl.ai page', async () => {
  expect(Array.isArray(result.linksOnPage)).toBe(true);
  expect(result.linksOnPage.length).toBeGreaterThan(0);
  expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
}, 10000);
}, 15000);


@ -309,6 +309,23 @@ export class WebCrawler {
    return null;
  }

  public extractLinksFromHTML(html: string, url: string) {
    let links: string[] = [];

    const $ = load(html);
    $("a").each((_, element) => {
      const href = $(element).attr("href");
      if (href) {
        const u = this.filterURL(href, url);
        if (u !== null) {
          links.push(u);
        }
      }
    });

    return links;
  }

  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
    if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
      return [];

@ -352,15 +369,7 @@ export class WebCrawler {
      links.push({ url, html: content, pageStatusCode, pageError });
    }

    $("a").each((_, element) => {
      const href = $(element).attr("href");
      if (href) {
        const u = this.filterURL(href, url);
        if (u !== null) {
          links.push({ url: u, html: content, pageStatusCode, pageError });
        }
      }
    });
    links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));

    if (this.visited.size === 1) {
      return links;
|
|
@ -294,28 +294,32 @@ export class WebScraperDataProvider {
|
|||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
}
|
||||
|
||||
if (this.pageOptions.includeMarkdown) {
|
||||
documents = this.applyPathReplacements(documents);
|
||||
}
|
||||
|
||||
if (!this.pageOptions.includeHtml) {
|
||||
for (let document of documents) {
|
||||
delete document.html;
|
||||
}
|
||||
}
|
||||
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
|
||||
const extractionMode = this.extractorOptions?.mode ?? "markdown";
|
||||
const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
|
||||
|
||||
if (
|
||||
(this.extractorOptions.mode === "llm-extraction" ||
|
||||
this.extractorOptions.mode === "llm-extraction-from-markdown") &&
|
||||
this.mode === "single_urls"
|
||||
extractionMode === "llm-extraction" ||
|
||||
extractionMode === "llm-extraction-from-markdown" ||
|
||||
extractionMode === "llm-extraction-from-raw-html"
|
||||
) {
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions,
|
||||
"markdown"
|
||||
completionMode
|
||||
);
|
||||
}
|
||||
if (
|
||||
this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
|
||||
this.mode === "single_urls"
|
||||
) {
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions,
|
||||
"raw-html"
|
||||
);
|
||||
}
|
||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
||||
}
|
||||
|
@ -347,6 +351,7 @@ export class WebScraperDataProvider {
|
|||
});
|
||||
return {
|
||||
content: content,
|
||||
markdown: content,
|
||||
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
|
@ -569,12 +574,24 @@ export class WebScraperDataProvider {
|
|||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||
this.generateImgAltText =
|
||||
options.crawlerOptions?.generateImgAltText ?? false;
|
||||
this.pageOptions = options.pageOptions ?? {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
replaceAllPathsWithAbsolutePaths: false,
|
||||
parsePDF: true,
|
||||
removeTags: [],
|
||||
this.pageOptions = {
|
||||
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
|
||||
includeHtml: options.pageOptions?.includeHtml ?? false,
|
||||
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
parsePDF: options.pageOptions?.parsePDF ?? true,
|
||||
onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
|
||||
removeTags: options.pageOptions?.removeTags ?? [],
|
||||
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
|
||||
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
|
||||
includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
|
||||
waitFor: options.pageOptions?.waitFor ?? undefined,
|
||||
headers: options.pageOptions?.headers ?? undefined,
|
||||
includeLinks: options.pageOptions?.includeLinks ?? true,
|
||||
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
|
||||
screenshot: options.pageOptions?.screenshot ?? false,
|
||||
useFastMode: options.pageOptions?.useFastMode ?? false,
|
||||
disableJsDom: options.pageOptions?.disableJsDom ?? false,
|
||||
atsv: options.pageOptions?.atsv ?? false
|
||||
};
|
||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||
this.replaceAllPathsWithAbsolutePaths =
|
||||
|
@ -599,6 +616,8 @@ export class WebScraperDataProvider {
|
|||
this.priority = options.priority;
|
||||
this.teamId = options.teamId ?? null;
|
||||
|
||||
|
||||
|
||||
// make sure all urls start with https://
|
||||
this.urls = this.urls.map((url) => {
|
||||
if (!url.trim().startsWith("http")) {
|
||||
|
|
|
@ -55,7 +55,7 @@ export async function scrapWithFireEngine({
  try {
    const reqParams = await generateRequestParams(url);
    let waitParam = reqParams["params"]?.wait ?? waitFor;
    let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
    let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
    let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
    let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
    let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
@ -69,15 +69,15 @@ export async function scrapWithFireEngine({

    let engine = engineParam; // do we want fireEngineOptions as first choice?

    Logger.info(
      `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
    );

    if (pageOptions?.useFastMode) {
      fireEngineOptionsParam.engine = "tlsclient";
      engine = "tlsclient";
    }

    Logger.info(
      `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
    );

    // atsv is only available for beta customers
    const betaCustomersString = process.env.BETA_CUSTOMERS;
    const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
@ -96,6 +96,7 @@ export async function scrapWithFireEngine({
    const _response = await Sentry.startSpan({
      name: "Call to fire-engine"
    }, async span => {
      return await axiosInstance.post(
        process.env.FIRE_ENGINE_BETA_URL + endpoint,
        {
@ -104,12 +105,13 @@ export async function scrapWithFireEngine({
          screenshot: screenshotParam,
          fullPageScreenshot: fullPageScreenshotParam,
          headers: headers,
          pageOptions: pageOptions,
          disableJsDom: pageOptions?.disableJsDom ?? false,
          priority,
          engine,
          instantReturn: true,
          ...fireEngineOptionsParam,
          atsv: pageOptions?.atsv ?? false,
          scrollXPaths: pageOptions?.scrollXPaths ?? [],
        },
        {
          headers: {
@ -125,7 +127,7 @@ export async function scrapWithFireEngine({

    let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
    while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
      await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
      await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
      checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
    }
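
Note: the hunks above switch the Fire-Engine call to instantReturn mode and then poll the job status every 250 ms instead of every second. The standalone helper below is a minimal sketch of that polling pattern only; the endpoint shape (`/scrape/{jobId}` returning a `processing` flag) is taken from the diff, while the helper name, the plain axios import, and the single timeout parameter are assumptions for illustration.

import axios from "axios";

// Hypothetical helper mirroring the loop above: fetch the job status, then
// re-check every 250 ms until the engine stops reporting "processing" or the
// overall timeout elapses.
async function pollFireEngineJob(baseUrl: string, jobId: string, timeoutMs: number): Promise<any> {
  const startTime = Date.now();
  let status = await axios.get(`${baseUrl}/scrape/${jobId}`);
  while (status.data.processing && Date.now() - startTime < timeoutMs) {
    await new Promise((resolve) => setTimeout(resolve, 250)); // wait 0.25 seconds, as in the diff
    status = await axios.get(`${baseUrl}/scrape/${jobId}`);
  }
  return status.data;
}
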
@ -23,12 +23,15 @@ import { clientSideError } from "../../strings";
|
|||
|
||||
dotenv.config();
|
||||
|
||||
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
|
||||
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||
|
||||
export const baseScrapers = [
|
||||
"fire-engine;chrome-cdp",
|
||||
"fire-engine",
|
||||
"scrapingBee",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
"scrapingBeeLoad",
|
||||
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useScrapingBee ? "scrapingBee" : undefined,
|
||||
useFireEngine ? undefined : "playwright",
|
||||
useScrapingBee ? "scrapingBeeLoad" : undefined,
|
||||
"fetch",
|
||||
].filter(Boolean);
|
||||
|
||||
|
@ -85,23 +88,23 @@ function getScrapingFallbackOrder(
|
|||
});
|
||||
|
||||
let defaultOrder = [
|
||||
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
|
||||
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
|
||||
"scrapingBee",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
"scrapingBeeLoad",
|
||||
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useScrapingBee ? "scrapingBee" : undefined,
|
||||
useScrapingBee ? "scrapingBeeLoad" : undefined,
|
||||
useFireEngine ? undefined : "playwright",
|
||||
"fetch",
|
||||
].filter(Boolean);
|
||||
|
||||
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
|
||||
defaultOrder = [
|
||||
"fire-engine",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
...defaultOrder.filter(
|
||||
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
|
||||
),
|
||||
].filter(Boolean);
|
||||
}
|
||||
// if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
|
||||
// defaultOrder = [
|
||||
// "fire-engine",
|
||||
// useFireEngine ? undefined : "playwright",
|
||||
// ...defaultOrder.filter(
|
||||
// (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
|
||||
// ),
|
||||
// ].filter(Boolean);
|
||||
// }
|
||||
|
||||
const filteredDefaultOrder = defaultOrder.filter(
|
||||
(scraper: (typeof baseScrapers)[number]) =>
|
||||
|
@ -122,22 +125,42 @@ function getScrapingFallbackOrder(
|
|||
export async function scrapSingleUrl(
|
||||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = {
|
||||
onlyMainContent: true,
|
||||
includeHtml: false,
|
||||
includeRawHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
headers: undefined,
|
||||
},
|
||||
extractorOptions: ExtractorOptions = {
|
||||
mode: "llm-extraction-from-markdown",
|
||||
},
|
||||
existingHtml: string = "",
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions?: ExtractorOptions,
|
||||
existingHtml?: string,
|
||||
priority?: number,
|
||||
teamId?: string
|
||||
): Promise<Document> {
|
||||
pageOptions = {
|
||||
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
||||
includeExtract: pageOptions.includeExtract ?? false,
|
||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||
includeHtml: pageOptions.includeHtml ?? false,
|
||||
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
||||
waitFor: pageOptions.waitFor ?? undefined,
|
||||
screenshot: pageOptions.screenshot ?? false,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
|
||||
headers: pageOptions.headers ?? undefined,
|
||||
includeLinks: pageOptions.includeLinks ?? true,
|
||||
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
parsePDF: pageOptions.parsePDF ?? true,
|
||||
removeTags: pageOptions.removeTags ?? [],
|
||||
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
|
||||
useFastMode: pageOptions.useFastMode ?? false,
|
||||
disableJsDom: pageOptions.disableJsDom ?? false,
|
||||
atsv: pageOptions.atsv ?? false
|
||||
}
|
||||
|
||||
if (extractorOptions) {
|
||||
extractorOptions = {
|
||||
mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
|
||||
}
|
||||
}
|
||||
|
||||
if (!existingHtml) {
|
||||
existingHtml = "";
|
||||
}
|
||||
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
const attemptScraping = async (
|
||||
|
@ -180,6 +203,7 @@ export async function scrapSingleUrl(
|
|||
fireEngineOptions: {
|
||||
engine: engine,
|
||||
atsv: pageOptions.atsv,
|
||||
disableJsDom: pageOptions.disableJsDom,
|
||||
},
|
||||
priority,
|
||||
teamId,
|
||||
|
@ -341,8 +365,8 @@ export async function scrapSingleUrl(
|
|||
pageError = undefined;
|
||||
}
|
||||
|
||||
if (text && text.trim().length >= 100) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
|
||||
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
|
||||
break;
|
||||
}
|
||||
if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
|
||||
|
@ -364,20 +388,22 @@ export async function scrapSingleUrl(
|
|||
|
||||
let linksOnPage: string[] | undefined;
|
||||
|
||||
if (pageOptions.includeLinks) {
|
||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||
}
|
||||
|
||||
let document: Document;
|
||||
if (screenshot && screenshot.length > 0) {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||
? rawHtml
|
||||
: undefined,
|
||||
linksOnPage,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
screenshot: screenshot,
|
||||
|
@ -389,11 +415,11 @@ export async function scrapSingleUrl(
|
|||
} else {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||
? rawHtml
|
||||
: undefined,
|
||||
metadata: {
|
||||
|
@ -402,7 +428,7 @@ export async function scrapSingleUrl(
|
|||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
linksOnPage,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -416,9 +442,9 @@ export async function scrapSingleUrl(
|
|||
});
|
||||
return {
|
||||
content: "",
|
||||
markdown: "",
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
|
||||
html: "",
|
||||
linksOnPage: [],
|
||||
linksOnPage: pageOptions.includeLinks ? [] : undefined,
|
||||
metadata: {
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
|
|
|
@ -17,6 +17,8 @@ const socialMediaBlocklist = [
  'researchhub.com',
  'youtube.com',
  'corterix.com',
  'southwest.com',
  'ryanair.com'
];

const allowedKeywords = [
@ -242,5 +242,13 @@ export const urlSpecificParams = {
        engine: "chrome-cdp",
      },
    },
  },
  "lorealparis.hu":{
    defaultScraper: "fire-engine",
    params:{
      fireEngineOptions:{
        engine: "tlsclient",
      },
    },
  }
};
@ -39,16 +39,8 @@ export const excludeNonMainTags = [
  "#search",
  ".share",
  "#share",
  ".pagination",
  "#pagination",
  ".widget",
  "#widget",
  ".related",
  "#related",
  ".tag",
  "#tag",
  ".category",
  "#category",
  ".cookie",
  "#cookie"
];
@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
    description = soup('meta[name="description"]').attr("content") || null;

    // Assuming the language is part of the URL as per the regex pattern
    const pattern = /([a-zA-Z]+-[A-Z]{2})/;
    const match = pattern.exec(url);
    language = match ? match[1] : null;
    language = soup('html').attr('lang') || null;

    keywords = soup('meta[name="keywords"]').attr("content") || null;
    robots = soup('meta[name="robots"]').attr("content") || null;
45
apps/api/src/search/fireEngine.ts
Normal file
@ -0,0 +1,45 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";

dotenv.config();

export async function fireEngineMap(q: string, options: {
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  numResults: number;
  page?: number;
}): Promise<SearchResult[]> {
  let data = JSON.stringify({
    query: q,
    lang: options.lang,
    country: options.country,
    location: options.location,
    tbs: options.tbs,
    numResults: options.numResults,
    page: options.page ?? 1,
  });

  if (!process.env.FIRE_ENGINE_BETA_URL) {
    console.warn("(v1/map Beta) Results might differ from cloud offering currently.");
    return [];
  }

  let config = {
    method: "POST",
    url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
    headers: {
      "Content-Type": "application/json",
    },
    data: data,
  };
  const response = await axios(config);
  if (response && response) {
    return response.data
  } else {
    return [];
  }
}
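
A minimal usage sketch for the new fireEngineMap helper follows. It assumes FIRE_ENGINE_BETA_URL is set in the environment; the relative import path and the query values are illustrative assumptions, not part of the diff.

import { fireEngineMap } from "./fireEngine"; // import path assumed for illustration

async function exampleSearch() {
  // Only numResults is required; the remaining options are optional per the signature above.
  const results = await fireEngineMap("firecrawl docs", {
    numResults: 5,
    lang: "en",
    country: "us",
  });
  console.log(`Got ${results.length} results`);
}

exampleSearch().catch(console.error);
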
@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string


export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
  let proxies = null;
  if (proxy) {
    if (proxy.startsWith("https")) {
@ -1,11 +1,9 @@
import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { serper_search } from "./serper";


export async function search({
  query,
  advanced = false,
@ -32,10 +30,18 @@ export async function search({
  timeout?: number;
}): Promise<SearchResult[]> {
  try {

    if (process.env.SERPER_API_KEY) {
      return await serper_search(query, {num_results, tbs, filter, lang, country, location});
      return await serper_search(query, {
        num_results,
        tbs,
        filter,
        lang,
        country,
        location,
      });
    }
    return await google_search(
    return await googleSearch(
      query,
      advanced,
      num_results,
@ -49,7 +55,6 @@ export async function search({
    );
  } catch (error) {
    Logger.error(`Error in search function: ${error}`);
    return []
    return [];
  }
  // if process.env.SERPER_API_KEY is set, use serper
}
@ -5,7 +5,7 @@ import { supabase_service } from "../supabase";
|
|||
import { Logger } from "../../lib/logger";
|
||||
import { getValue, setValue } from "../redis";
|
||||
import { redlock } from "../redlock";
|
||||
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
const FREE_CREDITS = 500;
|
||||
|
||||
|
@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) {
|
|||
]);
|
||||
|
||||
let couponCredits = 0;
|
||||
let sortedCoupons = [];
|
||||
|
||||
if (coupons && coupons.length > 0) {
|
||||
couponCredits = coupons.reduce(
|
||||
(total, coupon) => total + coupon.credits,
|
||||
0
|
||||
);
|
||||
sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits);
|
||||
}
|
||||
|
||||
let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits);
|
||||
// using coupon credits:
|
||||
if (couponCredits > 0) {
|
||||
// if there is no subscription and they have enough coupon credits
|
||||
|
@ -175,9 +176,25 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
|
||||
}
|
||||
|
||||
// Retrieve the team's active subscription and check for available coupons concurrently
|
||||
const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
|
||||
await Promise.all([
|
||||
|
||||
let cacheKeySubscription = `subscription_${team_id}`;
|
||||
let cacheKeyCoupons = `coupons_${team_id}`;
|
||||
|
||||
// Try to get data from cache first
|
||||
const [cachedSubscription, cachedCoupons] = await Promise.all([
|
||||
getValue(cacheKeySubscription),
|
||||
getValue(cacheKeyCoupons)
|
||||
]);
|
||||
|
||||
let subscription, subscriptionError;
|
||||
let coupons : {credits: number}[];
|
||||
|
||||
if (cachedSubscription && cachedCoupons) {
|
||||
subscription = JSON.parse(cachedSubscription);
|
||||
coupons = JSON.parse(cachedCoupons);
|
||||
} else {
|
||||
// If not in cache, retrieve from database
|
||||
const [subscriptionResult, couponsResult] = await Promise.all([
|
||||
supabase_service
|
||||
.from("subscriptions")
|
||||
.select("id, price_id, current_period_start, current_period_end")
|
||||
|
@ -191,6 +208,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
.eq("status", "active"),
|
||||
]);
|
||||
|
||||
subscription = subscriptionResult.data;
|
||||
subscriptionError = subscriptionResult.error;
|
||||
coupons = couponsResult.data;
|
||||
|
||||
// Cache the results for a minute, sub can be null and that's fine
|
||||
await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null
|
||||
await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute
|
||||
|
||||
}
|
||||
|
||||
let couponCredits = 0;
|
||||
if (coupons && coupons.length > 0) {
|
||||
couponCredits = coupons.reduce(
|
||||
|
@ -199,40 +226,79 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
);
|
||||
}
|
||||
|
||||
// Free credits, no coupons
|
||||
if (subscriptionError || !subscription) {
|
||||
// If there is no active subscription but there are available coupons
|
||||
|
||||
// If there are available coupons and they are enough for the operation
|
||||
if (couponCredits >= credits) {
|
||||
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
|
||||
}
|
||||
|
||||
const { data: creditUsages, error: creditUsageError } =
|
||||
await supabase_service
|
||||
|
||||
// Free credits, no coupons
|
||||
if (!subscription || subscriptionError) {
|
||||
|
||||
let creditUsages;
|
||||
let creditUsageError;
|
||||
let totalCreditsUsed = 0;
|
||||
const cacheKeyCreditUsage = `credit_usage_${team_id}`;
|
||||
|
||||
// Try to get credit usage from cache
|
||||
const cachedCreditUsage = await getValue(cacheKeyCreditUsage);
|
||||
|
||||
if (cachedCreditUsage) {
|
||||
totalCreditsUsed = parseInt(cachedCreditUsage);
|
||||
} else {
|
||||
let retries = 0;
|
||||
const maxRetries = 3;
|
||||
const retryInterval = 2000; // 2 seconds
|
||||
|
||||
while (retries < maxRetries) {
|
||||
// Reminder, this has an 1000 limit.
|
||||
const result = await supabase_service
|
||||
.from("credit_usage")
|
||||
.select("credits_used")
|
||||
.is("subscription_id", null)
|
||||
.eq("team_id", team_id);
|
||||
|
||||
creditUsages = result.data;
|
||||
creditUsageError = result.error;
|
||||
|
||||
if (!creditUsageError) {
|
||||
break;
|
||||
}
|
||||
|
||||
retries++;
|
||||
if (retries < maxRetries) {
|
||||
await new Promise(resolve => setTimeout(resolve, retryInterval));
|
||||
}
|
||||
}
|
||||
|
||||
if (creditUsageError) {
|
||||
Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
|
||||
throw new Error(
|
||||
`Failed to retrieve credit usage for team_id: ${team_id}`
|
||||
);
|
||||
}
|
||||
|
||||
const totalCreditsUsed = creditUsages.reduce(
|
||||
totalCreditsUsed = creditUsages.reduce(
|
||||
(acc, usage) => acc + usage.credits_used,
|
||||
0
|
||||
);
|
||||
|
||||
// Cache the result for 30 seconds
|
||||
await setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30);
|
||||
}
|
||||
|
||||
Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
|
||||
|
||||
const end = new Date();
|
||||
end.setDate(end.getDate() + 30);
|
||||
// check if usage is within 80% of the limit
|
||||
const creditLimit = FREE_CREDITS;
|
||||
const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit;
|
||||
const creditUsagePercentage = totalCreditsUsed / creditLimit;
|
||||
|
||||
if (creditUsagePercentage >= 0.8) {
|
||||
// Add a check to ensure totalCreditsUsed is greater than 0
|
||||
if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
|
||||
Logger.info(`Sending notification for team ${team_id}. Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`);
|
||||
await sendNotification(
|
||||
team_id,
|
||||
NotificationType.APPROACHING_LIMIT,
|
||||
|
@ -242,7 +308,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
}
|
||||
|
||||
// 5. Compare the total credits used with the credits allowed by the plan.
|
||||
if (totalCreditsUsed + credits > FREE_CREDITS) {
|
||||
if (totalCreditsUsed >= FREE_CREDITS) {
|
||||
// Send email notification for insufficient credits
|
||||
await sendNotification(
|
||||
team_id,
|
||||
|
@ -286,7 +352,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
|
||||
if (creditUsages && creditUsages.length > 0) {
|
||||
totalCreditsUsed = creditUsages[0].total_credits_used;
|
||||
await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
|
||||
await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for 8 minutes
|
||||
// Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
|
||||
}
|
||||
}
|
||||
|
@ -299,8 +365,17 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
|
||||
// Adjust total credits used by subtracting coupon value
|
||||
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
|
||||
// Get the price details
|
||||
const { data: price, error: priceError } = await supabase_service
|
||||
|
||||
// Get the price details from cache or database
|
||||
const priceCacheKey = `price_${subscription.price_id}`;
|
||||
let price : {credits: number};
|
||||
|
||||
try {
|
||||
const cachedPrice = await getValue(priceCacheKey);
|
||||
if (cachedPrice) {
|
||||
price = JSON.parse(cachedPrice);
|
||||
} else {
|
||||
const { data, error: priceError } = await supabase_service
|
||||
.from("prices")
|
||||
.select("credits")
|
||||
.eq("id", subscription.price_id)
|
||||
|
@ -312,11 +387,25 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
);
|
||||
}
|
||||
|
||||
price = data;
|
||||
// There are only 21 records, so this is super fine
|
||||
// Cache the price for a long time (e.g., 1 day)
|
||||
await setValue(priceCacheKey, JSON.stringify(price), 86400);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error retrieving or caching price: ${error}`);
|
||||
Sentry.captureException(error);
|
||||
// If errors, just assume it's a big number so user don't get an error
|
||||
price = { credits: 10000000 };
|
||||
}
|
||||
|
||||
const creditLimit = price.credits;
|
||||
const creditUsagePercentage = (adjustedCreditsUsed + credits) / creditLimit;
|
||||
|
||||
// Removal of + credits
|
||||
const creditUsagePercentage = adjustedCreditsUsed / creditLimit;
|
||||
|
||||
// Compare the adjusted total credits used with the credits allowed by the plan
|
||||
if (adjustedCreditsUsed + credits > price.credits) {
|
||||
if (adjustedCreditsUsed >= price.credits) {
|
||||
await sendNotification(
|
||||
team_id,
|
||||
NotificationType.LIMIT_REACHED,
|
||||
|
@ -324,7 +413,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
subscription.current_period_end
|
||||
);
|
||||
return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
|
||||
} else if (creditUsagePercentage >= 0.8) {
|
||||
} else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
|
||||
// Send email notification for approaching credit limit
|
||||
await sendNotification(
|
||||
team_id,
|
||||
|
@ -439,7 +528,7 @@ async function createCreditUsage({
|
|||
subscription_id?: string;
|
||||
credits: number;
|
||||
}) {
|
||||
const { data: credit_usage } = await supabase_service
|
||||
await supabase_service
|
||||
.from("credit_usage")
|
||||
.insert([
|
||||
{
|
||||
|
@ -448,8 +537,7 @@ async function createCreditUsage({
|
|||
subscription_id: subscription_id || null,
|
||||
created_at: new Date(),
|
||||
},
|
||||
])
|
||||
.select();
|
||||
]);
|
||||
|
||||
return { success: true, credit_usage };
|
||||
return { success: true };
|
||||
}
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
import { supabase_service } from "../supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import "dotenv/config";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function logCrawl(job_id: string, team_id: string) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === 'true') {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
try {
|
||||
const { data, error } = await supabase_service
|
||||
.from("bulljobs_teams")
|
||||
|
|
|
@ -4,10 +4,13 @@ import { FirecrawlJob } from "../../types";
|
|||
import { posthog } from "../posthog";
|
||||
import "dotenv/config";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function logJob(job: FirecrawlJob) {
|
||||
try {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -3,12 +3,15 @@ import { ScrapeLog } from "../../types";
|
|||
import { supabase_service } from "../supabase";
|
||||
import { PageOptions } from "../../lib/entities";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
export async function logScrape(
|
||||
scrapeLog: ScrapeLog,
|
||||
pageOptions?: PageOptions
|
||||
) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
Logger.debug("Skipping logging scrape to Supabase");
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -8,10 +8,11 @@ async function addScrapeJobRaw(
  webScraperOptions: any,
  options: any,
  jobId: string,
  jobPriority: number = 10
): Promise<Job> {
  return await getScrapeQueue().add(jobId, webScraperOptions, {
    ...options,
    priority: webScraperOptions.crawl_id ? 20 : 10,
    priority: jobPriority,
    jobId,
  });
}
@ -20,7 +21,9 @@ export async function addScrapeJob(
  webScraperOptions: WebScraperOptions,
  options: any = {},
  jobId: string = uuidv4(),
  jobPriority: number = 10
): Promise<Job> {

  if (Sentry.isInitialized()) {
    const size = JSON.stringify(webScraperOptions).length;
    return await Sentry.startSpan({
@ -39,10 +42,31 @@ export async function addScrapeJob(
        baggage: Sentry.spanToBaggageHeader(span),
        size,
      },
    }, options, jobId);
    }, options, jobId, jobPriority);
    });
  } else {
    return await addScrapeJobRaw(webScraperOptions, options, jobId);
    return await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority);
  }
}

export function waitForJob(jobId: string, timeout: number) {
  return new Promise((resolve, reject) => {
    const start = Date.now();
    const int = setInterval(async () => {
      if (Date.now() >= start + timeout) {
        clearInterval(int);
        reject(new Error("Job wait "));
      } else {
        const state = await getScrapeQueue().getJobState(jobId);
        if (state === "completed") {
          clearInterval(int);
          resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
        } else if (state === "failed") {
          // console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason);
          clearInterval(int);
          reject((await getScrapeQueue().getJob(jobId)).failedReason);
        }
      }
    }, 500);
  })
}
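
As a rough sketch of how these two helpers fit together, a caller could enqueue a scrape with an explicit job id and priority and then block on waitForJob. The payload fields and import path below are illustrative assumptions, not the full WebScraperOptions shape.

import { v4 as uuidv4 } from "uuid";
import { addScrapeJob, waitForJob } from "./queue-jobs"; // import path assumed for illustration

async function scrapeAndWait(url: string, teamId: string) {
  const jobId = uuidv4();
  // Enqueue with default BullMQ options and an explicit priority of 10.
  await addScrapeJob(
    { url, mode: "single_urls", team_id: teamId } as any, // illustrative payload only
    {},
    jobId,
    10
  );
  // waitForJob polls the queue every 500 ms and rejects on timeout or failure.
  const docs = await waitForJob(jobId, 60_000);
  return docs;
}
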
@ -16,6 +16,14 @@ export function getScrapeQueue() {
      scrapeQueueName,
      {
        connection: redisConnection,
        defaultJobOptions: {
          removeOnComplete: {
            age: 90000, // 25 hours
          },
          removeOnFail: {
            age: 90000, // 25 hours
          },
        },
      }
      // {
      //   settings: {
@ -1,5 +1,5 @@
|
|||
import "dotenv/config";
|
||||
import "./sentry"
|
||||
import "./sentry";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { CustomError } from "../lib/custom-error";
|
||||
import {
|
||||
|
@ -17,10 +17,27 @@ import { Logger } from "../lib/logger";
|
|||
import { Worker } from "bullmq";
|
||||
import systemMonitor from "./system-monitor";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, getCrawlJobs, lockURL } from "../lib/crawl-redis";
|
||||
import {
|
||||
addCrawlJob,
|
||||
addCrawlJobDone,
|
||||
crawlToCrawler,
|
||||
finishCrawl,
|
||||
getCrawl,
|
||||
getCrawlJobs,
|
||||
lockURL,
|
||||
} from "../lib/crawl-redis";
|
||||
import { StoredCrawl } from "../lib/crawl-redis";
|
||||
import { addScrapeJob } from "./queue-jobs";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import {
|
||||
addJobPriority,
|
||||
deleteJobPriority,
|
||||
getJobPriority,
|
||||
} from "../../src/lib/job-priority";
|
||||
import { PlanType } from "../types";
|
||||
import { getJobs } from "../../src/controllers/v1/crawl-status";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
if (process.env.ENV === "production") {
|
||||
initSDK({
|
||||
|
@ -50,6 +67,7 @@ const processJobInternal = async (token: string, job: Job) => {
|
|||
await job.extendLock(token, jobLockExtensionTime);
|
||||
}, jobLockExtendInterval);
|
||||
|
||||
await addJobPriority(job.data.team_id, job.id);
|
||||
let err = null;
|
||||
try {
|
||||
const result = await processJob(job, token);
|
||||
|
@ -59,14 +77,14 @@ const processJobInternal = async (token: string, job: Job) => {
|
|||
} else {
|
||||
await job.moveToCompleted(result.docs, token, false);
|
||||
}
|
||||
}catch(e){
|
||||
}
|
||||
} catch (e) {}
|
||||
} catch (error) {
|
||||
console.log("Job failed, error:", error);
|
||||
Sentry.captureException(error);
|
||||
err = error;
|
||||
await job.moveToFailed(error, token, false);
|
||||
} finally {
|
||||
await deleteJobPriority(job.data.team_id, job.id);
|
||||
clearInterval(extendLockInterval);
|
||||
}
|
||||
|
||||
|
@ -80,7 +98,10 @@ process.on("SIGINT", () => {
|
|||
isShuttingDown = true;
|
||||
});
|
||||
|
||||
const workerFun = async (queueName: string, processJobInternal: (token: string, job: Job) => Promise<any>) => {
|
||||
const workerFun = async (
|
||||
queueName: string,
|
||||
processJobInternal: (token: string, job: Job) => Promise<any>
|
||||
) => {
|
||||
const worker = new Worker(queueName, null, {
|
||||
connection: redisConnection,
|
||||
lockDuration: 1 * 60 * 1000, // 1 minute
|
||||
|
@ -109,44 +130,60 @@ const workerFun = async (queueName: string, processJobInternal: (token: string,
|
|||
const job = await worker.getNextJob(token);
|
||||
if (job) {
|
||||
if (job.data && job.data.sentry && Sentry.isInitialized()) {
|
||||
Sentry.continueTrace({ sentryTrace: job.data.sentry.trace, baggage: job.data.sentry.baggage }, () => {
|
||||
Sentry.startSpan({
|
||||
Sentry.continueTrace(
|
||||
{
|
||||
sentryTrace: job.data.sentry.trace,
|
||||
baggage: job.data.sentry.baggage,
|
||||
},
|
||||
() => {
|
||||
Sentry.startSpan(
|
||||
{
|
||||
name: "Scrape job",
|
||||
attributes: {
|
||||
job: job.id,
|
||||
worker: process.env.FLY_MACHINE_ID ?? worker.id,
|
||||
},
|
||||
}, async (span) => {
|
||||
await Sentry.startSpan({
|
||||
},
|
||||
async (span) => {
|
||||
await Sentry.startSpan(
|
||||
{
|
||||
name: "Process scrape job",
|
||||
op: "queue.process",
|
||||
attributes: {
|
||||
"messaging.message.id": job.id,
|
||||
"messaging.destination.name": getScrapeQueue().name,
|
||||
"messaging.message.body.size": job.data.sentry.size,
|
||||
"messaging.message.receive.latency": Date.now() - (job.processedOn ?? job.timestamp),
|
||||
"messaging.message.receive.latency":
|
||||
Date.now() - (job.processedOn ?? job.timestamp),
|
||||
"messaging.message.retry.count": job.attemptsMade,
|
||||
}
|
||||
}, async () => {
|
||||
},
|
||||
},
|
||||
async () => {
|
||||
const res = await processJobInternal(token, job);
|
||||
if (res !== null) {
|
||||
span.setStatus({ code: 2 }); // ERROR
|
||||
} else {
|
||||
span.setStatus({ code: 1 }); // OK
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
);
|
||||
}
|
||||
);
|
||||
}
|
||||
);
|
||||
} else {
|
||||
Sentry.startSpan({
|
||||
Sentry.startSpan(
|
||||
{
|
||||
name: "Scrape job",
|
||||
attributes: {
|
||||
job: job.id,
|
||||
worker: process.env.FLY_MACHINE_ID ?? worker.id,
|
||||
},
|
||||
}, () => {
|
||||
},
|
||||
() => {
|
||||
processJobInternal(token, job);
|
||||
});
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
await sleep(gotJobInterval);
|
||||
|
@ -163,13 +200,20 @@ async function processJob(job: Job, token: string) {
|
|||
|
||||
// Check if the job URL is researchhub and block it immediately
|
||||
// TODO: remove this once solve the root issue
|
||||
if (job.data.url && (job.data.url.includes("researchhub.com") || job.data.url.includes("ebay.com") || job.data.url.includes("youtube.com") || job.data.url.includes("microsoft.com") )) {
|
||||
if (
|
||||
job.data.url &&
|
||||
(job.data.url.includes("researchhub.com") ||
|
||||
job.data.url.includes("ebay.com") ||
|
||||
job.data.url.includes("youtube.com") ||
|
||||
job.data.url.includes("microsoft.com"))
|
||||
) {
|
||||
    Logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
    const data = {
      success: false,
      docs: [],
      project_id: job.data.project_id,
      error: "URL is blocked. Suspicious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
      error:
        "URL is blocked. Suspicious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
    };
|
||||
await job.moveToCompleted(data.docs, token, false);
|
||||
return data;
|
||||
|
@ -188,9 +232,16 @@ async function processJob(job: Job, token: string) {
|
|||
job,
|
||||
token,
|
||||
});
|
||||
|
||||
// Better if we throw here so we capture with the correct error
|
||||
if (!success) {
|
||||
throw new Error(message);
|
||||
}
|
||||
const end = Date.now();
|
||||
const timeTakenInSeconds = (end - start) / 1000;
|
||||
|
||||
const rawHtml = docs[0] ? docs[0].rawHtml : "";
|
||||
|
||||
const data = {
|
||||
success,
|
||||
result: {
|
||||
|
@ -206,8 +257,26 @@ async function processJob(job: Job, token: string) {
|
|||
docs,
|
||||
};
|
||||
|
||||
if (job.data.mode === "crawl") {
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
// No idea what this does and when it is called.
|
||||
if (job.data.mode === "crawl" && !job.data.v1) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.id as string,
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1
|
||||
);
|
||||
}
|
||||
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
|
||||
await callWebhook(
|
||||
job.data.team_id,
|
||||
job.data.crawl_id,
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.page",
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
if (job.data.crawl_id) {
|
||||
|
@ -229,27 +298,35 @@ async function processJob(job: Job, token: string) {
|
|||
|
||||
await addCrawlJobDone(job.data.crawl_id, job.id);
|
||||
|
||||
const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
|
||||
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
|
||||
|
||||
if (!job.data.sitemapped) {
|
||||
if (!sc.cancelled) {
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||
let linksOnPage = [];
|
||||
try{
|
||||
linksOnPage = data.docs[0]?.linksOnPage ?? [];
|
||||
}catch(e){
|
||||
linksOnPage = []
|
||||
}
|
||||
|
||||
const links = crawler.filterLinks(
|
||||
linksOnPage.map(href => crawler.filterURL(href.trim(), sc.originUrl))
|
||||
.filter(x => x !== null),
|
||||
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
|
||||
Infinity,
|
||||
sc.crawlerOptions?.maxDepth ?? 10
|
||||
)
|
||||
);
|
||||
|
||||
for (const link of links) {
|
||||
if (await lockURL(job.data.crawl_id, sc, link)) {
|
||||
const newJob = await addScrapeJob({
|
||||
// This seems to work really well
|
||||
const jobPriority = await getJobPriority({
|
||||
plan: sc.plan as PlanType,
|
||||
team_id: sc.team_id,
|
||||
basePriority: job.data.crawl_id ? 20 : 10,
|
||||
});
|
||||
const jobId = uuidv4();
|
||||
|
||||
// console.log("plan: ", sc.plan);
|
||||
// console.log("team_id: ", sc.team_id)
|
||||
// console.log("base priority: ", job.data.crawl_id ? 20 : 10)
|
||||
// console.log("job priority: " , jobPriority, "\n\n\n")
|
||||
|
||||
const newJob = await addScrapeJob(
|
||||
{
|
||||
url: link,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
|
@ -257,7 +334,12 @@ async function processJob(job: Job, token: string) {
|
|||
pageOptions: sc.pageOptions,
|
||||
origin: job.data.origin,
|
||||
crawl_id: job.data.crawl_id,
|
||||
});
|
||||
v1: job.data.v1,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
jobPriority
|
||||
);
|
||||
|
||||
await addCrawlJob(job.data.crawl_id, newJob.id);
|
||||
}
|
||||
|
@ -266,35 +348,21 @@ async function processJob(job: Job, token: string) {
|
|||
}
|
||||
|
||||
if (await finishCrawl(job.data.crawl_id)) {
|
||||
|
||||
|
||||
if (!job.data.v1) {
|
||||
const jobIDs = await getCrawlJobs(job.data.crawl_id);
|
||||
|
||||
const jobs = (await Promise.all(jobIDs.map(async x => {
|
||||
if (x === job.id) {
|
||||
return {
|
||||
async getState() {
|
||||
return "completed"
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
returnvalue: docs,
|
||||
}
|
||||
}
|
||||
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobStatuses = await Promise.all(jobs.map((x) => x.getState()));
|
||||
const jobStatus =
|
||||
sc.cancelled || jobStatuses.some((x) => x === "failed")
|
||||
? "failed"
|
||||
: "completed";
|
||||
|
||||
const j = await getScrapeQueue().getJob(x);
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobById(j.id);
|
||||
|
||||
if (supabaseData) {
|
||||
j.returnvalue = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
return j;
|
||||
}))).sort((a, b) => a.timestamp - b.timestamp);
|
||||
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
|
||||
const jobStatus = sc.cancelled || jobStatuses.some(x => x === "failed") ? "failed" : "completed";
|
||||
|
||||
const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
|
||||
const fullDocs = jobs.map((x) =>
|
||||
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
|
||||
);
|
||||
|
||||
await logJob({
|
||||
job_id: job.data.crawl_id,
|
||||
|
@ -326,7 +394,52 @@ async function processJob(job: Job, token: string) {
|
|||
docs: fullDocs,
|
||||
};
|
||||
|
||||
await callWebhook(job.data.team_id, job.data.crawl_id, data);
|
||||
// v0 web hooks, call when done with all the data
|
||||
if (!job.data.v1) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.data.crawl_id,
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.completed"
|
||||
);
|
||||
}
|
||||
} else {
|
||||
const jobIDs = await getCrawlJobs(job.data.crawl_id);
|
||||
const jobStatuses = await Promise.all(jobIDs.map((x) => getScrapeQueue().getJobState(x)));
|
||||
const jobStatus =
|
||||
sc.cancelled || jobStatuses.some((x) => x === "failed")
|
||||
? "failed"
|
||||
: "completed";
|
||||
|
||||
// v1 web hooks, call when done with no data, but with event completed
|
||||
if (job.data.v1 && job.data.webhook) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.data.crawl_id,
|
||||
[],
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.completed"
|
||||
);
|
||||
}
|
||||
|
||||
await logJob({
|
||||
job_id: job.data.crawl_id,
|
||||
success: jobStatus === "completed",
|
||||
message: sc.cancelled ? "Cancelled" : message,
|
||||
num_docs: jobIDs.length,
|
||||
docs: [],
|
||||
time_taken: (Date.now() - sc.createdAt) / 1000,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: sc.originUrl,
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
pageOptions: sc.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -335,11 +448,13 @@ async function processJob(job: Job, token: string) {
|
|||
} catch (error) {
|
||||
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
|
||||
|
||||
if (!(error instanceof Error && error.message.includes("JSON parsing error(s): "))) {
|
||||
Sentry.captureException(error, {
|
||||
data: {
|
||||
job: job.id
|
||||
job: job.id,
|
||||
},
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
if (error instanceof CustomError) {
|
||||
// Here we handle the error, then save the failed job
|
||||
|
@ -369,8 +484,24 @@ async function processJob(job: Job, token: string) {
|
|||
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
|
||||
};
|
||||
|
||||
if (job.data.mode === "crawl" || job.data.crawl_id) {
|
||||
await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
|
||||
if (!job.data.v1 && (job.data.mode === "crawl" || job.data.crawl_id)) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.data.crawl_id ?? (job.id as string),
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1
|
||||
);
|
||||
}
|
||||
if (job.data.v1) {
|
||||
callWebhook(
|
||||
job.data.team_id,
|
||||
job.id as string,
|
||||
[],
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.failed"
|
||||
);
|
||||
}
|
||||
|
||||
if (job.data.crawl_id) {
|
||||
|
@ -380,7 +511,8 @@ async function processJob(job: Job, token: string) {
|
|||
message:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error.message ?? "Something went wrong... Contact help@mendable.ai",
|
||||
: error.message ??
|
||||
"Something went wrong... Contact help@mendable.ai",
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
|
@ -401,7 +533,8 @@ async function processJob(job: Job, token: string) {
|
|||
message:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error.message ?? "Something went wrong... Contact help@mendable.ai",
|
||||
: error.message ??
|
||||
"Something went wrong... Contact help@mendable.ai",
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
|
|
|
@ -65,7 +65,7 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someToken",
|
||||
"standard"
|
||||
);
|
||||
expect(limiter2.points).toBe(50);
|
||||
expect(limiter2.points).toBe(100);
|
||||
|
||||
const limiter3 = getRateLimiter(
|
||||
"search" as RateLimiterMode,
|
||||
|
@ -79,7 +79,7 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someToken",
|
||||
"growth"
|
||||
);
|
||||
expect(limiter4.points).toBe(150);
|
||||
expect(limiter4.points).toBe(250);
|
||||
});
|
||||
|
||||
it("should return the default rate limiter if plan is not provided", () => {
|
||||
|
@ -153,7 +153,7 @@ describe("Rate Limiter Service", () => {
|
|||
"crawlStatus" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(150);
|
||||
expect(limiter2.points).toBe(250);
|
||||
});
|
||||
|
||||
it("should consume points correctly for 'crawl' mode", async () => {
|
||||
|
@ -188,14 +188,13 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someTokenXY",
|
||||
"hobby"
|
||||
);
|
||||
// expect hobby to have 100 points
|
||||
expect(limiter.points).toBe(10);
|
||||
expect(limiter.points).toBe(20);
|
||||
|
||||
const consumePoints = 5;
|
||||
|
||||
const res = await limiter.consume("test-prefix:someTokenXY", consumePoints);
|
||||
expect(res.consumedPoints).toBe(5);
|
||||
expect(res.remainingPoints).toBe(5);
|
||||
expect(res.remainingPoints).toBe(15);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'crawl' mode", () => {
|
||||
|
@ -227,7 +226,7 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(5);
|
||||
expect(limiter.points).toBe(10);
|
||||
|
||||
const limiter2 = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
|
@ -241,7 +240,14 @@ describe("Rate Limiter Service", () => {
|
|||
"test-prefix:someToken",
|
||||
"standard"
|
||||
);
|
||||
expect(limiter3.points).toBe(50);
|
||||
expect(limiter3.points).toBe(100);
|
||||
|
||||
const limiter4 = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"growth"
|
||||
);
|
||||
expect(limiter4.points).toBe(1000);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'search' mode", () => {
|
||||
|
@ -309,7 +315,7 @@ describe("Rate Limiter Service", () => {
|
|||
"crawlStatus" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(150);
|
||||
expect(limiter2.points).toBe(250);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'testSuite' mode", () => {
|
||||
|
|
|
@ -6,7 +6,7 @@ const RATE_LIMITS = {
  crawl: {
    default: 3,
    free: 2,
    starter: 3,
    starter: 10,
    standard: 5,
    standardOld: 40,
    scale: 50,
@ -17,9 +17,22 @@ const RATE_LIMITS = {
    growthdouble: 50,
  },
  scrape: {
    default: 20,
    free: 10,
    starter: 100,
    standard: 100,
    standardOld: 100,
    scale: 500,
    hobby: 20,
    standardNew: 100,
    standardnew: 100,
    growth: 1000,
    growthdouble: 1000,
  },
  search: {
    default: 20,
    free: 5,
    starter: 20,
    starter: 50,
    standard: 50,
    standardOld: 40,
    scale: 500,
@ -29,12 +42,12 @@ const RATE_LIMITS = {
    growth: 500,
    growthdouble: 500,
  },
  search: {
  map:{
    default: 20,
    free: 5,
    starter: 20,
    standard: 40,
    standardOld: 40,
    starter: 50,
    standard: 50,
    standardOld: 50,
    scale: 500,
    hobby: 10,
    standardNew: 50,
@ -52,7 +65,7 @@ const RATE_LIMITS = {
  },
  crawlStatus: {
    free: 150,
    default: 150,
    default: 250,
  },
  testSuite: {
    free: 10000,
@ -91,6 +104,25 @@ export const devBRateLimiter = new RateLimiterRedis({
  duration: 60, // Duration in seconds
});

export const manualRateLimiter = new RateLimiterRedis({
  storeClient: redisRateLimitClient,
  keyPrefix: "manual",
  points: 2000,
  duration: 60, // Duration in seconds
});


export const scrapeStatusRateLimiter = new RateLimiterRedis({
  storeClient: redisRateLimitClient,
  keyPrefix: "scrape-status",
  points: 400,
  duration: 60, // Duration in seconds
});

const testSuiteTokens = ["a01ccae", "6254cf9", "0f96e673", "23befa1b", "69141c4"];

const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];

export function getRateLimiter(
  mode: RateLimiterMode,
  token: string,
@ -98,14 +130,18 @@ export function getRateLimiter(
  teamId?: string
) {

  if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673")) {
  if (testSuiteTokens.some(testToken => token.includes(testToken))) {
    return testSuiteRateLimiter;
  }

  if(teamId === process.env.DEV_B_TEAM_ID) {
  if(teamId && teamId === process.env.DEV_B_TEAM_ID) {
    return devBRateLimiter;
  }

  if(teamId && manual.includes(teamId)) {
    return manualRateLimiter;
  }

  const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5}

  if (!rateLimitConfig) return serverRateLimiter;
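
For context, a sketch of how a caller might consume points from the limiter returned here: rate-limiter-flexible's consume() resolves with the updated counters and rejects when the budget is exhausted. The mode, token, plan value, and import paths below are placeholders, not taken from the diff.

import { getRateLimiter } from "./rate-limiter"; // import path assumed for illustration
import { RateLimiterMode } from "../types"; // assumed location of the enum

async function checkLimit(token: string, teamId?: string) {
  const limiter = getRateLimiter("scrape" as RateLimiterMode, token, "standard", teamId);
  try {
    const res = await limiter.consume(token, 1); // spend one point for this request
    return { allowed: true, remaining: res.remainingPoints };
  } catch (rejected) {
    // consume() rejects with a RateLimiterRes when no points are left
    return { allowed: false, retryAfterMs: (rejected as any).msBeforeNext };
  }
}
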
@ -1,5 +1,7 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js";
import { Logger } from "../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();

// SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService {
@ -8,8 +10,9 @@ class SupabaseService {
  constructor() {
    const supabaseUrl = process.env.SUPABASE_URL;
    const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
    // Only initialize the Supabase client if both URL and Service Token are provided.
    if (process.env.USE_DB_AUTHENTICATION === "false") {
    if (!useDbAuthentication) {
      // Warn the user that Authentication is disabled by setting the client to null
      Logger.warn(
        "Authentication is disabled. Supabase client will not be initialized."
Some files were not shown because too many files have changed in this diff