Merge branch 'main' into o1-crawler

This commit is contained in:
Eric Ciarla 2024-09-16 16:06:15 -04:00 committed by GitHub
commit 2619522fe7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
177 changed files with 14938 additions and 5251 deletions

View File

@ -1,7 +1,7 @@
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
title: "[Bug] "
labels: bug
assignees: ''

View File

@ -1,7 +1,7 @@
---
name: Feature request
about: Suggest an idea for this project
title: "[Feat]"
title: "[Feat] "
labels: ''
assignees: ''

View File

@ -0,0 +1,40 @@
---
name: Self-host issue
about: Report an issue with self-hosting Firecrawl
title: "[Self-Host] "
labels: self-host
assignees: ''
---
**Describe the Issue**
Provide a clear and concise description of the self-hosting issue you're experiencing.
**To Reproduce**
Steps to reproduce the issue:
1. Configure the environment or settings with '...'
2. Run the command '...'
3. Observe the error or unexpected output at '...'
4. Log output/error message
**Expected Behavior**
A clear and concise description of what you expected to happen when self-hosting.
**Screenshots**
If applicable, add screenshots or copies of the command line output to help explain the self-hosting issue.
**Environment (please complete the following information):**
- OS: [e.g. macOS, Linux, Windows]
- Firecrawl Version: [e.g. 1.2.3]
- Node.js Version: [e.g. 14.x]
- Docker Version (if applicable): [e.g. 20.10.14]
- Database Type and Version: [e.g. PostgreSQL 13.4]
**Logs**
If applicable, include detailed logs to help understand the self-hosting problem.
**Configuration**
Provide relevant parts of your configuration files (with sensitive information redacted).
**Additional Context**
Add any other context about the self-hosting issue here, such as specific infrastructure details, network setup, or any modifications made to the original Firecrawl setup.

42
.github/archive/publish-rust-sdk.yml vendored Normal file
View File

@ -0,0 +1,42 @@
name: Publish Rust SDK
on: []
env:
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
jobs:
build-and-publish:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up Rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
default: true
profile: minimal
- name: Install dependencies
run: cargo build --release
- name: Run version check script
id: version_check_script
run: |
VERSION_INCREMENTED=$(cargo search --limit 1 my_crate_name | grep my_crate_name)
echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV
- name: Build the package
if: ${{ env.VERSION_INCREMENTED == 'true' }}
run: cargo package
working-directory: ./apps/rust-sdk
- name: Publish to crates.io
if: ${{ env.VERSION_INCREMENTED == 'true' }}
env:
CARGO_REG_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
run: cargo publish
working-directory: ./apps/rust-sdk

61
.github/archive/rust-sdk.yml vendored Normal file
View File

@ -0,0 +1,61 @@
name: Run Rust SDK E2E Tests
on: []
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
PORT: ${{ secrets.PORT }}
REDIS_URL: ${{ secrets.REDIS_URL }}
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs:
build:
runs-on: ubuntu-latest
services:
redis:
image: redis
ports:
- 6379:6379
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install pnpm
run: npm install -g pnpm
- name: Install dependencies for API
run: pnpm install
working-directory: ./apps/api
- name: Start the application
run: npm start &
working-directory: ./apps/api
id: start_app
- name: Start workers
run: npm run workers &
working-directory: ./apps/api
id: start_workers
- name: Set up Rust
uses: actions/setup-rust@v1
with:
rust-version: stable
- name: Try the lib build
working-directory: ./apps/rust-sdk
run: cargo build
- name: Run E2E tests for Rust SDK
run: cargo test --test e2e_with_auth

View File

@ -15,6 +15,7 @@ false
"""
import json
import toml
import os
import re
import sys
@ -53,6 +54,19 @@ def get_npm_version(package_name: str) -> str:
version = response.json()['version']
return version.strip()
def get_rust_version(file_path: str) -> str:
    """Read the package version declared in a Cargo.toml manifest.

    Args:
        file_path: Path of the Cargo.toml file to parse.

    Returns:
        The ``version`` value found under the ``package`` table, with
        surrounding whitespace stripped.

    Raises:
        RuntimeError: If the parsed manifest has no ``package`` table or
            that table has no ``version`` entry.
    """
    manifest = toml.load(file_path)
    # Bail out early when either the table or the key is absent.
    if 'package' not in manifest or 'version' not in manifest['package']:
        raise RuntimeError("Unable to find version string in Cargo.toml.")
    return manifest['package']['version'].strip()
def get_crates_version(package_name: str) -> str:
    """Fetch the newest published version of a crate from crates.io.

    Args:
        package_name: Name of the crate to look up.

    Returns:
        The ``newest_version`` field of the ``crate`` object in the JSON
        response, with surrounding whitespace stripped.
    """
    # NOTE(review): the HTTP status is not checked here, so an unknown crate
    # presumably surfaces as a lookup error on the JSON body -- confirm.
    crate_url = f"https://crates.io/api/v1/crates/{package_name}"
    payload = requests.get(crate_url).json()
    newest = payload['crate']['newest_version']
    return newest.strip()
def is_version_incremented(local_version: str, published_version: str) -> bool:
"""Compare local and published versions."""
local_version_parsed: Version = parse_version(local_version)
@ -74,6 +88,12 @@ if __name__ == "__main__":
current_version = get_js_version(os.path.join(package_path, 'package.json'))
# Get published version from npm
published_version = get_npm_version(package_name)
if package_type == "rust":
# Get current version from Cargo.toml
current_version = get_rust_version(os.path.join(package_path, 'Cargo.toml'))
# Get published version from crates.io
published_version = get_crates_version(package_name)
else:
raise ValueError("Invalid package type. Use 'python' or 'js'.")

View File

@ -1,2 +1,3 @@
requests
packaging
toml

View File

@ -28,7 +28,8 @@ env:
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
ENV: ${{ secrets.ENV }}
jobs:
pre-deploy:

View File

@ -22,12 +22,19 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
ENV: ${{ secrets.ENV }}
jobs:
deploy:
name: Deploy app
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- uses: actions/checkout@v3
- uses: superfly/flyctl-actions/setup-flyctl@master

View File

@ -26,7 +26,10 @@ env:
PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
ENV: ${{ secrets.ENV }}
jobs:
pre-deploy-e2e-tests:
@ -56,6 +59,9 @@ jobs:
run: npm run workers &
working-directory: ./apps/api
id: start_workers
- name: Wait for the application to be ready
run: |
sleep 10
- name: Run E2E tests
run: |
npm run test:prod
@ -132,7 +138,7 @@ jobs:
working-directory: ./apps/python-sdk
- name: Run E2E tests for Python SDK
run: |
pytest firecrawl/__tests__/e2e_withAuth/test.py
pytest firecrawl/__tests__/v1/e2e_withAuth/test.py
working-directory: ./apps/python-sdk
js-sdk-tests:
@ -205,10 +211,45 @@ jobs:
run: go test -v ./... -timeout 180s
working-directory: ./apps/go-sdk/firecrawl
rust-sdk-tests:
name: Rust SDK Tests
needs: pre-deploy-e2e-tests
runs-on: ubuntu-latest
services:
redis:
image: redis
ports:
- 6379:6379
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install pnpm
run: npm install -g pnpm
- name: Install dependencies for API
run: pnpm install
working-directory: ./apps/api
- name: Start the application
run: npm start &
working-directory: ./apps/api
id: start_app
- name: Start workers
run: npm run workers &
working-directory: ./apps/api
id: start_workers
- name: Set up Rust
uses: actions/setup-rust@v1
with:
rust-version: stable
- name: Try the lib build
working-directory: ./apps/rust-sdk
run: cargo build
- name: Run E2E tests for Rust SDK
run: cargo test --test e2e_with_auth
deploy:
name: Deploy app
runs-on: ubuntu-latest
needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests]
needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests, rust-sdk-tests]
steps:
- uses: actions/checkout@v3
- uses: superfly/flyctl-actions/setup-flyctl@master
@ -299,4 +340,39 @@ jobs:
run: |
npm run build-and-publish
working-directory: ./apps/js-sdk/firecrawl
build-and-publish-rust-sdk:
name: Build and publish Rust SDK
runs-on: ubuntu-latest
needs: deploy
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up Rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
default: true
profile: minimal
- name: Install dependencies
run: cargo build --release
- name: Run version check script
id: version_check_script
run: |
VERSION_INCREMENTED=$(cargo search --limit 1 my_crate_name | grep my_crate_name)
echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV
- name: Build the package
if: ${{ env.VERSION_INCREMENTED == 'true' }}
run: cargo package
working-directory: ./apps/rust-sdk
- name: Publish to crates.io
if: ${{ env.VERSION_INCREMENTED == 'true' }}
env:
CARGO_REG_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
run: cargo publish
working-directory: ./apps/rust-sdk

8
.gitignore vendored
View File

@ -19,4 +19,10 @@ apps/test-suite/load-test-results/test-run-report.json
apps/playwright-service-ts/node_modules/
apps/playwright-service-ts/package-lock.json
/examples/o1_web_crawler /venv
/examples/o1_web_crawler/venv
*.pyc
.rdb
apps/js-sdk/firecrawl/dist

8
.gitmodules vendored
View File

@ -1,6 +1,6 @@
[submodule "apps/go-sdk/firecrawl"]
path = apps/go-sdk/firecrawl
[submodule "apps/go-sdk/firecrawl-go"]
path = apps/go-sdk/firecrawl-go
url = https://github.com/mendableai/firecrawl-go
[submodule "apps/go-sdk/examples"]
path = apps/go-sdk/examples
[submodule "apps/go-sdk/firecrawl-go-examples"]
path = apps/go-sdk/firecrawl-go-examples
url = https://github.com/mendableai/firecrawl-go-examples

View File

@ -44,7 +44,6 @@ BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs

376
README.md
View File

@ -1,3 +1,37 @@
<h3 align="center">
<img
src="https://raw.githubusercontent.com/mendableai/firecrawl/main/img/firecrawl_logo.png"
height="200"
>
</h3>
<div align="center">
<a href="https://github.com/mendableai/firecrawl/blob/main/LICENSE">
<img src="https://img.shields.io/github/license/mendableai/firecrawl" alt="License">
</a>
<a href="https://pepy.tech/project/firecrawl-py">
<img src="https://static.pepy.tech/badge/firecrawl-py" alt="Downloads">
</a>
<a href="https://GitHub.com/mendableai/firecrawl/graphs/contributors">
<img src="https://img.shields.io/github/contributors/mendableai/firecrawl.svg" alt="GitHub Contributors">
</a>
<a href="https://firecrawl.dev">
<img src="https://img.shields.io/badge/Visit-firecrawl.dev-orange" alt="Visit firecrawl.dev">
</a>
</div>
<div>
<p align="center">
<a href="https://twitter.com/firecrawl_dev">
<img src="https://img.shields.io/badge/Follow%20on%20X-000000?style=for-the-badge&logo=x&logoColor=white" alt="Follow on X" />
</a>
<a href="https://www.linkedin.com/company/104100957">
<img src="https://img.shields.io/badge/Follow%20on%20LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white" alt="Follow on LinkedIn" />
</a>
<a href="https://discord.com/invite/gSmWdAkdwd">
<img src="https://img.shields.io/badge/Join%20our%20Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Join our Discord" />
</a>
</p>
</div>
# 🔥 Firecrawl
Crawl and convert any website into LLM-ready markdown or structured data. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) and the Firecrawl community. Includes powerful scraping, crawling and data extraction capabilities.
@ -6,11 +40,13 @@ _This repository is in its early development stages. We are still merging custom
## What is Firecrawl?
[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required.
[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. Check out our [documentation](https://docs.firecrawl.dev).
_Pst. hey, you, join our stargazers :)_
<img src="https://github.com/mendableai/firecrawl/assets/44934913/53c4483a-0f0e-40c6-bd84-153a07f94d29" width="200">
<a href="https://github.com/mendableai/firecrawl">
<img src="https://img.shields.io/github/stars/mendableai/firecrawl.svg?style=social&label=Star&maxAge=2592000" alt="GitHub stars">
</a>
## How to use it?
@ -41,18 +77,26 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
Used to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl.
```bash
curl -X POST https://api.firecrawl.dev/v0/crawl \
curl -X POST https://api.firecrawl.dev/v1/crawl \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-H 'Authorization: Bearer fc-YOUR_API_KEY' \
-d '{
"url": "https://mendable.ai"
"url": "https://docs.firecrawl.dev",
"limit": 100,
"scrapeOptions": {
"formats": ["markdown", "html"]
}
}'
```
Returns a jobId
Returns a crawl job id and the url to check the status of the crawl.
```json
{ "jobId": "1234-5678-9101" }
{
"success": true,
"id": "123-456-789",
"url": "https://api.firecrawl.dev/v1/crawl/123-456-789"
}
```
### Check Crawl Job
@ -60,7 +104,7 @@ Returns a jobId
Used to check the status of a crawl job and get its result.
```bash
curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
curl -X GET https://api.firecrawl.dev/v1/crawl/123-456-789 \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY'
```
@ -68,18 +112,20 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
```json
{
"status": "completed",
"current": 22,
"total": 22,
"total": 36,
"creditsUsed": 36,
"expiresAt": "2024-00-00T00:00:00.000Z",
"data": [
{
"content": "Raw Content ",
"markdown": "# Markdown Content",
"provider": "web-scraper",
"markdown": "[Firecrawl Docs home page![light logo](https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/logo/light.svg)!...",
"html": "<!DOCTYPE html><html lang=\"en\" class=\"js-focus-visible lg:[--scroll-mt:9.5rem]\" data-js-focus-visible=\"\">...",
"metadata": {
"title": "Mendable | AI for CX and Sales",
"description": "AI for CX and Sales",
"language": null,
"sourceURL": "https://www.mendable.ai/"
"title": "Build a 'Chat with website' using Groq Llama 3 | Firecrawl",
"language": "en",
"sourceURL": "https://docs.firecrawl.dev/learn/rag-llama3",
"description": "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot.",
"ogLocaleAlternate": [],
"statusCode": 200
}
}
]
@ -88,14 +134,15 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
### Scraping
Used to scrape a URL and get its content.
Used to scrape a URL and get its content in the specified formats.
```bash
curl -X POST https://api.firecrawl.dev/v0/scrape \
curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"url": "https://mendable.ai"
"url": "https://docs.firecrawl.dev",
"formats" : ["markdown", "html"]
}'
```
@ -105,68 +152,95 @@ Response:
{
"success": true,
"data": {
"content": "Raw Content ",
"markdown": "# Markdown Content",
"provider": "web-scraper",
"markdown": "Launch Week I is here! [See our Day 2 Release 🚀](https://www.firecrawl.dev/blog/launch-week-i-day-2-doubled-rate-limits)[💥 Get 2 months free...",
"html": "<!DOCTYPE html><html lang=\"en\" class=\"light\" style=\"color-scheme: light;\"><body class=\"__variable_36bd41 __variable_d7dc5d font-inter ...",
"metadata": {
"title": "Mendable | AI for CX and Sales",
"description": "AI for CX and Sales",
"language": null,
"sourceURL": "https://www.mendable.ai/"
"title": "Home - Firecrawl",
"description": "Firecrawl crawls and converts any website into clean markdown.",
"language": "en",
"keywords": "Firecrawl,Markdown,Data,Mendable,Langchain",
"robots": "follow, index",
"ogTitle": "Firecrawl",
"ogDescription": "Turn any website into LLM-ready data.",
"ogUrl": "https://www.firecrawl.dev/",
"ogImage": "https://www.firecrawl.dev/og.png?123",
"ogLocaleAlternate": [],
"ogSiteName": "Firecrawl",
"sourceURL": "https://firecrawl.dev",
"statusCode": 200
}
}
}
```
### Search (Beta)
### Map (Alpha)
Used to search the web, get the most relevant results, scrape each page and return the markdown.
Used to map a URL and get urls of the website. This returns most links present on the website.
```bash
curl -X POST https://api.firecrawl.dev/v0/search \
```bash cURL
curl -X POST https://api.firecrawl.dev/v1/map \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"query": "firecrawl",
"pageOptions": {
"fetchPageContent": true // false for a fast serp api
}
"url": "https://firecrawl.dev"
}'
```
Response:
```json
{
"success": true,
"data": [
{
"url": "https://mendable.ai",
"markdown": "# Markdown Content",
"provider": "web-scraper",
"metadata": {
"title": "Mendable | AI for CX and Sales",
"description": "AI for CX and Sales",
"language": null,
"sourceURL": "https://www.mendable.ai/"
}
}
"status": "success",
"links": [
"https://firecrawl.dev",
"https://www.firecrawl.dev/pricing",
"https://www.firecrawl.dev/blog",
"https://www.firecrawl.dev/playground",
"https://www.firecrawl.dev/smart-crawl"
]
}
```
### Intelligent Extraction (Beta)
#### Map with search
Map with `search` param allows you to search for specific urls inside a website.
```bash cURL
curl -X POST https://api.firecrawl.dev/v1/map \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"url": "https://firecrawl.dev",
"search": "docs"
}'
```
Response will be an ordered list from the most relevant to the least relevant.
```json
{
"status": "success",
"links": [
"https://docs.firecrawl.dev",
"https://docs.firecrawl.dev/sdks/python",
"https://docs.firecrawl.dev/learn/rag-llama3"
]
}
```
### LLM Extraction (Beta)
Used to extract structured data from scraped pages.
```bash
curl -X POST https://api.firecrawl.dev/v0/scrape \
curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"url": "https://www.mendable.ai/",
"extractorOptions": {
"mode": "llm-extraction",
"extractionPrompt": "Based on the information on the page, extract the information from the schema. ",
"extractionSchema": {
"formats": ["extract"],
"extract": {
"schema": {
"type": "object",
"properties": {
"company_mission": {
@ -220,6 +294,59 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \
}
```
### Extracting without a schema (New)
You can now extract without a schema by just passing a `prompt` to the endpoint. The LLM chooses the structure of the data.
```bash
curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"url": "https://docs.firecrawl.dev/",
"formats": ["extract"],
"extract": {
"prompt": "Extract the company mission from the page."
}
}'
```
### Search (v0) (Beta)
Used to search the web, get the most relevant results, scrape each page and return the markdown.
```bash
curl -X POST https://api.firecrawl.dev/v0/search \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"query": "firecrawl",
"pageOptions": {
"fetchPageContent": true // false for a fast serp api
}
}'
```
```json
{
"success": true,
"data": [
{
"url": "https://mendable.ai",
"markdown": "# Markdown Content",
"provider": "web-scraper",
"metadata": {
"title": "Mendable | AI for CX and Sales",
"description": "AI for CX and Sales",
"language": null,
"sourceURL": "https://www.mendable.ai/"
}
}
]
}
```
## Using Python SDK
### Installing Python SDK
@ -231,24 +358,28 @@ pip install firecrawl-py
### Crawl a website
```python
from firecrawl import FirecrawlApp
from firecrawl.firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="YOUR_API_KEY")
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
# Scrape a website:
scrape_status = app.scrape_url(
'https://firecrawl.dev',
params={'formats': ['markdown', 'html']}
)
print(scrape_status)
# Get the markdown
for result in crawl_result:
print(result['markdown'])
```
### Scraping a URL
To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
```python
url = 'https://example.com'
scraped_data = app.scrape_url(url)
# Crawl a website:
crawl_status = app.crawl_url(
'https://firecrawl.dev',
params={
'limit': 100,
'scrapeOptions': {'formats': ['markdown', 'html']}
},
wait_until_done=True,
poll_interval=30
)
print(crawl_status)
```
### Extracting structured data from a URL
@ -256,6 +387,11 @@ scraped_data = app.scrape_url(url)
With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how to use it:
```python
from firecrawl.firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
class ArticleSchema(BaseModel):
title: str
points: int
@ -266,24 +402,12 @@ class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
data = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': TopArticlesSchema.model_json_schema(),
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
'formats': ['extract'],
'extract': {
'schema': TopArticlesSchema.model_json_schema()
}
})
print(data["llm_extraction"])
```
### Search for a query
Performs a web search, retrieve the top results, extract data from each page, and returns their markdown.
```python
query = 'What is Mendable?'
search_result = app.search(query)
print(data["extract"])
```
## Using the Node SDK
@ -301,54 +425,33 @@ npm install @mendable/firecrawl-js
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
### Scraping a URL
To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
```js
try {
const url = "https://example.com";
const scrapedData = await app.scrapeUrl(url);
console.log(scrapedData);
} catch (error) {
console.error("Error occurred while scraping:", error.message);
import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
// Scrape a website
const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
formats: ['markdown', 'html'],
});
if (scrapeResponse) {
console.log(scrapeResponse)
}
// Crawl a website
const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
limit: 100,
scrapeOptions: {
formats: ['markdown', 'html'],
}
} as CrawlParams, true, 30) as CrawlStatusResponse;
if (crawlResponse) {
console.log(crawlResponse)
}
```
### Crawling a Website
To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
```js
const crawlUrl = "https://example.com";
const params = {
crawlerOptions: {
excludes: ["blog/"],
includes: [], // leave empty for all pages
limit: 1000,
},
pageOptions: {
onlyMainContent: true,
},
};
const waitUntilDone = true;
const timeout = 5;
const crawlResult = await app.crawlUrl(
crawlUrl,
params,
waitUntilDone,
timeout
);
```
### Checking Crawl Status
To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
```js
const status = await app.checkCrawlStatus(jobId);
console.log(status);
```
### Extracting structured data from a URL
@ -359,7 +462,7 @@ import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";
const app = new FirecrawlApp({
apiKey: "fc-YOUR_API_KEY",
apiKey: "fc-YOUR_API_KEY"
});
// Define schema to extract contents into
@ -384,19 +487,6 @@ const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
console.log(scrapeResult.data["llm_extraction"]);
```
### Search for a query
With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.
```js
const query = "what is mendable?";
const searchResults = await app.search(query, {
pageOptions: {
fetchPageContent: true, // Fetch the page content for each search result
},
});
```
## Contributing
We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.

View File

@ -65,7 +65,6 @@ BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
@ -107,7 +106,7 @@ You should be able to see the Bull Queue Manager UI on `http://localhost:3002/ad
If you'd like to test the crawl endpoint, you can run this:
```bash
curl -X POST http://localhost:3002/v0/crawl \
curl -X POST http://localhost:3002/v1/crawl \
-H 'Content-Type: application/json' \
-d '{
"url": "https://mendable.ai"

View File

@ -32,8 +32,6 @@ BULL_AUTH_KEY=@
LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you have a serper key you'd like to use as a search api
SERPER_API_KEY=
# set if you'd like to send slack server health status messages
SLACK_WEBHOOK_URL=
# set if you'd like to send posthog events like job logs

2
apps/api/.gitignore vendored
View File

@ -7,5 +7,5 @@ dump.rdb
/.next/
# Sentry Config File
.rdb
.sentryclirc

View File

@ -17,8 +17,15 @@ RUN pnpm install
RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \
bash -c 'export SENTRY_AUTH_TOKEN="$(cat /run/secrets/SENTRY_AUTH_TOKEN)"; if [ -z $SENTRY_AUTH_TOKEN ]; then pnpm run build:nosentry; else pnpm run build; fi'
# Install packages needed for deployment
# Install Go
FROM golang:1.19 AS go-base
COPY src/lib/go-html-to-md /app/src/lib/go-html-to-md
# Install Go dependencies and build parser lib
RUN cd /app/src/lib/go-html-to-md && \
go mod tidy && \
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \
chmod +x html-to-markdown.so
FROM base
RUN apt-get update -qq && \
@ -26,9 +33,7 @@ RUN apt-get update -qq && \
rm -rf /var/lib/apt/lists /var/cache/apt/archives
COPY --from=prod-deps /app/node_modules /app/node_modules
COPY --from=build /app /app
COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/dist/src/lib/go-html-to-md/html-to-markdown.so
# Start the server by default, this can be overwritten at runtime
EXPOSE 8080

924
apps/api/openapi-v0.json Normal file
View File

@ -0,0 +1,924 @@
{
"openapi": "3.0.0",
"info": {
"title": "Firecrawl API",
"version": "0.0.0",
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
"contact": {
"name": "Firecrawl Support",
"url": "https://firecrawl.dev/support",
"email": "support@firecrawl.dev"
}
},
"servers": [
{
"url": "https://api.firecrawl.dev/v0"
}
],
"paths": {
"/scrape": {
"post": {
"summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeAndExtractFromUrl",
"tags": ["Scraping"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The URL to scrape"
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
},
"extractorOptions": {
"type": "object",
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
"default": {},
"properties": {
"mode": {
"type": "string",
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
}
},
"required": ["url"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ScrapeResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl": {
"post": {
"summary": "Crawl multiple URLs based on options",
"operationId": "crawlUrls",
"tags": ["Crawling"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The base URL to start crawling from"
},
"crawlerOptions": {
"type": "object",
"properties": {
"includes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to include"
},
"excludes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to exclude"
},
"generateImgAltText": {
"type": "boolean",
"description": "Generate alt text for images using LLMs (must have a paid plan)",
"default": false
},
"returnOnlyUrls": {
"type": "boolean",
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false
},
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may not be as accurate and shouldn't be used on heavily JS-rendered websites.",
"default": "default"
},
"ignoreSitemap": {
"type": "boolean",
"description": "Ignore the website sitemap when crawling",
"default": false
},
"limit": {
"type": "integer",
"description": "Maximum number of pages to crawl",
"default": 10000
},
"allowBackwardCrawling": {
"type": "boolean",
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
"default": false
},
"allowExternalContentLinks": {
"type": "boolean",
"description": "Allows the crawler to follow links to external websites.",
"default": false
}
}
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
}
},
"required": ["url"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CrawlResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/search": {
"post": {
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
"operationId": "searchGoogle",
"tags": ["Search"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"query": {
"type": "string",
"format": "uri",
"description": "The query to search for"
},
"pageOptions": {
"type": "object",
"properties": {
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"fetchPageContent": {
"type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
}
}
},
"searchOptions": {
"type": "object",
"properties": {
"limit": {
"type": "integer",
"description": "Maximum number of results. Max is 20 during beta."
}
}
}
},
"required": ["query"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SearchResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl/status/{jobId}": {
"get": {
"tags": ["Crawl"],
"summary": "Get the status of a crawl job",
"operationId": "getCrawlStatus",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Status of the job (completed, active, failed, paused)"
},
"current": {
"type": "integer",
"description": "Current page number"
},
"total": {
"type": "integer",
"description": "Total number of pages"
},
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
}
}
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
}
}
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
}
},
"components": {
"securitySchemes": {
"bearerAuth": {
"type": "http",
"scheme": "bearer"
}
},
"schemas": {
"ScrapeResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"data": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
},
"llm_extraction": {
"type": "object",
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
"nullable": true
},
"warning": {
"type": "string",
"nullable": true,
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
}
}
}
}
},
"CrawlStatusResponseObj": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
},
"index": {
"type": "integer",
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
}
}
},
"SearchResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"url": {
"type": "string"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
}
}
}
}
}
}
}
},
"CrawlResponse": {
"type": "object",
"properties": {
"jobId": {
"type": "string"
}
}
}
}
},
"security": [
{
"bearerAuth": []
}
]
}

View File

@ -18,8 +18,8 @@
"paths": {
"/scrape": {
"post": {
"summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeAndExtractFromUrl",
"summary": "Scrape a single URL",
"operationId": "scrape",
"tags": ["Scraping"],
"security": [
{
@ -38,94 +38,47 @@
"format": "uri",
"description": "The URL to scrape"
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
"formats": {
"type": "array",
"items": {
"type": "string",
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
},
"description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
"default": ["markdown"]
},
"extractorOptions": {
"headers": {
"type": "object",
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
"default": {},
"properties": {
"mode": {
"type": "string",
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"excludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": true
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
},
"required": ["url"]
@ -741,24 +694,42 @@
"success": {
"type": "boolean"
},
"warning": {
"type": "string",
"nullable": true,
"description": "Warning message to let you know of any issues."
},
"data": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
"type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
"description": "HTML version of the content on page if the `html` format was specified"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
},
"metadata": {
"type": "object",
@ -780,27 +751,16 @@
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"statusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"error": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
},
"llm_extraction": {
"type": "object",
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
"nullable": true
},
"warning": {
"type": "string",
"nullable": true,
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
}
}
}
@ -810,24 +770,33 @@
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
"type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
"description": "HTML version of the content on page if the `html` format was specified"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"index": {
"type": "integer",
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
},
"metadata": {
"type": "object",
@ -849,11 +818,11 @@
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"statusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"error": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
@ -871,34 +840,63 @@
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"url": {
"type": "string"
"markdown": {
"type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if the `html` format was specified"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
}
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"statusCode": {
"type": "integer",
"description": "The status code of the page"
},
"error": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
}
@ -909,8 +907,15 @@
"CrawlResponse": {
"type": "object",
"properties": {
"jobId": {
"success": {
"type": "boolean"
},
"id": {
"type": "string"
},
"url": {
"type": "string",
"format": "uri"
}
}
}

View File

@ -61,6 +61,8 @@
"@sentry/node": "^8.26.0",
"@sentry/profiling-node": "^8.26.0",
"@supabase/supabase-js": "^2.44.2",
"@types/express-ws": "^3.0.4",
"@types/ws": "^8.5.12",
"ajv": "^8.16.0",
"async": "^3.2.5",
"async-mutex": "^0.5.0",
@ -76,6 +78,7 @@
"dotenv": "^16.3.1",
"dotenv-cli": "^7.4.2",
"express-rate-limit": "^7.3.1",
"express-ws": "^5.0.2",
"form-data": "^4.0.0",
"glob": "^10.4.2",
"gpt3-tokenizer": "^1.1.5",
@ -83,6 +86,7 @@
"joplin-turndown-plugin-gfm": "^1.0.12",
"json-schema-to-zod": "^2.3.0",
"keyword-extractor": "^0.0.28",
"koffi": "^2.9.0",
"langchain": "^0.2.8",
"languagedetect": "^2.0.0",
"logsnag": "^1.0.0",
@ -91,7 +95,7 @@
"moment": "^2.29.4",
"mongoose": "^8.4.4",
"natural": "^7.0.7",
"openai": "^4.52.2",
"openai": "^4.57.0",
"pdf-parse": "^1.1.1",
"pos": "^0.4.2",
"posthog-node": "^4.0.1",
@ -110,8 +114,9 @@
"unstructured-client": "^0.11.3",
"uuid": "^10.0.0",
"wordpos": "^2.1.0",
"ws": "^8.18.0",
"xml2js": "^0.6.2",
"zod": "^3.23.4",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.1"
},
"nodemonConfig": {

View File

@ -47,6 +47,12 @@ importers:
'@supabase/supabase-js':
specifier: ^2.44.2
version: 2.44.2
'@types/express-ws':
specifier: ^3.0.4
version: 3.0.4
'@types/ws':
specifier: ^8.5.12
version: 8.5.12
ajv:
specifier: ^8.16.0
version: 8.16.0
@ -92,6 +98,9 @@ importers:
express-rate-limit:
specifier: ^7.3.1
version: 7.3.1(express@4.19.2)
express-ws:
specifier: ^5.0.2
version: 5.0.2(express@4.19.2)
form-data:
specifier: ^4.0.0
version: 4.0.0
@ -113,9 +122,12 @@ importers:
keyword-extractor:
specifier: ^0.0.28
version: 0.0.28
koffi:
specifier: ^2.9.0
version: 2.9.0
langchain:
specifier: ^0.2.8
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
languagedetect:
specifier: ^2.0.0
version: 2.0.0
@ -138,8 +150,8 @@ importers:
specifier: ^7.0.7
version: 7.0.7(socks@2.8.3)
openai:
specifier: ^4.52.2
version: 4.52.2
specifier: ^4.57.0
version: 4.57.0(zod@3.23.8)
pdf-parse:
specifier: ^1.1.1
version: 1.1.1
@ -194,11 +206,14 @@ importers:
wordpos:
specifier: ^2.1.0
version: 2.1.0
ws:
specifier: ^8.18.0
version: 8.18.0
xml2js:
specifier: ^0.6.2
version: 0.6.2
zod:
specifier: ^3.23.4
specifier: ^3.23.8
version: 3.23.8
zod-to-json-schema:
specifier: ^3.23.1
@ -1637,6 +1652,9 @@ packages:
'@types/express-serve-static-core@4.19.3':
resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==}
'@types/express-ws@3.0.4':
resolution: {integrity: sha512-Yjj18CaivG5KndgcvzttWe8mPFinPCHJC2wvyQqVzA7hqeufM8EtWMj6mpp5omg3s8XALUexhOu8aXAyi/DyJQ==}
'@types/express@4.17.21':
resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==}
@ -1739,8 +1757,8 @@ packages:
'@types/whatwg-url@11.0.5':
resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==}
'@types/ws@8.5.10':
resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==}
'@types/ws@8.5.12':
resolution: {integrity: sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==}
'@types/yargs-parser@21.0.3':
resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==}
@ -2506,6 +2524,12 @@ packages:
peerDependencies:
express: 4 || 5 || ^5.0.0-beta.1
express-ws@5.0.2:
resolution: {integrity: sha512-0uvmuk61O9HXgLhGl3QhNSEtRsQevtmbL94/eILaliEADZBHZOQUAiHFrGPrgsjikohyrmSG5g+sCfASTt0lkQ==}
engines: {node: '>=4.5.0'}
peerDependencies:
express: ^4.0.0 || ^5.0.0-alpha.1
express@4.19.2:
resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==}
engines: {node: '>= 0.10.0'}
@ -3149,6 +3173,9 @@ packages:
resolution: {integrity: sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==}
engines: {node: '>=6'}
koffi@2.9.0:
resolution: {integrity: sha512-KCsuJ2gM58n6bNdR2Z7gqsh/3TchxxQFbVgax2/UvAjRTgwNSYAJDx9E3jrkBP4jEDHWRCfE47Y2OG+/fiSvEw==}
langchain@0.2.8:
resolution: {integrity: sha512-kb2IOMA71xH8e6EXFg0l4S+QSMC/c796pj1+7mPBkR91HHwoyHZhFRrBaZv4tV+Td+Ba91J2uEDBmySklZLpNQ==}
engines: {node: '>=18'}
@ -3712,9 +3739,14 @@ packages:
openai@3.3.0:
resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==}
openai@4.52.2:
resolution: {integrity: sha512-mMc0XgFuVSkcm0lRIi8zaw++otC82ZlfkCur1qguXYWPETr/+ZwL9A/vvp3YahX+shpaT6j03dwsmUyLAfmEfg==}
openai@4.57.0:
resolution: {integrity: sha512-JnwBSIYqiZ3jYjB5f2in8hQ0PRA092c6m+/6dYB0MzK0BEbn+0dioxZsPLBm5idJbg9xzLNOiGVm2OSuhZ+BdQ==}
hasBin: true
peerDependencies:
zod: ^3.23.8
peerDependenciesMeta:
zod:
optional: true
openapi-types@12.1.3:
resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==}
@ -4647,8 +4679,20 @@ packages:
resolution: {integrity: sha512-+QU2zd6OTD8XWIJCbffaiQeH9U73qIqafo1x6V1snCWYGJf6cVE0cDR4D8xRzcEnfI21IFrUPzPGtcPf8AC+Rw==}
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
ws@8.17.1:
resolution: {integrity: sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==}
ws@7.5.10:
resolution: {integrity: sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==}
engines: {node: '>=8.3.0'}
peerDependencies:
bufferutil: ^4.0.1
utf-8-validate: ^5.0.2
peerDependenciesMeta:
bufferutil:
optional: true
utf-8-validate:
optional: true
ws@8.18.0:
resolution: {integrity: sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==}
engines: {node: '>=10.0.0'}
peerDependencies:
bufferutil: ^4.0.1
@ -5286,13 +5330,13 @@ snapshots:
'@js-sdsl/ordered-map@4.4.2': {}
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)':
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
dependencies:
ansi-styles: 5.2.0
camelcase: 6.3.0
decamelize: 1.2.0
js-tiktoken: 1.0.12
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
ml-distance: 4.0.1
mustache: 4.2.0
p-queue: 6.6.2
@ -5304,20 +5348,20 @@ snapshots:
- langchain
- openai
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))':
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
js-tiktoken: 1.0.12
openai: 4.52.2
openai: 4.57.0(zod@3.23.8)
zod: 3.23.8
zod-to-json-schema: 3.23.1(zod@3.23.8)
transitivePeerDependencies:
- encoding
- langchain
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)':
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
js-tiktoken: 1.0.12
transitivePeerDependencies:
- langchain
@ -6545,8 +6589,8 @@ snapshots:
dependencies:
'@supabase/node-fetch': 2.6.15
'@types/phoenix': 1.6.5
'@types/ws': 8.5.10
ws: 8.17.1
'@types/ws': 8.5.12
ws: 8.18.0
transitivePeerDependencies:
- bufferutil
- utf-8-validate
@ -6643,6 +6687,12 @@ snapshots:
'@types/range-parser': 1.2.7
'@types/send': 0.17.4
'@types/express-ws@3.0.4':
dependencies:
'@types/express': 4.17.21
'@types/express-serve-static-core': 4.19.3
'@types/ws': 8.5.12
'@types/express@4.17.21':
dependencies:
'@types/body-parser': 1.19.5
@ -6766,7 +6816,7 @@ snapshots:
dependencies:
'@types/webidl-conversions': 7.0.3
'@types/ws@8.5.10':
'@types/ws@8.5.12':
dependencies:
'@types/node': 20.14.1
@ -7521,6 +7571,14 @@ snapshots:
dependencies:
express: 4.19.2
express-ws@5.0.2(express@4.19.2):
dependencies:
express: 4.19.2
ws: 7.5.10
transitivePeerDependencies:
- bufferutil
- utf-8-validate
express@4.19.2:
dependencies:
accepts: 1.3.8
@ -8440,17 +8498,19 @@ snapshots:
kleur@3.0.3: {}
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1):
koffi@2.9.0: {}
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
binary-extensions: 2.3.0
js-tiktoken: 1.0.12
js-yaml: 4.1.0
jsonpointer: 5.0.1
langchainhub: 0.0.11
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
ml-distance: 4.0.1
openapi-types: 12.1.3
p-retry: 4.6.2
@ -8470,14 +8530,14 @@ snapshots:
pdf-parse: 1.1.1
puppeteer: 22.12.1(typescript@5.4.5)
redis: 4.6.14
ws: 8.17.1
ws: 8.18.0
transitivePeerDependencies:
- encoding
- openai
langchainhub@0.0.11: {}
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2):
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)):
dependencies:
'@types/uuid': 9.0.8
commander: 10.0.1
@ -8486,9 +8546,9 @@ snapshots:
p-retry: 4.6.2
uuid: 9.0.1
optionalDependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)
openai: 4.52.2
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
openai: 4.57.0(zod@3.23.8)
languagedetect@2.0.0: {}
@ -8881,16 +8941,19 @@ snapshots:
transitivePeerDependencies:
- debug
openai@4.52.2:
openai@4.57.0(zod@3.23.8):
dependencies:
'@types/node': 18.19.39
'@types/node-fetch': 2.6.11
'@types/qs': 6.9.15
abort-controller: 3.0.0
agentkeepalive: 4.5.0
form-data-encoder: 1.7.2
formdata-node: 4.4.1
node-fetch: 2.7.0
web-streams-polyfill: 3.3.3
qs: 6.12.2
optionalDependencies:
zod: 3.23.8
transitivePeerDependencies:
- encoding
@ -9195,7 +9258,7 @@ snapshots:
chromium-bidi: 0.5.24(devtools-protocol@0.0.1299070)
debug: 4.3.5
devtools-protocol: 0.0.1299070
ws: 8.17.1
ws: 8.18.0
transitivePeerDependencies:
- bufferutil
- supports-color
@ -9877,7 +9940,9 @@ snapshots:
imurmurhash: 0.1.4
signal-exit: 4.1.0
ws@8.17.1: {}
ws@7.5.10: {}
ws@8.18.0: {}
xml2js@0.6.2:
dependencies:

View File

@ -1,12 +1,16 @@
### Scrape Website
POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer fc
Authorization: Bearer fc-
content-type: application/json
{
"url":"firecrawl.dev"
"url":"corterix.com"
}
### Check Job Status
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
Authorization: Bearer fc-
### List Active Jobs
GET http://localhost:3002/v0/jobs/active HTTP/1.1

View File

@ -404,7 +404,7 @@ describe("E2E Tests for API Routes", () => {
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.set("x-idempotency-key", uniqueIdempotencyKey)
.send({ url: 'https://mendable.ai' });
.send({ url: 'https://docs.firecrawl.dev' });
expect(firstResponse.statusCode).toBe(200);
@ -414,7 +414,7 @@ describe("E2E Tests for API Routes", () => {
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.set("x-idempotency-key", uniqueIdempotencyKey)
.send({ url: 'https://mendable.ai' });
.send({ url: 'https://docs.firecrawl.dev' });
expect(secondResponse.statusCode).toBe(409);
expect(secondResponse.body.error).toBe('Idempotency key already used');

View File

@ -0,0 +1,961 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import {
ScrapeRequest,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for v1 API Routes", () => {
// Mark DB authentication as enabled in this test process before any test runs.
// NOTE(review): the API under test is reached over HTTP at TEST_URL, so this presumably
// does not change the server's own environment — confirm.
beforeAll(() => {
  process.env["USE_DB_AUTHENTICATION"] = "true";
});

// Remove the flag again once the whole suite has finished.
afterAll(() => {
  delete process.env["USE_DB_AUTHENTICATION"];
});
// Smoke test for /is-production: called without an Authorization header, it must
// answer 200 and expose an `isProduction` property in the body.
describe("GET /is-production", () => {
  it.concurrent("should return the production status", async () => {
    const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
      "/is-production"
    );

    // Leftover debug logging of USE_DB_AUTHENTICATION was removed: it fed no assertion.
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("isProduction");
  });
});
describe("POST /v1/scrape", () => {
// A scrape request sent without an Authorization header must be rejected with 401.
it.concurrent("should require authorization", async () => {
  const unauthenticatedRequest = request(TEST_URL)
    .post("/v1/scrape")
    .send({ url: "https://firecrawl.dev" });
  const response: ScrapeResponseRequestTest = await unauthenticatedRequest;

  expect(response.statusCode).toBe(401);
});
// A blocklisted (social media) URL must be refused with 403 and the policy error message.
it.concurrent("should throw error for blocklisted URL", async () => {
  const blockedUrlRequest: ScrapeRequest = {
    url: "https://facebook.com/fake-test",
  };
  const expectedError =
    "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.";

  const { statusCode, body } = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(blockedUrlRequest);

  expect(statusCode).toBe(403);
  expect(body.error).toBe(expectedError);
});
// A bearer token that is present but not a valid key must also be rejected with 401.
it.concurrent(
  "should return an error response with an invalid API key",
  async () => {
    const bogusAuthHeader = `Bearer invalid-api-key`;

    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", bogusAuthHeader)
      .set("Content-Type", "application/json")
      .send({ url: "https://firecrawl.dev" });

    expect(response.statusCode).toBe(401);
  }
);
// Happy path with only a URL in the request: the response carries markdown and metadata
// (no `content`, no `html`), and the metadata holds the expected title, description,
// keywords, robots, and Open Graph fields for the scraped page.
it.concurrent(
  "should return a successful response with a valid API key",
  async () => {
    const siteTitle = "Roast My Website";
    // The same text is expected for both `description` and `ogDescription`.
    const siteDescription =
      "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️";
    const scrapeRequest: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
    };

    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    // Narrow the body to the variant that has `data` before reading from it.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const data = response.body.data;

    // Shape of the payload.
    expect(data).not.toHaveProperty("content");
    expect(data).toHaveProperty("markdown");
    expect(data).toHaveProperty("metadata");
    expect(data).not.toHaveProperty("html");
    expect(data.markdown).toContain("_Roast_");

    // Page-level metadata.
    const metadata = data.metadata;
    expect(metadata.error).toBeUndefined();
    expect(metadata.title).toBe(siteTitle);
    expect(metadata.description).toBe(siteDescription);
    expect(metadata.keywords).toBe(
      "Roast My Website,Roast,Website,GitHub,Firecrawl"
    );
    expect(metadata.robots).toBe("follow, index");

    // Open Graph metadata.
    expect(metadata.ogTitle).toBe(siteTitle);
    expect(metadata.ogDescription).toBe(siteDescription);
    expect(metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
    expect(metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
    expect(metadata.ogLocaleAlternate).toStrictEqual([]);
    expect(metadata.ogSiteName).toBe(siteTitle);

    // Request bookkeeping.
    expect(metadata.sourceURL).toBe("https://roastmywebsite.ai");
    expect(metadata.statusCode).toBe(200);
  },
  30000
); // 30 seconds timeout
// Requesting formats ["markdown", "html"] must return both representations plus metadata.
// The title names the `formats` option that the request actually sets (it previously
// mentioned `includeHtml`, which this request does not use).
it.concurrent(
  "should return a successful response with a valid API key and formats set to markdown and html",
  async () => {
    const scrapeRequest: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
      formats: ["markdown", "html"],
    };

    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    // Narrow the body to the variant that has `data` before reading from it.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const data = response.body.data;

    expect(data).toHaveProperty("markdown");
    expect(data).toHaveProperty("html");
    expect(data).toHaveProperty("metadata");
    expect(data.markdown).toContain("_Roast_");
    expect(data.html).toContain("<h1");
    expect(data.metadata.statusCode).toBe(200);
    expect(data.metadata.error).toBeUndefined();
  },
  30000
);
// PDF scraping: the same arXiv paper ID is requested once with and once without the
// ".pdf" suffix. Both cases share one body so their assertions cannot drift apart
// (the ".pdf" case previously lacked the `markdown` property check).
const pdfScrapeCases = [
  {
    title: 'should return a successful response for a valid scrape with PDF file',
    url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
  },
  {
    title: 'should return a successful response for a valid scrape with PDF file without explicit .pdf extension',
    url: "https://arxiv.org/pdf/astro-ph/9301001",
  },
];

for (const { title, url } of pdfScrapeCases) {
  it.concurrent(title, async () => {
    const scrapeRequest: ScrapeRequest = { url };

    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post('/v1/scrape')
      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
      .set('Content-Type', 'application/json')
      .send(scrapeRequest);
    // NOTE(review): the response is already awaited above, so this pause does not affect
    // the assertions; kept to preserve the suite's existing pacing — confirm it is needed.
    await new Promise((r) => setTimeout(r, 6000));

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty('data');
    // Narrow the body to the variant that has `data` before reading from it.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const data = response.body.data;

    expect(data).toHaveProperty('markdown');
    expect(data).toHaveProperty('metadata');
    expect(data.markdown).toContain('Broad Line Radio Galaxy');
    expect(data.metadata.statusCode).toBe(200);
    expect(data.metadata.error).toBeUndefined();
  }, 60000);
}
// excludeTags: scrape the same page twice with onlyMainContent disabled — first as a
// baseline, then with `.nav`, `#footer`, and `strong` excluded — and check that the nav
// link and footer text present in the baseline are gone from the second result.
// The title names `excludeTags`, the option this request actually sets.
it.concurrent("should return a successful response with a valid API key with excludeTags option", async () => {
  const baselineRequest: ScrapeRequest = {
    url: "https://www.scrapethissite.com/",
    onlyMainContent: false // default is true
  };

  const baselineResponse: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(baselineRequest);

  expect(baselineResponse.statusCode).toBe(200);
  expect(baselineResponse.body).toHaveProperty("data");
  // Narrow the body to the variant that has `data` before reading from it.
  if (!("data" in baselineResponse.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  expect(baselineResponse.body.data).toHaveProperty("markdown");
  expect(baselineResponse.body.data).toHaveProperty("metadata");
  expect(baselineResponse.body.data).not.toHaveProperty("html");
  expect(baselineResponse.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
  expect(baselineResponse.body.data.markdown).toContain("Hartley Brody 2023"); // #footer

  const scrapeRequestWithExcludeTags: ScrapeRequest = {
    url: "https://www.scrapethissite.com/",
    excludeTags: ['.nav', '#footer', 'strong'],
    onlyMainContent: false // default is true
  };

  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(scrapeRequestWithExcludeTags);

  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("data");
  if (!("data" in response.body)) {
    throw new Error("Expected response body to have 'data' property");
  }
  expect(response.body.data).toHaveProperty("markdown");
  expect(response.body.data).toHaveProperty("metadata");
  expect(response.body.data).not.toHaveProperty("html");
  expect(response.body.data.markdown).not.toContain("Hartley Brody 2023"); // #footer
  expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // .nav
}, 30000);
// Pages that themselves answer with an HTTP error status: the scrape endpoint is still
// expected to answer 200 and report the page's status in `metadata.statusCode`.
// One test is registered per status code; the generated titles match the previous
// hand-written ones ("... for a scrape with 400 page", etc.).
for (const upstreamStatus of [400, 401, 403, 404, 405, 500]) {
  it.concurrent(`should return a successful response for a scrape with ${upstreamStatus} page`, async () => {
    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post('/v1/scrape')
      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
      .set('Content-Type', 'application/json')
      .send({ url: `https://httpstat.us/${upstreamStatus}` });
    // NOTE(review): the response is already awaited above, so this pause does not affect
    // the assertions; kept to preserve the suite's existing pacing — confirm it is needed.
    await new Promise((r) => setTimeout(r, 5000));

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty('data');
    // Narrow the body to the variant that has `data` before reading from it.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data).toHaveProperty('markdown');
    expect(response.body.data).toHaveProperty('metadata');
    expect(response.body.data.metadata.statusCode).toBe(upstreamStatus);
  }, 60000);
}
// A request carrying `timeout: 1000` is expected to be answered with 408.
// The Jest timeout for this test is 3000 ms.
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
  const shortTimeoutPayload = { url: "https://firecrawl.dev", timeout: 1000 };

  const response: ScrapeResponseRequestTest = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(shortTimeoutPayload);

  expect(response.statusCode).toBe(408);
}, 3000);
// Requesting formats ["html", "rawHtml"] must return exactly those representations
// (and metadata) and must NOT include markdown.
// The title describes the formats under test and is unique within the suite (it used to
// repeat the title of the markdown + html test).
it.concurrent(
  "should return a successful response with a valid API key and formats set to html and rawHtml",
  async () => {
    const scrapeRequest: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
      formats: ["html","rawHtml"],
    };

    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    // Narrow the body to the variant that has `data` before reading from it.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const data = response.body.data;

    expect(data).not.toHaveProperty("markdown");
    expect(data).toHaveProperty("html");
    expect(data).toHaveProperty("rawHtml");
    expect(data).toHaveProperty("metadata");
    expect(data.html).toContain("<h1");
    expect(data.rawHtml).toContain("<html");
    expect(data.metadata.statusCode).toBe(200);
    expect(data.metadata.error).toBeUndefined();
  },
  30000
);
// Scrapes with `waitFor: 8000` and only the "markdown" format, then checks that
// the markdown contains "PagerDuty" and that no other format leaked into the
// response.
it.concurrent(
  "should return a successful response with waitFor",
  async () => {
    const body: ScrapeRequest = {
      url: "https://ycombinator.com/companies",
      formats: ["markdown"],
      waitFor: 8000,
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(body);

    expect(res.statusCode).toBe(200);
    expect(res.body).toHaveProperty("data");
    if (!("data" in res.body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const scraped = res.body.data;

    // Only markdown (plus metadata) was requested.
    expect(scraped).toHaveProperty("markdown");
    expect(scraped).not.toHaveProperty("html");
    expect(scraped).not.toHaveProperty("links");
    expect(scraped).not.toHaveProperty("rawHtml");
    expect(scraped).toHaveProperty("metadata");

    expect(scraped.markdown).toContain("PagerDuty");
    expect(scraped.metadata.statusCode).toBe(200);
    expect(scraped.metadata.error).toBeUndefined();
  },
  30000
);
// Requesting only the "links" format must return the links found on the page
// (including https://firecrawl.dev) and neither HTML representation.
it.concurrent(
  "should return a successful response with a valid links on page",
  async () => {
    const body: ScrapeRequest = {
      url: "https://roastmywebsite.ai",
      formats: ["links"],
    };

    const res: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(body);

    expect(res.statusCode).toBe(200);
    expect(res.body).toHaveProperty("data");
    if (!("data" in res.body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const scraped = res.body.data;

    expect(scraped).not.toHaveProperty("html");
    expect(scraped).not.toHaveProperty("rawHtml");
    expect(scraped).toHaveProperty("links");
    expect(scraped).toHaveProperty("metadata");

    expect(scraped.links).toContain("https://firecrawl.dev");
    expect(scraped.metadata.statusCode).toBe(200);
    expect(scraped.metadata.error).toBeUndefined();
  },
  30000
);
});
/**
 * End-to-end tests for POST /v1/map: authentication, plain link discovery, the
 * `search` option, the `includeSubdomains` option and URL validation.
 */
describe("POST /v1/map", () => {
  /** Posts `payload` to /v1/map as JSON, authenticated with TEST_API_KEY. */
  async function postMap(payload: object): Promise<ScrapeResponseRequestTest> {
    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/map")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);
    return response;
  }

  /**
   * Asserts the shared success envelope (HTTP 200, `success: true`, a
   * non-empty `links` array) and returns the links for further checks.
   */
  function expectNonEmptyLinks(response: ScrapeResponseRequestTest): unknown[] {
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("success", true);
    expect(response.body).toHaveProperty("links");
    if (!("links" in response.body)) {
      throw new Error("Expected response body to have 'links' property");
    }
    const links = response.body.links as unknown[];
    expect(Array.isArray(links)).toBe(true);
    expect(links.length).toBeGreaterThan(0);
    return links;
  }

  /**
   * True when `link` is a string pointing at the docs subdomain. The links come
   * back untyped, so the value is checked at runtime instead of being
   * annotated as `string` in the callback.
   */
  const isDocsLink = (link: unknown): boolean =>
    typeof link === "string" && link.includes("docs.firecrawl.dev");

  it.concurrent("should require authorization", async () => {
    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/map")
      .send({ url: "https://firecrawl.dev" });
    expect(response.statusCode).toBe(401);
  });

  it.concurrent("should return an error response with an invalid API key", async () => {
    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/map")
      .set("Authorization", `Bearer invalid-api-key`)
      .set("Content-Type", "application/json")
      .send({ url: "https://firecrawl.dev" });
    expect(response.statusCode).toBe(401);
  });

  it.concurrent("should return a successful response with a valid API key", async () => {
    const response = await postMap({ url: "https://roastmywebsite.ai" });
    expectNonEmptyLinks(response);
  });

  it.concurrent("should return a successful response with a valid API key and search", async () => {
    const response = await postMap({
      url: "https://usemotion.com",
      search: "pricing",
    });
    const links = expectNonEmptyLinks(response);
    // The first link returned for the search term should be the pricing page.
    expect(links[0]).toContain("usemotion.com/pricing");
  });

  // NOTE: the titles below say "allowSubdomains"; the request option actually
  // sent is `includeSubdomains`.
  it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
    const response = await postMap({
      url: "https://firecrawl.dev",
      search: "docs",
      includeSubdomains: true,
    });
    const links = expectNonEmptyLinks(response);
    expect(links.some(isDocsLink)).toBe(true);
  });

  it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
    const response = await postMap({
      url: "https://www.firecrawl.dev",
      search: "docs",
      includeSubdomains: true,
    });
    const links = expectNonEmptyLinks(response);
    expect(links.some(isDocsLink)).toBe(true);
  }, 10000);

  it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
    const response = await postMap({
      url: "https://www.firecrawl.dev",
      search: "docs",
      includeSubdomains: false,
    });
    const links = expectNonEmptyLinks(response);
    // Only the first link is checked here.
    expect(links[0]).not.toContain("docs.firecrawl.dev");
  });

  it.concurrent("should return an error for invalid URL", async () => {
    const response = await postMap({
      url: "invalid-url",
      includeSubdomains: true,
      search: "test",
    });
    expect(response.statusCode).toBe(400);
    expect(response.body).toHaveProperty("success", false);
    expect(response.body).toHaveProperty("error");
  });
});
/**
 * End-to-end tests for POST /v1/crawl: authentication, the URL blocklist, the
 * job-creation response, and the `includePaths`, `excludePaths` and `maxDepth`
 * options (verified by polling GET /v1/crawl/:id until the job completes).
 */
describe("POST /v1/crawl", () => {
  /** Resolves after `ms` milliseconds. */
  const sleep = (ms: number) =>
    new Promise<void>((resolve) => setTimeout(resolve, ms));

  /** Starts a crawl with `payload` as JSON body, authenticated with TEST_API_KEY. */
  function startCrawl(payload: object) {
    return request(TEST_URL)
      .post("/v1/crawl")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);
  }

  /** Fetches the current status document of crawl `id`. */
  function getCrawlStatus(id: string) {
    return request(TEST_URL)
      .get(`/v1/crawl/${id}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  }

  /**
   * Polls the crawl status once per second until it reports "completed".
   *
   * Unlike an open-ended `while (!completed)` loop, this stops on its own:
   * it throws as soon as the job reports "failed" or "cancelled" (it would
   * otherwise never complete) and when `maxWaitMs` has elapsed, so a stuck
   * crawl yields a descriptive error and no request loop keeps running after
   * Jest has already timed the test out.
   */
  async function waitForCrawlCompletion(id: string, maxWaitMs: number): Promise<void> {
    const deadline = Date.now() + maxWaitMs;
    for (;;) {
      const statusResponse = await getCrawlStatus(id);
      expect(statusResponse.statusCode).toBe(200);
      expect(statusResponse.body).toHaveProperty("status");

      const status = statusResponse.body.status;
      if (status === "completed") {
        return;
      }
      if (status === "failed" || status === "cancelled") {
        throw new Error(`Crawl ${id} ended with status "${status}" instead of "completed"`);
      }
      if (Date.now() >= deadline) {
        throw new Error(`Crawl ${id} did not complete within ${maxWaitMs}ms (last status: "${status}")`);
      }
      await sleep(1000); // Wait for 1 second before checking again
    }
  }

  it.concurrent("should require authorization", async () => {
    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/crawl")
      .send({ url: "https://firecrawl.dev" });
    expect(response.statusCode).toBe(401);
  });

  it.concurrent("should throw error for blocklisted URL", async () => {
    const scrapeRequest: ScrapeRequest = {
      url: "https://facebook.com/fake-test",
    };
    const response = await startCrawl(scrapeRequest);
    expect(response.statusCode).toBe(403);
    expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
  });

  it.concurrent(
    "should return an error response with an invalid API key",
    async () => {
      const response: ScrapeResponseRequestTest = await request(TEST_URL)
        .post("/v1/crawl")
        .set("Authorization", `Bearer invalid-api-key`)
        .set("Content-Type", "application/json")
        .send({ url: "https://firecrawl.dev" });
      expect(response.statusCode).toBe(401);
    }
  );

  it.concurrent("should return a successful response", async () => {
    const response = await startCrawl({ url: "https://firecrawl.dev" });
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("id");
    // The job id must be an RFC 4122 UUID (versions 1-5).
    expect(response.body.id).toMatch(
      /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
    );
    expect(response.body).toHaveProperty("success", true);
    expect(response.body).toHaveProperty("url");
    expect(response.body.url).toContain("/v1/crawl/");
  });

  it.concurrent(
    "should return a successful response with a valid API key and valid includes option",
    async () => {
      const crawlResponse = await startCrawl({
        url: "https://firecrawl.dev",
        limit: 10,
        includePaths: ["blog/*"],
      });

      await waitForCrawlCompletion(crawlResponse.body.id, 170000);
      await sleep(1000); // wait for data to be saved on the database

      const completedResponse = await getCrawlStatus(crawlResponse.body.id);

      // Check the envelope first so a bad response fails with a clear message
      // instead of a TypeError while reading `body.data`.
      expect(completedResponse.statusCode).toBe(200);
      expect(completedResponse.body).toHaveProperty("status");
      expect(completedResponse.body.status).toBe("completed");
      expect(completedResponse.body).toHaveProperty("data");

      const urls = completedResponse.body.data.map(
        (item: any) => item.metadata?.sourceURL
      );
      expect(urls.length).toBeGreaterThan(5);
      urls.forEach((url: string) => {
        expect(url).toContain("firecrawl.dev/blog");
      });

      const firstDoc = completedResponse.body.data[0];
      expect(firstDoc).toHaveProperty("markdown");
      expect(firstDoc).toHaveProperty("metadata");
      expect(firstDoc).not.toHaveProperty("content"); // v0
      expect(firstDoc.metadata.statusCode).toBe(200);
      expect(firstDoc.metadata.error).toBeUndefined();
    },
    180000
  ); // 180 seconds

  it.concurrent(
    "should return a successful response with a valid API key and valid excludes option",
    async () => {
      const crawlResponse = await startCrawl({
        url: "https://firecrawl.dev",
        limit: 10,
        excludePaths: ["blog/*"],
      });

      await waitForCrawlCompletion(crawlResponse.body.id, 80000);
      await sleep(1000); // wait for data to be saved on the database

      const completedResponse = await getCrawlStatus(crawlResponse.body.id);
      const urls = completedResponse.body.data.map(
        (item: any) => item.metadata?.sourceURL
      );
      expect(urls.length).toBeGreaterThan(3);
      urls.forEach((url: string) => {
        // Reject blog pages on both the bare and the www host: the crawl
        // starts at https://firecrawl.dev, so checking only the www prefix
        // could pass without verifying anything.
        expect(url).not.toMatch(/^https?:\/\/(www\.)?firecrawl\.dev\/blog\//);
      });
    },
    90000
  ); // 90 seconds

  it.concurrent(
    "should return a successful response with max depth option for a valid crawl job",
    async () => {
      const crawlResponse = await startCrawl({
        url: "https://www.scrapethissite.com",
        maxDepth: 1,
      });
      expect(crawlResponse.statusCode).toBe(200);

      const response = await getCrawlStatus(crawlResponse.body.id);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("status");
      expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status);

      // Poll until the crawl completes (bounded below the Jest timeout).
      await waitForCrawlCompletion(crawlResponse.body.id, 170000);

      const completedResponse = await getCrawlStatus(crawlResponse.body.id);
      expect(completedResponse.statusCode).toBe(200);
      expect(completedResponse.body).toHaveProperty("status");
      expect(completedResponse.body.status).toBe("completed");
      expect(completedResponse.body).toHaveProperty("data");

      const firstDoc = completedResponse.body.data[0];
      expect(firstDoc).not.toHaveProperty("content");
      expect(firstDoc).toHaveProperty("markdown");
      expect(firstDoc).toHaveProperty("metadata");
      expect(firstDoc.metadata.statusCode).toBe(200);
      expect(firstDoc.metadata.error).toBeUndefined();

      const urls = completedResponse.body.data.map(
        (item: any) => item.metadata?.sourceURL
      );
      expect(urls.length).toBeGreaterThanOrEqual(1);

      // Check that no URL is deeper than maxDepth 1. The metric below is
      // "number of path segments + 1" (the root "/" yields 1, "/a" and "/a/"
      // yield 2), hence the upper bound of 2.
      urls.forEach((url: string) => {
        const pathSplits = new URL(url).pathname.split("/");
        const depth =
          pathSplits.length -
          (pathSplits[0].length === 0 &&
          pathSplits[pathSplits.length - 1].length === 0
            ? 1
            : 0);
        expect(depth).toBeLessThanOrEqual(2);
      });
    },
    180000
  );
});
/**
 * End-to-end tests for GET /v1/crawl/:jobId (crawl status) and for cancelling
 * a running crawl through DELETE /v1/crawl/:jobId.
 */
describe("GET /v1/crawl/:jobId", () => {
  /** Resolves after `ms` milliseconds. */
  const sleep = (ms: number) =>
    new Promise<void>((resolve) => setTimeout(resolve, ms));

  /** Fetches the current status document of crawl `id` with TEST_API_KEY. */
  function getCrawlStatus(id: string) {
    return request(TEST_URL)
      .get(`/v1/crawl/${id}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  }

  /**
   * Polls the crawl status once per second until it reports "completed".
   *
   * The wait is bounded: it throws when the job reports "failed" or
   * "cancelled" (it would otherwise never complete) and when `maxWaitMs` has
   * elapsed, so a stuck crawl produces a descriptive error and the polling
   * does not continue after Jest has timed the test out.
   */
  async function waitForCrawlCompletion(id: string, maxWaitMs: number): Promise<void> {
    const deadline = Date.now() + maxWaitMs;
    for (;;) {
      const statusResponse = await getCrawlStatus(id);
      expect(statusResponse.statusCode).toBe(200);
      expect(statusResponse.body).toHaveProperty("status");

      const status = statusResponse.body.status;
      if (status === "completed") {
        return;
      }
      if (status === "failed" || status === "cancelled") {
        throw new Error(`Crawl ${id} ended with status "${status}" instead of "completed"`);
      }
      if (Date.now() >= deadline) {
        throw new Error(`Crawl ${id} did not complete within ${maxWaitMs}ms (last status: "${status}")`);
      }
      await sleep(1000); // Wait for 1 second before checking again
    }
  }

  it.concurrent("should require authorization", async () => {
    const response = await request(TEST_URL).get("/v1/crawl/123");
    expect(response.statusCode).toBe(401);
  });

  it.concurrent(
    "should return an error response with an invalid API key",
    async () => {
      const response = await request(TEST_URL)
        .get("/v1/crawl/123")
        .set("Authorization", `Bearer invalid-api-key`);
      expect(response.statusCode).toBe(401);
    }
  );

  it.concurrent(
    "should return Job not found for invalid job ID",
    async () => {
      const response = await getCrawlStatus("invalidJobId");
      expect(response.statusCode).toBe(404);
    }
  );

  it.concurrent(
    "should return a successful crawl status response for a valid crawl job",
    async () => {
      const crawlResponse = await request(TEST_URL)
        .post("/v1/crawl")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({ url: "https://docs.firecrawl.dev" });
      expect(crawlResponse.statusCode).toBe(200);

      await waitForCrawlCompletion(crawlResponse.body.id, 170000);
      await sleep(1000); // wait for data to be saved on the database

      const completedResponse = await getCrawlStatus(crawlResponse.body.id);
      expect(completedResponse.body).toHaveProperty("status");
      expect(completedResponse.body.status).toBe("completed");
      expect(completedResponse.body).toHaveProperty("data");

      const firstDoc = completedResponse.body.data[0];
      expect(firstDoc).not.toHaveProperty("content");
      expect(firstDoc).toHaveProperty("markdown");
      expect(firstDoc).toHaveProperty("metadata");
      expect(firstDoc.metadata.statusCode).toBe(200);
      expect(firstDoc.metadata.error).toBeUndefined();

      // Every returned document must carry a source URL in its metadata.
      const childrenLinks = completedResponse.body.data.filter(
        (doc) =>
          doc.metadata &&
          doc.metadata.sourceURL
      );
      expect(childrenLinks.length).toBe(completedResponse.body.data.length);
    },
    180000
  ); // 180 seconds

  it.concurrent(
    "If someone cancels a crawl job, it should turn into cancelled status",
    async () => {
      const crawlResponse = await request(TEST_URL)
        .post("/v1/crawl")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({ url: "https://docs.tatum.io", limit: 200 });
      expect(crawlResponse.statusCode).toBe(200);

      // Give the crawl 10 seconds to run before cancelling it; the assertions
      // below expect at least one document to be present.
      await sleep(10000);

      const responseCancel = await request(TEST_URL)
        .delete(`/v1/crawl/${crawlResponse.body.id}`)
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
      expect(responseCancel.statusCode).toBe(200);
      expect(responseCancel.body).toHaveProperty("status");
      expect(responseCancel.body.status).toBe("cancelled");

      // Wait another 10 seconds before re-reading the status.
      await sleep(10000);

      const completedResponse = await getCrawlStatus(crawlResponse.body.id);
      expect(completedResponse.statusCode).toBe(200);
      expect(completedResponse.body).toHaveProperty("status");
      expect(completedResponse.body.status).toBe("cancelled");
      expect(completedResponse.body).toHaveProperty("data");

      const firstDoc = completedResponse.body.data[0];
      expect(firstDoc).toHaveProperty("markdown");
      expect(firstDoc).toHaveProperty("metadata");
      expect(firstDoc.metadata.statusCode).toBe(200);
      expect(firstDoc.metadata.error).toBeUndefined();
    },
    60000
  ); // 60 seconds
});
});

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
import { crawlController } from '../crawl'
import { crawlController } from '../v0/crawl'
import { Request, Response } from 'express';
import { authenticateUser } from '../auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create';

View File

@ -1,22 +1,36 @@
import { parseApi } from "../../src/lib/parseApi";
import { getRateLimiter } from "../../src/services/rate-limiter";
import { parseApi } from "../lib/parseApi";
import { getRateLimiter } from "../services/rate-limiter";
import {
AuthResponse,
NotificationType,
PlanType,
RateLimiterMode,
} from "../../src/types";
import { supabase_service } from "../../src/services/supabase";
import { withAuth } from "../../src/lib/withAuth";
} from "../types";
import { supabase_service } from "../services/supabase";
import { withAuth } from "../lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../lib/logger";
import { redlock } from "../../src/services/redlock";
import { getValue } from "../../src/services/redis";
import { setValue } from "../../src/services/redis";
import { redlock } from "../services/redlock";
import { getValue } from "../services/redis";
import { setValue } from "../services/redis";
import { validate } from "uuid";
import * as Sentry from "@sentry/node";
// const { data, error } = await supabase_service
// .from('api_keys')
// .select(`
// key,
// team_id,
// teams (
// subscriptions (
// price_id
// )
// )
// `)
// .eq('key', normalizedApi)
// .limit(1)
// .single();
function normalizedApiIsUuid(potentialUuid: string): boolean {
// Check if the string is a valid UUID
return validate(potentialUuid);
@ -88,9 +102,10 @@ export async function supaAuthenticateUser(
team_id?: string;
error?: string;
status?: number;
plan?: string;
plan?: PlanType;
}> {
const authHeader = req.headers.authorization;
const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] ? `Bearer ${req.headers["sec-websocket-protocol"]}` : null);
if (!authHeader) {
return { success: false, error: "Unauthorized", status: 401 };
}
@ -118,7 +133,11 @@ export async function supaAuthenticateUser(
let priceId: string | null = null;
if (token == "this_is_just_a_preview_token") {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
if (mode == RateLimiterMode.CrawlStatus) {
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
} else {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
}
teamId = "preview";
} else {
normalizedApi = parseApi(token);
@ -154,7 +173,7 @@ export async function supaAuthenticateUser(
await setValue(
cacheKey,
JSON.stringify({ team_id: teamId, price_id: priceId }),
10
60
);
}
} catch (error) {
@ -233,6 +252,13 @@ export async function supaAuthenticateUser(
subscriptionData.plan
);
break;
case RateLimiterMode.Map:
rateLimiter = getRateLimiter(
RateLimiterMode.Map,
token,
subscriptionData.plan
);
break;
case RateLimiterMode.CrawlStatus:
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
break;
@ -285,6 +311,9 @@ export async function supaAuthenticateUser(
token === "this_is_just_a_preview_token" &&
(mode === RateLimiterMode.Scrape ||
mode === RateLimiterMode.Preview ||
mode === RateLimiterMode.Map ||
mode === RateLimiterMode.Crawl ||
mode === RateLimiterMode.CrawlStatus ||
mode === RateLimiterMode.Search)
) {
return { success: true, team_id: "preview" };
@ -327,10 +356,10 @@ export async function supaAuthenticateUser(
return {
success: true,
team_id: subscriptionData.team_id,
plan: subscriptionData.plan ?? "",
plan: (subscriptionData.plan ?? "") as PlanType,
};
}
function getPlanByPriceId(price_id: string) {
function getPlanByPriceId(price_id: string): PlanType {
switch (price_id) {
case process.env.STRIPE_PRICE_ID_STARTER:
return "starter";

View File

@ -1,231 +0,0 @@
import { ExtractorOptions, PageOptions } from './../lib/entities';
import { Request, Response } from "express";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
import { addScrapeJob } from '../services/queue-jobs';
import { getScrapeQueue } from '../services/queue-service';
import { v4 as uuidv4 } from "uuid";
import { Logger } from '../lib/logger';
import * as Sentry from "@sentry/node";
/**
 * Enqueues a single-URL scrape job and waits for it to finish.
 *
 * The job is added to the scrape queue under `jobId` and its state is polled
 * once per second inside a Sentry span until it is "completed", "failed", or
 * `timeout` milliseconds have elapsed.
 *
 * @param jobId - Id under which the job is enqueued.
 * @param req - Incoming request; `body.url` and `body.origin` are read from it.
 * @param team_id - Team the job is attributed to.
 * @param crawlerOptions - Passed through to the job unchanged.
 * @param pageOptions - Passed through to the job; `includeRawHtml` also
 *   decides whether `rawHtml` is stripped from the result.
 * @param extractorOptions - Passed through to the job.
 * @param timeout - Maximum time to wait for the job, in milliseconds.
 * @param plan - Accepted for interface compatibility; not used in this function.
 * @returns A result object whose `returnCode` is the HTTP status to respond
 *   with: 400 (missing URL), 403 (blocked URL), 408 (timed out), 500 (known
 *   LLM-extraction failure) or 200 (document found, or no document found).
 * @throws Any other job failure reason or queue error is rethrown to the caller.
 */
export async function scrapeHelper(
  jobId: string,
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
  timeout: number,
  plan?: string
): Promise<{
  success: boolean;
  error?: string;
  data?: Document;
  returnCode: number;
}> {
  const url = req.body.url;
  if (!url) {
    return { success: false, error: "Url is required", returnCode: 400 };
  }

  if (isUrlBlocked(url)) {
    return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
  }

  const job = await addScrapeJob({
    url,
    mode: "single_urls",
    crawlerOptions,
    team_id,
    pageOptions,
    extractorOptions,
    origin: req.body.origin ?? defaultOrigin,
  }, {}, jobId);

  let doc;

  const err = await Sentry.startSpan({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => {
    try {
      // Sequential polling loop. Compared with an async setInterval callback
      // this guarantees that (a) ticks never overlap and (b) an error thrown
      // by the queue lands in the catch below instead of becoming an unhandled
      // rejection that leaves the request waiting forever.
      const start = Date.now();
      let returnValue;
      let finished = false;
      while (!finished) {
        await new Promise((resolve) => setTimeout(resolve, 1000));
        if (Date.now() >= start + timeout) {
          // The catch below recognises the timeout by this message prefix.
          throw new Error("Job wait ");
        }
        const state = await job.getState();
        if (state === "completed") {
          returnValue = (await getScrapeQueue().getJob(job.id)).returnvalue;
          finished = true;
        } else if (state === "failed") {
          // The raw failedReason (a string) is thrown on purpose: the catch
          // below matches known LLM-extraction failures with `typeof e === "string"`.
          throw (await getScrapeQueue().getJob(job.id)).failedReason;
        }
      }
      // Optional chaining so a missing return value reaches the "No page
      // found" branch below instead of raising a TypeError here.
      doc = returnValue?.[0];
    } catch (e) {
      if (e instanceof Error && e.message.startsWith("Job wait")) {
        span.setAttribute("timedOut", true);
        return {
          success: false,
          error: "Request timed out",
          returnCode: 408,
        }
      } else if (typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function") || e.includes("LLM extraction did not match the extraction schema you provided."))) {
        return {
          success: false,
          error: e,
          returnCode: 500,
        };
      } else {
        throw e;
      }
    }
    span.setAttribute("result", JSON.stringify(doc));
    return null;
  });

  if (err !== null) {
    return err;
  }

  await job.remove();

  if (!doc) {
    console.error("!!! PANIC DOC IS", doc, job);
    return { success: true, error: "No page found", returnCode: 200, data: doc };
  }

  // Drop fields that are not part of the response.
  delete doc.index;
  delete doc.provider;

  // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
  if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
    delete doc.rawHtml;
  }

  return {
    success: true,
    data: doc,
    returnCode: 200,
  };
}
/**
 * Express handler for a single-page scrape.
 *
 * Flow: authenticate the caller, merge the request options over the defaults,
 * validate LLM-extraction settings, check that the team has at least one
 * credit, run the scrape through `scrapeHelper`, bill LLM-extraction credits,
 * log the job and respond with the helper's result and return code.
 *
 * Responses produced here: the auth failure status, 400 (invalid extraction
 * schema), 402 (insufficient credits / billing failure), 500 (credit check
 * error or unexpected exception), otherwise `result.returnCode`.
 */
export async function scrapeController(req: Request, res: Response) {
  try {
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status, plan } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Scrape
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
    const origin = req.body.origin ?? defaultOrigin;
    let timeout = req.body.timeout ?? defaultTimeout;

    const isLlmExtraction = extractorOptions.mode.includes("llm-extraction");
    if (isLlmExtraction) {
      if (typeof extractorOptions.extractionSchema !== "object" || extractorOptions.extractionSchema === null) {
        return res.status(400).json({ error: "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified" });
      }

      pageOptions.onlyMainContent = true;
      // LLM extraction gets a 90 second default unless the caller set a timeout.
      timeout = req.body.timeout ?? 90000;
    }

    // checkCredits
    try {
      const { success: creditsCheckSuccess } = await checkTeamCredits(team_id, 1);
      if (!creditsCheckSuccess) {
        return res.status(402).json({ error: "Insufficient credits" });
      }
    } catch (error) {
      Logger.error(error);
      return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
    }

    const jobId = uuidv4();

    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      jobId,
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      extractorOptions,
      timeout,
      plan
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;

    if (result.success) {
      // billing for doc done on queue end; only the LLM-extraction surcharge
      // is billed from here.
      const creditsPerLLMExtract = 50;
      const creditsToBeBilled = isLlmExtraction ? creditsPerLLMExtract : 0;

      const billingResult = await billTeam(
        team_id,
        creditsToBeBilled
      );
      if (!billingResult.success) {
        return res.status(402).json({
          success: false,
          error: "Failed to bill team. Insufficient credits or subscription not found.",
        });
      }
    }

    logJob({
      job_id: jobId,
      success: result.success,
      message: result.error,
      num_docs: 1,
      docs: [result.data],
      time_taken: timeTakenInSeconds,
      team_id: team_id,
      mode: "scrape",
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
      origin: origin,
      extractor_options: extractorOptions,
      num_tokens: numTokens,
    });

    return res.status(result.returnCode).json(result);
  } catch (error) {
    Sentry.captureException(error);
    Logger.error(error);
    return res.status(500).json({ error: typeof error === "string" ? error : (error?.message ?? "Internal Server Error") });
  }
}

View File

@ -1,11 +1,10 @@
import { Request, Response } from "express";
import { Job } from "bullmq";
import { Logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service";
import { checkAlerts } from "../../services/alerts";
import { exec } from "node:child_process";
import { sendSlackWebhook } from "../../services/alerts/slack";
import { Logger } from "../../../lib/logger";
import { getScrapeQueue } from "../../../services/queue-service";
import { checkAlerts } from "../../../services/alerts";
import { sendSlackWebhook } from "../../../services/alerts/slack";
export async function cleanBefore24hCompleteJobsController(
req: Request,
@ -94,26 +93,34 @@ export async function autoscalerController(req: Request, res: Response) {
const scrapeQueue = getScrapeQueue();
const [webScraperActive, webScraperWaiting, webScraperPriority] = await Promise.all([
scrapeQueue.getActiveCount(),
scrapeQueue.getWaitingCount(),
scrapeQueue.getPrioritizedCount(),
]);
const [webScraperActive, webScraperWaiting, webScraperPriority] =
await Promise.all([
scrapeQueue.getActiveCount(),
scrapeQueue.getWaitingCount(),
scrapeQueue.getPrioritizedCount(),
]);
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
// get number of machines active
const request = await fetch('https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines',
const request = await fetch(
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
{
headers: {
'Authorization': `Bearer ${process.env.FLY_API_TOKEN}`
}
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
},
}
)
);
const machines = await request.json();
// Only worker machines
const activeMachines = machines.filter(machine => (machine.state === 'started' || machine.state === "starting" || machine.state === "replacing") && machine.config.env["FLY_PROCESS_GROUP"] === "worker").length;
const activeMachines = machines.filter(
(machine) =>
(machine.state === "started" ||
machine.state === "starting" ||
machine.state === "replacing") &&
machine.config.env["FLY_PROCESS_GROUP"] === "worker"
).length;
let targetMachineCount = activeMachines;
@ -123,29 +130,57 @@ export async function autoscalerController(req: Request, res: Response) {
// Scale up logic
if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 3));
targetMachineCount = Math.min(
maxNumberOfMachines,
activeMachines + baseScaleUp * 3
);
} else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 2));
targetMachineCount = Math.min(
maxNumberOfMachines,
activeMachines + baseScaleUp * 2
);
} else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + baseScaleUp);
targetMachineCount = Math.min(
maxNumberOfMachines,
activeMachines + baseScaleUp
);
}
// Scale down logic
if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 3));
targetMachineCount = Math.max(
minNumberOfMachines,
activeMachines - baseScaleDown * 3
);
} else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 2));
targetMachineCount = Math.max(
minNumberOfMachines,
activeMachines - baseScaleDown * 2
);
} else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
targetMachineCount = Math.max(minNumberOfMachines, activeMachines - baseScaleDown);
targetMachineCount = Math.max(
minNumberOfMachines,
activeMachines - baseScaleDown
);
}
if (targetMachineCount !== activeMachines) {
Logger.info(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`);
Logger.info(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
);
if(targetMachineCount > activeMachines) {
sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? "");
if (targetMachineCount > activeMachines) {
sendSlackWebhook(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
false,
process.env.SLACK_AUTOSCALER ?? ""
);
} else {
sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? "");
sendSlackWebhook(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
false,
process.env.SLACK_AUTOSCALER ?? ""
);
}
return res.status(200).json({
mode: "scale-descale",

View File

@ -1,7 +1,7 @@
import { Request, Response } from "express";
import Redis from "ioredis";
import { Logger } from "../../lib/logger";
import { redisRateLimitClient } from "../../services/rate-limiter";
import { Logger } from "../../../lib/logger";
import { redisRateLimitClient } from "../../../services/rate-limiter";
export async function redisHealthController(req: Request, res: Response) {
const retryOperation = async (operation, retries = 3) => {

View File

@ -0,0 +1,60 @@
import { Request, Response } from "express";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
/**
 * Express handler that marks a crawl as cancelled.
 *
 * Flow:
 *  1. Authenticate the caller (rate-limited under `RateLimiterMode.CrawlStatus`).
 *  2. Load the stored crawl for `req.params.jobId`; respond 404 when it is missing.
 *  3. When `USE_DB_AUTHENTICATION` is `"true"`, verify through the
 *     `bulljobs_teams` table that the job belongs to the caller's team
 *     (403 when it does not, 500 when the lookup itself fails).
 *  4. Set `cancelled = true` on the stored crawl, save it, and respond with
 *     `{ status: "cancelled" }`.
 *
 * Unexpected failures are reported to Sentry and the logger and answered with
 * a 500 whose `error` field is always populated.
 */
export async function crawlCancelController(req: Request, res: Response) {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';

    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.CrawlStatus
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    const sc = await getCrawl(req.params.jobId);
    if (!sc) {
      return res.status(404).json({ error: "Job not found" });
    }

    // check if the job belongs to the team
    if (useDbAuthentication) {
      const { data, error: supaError } = await supabase_service
        .from("bulljobs_teams")
        .select("*")
        .eq("job_id", req.params.jobId)
        .eq("team_id", team_id);
      if (supaError) {
        return res.status(500).json({ error: supaError.message });
      }

      if (data.length === 0) {
        return res.status(403).json({ error: "Unauthorized" });
      }
    }

    try {
      sc.cancelled = true;
      await saveCrawl(req.params.jobId, sc);
    } catch (error) {
      // NOTE(review): a failed save is only logged, so the client is still
      // told "cancelled" below. Kept as-is (best effort) — confirm intended.
      Logger.error(error);
    }

    return res.json({
      status: "cancelled"
    });
  } catch (error) {
    Sentry.captureException(error);
    Logger.error(error);
    // A thrown value is not guaranteed to be an Error; fall back so the
    // response body never carries an undefined `error` field.
    return res.status(500).json({
      error:
        typeof error === "string"
          ? error
          : (error?.message ?? "Internal Server Error"),
    });
  }
}

View File

@ -1,17 +1,19 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { getScrapeQueue } from "../../src/services/queue-service";
import { Logger } from "../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
import { supabaseGetJobsById } from "../../src/lib/supabase-jobs";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
export async function getJobs(ids: string[]) {
export async function getJobs(crawlId: string, ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsById(ids);
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
supabaseData.forEach(x => {
const job = jobs.find(y => y.id === x.job_id);
@ -50,12 +52,25 @@ export async function crawlStatusController(req: Request, res: Response) {
const jobIDs = await getCrawlJobs(req.params.jobId);
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
if (
jobs.length > 0 &&
jobs[0].data &&
jobs[0].data.pageOptions &&
!jobs[0].data.pageOptions.includeRawHtml
) {
data.forEach(item => {
if (item) {
delete item.rawHtml;
}
});
}
res.json({
status: jobStatus,
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,

View File

@ -1,35 +1,24 @@
import { Request, Response } from "express";
import { checkTeamCredits } from "../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addScrapeJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create";
import {
defaultCrawlPageOptions,
defaultCrawlerOptions,
defaultOrigin,
} from "../../src/lib/default-values";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
import {
addCrawlJob,
addCrawlJobs,
crawlToCrawler,
lockURL,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../src/services/queue-service";
import { checkAndUpdateURL } from "../../src/lib/validateUrl";
import { Logger } from "../../../src/lib/logger";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
export async function crawlController(req: Request, res: Response) {
try {
const { success, team_id, error, status } = await authenticateUser(
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
@ -148,6 +137,7 @@ export async function crawlController(req: Request, res: Response) {
crawlerOptions,
pageOptions,
team_id,
plan,
createdAt: Date.now(),
};
@ -163,7 +153,15 @@ export async function crawlController(req: Request, res: Response) {
? null
: await crawler.tryGetSitemap();
if (sitemap !== null && sitemap.length > 0) {
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if(sitemap.length > 1000){
// set base to 21
jobPriority = await getJobPriority({plan, team_id, basePriority: 21})
}
const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
@ -181,7 +179,7 @@ export async function crawlController(req: Request, res: Response) {
},
opts: {
jobId: uuid,
priority: 20,
priority: jobPriority,
},
};
});
@ -204,6 +202,10 @@ export async function crawlController(req: Request, res: Response) {
}
} else {
await lockURL(id, sc, url);
// Not needed, first one should be 15.
// const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
const job = await addScrapeJob(
{
url,

View File

@ -1,17 +1,17 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
import { addScrapeJob } from "../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../src/lib/validateUrl";
import { Logger } from "../../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
export async function crawlPreviewController(req: Request, res: Response) {
try {
const { success, error, status } = await authenticateUser(
const { success, error, status, team_id:a, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Preview
@ -89,6 +89,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
crawlerOptions,
pageOptions,
team_id,
plan,
robots,
createdAt: Date.now(),
};

View File

@ -1,8 +1,8 @@
import { AuthResponse, RateLimiterMode } from "../types";
import { AuthResponse, RateLimiterMode } from "../../types";
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { authenticateUser } from "../auth";
export const keyAuthController = async (req: Request, res: Response) => {

View File

@ -0,0 +1,295 @@
import { ExtractorOptions, PageOptions } from "./../../lib/entities";
import { Request, Response } from "express";
import {
billTeam,
checkTeamCredits,
} from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import { Document } from "../../lib/entities";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import {
defaultPageOptions,
defaultExtractorOptions,
defaultTimeout,
defaultOrigin,
} from "../../lib/default-values";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getScrapeQueue } from "../../services/queue-service";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../lib/logger";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
/**
 * Queues a single-URL scrape job, waits for it to finish and shapes the
 * resulting document for the API response.
 *
 * @param jobId Identifier used for the queued job.
 * @param req Incoming request; `req.body.url` and `req.body.origin` are read.
 * @param team_id Team the job is queued for.
 * @param crawlerOptions Crawler options forwarded untouched to the job.
 * @param pageOptions Page options; `includeRawHtml` / `includeHtml` decide
 *   which HTML fields are stripped from the returned document.
 * @param extractorOptions Extractor options forwarded to the job.
 * @param timeout Value handed to `waitForJob` as the wait limit.
 * @param plan Optional plan passed to `getJobPriority`.
 * @returns An object with `success`, `returnCode`, and optionally `error`
 *   and `data`.
 */
export async function scrapeHelper(
  jobId: string,
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
  timeout: number,
  plan?: PlanType
): Promise<{
  success: boolean;
  error?: string;
  data?: Document;
  returnCode: number;
}> {
  const targetUrl = req.body.url;

  // Guard clauses: reject requests before anything is queued.
  if (typeof targetUrl !== "string") {
    return { success: false, error: "Url is required", returnCode: 400 };
  }
  if (isUrlBlocked(targetUrl)) {
    return {
      success: false,
      error:
        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
      returnCode: 403,
    };
  }

  const priority = await getJobPriority({ plan, team_id, basePriority: 10 });

  const jobPayload = {
    url: targetUrl,
    mode: "single_urls",
    crawlerOptions,
    team_id,
    pageOptions,
    extractorOptions,
    origin: req.body.origin ?? defaultOrigin,
    is_scrape: true,
  };
  const queuedJob = await addScrapeJob(jobPayload, {}, jobId, priority);

  let scraped;

  // The span callback resolves to a ready-made failure response, or to null
  // when the job produced a result (stored in `scraped`).
  const failure = await Sentry.startSpan(
    {
      name: "Wait for job to finish",
      op: "bullmq.wait",
      attributes: { job: jobId },
    },
    async (span) => {
      try {
        const [firstDoc] = [(await waitForJob(queuedJob.id, timeout))[0]];
        scraped = firstDoc;
      } catch (e) {
        // Errors whose message starts with "Job wait" are reported as timeouts.
        if (e instanceof Error && e.message.startsWith("Job wait")) {
          span.setAttribute("timedOut", true);
          return {
            success: false,
            error: "Request timed out",
            returnCode: 408,
          };
        }

        // Known LLM-extraction failures arrive as plain strings and are
        // passed through to the caller verbatim.
        if (
          typeof e === "string" &&
          (e.includes("Error generating completions: ") ||
            e.includes("Invalid schema for function") ||
            e.includes(
              "LLM extraction did not match the extraction schema you provided."
            ))
        ) {
          return {
            success: false,
            error: e,
            returnCode: 500,
          };
        }

        throw e;
      }

      span.setAttribute("result", JSON.stringify(scraped));
      return null;
    }
  );

  if (failure !== null) {
    return failure;
  }

  await queuedJob.remove();

  if (!scraped) {
    console.error("!!! PANIC DOC IS", scraped, queuedJob);
    return {
      success: true,
      error: "No page found",
      returnCode: 200,
      data: scraped,
    };
  }

  // Internal bookkeeping fields are never returned.
  delete scraped.index;
  delete scraped.provider;

  // rawHtml is dropped only in llm-extraction-from-raw-html mode when the
  // caller did not ask for it.
  const dropRawHtml =
    !pageOptions.includeRawHtml &&
    extractorOptions.mode == "llm-extraction-from-raw-html";
  if (dropRawHtml && scraped.rawHtml) {
    delete scraped.rawHtml;
  }

  if (!pageOptions.includeHtml && scraped.html) {
    delete scraped.html;
  }

  return {
    success: true,
    data: scraped,
    returnCode: 200,
  };
}
/**
 * Express handler for the single-page scrape endpoint.
 *
 * Steps:
 *  1. Authenticate the caller (`RateLimiterMode.Scrape`).
 *  2. Merge request options with the defaults; in any `llm-extraction` mode an
 *     object `extractionSchema` is required, `onlyMainContent` is forced on
 *     and the default timeout becomes 90000.
 *  3. Check that the team has at least one credit (402 / 500 otherwise).
 *  4. Run the scrape through `scrapeHelper`.
 *  5. On success, bill the team in the background (1 credit, plus 4 in
 *     llm-extraction mode); billing failures are logged, not surfaced.
 *  6. Strip fields the caller did not request, log the job, and answer with
 *     the helper's `returnCode` and result.
 *
 * Unexpected errors are sent to Sentry and the logger and answered with 500.
 */
export async function scrapeController(req: Request, res: Response) {
  try {
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status, plan } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Scrape
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    const extractorOptions = {
      ...defaultExtractorOptions,
      ...req.body.extractorOptions,
    };
    const origin = req.body.origin ?? defaultOrigin;
    let timeout = req.body.timeout ?? defaultTimeout;

    if (extractorOptions.mode.includes("llm-extraction")) {
      if (
        typeof extractorOptions.extractionSchema !== "object" ||
        extractorOptions.extractionSchema === null
      ) {
        return res.status(400).json({
          error:
            "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
        });
      }

      pageOptions.onlyMainContent = true;
      timeout = req.body.timeout ?? 90000;
    }

    // checkCredits
    try {
      const { success: creditsCheckSuccess } = await checkTeamCredits(
        team_id,
        1
      );
      if (!creditsCheckSuccess) {
        return res.status(402).json({ error: "Insufficient credits" });
      }
    } catch (error) {
      Logger.error(error);
      return res.status(500).json({
        error:
          "Error checking team credits. Please contact hello@firecrawl.com for help.",
      });
    }

    const jobId = uuidv4();

    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      jobId,
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      extractorOptions,
      timeout,
      plan
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;

    // Token count is taken before markdown may be stripped further down.
    const numTokens =
      result.data && result.data.markdown
        ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
        : 0;

    if (result.success) {
      let creditsToBeBilled = 1;
      const creditsPerLLMExtract = 4;

      if (extractorOptions.mode.includes("llm-extraction")) {
        creditsToBeBilled += creditsPerLLMExtract;
      }

      // NOTE(review): an earlier comment here said the per-document credit is
      // billed at queue end and only the LLM surcharge is billed here, yet the
      // base credit is included in this amount — confirm there is no double
      // billing.
      // Fire-and-forget: a billing failure must not fail the request.
      billTeam(team_id, creditsToBeBilled).catch(error => {
        Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
        // Optionally, you could notify an admin or add to a retry queue here
      });
    }

    // `doc` aliases `result.data`, so the deletions below also affect the
    // JSON body returned to the client.
    let doc = result.data;
    if (!pageOptions || !pageOptions.includeRawHtml) {
      if (doc && doc.rawHtml) {
        delete doc.rawHtml;
      }
    }

    if (pageOptions && pageOptions.includeExtract) {
      if (!pageOptions.includeMarkdown && doc && doc.markdown) {
        delete doc.markdown;
      }
    }

    logJob({
      job_id: jobId,
      success: result.success,
      message: result.error,
      num_docs: 1,
      docs: [doc],
      time_taken: timeTakenInSeconds,
      team_id: team_id,
      mode: "scrape",
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
      origin: origin,
      extractor_options: extractorOptions,
      num_tokens: numTokens,
    });

    return res.status(result.returnCode).json(result);
  } catch (error) {
    Sentry.captureException(error);
    Logger.error(error);
    return res.status(500).json({
      error:
        typeof error === "string"
          ? error
          : error?.message ?? "Internal Server Error",
    });
  }
}

View File

@ -1,17 +1,18 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { PageOptions, SearchOptions } from "../lib/entities";
import { search } from "../search";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { WebScraperDataProvider } from "../../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import { PageOptions, SearchOptions } from "../../lib/entities";
import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../lib/logger";
import { getScrapeQueue } from "../services/queue-service";
import { Logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import * as Sentry from "@sentry/node";
import { addScrapeJob } from "../services/queue-jobs";
import { getJobPriority } from "../../lib/job-priority";
export async function searchHelper(
jobId: string,
@ -20,6 +21,7 @@ export async function searchHelper(
crawlerOptions: any,
pageOptions: PageOptions,
searchOptions: SearchOptions,
plan: PlanType
): Promise<{
success: boolean;
error?: string;
@ -52,18 +54,10 @@ export async function searchHelper(
if (justSearch) {
const billingResult = await billTeam(
team_id,
res.length
);
if (!billingResult.success) {
return {
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
returnCode: 402,
};
}
billTeam(team_id, res.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
return { success: true, data: res, returnCode: 200 };
}
@ -76,6 +70,8 @@ export async function searchHelper(
return { success: true, error: "No search results found", returnCode: 200 };
}
const jobPriority = await getJobPriority({plan, team_id, basePriority: 20});
// filter out social media links
const jobDatas = res.map(x => {
@ -92,7 +88,7 @@ export async function searchHelper(
},
opts: {
jobId: uuid,
priority: 20,
priority: jobPriority,
}
};
})
@ -108,24 +104,7 @@ export async function searchHelper(
await getScrapeQueue().addBulk(jobs);
}
const docs = (await Promise.all(jobs.map(x => new Promise((resolve, reject) => {
const start = Date.now();
const int = setInterval(async () => {
if (Date.now() >= start + 60000) {
clearInterval(int);
reject(new Error("Job wait "));
} else {
const state = await x.getState();
if (state === "completed") {
clearInterval(int);
resolve((await getScrapeQueue().getJob(x.id)).returnvalue);
} else if (state === "failed") {
clearInterval(int);
reject((await getScrapeQueue().getJob(x.id)).failedReason);
}
}
}, 1000);
})))).map(x => x[0]);
const docs = (await Promise.all(jobs.map(x => waitForJob(x.id, 60000)))).map(x => x[0]);
if (docs.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };
@ -152,7 +131,7 @@ export async function searchHelper(
export async function searchController(req: Request, res: Response) {
try {
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status } = await authenticateUser(
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Search
@ -162,17 +141,16 @@ export async function searchController(req: Request, res: Response) {
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? {
includeHtml: false,
onlyMainContent: true,
fetchPageContent: true,
removeTags: [],
fallback: false,
includeHtml: req.body.pageOptions?.includeHtml ?? false,
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
removeTags: req.body.pageOptions?.removeTags ?? [],
fallback: req.body.pageOptions?.fallback ?? false,
};
const origin = req.body.origin ?? "api";
const searchOptions = req.body.searchOptions ?? { limit: 5 };
const jobId = uuidv4();
try {
@ -194,6 +172,7 @@ export async function searchController(req: Request, res: Response) {
crawlerOptions,
pageOptions,
searchOptions,
plan
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;

View File

@ -1,6 +1,6 @@
import { Request, Response } from "express";
import { Logger } from "../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
@ -22,7 +22,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
// }
// }
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";

View File

@ -0,0 +1,47 @@
import { crawlController } from '../crawl'
import { Request, Response } from 'express';
import { authenticateUser } from '../auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create';
import { validateIdempotencyKey } from '../../services/idempotency/validate';
import { v4 as uuidv4 } from 'uuid';
// Replace the auth module with a stub: authenticateUser always resolves to a
// successful authentication for team 'team123', so the controller under test
// never performs a real credential check.
jest.mock('../auth', () => ({
authenticateUser: jest.fn().mockResolvedValue({
success: true,
team_id: 'team123',
error: null,
status: 200
}),
// NOTE(review): nothing in this test file calls `reduce` — confirm whether
// this stub is still needed.
reduce: jest.fn()
}));
// Auto-mock the idempotency validator; the test below scripts its return values.
jest.mock('../../services/idempotency/validate');
describe('crawlController', () => {
  it('should prevent duplicate requests using the same idempotency key', async () => {
    // Both controller calls below share this request, and therefore the same
    // idempotency key. uuidv4() is synchronous, so it is not awaited.
    const req = {
      headers: {
        'x-idempotency-key': uuidv4(),
        'Authorization': `Bearer ${process.env.TEST_API_KEY}`
      },
      body: {
        url: 'https://mendable.ai'
      }
    } as unknown as Request;

    // Minimal Response double: status() returns the same object so that
    // res.status(...).json(...) can be chained.
    const res = {
      status: jest.fn().mockReturnThis(),
      json: jest.fn()
    } as unknown as Response;

    // Mock the idempotency key validation to return false for the second call
    (validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false);

    // First request should succeed
    await crawlController(req, res);
    expect(res.status).not.toHaveBeenCalledWith(409);

    // Second request with the same key should fail
    await crawlController(req, res);
    expect(res.status).toHaveBeenCalledWith(409);
    expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' });
  });
});

View File

@ -0,0 +1,64 @@
import { url } from "../types";
// Exercises the `url` schema: protocol defaulting, protocol / TLD validation
// and blocklist rejection.
describe("URL Schema Validation", () => {
  // Error message expected for every blocklisted URL.
  const blockedMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";

  beforeEach(() => {
    jest.resetAllMocks();
  });

  it("should prepend http:// to URLs without a protocol", () => {
    const result = url.parse("example.com");
    expect(result).toBe("http://example.com");
  });

  it("should allow valid URLs with http or https", () => {
    expect(() => url.parse("http://example.com")).not.toThrow();
    expect(() => url.parse("https://example.com")).not.toThrow();
  });

  // Title previously duplicated the test above although it covers a different case.
  it("should allow valid URLs without a protocol", () => {
    expect(() => url.parse("example.com")).not.toThrow();
  });

  it("should reject URLs with unsupported protocols", () => {
    expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL");
  });

  it("should reject URLs without a valid top-level domain", () => {
    expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path");
  });

  it("should reject blocked URLs", () => {
    expect(() => url.parse("https://facebook.com")).toThrow(blockedMessage);
  });

  it("should handle URLs with subdomains correctly", () => {
    expect(() => url.parse("http://sub.example.com")).not.toThrow();
    expect(() => url.parse("https://blog.example.com")).not.toThrow();
  });

  it("should handle URLs with paths correctly", () => {
    expect(() => url.parse("http://example.com/path")).not.toThrow();
    expect(() => url.parse("https://example.com/another/path")).not.toThrow();
  });

  it("should handle URLs with subdomains that are blocked", () => {
    expect(() => url.parse("https://sub.facebook.com")).toThrow(blockedMessage);
  });

  it("should handle URLs with paths that are blocked", () => {
    expect(() => url.parse("http://facebook.com/path")).toThrow(blockedMessage);
    expect(() => url.parse("https://facebook.com/another/path")).toThrow(blockedMessage);
  });

  it("should reject malformed URLs starting with 'http://http'", () => {
    expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. Invalid protocol.");
  });

  // Title previously said "should reject…" while the assertion expects the URL to be accepted.
  it("should allow URLs containing 'http://' inside the path", () => {
    expect(() => url.parse("http://example.com/http://example.com")).not.toThrow();
  });

  // Title previously duplicated the "multiple 'http://'" title; the input is a hostname with a space.
  it("should reject URLs containing whitespace", () => {
    expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
  });
});

View File

@ -1,10 +1,12 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { supabase_service } from "../../src/services/supabase";
import { Logger } from "../../src/lib/logger";
import { getCrawl, saveCrawl } from "../../src/lib/crawl-redis";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../types";
import { supabase_service } from "../../services/supabase";
import { Logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
export async function crawlCancelController(req: Request, res: Response) {
try {

View File

@ -0,0 +1,162 @@
import { authMiddleware } from "../../routes/v1";
import { RateLimiterMode } from "../../types";
import { authenticateUser } from "../auth";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../lib/logger";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { getJob, getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
/** Sent (or used as the close reason) when the status stream fails. */
type ErrorMessage = {
  type: "error";
  error: string;
};

/** Snapshot of the crawl status, sent once near the start of the connection. */
type CatchupMessage = {
  type: "catchup";
  data: CrawlStatusResponse;
};

/** A single scraped document belonging to the crawl. */
type DocumentMessage = {
  type: "document";
  data: Document;
};

/** Signals that nothing more will be sent. */
type DoneMessage = { type: "done" };

/** Every message shape exchanged over the crawl-status socket, discriminated by `type`. */
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
/**
 * Serialises `msg` as JSON and sends it over `ws`.
 *
 * Nothing is sent (and `undefined` is returned) unless the socket's
 * `readyState` is 1, the OPEN state. Otherwise the returned promise resolves
 * with `null` once the send callback reports success, or rejects with the
 * error the callback reports.
 */
function send(ws: WebSocket, msg: Message) {
  const socketIsOpen = ws.readyState === 1;
  if (!socketIsOpen) {
    return;
  }

  return new Promise((resolve, reject) => {
    const onSent = (err) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(null);
    };
    ws.send(JSON.stringify(msg), onSent);
  });
}
/**
 * Closes `ws` with the given close `code`, using the JSON-serialised `msg`
 * as the close reason.
 *
 * Does nothing when `readyState` is above 1, i.e. the socket is already
 * closing or closed.
 */
function close(ws: WebSocket, code: number, msg: Message) {
  const alreadyClosing = ws.readyState > 1;
  if (alreadyClosing) {
    return;
  }

  const reason = JSON.stringify(msg);
  ws.close(code, reason);
}
/**
 * Streams the status of a crawl over a WebSocket.
 *
 * After checking that the crawl exists and belongs to the authenticated team,
 * the function:
 *  - starts a 1-second polling loop that emits a "document" message for every
 *    job that newly reached the "completed" or "failed" state, and
 *  - sends one "catchup" message carrying the current status and the
 *    documents of all jobs already done.
 *
 * The socket is closed with code 1000 and a "done" message once the crawl is
 * no longer scraping, with 1008 when the crawl is unknown, with 3003 when it
 * belongs to another team, with 3000 when a finished job has no return value,
 * and with 1011 when polling itself fails.
 */
async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
  const sc = await getCrawl(req.params.jobId);
  if (!sc) {
    return close(ws, 1008, { type: "error", error: "Job not found" });
  }

  if (sc.team_id !== req.auth.team_id) {
    return close(ws, 3003, { type: "error", error: "Forbidden" });
  }

  let doneJobIDs = [];
  let finished = false;

  // send() is fire-and-forget here; without a handler a failed send would
  // surface as an unhandled promise rejection.
  const logSendFailure = (err) => Logger.error(err);

  const loop = async () => {
    // Stop polling once the catch-up path finished the stream or the socket
    // is closing/closed (readyState > 1) — e.g. the client disconnected.
    // Otherwise the loop would keep querying until the crawl itself ends.
    if (finished || ws.readyState > 1) return;

    try {
      const jobIDs = await getCrawlJobs(req.params.jobId);

      if (jobIDs.length === doneJobIDs.length) {
        return close(ws, 1000, { type: "done" });
      }

      const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
      const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
      const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);

      for (const jobID of newlyDoneJobIDs) {
        const job = await getJob(jobID);

        if (job.returnvalue) {
          send(ws, {
            type: "document",
            data: legacyDocumentConverter(job.returnvalue),
          })?.catch(logSendFailure);
        } else {
          // A finished job without a return value ends the stream with its
          // failure reason; the loop is not rescheduled.
          return close(ws, 3000, { type: "error", error: job.failedReason });
        }
      }

      doneJobIDs.push(...newlyDoneJobIDs);

      setTimeout(loop, 1000);
    } catch (err) {
      // This callback runs from a timer, outside any caller's try/catch, so
      // failures have to be handled here.
      Sentry.captureException(err);
      Logger.error(err);
      close(ws, 1011, { type: "error", error: "An unexpected error occurred while streaming the crawl status" });
    }
  };

  // The loop is scheduled before the catch-up data is gathered below.
  setTimeout(loop, 1000);

  doneJobIDs = await getDoneJobsOrdered(req.params.jobId);

  const jobIDs = await getCrawlJobs(req.params.jobId);
  const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
  const doneJobs = await getJobs(doneJobIDs);
  const data = doneJobs.map(x => x.returnvalue);

  send(ws, {
    type: "catchup",
    data: {
      success: true,
      status,
      total: jobIDs.length,
      completed: doneJobIDs.length,
      creditsUsed: jobIDs.length,
      expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
      data: data.map(x => legacyDocumentConverter(x)),
    }
  })?.catch(logSendFailure);

  if (status !== "scraping") {
    finished = true;
    return close(ws, 1000, { type: "done" });
  }
}
/**
 * WebSocket entry point for crawl status: authenticates the request, attaches
 * the resulting `auth` object and hands over to `crawlStatusWS`.
 *
 * - Authentication failure closes the socket with code 3000 and the auth error.
 * - Any thrown error is sent to Sentry, logged together with a fresh exception
 *   ID, and the socket is closed with code 1011 and a generic message that
 *   contains that ID.
 */
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
  try {
    const authResult = await authenticateUser(
      req,
      null,
      RateLimiterMode.CrawlStatus,
    );

    if (!authResult.success) {
      return close(ws, 3000, {
        type: "error",
        error: authResult.error,
      });
    }

    req.auth = { team_id: authResult.team_id, plan: authResult.plan };

    await crawlStatusWS(ws, req);
  } catch (err) {
    Sentry.captureException(err);

    const exceptionId = uuidv4();

    // Error instances serialize to "{}", so fall back to their interesting fields.
    const serialized = JSON.stringify(err);
    const verbose =
      serialized === "{}" && err instanceof Error
        ? JSON.stringify({
            message: err.message,
            name: err.name,
            stack: err.stack,
          })
        : serialized;

    Logger.error(`Error occurred in WebSocket! (${req.path}) -- ID ${exceptionId} -- ${verbose}`);

    return close(ws, 1011, {
      type: "error",
      error: `An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is ${exceptionId}`,
    });
  }
}

View File

@ -0,0 +1,130 @@
import { Response } from "express";
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
configDotenv();
/**
 * Loads one scrape job from the queue and normalizes its `returnvalue`.
 *
 * When `USE_DB_AUTHENTICATION` is "true" and a stored record exists for the ID,
 * the record's `docs` replace the queue's return value. An array return value
 * is reduced to its first element.
 *
 * @param id Job ID.
 * @returns The (mutated) job, or the falsy value the queue returned when no job was found.
 */
export async function getJob(id: string) {
  const queuedJob = await getScrapeQueue().getJob(id);

  if (queuedJob) {
    const useDb = process.env.USE_DB_AUTHENTICATION === "true";
    const stored = useDb ? await supabaseGetJobById(id) : null;
    if (stored) {
      queuedJob.returnvalue = stored.docs;
    }

    const value = queuedJob.returnvalue;
    if (Array.isArray(value)) {
      queuedJob.returnvalue = value[0];
    }
  }

  return queuedJob;
}
/**
 * Bulk variant of `getJob`: loads the given jobs from the queue, drops the ones
 * that were not found and normalizes each `returnvalue`.
 *
 * When `USE_DB_AUTHENTICATION` is "true", stored `docs` override the queue's
 * return value for every record whose `job_id` matches a loaded job. Array
 * return values are reduced to their first element.
 *
 * @param ids Job IDs, in the order the result should follow.
 * @returns The found jobs (mutated in place).
 */
export async function getJobs(ids: string[]) {
  const fetched = await Promise.all(ids.map(id => getScrapeQueue().getJob(id)));
  const jobs = fetched.filter(candidate => candidate);

  if (process.env.USE_DB_AUTHENTICATION === "true") {
    const rows = await supabaseGetJobsById(ids);
    for (const row of rows) {
      const match = jobs.find(candidate => candidate.id === row.job_id);
      if (match) {
        match.returnvalue = row.docs;
      }
    }
  }

  for (const job of jobs) {
    if (Array.isArray(job.returnvalue)) {
      job.returnvalue = job.returnvalue[0];
    }
  }

  return jobs;
}
/**
 * Returns the status of a crawl together with one page of finished documents.
 *
 * Query parameters:
 *   - `skip`:  number of finished documents to skip (default 0). Missing,
 *              non-numeric or negative values are treated as 0.
 *   - `limit`: maximum number of documents to return. Missing, non-numeric or
 *              non-positive values are treated as "no limit given"; in that case
 *              the page is capped at roughly 10 MiB of serialized documents.
 *
 * Responds 404 when the crawl is unknown and 403 when it belongs to another
 * team. The `next` link is omitted only when the crawl has stopped scraping and
 * this page reaches the end of the finished documents.
 */
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
  const sc = await getCrawl(req.params.jobId);
  if (!sc) {
    return res.status(404).json({ success: false, error: "Job not found" });
  }

  if (sc.team_id !== req.auth.team_id) {
    return res.status(403).json({ success: false, error: "Forbidden" });
  }

  // Accept only positive integers; anything else (NaN, 0, negatives, arrays) means "not provided".
  const parsePositiveInt = (raw: unknown): number | undefined => {
    if (typeof raw !== "string") return undefined;
    const parsed = parseInt(raw, 10);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : undefined;
  };

  const start = parsePositiveInt(req.query.skip) ?? 0;
  const limit = parsePositiveInt(req.query.limit);
  const end = limit === undefined ? undefined : start + limit - 1;

  const jobIDs = await getCrawlJobs(req.params.jobId);
  const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
  const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
  const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);

  let doneJobs = [];

  if (end === undefined) { // determine 10 megabyte limit
    let bytes = 0;
    const bytesLimit = 10485760; // 10 MiB in bytes
    const factor = 100; // chunking for faster retrieval

    for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) {
      // get current chunk and retrieve jobs
      const currentIDs = doneJobsOrder.slice(i, i + factor);
      const jobs = await getJobs(currentIDs);

      // add the jobs one by one to the byte counter;
      // both loops stop once the counter crosses the limit.
      // The size is the length of the serialized JSON string, used as a byte estimate.
      for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
        const job = jobs[ii];
        doneJobs.push(job);
        bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
      }
    }

    // If we ran over the limit, drop the document that crossed it -- unless it is
    // the only one: an empty page would produce a `next` link with the same
    // `skip`, and the client could never get past that document.
    if (bytes > bytesLimit && doneJobs.length > 1) {
      doneJobs.splice(doneJobs.length - 1, 1);
    }
  } else {
    doneJobs = await getJobs(doneJobsOrder);
  }

  const data = doneJobs.map(x => x.returnvalue);

  const protocol = process.env.ENV === "local" ? req.protocol : "https";

  const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);

  nextURL.searchParams.set("skip", (start + data.length).toString());

  if (limit !== undefined) {
    nextURL.searchParams.set("limit", limit.toString());
  }

  if (data.length > 0) {
    // rawHtml is stripped unless the crawl explicitly asked for it.
    if (!doneJobs[0].data.pageOptions.includeRawHtml) {
      for (let ii = 0; ii < doneJobs.length; ii++) {
        if (data[ii]) {
          delete data[ii].rawHtml;
        }
      }
    }
  }

  res.status(200).json({
    success: true,
    status,
    completed: doneJobsLength,
    total: jobIDs.length,
    creditsUsed: jobIDs.length,
    expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
    next:
      status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
        ? undefined
        : nextURL.href,
    data: data.map(x => legacyDocumentConverter(x)),
  });
}

View File

@ -0,0 +1,167 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
CrawlRequest,
crawlRequestSchema,
CrawlResponse,
legacyCrawlerOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
addCrawlJob,
addCrawlJobs,
crawlToCrawler,
lockURL,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
import { Logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";
/**
 * Starts a crawl (v1 API).
 *
 * Steps:
 *   1. Validate the body with `crawlRequestSchema` (throws on invalid input).
 *   2. Reject include/exclude patterns that are not valid regular expressions (400).
 *   3. Cap the crawl limit at the account's remaining credits.
 *   4. Store the crawl, then seed the queue either with one job per sitemap URL
 *      or, when no sitemap is used/found, with a single job for the start URL.
 *   5. Fire the "crawl.started" webhook when one is configured.
 *
 * Responds 200 with the crawl ID and the URL under which its status can be polled.
 */
export async function crawlController(
  req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
  res: Response<CrawlResponse>
) {
  req.body = crawlRequestSchema.parse(req.body);

  const id = uuidv4();

  await logCrawl(id, req.auth.team_id);

  const { remainingCredits } = req.account;

  const crawlerOptions = legacyCrawlerOptions(req.body);
  const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);

  // TODO: @rafa, is this right? copied from v0
  // Both pattern lists must compile as regular expressions.
  for (const patterns of [crawlerOptions.includes, crawlerOptions.excludes]) {
    if (!Array.isArray(patterns)) continue;
    for (const pattern of patterns) {
      try {
        new RegExp(pattern);
      } catch (e) {
        return res.status(400).json({ success: false, error: e instanceof Error ? e.message : String(e) });
      }
    }
  }

  crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions,
    pageOptions,
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan,
  };

  const crawler = crawlToCrawler(id, sc);

  try {
    sc.robots = await crawler.getRobotsTxt();
  } catch (e) {
    // Best effort: a missing robots.txt must not prevent the crawl.
    Logger.debug(
      `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
        e
      )}`
    );
  }

  await saveCrawl(id, sc);

  const sitemap = sc.crawlerOptions.ignoreSitemap
    ? null
    : await crawler.tryGetSitemap();

  if (sitemap !== null && sitemap.length > 0) {
    let jobPriority = 20;
    // If it is over 1000, we need to get the job priority,
    // otherwise we can use the default priority of 20
    if (sitemap.length > 1000) {
      // set base to 21
      jobPriority = await getJobPriority({ plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21 });
    }

    const jobs = sitemap.map((x) => {
      const url = x.url;
      const uuid = uuidv4();
      return {
        name: uuid,
        data: {
          url,
          mode: "single_urls",
          team_id: req.auth.team_id,
          crawlerOptions,
          pageOptions,
          origin: "api",
          crawl_id: id,
          sitemapped: true,
          webhook: req.body.webhook,
          v1: true,
        },
        opts: {
          jobId: uuid,
          // Use the priority computed above (previously a hard-coded 20 ignored it).
          priority: jobPriority,
        },
      };
    });

    await lockURLs(
      id,
      jobs.map((x) => x.data.url)
    );
    await addCrawlJobs(
      id,
      jobs.map((x) => x.opts.jobId)
    );
    await getScrapeQueue().addBulk(jobs);
  } else {
    await lockURL(id, sc, req.body.url);
    const job = await addScrapeJob(
      {
        url: req.body.url,
        mode: "single_urls",
        crawlerOptions: crawlerOptions,
        team_id: req.auth.team_id,
        pageOptions: pageOptions,
        origin: "api",
        crawl_id: id,
        webhook: req.body.webhook,
        v1: true,
      },
      {
        priority: 15,
      }
    );
    await addCrawlJob(id, job.id);
  }

  if (req.body.webhook) {
    await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started");
  }

  const protocol = process.env.ENV === "local" ? req.protocol : "https";

  return res.status(200).json({
    success: true,
    id,
    url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
  });
}

View File

@ -0,0 +1,6 @@
import { Request, Response } from "express";
/**
 * Liveness probe: unconditionally answers HTTP 200 with `{ status: "ok" }`.
 */
export async function livenessController(req: Request, res: Response) {
  //TODO: add checks if the application is live and healthy like checking the redis connection
  const body = { status: "ok" };
  res.status(200).json(body);
}

View File

@ -0,0 +1,142 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
legacyCrawlerOptions,
mapRequestSchema,
RequestWithAuth,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
import {
checkAndUpdateURLForMap,
isSameDomain,
isSameSubdomain,
removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { performCosineSimilarity } from "../../lib/map-cosine";
import { Logger } from "../../lib/logger";
configDotenv();
/**
 * Produces a list of URLs that belong to the requested site (v1 "map").
 *
 * Candidate links come from three places: the requested URL itself, the site's
 * sitemap (unless `ignoreSitemap` is set) and a `site:` search through
 * `fireEngineMap`. With a `search` term the search hits are placed first and
 * the list is then ranked with `performCosineSimilarity`. Afterwards every link
 * is normalized, restricted to the same domain (and to the same subdomain when
 * `includeSubdomains` is false), de-duplicated and cut to `limit`.
 *
 * The team is billed one credit (failures are only logged) and the run is
 * recorded through `logJob`.
 */
export async function mapController(
  req: RequestWithAuth<{}, MapResponse, MapRequest>,
  res: Response<MapResponse>
) {
  const startTime = Date.now();

  req.body = mapRequestSchema.parse(req.body);

  const limit: number = req.body.limit ?? 5000;
  const id = uuidv4();
  let links: string[] = [req.body.url];

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions: legacyCrawlerOptions(req.body),
    pageOptions: {},
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan,
  };

  const crawler = crawlToCrawler(id, sc);

  const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();
  if (sitemap !== null) {
    for (const entry of sitemap) {
      links.push(entry.url);
    }
  }

  // www. seems to exclude subdomains in some cases
  const urlWithoutWww = req.body.url.replace("www.", "");
  const mapUrl = req.body.search
    ? `"${req.body.search}" site:${urlWithoutWww}`
    : `site:${req.body.url}`;

  const mapResults = await fireEngineMap(mapUrl, {
    // limit to 100 results (beta)
    numResults: Math.min(limit, 100),
  });

  if (mapResults.length > 0) {
    const resultUrls = mapResults.map((hit) => hit.url);
    // With a search term the search hits go first (in their original order);
    // otherwise they are appended after the links collected so far.
    links = req.body.search
      ? [...resultUrls, ...links]
      : [...links, ...resultUrls];
  }

  // Perform cosine similarity between the search query and the list of links
  if (req.body.search) {
    const searchQuery = req.body.search.toLowerCase();
    links = performCosineSimilarity(links, searchQuery);
  }

  // Normalize every link; links that cannot be normalized are dropped.
  const normalized: string[] = [];
  for (const link of links) {
    try {
      normalized.push(checkAndUpdateURLForMap(link).url.trim());
    } catch (_) {
      // skip
    }
  }
  links = normalized;

  // allows for subdomains to be included
  links = links.filter((link) => isSameDomain(link, req.body.url));

  // if includeSubdomains is false, filter out subdomains
  if (!req.body.includeSubdomains) {
    links = links.filter((link) => isSameSubdomain(link, req.body.url));
  }

  // remove duplicates that could be due to http/https or www
  links = removeDuplicateUrls(links);

  billTeam(req.auth.team_id, 1).catch(error => {
    Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
    // Optionally, you could notify an admin or add to a retry queue here
  });

  const timeTakenInSeconds = (Date.now() - startTime) / 1000;

  const linksToReturn = links.slice(0, limit);

  logJob({
    job_id: id,
    success: links.length > 0,
    message: "Map completed",
    num_docs: linksToReturn.length,
    docs: linksToReturn,
    time_taken: timeTakenInSeconds,
    team_id: req.auth.team_id,
    mode: "map",
    url: req.body.url,
    crawlerOptions: {},
    pageOptions: {},
    origin: req.body.origin,
    extractor_options: { mode: "markdown" },
    num_tokens: 0,
  });

  return res.status(200).json({
    success: true,
    links: linksToReturn,
    scrape_id: req.body.origin?.includes("website") ? id : undefined,
  });
}

View File

@ -0,0 +1,6 @@
import { Request, Response } from "express";
/**
 * Readiness probe: unconditionally answers HTTP 200 with `{ status: "ok" }`.
 */
export async function readinessController(req: Request, res: Response) {
  // TODO: add checks when the application is ready to serve traffic
  const body = { status: "ok" };
  res.status(200).json(body);
}

View File

@ -0,0 +1,38 @@
import { Response } from "express";
import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs";
import { scrapeStatusRateLimiter } from "../../services/rate-limiter";
/**
 * Returns the first stored document of a scrape job.
 *
 * The endpoint is rate limited per caller address (the `x-forwarded-for`
 * header, falling back to the socket's remote address) and only serves jobs
 * that belong to one hard-coded team.
 *
 * Responses:
 *   - 200 `{ success: true, data }`  first document of the job (may be undefined)
 *   - 403 job belongs to a different team
 *   - 404 no job with that ID
 *   - 429 rate limit exceeded
 *   - 500 anything else
 */
export async function scrapeStatusController(req: any, res: any) {
  try {
    const rateLimiter = scrapeStatusRateLimiter;
    const incomingIP = (req.headers["x-forwarded-for"] ||
      req.socket.remoteAddress) as string;
    const iptoken = incomingIP;
    await rateLimiter.consume(iptoken);

    const job = await supabaseGetJobByIdOnlyData(req.params.jobId);

    // A missing job used to dereference null and end up as a 500.
    if (!job) {
      return res.status(404).json({
        success: false,
        error: "Job not found.",
      });
    }

    if (job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5") {
      return res.status(403).json({
        success: false,
        error: "You are not allowed to access this resource.",
      });
    }

    return res.status(200).json({
      success: true,
      data: job.docs?.[0],
    });
  } catch (error) {
    // NOTE(review): limiters in the style of rate-limiter-flexible reject with a
    // plain result object carrying `msBeforeNext` rather than an Error -- confirm
    // against scrapeStatusRateLimiter. Both shapes are treated as "rate limited".
    const rateLimited =
      (error instanceof Error && error.message == "Too Many Requests") ||
      (typeof error === "object" &&
        error !== null &&
        !(error instanceof Error) &&
        "msBeforeNext" in error);

    if (rateLimited) {
      return res.status(429).json({
        success: false,
        error: "Rate limit exceeded. Please try again later.",
      });
    } else {
      return res.status(500).json({
        success: false,
        error: "An unexpected error occurred.",
      });
    }
  }
}

View File

@ -0,0 +1,148 @@
import { Request, Response } from "express";
import { Logger } from "../../lib/logger";
import {
Document,
legacyDocumentConverter,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth,
ScrapeRequest,
scrapeRequestSchema,
ScrapeResponse,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
/**
 * Scrapes a single URL synchronously (v1 API).
 *
 * The request is validated, queued as a scrape job and awaited for at most
 * `timeout` milliseconds. On success the job is removed from the queue, the
 * team is billed (1 credit, or 5 when extraction was requested), the run is
 * logged and the converted document is returned.
 *
 * Responses:
 *   - 200 with the document, or with `warning: "No page found"` when the job
 *     produced no document
 *   - 408 when waiting for the job timed out
 *   - 500 for any other failure while waiting for the job
 */
export async function scrapeController(
  req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
  res: Response<ScrapeResponse>
) {
  req.body = scrapeRequestSchema.parse(req.body);

  const origin = req.body.origin;
  const timeout = req.body.timeout;
  const pageOptions = legacyScrapeOptions(req.body);
  const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
  const jobId = uuidv4();

  const startTime = new Date().getTime();
  const jobPriority = await getJobPriority({
    plan: req.auth.plan as PlanType,
    team_id: req.auth.team_id,
    basePriority: 10,
  });

  const job = await addScrapeJob(
    {
      url: req.body.url,
      mode: "single_urls",
      crawlerOptions: {},
      team_id: req.auth.team_id,
      pageOptions,
      extractorOptions,
      origin: req.body.origin,
      is_scrape: true,
    },
    {},
    jobId,
    jobPriority
  );

  let doc: any | undefined;
  try {
    doc = (await waitForJob(job.id, timeout))[0];
  } catch (e) {
    Logger.error(`Error in scrapeController: ${e}`);
    if (e instanceof Error && e.message.startsWith("Job wait")) {
      return res.status(408).json({
        success: false,
        error: "Request timed out",
      });
    } else {
      return res.status(500).json({
        success: false,
        error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
          extractorOptions && extractorOptions.mode !== "markdown"
            ? " - Could be due to LLM parsing issues"
            : ""
        }`,
      });
    }
  }

  await job.remove();

  if (!doc) {
    // The job finished but produced nothing; log through the shared logger.
    Logger.error(`!!! PANIC DOC IS ${JSON.stringify(doc)} (job ${job.id})`);
    return res.status(200).json({
      success: true,
      warning: "No page found",
      data: doc,
    });
  }

  // Internal bookkeeping fields are not part of the public document.
  delete doc.index;
  delete doc.provider;

  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;
  const numTokens =
    doc && doc.markdown
      ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
      : 0;

  let creditsToBeBilled = 1; // Assuming 1 credit per document
  if (req.body.extract && req.body.formats.includes("extract")) {
    creditsToBeBilled = 5;
  }

  // Billing is fire-and-forget; a failure is logged but does not fail the request.
  billTeam(req.auth.team_id, creditsToBeBilled).catch(error => {
    Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
    // Optionally, you could notify an admin or add to a retry queue here
  });

  if (!pageOptions || !pageOptions.includeRawHtml) {
    if (doc && doc.rawHtml) {
      delete doc.rawHtml;
    }
  }

  if (pageOptions && pageOptions.includeExtract) {
    if (!pageOptions.includeMarkdown && doc && doc.markdown) {
      delete doc.markdown;
    }
  }

  logJob({
    job_id: jobId,
    success: true,
    message: "Scrape completed",
    num_docs: 1,
    docs: [doc],
    time_taken: timeTakenInSeconds,
    team_id: req.auth.team_id,
    mode: "scrape",
    url: req.body.url,
    crawlerOptions: {},
    pageOptions: pageOptions,
    origin: origin,
    extractor_options: { mode: "markdown" },
    num_tokens: numTokens,
  });

  return res.status(200).json({
    success: true,
    data: legacyDocumentConverter(doc),
    scrape_id: origin?.includes("website") ? jobId : undefined,
  });
}

View File

@ -0,0 +1,380 @@
import { Request, Response } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types";
// Output formats a client can request for a scraped page.
// The same literals are accepted by the `formats` enum of the `scrapeOptions` schema below.
export type Format =
| "markdown"
| "html"
| "rawHtml"
| "links"
| "screenshot"
| "screenshot@fullPage"
| "extract";
/**
 * Zod schema for a user-supplied URL.
 *
 * Pre-processing: a string without a protocol gets `http://` prepended. Values
 * that are not strings are passed through untouched so that `z.string()`
 * reports the real problem (missing / wrong type) instead of validating an
 * artificial value such as "http://undefined".
 *
 * Validation, in order: must be a URL, must use http or https, must contain
 * something that looks like a top-level domain, must pass `checkUrl`, and must
 * not be on the blocklist.
 */
export const url = z.preprocess(
  (x) => {
    if (typeof x !== "string") {
      return x;
    }
    if (!protocolIncluded(x)) {
      return `http://${x}`;
    }
    return x;
  },
  z
    .string()
    .url()
    .regex(/^https?:\/\//, "URL uses unsupported protocol")
    .refine(
      // NOTE(review): this only matches a dot-suffix directly followed by "/" or the
      // end of the string, so hosts followed by ":port", "?" or "#" are rejected
      // unless the path happens to end in one -- confirm that this is intended.
      (x) => /\.[a-z]{2,}(\/|$)/i.test(x),
      "URL must have a valid top-level domain or be a valid path"
    )
    .refine(
      (x) => {
        try {
          checkUrl(x as string)
          return true;
        } catch (_) {
          return false;
        }
      },
      "Invalid URL"
    )
    .refine(
      (x) => !isUrlBlocked(x as string),
      "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
    )
);
// Error message attached to every `.strict()` schema in this file; reported when a request body contains an unknown key.
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
// Options for LLM-based extraction (the "extract" format).
//   mode         - only "llm" is accepted; it is also the default
//   schema       - optional, unvalidated (`z.any()`) schema describing the data to extract
//   systemPrompt - system prompt; has a built-in default
//   prompt       - optional user prompt
export const extractOptions = z.object({
mode: z.enum(["llm"]).default("llm"),
schema: z.any().optional(),
systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."),
prompt: z.string().optional()
}).strict(strictMessage);
// Parsed (output) shape of `extractOptions`.
export type ExtractOptions = z.infer<typeof extractOptions>;
// Per-page scrape options shared by the scrape request and (minus `timeout`) the crawl request.
// Unknown keys are rejected with `strictMessage`.
export const scrapeOptions = z.object({
// Requested output formats; defaults to ["markdown"] when omitted.
formats: z
.enum([
"markdown",
"html",
"rawHtml",
"links",
"screenshot",
"screenshot@fullPage",
"extract"
])
.array()
.optional()
.default(["markdown"]),
// Optional string-to-string header map.
headers: z.record(z.string(), z.string()).optional(),
// Optional tag lists; legacyScrapeOptions maps them to `onlyIncludeTags` / `removeTags`.
includeTags: z.string().array().optional(),
excludeTags: z.string().array().optional(),
onlyMainContent: z.boolean().default(true),
// Positive safe integer, default 30000. scrapeController passes it to waitForJob as the wait timeout.
timeout: z.number().int().positive().finite().safe().default(30000),
// Non-negative safe integer, default 0.
waitFor: z.number().int().nonnegative().finite().safe().default(0),
// Extraction settings; scrapeRequestSchema requires them exactly when "extract" is among `formats`.
extract: extractOptions.optional(),
parsePDF: z.boolean().default(true),
}).strict(strictMessage)
// Parsed (output) shape of `scrapeOptions`.
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
/**
 * Request body of the v1 scrape endpoint: `scrapeOptions` plus `url` and `origin`.
 *
 * Rules on top of the field validation:
 *   - the "extract" format and the `extract` options must be given together or
 *     not at all;
 *   - `timeout` defaults to 60000 ms for extraction requests and to 30000 ms
 *     otherwise.
 *
 * `timeout` is re-declared as optional here (instead of inheriting the 30000
 * default from `scrapeOptions`) so that the transform can tell "not provided"
 * apart from an explicit value; with the inherited default the 60000 ms branch
 * could never be reached. The parsed result always contains a numeric `timeout`.
 */
export const scrapeRequestSchema = scrapeOptions.extend({
  url,
  origin: z.string().optional().default("api"),
  timeout: z.number().int().positive().finite().safe().optional(),
}).strict(strictMessage).refine(
  (obj) => {
    const hasExtractFormat = obj.formats?.includes("extract");
    const hasExtractOptions = obj.extract !== undefined;
    return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
  },
  {
    message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
  }
).transform((obj) => {
  const wantsExtract = obj.formats?.includes("extract") || obj.extract !== undefined;
  return { ...obj, timeout: obj.timeout ?? (wantsExtract ? 60000 : 30000) };
});
// export type ScrapeRequest = {
// url: string;
// formats?: Format[];
// headers?: { [K: string]: string };
// includeTags?: string[];
// excludeTags?: string[];
// onlyMainContent?: boolean;
// timeout?: number;
// waitFor?: number;
// }
/** Parsed (output) shape of `scrapeRequestSchema`. */
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
// Crawl-level options shared by the crawl and map request schemas (both extend this object).
// Unknown keys are rejected with `strictMessage`.
const crawlerOptions = z.object({
// Path patterns; crawlController checks that each one compiles as a RegExp.
includePaths: z.string().array().default([]),
excludePaths: z.string().array().default([]),
maxDepth: z.number().default(10), // default?
limit: z.number().default(10000), // default?
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false),
// Defaults to true here; mapRequestSchema overrides the default to false.
ignoreSitemap: z.boolean().default(true),
}).strict(strictMessage);
// export type CrawlerOptions = {
// includePaths?: string[];
// excludePaths?: string[];
// maxDepth?: number;
// limit?: number;
// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
// allowExternalLinks?: boolean;
// ignoreSitemap?: boolean;
// };
// Parsed (output) shape of `crawlerOptions`.
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
// Request body of the v1 crawl endpoint: the crawler options plus the start URL,
// an origin tag, per-page scrape options (without `timeout`) and an optional webhook URL.
export const crawlRequestSchema = crawlerOptions.extend({
url,
origin: z.string().optional().default("api"),
// `timeout` is removed from the per-page options; everything else defaults via `{}`.
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
webhook: z.string().url().optional(),
// Re-declares `limit` with the same default as `crawlerOptions`.
limit: z.number().default(10000),
}).strict(strictMessage);
// export type CrawlRequest = {
// url: string;
// crawlerOptions?: CrawlerOptions;
// scrapeOptions?: Exclude<ScrapeRequest, "url">;
// };
// export type ExtractorOptions = {
// mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
// extractionPrompt?: string;
// extractionSchema?: Record<string, any>;
// }
// Parsed (output) shape of `crawlRequestSchema`.
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
// Request body of the v1 map endpoint: the crawler options plus the site URL,
// an origin tag, subdomain handling, an optional search term and a result limit.
export const mapRequestSchema = crawlerOptions.extend({
url,
origin: z.string().optional().default("api"),
includeSubdomains: z.boolean().default(true),
search: z.string().optional(),
// Overrides the crawler default: the sitemap is used unless the client opts out.
ignoreSitemap: z.boolean().default(false),
// Between 1 and 5000.
// NOTE(review): `.optional()` is applied after `.default(5000)` and mapController still
// falls back with `?? 5000` -- confirm whether the default is applied for a missing value.
limit: z.number().min(1).max(5000).default(5000).optional(),
}).strict(strictMessage);
// export type MapRequest = {
// url: string;
// crawlerOptions?: CrawlerOptions;
// };
// Parsed (output) shape of `mapRequestSchema`.
export type MapRequest = z.infer<typeof mapRequestSchema>;
// Public v1 representation of a scraped page, as produced by `legacyDocumentConverter`.
// Every content field is optional; `metadata` is always present.
export type Document = {
markdown?: string;
// Filled from the internal `llm_extraction` field.
extract?: string;
html?: string;
rawHtml?: string;
// Filled from the internal `linksOnPage` field.
links?: string[];
// Filled from the regular screenshot, falling back to the full-page screenshot.
screenshot?: string;
metadata: {
title?: string;
description?: string;
language?: string;
keywords?: string;
robots?: string;
ogTitle?: string;
ogDescription?: string;
ogUrl?: string;
ogImage?: string;
ogAudio?: string;
ogDeterminer?: string;
ogLocale?: string;
ogLocaleAlternate?: string[];
ogSiteName?: string;
ogVideo?: string;
dcTermsCreated?: string;
dcDateCreated?: string;
dcDate?: string;
dcTermsType?: string;
dcType?: string;
dcTermsAudience?: string;
dcTermsSubject?: string;
dcSubject?: string;
dcDescription?: string;
dcTermsKeywords?: string;
modifiedTime?: string;
publishedTime?: string;
articleTag?: string;
articleSection?: string;
sourceURL?: string;
// Filled from the internal `pageStatusCode` field.
statusCode?: number;
// Filled from the internal `pageError` field.
error?: string;
};
};
// Failure body shared by all v1 endpoints. `details` carries extra information
// (the global error handler puts the zod validation issues there).
export type ErrorResponse = {
success: false;
error: string;
details?: any;
};
// Body of the scrape endpoint. `scrape_id` is only set by scrapeController when the origin contains "website".
export type ScrapeResponse =
| ErrorResponse
| {
success: true;
warning?: string;
data: Document;
scrape_id?: string;
};
// Shape of an HTTP response carrying a ScrapeResponse body (status code, body, optional error).
export interface ScrapeResponseRequestTest {
statusCode: number;
body: ScrapeResponse;
error?: string;
}
// Body of the crawl endpoint: the crawl ID and the URL under which its status can be fetched.
export type CrawlResponse =
| ErrorResponse
| {
success: true;
id: string;
url: string;
};
// Body of the map endpoint. `scrape_id` is only set by mapController when the origin contains "website".
export type MapResponse =
| ErrorResponse
| {
success: true;
links: string[];
scrape_id?: string;
};
// Route parameters of the crawl status endpoints; `jobId` is the crawl ID.
export type CrawlStatusParams = {
jobId: string;
};
// Body of the crawl status endpoint (also the payload of the WebSocket "catchup" message).
//   completed / total - number of finished jobs / number of all jobs of the crawl
//   creditsUsed       - the controllers in this file set it to the total job count
//   expiresAt         - ISO timestamp of the crawl's expiry
//   next              - URL of the next page, omitted when there is nothing more to fetch
export type CrawlStatusResponse =
| ErrorResponse
| {
success: true;
status: "scraping" | "completed" | "failed" | "cancelled";
completed: number;
total: number;
creditsUsed: number;
expiresAt: string;
next?: string;
data: Document[];
};
// Identity attached to a request after authentication.
type AuthObject = {
team_id: string;
plan: PlanType;
};
// Account data attached to a request; crawlController uses `remainingCredits` to cap the crawl limit.
type Account = {
remainingCredits: number;
};
// Express request on which authentication may not have happened yet (`auth` is optional).
export interface RequestWithMaybeAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
> extends Request<ReqParams, ReqBody, ResBody> {
auth?: AuthObject;
account?: Account;
}
// Express request on which `auth` is guaranteed to be present; `account` stays optional.
export interface RequestWithAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined,
> extends Request<ReqParams, ReqBody, ResBody> {
auth: AuthObject;
account?: Account;
}
// Express response that may carry a `sentry` string; the global error handler uses it as the exception ID when set.
export interface ResponseWithSentry<
ResBody = undefined,
> extends Response<ResBody> {
sentry?: string,
}
/**
 * Translates v1 crawler options into the option names used by the legacy
 * (v0) crawler.
 *
 * `limit` is exposed both as `limit` and as `maxCrawledLinks`;
 * `generateImgAltText` is always false. `ignoreSitemap` is forwarded as well:
 * `crawlController` reads it from the returned object to decide whether the
 * sitemap is fetched, so without it the request option had no effect.
 *
 * @param x Parsed v1 crawler options.
 * @returns Options object in the legacy shape.
 */
export function legacyCrawlerOptions(x: CrawlerOptions) {
  return {
    includes: x.includePaths,
    excludes: x.excludePaths,
    maxCrawledLinks: x.limit,
    maxDepth: x.maxDepth,
    limit: x.limit,
    generateImgAltText: false,
    allowBackwardCrawling: x.allowBackwardLinks,
    allowExternalContentLinks: x.allowExternalLinks,
    ignoreSitemap: x.ignoreSitemap,
  };
}
/**
 * Translates v1 scrape options into the legacy `PageOptions` shape.
 *
 * Each requested format becomes a boolean flag; tag lists, `onlyMainContent`,
 * `waitFor`, `headers` and `parsePDF` are passed through under their legacy names.
 *
 * @param x Parsed v1 scrape options.
 * @returns The equivalent legacy page options.
 */
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
  const { includeTags, excludeTags, onlyMainContent, waitFor, headers, parsePDF } = x;
  // Membership lookups against the requested formats.
  const requested = new Set(x.formats);

  return {
    includeMarkdown: requested.has("markdown"),
    includeHtml: requested.has("html"),
    includeRawHtml: requested.has("rawHtml"),
    includeExtract: requested.has("extract"),
    onlyIncludeTags: includeTags,
    removeTags: excludeTags,
    onlyMainContent,
    waitFor,
    headers,
    includeLinks: requested.has("links"),
    screenshot: requested.has("screenshot"),
    fullPageScreenshot: requested.has("screenshot@fullPage"),
    parsePDF,
  };
}
/**
 * Translates v1 extract options into the legacy `ExtractorOptions` shape.
 *
 * `extractionPrompt` is what `generateCompletions` uses as the system prompt
 * and `userPrompt` as the user prompt, so they are filled from `systemPrompt`
 * and `prompt` respectively. (Previously `prompt` was used for both and
 * `systemPrompt` was silently dropped.)
 *
 * @param x Parsed v1 extract options.
 * @returns The equivalent legacy extractor options.
 */
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
  return {
    mode: x.mode ? "llm-extraction" : "markdown",
    extractionPrompt: x.systemPrompt ?? "Based on the information on the page, extract the information from the schema.",
    extractionSchema: x.schema,
    userPrompt: x.prompt ?? "",
  };
}
/**
 * Converts an internal (legacy) document into the public v1 `Document` shape.
 *
 * - `null` / `undefined` input yields `null`.
 * - Screenshots stored inside `metadata` are moved to the top level of the
 *   INPUT document (the input is mutated), and the result's `screenshot` is the
 *   regular screenshot, falling back to the full-page one.
 * - `linksOnPage` becomes `links`, `llm_extraction` becomes `extract`.
 * - In `metadata`, `pageError` / `pageStatusCode` are renamed to `error` /
 *   `statusCode`; the old keys are kept with the value `undefined`.
 *
 * A document without `metadata` is accepted and produces a metadata object
 * whose `error` and `statusCode` are `undefined`.
 *
 * @param doc Internal document, or null/undefined.
 * @returns The public document, or `null` when there is no input document.
 */
export function legacyDocumentConverter(doc: any): Document {
  if (doc === null || doc === undefined) return null;

  if (doc.metadata) {
    if (doc.metadata.screenshot) {
      doc.screenshot = doc.metadata.screenshot;
      delete doc.metadata.screenshot;
    }

    if (doc.metadata.fullPageScreenshot) {
      doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
      delete doc.metadata.fullPageScreenshot;
    }
  }

  return {
    markdown: doc.markdown,
    links: doc.linksOnPage,
    rawHtml: doc.rawHtml,
    html: doc.html,
    extract: doc.llm_extraction,
    screenshot: doc.screenshot ?? doc.fullPageScreenshot,
    metadata: {
      ...doc.metadata,
      pageError: undefined,
      pageStatusCode: undefined,
      // Optional chaining: `metadata` may be absent (see the guard above).
      error: doc.metadata?.pageError,
      statusCode: doc.metadata?.pageStatusCode,
    },
  };
}

View File

@ -1,7 +1,7 @@
import "dotenv/config";
import "./services/sentry"
import * as Sentry from "@sentry/node";
import express from "express";
import express, { NextFunction, Request, Response } from "express";
import bodyParser from "body-parser";
import cors from "cors";
import { getScrapeQueue } from "./services/queue-service";
@ -15,8 +15,12 @@ import { ScrapeEvents } from "./lib/scrape-events";
import http from 'node:http';
import https from 'node:https';
import CacheableLookup from 'cacheable-lookup';
import { v1Router } from "./routes/v1";
import expressWs from "express-ws";
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
import { ZodError } from "zod";
import { v4 as uuidv4 } from "uuid";
const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
@ -49,7 +53,8 @@ if (cluster.isMaster) {
}
});
} else {
const app = express();
const ws = expressWs(express());
const app = ws.app;
global.isProduction = process.env.IS_PRODUCTION === "true";
@ -82,6 +87,7 @@ if (cluster.isMaster) {
// register router
app.use(v0Router);
app.use("/v1", v1Router);
app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002;
@ -184,11 +190,42 @@ if (cluster.isMaster) {
res.send({ isProduction: global.isProduction });
});
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
if (err instanceof ZodError) {
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
} else {
next(err);
}
});
Sentry.setupExpressErrorHandler(app);
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry<ErrorResponse>, next: NextFunction) => {
if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) {
return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' });
}
const id = res.sentry ?? uuidv4();
let verbose = JSON.stringify(err);
if (verbose === "{}") {
if (err instanceof Error) {
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack,
});
}
}
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
});
Logger.info(`Worker ${process.pid} started`);
}
// const sq = getScrapeQueue();
// sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));

View File

@ -15,7 +15,8 @@ export async function generateCompletions(
// const schema = zodToJsonSchema(options.schema)
const schema = extractionOptions.extractionSchema;
const prompt = extractionOptions.extractionPrompt;
const systemPrompt = extractionOptions.extractionPrompt;
const prompt = extractionOptions.userPrompt;
const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider
@ -24,30 +25,35 @@ export async function generateCompletions(
switch (switchVariable) {
case "openAI":
const llm = new OpenAI();
try{
const completionResult = await generateOpenAICompletions({
client: llm,
document: document,
schema: schema,
prompt: prompt,
mode: mode,
});
// Validate the JSON output against the schema using AJV
const validate = ajv.compile(schema);
if (!validate(completionResult.llm_extraction)) {
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
throw new Error(
`JSON parsing error(s): ${validate.errors
?.map((err) => err.message)
.join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
);
}
try {
const completionResult = await generateOpenAICompletions({
client: llm,
document: document,
schema: schema,
prompt: prompt,
systemPrompt: systemPrompt,
mode: mode,
});
// Validate the JSON output against the schema using AJV
if (schema) {
const validate = ajv.compile(schema);
if (!validate(completionResult.llm_extraction)) {
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
throw new Error(
`JSON parsing error(s): ${validate.errors
?.map((err) => err.message)
.join(
", "
)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
);
}
}
return completionResult;
} catch (error) {
Logger.error(`Error generating completions: ${error}`);
throw error;
}
return completionResult;
} catch (error) {
Logger.error(`Error generating completions: ${error}`);
throw error;
}
default:
throw new Error("Invalid client");
}

View File

@ -16,7 +16,6 @@ function prepareOpenAIDoc(
document: Document,
mode: "markdown" | "raw-html"
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
let markdown = document.markdown;
let extractionTarget = document.markdown;
@ -33,34 +32,32 @@ function prepareOpenAIDoc(
// );
}
// count number of tokens
const numTokens = numTokensFromString(extractionTarget, "gpt-4");
if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters
extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
extractionTarget = extractionTarget.slice(0, maxTokens * modifier);
}
return [[{ type: "text", text: extractionTarget }], numTokens];
}
export async function generateOpenAICompletions({
client,
model = process.env.MODEL_NAME || "gpt-4o",
model = process.env.MODEL_NAME || "gpt-4o-mini",
document,
schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt,
systemPrompt = defaultPrompt,
prompt,
temperature,
mode
mode,
}: {
client: OpenAI;
model?: string;
document: Document;
schema: any; // This should be replaced with a proper Zod schema type when available
prompt?: string;
systemPrompt?: string;
temperature?: number;
mode: "markdown" | "raw-html";
}): Promise<Document> {
@ -70,45 +67,79 @@ export async function generateOpenAICompletions({
if (preparedDoc === null) {
return {
...document,
warning: "LLM extraction was not performed since the document's content is empty or missing.",
warning:
"LLM extraction was not performed since the document's content is empty or missing.",
};
}
const [content, numTokens] = preparedDoc;
const completion = await openai.chat.completions.create({
model,
messages: [
{
role: "system",
content: prompt,
},
{ role: "user", content },
],
tools: [
{
type: "function",
function: {
name: "extract_content",
description: "Extracts the content from the given webpage(s)",
parameters: schema,
let completion;
let llmExtraction;
if (prompt && !schema) {
const jsonCompletion = await openai.chat.completions.create({
model,
messages: [
{
role: "system",
content: systemPrompt,
},
},
],
tool_choice: { "type": "function", "function": {"name": "extract_content"}},
temperature,
});
{ role: "user", content },
{
role: "user",
content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
},
],
response_format: { type: "json_object" },
temperature,
});
const c = completion.choices[0].message.tool_calls[0].function.arguments;
try {
llmExtraction = JSON.parse(
jsonCompletion.choices[0].message.content.trim()
);
} catch (e) {
throw new Error("Invalid JSON");
}
} else {
completion = await openai.chat.completions.create({
model,
messages: [
{
role: "system",
content: systemPrompt,
},
{ role: "user", content },
],
tools: [
{
type: "function",
function: {
name: "extract_content",
description: "Extracts the content from the given webpage(s)",
parameters: schema,
},
},
],
tool_choice: { type: "function", function: { name: "extract_content" } },
temperature,
});
const c = completion.choices[0].message.tool_calls[0].function.arguments;
// Extract the LLM extraction content from the completion response
const llmExtraction = JSON.parse(c);
// Extract the LLM extraction content from the completion response
try {
llmExtraction = JSON.parse(c);
} catch (e) {
throw new Error("Invalid JSON");
}
}
// Return the document with the LLM extraction content added
return {
...document,
llm_extraction: llmExtraction,
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
warning:
numTokens > maxTokens
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
: undefined,
};
}

View File

@ -0,0 +1,40 @@
import { parseMarkdown } from '../html-to-markdown';
// Jest suite for parseMarkdown: each case feeds an HTML string (or an
// empty/null value) to the converter and checks the exact Markdown it resolves to.
describe('parseMarkdown', () => {
// A single paragraph should come back as its bare text.
it('should correctly convert simple HTML to Markdown', async () => {
const html = '<p>Hello, world!</p>';
const expectedMarkdown = 'Hello, world!';
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
});
// Nested inline (<strong>) and block (<ul>/<li>) elements are expected to map to
// Markdown emphasis and list syntax, separated by a blank line.
it('should convert complex HTML with nested elements to Markdown', async () => {
const html = '<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>';
const expectedMarkdown = 'Hello **bold** world!\n\n- List item';
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
});
// Empty input must resolve to an empty string rather than reject.
it('should return empty string when input is empty', async () => {
const html = '';
const expectedMarkdown = '';
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
});
// null is passed deliberately even though the parameter is typed as string;
// the expectation is the same empty-string result as for ''.
// NOTE(review): this may need a cast if the test is type-checked under strictNullChecks -- confirm.
it('should handle null input gracefully', async () => {
const html = null;
const expectedMarkdown = '';
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
});
// Malformed markup (unclosed or mismatched tags) should still yield the
// Markdown for whatever content could be recovered.
it('should handle various types of invalid HTML gracefully', async () => {
const invalidHtmls = [
{ html: '<html><p>Unclosed tag', expected: 'Unclosed tag' },
{ html: '<div><span>Missing closing div', expected: 'Missing closing div' },
{ html: '<p><strong>Wrong nesting</em></strong></p>', expected: '**Wrong nesting**' },
{ html: '<a href="http://example.com">Link without closing tag', expected: '[Link without closing tag](http://example.com)' }
];
// Cases are awaited one at a time so a failure points at a single input.
for (const { html, expected } of invalidHtmls) {
await expect(parseMarkdown(html)).resolves.toBe(expected);
}
});
});

View File

@ -0,0 +1,134 @@
import {
getJobPriority,
addJobPriority,
deleteJobPriority,
} from "../job-priority";
import { redisConnection } from "../../services/queue-service";
import { PlanType } from "../../types";
jest.mock("../../services/queue-service", () => ({
redisConnection: {
sadd: jest.fn(),
srem: jest.fn(),
scard: jest.fn(),
expire: jest.fn(),
},
}));
describe("Job Priority Tests", () => {
afterEach(() => {
jest.clearAllMocks();
});
test("addJobPriority should add job_id to the set and set expiration", async () => {
const team_id = "team1";
const job_id = "job1";
await addJobPriority(team_id, job_id);
expect(redisConnection.sadd).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
job_id
);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
);
});
test("deleteJobPriority should remove job_id from the set", async () => {
const team_id = "team1";
const job_id = "job1";
await deleteJobPriority(team_id, job_id);
expect(redisConnection.srem).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
job_id
);
});
// "standard" plan: with the mocked set size at 150 the base priority (10) is
// expected; at 250 the expected priority is 20.
test("getJobPriority should return correct priority based on plan and set length", async () => {
const team_id = "team1";
const plan: PlanType = "standard";
// 150 queued jobs: base priority expected.
(redisConnection.scard as jest.Mock).mockResolvedValue(150);
const priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(10);
// 250 queued jobs: expected priority rises to 20.
(redisConnection.scard as jest.Mock).mockResolvedValue(250);
const priorityExceeded = await getJobPriority({ plan, team_id });
expect(priorityExceeded).toBe(20); // basePriority + Math.ceil((250 - 200) * 0.2)
});
// Exercises the "hobby" and "free" plans with mocked set sizes, once at a size
// that yields the base priority and once at a size that yields a higher one.
test("getJobPriority should handle different plans correctly", async () => {
const team_id = "team1";
// hobby, 50 jobs: base priority expected.
(redisConnection.scard as jest.Mock).mockResolvedValue(50);
let plan: PlanType = "hobby";
let priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(10);
// hobby, 150 jobs: 25 expected.
(redisConnection.scard as jest.Mock).mockResolvedValue(150);
plan = "hobby";
priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(25); // basePriority + Math.ceil((150 - 100) * 0.3)
// free, 25 jobs: base priority expected.
(redisConnection.scard as jest.Mock).mockResolvedValue(25);
plan = "free";
priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(10);
// free, 60 jobs: 28 expected.
(redisConnection.scard as jest.Mock).mockResolvedValue(60);
plan = "free";
priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(28); // basePriority + Math.ceil((60 - 25) * 0.5)
});
test("addJobPriority should reset expiration time when adding new job", async () => {
const team_id = "team1";
const job_id1 = "job1";
const job_id2 = "job2";
await addJobPriority(team_id, job_id1);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
);
// Clear the mock calls
(redisConnection.expire as jest.Mock).mockClear();
// Add another job
await addJobPriority(team_id, job_id2);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
);
});
// Checks that addJobPriority requests a 60-second expiry on the team's set.
// NOTE(review): redisConnection is replaced by jest.fn() mocks in this file, so
// no real key ever expires here; the fake-timer advances and the final scard
// check only exercise the mocks, not Redis TTL behaviour.
test("Set should expire after 60 seconds", async () => {
const team_id = "team1";
const job_id = "job1";
jest.useFakeTimers();
await addJobPriority(team_id, job_id);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
);
// Fast-forward time by 59 seconds
jest.advanceTimersByTime(59000);
// Nothing should have queried the set size so far
expect(redisConnection.scard).not.toHaveBeenCalled();
// Fast-forward time by 2 more seconds (total 61 seconds)
jest.advanceTimersByTime(2000);
// Simulate the expired set: the mocked scard is told to report 0 members
(redisConnection.scard as jest.Mock).mockResolvedValue(0);
const setSize = await redisConnection.scard(`limit_team_id:${team_id}`);
expect(setSize).toBe(0);
// Restore real timers so later tests are unaffected
jest.useRealTimers();
});
});

View File

@ -0,0 +1,32 @@
import { checkTeamCredits } from "../services/billing/credit_billing";
import { Logger } from "./logger";
type checkCreditsResponse = {
status: number;
error: string | null;
}
/**
 * Checks whether a team can afford one credit and reports the outcome as an
 * HTTP-style status plus an optional error message.
 *
 * @param team_id - Team whose credits are checked.
 * @returns `{ status: 200, error: null }` when the check succeeds,
 *          `{ status: 402, ... }` when `checkTeamCredits` reports failure, and
 *          `{ status: 500, ... }` when the check itself throws (the error is logged).
 */
export const checkCredits = async (team_id: string): Promise<checkCreditsResponse> => {
  let hasCredits: boolean;

  try {
    // Ask the billing service whether a single credit is available.
    const creditCheck = await checkTeamCredits(team_id, 1);
    hasCredits = Boolean(creditCheck.success);
  } catch (error) {
    Logger.error(error);
    return {
      status: 500,
      error: "Error checking team credits. Please contact hello@firecrawl.com for help.",
    };
  }

  if (!hasCredits) {
    return { status: 402, error: "Insufficient credits" };
  }

  return { status: 200, error: null };
};

View File

@ -6,6 +6,7 @@ export type StoredCrawl = {
crawlerOptions: any;
pageOptions: any;
team_id: string;
plan: string;
robots?: string;
cancelled?: boolean;
createdAt: number;
@ -26,6 +27,14 @@ export async function getCrawl(id: string): Promise<StoredCrawl | null> {
return JSON.parse(x);
}
/**
 * Computes when the stored crawl record `crawl:<id>` will expire.
 *
 * The remaining time-to-live (milliseconds, from Redis PTTL) is added to the
 * time captured just before the lookup, and the result is truncated to a whole
 * second.
 *
 * The sum is done on epoch milliseconds rather than through
 * `Date.prototype.setMilliseconds`, which works on local-time fields and can
 * therefore land an hour off when the TTL spans a daylight-saving transition.
 *
 * @param id - Crawl identifier.
 * @returns The expiry instant, with milliseconds set to 0. If PTTL reports a
 *          negative value (Redis uses negative sentinels for "no expiry" and
 *          "no such key"), the result is effectively the current time.
 */
export async function getCrawlExpiry(id: string): Promise<Date> {
  // Capture "now" before the round trip, matching the moment the caller asked.
  const nowMs = Date.now();
  const ttlMs = await redisConnection.pttl("crawl:" + id);
  const expiresAtMs = nowMs + ttlMs;
  // Drop the sub-second part so the value is stable at second granularity.
  return new Date(Math.floor(expiresAtMs / 1000) * 1000);
}
export async function addCrawlJob(id: string, job_id: string) {
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
@ -38,13 +47,27 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
/**
 * Records a job as finished for the given crawl.
 *
 * The job id is added to the `jobs_done` set and pushed onto the
 * `jobs_done_ordered` list; both keys then get a 24-hour expiry that is only
 * applied when the key has none yet (`NX`).
 *
 * @param id - Crawl identifier.
 * @param job_id - Identifier of the job that finished.
 */
export async function addCrawlJobDone(id: string, job_id: string) {
  const doneSetKey = "crawl:" + id + ":jobs_done";
  const doneListKey = "crawl:" + id + ":jobs_done_ordered";
  const oneDayInSeconds = 24 * 60 * 60;

  await redisConnection.sadd(doneSetKey, job_id);
  await redisConnection.lpush(doneListKey, job_id);

  await redisConnection.expire(doneSetKey, oneDayInSeconds, "NX");
  await redisConnection.expire(doneListKey, oneDayInSeconds, "NX");
}
/**
 * Returns how many entries the crawl's `jobs_done_ordered` list holds.
 *
 * @param id - Crawl identifier.
 */
export async function getDoneJobsOrderedLength(id: string): Promise<number> {
  const orderedDoneKey = "crawl:" + id + ":jobs_done_ordered";
  const length = await redisConnection.llen(orderedDoneKey);
  return length;
}
/**
 * Reads a slice of the crawl's `jobs_done_ordered` list.
 *
 * @param id - Crawl identifier.
 * @param start - First list index to read (defaults to 0).
 * @param end - Last list index to read, passed straight to LRANGE (defaults to -1).
 * @returns The job ids in the requested range.
 */
export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
  const orderedDoneKey = "crawl:" + id + ":jobs_done_ordered";
  const jobIds = await redisConnection.lrange(orderedDoneKey, start, end);
  return jobIds;
}
/**
 * Reports whether every job registered for the crawl has been marked done,
 * by comparing the sizes of the `jobs_done` and `jobs` sets.
 *
 * @param id - Crawl identifier.
 */
export async function isCrawlFinished(id: string) {
  // Read the done count first, then the total, one after the other.
  const doneCount = await redisConnection.scard("crawl:" + id + ":jobs_done");
  const totalCount = await redisConnection.scard("crawl:" + id + ":jobs");
  return doneCount === totalCount;
}
/**
 * Checks whether the crawl's `finish` marker key exists.
 *
 * @param id - Crawl identifier.
 * @returns Whatever `redisConnection.exists` yields for the marker key
 *          (truthy when the key is present).
 */
export async function isCrawlFinishedLocked(id: string) {
  const finishKey = "crawl:" + id + ":finish";
  const existsResult = await redisConnection.exists(finishKey);
  return existsResult;
}
export async function finishCrawl(id: string) {
if (await isCrawlFinished(id)) {
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");

View File

@ -19,3 +19,4 @@ export class CustomError extends Error {
Object.setPrototypeOf(this, CustomError.prototype);
}
}

View File

@ -11,6 +11,8 @@ export interface Progress {
}
export type PageOptions = {
includeMarkdown?: boolean;
includeExtract?: boolean;
onlyMainContent?: boolean;
includeHtml?: boolean;
includeRawHtml?: boolean;
@ -24,8 +26,9 @@ export type PageOptions = {
parsePDF?: boolean;
removeTags?: string | string[];
onlyIncludeTags?: string | string[];
includeLinks?: boolean;
useFastMode?: boolean; // beta
disableJSDom?: boolean; // beta
disableJsDom?: boolean; // beta
atsv?: boolean; // beta
};
@ -33,6 +36,7 @@ export type ExtractorOptions = {
mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
extractionPrompt?: string;
extractionSchema?: Record<string, any>;
userPrompt?: string;
}
export type SearchOptions = {

View File

@ -0,0 +1,7 @@
To build the go-html-to-md library, run the following command:
```bash
cd apps/api/src/lib/go-html-to-md
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
chmod +x html-to-markdown.so
```

View File

@ -0,0 +1,14 @@
module html-to-markdown.go
go 1.19
require github.com/JohannesKaufmann/html-to-markdown v1.6.0
require (
github.com/PuerkitoBio/goquery v1.9.2 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/kr/pretty v0.3.0 // indirect
golang.org/x/net v0.25.0 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
)

View File

@ -0,0 +1,93 @@
github.com/JohannesKaufmann/html-to-markdown v1.6.0 h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k=
github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ=
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.7.1 h1:3bajkSilaCbjdKVsKdZjZCLBNPL9pYzrCakKaf4U49U=
github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=

View File

@ -0,0 +1,25 @@
package main
import (
"C"
"log"
md "github.com/JohannesKaufmann/html-to-markdown"
"github.com/JohannesKaufmann/html-to-markdown/plugin"
)
//export ConvertHTMLToMarkdown
func ConvertHTMLToMarkdown(html *C.char) *C.char {
converter := md.NewConverter("", true, nil)
converter.Use(plugin.GitHubFlavored())
markdown, err := converter.ConvertString(C.GoString(html))
if err != nil {
log.Fatal(err)
}
return C.CString(markdown)
}
// main is intentionally empty. Package main must declare it, but this code's
// only purpose is to expose the exported ConvertHTMLToMarkdown function.
func main() {
// This function is required for the main package
}

View File

@ -1,8 +1,68 @@
export async function parseMarkdown(html: string) {
import koffi from 'koffi';
import { join } from 'path';
import "../services/sentry"
import * as Sentry from "@sentry/node";
import dotenv from 'dotenv';
import { Logger } from './logger';
dotenv.config();
// TODO: add a timeout to the Go parser
/**
 * Singleton wrapper around the native HTML-to-Markdown shared library.
 *
 * The library is loaded through koffi the first time `getInstance()` is
 * called, and its `ConvertHTMLToMarkdown` function is bound once and reused.
 */
class GoMarkdownConverter {
  private static instance: GoMarkdownConverter;
  private convert: any;

  /** Loads the shared library that sits next to this module and binds the converter. */
  private constructor() {
    const sharedLibraryPath = join(__dirname, 'go-html-to-md/html-to-markdown.so');
    const nativeLibrary = koffi.load(sharedLibraryPath);
    this.convert = nativeLibrary.func('ConvertHTMLToMarkdown', 'string', ['string']);
  }

  /** Returns the shared converter, creating it on first use. */
  public static getInstance(): GoMarkdownConverter {
    if (GoMarkdownConverter.instance) {
      return GoMarkdownConverter.instance;
    }
    GoMarkdownConverter.instance = new GoMarkdownConverter();
    return GoMarkdownConverter.instance;
  }

  /**
   * Converts HTML to Markdown via the bound native function's `.async` form.
   *
   * @param html - HTML source to convert.
   * @returns A promise that resolves with the Markdown, or rejects with the
   *          error passed to the native call's callback.
   */
  public async convertHTMLToMarkdown(html: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      const onDone = (err: Error, res: string) => {
        if (err) {
          reject(err);
          return;
        }
        resolve(res);
      };
      this.convert.async(html, onDone);
    });
  }
}
export async function parseMarkdown(html: string): Promise<string> {
if (!html) {
return '';
}
try {
if (process.env.USE_GO_MARKDOWN_PARSER == "true") {
const converter = GoMarkdownConverter.getInstance();
let markdownContent = await converter.convertHTMLToMarkdown(html);
markdownContent = processMultiLineLinks(markdownContent);
markdownContent = removeSkipToContentLinks(markdownContent);
Logger.info(`HTML to Markdown conversion using Go parser successful`);
return markdownContent;
}
} catch (error) {
Sentry.captureException(error);
Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
}
// Fallback to TurndownService if Go parser fails or is not enabled
var TurndownService = require("turndown");
var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
var turndownPluginGfm = require('joplin-turndown-plugin-gfm');
const turndownService = new TurndownService();
turndownService.addRule("inlineLink", {
@ -21,29 +81,20 @@ export async function parseMarkdown(html: string) {
});
var gfm = turndownPluginGfm.gfm;
turndownService.use(gfm);
let markdownContent = "";
const turndownPromise = new Promise<string>((resolve, reject) => {
try {
const result = turndownService.turndown(html);
resolve(result);
} catch (error) {
reject("Error converting HTML to Markdown: " + error);
}
});
const timeoutPromise = new Promise<string>((resolve, reject) => {
const timeout = 5000; // Timeout in milliseconds
setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout);
});
try {
markdownContent = await Promise.race([turndownPromise, timeoutPromise]);
let markdownContent = await turndownService.turndown(html);
markdownContent = processMultiLineLinks(markdownContent);
markdownContent = removeSkipToContentLinks(markdownContent);
return markdownContent;
} catch (error) {
console.error(error);
console.error("Error converting HTML to Markdown: ", error);
return ""; // Optionally return an empty string or handle the error as needed
}
}
// multiple line links
function processMultiLineLinks(markdownContent: string): string {
let insideLinkContent = false;
let newMarkdownContent = "";
let linkOpenCount = 0;
@ -63,12 +114,14 @@ export async function parseMarkdown(html: string) {
newMarkdownContent += char;
}
}
markdownContent = newMarkdownContent;
return newMarkdownContent;
}
function removeSkipToContentLinks(markdownContent: string): string {
// Remove [Skip to Content](#page) and [Skip to content](#skip)
markdownContent = markdownContent.replace(
const newMarkdownContent = markdownContent.replace(
/\[Skip to Content\]\(#[^\)]*\)/gi,
""
);
return markdownContent;
return newMarkdownContent;
}

View File

@ -0,0 +1,91 @@
import { redisConnection } from "../../src/services/queue-service";
import { PlanType } from "../../src/types";
import { Logger } from "./logger";
const SET_KEY_PREFIX = "limit_team_id:";
/**
 * Registers a job in the team's active-job set and refreshes the set's TTL.
 *
 * Failures are logged, including the underlying error, and are not re-thrown,
 * so this bookkeeping step never fails the caller.
 *
 * @param team_id - Team that owns the job.
 * @param job_id - Job to record.
 */
export async function addJobPriority(team_id, job_id) {
  try {
    const setKey = SET_KEY_PREFIX + team_id;

    // Add scrape job id to the set
    await redisConnection.sadd(setKey, job_id);

    // Every addition pushes the expiry back to 60 seconds from now, so the set
    // only disappears after a minute without new jobs.
    await redisConnection.expire(setKey, 60);
  } catch (e) {
    // Include the cause; without it the log line cannot be diagnosed.
    Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id} -- ${e}`);
  }
}
/**
 * Removes a job from the team's active-job set.
 *
 * Failures are logged, including the underlying error, and are not re-thrown.
 *
 * @param team_id - Team that owns the job.
 * @param job_id - Job to remove.
 */
export async function deleteJobPriority(team_id, job_id) {
  try {
    const setKey = SET_KEY_PREFIX + team_id;

    // remove job_id from the set
    await redisConnection.srem(setKey, job_id);
  } catch (e) {
    // Include the cause; without it the log line cannot be diagnosed.
    Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id} -- ${e}`);
  }
}
/**
 * Derives a queue priority number for a team's next job from how many jobs the
 * team currently has in its active-job set.
 *
 * Each plan has a bucket limit and a modifier. While the set size is at or
 * below the limit the base priority is returned unchanged; above it, the
 * result grows by `ceil((size - limit) * modifier)`.
 *
 * @param plan - Team's plan; unrecognised plans use limit 25 and modifier 1.
 * @param team_id - Team whose active-job set is inspected.
 * @param basePriority - Priority returned while under the limit (default 10).
 * @returns The computed priority, or `basePriority` if the lookup fails
 *          (the failure is logged together with its cause).
 */
export async function getJobPriority({
  plan,
  team_id,
  basePriority = 10,
}: {
  plan: PlanType;
  team_id: string;
  basePriority?: number;
}): Promise<number> {
  try {
    const setKey = SET_KEY_PREFIX + team_id;

    // Number of jobs currently tracked for the team
    const setLength = await redisConnection.scard(setKey);

    // Pick the bucket limit and modifier for the plan
    let planModifier = 1;
    let bucketLimit = 0;

    switch (plan) {
      case "free":
        bucketLimit = 25;
        planModifier = 0.5;
        break;
      case "hobby":
        bucketLimit = 100;
        planModifier = 0.3;
        break;
      case "standard":
      case "standardnew":
        bucketLimit = 200;
        planModifier = 0.2;
        break;
      case "growth":
      case "growthdouble":
        bucketLimit = 400;
        planModifier = 0.1;
        break;
      default:
        bucketLimit = 25;
        planModifier = 1;
        break;
    }

    // While the set size is within the bucket limit, return the base priority
    if (setLength <= bucketLimit) {
      return basePriority;
    }

    // Over the limit: add the scaled overflow to the base priority
    return Math.ceil(
      basePriority + Math.ceil((setLength - bucketLimit) * planModifier)
    );
  } catch (e) {
    // Include the cause; without it the log line cannot be diagnosed.
    Logger.error(
      `Get job priority failed: ${team_id}, ${plan}, ${basePriority} -- ${e}`
    );
    return basePriority;
  }
}

View File

@ -1,3 +1,6 @@
import { configDotenv } from "dotenv";
configDotenv();
enum LogLevel {
NONE = 'NONE', // No logs will be output.
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
@ -25,7 +28,8 @@ export class Logger {
const color = Logger.colors[level];
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
// if (process.env.USE_DB_AUTH) {
// const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
// if (useDbAuthentication) {
// save to supabase? another place?
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
// }

View File

@ -0,0 +1,46 @@
import { Logger } from "./logger";
/**
 * Orders links by how closely they match a search query.
 *
 * The query is split into lower-cased word terms. Each text (every link, and
 * the query itself) becomes a vector holding, per term, the number of
 * occurrences divided by the text length. Links are then sorted by the cosine
 * similarity between their vector and the query's vector, best match first;
 * links with equal scores keep their input order.
 *
 * Empty terms (produced by `split` when the query starts or ends with
 * punctuation) are ignored, and an empty link scores 0 instead of NaN, so the
 * sort comparator always receives finite numbers.
 *
 * @param links - Links to rank; the array is not modified.
 * @param searchQuery - Free-text query.
 * @returns A new array of the links in ranked order. If anything throws, the
 *          error is logged and the input is returned as-is.
 */
export function performCosineSimilarity(links: string[], searchQuery: string): string[] {
  try {
    // Cosine of the angle between two equal-length vectors; 0 if either is all zeros.
    const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
      const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
      const magnitude1 = Math.sqrt(
        vec1.reduce((sum, val) => sum + val * val, 0)
      );
      const magnitude2 = Math.sqrt(
        vec2.reduce((sum, val) => sum + val * val, 0)
      );
      if (magnitude1 === 0 || magnitude2 === 0) return 0;
      return dotProduct / (magnitude1 * magnitude2);
    };

    // Terms contain only word characters (they are what is left between \W+
    // separators), so they are safe to use as regex sources. Compiled once
    // here instead of once per link.
    const termPatterns = searchQuery
      .toLowerCase()
      .split(/\W+/)
      .filter((term) => term.length > 0)
      .map((term) => new RegExp(term, "g"));

    // Per-term occurrence count normalised by the text length.
    const textToVector = (text: string): number[] => {
      if (text.length === 0) {
        // Avoid 0/0: an empty text contains no terms.
        return termPatterns.map(() => 0);
      }
      const lowered = text.toLowerCase();
      return termPatterns.map(
        (pattern) => (lowered.match(pattern) || []).length / text.length
      );
    };

    // The query vector is the same for every link, so build it once.
    const searchVector = textToVector(searchQuery);

    const ranked = links
      .map((link) => ({
        link,
        score: cosineSimilarity(textToVector(link), searchVector),
      }))
      .sort((a, b) => b.score - a.score);

    return ranked.map((item) => item.link);
  } catch (error) {
    Logger.error(`Error performing cosine similarity: ${error}`);
    return links;
  }
}

View File

@ -2,6 +2,8 @@ import { Job } from "bullmq";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
import { Logger } from "./logger";
import { configDotenv } from "dotenv";
configDotenv();
export type ScrapeErrorEvent = {
type: "error",
@ -36,7 +38,8 @@ export class ScrapeEvents {
static async insert(jobId: string, content: ScrapeEvent) {
if (jobId === "TEST") return null;
if (process.env.USE_DB_AUTHENTICATION) {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) {
try {
const result = await supabase.from("scrape_events").insert({
job_id: jobId,

View File

@ -1,10 +1,17 @@
import { supabase_service } from "../services/supabase";
import { Logger } from "./logger";
import * as Sentry from "@sentry/node";
/**
* Get a single firecrawl_job by ID
* @param jobId ID of Job
* @returns {any | null} Job
*/
export const supabaseGetJobById = async (jobId: string) => {
const { data, error } = await supabase_service
.from('firecrawl_jobs')
.select('*')
.eq('job_id', jobId)
.from("firecrawl_jobs")
.select("*")
.eq("job_id", jobId)
.single();
if (error) {
@ -16,15 +23,22 @@ export const supabaseGetJobById = async (jobId: string) => {
}
return data;
}
};
/**
* Get multiple firecrawl_jobs by ID. Use this if you're not requesting a lot (50+) of jobs at once.
* @param jobIds IDs of Jobs
* @returns {any[]} Jobs
*/
export const supabaseGetJobsById = async (jobIds: string[]) => {
const { data, error } = await supabase_service
.from('firecrawl_jobs')
.select('*')
.in('job_id', jobIds);
.from("firecrawl_jobs")
.select()
.in("job_id", jobIds);
if (error) {
Logger.error(`Error in supabaseGetJobsById: ${error}`);
Sentry.captureException(error);
return [];
}
@ -33,5 +47,47 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
}
return data;
}
};
/**
 * Fetch all firecrawl_jobs rows whose `crawl_id` equals the given crawl ID.
 * Use this if you need a lot of jobs at once.
 * @param crawlId ID of crawl
 * @returns {any[]} Jobs; an empty array when the query errors or returns no data
 */
export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
  const response = await supabase_service
    .from("firecrawl_jobs")
    .select()
    .eq("crawl_id", crawlId);

  const { data, error } = response;

  // Query failures are logged and reported to Sentry, then hidden from the caller.
  if (error) {
    Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
    Sentry.captureException(error);
    return [];
  }

  return data ? data : [];
};
/**
 * Get only the `docs` and `team_id` columns of a single firecrawl_job.
 * @param jobId ID of Job
 * @returns The selected columns, or null when the query errors or yields no row.
 *          Errors are neither logged nor reported by this helper.
 */
export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
  const { data, error } = await supabase_service
    .from("firecrawl_jobs")
    .select("docs, team_id")
    .eq("job_id", jobId)
    .single();

  const unusable = Boolean(error) || !data;
  return unusable ? null : data;
};

View File

@ -0,0 +1,159 @@
import { isSameDomain, removeDuplicateUrls } from "./validateUrl";
import { isSameSubdomain } from "./validateUrl";
// isSameDomain: two URLs belong to the same domain when they share the same
// base domain, regardless of protocol, "www." prefix or subdomain depth.
describe("isSameDomain", () => {
  it("should return true for a subdomain", () => {
    const result = isSameDomain("http://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return true for the same domain", () => {
    const result = isSameDomain("http://example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for different domains", () => {
    const result = isSameDomain("http://example.com", "http://another.com");
    expect(result).toBe(false);
  });

  it("should return true for a subdomain with different protocols", () => {
    const result = isSameDomain("https://sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  it("should return false for invalid URLs", () => {
    // An unparsable value on either side must yield false rather than throw.
    const result = isSameDomain("invalid-url", "http://example.com");
    expect(result).toBe(false);
    const result2 = isSameDomain("http://example.com", "invalid-url");
    expect(result2).toBe(false);
  });

  it("should return true for a subdomain with www prefix", () => {
    const result = isSameDomain("http://www.sub.example.com", "http://example.com");
    expect(result).toBe(true);
  });

  // Title corrected: this case exercises several nested subdomain levels,
  // not a "www." prefix.
  it("should return true for a deeply nested subdomain", () => {
    const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
    expect(result).toBe(true);
  });
});
// isSameSubdomain: these cases expect true only when both URLs have the same
// host once protocol and a leading "www." are ignored.
describe("isSameSubdomain", () => {
// Apex domain vs. one of its subdomains is NOT the same subdomain.
it("should return false for a subdomain", () => {
const result = isSameSubdomain("http://example.com", "http://docs.example.com");
expect(result).toBe(false);
});
it("should return true for the same subdomain", () => {
const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com");
expect(result).toBe(true);
});
it("should return false for different subdomains", () => {
const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com");
expect(result).toBe(false);
});
it("should return false for different domains", () => {
const result = isSameSubdomain("http://example.com", "http://another.com");
expect(result).toBe(false);
});
// An unparsable value on either side must yield false rather than throw.
it("should return false for invalid URLs", () => {
const result = isSameSubdomain("invalid-url", "http://example.com");
expect(result).toBe(false);
const result2 = isSameSubdomain("http://example.com", "invalid-url");
expect(result2).toBe(false);
});
it("should return true for the same subdomain with different protocols", () => {
const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com");
expect(result).toBe(true);
});
// A leading "www." is ignored before the hosts are compared.
it("should return true for the same subdomain with www prefix", () => {
const result = isSameSubdomain("http://www.docs.example.com", "http://docs.example.com");
expect(result).toBe(true);
});
it("should return false for a subdomain with www prefix and different subdomain", () => {
const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
expect(result).toBe(false);
});
});
// removeDuplicateUrls: URLs that differ only by protocol or by a leading
// "www." are expected to collapse into a single entry.
describe("removeDuplicateUrls", () => {
// All four spellings point at the same page; the https, non-www one is expected to win.
it("should remove duplicate URLs with different protocols", () => {
const urls = [
"http://example.com",
"https://example.com",
"http://www.example.com",
"https://www.example.com"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://example.com"]);
});
// Path, query string and fragment all take part in the identity of a URL.
it("should keep URLs with different paths", () => {
const urls = [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page1?param=1",
"https://example.com/page1#section1"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual([
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page1?param=1",
"https://example.com/page1#section1"
]);
});
it("should prefer https over http", () => {
const urls = [
"http://example.com",
"https://example.com"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://example.com"]);
});
it("should prefer non-www over www", () => {
const urls = [
"https://www.example.com",
"https://example.com"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://example.com"]);
});
it("should handle empty input", () => {
const urls: string[] = [];
const result = removeDuplicateUrls(urls);
expect(result).toEqual([]);
});
// Host case is not significant: both entries count as duplicates and the
// first-seen spelling is the one expected back.
it("should handle URLs with different cases", () => {
const urls = [
"https://EXAMPLE.com",
"https://example.com"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://EXAMPLE.com"]);
});
// With and without the root trailing slash count as the same URL; first one is kept.
it("should handle URLs with trailing slashes", () => {
const urls = [
"https://example.com",
"https://example.com/"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://example.com"]);
});
});

View File

@ -1,9 +1,8 @@
const protocolIncluded = (url: string) => {
export const protocolIncluded = (url: string) => {
// if :// not in the start of the url assume http (maybe https?)
// regex checks if :// appears before any .
return(/^([^.:]+:\/\/)/.test(url));
}
return /^([^.:]+:\/\/)/.test(url);
};
const getURLobj = (s: string) => {
// URL fails if we dont include the protocol ie google.com
@ -18,7 +17,6 @@ const getURLobj = (s: string) => {
};
export const checkAndUpdateURL = (url: string) => {
if (!protocolIncluded(url)) {
url = `http://${url}`;
}
@ -30,9 +28,143 @@ export const checkAndUpdateURL = (url: string) => {
const typedUrlObj = urlObj as URL;
if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL");
}
return { urlObj: typedUrlObj, url: url };
};
/**
 * Validates that `url` is a parsable http(s) URL and returns it unchanged.
 *
 * @param url The URL string to validate. It is not normalised or modified.
 * @returns The same `url` string that was passed in.
 * @throws Error("Invalid URL") when the string cannot be parsed or its
 *         protocol is neither http: nor https:.
 * @throws Error("Invalid URL. Invalid protocol.") when a second scheme
 *         directly follows the first one (e.g. http://http://example.com).
 */
export const checkUrl = (url: string) => {
  const { error, urlObj } = getURLobj(url);
  if (error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = urlObj as URL;

  if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  // A doubled scheme still parses (the host becomes "http"), so it has to be
  // caught on the raw string. Matching "<scheme>:[/]*<scheme>://" instead of
  // counting colons keeps explicit ports and userinfo on dot-less hosts valid,
  // e.g. http://localhost:3000.
  const doubledScheme = /^\s*[a-z][a-z\d+-]*:\/*[a-z][a-z\d+-]*:\/\//i;
  if (doubledScheme.test(url)) {
    throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com
  }

  return url;
};
/**
 * Same domain check
 * It checks if the domain of the url is the same as the base url
 * It accounts true for subdomains and www.subdomains
 *
 * The "domain" is taken to be the last two dot-separated labels of the
 * hostname after a leading "www." is removed. Hosts under a multi-label
 * suffix (for example a.co.uk and b.co.uk) therefore compare as equal.
 * IPv4 literals are compared as whole addresses, because their labels are
 * octets rather than domain levels.
 *
 * @param url URL to test
 * @param baseUrl URL whose domain `url` is compared against
 * @returns true when both parse and share the same domain; false otherwise,
 *          including when either value cannot be parsed
 */
export function isSameDomain(url: string, baseUrl: string) {
  const { urlObj: urlObj1, error: error1 } = getURLobj(url);
  const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);

  if (error1 || error2) {
    return false;
  }

  const stripWww = (hostname: string) =>
    hostname.startsWith("www.") ? hostname.slice(4) : hostname;
  const isIpv4Literal = (hostname: string) =>
    /^\d{1,3}(\.\d{1,3}){3}$/.test(hostname);
  const lastTwoLabels = (hostname: string) =>
    hostname.split(".").slice(-2).join(".");

  const host1 = stripWww((urlObj1 as URL).hostname);
  const host2 = stripWww((urlObj2 as URL).hostname);

  // Without this guard 192.168.1.1 and 10.0.1.1 would both reduce to "1.1".
  if (isIpv4Literal(host1) || isIpv4Literal(host2)) {
    return host1 === host2;
  }

  return lastTwoLabels(host1) === lastTwoLabels(host2);
}
/**
 * Same subdomain check
 * Both the domain (last two hostname labels) and the subdomain (every label
 * before those) must match. A leading "www." is ignored on both sides.
 * @param url URL to test
 * @param baseUrl URL to compare against
 * @returns true when both parse and have matching domain and subdomain;
 *          false otherwise, including when either value cannot be parsed
 */
export function isSameSubdomain(url: string, baseUrl: string) {
  const first = getURLobj(url);
  const second = getURLobj(baseUrl);

  if (first.error || second.error) {
    return false;
  }

  // Break a hostname into its domain part and whatever precedes it.
  const splitHost = (hostname: string) => {
    const withoutWww = hostname.startsWith('www.') ? hostname.slice(4) : hostname;
    const labels = withoutWww.split('.');
    return {
      domain: labels.slice(-2).join('.'),
      subdomain: labels.slice(0, -2).join('.'),
    };
  };

  const hostA = splitHost((first.urlObj as URL).hostname);
  const hostB = splitHost((second.urlObj as URL).hostname);

  // Check if the domains are the same and the subdomains are the same
  if (hostA.domain !== hostB.domain) {
    return false;
  }
  return hostA.subdomain === hostB.subdomain;
}
/**
 * Normalises a URL for the map endpoint: prepends "http://" when no protocol
 * is present, drops a trailing slash, validates that the result is a parsable
 * http(s) URL, and strips the query string.
 *
 * @param url URL as supplied by the caller, with or without protocol.
 * @returns `url`: the normalised string without query string or trailing
 *          slash; `urlObj`: the URL object parsed BEFORE the query string
 *          was stripped, so it still carries any query parameters.
 * @throws Error("Invalid URL") when the value cannot be parsed or its protocol
 *         is neither http: nor https:.
 */
export const checkAndUpdateURLForMap = (url: string) => {
  if (!protocolIncluded(url)) {
    url = `http://${url}`;
  }
  // remove last slash if present
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }

  const { error, urlObj } = getURLobj(url);
  if (error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = urlObj as URL;

  if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  // remove any query params
  url = url.split("?")[0].trim();

  // Removing the query can expose a slash that was hidden behind it
  // ("host/path/?a=1" -> "host/path/"), so strip it here as well.
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }

  return { urlObj: typedUrlObj, url: url };
};
/**
 * Collapses URLs that differ only by protocol or by a leading "www." into a
 * single entry.
 *
 * Two URLs are duplicates when hostname (minus "www."), path, query string and
 * fragment are equal. Among duplicates the first one seen is kept unless a
 * later one upgrades http: to https:, or has the same protocol and drops the
 * "www." prefix.
 *
 * Entries that cannot be parsed as URLs are kept verbatim (de-duplicated by
 * exact string) instead of aborting the whole operation.
 *
 * @param urls URLs to de-duplicate. The input array is not mutated.
 * @returns The surviving URL strings, ordered by first appearance of each
 *          duplicate group.
 */
export function removeDuplicateUrls(urls: string[]): string[] {
  // `parsed` is stored alongside the string so a collision never re-parses.
  const bestByKey = new Map<string, { url: string; parsed?: URL }>();

  for (const url of urls) {
    let parsed: URL;
    try {
      parsed = new URL(url);
    } catch {
      // NUL can never appear in a parsed hostname/path/query/fragment, so this
      // prefix keeps unparsable entries from colliding with real URL keys.
      const rawKey = `\u0000${url}`;
      if (!bestByKey.has(rawKey)) {
        bestByKey.set(rawKey, { url });
      }
      continue;
    }

    const hostname = parsed.hostname.replace(/^www\./, "");
    const key = `${hostname}${parsed.pathname}${parsed.search}${parsed.hash}`;

    const existing = bestByKey.get(key)?.parsed;
    if (existing === undefined) {
      bestByKey.set(key, { url, parsed });
      continue;
    }

    const upgradesToHttps =
      parsed.protocol === "https:" && existing.protocol === "http:";
    const dropsWww =
      parsed.protocol === existing.protocol &&
      !parsed.hostname.startsWith("www.") &&
      existing.hostname.startsWith("www.");

    if (upgradesToHttps || dropsWww) {
      // Map#set on an existing key keeps its original insertion position.
      bestByKey.set(key, { url, parsed });
    }
  }

  return Array.from(bestByKey.values(), (entry) => entry.url);
}

View File

@ -1,5 +1,8 @@
import { AuthResponse } from "../../src/types";
import { Logger } from "./logger";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
let warningCount = 0;
@ -7,7 +10,8 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
originalFunction: (...args: U) => Promise<T>
) {
return async function (...args: U): Promise<T> {
if (process.env.USE_DB_AUTHENTICATION === "false") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
if (warningCount < 5) {
Logger.warn("You're bypassing authentication");
warningCount++;
@ -17,6 +21,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
try {
return await originalFunction(...args);
} catch (error) {
Sentry.captureException(error);
Logger.error(`Error in withAuth function: ${error}`);
return { success: false, error: error.message } as T;
}

View File

@ -12,6 +12,8 @@ import { Document } from "../lib/entities";
import { supabase_service } from "../services/supabase";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
configDotenv();
export async function startWebScraperPipeline({
job,
@ -26,7 +28,12 @@ export async function startWebScraperPipeline({
mode: job.data.mode,
crawlerOptions: job.data.crawlerOptions,
extractorOptions: job.data.extractorOptions,
pageOptions: job.data.pageOptions,
pageOptions: {
...job.data.pageOptions,
...(job.data.crawl_id ? ({
includeRawHtml: true,
}): {}),
},
inProgress: (progress) => {
Logger.debug(`🐂 Job in progress ${job.id}`);
if (progress.currentDocument) {
@ -49,6 +56,7 @@ export async function startWebScraperPipeline({
team_id: job.data.team_id,
bull_job_id: job.id.toString(),
priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false,
})) as { success: boolean; message: string; docs: Document[] };
}
export async function runWebScraper({
@ -63,6 +71,7 @@ export async function runWebScraper({
team_id,
bull_job_id,
priority,
is_scrape=false,
}: RunWebScraperParams): Promise<RunWebScraperResult> {
try {
const provider = new WebScraperDataProvider();
@ -110,17 +119,15 @@ export async function runWebScraper({
})
: docs;
const billingResult = await billTeam(team_id, filteredDocs.length);
if (!billingResult.success) {
// throw new Error("Failed to bill team, no subscription was found");
return {
success: false,
message: "Failed to bill team, no subscription was found",
docs: [],
};
if(is_scrape === false) {
billTeam(team_id, filteredDocs.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
// This is where the returnvalue from the job is set
onSuccess(filteredDocs, mode);
@ -134,7 +141,8 @@ export async function runWebScraper({
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
try {
if (process.env.USE_DB_AUTHENTICATION === "true") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) {
const { data, error } = await supabase_service
.from("firecrawl_jobs")
.update({ docs: result })

View File

@ -1,11 +1,11 @@
import express from "express";
import { redisHealthController } from "../controllers/admin/redis-health";
import { redisHealthController } from "../controllers/v0/admin/redis-health";
import {
autoscalerController,
checkQueuesController,
cleanBefore24hCompleteJobsController,
queuesController,
} from "../controllers/admin/queue";
} from "../controllers/v0/admin/queue";
export const adminRouter = express.Router();

View File

@ -1,14 +1,14 @@
import express from "express";
import { crawlController } from "../../src/controllers/crawl";
import { crawlStatusController } from "../../src/controllers/crawl-status";
import { scrapeController } from "../../src/controllers/scrape";
import { crawlPreviewController } from "../../src/controllers/crawlPreview";
import { crawlJobStatusPreviewController } from "../../src/controllers/status";
import { searchController } from "../../src/controllers/search";
import { crawlCancelController } from "../../src/controllers/crawl-cancel";
import { keyAuthController } from "../../src/controllers/keyAuth";
import { livenessController } from "../controllers/liveness";
import { readinessController } from "../controllers/readiness";
import { crawlController } from "../../src/controllers/v0/crawl";
import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
import { scrapeController } from "../../src/controllers/v0/scrape";
import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
import { searchController } from "../../src/controllers/v0/search";
import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
import { keyAuthController } from "../../src/controllers/v0/keyAuth";
import { livenessController } from "../controllers/v0/liveness";
import { readinessController } from "../controllers/v0/readiness";
export const v0Router = express.Router();

164
apps/api/src/routes/v1.ts Normal file
View File

@ -0,0 +1,164 @@
import express, { NextFunction, Request, Response } from "express";
import { crawlController } from "../controllers/v1/crawl";
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { scrapeController } from "../../src/controllers/v1/scrape";
import { crawlStatusController } from "../controllers/v1/crawl-status";
import { mapController } from "../controllers/v1/map";
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
import { RateLimiterMode } from "../types";
import { authenticateUser } from "../controllers/auth";
import { createIdempotencyKey } from "../services/idempotency/create";
import { validateIdempotencyKey } from "../services/idempotency/validate";
import { checkTeamCredits } from "../services/billing/credit_billing";
import expressWs from "express-ws";
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
import { Logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel";
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";
/**
 * Builds a middleware that rejects the request with 402 when the
 * authenticated team does not have enough credits.
 *
 * @param minimum Credits required. When omitted (or 0) and the request has a
 *                body, the body's `limit` is used, defaulting to 1.
 * @returns Express middleware. On success it stores `remainingCredits` on
 *          `req.account` and calls `next()`; errors thrown while checking are
 *          forwarded to `next(err)`.
 */
function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
  return (req, res, next) => {
    (async () => {
      // Resolve the requirement per request. Writing back to `minimum` would
      // leak the first request's `limit` into every later request that goes
      // through this middleware instance.
      let required = minimum;
      if (!required && req.body) {
        required = (req.body as any)?.limit ?? 1;
      }

      const { success, remainingCredits } = await checkTeamCredits(req.auth.team_id, required);
      if (!success) {
        Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum: required, remainingCredits })}`);
        if (!res.headersSent) {
          res.status(402).json({ success: false, error: "Insufficient credits" });
        }
        // Never continue down the chain after a failed check, even if a
        // response had already been started elsewhere.
        return;
      }

      req.account = { remainingCredits };
      next();
    })()
      .catch(err => next(err));
  };
}
/**
 * Builds a middleware that authenticates the request via `authenticateUser`.
 *
 * @param rateLimiterMode Mode passed through to `authenticateUser`.
 * @returns Express middleware. On success it stores `{ team_id, plan }` on
 *          `req.auth` and calls `next()`. On failure it answers with the
 *          status and error returned by `authenticateUser` and stops the
 *          chain. Thrown errors are forwarded to `next(err)`.
 */
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
  return (req, res, next) => {
    (async () => {
      const { success, team_id, error, status, plan } = await authenticateUser(
        req,
        res,
        rateLimiterMode,
      );

      if (!success) {
        if (!res.headersSent) {
          res.status(status).json({ success: false, error });
        }
        // An unauthenticated request must never reach the next handler, even
        // when a response has already been started.
        return;
      }

      req.auth = { team_id, plan };
      next();
    })()
      .catch(err => next(err));
  };
}
/**
 * Enforces single use of the `x-idempotency-key` request header.
 *
 * Requests without the header pass straight through. With the header, the key
 * is validated; an invalid key is answered with 409 and the chain stops,
 * otherwise the key is recorded and the request continues. Thrown errors are
 * forwarded to `next(err)`.
 */
function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
  (async () => {
    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        if (!res.headersSent) {
          res.status(409).json({ success: false, error: "Idempotency key already used" });
        }
        // Do not run the handler for a rejected key, even when a response
        // has already been started.
        return;
      }
      // NOTE(review): not awaited — if createIdempotencyKey is async, a
      // rejection here would not reach next(err); confirm this is intended.
      createIdempotencyKey(req);
    }
    next();
  })()
    .catch(err => next(err));
}
/**
 * Rejects requests whose body `url` is a string that `isUrlBlocked` flags.
 * Blocked requests are answered with 403 and never reach the next handler;
 * everything else (including a missing body or a non-string url) passes on.
 */
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
  // Optional chaining: the body may be absent when nothing was parsed.
  const candidate = req.body?.url;
  if (typeof candidate === "string" && isUrlBlocked(candidate)) {
    if (!res.headersSent) {
      res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
    }
    // A blocked URL must not fall through, even when a response has already
    // been started.
    return;
  }
  next();
}
/**
 * Adapts a promise-returning controller to an Express handler: the controller
 * is invoked with `(req, res)` and any rejection of the returned promise is
 * passed to `next`.
 * @param controller Handler returning a promise.
 * @returns Express-compatible handler that itself returns nothing.
 */
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
  return function wrapped(req, res, next) {
    const pending = controller(req, res);
    const forwardError = (err: any) => next(err);
    pending.catch(forwardError);
  };
}
// Presumably this call is what makes `.ws()` available on the router created
// below (TODO confirm against the express-ws docs). The app instance built
// here is not kept or used again in this file.
expressWs(express());
// Router for the v1 API. Middleware order per route: authentication first,
// then credit check, blocklist and idempotency where applicable, then the
// controller.
export const v1Router = express.Router();
// Scrape a single URL; requires at least 1 credit.
v1Router.post(
"/scrape",
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
blocklistMiddleware,
wrap(scrapeController)
);
// Start a crawl; the credit minimum is taken from the request body `limit`
// (no explicit minimum is passed), and idempotency keys are honoured.
v1Router.post(
"/crawl",
authMiddleware(RateLimiterMode.Crawl),
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(crawlController)
);
// Map a site; requires at least 1 credit.
v1Router.post(
"/map",
authMiddleware(RateLimiterMode.Map),
checkCreditsMiddleware(1),
blocklistMiddleware,
wrap(mapController)
);
// Crawl status over HTTP.
v1Router.get(
"/crawl/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlStatusController)
);
// NOTE(review): no authMiddleware on this route — confirm that
// scrapeStatusController enforces access on its own.
v1Router.get(
"/scrape/:jobId",
wrap(scrapeStatusController)
);
// Crawl status over WebSocket; no middleware is attached at this level.
v1Router.ws(
"/crawl/:jobId",
crawlStatusWSController
);
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
// Cancel a crawl.
// NOTE(review): unlike the other controllers this one is not passed through
// wrap(), so a rejected promise would not be forwarded to next() — confirm
// the controller handles its own errors.
v1Router.delete(
"/crawl/:jobId",
authMiddleware(RateLimiterMode.Crawl),
crawlCancelController
);
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// // Auth route for key based authentication
// v1Router.get("/keyAuth", keyAuthController);
// // Search routes
// v0Router.post("/search", searchController);
// Health/Probe routes
// v1Router.get("/health/liveness", livenessController);
// v1Router.get("/health/readiness", readinessController);

View File

@ -34,4 +34,4 @@ it('should return a list of links on the firecrawl.ai page', async () => {
expect(Array.isArray(result.linksOnPage)).toBe(true);
expect(result.linksOnPage.length).toBeGreaterThan(0);
expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
}, 10000);
}, 15000);

View File

@ -309,6 +309,23 @@ export class WebCrawler {
return null;
}
/**
 * Collects the crawlable links found in an HTML document.
 * Every `<a>` element with a non-empty `href` is passed through
 * `filterURL`; results that are not `null` are returned in document order.
 * @param html Markup to scan.
 * @param url URL handed to `filterURL` together with each href.
 * @returns The filtered link URLs.
 */
public extractLinksFromHTML(html: string, url: string) {
  const $ = load(html);
  const found: string[] = [];

  $("a").each((_, anchor) => {
    const target = $(anchor).attr("href");
    if (!target) {
      return;
    }
    const accepted = this.filterURL(target, url);
    if (accepted !== null) {
      found.push(accepted);
    }
  });

  return found;
}
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
return [];
@ -352,15 +369,7 @@ export class WebCrawler {
links.push({ url, html: content, pageStatusCode, pageError });
}
$("a").each((_, element) => {
const href = $(element).attr("href");
if (href) {
const u = this.filterURL(href, url);
if (u !== null) {
links.push({ url: u, html: content, pageStatusCode, pageError });
}
}
});
links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
if (this.visited.size === 1) {
return links;

View File

@ -294,28 +294,32 @@ export class WebScraperDataProvider {
documents = await this.getSitemapData(this.urls[0], documents);
}
documents = this.applyPathReplacements(documents);
// documents = await this.applyImgAltText(documents);
if (
(this.extractorOptions.mode === "llm-extraction" ||
this.extractorOptions.mode === "llm-extraction-from-markdown") &&
this.mode === "single_urls"
) {
documents = await generateCompletions(
documents,
this.extractorOptions,
"markdown"
);
if (this.pageOptions.includeMarkdown) {
documents = this.applyPathReplacements(documents);
}
if (
this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
this.mode === "single_urls"
) {
documents = await generateCompletions(
documents,
this.extractorOptions,
"raw-html"
);
if (!this.pageOptions.includeHtml) {
for (let document of documents) {
delete document.html;
}
}
// documents = await this.applyImgAltText(documents);
if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
const extractionMode = this.extractorOptions?.mode ?? "markdown";
const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
if (
extractionMode === "llm-extraction" ||
extractionMode === "llm-extraction-from-markdown" ||
extractionMode === "llm-extraction-from-raw-html"
) {
documents = await generateCompletions(
documents,
this.extractorOptions,
completionMode
);
}
}
return documents.concat(pdfDocuments).concat(docxDocuments);
}
@ -347,6 +351,7 @@ export class WebScraperDataProvider {
});
return {
content: content,
markdown: content,
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
provider: "web-scraper",
};
@ -569,12 +574,24 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
replaceAllPathsWithAbsolutePaths: false,
parsePDF: true,
removeTags: [],
this.pageOptions = {
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
includeHtml: options.pageOptions?.includeHtml ?? false,
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
parsePDF: options.pageOptions?.parsePDF ?? true,
onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
removeTags: options.pageOptions?.removeTags ?? [],
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
waitFor: options.pageOptions?.waitFor ?? undefined,
headers: options.pageOptions?.headers ?? undefined,
includeLinks: options.pageOptions?.includeLinks ?? true,
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
screenshot: options.pageOptions?.screenshot ?? false,
useFastMode: options.pageOptions?.useFastMode ?? false,
disableJsDom: options.pageOptions?.disableJsDom ?? false,
atsv: options.pageOptions?.atsv ?? false
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =
@ -599,6 +616,8 @@ export class WebScraperDataProvider {
this.priority = options.priority;
this.teamId = options.teamId ?? null;
// make sure all urls start with https://
this.urls = this.urls.map((url) => {
if (!url.trim().startsWith("http")) {

View File

@ -55,7 +55,7 @@ export async function scrapWithFireEngine({
try {
const reqParams = await generateRequestParams(url);
let waitParam = reqParams["params"]?.wait ?? waitFor;
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
@ -69,15 +69,15 @@ export async function scrapWithFireEngine({
let engine = engineParam; // do we want fireEngineOptions as first choice?
Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
);
if (pageOptions?.useFastMode) {
fireEngineOptionsParam.engine = "tlsclient";
engine = "tlsclient";
}
Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
);
// atsv is only available for beta customers
const betaCustomersString = process.env.BETA_CUSTOMERS;
const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
@ -96,6 +96,7 @@ export async function scrapWithFireEngine({
const _response = await Sentry.startSpan({
name: "Call to fire-engine"
}, async span => {
return await axiosInstance.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint,
{
@ -104,12 +105,13 @@ export async function scrapWithFireEngine({
screenshot: screenshotParam,
fullPageScreenshot: fullPageScreenshotParam,
headers: headers,
pageOptions: pageOptions,
disableJsDom: pageOptions?.disableJsDom ?? false,
priority,
engine,
instantReturn: true,
...fireEngineOptionsParam,
atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [],
},
{
headers: {
@ -125,7 +127,7 @@ export async function scrapWithFireEngine({
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
}

View File

@ -23,12 +23,15 @@ import { clientSideError } from "../../strings";
dotenv.config();
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
export const baseScrapers = [
"fire-engine;chrome-cdp",
"fire-engine",
"scrapingBee",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"scrapingBeeLoad",
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
useFireEngine ? "fire-engine" : undefined,
useScrapingBee ? "scrapingBee" : undefined,
useFireEngine ? undefined : "playwright",
useScrapingBee ? "scrapingBeeLoad" : undefined,
"fetch",
].filter(Boolean);
@ -85,23 +88,23 @@ function getScrapingFallbackOrder(
});
let defaultOrder = [
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
"scrapingBee",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"scrapingBeeLoad",
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
useFireEngine ? "fire-engine" : undefined,
useScrapingBee ? "scrapingBee" : undefined,
useScrapingBee ? "scrapingBeeLoad" : undefined,
useFireEngine ? undefined : "playwright",
"fetch",
].filter(Boolean);
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
defaultOrder = [
"fire-engine",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
...defaultOrder.filter(
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
),
].filter(Boolean);
}
// if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
// defaultOrder = [
// "fire-engine",
// useFireEngine ? undefined : "playwright",
// ...defaultOrder.filter(
// (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
// ),
// ].filter(Boolean);
// }
const filteredDefaultOrder = defaultOrder.filter(
(scraper: (typeof baseScrapers)[number]) =>
@ -122,22 +125,42 @@ function getScrapingFallbackOrder(
export async function scrapSingleUrl(
jobId: string,
urlToScrap: string,
pageOptions: PageOptions = {
onlyMainContent: true,
includeHtml: false,
includeRawHtml: false,
waitFor: 0,
screenshot: false,
fullPageScreenshot: false,
headers: undefined,
},
extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown",
},
existingHtml: string = "",
pageOptions: PageOptions,
extractorOptions?: ExtractorOptions,
existingHtml?: string,
priority?: number,
teamId?: string
): Promise<Document> {
pageOptions = {
includeMarkdown: pageOptions.includeMarkdown ?? true,
includeExtract: pageOptions.includeExtract ?? false,
onlyMainContent: pageOptions.onlyMainContent ?? false,
includeHtml: pageOptions.includeHtml ?? false,
includeRawHtml: pageOptions.includeRawHtml ?? false,
waitFor: pageOptions.waitFor ?? undefined,
screenshot: pageOptions.screenshot ?? false,
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
headers: pageOptions.headers ?? undefined,
includeLinks: pageOptions.includeLinks ?? true,
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
parsePDF: pageOptions.parsePDF ?? true,
removeTags: pageOptions.removeTags ?? [],
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
useFastMode: pageOptions.useFastMode ?? false,
disableJsDom: pageOptions.disableJsDom ?? false,
atsv: pageOptions.atsv ?? false
}
if (extractorOptions) {
extractorOptions = {
mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
}
}
if (!existingHtml) {
existingHtml = "";
}
urlToScrap = urlToScrap.trim();
const attemptScraping = async (
@ -180,6 +203,7 @@ export async function scrapSingleUrl(
fireEngineOptions: {
engine: engine,
atsv: pageOptions.atsv,
disableJsDom: pageOptions.disableJsDom,
},
priority,
teamId,
@ -341,8 +365,8 @@ export async function scrapSingleUrl(
pageError = undefined;
}
if (text && text.trim().length >= 100) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
break;
}
if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
@ -364,20 +388,22 @@ export async function scrapSingleUrl(
let linksOnPage: string[] | undefined;
linksOnPage = extractLinks(rawHtml, urlToScrap);
if (pageOptions.includeLinks) {
linksOnPage = extractLinks(rawHtml, urlToScrap);
}
let document: Document;
if (screenshot && screenshot.length > 0) {
document = {
content: text,
markdown: text,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
extractorOptions.mode === "llm-extraction-from-raw-html"
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml
: undefined,
linksOnPage,
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
metadata: {
...metadata,
screenshot: screenshot,
@ -389,11 +415,11 @@ export async function scrapSingleUrl(
} else {
document = {
content: text,
markdown: text,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
extractorOptions.mode === "llm-extraction-from-raw-html"
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml
: undefined,
metadata: {
@ -402,7 +428,7 @@ export async function scrapSingleUrl(
pageStatusCode: pageStatusCode,
pageError: pageError,
},
linksOnPage,
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
};
}
@ -416,9 +442,9 @@ export async function scrapSingleUrl(
});
return {
content: "",
markdown: "",
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
html: "",
linksOnPage: [],
linksOnPage: pageOptions.includeLinks ? [] : undefined,
metadata: {
sourceURL: urlToScrap,
pageStatusCode: pageStatusCode,

View File

@ -17,6 +17,8 @@ const socialMediaBlocklist = [
'researchhub.com',
'youtube.com',
'corterix.com',
'southwest.com',
'ryanair.com'
];
const allowedKeywords = [

View File

@ -242,5 +242,13 @@ export const urlSpecificParams = {
engine: "chrome-cdp",
},
},
},
"lorealparis.hu":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
engine: "tlsclient",
},
},
}
};

View File

@ -39,16 +39,8 @@ export const excludeNonMainTags = [
"#search",
".share",
"#share",
".pagination",
"#pagination",
".widget",
"#widget",
".related",
"#related",
".tag",
"#tag",
".category",
"#category",
".cookie",
"#cookie"
];

View File

@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
description = soup('meta[name="description"]').attr("content") || null;
// Assuming the language is part of the URL as per the regex pattern
const pattern = /([a-zA-Z]+-[A-Z]{2})/;
const match = pattern.exec(url);
language = match ? match[1] : null;
language = soup('html').attr('lang') || null;
keywords = soup('meta[name="keywords"]').attr("content") || null;
robots = soup('meta[name="robots"]').attr("content") || null;

View File

@ -0,0 +1,45 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
dotenv.config();
/**
 * Runs a search query against the fire-engine `/search` endpoint.
 *
 * The endpoint base URL is read from `process.env.FIRE_ENGINE_BETA_URL`.
 * When that variable is unset or empty, no request is made: a warning is
 * printed and an empty result list is returned.
 *
 * @param q - The search query string, sent as `query` in the request body.
 * @param options - Search parameters. `lang`, `country`, `location`, `tbs`
 *   and `numResults` are forwarded as-is; `page` defaults to 1 when omitted.
 *   NOTE(review): `filter` is accepted but is not forwarded in the request
 *   body — confirm whether that is intentional.
 * @returns The response body returned by the endpoint, or `[]` when the
 *   endpoint is not configured or the response carries no body.
 * @throws Whatever `axios` throws for a failed request; errors are not
 *   caught here.
 */
export async function fireEngineMap(q: string, options: {
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  numResults: number;
  page?: number;
}): Promise<SearchResult[]> {
  // Bail out before doing any work when the endpoint is not configured.
  if (!process.env.FIRE_ENGINE_BETA_URL) {
    console.warn("(v1/map Beta) Results might differ from cloud offering currently.");
    return [];
  }

  const data = JSON.stringify({
    query: q,
    lang: options.lang,
    country: options.country,
    location: options.location,
    tbs: options.tbs,
    numResults: options.numResults,
    page: options.page ?? 1,
  });

  const config = {
    method: "POST",
    url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
    headers: {
      "Content-Type": "application/json",
    },
    data: data,
  };

  const response = await axios(config);

  // The previous guard tested `response` twice and never looked at the body,
  // so a response without data leaked `undefined` to callers expecting an array.
  if (response && response.data) {
    return response.data;
  }
  return [];
}

View File

@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
let proxies = null;
if (proxy) {
if (proxy.startsWith("https")) {

View File

@ -1,11 +1,9 @@
import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { serper_search } from "./serper";
export async function search({
query,
advanced = false,
@ -30,12 +28,20 @@ export async function search({
proxy?: string;
sleep_interval?: number;
timeout?: number;
}) : Promise<SearchResult[]> {
}): Promise<SearchResult[]> {
try {
if (process.env.SERPER_API_KEY ) {
return await serper_search(query, {num_results, tbs, filter, lang, country, location});
if (process.env.SERPER_API_KEY) {
return await serper_search(query, {
num_results,
tbs,
filter,
lang,
country,
location,
});
}
return await google_search(
return await googleSearch(
query,
advanced,
num_results,
@ -49,7 +55,6 @@ export async function search({
);
} catch (error) {
Logger.error(`Error in search function: ${error}`);
return []
return [];
}
// if process.env.SERPER_API_KEY is set, use serper
}

View File

@ -5,7 +5,7 @@ import { supabase_service } from "../supabase";
import { Logger } from "../../lib/logger";
import { getValue, setValue } from "../redis";
import { redlock } from "../redlock";
import * as Sentry from "@sentry/node";
const FREE_CREDITS = 500;
@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) {
]);
let couponCredits = 0;
let sortedCoupons = [];
if (coupons && coupons.length > 0) {
couponCredits = coupons.reduce(
(total, coupon) => total + coupon.credits,
0
);
sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits);
}
let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits);
// using coupon credits:
if (couponCredits > 0) {
// if there is no subscription and they have enough coupon credits
@ -175,9 +176,25 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
}
// Retrieve the team's active subscription and check for available coupons concurrently
const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
await Promise.all([
let cacheKeySubscription = `subscription_${team_id}`;
let cacheKeyCoupons = `coupons_${team_id}`;
// Try to get data from cache first
const [cachedSubscription, cachedCoupons] = await Promise.all([
getValue(cacheKeySubscription),
getValue(cacheKeyCoupons)
]);
let subscription, subscriptionError;
let coupons : {credits: number}[];
if (cachedSubscription && cachedCoupons) {
subscription = JSON.parse(cachedSubscription);
coupons = JSON.parse(cachedCoupons);
} else {
// If not in cache, retrieve from database
const [subscriptionResult, couponsResult] = await Promise.all([
supabase_service
.from("subscriptions")
.select("id, price_id, current_period_start, current_period_end")
@ -191,6 +208,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
.eq("status", "active"),
]);
subscription = subscriptionResult.data;
subscriptionError = subscriptionResult.error;
coupons = couponsResult.data;
// Cache the results for a minute, sub can be null and that's fine
await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null
await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute
}
let couponCredits = 0;
if (coupons && coupons.length > 0) {
couponCredits = coupons.reduce(
@ -199,30 +226,67 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
);
}
// If there are available coupons and they are enough for the operation
if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
}
// Free credits, no coupons
if (subscriptionError || !subscription) {
// If there is no active subscription but there are available coupons
if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
}
if (!subscription || subscriptionError) {
const { data: creditUsages, error: creditUsageError } =
await supabase_service
.from("credit_usage")
.select("credits_used")
.is("subscription_id", null)
.eq("team_id", team_id);
let creditUsages;
let creditUsageError;
let totalCreditsUsed = 0;
const cacheKeyCreditUsage = `credit_usage_${team_id}`;
if (creditUsageError) {
throw new Error(
`Failed to retrieve credit usage for team_id: ${team_id}`
// Try to get credit usage from cache
const cachedCreditUsage = await getValue(cacheKeyCreditUsage);
if (cachedCreditUsage) {
totalCreditsUsed = parseInt(cachedCreditUsage);
} else {
let retries = 0;
const maxRetries = 3;
const retryInterval = 2000; // 2 seconds
while (retries < maxRetries) {
// Reminder: this has a 1000 limit.
const result = await supabase_service
.from("credit_usage")
.select("credits_used")
.is("subscription_id", null)
.eq("team_id", team_id);
creditUsages = result.data;
creditUsageError = result.error;
if (!creditUsageError) {
break;
}
retries++;
if (retries < maxRetries) {
await new Promise(resolve => setTimeout(resolve, retryInterval));
}
}
if (creditUsageError) {
Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
throw new Error(
`Failed to retrieve credit usage for team_id: ${team_id}`
);
}
totalCreditsUsed = creditUsages.reduce(
(acc, usage) => acc + usage.credits_used,
0
);
}
const totalCreditsUsed = creditUsages.reduce(
(acc, usage) => acc + usage.credits_used,
0
);
// Cache the result for 30 seconds
await setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30);
}
Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
@ -230,9 +294,11 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
end.setDate(end.getDate() + 30);
// check if usage is within 80% of the limit
const creditLimit = FREE_CREDITS;
const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit;
const creditUsagePercentage = totalCreditsUsed / creditLimit;
if (creditUsagePercentage >= 0.8) {
// Add a check to ensure totalCreditsUsed is greater than 0
if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
Logger.info(`Sending notification for team ${team_id}. Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`);
await sendNotification(
team_id,
NotificationType.APPROACHING_LIMIT,
@ -242,7 +308,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
}
// 5. Compare the total credits used with the credits allowed by the plan.
if (totalCreditsUsed + credits > FREE_CREDITS) {
if (totalCreditsUsed >= FREE_CREDITS) {
// Send email notification for insufficient credits
await sendNotification(
team_id,
@ -286,7 +352,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (creditUsages && creditUsages.length > 0) {
totalCreditsUsed = creditUsages[0].total_credits_used;
await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for 8 minutes
// Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
}
}
@ -299,24 +365,47 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
// Adjust total credits used by subtracting coupon value
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
// Get the price details
const { data: price, error: priceError } = await supabase_service
.from("prices")
.select("credits")
.eq("id", subscription.price_id)
.single();
if (priceError) {
throw new Error(
`Failed to retrieve price for price_id: ${subscription.price_id}`
);
// Get the price details from cache or database
const priceCacheKey = `price_${subscription.price_id}`;
let price : {credits: number};
try {
const cachedPrice = await getValue(priceCacheKey);
if (cachedPrice) {
price = JSON.parse(cachedPrice);
} else {
const { data, error: priceError } = await supabase_service
.from("prices")
.select("credits")
.eq("id", subscription.price_id)
.single();
if (priceError) {
throw new Error(
`Failed to retrieve price for price_id: ${subscription.price_id}`
);
}
price = data;
// There are only 21 records, so this is super fine
// Cache the price for a long time (e.g., 1 day)
await setValue(priceCacheKey, JSON.stringify(price), 86400);
}
} catch (error) {
Logger.error(`Error retrieving or caching price: ${error}`);
Sentry.captureException(error);
// On error, assume a very large credit limit so the user doesn't get an error
price = { credits: 10000000 };
}
const creditLimit = price.credits;
const creditUsagePercentage = (adjustedCreditsUsed + credits) / creditLimit;
// Removal of + credits
const creditUsagePercentage = adjustedCreditsUsed / creditLimit;
// Compare the adjusted total credits used with the credits allowed by the plan
if (adjustedCreditsUsed + credits > price.credits) {
if (adjustedCreditsUsed >= price.credits) {
await sendNotification(
team_id,
NotificationType.LIMIT_REACHED,
@ -324,7 +413,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
subscription.current_period_end
);
return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
} else if (creditUsagePercentage >= 0.8) {
} else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
// Send email notification for approaching credit limit
await sendNotification(
team_id,
@ -439,8 +528,8 @@ async function createCreditUsage({
subscription_id?: string;
credits: number;
}) {
const { data: credit_usage } = await supabase_service
.from("credit_usage")
await supabase_service
.from("credit_usage")
.insert([
{
team_id,
@ -448,8 +537,7 @@ async function createCreditUsage({
subscription_id: subscription_id || null,
created_at: new Date(),
},
])
.select();
]);
return { success: true, credit_usage };
return { success: true };
}

View File

@ -1,9 +1,11 @@
import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";
import "dotenv/config";
import { configDotenv } from "dotenv";
configDotenv();
export async function logCrawl(job_id: string, team_id: string) {
if (process.env.USE_DB_AUTHENTICATION === 'true') {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) {
try {
const { data, error } = await supabase_service
.from("bulljobs_teams")

View File

@ -4,10 +4,13 @@ import { FirecrawlJob } from "../../types";
import { posthog } from "../posthog";
import "dotenv/config";
import { Logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();
export async function logJob(job: FirecrawlJob) {
try {
if (process.env.USE_DB_AUTHENTICATION === "false") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
return;
}

View File

@ -3,12 +3,15 @@ import { ScrapeLog } from "../../types";
import { supabase_service } from "../supabase";
import { PageOptions } from "../../lib/entities";
import { Logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();
export async function logScrape(
scrapeLog: ScrapeLog,
pageOptions?: PageOptions
) {
if (process.env.USE_DB_AUTHENTICATION === "false") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
Logger.debug("Skipping logging scrape to Supabase");
return;
}

View File

@ -8,10 +8,11 @@ async function addScrapeJobRaw(
webScraperOptions: any,
options: any,
jobId: string,
jobPriority: number = 10
): Promise<Job> {
return await getScrapeQueue().add(jobId, webScraperOptions, {
...options,
priority: webScraperOptions.crawl_id ? 20 : 10,
priority: jobPriority,
jobId,
});
}
@ -20,7 +21,9 @@ export async function addScrapeJob(
webScraperOptions: WebScraperOptions,
options: any = {},
jobId: string = uuidv4(),
jobPriority: number = 10
): Promise<Job> {
if (Sentry.isInitialized()) {
const size = JSON.stringify(webScraperOptions).length;
return await Sentry.startSpan({
@ -39,10 +42,31 @@ export async function addScrapeJob(
baggage: Sentry.spanToBaggageHeader(span),
size,
},
}, options, jobId);
}, options, jobId, jobPriority);
});
} else {
return await addScrapeJobRaw(webScraperOptions, options, jobId);
return await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority);
}
}
/**
 * Polls the scrape queue every 500 ms until the given job finishes.
 *
 * @param jobId - ID of the job to look up on the scrape queue.
 * @param timeout - Maximum time to wait, in milliseconds, measured from the
 *   moment this function is called.
 * @returns A promise that resolves with the job's `returnvalue` once its
 *   state is "completed". It rejects with the job's `failedReason` when the
 *   state is "failed", with `Error("Job wait ")` when the timeout elapses,
 *   or with the underlying error if a queue lookup throws.
 */
export function waitForJob(jobId: string, timeout: number) {
  return new Promise((resolve, reject) => {
    const start = Date.now();
    const int = setInterval(async () => {
      try {
        if (Date.now() >= start + timeout) {
          clearInterval(int);
          // NOTE(review): message kept byte-identical in case callers match on it — confirm.
          reject(new Error("Job wait "));
          return;
        }

        const state = await getScrapeQueue().getJobState(jobId);
        if (state === "completed") {
          clearInterval(int);
          resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
        } else if (state === "failed") {
          // console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason);
          clearInterval(int);
          reject((await getScrapeQueue().getJob(jobId)).failedReason);
        }
      } catch (error) {
        // Without this, a throw inside the async tick (queue error, or a job
        // lookup that yields nothing after the interval was already cleared)
        // became an unhandled rejection and left the promise pending forever.
        clearInterval(int);
        reject(error);
      }
    }, 500);
  });
}

View File

@ -16,6 +16,14 @@ export function getScrapeQueue() {
scrapeQueueName,
{
connection: redisConnection,
defaultJobOptions: {
removeOnComplete: {
age: 90000, // 25 hours
},
removeOnFail: {
age: 90000, // 25 hours
},
},
}
// {
// settings: {

View File

@ -1,5 +1,5 @@
import "dotenv/config";
import "./sentry"
import "./sentry";
import * as Sentry from "@sentry/node";
import { CustomError } from "../lib/custom-error";
import {
@ -17,10 +17,27 @@ import { Logger } from "../lib/logger";
import { Worker } from "bullmq";
import systemMonitor from "./system-monitor";
import { v4 as uuidv4 } from "uuid";
import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, getCrawlJobs, lockURL } from "../lib/crawl-redis";
import {
addCrawlJob,
addCrawlJobDone,
crawlToCrawler,
finishCrawl,
getCrawl,
getCrawlJobs,
lockURL,
} from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import {
addJobPriority,
deleteJobPriority,
getJobPriority,
} from "../../src/lib/job-priority";
import { PlanType } from "../types";
import { getJobs } from "../../src/controllers/v1/crawl-status";
import { configDotenv } from "dotenv";
configDotenv();
if (process.env.ENV === "production") {
initSDK({
@ -50,23 +67,24 @@ const processJobInternal = async (token: string, job: Job) => {
await job.extendLock(token, jobLockExtensionTime);
}, jobLockExtendInterval);
await addJobPriority(job.data.team_id, job.id);
let err = null;
try {
const result = await processJob(job, token);
try{
try {
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
await job.moveToCompleted(null, token, false);
} else {
await job.moveToCompleted(result.docs, token, false);
}
}catch(e){
}
} catch (e) {}
} catch (error) {
console.log("Job failed, error:", error);
Sentry.captureException(error);
err = error;
await job.moveToFailed(error, token, false);
} finally {
await deleteJobPriority(job.data.team_id, job.id);
clearInterval(extendLockInterval);
}
@ -80,7 +98,10 @@ process.on("SIGINT", () => {
isShuttingDown = true;
});
const workerFun = async (queueName: string, processJobInternal: (token: string, job: Job) => Promise<any>) => {
const workerFun = async (
queueName: string,
processJobInternal: (token: string, job: Job) => Promise<any>
) => {
const worker = new Worker(queueName, null, {
connection: redisConnection,
lockDuration: 1 * 60 * 1000, // 1 minute
@ -109,44 +130,60 @@ const workerFun = async (queueName: string, processJobInternal: (token: string,
const job = await worker.getNextJob(token);
if (job) {
if (job.data && job.data.sentry && Sentry.isInitialized()) {
Sentry.continueTrace({ sentryTrace: job.data.sentry.trace, baggage: job.data.sentry.baggage }, () => {
Sentry.startSpan({
Sentry.continueTrace(
{
sentryTrace: job.data.sentry.trace,
baggage: job.data.sentry.baggage,
},
() => {
Sentry.startSpan(
{
name: "Scrape job",
attributes: {
job: job.id,
worker: process.env.FLY_MACHINE_ID ?? worker.id,
},
},
async (span) => {
await Sentry.startSpan(
{
name: "Process scrape job",
op: "queue.process",
attributes: {
"messaging.message.id": job.id,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": job.data.sentry.size,
"messaging.message.receive.latency":
Date.now() - (job.processedOn ?? job.timestamp),
"messaging.message.retry.count": job.attemptsMade,
},
},
async () => {
const res = await processJobInternal(token, job);
if (res !== null) {
span.setStatus({ code: 2 }); // ERROR
} else {
span.setStatus({ code: 1 }); // OK
}
}
);
}
);
}
);
} else {
Sentry.startSpan(
{
name: "Scrape job",
attributes: {
job: job.id,
worker: process.env.FLY_MACHINE_ID ?? worker.id,
},
}, async (span) => {
await Sentry.startSpan({
name: "Process scrape job",
op: "queue.process",
attributes: {
"messaging.message.id": job.id,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": job.data.sentry.size,
"messaging.message.receive.latency": Date.now() - (job.processedOn ?? job.timestamp),
"messaging.message.retry.count": job.attemptsMade,
}
}, async () => {
const res = await processJobInternal(token, job);
if (res !== null) {
span.setStatus({ code: 2 }); // ERROR
} else {
span.setStatus({ code: 1 }); // OK
}
});
});
});
} else {
Sentry.startSpan({
name: "Scrape job",
attributes: {
job: job.id,
worker: process.env.FLY_MACHINE_ID ?? worker.id,
},
}, () => {
processJobInternal(token, job);
});
() => {
processJobInternal(token, job);
}
);
}
await sleep(gotJobInterval);
@ -163,13 +200,20 @@ async function processJob(job: Job, token: string) {
// Check if the job URL is researchhub and block it immediately
// TODO: remove this once solve the root issue
if (job.data.url && (job.data.url.includes("researchhub.com") || job.data.url.includes("ebay.com") || job.data.url.includes("youtube.com") || job.data.url.includes("microsoft.com") )) {
if (
job.data.url &&
(job.data.url.includes("researchhub.com") ||
job.data.url.includes("ebay.com") ||
job.data.url.includes("youtube.com") ||
job.data.url.includes("microsoft.com"))
) {
Logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
const data = {
success: false,
docs: [],
project_id: job.data.project_id,
error: "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
error:
"URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
};
await job.moveToCompleted(data.docs, token, false);
return data;
@ -188,9 +232,16 @@ async function processJob(job: Job, token: string) {
job,
token,
});
// Better if we throw here so we capture with the correct error
if (!success) {
throw new Error(message);
}
const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000;
const rawHtml = docs[0] ? docs[0].rawHtml : "";
const data = {
success,
result: {
@ -206,8 +257,26 @@ async function processJob(job: Job, token: string) {
docs,
};
if (job.data.mode === "crawl") {
await callWebhook(job.data.team_id, job.id as string, data);
// No idea what this does and when it is called.
if (job.data.mode === "crawl" && !job.data.v1) {
callWebhook(
job.data.team_id,
job.id as string,
data,
job.data.webhook,
job.data.v1
);
}
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
await callWebhook(
job.data.team_id,
job.data.crawl_id,
data,
job.data.webhook,
job.data.v1,
"crawl.page",
true
);
}
if (job.data.crawl_id) {
@ -229,35 +298,48 @@ async function processJob(job: Job, token: string) {
await addCrawlJobDone(job.data.crawl_id, job.id);
const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (!job.data.sitemapped) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
let linksOnPage = [];
try{
linksOnPage = data.docs[0]?.linksOnPage ?? [];
}catch(e){
linksOnPage = []
}
const links = crawler.filterLinks(
linksOnPage.map(href => crawler.filterURL(href.trim(), sc.originUrl))
.filter(x => x !== null),
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10
)
);
for (const link of links) {
if (await lockURL(job.data.crawl_id, sc, link)) {
const newJob = await addScrapeJob({
url: link,
mode: "single_urls",
crawlerOptions: sc.crawlerOptions,
// This seems to work really well
const jobPriority = await getJobPriority({
plan: sc.plan as PlanType,
team_id: sc.team_id,
pageOptions: sc.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
basePriority: job.data.crawl_id ? 20 : 10,
});
const jobId = uuidv4();
// console.log("plan: ", sc.plan);
// console.log("team_id: ", sc.team_id)
// console.log("base priority: ", job.data.crawl_id ? 20 : 10)
// console.log("job priority: " , jobPriority, "\n\n\n")
const newJob = await addScrapeJob(
{
url: link,
mode: "single_urls",
crawlerOptions: sc.crawlerOptions,
team_id: sc.team_id,
pageOptions: sc.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
v1: job.data.v1,
},
{},
jobId,
jobPriority
);
await addCrawlJob(job.data.crawl_id, newJob.id);
}
@ -266,67 +348,98 @@ async function processJob(job: Job, token: string) {
}
if (await finishCrawl(job.data.crawl_id)) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
const jobs = (await Promise.all(jobIDs.map(async x => {
if (x === job.id) {
return {
async getState() {
return "completed"
},
timestamp: Date.now(),
returnvalue: docs,
}
if (!job.data.v1) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map((x) => x.getState()));
const jobStatus =
sc.cancelled || jobStatuses.some((x) => x === "failed")
? "failed"
: "completed";
const fullDocs = jobs.map((x) =>
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
);
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : message,
num_docs: fullDocs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
const data = {
success: jobStatus !== "failed",
result: {
links: fullDocs.map((doc) => {
return {
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
};
}),
},
project_id: job.data.project_id,
error: message /* etc... */,
docs: fullDocs,
};
// v0 web hooks, call when done with all the data
if (!job.data.v1) {
callWebhook(
job.data.team_id,
job.data.crawl_id,
data,
job.data.webhook,
job.data.v1,
"crawl.completed"
);
}
} else {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
const jobStatuses = await Promise.all(jobIDs.map((x) => getScrapeQueue().getJobState(x)));
const jobStatus =
sc.cancelled || jobStatuses.some((x) => x === "failed")
? "failed"
: "completed";
const j = await getScrapeQueue().getJob(x);
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(j.id);
if (supabaseData) {
j.returnvalue = supabaseData.docs;
// v1 web hooks, call when done with no data, but with event completed
if (job.data.v1 && job.data.webhook) {
callWebhook(
job.data.team_id,
job.data.crawl_id,
[],
job.data.webhook,
job.data.v1,
"crawl.completed"
);
}
}
return j;
}))).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled || jobStatuses.some(x => x === "failed") ? "failed" : "completed";
const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : message,
num_docs: fullDocs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
const data = {
success: jobStatus !== "failed",
result: {
links: fullDocs.map((doc) => {
return {
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
};
}),
},
project_id: job.data.project_id,
error: message /* etc... */,
docs: fullDocs,
};
await callWebhook(job.data.team_id, job.data.crawl_id, data);
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : message,
num_docs: jobIDs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
}
}
}
@ -335,11 +448,13 @@ async function processJob(job: Job, token: string) {
} catch (error) {
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
Sentry.captureException(error, {
data: {
job: job.id
},
})
if (!(error instanceof Error && error.message.includes("JSON parsing error(s): "))) {
Sentry.captureException(error, {
data: {
job: job.id,
},
});
}
if (error instanceof CustomError) {
// Here we handle the error, then save the failed job
@ -369,8 +484,24 @@ async function processJob(job: Job, token: string) {
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
};
if (job.data.mode === "crawl" || job.data.crawl_id) {
await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
if (!job.data.v1 && (job.data.mode === "crawl" || job.data.crawl_id)) {
callWebhook(
job.data.team_id,
job.data.crawl_id ?? (job.id as string),
data,
job.data.webhook,
job.data.v1
);
}
if (job.data.v1) {
callWebhook(
job.data.team_id,
job.id as string,
[],
job.data.webhook,
job.data.v1,
"crawl.failed"
);
}
if (job.data.crawl_id) {
@ -380,7 +511,8 @@ async function processJob(job: Job, token: string) {
message:
typeof error === "string"
? error
: error.message ?? "Something went wrong... Contact help@mendable.ai",
: error.message ??
"Something went wrong... Contact help@mendable.ai",
num_docs: 0,
docs: [],
time_taken: 0,
@ -401,7 +533,8 @@ async function processJob(job: Job, token: string) {
message:
typeof error === "string"
? error
: error.message ?? "Something went wrong... Contact help@mendable.ai",
: error.message ??
"Something went wrong... Contact help@mendable.ai",
num_docs: 0,
docs: [],
time_taken: 0,

View File

@ -65,7 +65,7 @@ describe("Rate Limiter Service", () => {
"test-prefix:someToken",
"standard"
);
expect(limiter2.points).toBe(50);
expect(limiter2.points).toBe(100);
const limiter3 = getRateLimiter(
"search" as RateLimiterMode,
@ -79,7 +79,7 @@ describe("Rate Limiter Service", () => {
"test-prefix:someToken",
"growth"
);
expect(limiter4.points).toBe(150);
expect(limiter4.points).toBe(250);
});
it("should return the default rate limiter if plan is not provided", () => {
@ -153,7 +153,7 @@ describe("Rate Limiter Service", () => {
"crawlStatus" as RateLimiterMode,
"test-prefix:someToken"
);
expect(limiter2.points).toBe(150);
expect(limiter2.points).toBe(250);
});
it("should consume points correctly for 'crawl' mode", async () => {
@ -188,14 +188,13 @@ describe("Rate Limiter Service", () => {
"test-prefix:someTokenXY",
"hobby"
);
// expect hobby to have 100 points
expect(limiter.points).toBe(10);
expect(limiter.points).toBe(20);
const consumePoints = 5;
const res = await limiter.consume("test-prefix:someTokenXY", consumePoints);
expect(res.consumedPoints).toBe(5);
expect(res.remainingPoints).toBe(5);
expect(res.remainingPoints).toBe(15);
});
it("should return the correct rate limiter for 'crawl' mode", () => {
@ -227,7 +226,7 @@ describe("Rate Limiter Service", () => {
"test-prefix:someToken",
"free"
);
expect(limiter.points).toBe(5);
expect(limiter.points).toBe(10);
const limiter2 = getRateLimiter(
"scrape" as RateLimiterMode,
@ -241,7 +240,14 @@ describe("Rate Limiter Service", () => {
"test-prefix:someToken",
"standard"
);
expect(limiter3.points).toBe(50);
expect(limiter3.points).toBe(100);
const limiter4 = getRateLimiter(
"scrape" as RateLimiterMode,
"test-prefix:someToken",
"growth"
);
expect(limiter4.points).toBe(1000);
});
it("should return the correct rate limiter for 'search' mode", () => {
@ -309,7 +315,7 @@ describe("Rate Limiter Service", () => {
"crawlStatus" as RateLimiterMode,
"test-prefix:someToken"
);
expect(limiter2.points).toBe(150);
expect(limiter2.points).toBe(250);
});
it("should return the correct rate limiter for 'testSuite' mode", () => {

View File

@ -6,7 +6,7 @@ const RATE_LIMITS = {
crawl: {
default: 3,
free: 2,
starter: 3,
starter: 10,
standard: 5,
standardOld: 40,
scale: 50,
@ -17,9 +17,22 @@ const RATE_LIMITS = {
growthdouble: 50,
},
scrape: {
default: 20,
free: 10,
starter: 100,
standard: 100,
standardOld: 100,
scale: 500,
hobby: 20,
standardNew: 100,
standardnew: 100,
growth: 1000,
growthdouble: 1000,
},
search: {
default: 20,
free: 5,
starter: 20,
starter: 50,
standard: 50,
standardOld: 40,
scale: 500,
@ -29,12 +42,12 @@ const RATE_LIMITS = {
growth: 500,
growthdouble: 500,
},
search: {
map:{
default: 20,
free: 5,
starter: 20,
standard: 40,
standardOld: 40,
starter: 50,
standard: 50,
standardOld: 50,
scale: 500,
hobby: 10,
standardNew: 50,
@ -52,7 +65,7 @@ const RATE_LIMITS = {
},
crawlStatus: {
free: 150,
default: 150,
default: 250,
},
testSuite: {
free: 10000,
@ -91,6 +104,25 @@ export const devBRateLimiter = new RateLimiterRedis({
duration: 60, // Duration in seconds
});
/**
 * Redis-backed rate limiter stored under the "manual" key prefix and
 * configured with 2000 points per 60-second duration.
 *
 * getRateLimiter returns this limiter for any team ID listed in the
 * module-level `manual` array, before the per-mode RATE_LIMITS lookup.
 */
export const manualRateLimiter = new RateLimiterRedis({
storeClient: redisRateLimitClient,
keyPrefix: "manual",
points: 2000,
duration: 60, // Duration in seconds
});
/**
 * Redis-backed rate limiter stored under the "scrape-status" key prefix and
 * configured with 400 points per 60-second duration.
 *
 * NOTE(review): presumably applied to scrape status lookups, judging by the
 * name; the consumer is not visible in this section, so confirm at the call site.
 */
export const scrapeStatusRateLimiter = new RateLimiterRedis({
storeClient: redisRateLimitClient,
keyPrefix: "scrape-status",
points: 400,
duration: 60, // Duration in seconds
});
// Token fragments that select the test-suite limiter: getRateLimiter returns
// testSuiteRateLimiter when the caller's token contains any of these as a substring.
// NOTE(review): these are hard-coded in source and matched with `includes`, so any
// token that merely contains one of them qualifies; confirm this is intended.
const testSuiteTokens = ["a01ccae", "6254cf9", "0f96e673", "23befa1b", "69141c4"];
// Team IDs that getRateLimiter routes to manualRateLimiter instead of the
// per-mode limits.
// NOTE(review): hard-coded ID, presumably a manually granted override; confirm
// whether this belongs in configuration.
const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];
export function getRateLimiter(
mode: RateLimiterMode,
token: string,
@ -98,14 +130,18 @@ export function getRateLimiter(
teamId?: string
) {
if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673")) {
if (testSuiteTokens.some(testToken => token.includes(testToken))) {
return testSuiteRateLimiter;
}
if(teamId === process.env.DEV_B_TEAM_ID) {
if(teamId && teamId === process.env.DEV_B_TEAM_ID) {
return devBRateLimiter;
}
if(teamId && manual.includes(teamId)) {
return manualRateLimiter;
}
const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5}
if (!rateLimitConfig) return serverRateLimiter;

Some files were not shown because too many files have changed in this diff Show More