Merge branch 'main' into f/rust-sdk

Commit eec6d86802 by Rafael Miller, 2024-08-29 16:20:03 -03:00, committed by GitHub (GPG key ID: B5690EEEBB952194).
163 changed files with 18655 additions and 4793 deletions.


@@ -22,16 +22,19 @@ env:
   SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
   SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
   TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
+  SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}

 jobs:
   deploy:
     name: Deploy app
     runs-on: ubuntu-latest
+    timeout-minutes: 15
     steps:
       - uses: actions/checkout@v3
       - uses: superfly/flyctl-actions/setup-flyctl@master
-      - run: flyctl deploy --remote-only -a firecrawl-scraper-js
+      - run: flyctl deploy --remote-only -a firecrawl-scraper-js --build-secret SENTRY_AUTH_TOKEN=$SENTRY_AUTH_TOKEN
         working-directory: ./apps/api
         env:
           FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
           BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
+          SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
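For this workflow to resolve the new secret, SENTRY_AUTH_TOKEN has to exist as a repository Actions secret. A minimal sketch of provisioning it with the GitHub CLI (the token value is a placeholder, not part of this commit):

```bash
# Store the Sentry auth token as a GitHub Actions secret so that
# ${{ secrets.SENTRY_AUTH_TOKEN }} resolves when the workflow runs.
gh secret set SENTRY_AUTH_TOKEN --body "sntrys_placeholder_token"

# Confirm the secret is visible to workflows in this repository.
gh secret list
```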


@@ -27,6 +27,7 @@ env:
   PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
   NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
   CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
+  SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}

 jobs:
   pre-deploy-e2e-tests:
@@ -132,7 +133,7 @@ jobs:
         working-directory: ./apps/python-sdk
       - name: Run E2E tests for Python SDK
         run: |
-          pytest firecrawl/__tests__/e2e_withAuth/test.py
+          pytest firecrawl/__tests__/v1/e2e_withAuth/test.py
         working-directory: ./apps/python-sdk

   js-sdk-tests:
@@ -247,11 +248,12 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: superfly/flyctl-actions/setup-flyctl@master
-      - run: flyctl deploy --remote-only -a firecrawl-scraper-js
+      - run: flyctl deploy --remote-only -a firecrawl-scraper-js --build-secret SENTRY_AUTH_TOKEN=$SENTRY_AUTH_TOKEN
         working-directory: ./apps/api
         env:
           FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
           BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
+          SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}

   build-and-publish-python-sdk:
     name: Build and publish Python SDK

.gitignore (vendored, 2 changed lines)

@@ -19,3 +19,5 @@ apps/test-suite/load-test-results/test-run-report.json
 apps/playwright-service-ts/node_modules/
 apps/playwright-service-ts/package-lock.json

+*.pyc
+.rdb

.gitmodules (vendored, 8 changed lines)

@@ -1,6 +1,6 @@
-[submodule "apps/go-sdk/firecrawl"]
-	path = apps/go-sdk/firecrawl
+[submodule "apps/go-sdk/firecrawl-go"]
+	path = apps/go-sdk/firecrawl-go
 	url = https://github.com/mendableai/firecrawl-go
-[submodule "apps/go-sdk/examples"]
-	path = apps/go-sdk/examples
+[submodule "apps/go-sdk/firecrawl-go-examples"]
+	path = apps/go-sdk/firecrawl-go-examples
 	url = https://github.com/mendableai/firecrawl-go-examples
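Because both submodules move to new paths and section names, existing clones need their submodule configuration refreshed after pulling this change. A sketch of the usual sequence (the cleanup of old directories is an assumption about local state, not part of this commit):

```bash
# Re-read .gitmodules, then fetch the submodules at their new paths.
git submodule sync --recursive
git submodule update --init --recursive

# Optionally remove leftover checkouts at the old paths.
rm -rf apps/go-sdk/firecrawl apps/go-sdk/examples
```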


@@ -44,7 +44,6 @@ BULL_AUTH_KEY= @
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
-SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
 POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
 POSTHOG_HOST= # set if you'd like to send posthog events like job logs

README.md (300 changed lines)

@@ -6,7 +6,7 @@ _This repository is in its early development stages. We are still merging custom
 ## What is Firecrawl?

-[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required.
+[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. Check out our [documentation](https://docs.firecrawl.dev).

 _Pst. hey, you, join our stargazers :)_
@@ -41,18 +41,26 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
 Used to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl.

 ```bash
-curl -X POST https://api.firecrawl.dev/v0/crawl \
+curl -X POST https://api.firecrawl.dev/v1/crawl \
     -H 'Content-Type: application/json' \
-    -H 'Authorization: Bearer YOUR_API_KEY' \
+    -H 'Authorization: Bearer fc-YOUR_API_KEY' \
     -d '{
-      "url": "https://mendable.ai"
+      "url": "https://docs.firecrawl.dev",
+      "limit": 100,
+      "scrapeOptions": {
+        "formats": ["markdown", "html"]
+      }
     }'
 ```

-Returns a jobId
+Returns a crawl job id and the url to check the status of the crawl.

 ```json
-{ "jobId": "1234-5678-9101" }
+{
+  "success": true,
+  "id": "123-456-789",
+  "url": "https://api.firecrawl.dev/v1/crawl/123-456-789"
+}
 ```

 ### Check Crawl Job
@@ -60,7 +68,7 @@ Returns a jobId
 Used to check the status of a crawl job and get its result.

 ```bash
-curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
+curl -X GET https://api.firecrawl.dev/v1/crawl/123-456-789 \
     -H 'Content-Type: application/json' \
     -H 'Authorization: Bearer YOUR_API_KEY'
 ```
@@ -68,18 +76,20 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
 ```json
 {
   "status": "completed",
-  "current": 22,
-  "total": 22,
+  "total": 36,
+  "creditsUsed": 36,
+  "expiresAt": "2024-00-00T00:00:00.000Z",
   "data": [
     {
-      "content": "Raw Content ",
-      "markdown": "# Markdown Content",
-      "provider": "web-scraper",
+      "markdown": "[Firecrawl Docs home page![light logo](https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/logo/light.svg)!...",
+      "html": "<!DOCTYPE html><html lang=\"en\" class=\"js-focus-visible lg:[--scroll-mt:9.5rem]\" data-js-focus-visible=\"\">...",
       "metadata": {
-        "title": "Mendable | AI for CX and Sales",
-        "description": "AI for CX and Sales",
-        "language": null,
-        "sourceURL": "https://www.mendable.ai/"
+        "title": "Build a 'Chat with website' using Groq Llama 3 | Firecrawl",
+        "language": "en",
+        "sourceURL": "https://docs.firecrawl.dev/learn/rag-llama3",
+        "description": "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot.",
+        "ogLocaleAlternate": [],
+        "statusCode": 200
       }
     }
   ]
@@ -88,14 +98,15 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
 ### Scraping

-Used to scrape a URL and get its content.
+Used to scrape a URL and get its content in the specified formats.

 ```bash
-curl -X POST https://api.firecrawl.dev/v0/scrape \
+curl -X POST https://api.firecrawl.dev/v1/scrape \
     -H 'Content-Type: application/json' \
     -H 'Authorization: Bearer YOUR_API_KEY' \
     -d '{
-      "url": "https://mendable.ai"
+      "url": "https://docs.firecrawl.dev",
+      "formats" : ["markdown", "html"]
     }'
 ```
@@ -105,55 +116,83 @@ Response:
 {
   "success": true,
   "data": {
-    "content": "Raw Content ",
-    "markdown": "# Markdown Content",
-    "provider": "web-scraper",
+    "markdown": "Launch Week I is here! [See our Day 2 Release 🚀](https://www.firecrawl.dev/blog/launch-week-i-day-2-doubled-rate-limits)[💥 Get 2 months free...",
+    "html": "<!DOCTYPE html><html lang=\"en\" class=\"light\" style=\"color-scheme: light;\"><body class=\"__variable_36bd41 __variable_d7dc5d font-inter ...",
     "metadata": {
-      "title": "Mendable | AI for CX and Sales",
-      "description": "AI for CX and Sales",
-      "language": null,
-      "sourceURL": "https://www.mendable.ai/"
+      "title": "Home - Firecrawl",
+      "description": "Firecrawl crawls and converts any website into clean markdown.",
+      "language": "en",
+      "keywords": "Firecrawl,Markdown,Data,Mendable,Langchain",
+      "robots": "follow, index",
+      "ogTitle": "Firecrawl",
+      "ogDescription": "Turn any website into LLM-ready data.",
+      "ogUrl": "https://www.firecrawl.dev/",
+      "ogImage": "https://www.firecrawl.dev/og.png?123",
+      "ogLocaleAlternate": [],
+      "ogSiteName": "Firecrawl",
+      "sourceURL": "https://firecrawl.dev",
+      "statusCode": 200
     }
   }
 }
 ```

-### Search (Beta)
+### Map (Alpha)

-Used to search the web, get the most relevant results, scrape each page and return the markdown.
+Used to map a URL and get urls of the website. This returns most links present on the website.

-```bash
-curl -X POST https://api.firecrawl.dev/v0/search \
+```bash cURL
+curl -X POST https://api.firecrawl.dev/v1/map \
     -H 'Content-Type: application/json' \
     -H 'Authorization: Bearer YOUR_API_KEY' \
     -d '{
-      "query": "firecrawl",
-      "pageOptions": {
-        "fetchPageContent": true // false for a fast serp api
-      }
+      "url": "https://firecrawl.dev"
     }'
 ```

+Response:
+
 ```json
 {
-  "success": true,
-  "data": [
-    {
-      "url": "https://mendable.ai",
-      "markdown": "# Markdown Content",
-      "provider": "web-scraper",
-      "metadata": {
-        "title": "Mendable | AI for CX and Sales",
-        "description": "AI for CX and Sales",
-        "language": null,
-        "sourceURL": "https://www.mendable.ai/"
-      }
-    }
+  "status": "success",
+  "links": [
+    "https://firecrawl.dev",
+    "https://www.firecrawl.dev/pricing",
+    "https://www.firecrawl.dev/blog",
+    "https://www.firecrawl.dev/playground",
+    "https://www.firecrawl.dev/smart-crawl",
   ]
 }
 ```

-### Intelligent Extraction (Beta)
+#### Map with search
+
+Map with `search` param allows you to search for specific urls inside a website.
+
+```bash cURL
+curl -X POST https://api.firecrawl.dev/v1/map \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: Bearer YOUR_API_KEY' \
+    -d '{
+      "url": "https://firecrawl.dev",
+      "search": "docs"
+    }'
+```
+
+Response will be an ordered list from the most relevant to the least relevant.
+
+```json
+{
+  "status": "success",
+  "links": [
+    "https://docs.firecrawl.dev",
+    "https://docs.firecrawl.dev/sdks/python",
+    "https://docs.firecrawl.dev/learn/rag-llama3",
+  ]
+}
+```
+
+### LLM Extraction (v0) (Beta)

 Used to extract structured data from scraped pages.
@@ -220,6 +259,42 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \
 }
 ```

+### Search (v0) (Beta)
+
+Used to search the web, get the most relevant results, scrape each page and return the markdown.
+
+```bash
+curl -X POST https://api.firecrawl.dev/v0/search \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: Bearer YOUR_API_KEY' \
+    -d '{
+      "query": "firecrawl",
+      "pageOptions": {
+        "fetchPageContent": true // false for a fast serp api
+      }
+    }'
+```
+
+```json
+{
+  "success": true,
+  "data": [
+    {
+      "url": "https://mendable.ai",
+      "markdown": "# Markdown Content",
+      "provider": "web-scraper",
+      "metadata": {
+        "title": "Mendable | AI for CX and Sales",
+        "description": "AI for CX and Sales",
+        "language": null,
+        "sourceURL": "https://www.mendable.ai/"
+      }
+    }
+  ]
+}
+```
+
 ## Using Python SDK

 ### Installing Python SDK
@@ -231,24 +306,28 @@ pip install firecrawl-py
 ### Crawl a website

 ```python
-from firecrawl import FirecrawlApp
+from firecrawl.firecrawl import FirecrawlApp

-app = FirecrawlApp(api_key="YOUR_API_KEY")
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

-crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
+# Scrape a website:
+scrape_status = app.scrape_url(
+  'https://firecrawl.dev',
+  params={'formats': ['markdown', 'html']}
+)
+print(scrape_status)

-# Get the markdown
-for result in crawl_result:
-    print(result['markdown'])
-```
-
-### Scraping a URL
-
-To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
-
-```python
-url = 'https://example.com'
-scraped_data = app.scrape_url(url)
+# Crawl a website:
+crawl_status = app.crawl_url(
+  'https://firecrawl.dev',
+  params={
+    'limit': 100,
+    'scrapeOptions': {'formats': ['markdown', 'html']}
+  },
+  wait_until_done=True,
+  poll_interval=30
+)
+print(crawl_status)
 ```

 ### Extracting structured data from a URL
@@ -256,6 +335,11 @@ scraped_data = app.scrape_url(url)
 With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how you to use it:

 ```python
+from firecrawl.firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")
+
 class ArticleSchema(BaseModel):
     title: str
     points: int
@@ -277,15 +361,6 @@ data = app.scrape_url('https://news.ycombinator.com', {
 print(data["llm_extraction"])
 ```

-### Search for a query
-
-Performs a web search, retrieve the top results, extract data from each page, and returns their markdown.
-
-```python
-query = 'What is Mendable?'
-search_result = app.search(query)
-```
-
 ## Using the Node SDK

 ### Installation
@@ -301,54 +376,33 @@ npm install @mendable/firecrawl-js
 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.

-### Scraping a URL
-
-To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
-
 ```js
-try {
-  const url = "https://example.com";
-  const scrapedData = await app.scrapeUrl(url);
-  console.log(scrapedData);
-} catch (error) {
-  console.error("Error occurred while scraping:", error.message);
+import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';
+
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
+
+// Scrape a website
+const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
+  formats: ['markdown', 'html'],
+});
+
+if (scrapeResponse) {
+  console.log(scrapeResponse)
+}
+
+// Crawl a website
+const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
+  limit: 100,
+  scrapeOptions: {
+    formats: ['markdown', 'html'],
+  }
+} as CrawlParams, true, 30) as CrawlStatusResponse;
+
+if (crawlResponse) {
+  console.log(crawlResponse)
 }
 ```

-### Crawling a Website
-
-To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
-
-```js
-const crawlUrl = "https://example.com";
-const params = {
-  crawlerOptions: {
-    excludes: ["blog/"],
-    includes: [], // leave empty for all pages
-    limit: 1000,
-  },
-  pageOptions: {
-    onlyMainContent: true,
-  },
-};
-const waitUntilDone = true;
-const timeout = 5;
-const crawlResult = await app.crawlUrl(
-  crawlUrl,
-  params,
-  waitUntilDone,
-  timeout
-);
-```
-
-### Checking Crawl Status
-
-To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
-
-```js
-const status = await app.checkCrawlStatus(jobId);
-console.log(status);
-```
-
 ### Extracting structured data from a URL
@@ -360,6 +414,7 @@ import { z } from "zod";
 const app = new FirecrawlApp({
   apiKey: "fc-YOUR_API_KEY",
+  version: "v0"
 });

 // Define schema to extract contents into
@@ -384,19 +439,6 @@ const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
 console.log(scrapeResult.data["llm_extraction"]);
 ```

-### Search for a query
-
-With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.
-
-```js
-const query = "what is mendable?";
-const searchResults = await app.search(query, {
-  pageOptions: {
-    fetchPageContent: true, // Fetch the page content for each search result
-  },
-});
-```
-
 ## Contributing

 We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.


@@ -65,7 +65,6 @@ BULL_AUTH_KEY= @
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
-SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
 POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
 POSTHOG_HOST= # set if you'd like to send posthog events like job logs


@@ -32,8 +32,6 @@ BULL_AUTH_KEY=@
 LOGTAIL_KEY=
 # set if you have a llamaparse key you'd like to use to parse pdfs
 LLAMAPARSE_API_KEY=
-# set if you have a serper key you'd like to use as a search api
-SERPER_API_KEY=
 # set if you'd like to send slack server health status messages
 SLACK_WEBHOOK_URL=
 # set if you'd like to send posthog events like job logs

apps/api/.gitignore (vendored, 3 changed lines)

@@ -6,3 +6,6 @@ dump.rdb
 /mongo-data
 /.next/

+.rdb
+.sentryclirc


@@ -12,8 +12,10 @@ RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --prod --frozen-l
 FROM base AS build
 RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --frozen-lockfile
+RUN apt-get update -qq && apt-get install -y ca-certificates && update-ca-certificates

 RUN pnpm install
-RUN pnpm run build
+RUN --mount=type=secret,id=SENTRY_AUTH_TOKEN \
+    bash -c 'export SENTRY_AUTH_TOKEN="$(cat /run/secrets/SENTRY_AUTH_TOKEN)"; if [ -z $SENTRY_AUTH_TOKEN ]; then pnpm run build:nosentry; else pnpm run build; fi'

 # Install packages needed for deployment
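The new build step mounts SENTRY_AUTH_TOKEN as a BuildKit secret, so it is available during `pnpm run build` without being written into an image layer, and the build falls back to `build:nosentry` when the secret is absent. A sketch of building the image locally under that scheme (the token file name and image tag are assumptions, not from this diff):

```bash
# Build with the Sentry token exposed only to the RUN step that needs it.
echo -n "sntrys_placeholder_token" > .sentry_token
DOCKER_BUILDKIT=1 docker build \
  --secret id=SENTRY_AUTH_TOKEN,src=.sentry_token \
  -t firecrawl-api ./apps/api

# Without --secret, the conditional in the Dockerfile runs `pnpm run build:nosentry`.
DOCKER_BUILDKIT=1 docker build -t firecrawl-api ./apps/api
```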


@@ -24,8 +24,8 @@ kill_timeout = '30s'
   [http_service.concurrency]
     type = "requests"
-    hard_limit = 100
-    soft_limit = 50
+    # hard_limit = 100
+    soft_limit = 100

 [[http_service.checks]]
   grace_period = "10s"
@@ -51,12 +51,13 @@ kill_timeout = '30s'
   [services.concurrency]
     type = 'connections'
-    hard_limit = 25
-    soft_limit = 20
+    # hard_limit = 25
+    soft_limit = 100

 [[vm]]
-  size = 'performance-1x'
+  size = 'performance-2x'
   processes = ['app','worker']
+  memory = 8192


@@ -24,8 +24,8 @@ kill_timeout = '30s'
   [http_service.concurrency]
     type = "requests"
-    hard_limit = 200
-    soft_limit = 75
+    # hard_limit = 200
+    soft_limit = 200

 [[http_service.checks]]
   grace_period = "20s"
@@ -50,8 +50,8 @@ kill_timeout = '30s'
   [services.concurrency]
     type = 'connections'
-    hard_limit = 30
-    soft_limit = 12
+    # hard_limit = 30
+    soft_limit = 200

 [[vm]]
   size = 'performance-4x'

apps/api/openapi-v0.json (new file, 924 lines)

@@ -0,0 +1,924 @@
{
"openapi": "3.0.0",
"info": {
"title": "Firecrawl API",
"version": "0.0.0",
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
"contact": {
"name": "Firecrawl Support",
"url": "https://firecrawl.dev/support",
"email": "support@firecrawl.dev"
}
},
"servers": [
{
"url": "https://api.firecrawl.dev/v0"
}
],
"paths": {
"/scrape": {
"post": {
"summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeAndExtractFromUrl",
"tags": ["Scraping"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The URL to scrape"
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
},
"extractorOptions": {
"type": "object",
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
"default": {},
"properties": {
"mode": {
"type": "string",
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
}
},
"required": ["url"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ScrapeResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl": {
"post": {
"summary": "Crawl multiple URLs based on options",
"operationId": "crawlUrls",
"tags": ["Crawling"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The base URL to start crawling from"
},
"crawlerOptions": {
"type": "object",
"properties": {
"includes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to include"
},
"excludes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to exclude"
},
"generateImgAltText": {
"type": "boolean",
"description": "Generate alt text for images using LLMs (must have a paid plan)",
"default": false
},
"returnOnlyUrls": {
"type": "boolean",
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false
},
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
"default": "default"
},
"ignoreSitemap": {
"type": "boolean",
"description": "Ignore the website sitemap when crawling",
"default": false
},
"limit": {
"type": "integer",
"description": "Maximum number of pages to crawl",
"default": 10000
},
"allowBackwardCrawling": {
"type": "boolean",
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
"default": false
},
"allowExternalContentLinks": {
"type": "boolean",
"description": "Allows the crawler to follow links to external websites.",
"default": false
}
}
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
}
},
"required": ["url"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CrawlResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/search": {
"post": {
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
"operationId": "searchGoogle",
"tags": ["Search"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"query": {
"type": "string",
"format": "uri",
"description": "The query to search for"
},
"pageOptions": {
"type": "object",
"properties": {
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"fetchPageContent": {
"type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
}
}
},
"searchOptions": {
"type": "object",
"properties": {
"limit": {
"type": "integer",
"description": "Maximum number of results. Max is 20 during beta."
}
}
}
},
"required": ["query"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SearchResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl/status/{jobId}": {
"get": {
"tags": ["Crawl"],
"summary": "Get the status of a crawl job",
"operationId": "getCrawlStatus",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Status of the job (completed, active, failed, paused)"
},
"current": {
"type": "integer",
"description": "Current page number"
},
"total": {
"type": "integer",
"description": "Total number of pages"
},
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
}
}
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
}
}
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
}
},
"components": {
"securitySchemes": {
"bearerAuth": {
"type": "http",
"scheme": "bearer"
}
},
"schemas": {
"ScrapeResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"data": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
},
"llm_extraction": {
"type": "object",
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
"nullable": true
},
"warning": {
"type": "string",
"nullable": true,
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
}
}
}
}
},
"CrawlStatusResponseObj": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
},
"index": {
"type": "integer",
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
}
}
},
"SearchResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"url": {
"type": "string"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
}
}
}
}
}
}
}
},
"CrawlResponse": {
"type": "object",
"properties": {
"jobId": {
"type": "string"
}
}
}
}
},
"security": [
{
"bearerAuth": []
}
]
}
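The v0 spec above also documents cancelling a crawl job, which the updated README no longer shows. A hedged curl sketch against the documented /v0/crawl/cancel/{jobId} route (the job id and API key are placeholders):

```bash
curl -X DELETE https://api.firecrawl.dev/v0/crawl/cancel/1234-5678-9101 \
    -H 'Authorization: Bearer YOUR_API_KEY'

# Per the schema above, the response reports the job as cancelled:
# { "status": "cancelled" }
```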


@@ -18,8 +18,8 @@
   "paths": {
     "/scrape": {
       "post": {
-        "summary": "Scrape a single URL and optionally extract information using an LLM",
-        "operationId": "scrapeAndExtractFromUrl",
+        "summary": "Scrape a single URL",
+        "operationId": "scrape",
         "tags": ["Scraping"],
         "security": [
           {
@@ -38,94 +38,47 @@
            "format": "uri",
            "description": "The URL to scrape"
          },
-         "pageOptions": {
-           "type": "object",
-           "properties": {
-             "headers": {
-               "type": "object",
-               "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
-             },
-             "includeHtml": {
-               "type": "boolean",
-               "description": "Include the HTML version of the content on page. Will output a html key in the response.",
-               "default": false
-             },
-             "includeRawHtml": {
-               "type": "boolean",
-               "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
-               "default": false
-             },
-             "onlyIncludeTags": {
-               "type": "array",
-               "items": {
-                 "type": "string"
-               },
-               "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
-             },
-             "onlyMainContent": {
-               "type": "boolean",
-               "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
-               "default": false
-             },
-             "removeTags": {
-               "type": "array",
-               "items": {
-                 "type": "string"
-               },
-               "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
-             },
-             "replaceAllPathsWithAbsolutePaths": {
-               "type": "boolean",
-               "description": "Replace all relative paths with absolute paths for images and links",
-               "default": false
-             },
-             "screenshot": {
-               "type": "boolean",
-               "description": "Include a screenshot of the top of the page that you are scraping.",
-               "default": false
-             },
-             "fullPageScreenshot": {
-               "type": "boolean",
-               "description": "Include a full page screenshot of the page that you are scraping.",
-               "default": false
-             },
-             "waitFor": {
-               "type": "integer",
-               "description": "Wait x amount of milliseconds for the page to load to fetch content",
-               "default": 0
-             }
-           }
-         },
-         "extractorOptions": {
-           "type": "object",
-           "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
-           "default": {},
-           "properties": {
-             "mode": {
-               "type": "string",
-               "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
-               "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
-             },
-             "extractionPrompt": {
-               "type": "string",
-               "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
-             },
-             "extractionSchema": {
-               "type": "object",
-               "additionalProperties": true,
-               "description": "The schema for the data to be extracted, required only for LLM extraction modes.",
-               "required": [
-                 "company_mission",
-                 "supports_sso",
-                 "is_open_source"
-               ]
-             }
-           }
-         },
+         "formats": {
+           "type": "array",
+           "items": {
+             "type": "string",
+             "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
+           },
+           "description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
+           "default": ["markdown"]
+         },
+         "headers": {
+           "type": "object",
+           "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
+         },
+         "includeTags": {
+           "type": "array",
+           "items": {
+             "type": "string"
+           },
+           "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
+         },
+         "excludeTags": {
+           "type": "array",
+           "items": {
+             "type": "string"
+           },
+           "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+         },
+         "onlyMainContent": {
+           "type": "boolean",
+           "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
+           "default": true
+         },
          "timeout": {
            "type": "integer",
            "description": "Timeout in milliseconds for the request",
            "default": 30000
+         },
+         "waitFor": {
+           "type": "integer",
+           "description": "Wait x amount of milliseconds for the page to load to fetch content",
+           "default": 0
          }
        },
        "required": ["url"]
@@ -741,24 +694,42 @@
        "success": {
          "type": "boolean"
        },
+       "warning": {
+         "type": "string",
+         "nullable": true,
+         "description": "Warning message to let you know of any issues."
+       },
        "data": {
          "type": "object",
          "properties": {
            "markdown": {
-             "type": "string"
-           },
-           "content": {
-             "type": "string"
+             "type": "string",
+             "nullable": true,
+             "description": "Markdown content of the page if the `markdown` format was specified (default)"
            },
            "html": {
              "type": "string",
              "nullable": true,
-             "description": "HTML version of the content on page if `includeHtml` is true"
+             "description": "HTML version of the content on page if the `html` format was specified"
            },
            "rawHtml": {
              "type": "string",
              "nullable": true,
-             "description": "Raw HTML content of the page if `includeRawHtml` is true"
+             "description": "Raw HTML content of the page if the `rawHtml` format was specified"
+           },
+           "links": {
+             "type": "array",
+             "items": {
+               "type": "string",
+               "format": "uri"
+             },
+             "nullable": true,
+             "description": "Links on the page if the `links` format was specified"
+           },
+           "screenshot": {
+             "type": "string",
+             "nullable": true,
+             "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
            },
            "metadata": {
              "type": "object",
@@ -780,27 +751,16 @@
              "<any other metadata> ": {
                "type": "string"
              },
-             "pageStatusCode": {
+             "statusCode": {
                "type": "integer",
                "description": "The status code of the page"
              },
-             "pageError": {
+             "error": {
                "type": "string",
                "nullable": true,
                "description": "The error message of the page"
              }
            }
-         },
-         "llm_extraction": {
-           "type": "object",
-           "description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
-           "nullable": true
-         },
-         "warning": {
-           "type": "string",
-           "nullable": true,
-           "description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
          }
        }
      }
@@ -810,24 +770,33 @@
      "type": "object",
      "properties": {
        "markdown": {
-         "type": "string"
-       },
-       "content": {
-         "type": "string"
+         "type": "string",
+         "nullable": true,
+         "description": "Markdown content of the page if the `markdown` format was specified (default)"
        },
        "html": {
          "type": "string",
          "nullable": true,
-         "description": "HTML version of the content on page if `includeHtml` is true"
+         "description": "HTML version of the content on page if the `html` format was specified"
        },
        "rawHtml": {
          "type": "string",
          "nullable": true,
-         "description": "Raw HTML content of the page if `includeRawHtml` is true"
+         "description": "Raw HTML content of the page if the `rawHtml` format was specified"
        },
-       "index": {
-         "type": "integer",
-         "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
+       "links": {
+         "type": "array",
+         "items": {
+           "type": "string",
+           "format": "uri"
+         },
+         "nullable": true,
+         "description": "Links on the page if the `links` format was specified"
+       },
+       "screenshot": {
+         "type": "string",
+         "nullable": true,
+         "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
        },
        "metadata": {
          "type": "object",
@@ -849,11 +818,11 @@
          "<any other metadata> ": {
            "type": "string"
          },
-         "pageStatusCode": {
+         "statusCode": {
            "type": "integer",
            "description": "The status code of the page"
          },
-         "pageError": {
+         "error": {
            "type": "string",
            "nullable": true,
            "description": "The error message of the page"
@@ -871,34 +840,63 @@
      "data": {
        "type": "array",
        "items": {
-         "type": "object",
-         "properties": {
-           "url": {
-             "type": "string"
-           },
-           "markdown": {
-             "type": "string"
-           },
-           "content": {
-             "type": "string"
-           },
-           "metadata": {
-             "type": "object",
-             "properties": {
-               "title": {
-                 "type": "string"
-               },
-               "description": {
-                 "type": "string"
-               },
-               "language": {
-                 "type": "string",
-                 "nullable": true
-               },
-               "sourceURL": {
-                 "type": "string",
-                 "format": "uri"
-               }
+         "markdown": {
+           "type": "string",
+           "nullable": true,
+           "description": "Markdown content of the page if the `markdown` format was specified (default)"
+         },
+         "html": {
+           "type": "string",
+           "nullable": true,
+           "description": "HTML version of the content on page if the `html` format was specified"
+         },
+         "rawHtml": {
+           "type": "string",
+           "nullable": true,
+           "description": "Raw HTML content of the page if the `rawHtml` format was specified"
+         },
+         "links": {
+           "type": "array",
+           "items": {
+             "type": "string",
+             "format": "uri"
+           },
+           "nullable": true,
+           "description": "Links on the page if the `links` format was specified"
+         },
+         "screenshot": {
+           "type": "string",
+           "nullable": true,
+           "description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
+         },
+         "metadata": {
+           "type": "object",
+           "properties": {
+             "title": {
+               "type": "string"
+             },
+             "description": {
+               "type": "string"
+             },
+             "language": {
+               "type": "string",
+               "nullable": true
+             },
+             "sourceURL": {
+               "type": "string",
+               "format": "uri"
+             },
+             "<any other metadata> ": {
+               "type": "string"
+             },
+             "statusCode": {
+               "type": "integer",
+               "description": "The status code of the page"
+             },
+             "error": {
+               "type": "string",
+               "nullable": true,
+               "description": "The error message of the page"
              }
            }
          }
@@ -909,8 +907,15 @@
    "CrawlResponse": {
      "type": "object",
      "properties": {
-       "jobId": {
+       "success": {
+         "type": "boolean"
+       },
+       "id": {
          "type": "string"
+       },
+       "url": {
+         "type": "string",
+         "format": "uri"
        }
      }
    }
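Putting the reworked v1 request schema together, a scrape call that exercises the new top-level fields would look roughly like this (the URL and values are illustrative, not taken from this diff):

```bash
curl -X POST https://api.firecrawl.dev/v1/scrape \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer fc-YOUR_API_KEY' \
    -d '{
      "url": "https://docs.firecrawl.dev",
      "formats": ["markdown", "links"],
      "onlyMainContent": true,
      "excludeTags": ["script", ".ad", "#footer"],
      "waitFor": 1000,
      "timeout": 30000
    }'
```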


@@ -9,7 +9,8 @@
     "format": "prettier --write \"src/**/*.(js|ts)\"",
     "flyio": "node dist/src/index.js",
     "start:dev": "nodemon --exec ts-node src/index.ts",
-    "build": "tsc",
+    "build": "tsc && pnpm sentry:sourcemaps",
+    "build:nosentry": "tsc",
     "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
     "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
     "test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
@@ -19,8 +20,9 @@
     "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
     "mongo-docker-console": "docker exec -it mongodb mongosh",
     "run-example": "npx ts-node src/example.ts",
-    "deploy:fly": "flyctl deploy",
-    "deploy:fly:staging": "fly deploy -c fly.staging.toml"
+    "deploy:fly": "flyctl deploy --build-secret SENTRY_AUTH_TOKEN=$(dotenv -p SENTRY_AUTH_TOKEN)",
+    "deploy:fly:staging": "fly deploy -c fly.staging.toml",
+    "sentry:sourcemaps": "sentry-cli sourcemaps inject --org caleb-peffer --project firecrawl-scraper-js ./dist && sentry-cli sourcemaps upload --org caleb-peffer --project firecrawl-scraper-js ./dist"
   },
   "author": "",
   "license": "ISC",
@@ -29,7 +31,6 @@
     "@jest/globals": "^29.7.0",
     "@tsconfig/recommended": "^1.0.3",
     "@types/body-parser": "^1.19.2",
-    "@types/bull": "^4.10.0",
     "@types/cors": "^2.8.13",
     "@types/express": "^4.17.17",
     "@types/jest": "^29.5.12",
@@ -53,17 +54,21 @@
     "@bull-board/express": "^5.20.5",
     "@devil7softwares/pos": "^1.0.2",
     "@dqbd/tiktoken": "^1.0.13",
-    "@hyperdx/node-opentelemetry": "^0.8.0",
+    "@hyperdx/node-opentelemetry": "^0.8.1",
     "@logtail/node": "^0.4.12",
     "@nangohq/node": "^0.40.8",
-    "@sentry/node": "^8.13.0",
+    "@sentry/cli": "^2.33.1",
+    "@sentry/node": "^8.26.0",
+    "@sentry/profiling-node": "^8.26.0",
     "@supabase/supabase-js": "^2.44.2",
+    "@types/express-ws": "^3.0.4",
+    "@types/ws": "^8.5.12",
     "ajv": "^8.16.0",
     "async": "^3.2.5",
     "async-mutex": "^0.5.0",
     "axios": "^1.3.4",
     "bottleneck": "^2.19.5",
-    "bull": "^4.15.0",
+    "bullmq": "^5.11.0",
     "cacheable-lookup": "^6.1.0",
     "cheerio": "^1.0.0-rc.12",
     "cohere": "^1.1.1",
@@ -71,7 +76,9 @@
     "cron-parser": "^4.9.0",
     "date-fns": "^3.6.0",
     "dotenv": "^16.3.1",
+    "dotenv-cli": "^7.4.2",
     "express-rate-limit": "^7.3.1",
+    "express-ws": "^5.0.2",
     "form-data": "^4.0.0",
     "glob": "^10.4.2",
     "gpt3-tokenizer": "^1.1.5",
@@ -99,14 +106,16 @@
     "robots-parser": "^3.0.1",
     "scrapingbee": "^1.7.4",
     "stripe": "^16.1.0",
+    "systeminformation": "^5.22.11",
     "turndown": "^7.1.3",
     "turndown-plugin-gfm": "^1.0.2",
     "typesense": "^1.5.4",
     "unstructured-client": "^0.11.3",
     "uuid": "^10.0.0",
     "wordpos": "^2.1.0",
+    "ws": "^8.18.0",
     "xml2js": "^0.6.2",
-    "zod": "^3.23.4",
+    "zod": "^3.23.8",
     "zod-to-json-schema": "^3.23.1"
   },
   "nodemonConfig": {
@@ -116,4 +125,4 @@
     "temp"
   ]
 }
-}
+}
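The new scripts tie the Sentry release flow into the build: `build` now runs `sentry:sourcemaps` after `tsc`, and `deploy:fly` reads SENTRY_AUTH_TOKEN from the local `.env` through dotenv-cli and forwards it as a Fly build secret. A sketch of the local flow these scripts assume (the `.env` value is a placeholder):

```bash
cd apps/api

# deploy:fly resolves the token with `dotenv -p SENTRY_AUTH_TOKEN`,
# so the variable has to be present in apps/api/.env.
echo 'SENTRY_AUTH_TOKEN=sntrys_placeholder_token' >> .env

pnpm install
pnpm run build        # tsc, then sentry-cli sourcemaps inject/upload
pnpm run deploy:fly   # flyctl deploy --build-secret SENTRY_AUTH_TOKEN=...
```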

(File diff suppressed because it is too large.)


@ -1,12 +1,16 @@
### Crawl Website ### Crawl Website
POST http://localhost:3002/v0/scrape HTTP/1.1 POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer fc Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673
content-type: application/json content-type: application/json
{ {
"url":"firecrawl.dev" "url":"corterix.com"
} }
### Check Job Status
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673
### Check Job Status ### Check Job Status
GET http://localhost:3002/v0/jobs/active HTTP/1.1 GET http://localhost:3002/v0/jobs/active HTTP/1.1
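The new v1 status request added above maps to roughly the following call from Node 18+ (built-in fetch); the job id and API key are placeholders, and the response fields are those asserted by the v1 e2e tests later in this diff:

async function checkCrawlStatus(jobId: string, apiKey: string) {
  const res = await fetch(`http://localhost:3002/v1/crawl/${jobId}`, {
    headers: { Authorization: `Bearer ${apiKey}` },
  });
  return res.json(); // e.g. { status: "scraping" | "completed", data: [...] }
}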

View File

@ -404,7 +404,7 @@ describe("E2E Tests for API Routes", () => {
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.set("x-idempotency-key", uniqueIdempotencyKey) .set("x-idempotency-key", uniqueIdempotencyKey)
.send({ url: 'https://mendable.ai' }); .send({ url: 'https://docs.firecrawl.dev' });
expect(firstResponse.statusCode).toBe(200); expect(firstResponse.statusCode).toBe(200);
@ -414,7 +414,7 @@ describe("E2E Tests for API Routes", () => {
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.set("x-idempotency-key", uniqueIdempotencyKey) .set("x-idempotency-key", uniqueIdempotencyKey)
.send({ url: 'https://mendable.ai' }); .send({ url: 'https://docs.firecrawl.dev' });
expect(secondResponse.statusCode).toBe(409); expect(secondResponse.statusCode).toBe(409);
expect(secondResponse.body.error).toBe('Idempotency key already used'); expect(secondResponse.body.error).toBe('Idempotency key already used');
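The updated test above exercises the x-idempotency-key contract: a first request with a fresh key succeeds, and a replay with the same key is rejected with 409 "Idempotency key already used". A minimal client-side sketch, assuming only what the test asserts (the endpoint path and base URL here are placeholders):

import { randomUUID } from "node:crypto";

async function submitCrawlOnce(url: string, apiKey: string) {
  const idempotencyKey = randomUUID();
  const send = () =>
    fetch("http://localhost:3002/v0/crawl", {      // placeholder endpoint
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
        "x-idempotency-key": idempotencyKey,
      },
      body: JSON.stringify({ url }),
    });

  const first = await send();   // expected 200
  const replay = await send();  // expected 409, body.error === "Idempotency key already used"
  return { first: first.status, replay: replay.status };
}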

View File

@ -0,0 +1,951 @@
import request from "supertest";
import dotenv from "dotenv";
import {
ScrapeRequest,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for v1 API Routes", () => {
beforeAll(() => {
process.env.USE_DB_AUTHENTICATION = "true";
});
afterAll(() => {
delete process.env.USE_DB_AUTHENTICATION;
});
describe("GET /is-production", () => {
it.concurrent("should return the production status", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
"/is-production"
);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("isProduction");
});
});
describe("POST /v1/scrape", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/scrape"
);
expect(response.statusCode).toBe(401);
});
it.concurrent("should throw error for blocklisted URL", async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://facebook.com/fake-test",
};
const response = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(response.statusCode).toBe(403);
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
});
it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
);
it.concurrent(
"should return a successful response with a valid API key",
async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://roastmywebsite.ai",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(response.statusCode).toBe(200);
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).not.toHaveProperty("content");
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data.markdown).toContain("_Roast_");
expect(response.body.data.metadata.error).toBeUndefined();
expect(response.body.data.metadata.title).toBe("Roast My Website");
expect(response.body.data.metadata.description).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
);
expect(response.body.data.metadata.keywords).toBe(
"Roast My Website,Roast,Website,GitHub,Firecrawl"
);
expect(response.body.data.metadata.robots).toBe("follow, index");
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
expect(response.body.data.metadata.ogDescription).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
);
expect(response.body.data.metadata.ogUrl).toBe(
"https://www.roastmywebsite.ai"
);
expect(response.body.data.metadata.ogImage).toBe(
"https://www.roastmywebsite.ai/og.png"
);
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
expect(response.body.data.metadata.sourceURL).toBe(
"https://roastmywebsite.ai"
);
expect(response.body.data.metadata.statusCode).toBe(200);
},
30000
); // 30 seconds timeout
it.concurrent(
"should return a successful response with a valid API key and includeHtml set to true",
async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://roastmywebsite.ai",
formats: ["markdown", "html"],
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("html");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.markdown).toContain("_Roast_");
expect(response.body.data.html).toContain("<h1");
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
30000
);
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
// formats: ["markdown", "html"],
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post('/v1/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send(scrapeRequest);
await new Promise((r) => setTimeout(r, 6000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
}, 60000);
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://arxiv.org/pdf/astro-ph/9301001"
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post('/v1/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send(scrapeRequest);
await new Promise((r) => setTimeout(r, 6000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty('markdown');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
}, 60000);
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://www.scrapethissite.com/",
onlyMainContent: false // default is true
};
const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(responseWithoutRemoveTags.statusCode).toBe(200);
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
if (!("data" in responseWithoutRemoveTags.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
const scrapeRequestWithRemoveTags: ScrapeRequest = {
url: "https://www.scrapethissite.com/",
excludeTags: ['.nav', '#footer', 'strong'],
onlyMainContent: false // default is true
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequestWithRemoveTags);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // .nav
}, 30000);
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post('/v1/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/400' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty('markdown');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.statusCode).toBe(400);
}, 60000);
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post('/v1/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/401' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty('markdown');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.statusCode).toBe(401);
}, 60000);
it.concurrent('should return a successful response for a scrape with 403 page', async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post('/v1/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/403' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty('markdown');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.statusCode).toBe(403);
}, 60000);
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post('/v1/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/404' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty('markdown');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.statusCode).toBe(404);
}, 60000);
it.concurrent('should return a successful response for a scrape with 405 page', async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post('/v1/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/405' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty('markdown');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.statusCode).toBe(405);
}, 60000);
it.concurrent('should return a successful response for a scrape with 500 page', async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post('/v1/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/500' });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty('markdown');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.metadata.statusCode).toBe(500);
}, 60000);
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev", timeout: 1000 });
expect(response.statusCode).toBe(408);
}, 3000);
it.concurrent(
"should return a successful response with a valid API key and includeHtml set to true",
async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://roastmywebsite.ai",
formats: ["html","rawHtml"],
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).not.toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("html");
expect(response.body.data).toHaveProperty("rawHtml");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.html).toContain("<h1");
expect(response.body.data.rawHtml).toContain("<html");
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
30000
);
it.concurrent(
"should return a successful response with waitFor",
async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://ycombinator.com/companies",
formats: ["markdown"],
waitFor: 5000
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data).not.toHaveProperty("links");
expect(response.body.data).not.toHaveProperty("rawHtml");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.markdown).toContain("PagerDuty");
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
30000
);
it.concurrent(
"should return a successful response with a valid links on page",
async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://roastmywebsite.ai",
formats: ["links"],
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data).not.toHaveProperty("rawHtml");
expect(response.body.data).toHaveProperty("links");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.links).toContain("https://firecrawl.dev");
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
30000
);
});
describe("POST /v1/map", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/map"
);
expect(response.statusCode).toBe(401);
});
it.concurrent("should return an error response with an invalid API key", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
it.concurrent("should return a successful response with a valid API key", async () => {
const mapRequest = {
url: "https://roastmywebsite.ai"
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
});
it.concurrent("should return a successful response with a valid API key and search", async () => {
const mapRequest = {
url: "https://usemotion.com",
search: "pricing"
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("usemotion.com/pricing");
});
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
const mapRequest = {
url: "https://firecrawl.dev",
search: "docs",
includeSubdomains: true
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("docs.firecrawl.dev");
});
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
const mapRequest = {
url: "https://www.firecrawl.dev",
search: "docs",
includeSubdomains: true
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("docs.firecrawl.dev");
}, 10000)
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
const mapRequest = {
url: "https://www.firecrawl.dev",
search: "docs",
includeSubdomains: false
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).not.toContain("docs.firecrawl.dev");
})
it.concurrent("should return an error for invalid URL", async () => {
const mapRequest = {
url: "invalid-url",
includeSubdomains: true,
search: "test",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(400);
expect(response.body).toHaveProperty("success", false);
expect(response.body).toHaveProperty("error");
});
});
describe("POST /v1/crawl", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/crawl"
);
expect(response.statusCode).toBe(401);
});
it.concurrent("should throw error for blocklisted URL", async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://facebook.com/fake-test",
};
const response = await request(TEST_URL)
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(response.statusCode).toBe(403);
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
});
it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/crawl")
.set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
);
it.concurrent("should return a successful response", async () => {
const response = await request(TEST_URL)
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("id");
expect(response.body.id).toMatch(
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("url");
expect(response.body.url).toContain("/v1/crawl/");
});
it.concurrent(
"should return a successful response with a valid API key and valid includes option",
async () => {
const crawlResponse = await request(TEST_URL)
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
limit: 10,
includePaths: ["blog/*"],
});
let response;
let isFinished = false;
while (!isFinished) {
response = await request(TEST_URL)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
isFinished = response.body.status === "completed";
if (!isFinished) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved in the database
const completedResponse = await request(TEST_URL)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
expect(urls.length).toBeGreaterThan(5);
urls.forEach((url: string) => {
expect(url).toContain("firecrawl.dev/blog");
});
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
},
180000
); // 180 seconds
it.concurrent(
"should return a successful response with a valid API key and valid excludes option",
async () => {
const crawlResponse = await request(TEST_URL)
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
limit: 10,
excludePaths: ["blog/*"],
});
let isFinished = false;
let response;
while (!isFinished) {
response = await request(TEST_URL)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
isFinished = response.body.status === "completed";
if (!isFinished) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved in the database
const completedResponse = await request(
TEST_URL
)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
expect(urls.length).toBeGreaterThan(3);
urls.forEach((url: string) => {
expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy();
});
},
90000
); // 90 seconds
it.concurrent(
"should return a successful response with max depth option for a valid crawl job",
async () => {
const crawlResponse = await request(TEST_URL)
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com",
maxDepth: 1,
});
expect(crawlResponse.statusCode).toBe(200);
const response = await request(TEST_URL)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status);
// poll every second until the crawl reports "completed"
let isCompleted = false;
while (!isCompleted) {
const statusCheckResponse = await request(TEST_URL)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(statusCheckResponse.statusCode).toBe(200);
isCompleted = statusCheckResponse.body.status === "completed";
if (!isCompleted) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
const completedResponse = await request(
TEST_URL
)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
expect(urls.length).toBeGreaterThanOrEqual(1);
// Check if all URLs have a maximum depth of 1
urls.forEach((url: string) => {
const pathSplits = new URL(url).pathname.split("/");
const depth =
pathSplits.length -
(pathSplits[0].length === 0 &&
pathSplits[pathSplits.length - 1].length === 0
? 1
: 0);
expect(depth).toBeLessThanOrEqual(2);
});
},
180000
);
})
describe("GET /v1/crawl/:jobId", () => {
it.concurrent("should require authorization", async () => {
const response = await request(TEST_URL).get("/v1/crawl/123");
expect(response.statusCode).toBe(401);
});
it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response = await request(TEST_URL)
.get("/v1/crawl/123")
.set("Authorization", `Bearer invalid-api-key`);
expect(response.statusCode).toBe(401);
}
);
it.concurrent(
"should return Job not found for invalid job ID",
async () => {
const response = await request(TEST_URL)
.get("/v1/crawl/invalidJobId")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(404);
}
);
it.concurrent(
"should return a successful crawl status response for a valid crawl job",
async () => {
const crawlResponse = await request(TEST_URL)
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://docs.mendable.ai" });
expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false;
while (!isCompleted) {
const response = await request(TEST_URL)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
if (response.body.status === "completed") {
isCompleted = true;
} else {
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
}
}
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved in the database
const completedResponse = await request(TEST_URL)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.statusCode).toBe(
200
);
expect(
completedResponse.body.data[0].metadata.error
).toBeUndefined();
const childrenLinks = completedResponse.body.data.filter(
(doc) =>
doc.metadata &&
doc.metadata.sourceURL
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
},
180000
); // 180 seconds
it.concurrent(
"If someone cancels a crawl job, it should turn into failed status",
async () => {
const crawlResponse = await request(TEST_URL)
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://docs.tatum.io", limit: 200 });
expect(crawlResponse.statusCode).toBe(200);
await new Promise((r) => setTimeout(r, 10000));
const responseCancel = await request(TEST_URL)
.delete(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(responseCancel.statusCode).toBe(200);
expect(responseCancel.body).toHaveProperty("status");
expect(responseCancel.body.status).toBe("cancelled");
await new Promise((r) => setTimeout(r, 10000));
const completedResponse = await request(TEST_URL)
.get(`/v1/crawl/${crawlResponse.body.id}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("cancelled");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
},
60000
); // 60 seconds
})
});
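The crawl tests above repeat one pattern: POST /v1/crawl, then poll GET /v1/crawl/:id once per second until status becomes "completed". A small helper capturing that loop (base URL, timeout, and interval mirror the tests but are otherwise placeholders):

async function waitForCrawl(id: string, apiKey: string, timeoutMs = 180_000) {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const res = await fetch(`http://127.0.0.1:3002/v1/crawl/${id}`, {
      headers: { Authorization: `Bearer ${apiKey}` },
    });
    const body = await res.json();
    if (body.status === "completed") return body;  // body.data holds the scraped documents
    await new Promise((r) => setTimeout(r, 1000)); // same 1 s backoff as the tests
  }
  throw new Error(`Crawl ${id} did not complete within ${timeoutMs}ms`);
}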

File diff suppressed because it is too large

View File

@ -1,4 +1,4 @@
import { crawlController } from '../crawl' import { crawlController } from '../v0/crawl'
import { Request, Response } from 'express'; import { Request, Response } from 'express';
import { authenticateUser } from '../auth'; // Ensure this import is correct import { authenticateUser } from '../auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create'; import { createIdempotencyKey } from '../../services/idempotency/create';

View File

@ -1,87 +0,0 @@
import { Request, Response } from "express";
import { Job } from "bull";
import { Logger } from "../../lib/logger";
import { getWebScraperQueue } from "../../services/queue-service";
import { checkAlerts } from "../../services/alerts";
export async function cleanBefore24hCompleteJobsController(
req: Request,
res: Response
) {
Logger.info("🐂 Cleaning jobs older than 24h");
try {
const webScraperQueue = getWebScraperQueue();
const batchSize = 10;
const numberOfBatches = 9; // Adjust based on your needs
const completedJobsPromises: Promise<Job[]>[] = [];
for (let i = 0; i < numberOfBatches; i++) {
completedJobsPromises.push(
webScraperQueue.getJobs(
["completed"],
i * batchSize,
i * batchSize + batchSize,
true
)
);
}
const completedJobs: Job[] = (
await Promise.all(completedJobsPromises)
).flat();
const before24hJobs =
completedJobs.filter(
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
) || [];
let count = 0;
if (!before24hJobs) {
return res.status(200).send(`No jobs to remove.`);
}
for (const job of before24hJobs) {
try {
await job.remove();
count++;
} catch (jobError) {
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
}
}
return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
return res.status(500).send("Failed to clean jobs");
}
}
export async function checkQueuesController(req: Request, res: Response) {
try {
await checkAlerts();
return res.status(200).send("Alerts initialized");
} catch (error) {
Logger.debug(`Failed to initialize alerts: ${error}`);
return res.status(500).send("Failed to initialize alerts");
}
}
// Use this as a "health check" that way we dont destroy the server
export async function queuesController(req: Request, res: Response) {
try {
const webScraperQueue = getWebScraperQueue();
const [webScraperActive] = await Promise.all([
webScraperQueue.getActiveCount(),
]);
const noActiveJobs = webScraperActive === 0;
// 200 if no active jobs, 503 if there are active jobs
return res.status(noActiveJobs ? 200 : 500).json({
webScraperActive,
noActiveJobs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -1,21 +1,36 @@
import { parseApi } from "../../src/lib/parseApi"; import { parseApi } from "../lib/parseApi";
import { getRateLimiter } from "../../src/services/rate-limiter"; import { getRateLimiter } from "../services/rate-limiter";
import { import {
AuthResponse, AuthResponse,
NotificationType, NotificationType,
PlanType,
RateLimiterMode, RateLimiterMode,
} from "../../src/types"; } from "../types";
import { supabase_service } from "../../src/services/supabase"; import { supabase_service } from "../services/supabase";
import { withAuth } from "../../src/lib/withAuth"; import { withAuth } from "../lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible"; import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from "@hyperdx/node-opentelemetry"; import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification"; import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../lib/logger"; import { Logger } from "../lib/logger";
import { redlock } from "../../src/services/redlock"; import { redlock } from "../services/redlock";
import { getValue } from "../../src/services/redis"; import { getValue } from "../services/redis";
import { setValue } from "../../src/services/redis"; import { setValue } from "../services/redis";
import { validate } from "uuid"; import { validate } from "uuid";
import * as Sentry from "@sentry/node";
// const { data, error } = await supabase_service
// .from('api_keys')
// .select(`
// key,
// team_id,
// teams (
// subscriptions (
// price_id
// )
// )
// `)
// .eq('key', normalizedApi)
// .limit(1)
// .single();
function normalizedApiIsUuid(potentialUuid: string): boolean { function normalizedApiIsUuid(potentialUuid: string): boolean {
// Check if the string is a valid UUID // Check if the string is a valid UUID
return validate(potentialUuid); return validate(potentialUuid);
@ -34,6 +49,7 @@ function setTrace(team_id: string, api_key: string) {
api_key, api_key,
}); });
} catch (error) { } catch (error) {
Sentry.captureException(error);
Logger.error(`Error setting trace attributes: ${error.message}`); Logger.error(`Error setting trace attributes: ${error.message}`);
} }
} }
@ -49,6 +65,7 @@ async function getKeyAndPriceId(normalizedApi: string): Promise<{
api_key: normalizedApi, api_key: normalizedApi,
}); });
if (error) { if (error) {
Sentry.captureException(error);
Logger.error(`RPC ERROR (get_key_and_price_id_2): ${error.message}`); Logger.error(`RPC ERROR (get_key_and_price_id_2): ${error.message}`);
return { return {
success: false, success: false,
@ -58,7 +75,10 @@ async function getKeyAndPriceId(normalizedApi: string): Promise<{
}; };
} }
if (!data || data.length === 0) { if (!data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error.message} or data is empty`); if (error) {
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
Sentry.captureException(error);
}
// TODO: change this error code ? // TODO: change this error code ?
return { return {
success: false, success: false,
@ -82,7 +102,7 @@ export async function supaAuthenticateUser(
team_id?: string; team_id?: string;
error?: string; error?: string;
status?: number; status?: number;
plan?: string; plan?: PlanType;
}> { }> {
const authHeader = req.headers.authorization; const authHeader = req.headers.authorization;
if (!authHeader) { if (!authHeader) {
@ -112,7 +132,11 @@ export async function supaAuthenticateUser(
let priceId: string | null = null; let priceId: string | null = null;
if (token == "this_is_just_a_preview_token") { if (token == "this_is_just_a_preview_token") {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); if (mode == RateLimiterMode.CrawlStatus) {
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
} else {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
}
teamId = "preview"; teamId = "preview";
} else { } else {
normalizedApi = parseApi(token); normalizedApi = parseApi(token);
@ -148,11 +172,12 @@ export async function supaAuthenticateUser(
await setValue( await setValue(
cacheKey, cacheKey,
JSON.stringify({ team_id: teamId, price_id: priceId }), JSON.stringify({ team_id: teamId, price_id: priceId }),
10 60
); );
} }
} catch (error) { } catch (error) {
Logger.error(`Error with auth function: ${error.message}`); Sentry.captureException(error);
Logger.error(`Error with auth function: ${error}`);
// const { // const {
// success, // success,
// teamId: tId, // teamId: tId,
@ -215,7 +240,8 @@ export async function supaAuthenticateUser(
rateLimiter = getRateLimiter( rateLimiter = getRateLimiter(
RateLimiterMode.Scrape, RateLimiterMode.Scrape,
token, token,
subscriptionData.plan subscriptionData.plan,
teamId
); );
break; break;
case RateLimiterMode.Search: case RateLimiterMode.Search:
@ -225,6 +251,13 @@ export async function supaAuthenticateUser(
subscriptionData.plan subscriptionData.plan
); );
break; break;
case RateLimiterMode.Map:
rateLimiter = getRateLimiter(
RateLimiterMode.Map,
token,
subscriptionData.plan
);
break;
case RateLimiterMode.CrawlStatus: case RateLimiterMode.CrawlStatus:
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
break; break;
@ -268,7 +301,7 @@ export async function supaAuthenticateUser(
return { return {
success: false, success: false,
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`, error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
status: 429, status: 429,
}; };
} }
@ -277,6 +310,9 @@ export async function supaAuthenticateUser(
token === "this_is_just_a_preview_token" && token === "this_is_just_a_preview_token" &&
(mode === RateLimiterMode.Scrape || (mode === RateLimiterMode.Scrape ||
mode === RateLimiterMode.Preview || mode === RateLimiterMode.Preview ||
mode === RateLimiterMode.Map ||
mode === RateLimiterMode.Crawl ||
mode === RateLimiterMode.CrawlStatus ||
mode === RateLimiterMode.Search) mode === RateLimiterMode.Search)
) { ) {
return { success: true, team_id: "preview" }; return { success: true, team_id: "preview" };
@ -302,7 +338,10 @@ export async function supaAuthenticateUser(
.eq("key", normalizedApi); .eq("key", normalizedApi);
if (error || !data || data.length === 0) { if (error || !data || data.length === 0) {
Logger.warn(`Error fetching api key: ${error.message} or data is empty`); if (error) {
Sentry.captureException(error);
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
}
return { return {
success: false, success: false,
error: "Unauthorized: Invalid token", error: "Unauthorized: Invalid token",
@ -316,10 +355,10 @@ export async function supaAuthenticateUser(
return { return {
success: true, success: true,
team_id: subscriptionData.team_id, team_id: subscriptionData.team_id,
plan: subscriptionData.plan ?? "", plan: (subscriptionData.plan ?? "") as PlanType,
}; };
} }
function getPlanByPriceId(price_id: string) { function getPlanByPriceId(price_id: string): PlanType {
switch (price_id) { switch (price_id) {
case process.env.STRIPE_PRICE_ID_STARTER: case process.env.STRIPE_PRICE_ID_STARTER:
return "starter"; return "starter";
@ -336,6 +375,8 @@ function getPlanByPriceId(price_id: string) {
case process.env.STRIPE_PRICE_ID_GROWTH: case process.env.STRIPE_PRICE_ID_GROWTH:
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY: case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
return "growth"; return "growth";
case process.env.STRIPE_PRICE_ID_GROWTH_DOUBLE_MONTHLY:
return "growthdouble";
default: default:
return "free"; return "free";
} }
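The 429 branch above reports consumed points, remaining points, a retry delay in seconds, and a reset time; those values come from the RateLimiterRes that rate-limiter-flexible rejects with when consume() exceeds the quota. A standalone sketch of that mechanism (the in-memory limiter and the 20 req/min figure are stand-ins, not the project's actual limits):

import { RateLimiterMemory, RateLimiterRes } from "rate-limiter-flexible";

const limiter = new RateLimiterMemory({ points: 20, duration: 60 }); // 20 requests per 60 s window

export async function consumeOrExplain(key: string): Promise<string | null> {
  try {
    await limiter.consume(key);
    return null; // request allowed
  } catch (rejection) {
    const r = rejection as RateLimiterRes;
    const secs = Math.round(r.msBeforeNext / 1000) || 1;
    const retryDate = new Date(Date.now() + r.msBeforeNext);
    return `Rate limit exceeded. Consumed (req/min): ${r.consumedPoints}, Remaining (req/min): ${r.remainingPoints}. Retry after ${secs}s, resets at ${retryDate}`;
  }
}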

View File

@ -1,51 +0,0 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
export async function crawlStatusController(req: Request, res: Response) {
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
if (!success) {
return res.status(status).json({ error });
}
const job = await getWebScraperQueue().getJob(req.params.jobId);
if (!job) {
return res.status(404).json({ error: "Job not found" });
}
const { current, current_url, total, current_step, partialDocs } = await job.progress();
let data = job.returnvalue;
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(req.params.jobId);
if (supabaseData) {
data = supabaseData.docs;
}
}
const jobStatus = await job.getState();
res.json({
status: jobStatus,
// progress: job.progress(),
current,
current_url,
current_step,
total,
data: data ? data : null,
partial_data: jobStatus == 'completed' ? [] : partialDocs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
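The controller removed above read job state through the bull API (await job.progress(), job.returnvalue, job.getState()). With the switch to bullmq in this merge, the nearest equivalent lookup looks roughly like this sketch; the queue name and Redis connection are assumptions, and note that bullmq exposes progress as a property rather than a method:

import { Queue } from "bullmq";

const scrapeQueue = new Queue("scrapeQueue", {
  connection: { host: "localhost", port: 6379 }, // assumed local Redis
});

export async function getJobStatus(jobId: string) {
  const job = await scrapeQueue.getJob(jobId);
  if (!job) return null;
  const status = await job.getState();  // "completed" | "failed" | "active" | "waiting" | ...
  return {
    status,
    progress: job.progress,             // property in bullmq, not a function as in bull
    data: job.returnvalue ?? null,
  };
}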

View File

@ -1,110 +0,0 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../src/scraper/WebScraper";
import { billTeam } from "../../src/services/billing/credit_billing";
import { checkTeamCredits } from "../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
export async function crawlController(req: Request, res: Response) {
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ error });
}
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ error: "Idempotency key already used" });
}
try {
createIdempotencyKey(req);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits" });
}
const url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
const mode = req.body.mode ?? "crawl";
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
try {
const a = new WebScraperDataProvider();
await a.setOptions({
jobId: uuidv4(),
mode: "single_urls",
urls: [url],
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
pageOptions: pageOptions,
});
const docs = await a.getDocuments(false, (progress) => {
job.progress({
current: progress.current,
total: progress.total,
current_step: "SCRAPING",
current_url: progress.currentDocumentUrl,
});
});
return res.json({
success: true,
documents: docs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
const job = await addWebScraperJob({
url: url,
mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin,
});
await logCrawl(job.id.toString(), team_id);
res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
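The deleted v0 controller enqueued crawls through addWebScraperJob on the bull queue. Under bullmq the enqueue step generally reduces to Queue.add; the sketch below is illustrative only, with the queue name, job name, and payload shape assumed rather than taken from this repository:

import { randomUUID } from "node:crypto";
import { Queue } from "bullmq";

const scrapeQueue = new Queue("scrapeQueue", {
  connection: { host: "localhost", port: 6379 }, // assumed local Redis
});

export async function enqueueCrawl(url: string, teamId: string) {
  const jobId = randomUUID(); // generate our own id so the API can return it immediately
  await scrapeQueue.add(
    "crawl",                                 // job name (illustrative)
    { url, mode: "crawl", team_id: teamId }, // payload shape is an assumption
    { jobId }
  );
  return jobId;
}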

View File

@ -1,46 +0,0 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { Logger } from "../../src/lib/logger";
export async function crawlPreviewController(req: Request, res: Response) {
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Preview
);
if (!success) {
return res.status(status).json({ error });
}
// authenticate on supabase
const url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
}
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
const job = await addWebScraperJob({
url: url,
mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
team_id: "preview",
pageOptions: pageOptions,
origin: "website-preview",
});
res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -1,196 +0,0 @@
import { ExtractorOptions, PageOptions } from './../lib/entities';
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
import { v4 as uuidv4 } from "uuid";
import { Logger } from '../lib/logger';
export async function scrapeHelper(
jobId: string,
req: Request,
team_id: string,
crawlerOptions: any,
pageOptions: PageOptions,
extractorOptions: ExtractorOptions,
timeout: number,
plan?: string
): Promise<{
success: boolean;
error?: string;
data?: Document;
returnCode: number;
}> {
const url = req.body.url;
if (!url) {
return { success: false, error: "Url is required", returnCode: 400 };
}
if (isUrlBlocked(url)) {
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
}
const a = new WebScraperDataProvider();
await a.setOptions({
jobId,
mode: "single_urls",
urls: [url],
crawlerOptions: {
...crawlerOptions,
},
pageOptions: pageOptions,
extractorOptions: extractorOptions,
});
const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
);
const docsPromise = a.getDocuments(false);
let docs;
try {
docs = await Promise.race([docsPromise, timeoutPromise]);
} catch (error) {
return error;
}
// make sure doc.content is not empty
let filteredDocs = docs.filter(
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
);
if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
}
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
filteredDocs.forEach(doc => {
delete doc.rawHtml;
});
}
return {
success: true,
data: filteredDocs[0],
returnCode: 200,
};
}
export async function scrapeController(req: Request, res: Response) {
try {
let earlyReturn = false;
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Scrape
);
if (!success) {
return res.status(status).json({ error });
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
const origin = req.body.origin ?? defaultOrigin;
let timeout = req.body.timeout ?? defaultTimeout;
if (extractorOptions.mode.includes("llm-extraction")) {
pageOptions.onlyMainContent = true;
timeout = req.body.timeout ?? 90000;
}
const checkCredits = async () => {
try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
earlyReturn = true;
return res.status(402).json({ error: "Insufficient credits" });
}
} catch (error) {
Logger.error(error);
earlyReturn = true;
return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
}
};
await checkCredits();
const jobId = uuidv4();
const startTime = new Date().getTime();
const result = await scrapeHelper(
jobId,
req,
team_id,
crawlerOptions,
pageOptions,
extractorOptions,
timeout,
plan
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
if (result.success) {
let creditsToBeBilled = 1; // Assuming 1 credit per document
const creditsPerLLMExtract = 50;
if (extractorOptions.mode.includes("llm-extraction")) {
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
creditsToBeBilled += creditsPerLLMExtract;
}
let startTimeBilling = new Date().getTime();
if (earlyReturn) {
// Don't bill if we're early returning
return;
}
const billingResult = await billTeam(
team_id,
creditsToBeBilled
);
if (!billingResult.success) {
return res.status(402).json({
success: false,
error: "Failed to bill team. Insufficient credits or subscription not found.",
});
}
}
logJob({
job_id: jobId,
success: result.success,
message: result.error,
num_docs: 1,
docs: [result.data],
time_taken: timeTakenInSeconds,
team_id: team_id,
mode: "scrape",
url: req.body.url,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
origin: origin,
extractor_options: extractorOptions,
num_tokens: numTokens,
});
return res.status(result.returnCode).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -1,42 +0,0 @@
import { Request, Response } from "express";
import { getWebScraperQueue } from "../../src/services/queue-service";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { Logger } from "../../src/lib/logger";
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
try {
const job = await getWebScraperQueue().getJob(req.params.jobId);
if (!job) {
return res.status(404).json({ error: "Job not found" });
}
const { current, current_url, total, current_step, partialDocs } = await job.progress();
let data = job.returnvalue;
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(req.params.jobId);
if (supabaseData) {
data = supabaseData.docs;
}
}
let jobStatus = await job.getState();
if (jobStatus === 'waiting' || jobStatus === 'stuck') {
jobStatus = 'active';
}
res.json({
status: jobStatus,
// progress: job.progress(),
current,
current_url,
current_step,
total,
data: data ? data : null,
partial_data: jobStatus == 'completed' ? [] : partialDocs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -0,0 +1,199 @@
import { Request, Response } from "express";
import { Job } from "bullmq";
import { Logger } from "../../../lib/logger";
import { getScrapeQueue } from "../../../services/queue-service";
import { checkAlerts } from "../../../services/alerts";
import { sendSlackWebhook } from "../../../services/alerts/slack";
export async function cleanBefore24hCompleteJobsController(
req: Request,
res: Response
) {
Logger.info("🐂 Cleaning jobs older than 24h");
try {
const scrapeQueue = getScrapeQueue();
const batchSize = 10;
const numberOfBatches = 9; // Adjust based on your needs
const completedJobsPromises: Promise<Job[]>[] = [];
for (let i = 0; i < numberOfBatches; i++) {
completedJobsPromises.push(
scrapeQueue.getJobs(
["completed"],
i * batchSize,
i * batchSize + batchSize,
true
)
);
}
const completedJobs: Job[] = (
await Promise.all(completedJobsPromises)
).flat();
const before24hJobs =
completedJobs.filter(
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
) || [];
let count = 0;
if (!before24hJobs) {
return res.status(200).send(`No jobs to remove.`);
}
for (const job of before24hJobs) {
try {
await job.remove();
count++;
} catch (jobError) {
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
}
}
return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
return res.status(500).send("Failed to clean jobs");
}
}
export async function checkQueuesController(req: Request, res: Response) {
try {
await checkAlerts();
return res.status(200).send("Alerts initialized");
} catch (error) {
Logger.debug(`Failed to initialize alerts: ${error}`);
return res.status(500).send("Failed to initialize alerts");
}
}
// Use this as a "health check" that way we dont destroy the server
export async function queuesController(req: Request, res: Response) {
try {
const scrapeQueue = getScrapeQueue();
const [webScraperActive] = await Promise.all([
scrapeQueue.getActiveCount(),
]);
const noActiveJobs = webScraperActive === 0;
// 200 if no active jobs, 500 if there are active jobs
return res.status(noActiveJobs ? 200 : 500).json({
webScraperActive,
noActiveJobs,
});
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
export async function autoscalerController(req: Request, res: Response) {
try {
const maxNumberOfMachines = 80;
const minNumberOfMachines = 20;
const scrapeQueue = getScrapeQueue();
const [webScraperActive, webScraperWaiting, webScraperPriority] =
await Promise.all([
scrapeQueue.getActiveCount(),
scrapeQueue.getWaitingCount(),
scrapeQueue.getPrioritizedCount(),
]);
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
// get number of machines active
const request = await fetch(
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
{
headers: {
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
},
}
);
const machines = await request.json();
// Only worker machines
const activeMachines = machines.filter(
(machine) =>
(machine.state === "started" ||
machine.state === "starting" ||
machine.state === "replacing") &&
machine.config.env["FLY_PROCESS_GROUP"] === "worker"
).length;
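// Tiered scaling: the larger the active count or the waiting/prioritized backlog, the more
// multiples of baseScaleUp are added; quiet periods subtract multiples of baseScaleDown,
// always clamped to the [minNumberOfMachines, maxNumberOfMachines] range.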
let targetMachineCount = activeMachines;
const baseScaleUp = 10;
// Slow scale down
const baseScaleDown = 2;
// Scale up logic
if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
targetMachineCount = Math.min(
maxNumberOfMachines,
activeMachines + baseScaleUp * 3
);
} else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
targetMachineCount = Math.min(
maxNumberOfMachines,
activeMachines + baseScaleUp * 2
);
} else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
targetMachineCount = Math.min(
maxNumberOfMachines,
activeMachines + baseScaleUp
);
}
// Scale down logic
if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
targetMachineCount = Math.max(
minNumberOfMachines,
activeMachines - baseScaleDown * 3
);
} else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
targetMachineCount = Math.max(
minNumberOfMachines,
activeMachines - baseScaleDown * 2
);
} else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
targetMachineCount = Math.max(
minNumberOfMachines,
activeMachines - baseScaleDown
);
}
if (targetMachineCount !== activeMachines) {
Logger.info(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
);
if (targetMachineCount > activeMachines) {
sendSlackWebhook(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
false,
process.env.SLACK_AUTOSCALER ?? ""
);
} else {
sendSlackWebhook(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
false,
process.env.SLACK_AUTOSCALER ?? ""
);
}
return res.status(200).json({
mode: "scale-descale",
count: targetMachineCount,
});
}
return res.status(200).json({
mode: "normal",
count: activeMachines,
});
} catch (error) {
Logger.error(error);
return res.status(500).send("Failed to initialize autoscaler");
}
}

View File

@ -1,7 +1,7 @@
import { Request, Response } from "express";
import Redis from "ioredis";
-import { Logger } from "../../lib/logger";
-import { redisRateLimitClient } from "../../services/rate-limiter";
+import { Logger } from "../../../lib/logger";
+import { redisRateLimitClient } from "../../../services/rate-limiter";
export async function redisHealthController(req: Request, res: Response) {
const retryOperation = async (operation, retries = 3) => {

View File

@ -1,11 +1,10 @@
import { Request, Response } from "express";
-import { authenticateUser } from "./auth";
-import { RateLimiterMode } from "../../src/types";
-import { addWebScraperJob } from "../../src/services/queue-jobs";
-import { getWebScraperQueue } from "../../src/services/queue-service";
-import { supabase_service } from "../../src/services/supabase";
-import { billTeam } from "../../src/services/billing/credit_billing";
-import { Logger } from "../../src/lib/logger";
+import { authenticateUser } from "../auth";
+import { RateLimiterMode } from "../../../src/types";
+import { supabase_service } from "../../../src/services/supabase";
+import { Logger } from "../../../src/lib/logger";
+import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
+import * as Sentry from "@sentry/node";
export async function crawlCancelController(req: Request, res: Response) {
try {
@ -19,8 +18,9 @@ export async function crawlCancelController(req: Request, res: Response) {
if (!success) {
return res.status(status).json({ error });
}
-const job = await getWebScraperQueue().getJob(req.params.jobId);
-if (!job) {
+const sc = await getCrawl(req.params.jobId);
+if (!sc) {
return res.status(404).json({ error: "Job not found" });
}
@ -40,31 +40,18 @@ export async function crawlCancelController(req: Request, res: Response) {
}
}
-const jobState = await job.getState();
-const { partialDocs } = await job.progress();
-if (partialDocs && partialDocs.length > 0 && jobState === "active") {
-Logger.info("Billing team for partial docs...");
-// Note: the credits that we will bill them here might be lower than the actual
-// due to promises that are not yet resolved
-await billTeam(team_id, partialDocs.length);
-}
try {
-await getWebScraperQueue().client.del(job.lockKey());
-await job.takeLock();
-await job.discard();
-await job.moveToFailed(Error("Job cancelled by user"), true);
+sc.cancelled = true;
+await saveCrawl(req.params.jobId, sc);
} catch (error) {
Logger.error(error);
}
-const newJobState = await job.getState();
res.json({
status: "cancelled"
});
} catch (error) {
+Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}

View File

@ -0,0 +1,71 @@
import { Request, Response } from "express";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node";
export async function getJobs(ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsById(ids);
supabaseData.forEach(x => {
const job = jobs.find(y => y.id === x.job_id);
if (job) {
job.returnvalue = x.docs;
}
})
}
jobs.forEach(job => {
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
});
return jobs;
}
export async function crawlStatusController(req: Request, res: Response) {
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
if (!success) {
return res.status(status).json({ error });
}
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ error: "Job not found" });
}
if (sc.team_id !== team_id) {
return res.status(403).json({ error: "Forbidden" });
}
const jobIDs = await getCrawlJobs(req.params.jobId);
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
res.json({
status: jobStatus,
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
total: jobs.length,
data: jobStatus === "completed" ? data : null,
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
});
} catch (error) {
Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -0,0 +1,232 @@
import { Request, Response } from "express";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
export async function crawlController(req: Request, res: Response) {
try {
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ error });
}
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ error: "Idempotency key already used" });
}
try {
createIdempotencyKey(req);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}
const crawlerOptions = {
...defaultCrawlerOptions,
...req.body.crawlerOptions,
};
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
if (Array.isArray(crawlerOptions.includes)) {
for (const x of crawlerOptions.includes) {
try {
new RegExp(x);
} catch (e) {
return res.status(400).json({ error: e.message });
}
}
}
if (Array.isArray(crawlerOptions.excludes)) {
for (const x of crawlerOptions.excludes) {
try {
new RegExp(x);
} catch (e) {
return res.status(400).json({ error: e.message });
}
}
}
const limitCheck = req.body?.crawlerOptions?.limit ?? 1;
const { success: creditsCheckSuccess, message: creditsCheckMessage, remainingCredits } =
await checkTeamCredits(team_id, limitCheck);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" });
}
// TODO: need to do this in v1 as well
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
if (typeof url !== "string") {
return res.status(400).json({ error: "URL must be a string" });
}
try {
url = checkAndUpdateURL(url).url;
} catch (e) {
return res
.status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
.json({ error: e.message ?? e });
}
if (isUrlBlocked(url)) {
return res.status(403).json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.updateProgress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
const id = uuidv4();
await logCrawl(id, team_id);
const sc: StoredCrawl = {
originUrl: url,
crawlerOptions,
pageOptions,
team_id,
plan,
createdAt: Date.now(),
};
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
} catch (_) {}
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions?.ignoreSitemap
? null
: await crawler.tryGetSitemap();
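// If a sitemap was found, enqueue every sitemapped URL as its own scrape job up front;
// otherwise enqueue only the origin URL and let the crawler discover links as it scrapes.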
if (sitemap !== null && sitemap.length > 0) {
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if(sitemap.length > 1000){
// set base to 21
jobPriority = await getJobPriority({plan, team_id, basePriority: 21})
}
const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true,
},
opts: {
jobId: uuid,
priority: jobPriority,
},
};
});
await lockURLs(
id,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
if (Sentry.isInitialized()) {
for (const job of jobs) {
// add with sentry instrumentation
await addScrapeJob(job.data as any, {}, job.opts.jobId);
}
} else {
await getScrapeQueue().addBulk(jobs);
}
} else {
await lockURL(id, sc, url);
// Not needed, first one should be 15.
// const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
const job = await addScrapeJob(
{
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
},
{
priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
}
);
await addCrawlJob(id, job.id);
}
res.json({ jobId: id });
} catch (error) {
Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -0,0 +1,138 @@
import { Request, Response } from "express";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
export async function crawlPreviewController(req: Request, res: Response) {
try {
const { success, error, status, team_id:a, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Preview
);
const team_id = "preview";
if (!success) {
return res.status(status).json({ error });
}
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
try {
url = checkAndUpdateURL(url).url;
} catch (e) {
return res
.status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
.json({ error: e.message ?? e });
}
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.updateProgress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
const id = uuidv4();
const sc: StoredCrawl = {
originUrl: url,
crawlerOptions,
pageOptions,
team_id,
plan,
createdAt: Date.now(),
};
const crawler = crawlToCrawler(id, sc);
try {
// fetch robots.txt via the crawler (best-effort) so it is stored with the crawl
sc.robots = await crawler.getRobotsTxt();
} catch (_) {}
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
if (sitemap !== null) {
for (const url of sitemap.map(x => x.url)) {
await lockURL(id, sc, url);
const job = await addScrapeJob({
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
origin: "website-preview",
crawl_id: id,
sitemapped: true,
});
await addCrawlJob(id, job.id);
}
} else {
await lockURL(id, sc, url);
const job = await addScrapeJob({
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
origin: "website-preview",
crawl_id: id,
});
await addCrawlJob(id, job.id);
}
res.json({ jobId: id });
} catch (error) {
Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -1,8 +1,8 @@
-import { AuthResponse, RateLimiterMode } from "../types";
+import { AuthResponse, RateLimiterMode } from "../../types";
import { Request, Response } from "express";
-import { authenticateUser } from "./auth";
+import { authenticateUser } from "../auth";
export const keyAuthController = async (req: Request, res: Response) => {

View File

@ -0,0 +1,286 @@
import { ExtractorOptions, PageOptions } from "./../../lib/entities";
import { Request, Response } from "express";
import {
billTeam,
checkTeamCredits,
} from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import { Document } from "../../lib/entities";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import {
defaultPageOptions,
defaultExtractorOptions,
defaultTimeout,
defaultOrigin,
} from "../../lib/default-values";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getScrapeQueue } from "../../services/queue-service";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../lib/logger";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
export async function scrapeHelper(
jobId: string,
req: Request,
team_id: string,
crawlerOptions: any,
pageOptions: PageOptions,
extractorOptions: ExtractorOptions,
timeout: number,
plan?: PlanType
): Promise<{
success: boolean;
error?: string;
data?: Document;
returnCode: number;
}> {
const url = req.body.url;
if (!url) {
return { success: false, error: "Url is required", returnCode: 400 };
}
if (isUrlBlocked(url)) {
return {
success: false,
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
returnCode: 403,
};
}
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
const job = await addScrapeJob(
{
url,
mode: "single_urls",
crawlerOptions,
team_id,
pageOptions,
extractorOptions,
origin: req.body.origin ?? defaultOrigin,
is_scrape: true,
},
{},
jobId,
jobPriority
);
let doc;
const err = await Sentry.startSpan(
{
name: "Wait for job to finish",
op: "bullmq.wait",
attributes: { job: jobId },
},
async (span) => {
try {
doc = (await waitForJob(job.id, timeout))[0];
} catch (e) {
if (e instanceof Error && e.message.startsWith("Job wait")) {
span.setAttribute("timedOut", true);
return {
success: false,
error: "Request timed out",
returnCode: 408,
};
} else if (
typeof e === "string" &&
(e.includes("Error generating completions: ") ||
e.includes("Invalid schema for function") ||
e.includes(
"LLM extraction did not match the extraction schema you provided."
))
) {
return {
success: false,
error: e,
returnCode: 500,
};
} else {
throw e;
}
}
span.setAttribute("result", JSON.stringify(doc));
return null;
}
);
if (err !== null) {
return err;
}
await job.remove();
if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
return {
success: true,
error: "No page found",
returnCode: 200,
data: doc,
};
}
delete doc.index;
delete doc.provider;
// Remove rawHtml if pageOptions.includeRawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
if (
!pageOptions.includeRawHtml &&
extractorOptions.mode == "llm-extraction-from-raw-html"
) {
if (doc.rawHtml) {
delete doc.rawHtml;
}
}
if (!pageOptions.includeHtml) {
if (doc.html) {
delete doc.html;
}
}
return {
success: true,
data: doc,
returnCode: 200,
};
}
export async function scrapeController(req: Request, res: Response) {
try {
let earlyReturn = false;
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Scrape
);
if (!success) {
return res.status(status).json({ error });
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
const extractorOptions = {
...defaultExtractorOptions,
...req.body.extractorOptions,
};
const origin = req.body.origin ?? defaultOrigin;
let timeout = req.body.timeout ?? defaultTimeout;
if (extractorOptions.mode.includes("llm-extraction")) {
if (
typeof extractorOptions.extractionSchema !== "object" ||
extractorOptions.extractionSchema === null
) {
return res.status(400).json({
error:
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
});
}
pageOptions.onlyMainContent = true;
timeout = req.body.timeout ?? 90000;
}
// checkCredits
try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
earlyReturn = true;
return res.status(402).json({ error: "Insufficient credits" });
}
} catch (error) {
Logger.error(error);
earlyReturn = true;
return res.status(500).json({
error:
"Error checking team credits. Please contact hello@firecrawl.com for help.",
});
}
const jobId = uuidv4();
const startTime = new Date().getTime();
const result = await scrapeHelper(
jobId,
req,
team_id,
crawlerOptions,
pageOptions,
extractorOptions,
timeout,
plan
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
result.data && result.data.markdown
? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
: 0;
if (result.success) {
let creditsToBeBilled = 1;
const creditsPerLLMExtract = 49;
if (extractorOptions.mode.includes("llm-extraction")) {
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
creditsToBeBilled += creditsPerLLMExtract;
}
let startTimeBilling = new Date().getTime();
if (earlyReturn) {
// Don't bill if we're early returning
return;
}
if (creditsToBeBilled > 0) {
// the base document credit is billed when the queue job finishes; bill only the LLM extraction credits here
const billingResult = await billTeam(team_id, creditsToBeBilled);
if (!billingResult.success) {
return res.status(402).json({
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
});
}
}
}
logJob({
job_id: jobId,
success: result.success,
message: result.error,
num_docs: 1,
docs: [result.data],
time_taken: timeTakenInSeconds,
team_id: team_id,
mode: "scrape",
url: req.body.url,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
origin: origin,
extractor_options: extractorOptions,
num_tokens: numTokens,
});
return res.status(result.returnCode).json(result);
} catch (error) {
Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({
error:
typeof error === "string"
? error
: error?.message ?? "Internal Server Error",
});
}
}

View File

@ -1,14 +1,18 @@
import { Request, Response } from "express";
-import { WebScraperDataProvider } from "../scraper/WebScraper";
-import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
-import { authenticateUser } from "./auth";
-import { RateLimiterMode } from "../types";
-import { logJob } from "../services/logging/log_job";
-import { PageOptions, SearchOptions } from "../lib/entities";
-import { search } from "../search";
-import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
+import { WebScraperDataProvider } from "../../scraper/WebScraper";
+import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
+import { authenticateUser } from "../auth";
+import { PlanType, RateLimiterMode } from "../../types";
+import { logJob } from "../../services/logging/log_job";
+import { PageOptions, SearchOptions } from "../../lib/entities";
+import { search } from "../../search";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
-import { Logger } from "../lib/logger";
+import { Logger } from "../../lib/logger";
+import { getScrapeQueue } from "../../services/queue-service";
+import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
+import * as Sentry from "@sentry/node";
+import { getJobPriority } from "../../lib/job-priority";
export async function searchHelper(
jobId: string,
@ -17,6 +21,7 @@ export async function searchHelper(
crawlerOptions: any,
pageOptions: PageOptions,
searchOptions: SearchOptions,
+plan: PlanType
): Promise<{
success: boolean;
error?: string;
@ -73,55 +78,57 @@ export async function searchHelper(
return { success: true, error: "No search results found", returnCode: 200 };
}
+const jobPriority = await getJobPriority({plan, team_id, basePriority: 20});
// filter out social media links
-const a = new WebScraperDataProvider();
-await a.setOptions({
-jobId,
-mode: "single_urls",
-urls: res.map((r) => r.url).slice(0, Math.min(searchOptions.limit ?? 5, 5)),
-crawlerOptions: {
-...crawlerOptions,
-},
-pageOptions: {
-...pageOptions,
-onlyMainContent: pageOptions?.onlyMainContent ?? true,
-fetchPageContent: pageOptions?.fetchPageContent ?? true,
-includeHtml: pageOptions?.includeHtml ?? false,
-removeTags: pageOptions?.removeTags ?? [],
-fallback: false,
-},
-});
-const docs = await a.getDocuments(false);
+const jobDatas = res.map(x => {
+const url = x.url;
+const uuid = uuidv4();
+return {
+name: uuid,
+data: {
+url,
+mode: "single_urls",
+crawlerOptions: crawlerOptions,
+team_id: team_id,
+pageOptions: pageOptions,
+},
+opts: {
+jobId: uuid,
+priority: jobPriority,
+}
+};
+})
+let jobs = [];
+if (Sentry.isInitialized()) {
+for (const job of jobDatas) {
+// add with sentry instrumentation
+jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId));
+}
+} else {
+jobs = await getScrapeQueue().addBulk(jobDatas);
+await getScrapeQueue().addBulk(jobs);
+}
+const docs = (await Promise.all(jobs.map(x => waitForJob(x.id, 60000)))).map(x => x[0]);
if (docs.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };
}
+await Promise.all(jobs.map(x => x.remove()));
// make sure doc.content is not empty
const filteredDocs = docs.filter(
-(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
+(doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0
);
if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200, data: docs };
}
-const billingResult = await billTeam(
-team_id,
-filteredDocs.length
-);
-if (!billingResult.success) {
-return {
-success: false,
-error:
-"Failed to bill team. Insufficient credits or subscription not found.",
-returnCode: 402,
-};
-}
return {
success: true,
data: filteredDocs,
@ -132,7 +139,7 @@ export async function searchHelper(
export async function searchController(req: Request, res: Response) {
try {
// make sure to authenticate user first, Bearer <token>
-const { success, team_id, error, status } = await authenticateUser(
+const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Search
@ -142,17 +149,16 @@ export async function searchController(req: Request, res: Response) {
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? {
-includeHtml: false,
-onlyMainContent: true,
-fetchPageContent: true,
-removeTags: [],
-fallback: false,
+includeHtml: req.body.pageOptions?.includeHtml ?? false,
+onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
+fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
+removeTags: req.body.pageOptions?.removeTags ?? [],
+fallback: req.body.pageOptions?.fallback ?? false,
};
const origin = req.body.origin ?? "api";
const searchOptions = req.body.searchOptions ?? { limit: 5 };
const jobId = uuidv4();
try {
@ -162,6 +168,7 @@ export async function searchController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits" });
}
} catch (error) {
+Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({ error: "Internal server error" });
}
@ -173,6 +180,7 @@ export async function searchController(req: Request, res: Response) {
crawlerOptions,
pageOptions,
searchOptions,
+plan
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
@ -192,6 +200,11 @@ export async function searchController(req: Request, res: Response) {
});
return res.status(result.returnCode).json(result);
} catch (error) {
+if (error instanceof Error && error.message.startsWith("Job wait")) {
+return res.status(408).json({ error: "Request timed out" });
+}
+Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}

View File

@ -0,0 +1,43 @@
import { Request, Response } from "express";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
try {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ error: "Job not found" });
}
const jobIDs = await getCrawlJobs(req.params.jobId);
// let data = job.returnvalue;
// if (process.env.USE_DB_AUTHENTICATION === "true") {
// const supabaseData = await supabaseGetJobById(req.params.jobId);
// if (supabaseData) {
// data = supabaseData.docs;
// }
// }
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
res.json({
status: jobStatus,
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
total: jobs.length,
data: jobStatus === "completed" ? data : null,
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
});
} catch (error) {
Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -0,0 +1,47 @@
import { crawlController } from '../crawl'
import { Request, Response } from 'express';
import { authenticateUser } from '../auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create';
import { validateIdempotencyKey } from '../../services/idempotency/validate';
import { v4 as uuidv4 } from 'uuid';
jest.mock('../auth', () => ({
authenticateUser: jest.fn().mockResolvedValue({
success: true,
team_id: 'team123',
error: null,
status: 200
}),
reduce: jest.fn()
}));
jest.mock('../../services/idempotency/validate');
describe('crawlController', () => {
it('should prevent duplicate requests using the same idempotency key', async () => {
const req = {
headers: {
'x-idempotency-key': await uuidv4(),
'Authorization': `Bearer ${process.env.TEST_API_KEY}`
},
body: {
url: 'https://mendable.ai'
}
} as unknown as Request;
const res = {
status: jest.fn().mockReturnThis(),
json: jest.fn()
} as unknown as Response;
// Mock the idempotency key validation to return false for the second call
(validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false);
// First request should succeed
await crawlController(req, res);
expect(res.status).not.toHaveBeenCalledWith(409);
// Second request with the same key should fail
await crawlController(req, res);
expect(res.status).toHaveBeenCalledWith(409);
expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' });
});
});
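// For reference, a minimal client-side sketch of the behaviour exercised above, written as a
// comment so it stays out of the test run. The endpoint URL is illustrative; the header name
// and payload mirror the request object built in the test.
//
//   const key = uuidv4();
//   const call = () =>
//     fetch("https://api.firecrawl.dev/v0/crawl", {
//       method: "POST",
//       headers: {
//         "Content-Type": "application/json",
//         Authorization: `Bearer ${process.env.TEST_API_KEY}`,
//         "x-idempotency-key": key,
//       },
//       body: JSON.stringify({ url: "https://mendable.ai" }),
//     });
//   await call();                 // first request is accepted
//   const second = await call();  // second.status should be 409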

View File

@ -0,0 +1,64 @@
import { url } from "../types";
describe("URL Schema Validation", () => {
beforeEach(() => {
jest.resetAllMocks();
});
it("should prepend http:// to URLs without a protocol", () => {
const result = url.parse("example.com");
expect(result).toBe("http://example.com");
});
it("should allow valid URLs with http or https", () => {
expect(() => url.parse("http://example.com")).not.toThrow();
expect(() => url.parse("https://example.com")).not.toThrow();
});
it("should allow valid URLs with http or https", () => {
expect(() => url.parse("example.com")).not.toThrow();
});
it("should reject URLs with unsupported protocols", () => {
expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL");
});
it("should reject URLs without a valid top-level domain", () => {
expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path");
});
it("should reject blocked URLs", () => {
expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
});
it("should handle URLs with subdomains correctly", () => {
expect(() => url.parse("http://sub.example.com")).not.toThrow();
expect(() => url.parse("https://blog.example.com")).not.toThrow();
});
it("should handle URLs with paths correctly", () => {
expect(() => url.parse("http://example.com/path")).not.toThrow();
expect(() => url.parse("https://example.com/another/path")).not.toThrow();
});
it("should handle URLs with subdomains that are blocked", () => {
expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
});
it("should handle URLs with paths that are blocked", () => {
expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
});
it("should reject malformed URLs starting with 'http://http'", () => {
expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. Invalid protocol.");
});
it("should reject malformed URLs containing multiple 'http://'", () => {
expect(() => url.parse("http://example.com/http://example.com")).not.toThrow();
});
it("should reject malformed URLs containing multiple 'http://'", () => {
expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
});
})

View File

@ -0,0 +1,58 @@
import { Request, Response } from "express";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../types";
import { supabase_service } from "../../services/supabase";
import { Logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node";
export async function crawlCancelController(req: Request, res: Response) {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
if (!success) {
return res.status(status).json({ error });
}
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ error: "Job not found" });
}
// check if the job belongs to the team
if (useDbAuthentication) {
const { data, error: supaError } = await supabase_service
.from("bulljobs_teams")
.select("*")
.eq("job_id", req.params.jobId)
.eq("team_id", team_id);
if (supaError) {
return res.status(500).json({ error: supaError.message });
}
if (data.length === 0) {
return res.status(403).json({ error: "Unauthorized" });
}
}
try {
sc.cancelled = true;
await saveCrawl(req.params.jobId, sc);
} catch (error) {
Logger.error(error);
}
res.json({
status: "cancelled"
});
} catch (error) {
Sentry.captureException(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

View File

@ -0,0 +1,159 @@
import { authMiddleware } from "../../routes/v1";
import { RateLimiterMode } from "../../types";
import { authenticateUser } from "../auth";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../lib/logger";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { getJob, getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
type ErrorMessage = {
type: "error",
error: string,
}
type CatchupMessage = {
type: "catchup",
data: CrawlStatusResponse,
}
type DocumentMessage = {
type: "document",
data: Document,
}
type DoneMessage = { type: "done" }
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
function send(ws: WebSocket, msg: Message) {
if (ws.readyState === 1) {
return new Promise((resolve, reject) => {
ws.send(JSON.stringify(msg), (err) => {
if (err) reject(err);
else resolve(null);
});
});
}
}
function close(ws: WebSocket, code: number, msg: Message) {
if (ws.readyState <= 1) {
ws.close(code, JSON.stringify(msg));
}
}
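// Protocol sketch: on connect the server sends one "catchup" message with the crawl's current
// status and already-finished documents, then streams a "document" message per newly completed
// job (job states are polled every second), and finally closes with a "done" message, or an
// "error" message if a job failed.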
async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return close(ws, 1008, { type: "error", error: "Job not found" });
}
if (sc.team_id !== req.auth.team_id) {
return close(ws, 3003, { type: "error", error: "Forbidden" });
}
let doneJobIDs = [];
let finished = false;
const loop = async () => {
if (finished) return;
const jobIDs = await getCrawlJobs(req.params.jobId);
if (jobIDs.length === doneJobIDs.length) {
return close(ws, 1000, { type: "done" });
}
const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
for (const jobID of newlyDoneJobIDs) {
const job = await getJob(jobID);
if (job.returnvalue) {
send(ws, {
type: "document",
data: legacyDocumentConverter(job.returnvalue),
})
} else {
return close(ws, 3000, { type: "error", error: job.failedReason });
}
}
setTimeout(loop, 1000);
};
setTimeout(loop, 1000);
doneJobIDs = await getDoneJobsOrdered(req.params.jobId);
const jobIDs = await getCrawlJobs(req.params.jobId);
const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
const doneJobs = await getJobs(doneJobIDs);
const data = doneJobs.map(x => x.returnvalue);
send(ws, {
type: "catchup",
data: {
status,
total: jobIDs.length,
completed: doneJobIDs.length,
creditsUsed: jobIDs.length,
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
data: data.map(x => legacyDocumentConverter(x)),
}
});
if (status !== "scraping") {
finished = true;
return close(ws, 1000, { type: "done" });
}
}
// Basically just middleware and error wrapping
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
try {
const { success, team_id, error, status, plan } = await authenticateUser(
req,
null,
RateLimiterMode.CrawlStatus,
);
if (!success) {
return close(ws, 3000, {
type: "error",
error,
});
}
req.auth = { team_id, plan };
await crawlStatusWS(ws, req);
} catch (err) {
Sentry.captureException(err);
const id = uuidv4();
let verbose = JSON.stringify(err);
if (verbose === "{}") {
if (err instanceof Error) {
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack,
});
}
}
Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
return close(ws, 1011, {
type: "error",
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
});
}
}

View File

@ -0,0 +1,116 @@
import { Response } from "express";
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
export async function getJob(id: string) {
const job = await getScrapeQueue().getJob(id);
if (!job) return job;
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(id);
if (supabaseData) {
job.returnvalue = supabaseData.docs;
}
}
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
return job;
}
export async function getJobs(ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsById(ids);
supabaseData.forEach(x => {
const job = jobs.find(y => y.id === x.job_id);
if (job) {
job.returnvalue = x.docs;
}
})
}
jobs.forEach(job => {
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
});
return jobs;
}
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
}
if (sc.team_id !== req.auth.team_id) {
return res.status(403).json({ success: false, error: "Forbidden" });
}
const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined;
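// Pagination: `skip` is the index of the first completed job to return and `limit` caps the page
// size. `end` is inclusive and stays undefined when no limit is given, in which case the response
// is bounded by the ~10 MiB size check below instead.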
const jobIDs = await getCrawlJobs(req.params.jobId);
const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
let doneJobs = [];
if (end === undefined) { // no limit given: cap the response at ~10 MiB instead
let bytes = 0;
const bytesLimit = 10485760; // 10 MiB in bytes
const factor = 100; // chunking for faster retrieval
for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) {
// get current chunk and retrieve jobs
const currentIDs = doneJobsOrder.slice(i, i+factor);
const jobs = await getJobs(currentIDs);
// iterate through jobs and add them one by one to the byte counter
// both loops break once we exceed the byte limit
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
const job = jobs[ii];
doneJobs.push(job);
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
}
}
// if we ran over the bytes limit, remove the last document
if (bytes > bytesLimit) {
doneJobs.splice(doneJobs.length - 1, 1);
}
} else {
doneJobs = await getJobs(doneJobsOrder);
}
const data = doneJobs.map(x => x.returnvalue);
const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
nextURL.searchParams.set("skip", (start + data.length).toString());
if (typeof req.query.limit === "string") {
nextURL.searchParams.set("limit", req.query.limit);
}
res.status(200).json({
status,
completed: doneJobsLength,
total: jobIDs.length,
creditsUsed: jobIDs.length,
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
next:
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
? undefined
: nextURL.href,
data: data.map(x => legacyDocumentConverter(x)),
});
}

View File

@ -0,0 +1,157 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
CrawlRequest,
crawlRequestSchema,
CrawlResponse,
legacyCrawlerOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
addCrawlJob,
addCrawlJobs,
crawlToCrawler,
lockURL,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
import { Logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
export async function crawlController(
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
res: Response<CrawlResponse>
) {
req.body = crawlRequestSchema.parse(req.body);
const id = uuidv4();
await logCrawl(id, req.auth.team_id);
const { remainingCredits } = req.account;
const crawlerOptions = legacyCrawlerOptions(req.body);
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
// TODO: @rafa, is this right? copied from v0
if (Array.isArray(crawlerOptions.includes)) {
for (const x of crawlerOptions.includes) {
try {
new RegExp(x);
} catch (e) {
return res.status(400).json({ success: false, error: e.message });
}
}
}
if (Array.isArray(crawlerOptions.excludes)) {
for (const x of crawlerOptions.excludes) {
try {
new RegExp(x);
} catch (e) {
return res.status(400).json({ success: false, error: e.message });
}
}
}
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions,
pageOptions,
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
};
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
} catch (e) {
Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
e
)}`
);
}
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions.ignoreSitemap
? null
: await crawler.tryGetSitemap();
if (sitemap !== null && sitemap.length > 0) {
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if(sitemap.length > 1000){
// set base to 21
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
}
const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
team_id: req.auth.team_id,
crawlerOptions,
pageOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
v1: true,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});
await lockURLs(
id,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
await getScrapeQueue().addBulk(jobs);
} else {
await lockURL(id, sc, req.body.url);
const job = await addScrapeJob(
{
url: req.body.url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: req.auth.team_id,
pageOptions: pageOptions,
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
v1: true,
},
{
priority: 15,
}
);
await addCrawlJob(id, job.id);
}
return res.status(200).json({
success: true,
id,
url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
});
}
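// A client-side sketch of the flow this controller implements: POST the crawl, then poll the
// returned status URL until it leaves "scraping". The host and the API-key environment variable
// name are illustrative, not part of this codebase.
//
//   const started = await fetch("https://api.firecrawl.dev/v1/crawl", {
//     method: "POST",
//     headers: {
//       "Content-Type": "application/json",
//       Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
//     },
//     body: JSON.stringify({ url: "https://example.com" }),
//   });
//   const { id, url } = await started.json(); // url -> /v1/crawl/{id}
//   const status = await (
//     await fetch(url, { headers: { Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}` } })
//   ).json();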

View File

@ -0,0 +1,6 @@
import { Request, Response } from "express";
export async function livenessController(req: Request, res: Response) {
// TODO: add liveness checks, e.g. verify that the Redis connection is healthy
res.status(200).json({ status: "ok" });
}

View File

@ -0,0 +1,130 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
legacyCrawlerOptions,
mapRequestSchema,
RequestWithAuth,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
import {
checkAndUpdateURLForMap,
isSameDomain,
isSameSubdomain,
removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { performCosineSimilarity } from "../../lib/map-cosine";
configDotenv();
export async function mapController(
req: RequestWithAuth<{}, MapResponse, MapRequest>,
res: Response<MapResponse>
) {
const startTime = new Date().getTime();
req.body = mapRequestSchema.parse(req.body);
const limit = req.body.limit;
const id = uuidv4();
let links: string[] = [req.body.url];
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions: legacyCrawlerOptions(req.body),
pageOptions: {},
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
};
const crawler = crawlToCrawler(id, sc);
const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();
if (sitemap !== null) {
sitemap.map((x) => {
links.push(x.url);
});
}
let urlWithoutWww = req.body.url.replace("www.", "");
let mapUrl = req.body.search
? `"${req.body.search}" site:${urlWithoutWww}`
: `site:${req.body.url}`;
// www. seems to exclude subdomains in some cases
const mapResults = await fireEngineMap(mapUrl, {
// limit to 50 results (beta)
numResults: Math.min(limit, 50),
});
if (mapResults.length > 0) {
if (req.body.search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
} else {
mapResults.map((x) => {
links.push(x.url);
});
}
}
// Perform cosine similarity between the search query and the list of links
if (req.body.search) {
const searchQuery = req.body.search.toLowerCase();
links = performCosineSimilarity(links, searchQuery);
}
links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.url));
// if includeSubdomains is false, filter out subdomains
if (!req.body.includeSubdomains) {
links = links.filter((x) => isSameSubdomain(x, req.body.url));
}
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
await billTeam(req.auth.team_id, 1);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const linksToReturn = links.slice(0, limit);
logJob({
job_id: id,
success: links.length > 0,
message: "Map completed",
num_docs: links.length,
docs: linksToReturn,
time_taken: timeTakenInSeconds,
team_id: req.auth.team_id,
mode: "map",
url: req.body.url,
crawlerOptions: {},
pageOptions: {},
origin: req.body.origin,
extractor_options: { mode: "markdown" },
num_tokens: 0,
});
return res.status(200).json({
success: true,
links: linksToReturn,
});
}
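A request/response sketch for the endpoint above. The body follows mapRequestSchema (note the 1-50 bound on limit); host and key are placeholders:
// Hypothetical call to POST /v1/map; host and key are placeholders.
const res = await fetch("http://localhost:3002/v1/map", {
  method: "POST",
  headers: { "Content-Type": "application/json", Authorization: "Bearer fc-YOUR-KEY" },
  body: JSON.stringify({
    url: "https://example.com",
    search: "docs",           // optional: ranks links by cosine similarity to the query
    includeSubdomains: false, // drop links that are not on the exact same subdomain
    limit: 30,
  }),
});
const body = await res.json();
if (body.success) console.log(body.links.slice(0, 5));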

View File

@ -0,0 +1,6 @@
import { Request, Response } from "express";
export async function readinessController(req: Request, res: Response) {
// TODO: add checks that verify the application is ready to serve traffic
res.status(200).json({ status: "ok" });
}

View File

@ -0,0 +1,109 @@
import { Request, Response } from "express";
import { Logger } from '../../lib/logger';
import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from 'uuid';
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
req.body = scrapeRequestSchema.parse(req.body);
let earlyReturn = false;
const origin = req.body.origin;
const timeout = req.body.timeout;
const pageOptions = legacyScrapeOptions(req.body);
const jobId = uuidv4();
const startTime = new Date().getTime();
const jobPriority = await getJobPriority({plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10})
const job = await addScrapeJob({
url: req.body.url,
mode: "single_urls",
crawlerOptions: {},
team_id: req.auth.team_id,
pageOptions,
extractorOptions: {},
origin: req.body.origin,
is_scrape: true,
}, {}, jobId, jobPriority);
let doc: any | undefined;
try {
doc = (await waitForJob(job.id, timeout))[0];
} catch (e) {
Logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && e.message.startsWith("Job wait")) {
return res.status(408).json({
success: false,
error: "Request timed out",
});
} else {
return res.status(500).json({
success: false,
error: "Internal server error",
});
}
}
await job.remove();
if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
return res.status(200).json({
success: true,
warning: "No page found",
data: doc
});
}
delete doc.index;
delete doc.provider;
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
let creditsToBeBilled = 1; // Assuming 1 credit per document
if (earlyReturn) {
// Don't bill if we're early returning
return;
}
const billingResult = await billTeam(
req.auth.team_id,
creditsToBeBilled
);
if (!billingResult.success) {
return res.status(402).json({
success: false,
error: "Failed to bill team. Insufficient credits or subscription not found.",
});
}
logJob({
job_id: jobId,
success: true,
message: "Scrape completed",
num_docs: 1,
docs: [doc],
time_taken: timeTakenInSeconds,
team_id: req.auth.team_id,
mode: "scrape",
url: req.body.url,
crawlerOptions: {},
pageOptions: pageOptions,
origin: origin,
extractor_options: { mode: "markdown" },
num_tokens: numTokens,
});
return res.status(200).json({
success: true,
data: legacyDocumentConverter(doc),
});
}
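The synchronous scrape path returns one Document (via legacyDocumentConverter) or a 408 when waitForJob exceeds the timeout. A minimal client sketch with placeholder host and key:
// Hypothetical call to POST /v1/scrape; host and key are placeholders.
const res = await fetch("http://localhost:3002/v1/scrape", {
  method: "POST",
  headers: { "Content-Type": "application/json", Authorization: "Bearer fc-YOUR-KEY" },
  body: JSON.stringify({
    url: "https://example.com",
    formats: ["markdown", "links"],
    onlyMainContent: true,
    timeout: 30000,
  }),
});
if (res.status === 408) {
  console.error("Request timed out");
} else {
  const body = await res.json();
  if (body.success) console.log(body.data.metadata.statusCode, body.data.markdown?.length);
}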

View File

@ -0,0 +1,321 @@
import { Request, Response } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types";
export type Format =
| "markdown"
| "html"
| "rawHtml"
| "links"
| "screenshot"
| "screenshot@fullPage";
export const url = z.preprocess(
(x) => {
if (!protocolIncluded(x as string)) {
return `http://${x}`;
}
return x;
},
z
.string()
.url()
.regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine(
(x) => /\.[a-z]{2,}(\/|$)/i.test(x),
"URL must have a valid top-level domain or be a valid path"
)
.refine(
(x) => checkUrl(x as string),
"Invalid URL"
)
.refine(
(x) => !isUrlBlocked(x as string),
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
)
);
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
export const scrapeOptions = z.object({
formats: z
.enum([
"markdown",
"html",
"rawHtml",
"links",
"screenshot",
"screenshot@fullPage",
])
.array()
.optional()
.default(["markdown"]),
headers: z.record(z.string(), z.string()).optional(),
includeTags: z.string().array().optional(),
excludeTags: z.string().array().optional(),
onlyMainContent: z.boolean().default(true),
timeout: z.number().int().positive().finite().safe().default(30000), // default?
waitFor: z.number().int().nonnegative().finite().safe().default(0),
parsePDF: z.boolean().default(true),
}).strict(strictMessage);
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
export const scrapeRequestSchema = scrapeOptions.extend({
url,
origin: z.string().optional().default("api"),
}).strict(strictMessage);
// export type ScrapeRequest = {
// url: string;
// formats?: Format[];
// headers?: { [K: string]: string };
// includeTags?: string[];
// excludeTags?: string[];
// onlyMainContent?: boolean;
// timeout?: number;
// waitFor?: number;
// }
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
const crawlerOptions = z.object({
includePaths: z.string().array().default([]),
excludePaths: z.string().array().default([]),
maxDepth: z.number().default(10), // default?
limit: z.number().default(10000), // default?
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false),
ignoreSitemap: z.boolean().default(true),
}).strict(strictMessage);
// export type CrawlerOptions = {
// includePaths?: string[];
// excludePaths?: string[];
// maxDepth?: number;
// limit?: number;
// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
// allowExternalLinks?: boolean;
// ignoreSitemap?: boolean;
// };
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
export const crawlRequestSchema = crawlerOptions.extend({
url,
origin: z.string().optional().default("api"),
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
webhook: z.string().url().optional(),
limit: z.number().default(10000),
}).strict(strictMessage);
// export type CrawlRequest = {
// url: string;
// crawlerOptions?: CrawlerOptions;
// scrapeOptions?: Exclude<ScrapeRequest, "url">;
// };
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions.extend({
url,
origin: z.string().optional().default("api"),
includeSubdomains: z.boolean().default(true),
search: z.string().optional(),
ignoreSitemap: z.boolean().default(false),
limit: z.number().min(1).max(50).default(5000).optional(),
}).strict(strictMessage);
// export type MapRequest = {
// url: string;
// crawlerOptions?: CrawlerOptions;
// };
export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = {
markdown?: string;
html?: string;
rawHtml?: string;
links?: string[];
screenshot?: string;
metadata: {
title?: string;
description?: string;
language?: string;
keywords?: string;
robots?: string;
ogTitle?: string;
ogDescription?: string;
ogUrl?: string;
ogImage?: string;
ogAudio?: string;
ogDeterminer?: string;
ogLocale?: string;
ogLocaleAlternate?: string[];
ogSiteName?: string;
ogVideo?: string;
dcTermsCreated?: string;
dcDateCreated?: string;
dcDate?: string;
dcTermsType?: string;
dcType?: string;
dcTermsAudience?: string;
dcTermsSubject?: string;
dcSubject?: string;
dcDescription?: string;
dcTermsKeywords?: string;
modifiedTime?: string;
publishedTime?: string;
articleTag?: string;
articleSection?: string;
sourceURL?: string;
statusCode?: number;
error?: string;
};
};
export type ErrorResponse = {
success: false;
error: string;
details?: any;
};
export type ScrapeResponse =
| ErrorResponse
| {
success: true;
warning?: string;
data: Document;
};
export interface ScrapeResponseRequestTest {
statusCode: number;
body: ScrapeResponse;
error?: string;
}
export type CrawlResponse =
| ErrorResponse
| {
success: true;
id: string;
url: string;
};
export type MapResponse =
| ErrorResponse
| {
success: true;
links: string[];
};
export type CrawlStatusParams = {
jobId: string;
};
export type CrawlStatusResponse =
| ErrorResponse
| {
status: "scraping" | "completed" | "failed" | "cancelled";
completed: number;
total: number;
creditsUsed: number;
expiresAt: string;
next?: string;
data: Document[];
};
type AuthObject = {
team_id: string;
plan: PlanType;
};
type Account = {
remainingCredits: number;
};
export interface RequestWithMaybeAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
> extends Request<ReqParams, ReqBody, ResBody> {
auth?: AuthObject;
account?: Account;
}
export interface RequestWithAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined,
> extends Request<ReqParams, ReqBody, ResBody> {
auth: AuthObject;
account?: Account;
}
export interface ResponseWithSentry<
ResBody = undefined,
> extends Response<ResBody> {
sentry?: string,
}
export function legacyCrawlerOptions(x: CrawlerOptions) {
return {
includes: x.includePaths,
excludes: x.excludePaths,
maxCrawledLinks: x.limit,
maxCrawledDepth: x.maxDepth,
limit: x.limit,
generateImgAltText: false,
allowBackwardCrawling: x.allowBackwardLinks,
allowExternalContentLinks: x.allowExternalLinks,
};
}
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
return {
includeMarkdown: x.formats.includes("markdown"),
includeHtml: x.formats.includes("html"),
includeRawHtml: x.formats.includes("rawHtml"),
onlyIncludeTags: x.includeTags,
removeTags: x.excludeTags,
onlyMainContent: x.onlyMainContent,
waitFor: x.waitFor,
includeLinks: x.formats.includes("links"),
screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF,
};
}
export function legacyDocumentConverter(doc: any): Document {
if (doc.metadata) {
if (doc.metadata.screenshot) {
doc.screenshot = doc.metadata.screenshot;
delete doc.metadata.screenshot;
}
if (doc.metadata.fullPageScreenshot) {
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
delete doc.metadata.fullPageScreenshot;
}
}
return {
markdown: doc.markdown,
links: doc.linksOnPage,
rawHtml: doc.rawHtml,
html: doc.html,
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
metadata: {
...doc.metadata,
pageError: undefined,
pageStatusCode: undefined,
error: doc.metadata.pageError,
statusCode: doc.metadata.pageStatusCode,
},
};
}
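A small sketch of how these schemas behave at the edges: the url preprocessor prepends http:// to bare domains, scrapeOptions fills in its defaults, and .strict() turns unknown keys into a ZodError (which the global error handler maps to a 400). The import path is assumed:
import { scrapeRequestSchema } from "./types"; // assumed import path
// Bare domain: rewritten to http://example.com, defaults filled in.
const ok = scrapeRequestSchema.parse({ url: "example.com" });
console.log(ok.url, ok.formats, ok.timeout); // "http://example.com" [ "markdown" ] 30000
// Unknown key: rejected by .strict(strictMessage).
const bad = scrapeRequestSchema.safeParse({ url: "example.com", foo: 1 });
console.log(bad.success); // false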

View File

@ -1,8 +1,10 @@
import express from "express"; import "dotenv/config";
import "./services/sentry"
import * as Sentry from "@sentry/node";
import express, { NextFunction, Request, Response } from "express";
import bodyParser from "body-parser"; import bodyParser from "body-parser";
import cors from "cors"; import cors from "cors";
import "dotenv/config"; import { getScrapeQueue } from "./services/queue-service";
import { getWebScraperQueue } from "./services/queue-service";
import { v0Router } from "./routes/v0"; import { v0Router } from "./routes/v0";
import { initSDK } from "@hyperdx/node-opentelemetry"; import { initSDK } from "@hyperdx/node-opentelemetry";
import cluster from "cluster"; import cluster from "cluster";
@ -13,6 +15,12 @@ import { ScrapeEvents } from "./lib/scrape-events";
import http from 'node:http'; import http from 'node:http';
import https from 'node:https'; import https from 'node:https';
import CacheableLookup from 'cacheable-lookup'; import CacheableLookup from 'cacheable-lookup';
import { v1Router } from "./routes/v1";
import expressWs from "express-ws";
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
import { ZodError } from "zod";
import { v4 as uuidv4 } from "uuid";
const { createBullBoard } = require("@bull-board/api"); const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { BullAdapter } = require("@bull-board/api/bullAdapter");
@ -45,7 +53,8 @@ if (cluster.isMaster) {
} }
}); });
} else { } else {
const app = express(); const ws = expressWs(express());
const app = ws.app;
global.isProduction = process.env.IS_PRODUCTION === "true"; global.isProduction = process.env.IS_PRODUCTION === "true";
@ -58,7 +67,7 @@ if (cluster.isMaster) {
serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
queues: [new BullAdapter(getWebScraperQueue())], queues: [new BullAdapter(getScrapeQueue())],
serverAdapter: serverAdapter, serverAdapter: serverAdapter,
}); });
@ -78,6 +87,7 @@ if (cluster.isMaster) {
// register router // register router
app.use(v0Router); app.use(v0Router);
app.use("/v1", v1Router);
app.use(adminRouter); app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002; const DEFAULT_PORT = process.env.PORT ?? 3002;
@ -104,9 +114,9 @@ if (cluster.isMaster) {
app.get(`/serverHealthCheck`, async (req, res) => { app.get(`/serverHealthCheck`, async (req, res) => {
try { try {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const [waitingJobs] = await Promise.all([ const [waitingJobs] = await Promise.all([
webScraperQueue.getWaitingCount(), scrapeQueue.getWaitingCount(),
]); ]);
const noWaitingJobs = waitingJobs === 0; const noWaitingJobs = waitingJobs === 0;
@ -115,6 +125,7 @@ if (cluster.isMaster) {
waitingJobs, waitingJobs,
}); });
} catch (error) { } catch (error) {
Sentry.captureException(error);
Logger.error(error); Logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
@ -126,9 +137,9 @@ if (cluster.isMaster) {
const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
const getWaitingJobsCount = async () => { const getWaitingJobsCount = async () => {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const [waitingJobsCount] = await Promise.all([ const [waitingJobsCount] = await Promise.all([
webScraperQueue.getWaitingCount(), scrapeQueue.getWaitingCount(),
]); ]);
return waitingJobsCount; return waitingJobsCount;
@ -166,6 +177,7 @@ if (cluster.isMaster) {
}, timeout); }, timeout);
} }
} catch (error) { } catch (error) {
Sentry.captureException(error);
Logger.debug(error); Logger.debug(error);
} }
}; };
@ -178,16 +190,42 @@ if (cluster.isMaster) {
res.send({ isProduction: global.isProduction }); res.send({ isProduction: global.isProduction });
}); });
Sentry.setupExpressErrorHandler(app);
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry<ErrorResponse>, next: NextFunction) => {
if (err instanceof ZodError) {
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
} else {
const id = res.sentry ?? uuidv4();
let verbose = JSON.stringify(err);
if (verbose === "{}") {
if (err instanceof Error) {
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack,
});
}
}
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
}
});
Logger.info(`Worker ${process.pid} started`); Logger.info(`Worker ${process.pid} started`);
} }
const wsq = getWebScraperQueue();
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); // const sq = getScrapeQueue();
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed")); // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));

View File

@ -46,7 +46,7 @@ export async function generateCompletions(
return completionResult; return completionResult;
} catch (error) { } catch (error) {
Logger.error(`Error generating completions: ${error}`); Logger.error(`Error generating completions: ${error}`);
throw new Error(`Error generating completions: ${error.message}`); throw error;
} }
default: default:
throw new Error("Invalid client"); throw new Error("Invalid client");

View File

@ -15,7 +15,7 @@ const defaultPrompt =
function prepareOpenAIDoc( function prepareOpenAIDoc(
document: Document, document: Document,
mode: "markdown" | "raw-html" mode: "markdown" | "raw-html"
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] { ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
let markdown = document.markdown; let markdown = document.markdown;
@ -27,9 +27,10 @@ function prepareOpenAIDoc(
// Check if the markdown content exists in the document // Check if the markdown content exists in the document
if (!extractionTarget) { if (!extractionTarget) {
throw new Error( return null;
`${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai` // throw new Error(
); // `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
// );
} }
@ -64,7 +65,16 @@ export async function generateOpenAICompletions({
mode: "markdown" | "raw-html"; mode: "markdown" | "raw-html";
}): Promise<Document> { }): Promise<Document> {
const openai = client as OpenAI; const openai = client as OpenAI;
const [content, numTokens] = prepareOpenAIDoc(document, mode); const preparedDoc = prepareOpenAIDoc(document, mode);
if (preparedDoc === null) {
return {
...document,
warning: "LLM extraction was not performed since the document's content is empty or missing.",
};
}
const [content, numTokens] = preparedDoc;
const completion = await openai.chat.completions.create({ const completion = await openai.chat.completions.create({
model, model,

View File

@ -0,0 +1,134 @@
import {
getJobPriority,
addJobPriority,
deleteJobPriority,
} from "../job-priority";
import { redisConnection } from "../../services/queue-service";
import { PlanType } from "../../types";
jest.mock("../../services/queue-service", () => ({
redisConnection: {
sadd: jest.fn(),
srem: jest.fn(),
scard: jest.fn(),
expire: jest.fn(),
},
}));
describe("Job Priority Tests", () => {
afterEach(() => {
jest.clearAllMocks();
});
test("addJobPriority should add job_id to the set and set expiration", async () => {
const team_id = "team1";
const job_id = "job1";
await addJobPriority(team_id, job_id);
expect(redisConnection.sadd).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
job_id
);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
);
});
test("deleteJobPriority should remove job_id from the set", async () => {
const team_id = "team1";
const job_id = "job1";
await deleteJobPriority(team_id, job_id);
expect(redisConnection.srem).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
job_id
);
});
test("getJobPriority should return correct priority based on plan and set length", async () => {
const team_id = "team1";
const plan: PlanType = "standard";
(redisConnection.scard as jest.Mock).mockResolvedValue(150);
const priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(10);
(redisConnection.scard as jest.Mock).mockResolvedValue(250);
const priorityExceeded = await getJobPriority({ plan, team_id });
expect(priorityExceeded).toBe(20); // basePriority + Math.ceil((250 - 200) * 0.4)
});
test("getJobPriority should handle different plans correctly", async () => {
const team_id = "team1";
(redisConnection.scard as jest.Mock).mockResolvedValue(50);
let plan: PlanType = "hobby";
let priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(10);
(redisConnection.scard as jest.Mock).mockResolvedValue(150);
plan = "hobby";
priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(25); // basePriority + Math.ceil((150 - 50) * 0.3)
(redisConnection.scard as jest.Mock).mockResolvedValue(25);
plan = "free";
priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(10);
(redisConnection.scard as jest.Mock).mockResolvedValue(60);
plan = "free";
priority = await getJobPriority({ plan, team_id });
expect(priority).toBe(28); // basePriority + Math.ceil((60 - 25) * 0.5)
});
test("addJobPriority should reset expiration time when adding new job", async () => {
const team_id = "team1";
const job_id1 = "job1";
const job_id2 = "job2";
await addJobPriority(team_id, job_id1);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
);
// Clear the mock calls
(redisConnection.expire as jest.Mock).mockClear();
// Add another job
await addJobPriority(team_id, job_id2);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
);
});
test("Set should expire after 60 seconds", async () => {
const team_id = "team1";
const job_id = "job1";
jest.useFakeTimers();
await addJobPriority(team_id, job_id);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
);
// Fast-forward time by 59 seconds
jest.advanceTimersByTime(59000);
// The set should still exist
expect(redisConnection.scard).not.toHaveBeenCalled();
// Fast-forward time by 2 more seconds (total 61 seconds)
jest.advanceTimersByTime(2000);
// Check if the set has been removed (scard should return 0)
(redisConnection.scard as jest.Mock).mockResolvedValue(0);
const setSize = await redisConnection.scard(`limit_team_id:${team_id}`);
expect(setSize).toBe(0);
jest.useRealTimers();
});
});

View File

@ -0,0 +1,32 @@
import { checkTeamCredits } from "../services/billing/credit_billing";
import { Logger } from "./logger";
type checkCreditsResponse = {
status: number;
error: string | null;
}
export const checkCredits = async (team_id: string): Promise<checkCreditsResponse> => {
try {
const {
success: creditsCheckSuccess,
message: creditsCheckMessage
} = await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
return {
status: 402,
error: "Insufficient credits"
};
}
} catch (error) {
Logger.error(error);
return {
status: 500,
error: "Error checking team credits. Please contact hello@firecrawl.com for help."
};
}
return {
status: 200,
error: null
}
};
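A usage sketch for the helper above; the handler and import path are hypothetical, only the { status, error } contract comes from the code:
import { Response } from "express";
import { checkCredits } from "../lib/check-credits"; // assumed path
// Hypothetical handler: bail out before doing billable work.
async function billableHandler(team_id: string, res: Response) {
  const { status, error } = await checkCredits(team_id);
  if (error) {
    return res.status(status).json({ success: false, error });
  }
  // ... proceed with the billable operation ...
  return res.status(200).json({ success: true });
}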

View File

@ -0,0 +1,124 @@
import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service";
export type StoredCrawl = {
originUrl: string;
crawlerOptions: any;
pageOptions: any;
team_id: string;
plan: string;
robots?: string;
cancelled?: boolean;
createdAt: number;
};
export async function saveCrawl(id: string, crawl: StoredCrawl) {
await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
}
export async function getCrawl(id: string): Promise<StoredCrawl | null> {
const x = await redisConnection.get("crawl:" + id);
if (x === null) {
return null;
}
return JSON.parse(x);
}
export async function getCrawlExpiry(id: string): Promise<Date> {
const d = new Date();
const ttl = await redisConnection.pttl("crawl:" + id);
d.setMilliseconds(d.getMilliseconds() + ttl);
d.setMilliseconds(0);
return d;
}
export async function addCrawlJob(id: string, job_id: string) {
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
}
export async function addCrawlJobs(id: string, job_ids: string[]) {
await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
}
export async function addCrawlJobDone(id: string, job_id: string) {
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id);
await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
}
export async function getDoneJobsOrderedLength(id: string): Promise<number> {
return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
}
export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end);
}
export async function isCrawlFinished(id: string) {
return (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs"));
}
export async function isCrawlFinishedLocked(id: string) {
return (await redisConnection.exists("crawl:" + id + ":finish"));
}
export async function finishCrawl(id: string) {
if (await isCrawlFinished(id)) {
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
if (set === 1) {
await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
}
return set === 1
}
}
export async function getCrawlJobs(id: string): Promise<string[]> {
return await redisConnection.smembers("crawl:" + id + ":jobs");
}
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
if (typeof sc.crawlerOptions?.limit === "number") {
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
return false;
}
}
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
return res;
}
/// NOTE: does not check the crawl limit. Only use when the limit was checked beforehand, e.g. for sitemap URLs.
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
return res;
}
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
const crawler = new WebCrawler({
jobId: id,
initialUrl: sc.originUrl,
includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10,
limit: sc.crawlerOptions?.limit ?? 10000,
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
});
if (sc.robots !== undefined) {
try {
crawler.importRobotsTxt(sc.robots);
} catch (_) {}
}
return crawler;
}
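For orientation, the per-crawl Redis keys used above and a minimal lifecycle sketch; every key carries a 24-hour TTL, and the ids, URL and StoredCrawl values below are illustrative:
// Key layout per crawl id:
//   crawl:<id>                    JSON-serialized StoredCrawl
//   crawl:<id>:jobs               set of every scrape job id in the crawl
//   crawl:<id>:jobs_done          set of finished job ids
//   crawl:<id>:jobs_done_ordered  list of finished job ids in completion order
//   crawl:<id>:visited            set of URLs already locked/claimed
//   crawl:<id>:finish             SETNX marker so finishCrawl fires only once
const sc: StoredCrawl = {
  originUrl: "https://example.com",
  crawlerOptions: { limit: 100 },
  pageOptions: {},
  team_id: "team-1",
  plan: "hobby",
  createdAt: Date.now(),
};
await saveCrawl("crawl-1", sc);
if (await lockURL("crawl-1", sc, "https://example.com/")) {
  await addCrawlJob("crawl-1", "job-1");
}
// ... a worker finishes job-1 ...
await addCrawlJobDone("crawl-1", "job-1");
if (await isCrawlFinished("crawl-1")) {
  await finishCrawl("crawl-1"); // true only for the first caller
}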

View File

@ -1,6 +1,6 @@
export const defaultOrigin = "api"; export const defaultOrigin = "api";
export const defaultTimeout = 45000; // 45 seconds export const defaultTimeout = 60000; // 60 seconds
export const defaultPageOptions = { export const defaultPageOptions = {
onlyMainContent: false, onlyMainContent: false,
@ -12,7 +12,8 @@ export const defaultPageOptions = {
}; };
export const defaultCrawlerOptions = { export const defaultCrawlerOptions = {
allowBackwardCrawling: false allowBackwardCrawling: false,
limit: 10000
} }
export const defaultCrawlPageOptions = { export const defaultCrawlPageOptions = {

View File

@ -11,6 +11,7 @@ export interface Progress {
} }
export type PageOptions = { export type PageOptions = {
includeMarkdown?: boolean;
onlyMainContent?: boolean; onlyMainContent?: boolean;
includeHtml?: boolean; includeHtml?: boolean;
includeRawHtml?: boolean; includeRawHtml?: boolean;
@ -24,6 +25,10 @@ export type PageOptions = {
parsePDF?: boolean; parsePDF?: boolean;
removeTags?: string | string[]; removeTags?: string | string[];
onlyIncludeTags?: string | string[]; onlyIncludeTags?: string | string[];
includeLinks?: boolean;
useFastMode?: boolean; // beta
disableJSDom?: boolean; // beta
atsv?: boolean; // beta
}; };
export type ExtractorOptions = { export type ExtractorOptions = {
@ -65,6 +70,8 @@ export type WebScraperOptions = {
extractorOptions?: ExtractorOptions; extractorOptions?: ExtractorOptions;
concurrentRequests?: number; concurrentRequests?: number;
bullJobId?: string; bullJobId?: string;
priority?: number;
teamId?: string;
}; };
export interface DocumentUrl { export interface DocumentUrl {
@ -141,4 +148,5 @@ export interface FireEngineOptions{
blockMedia?: boolean; blockMedia?: boolean;
blockAds?: boolean; blockAds?: boolean;
disableJsDom?: boolean; disableJsDom?: boolean;
atsv?: boolean; // beta
} }

View File

@ -1,5 +1,5 @@
export function parseMarkdown(html: string) { export async function parseMarkdown(html: string) {
var TurndownService = require("turndown"); var TurndownService = require("turndown");
var turndownPluginGfm = require('joplin-turndown-plugin-gfm') var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
@ -21,7 +21,27 @@ export function parseMarkdown(html: string) {
}); });
var gfm = turndownPluginGfm.gfm; var gfm = turndownPluginGfm.gfm;
turndownService.use(gfm); turndownService.use(gfm);
let markdownContent = turndownService.turndown(html); let markdownContent = "";
const turndownPromise = new Promise<string>((resolve, reject) => {
try {
const result = turndownService.turndown(html);
resolve(result);
} catch (error) {
reject("Error converting HTML to Markdown: " + error);
}
});
const timeoutPromise = new Promise<string>((resolve, reject) => {
const timeout = 5000; // Timeout in milliseconds
setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout);
});
try {
markdownContent = await Promise.race([turndownPromise, timeoutPromise]);
} catch (error) {
console.error(error);
return ""; // Optionally return an empty string or handle the error as needed
}
// multiple line links // multiple line links
let insideLinkContent = false; let insideLinkContent = false;

View File

@ -0,0 +1,91 @@
import { redisConnection } from "../../src/services/queue-service";
import { PlanType } from "../../src/types";
import { Logger } from "./logger";
const SET_KEY_PREFIX = "limit_team_id:";
export async function addJobPriority(team_id: string, job_id: string) {
try {
const setKey = SET_KEY_PREFIX + team_id;
// Add scrape job id to the set
await redisConnection.sadd(setKey, job_id);
// This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
await redisConnection.expire(setKey, 60);
} catch (e) {
Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
}
}
export async function deleteJobPriority(team_id: string, job_id: string) {
try {
const setKey = SET_KEY_PREFIX + team_id;
// remove job_id from the set
await redisConnection.srem(setKey, job_id);
} catch (e) {
Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
}
}
export async function getJobPriority({
plan,
team_id,
basePriority = 10,
}: {
plan: PlanType;
team_id: string;
basePriority?: number;
}): Promise<number> {
try {
const setKey = SET_KEY_PREFIX + team_id;
// Get the length of the set
const setLength = await redisConnection.scard(setKey);
// Determine the priority based on the plan and set length
let planModifier = 1;
let bucketLimit = 0;
switch (plan) {
case "free":
bucketLimit = 25;
planModifier = 0.5;
break;
case "hobby":
bucketLimit = 100;
planModifier = 0.3;
break;
case "standard":
case "standardnew":
bucketLimit = 200;
planModifier = 0.2;
break;
case "growth":
case "growthdouble":
bucketLimit = 400;
planModifier = 0.1;
break;
default:
bucketLimit = 25;
planModifier = 1;
break;
}
// if the set length is within the bucket limit, just return the base priority
if (setLength <= bucketLimit) {
return basePriority;
} else {
// Otherwise, add the overflow above the bucket limit, scaled by the plan modifier, to the base priority
return Math.ceil(
basePriority + Math.ceil((setLength - bucketLimit) * planModifier)
);
}
} catch (e) {
Logger.error(
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
);
return basePriority;
}
}
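A worked example of the formula, matching the "standard" case in the unit tests earlier in this diff; BullMQ treats a larger priority value as lower scheduling priority:
// setLength = 250, bucketLimit = 200, planModifier = 0.2, basePriority = 10
// priority  = 10 + Math.ceil((250 - 200) * 0.2) = 10 + 10 = 20
const priority = await getJobPriority({ plan: "standard", team_id: "team1", basePriority: 10 });
// => 20 while the team's limit_team_id set holds 250 job ids
// The v1 crawl controller passes basePriority 21 for sitemaps with more than
// 1000 URLs, so the same team there would get 21 + 10 = 31.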

View File

@ -0,0 +1,46 @@
import { Logger } from "./logger";
export function performCosineSimilarity(links: string[], searchQuery: string) {
try {
// Function to calculate cosine similarity
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
const magnitude1 = Math.sqrt(
vec1.reduce((sum, val) => sum + val * val, 0)
);
const magnitude2 = Math.sqrt(
vec2.reduce((sum, val) => sum + val * val, 0)
);
if (magnitude1 === 0 || magnitude2 === 0) return 0;
return dotProduct / (magnitude1 * magnitude2);
};
// Project a text onto the query's word counts (bag-of-words over the search terms)
const textToVector = (text: string): number[] => {
const words = searchQuery.toLowerCase().split(/\W+/);
return words.map((word) => {
const count = (text.toLowerCase().match(new RegExp(word, "g")) || [])
.length;
return count / text.length;
});
};
// Calculate similarity scores
const similarityScores = links.map((link) => {
const linkVector = textToVector(link);
const searchVector = textToVector(searchQuery);
return cosineSimilarity(linkVector, searchVector);
});
// Sort links by similarity score in descending order
const a = links
.map((link, index) => ({ link, score: similarityScores[index] }))
.sort((a, b) => b.score - a.score);
links = a.map((item) => item.link);
return links;
} catch (error) {
Logger.error(`Error performing cosine similarity: ${error}`);
return links;
}
}
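A small usage sketch: each link is scored by how often the query words occur in it, so URLs containing the search terms are sorted to the front (import path and URLs are illustrative):
import { performCosineSimilarity } from "./map-cosine"; // assumed import path
const links = [
  "https://example.com/pricing",
  "https://example.com/docs/getting-started",
  "https://example.com/blog/announcing-docs",
];
const ranked = performCosineSimilarity(links, "docs");
// Links containing "docs" score highest, e.g.:
// [ ".../docs/getting-started", ".../blog/announcing-docs", ".../pricing" ]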

View File

@ -1,4 +1,4 @@
import { Job, JobId } from "bull"; import { Job } from "bullmq";
import type { baseScrapers } from "../scraper/WebScraper/single_url"; import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase"; import { supabase_service as supabase } from "../services/supabase";
import { Logger } from "./logger"; import { Logger } from "./logger";
@ -70,7 +70,7 @@ export class ScrapeEvents {
} }
} }
static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) { static async logJobEvent(job: Job | any, event: ScrapeQueueEvent["event"]) {
try { try {
await this.insert(((job as any).id ? (job as any).id : job) as string, { await this.insert(((job as any).id ? (job as any).id : job) as string, {
type: "queue", type: "queue",

View File

@ -17,3 +17,21 @@ export const supabaseGetJobById = async (jobId: string) => {
return data; return data;
} }
export const supabaseGetJobsById = async (jobIds: string[]) => {
const { data, error } = await supabase_service
.from('firecrawl_jobs')
.select('*')
.in('job_id', jobIds);
if (error) {
return [];
}
if (!data) {
return [];
}
return data;
}

View File

@ -0,0 +1,159 @@
import { isSameDomain, removeDuplicateUrls } from "./validateUrl";
import { isSameSubdomain } from "./validateUrl";
describe("isSameDomain", () => {
it("should return true for a subdomain", () => {
const result = isSameDomain("http://sub.example.com", "http://example.com");
expect(result).toBe(true);
});
it("should return true for the same domain", () => {
const result = isSameDomain("http://example.com", "http://example.com");
expect(result).toBe(true);
});
it("should return false for different domains", () => {
const result = isSameDomain("http://example.com", "http://another.com");
expect(result).toBe(false);
});
it("should return true for a subdomain with different protocols", () => {
const result = isSameDomain("https://sub.example.com", "http://example.com");
expect(result).toBe(true);
});
it("should return false for invalid URLs", () => {
const result = isSameDomain("invalid-url", "http://example.com");
expect(result).toBe(false);
const result2 = isSameDomain("http://example.com", "invalid-url");
expect(result2).toBe(false);
});
it("should return true for a subdomain with www prefix", () => {
const result = isSameDomain("http://www.sub.example.com", "http://example.com");
expect(result).toBe(true);
});
it("should return true for the same domain with www prefix", () => {
const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
expect(result).toBe(true);
});
});
describe("isSameSubdomain", () => {
it("should return false for a subdomain", () => {
const result = isSameSubdomain("http://example.com", "http://docs.example.com");
expect(result).toBe(false);
});
it("should return true for the same subdomain", () => {
const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com");
expect(result).toBe(true);
});
it("should return false for different subdomains", () => {
const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com");
expect(result).toBe(false);
});
it("should return false for different domains", () => {
const result = isSameSubdomain("http://example.com", "http://another.com");
expect(result).toBe(false);
});
it("should return false for invalid URLs", () => {
const result = isSameSubdomain("invalid-url", "http://example.com");
expect(result).toBe(false);
const result2 = isSameSubdomain("http://example.com", "invalid-url");
expect(result2).toBe(false);
});
it("should return true for the same subdomain with different protocols", () => {
const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com");
expect(result).toBe(true);
});
it("should return true for the same subdomain with www prefix", () => {
const result = isSameSubdomain("http://www.docs.example.com", "http://docs.example.com");
expect(result).toBe(true);
});
it("should return false for a subdomain with www prefix and different subdomain", () => {
const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
expect(result).toBe(false);
});
});
describe("removeDuplicateUrls", () => {
it("should remove duplicate URLs with different protocols", () => {
const urls = [
"http://example.com",
"https://example.com",
"http://www.example.com",
"https://www.example.com"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://example.com"]);
});
it("should keep URLs with different paths", () => {
const urls = [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page1?param=1",
"https://example.com/page1#section1"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual([
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page1?param=1",
"https://example.com/page1#section1"
]);
});
it("should prefer https over http", () => {
const urls = [
"http://example.com",
"https://example.com"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://example.com"]);
});
it("should prefer non-www over www", () => {
const urls = [
"https://www.example.com",
"https://example.com"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://example.com"]);
});
it("should handle empty input", () => {
const urls: string[] = [];
const result = removeDuplicateUrls(urls);
expect(result).toEqual([]);
});
it("should handle URLs with different cases", () => {
const urls = [
"https://EXAMPLE.com",
"https://example.com"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://EXAMPLE.com"]);
});
it("should handle URLs with trailing slashes", () => {
const urls = [
"https://example.com",
"https://example.com/"
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://example.com"]);
});
});

View File

@ -0,0 +1,170 @@
export const protocolIncluded = (url: string) => {
// if the url does not start with a protocol, assume http (maybe https?)
// the regex checks whether "://" appears before the first "."
return /^([^.:]+:\/\/)/.test(url);
};
const getURLobj = (s: string) => {
// new URL() throws if the protocol is missing, e.g. "google.com"
let error = false;
let urlObj = {};
try {
urlObj = new URL(s);
} catch (err) {
error = true;
}
return { error, urlObj };
};
export const checkAndUpdateURL = (url: string) => {
if (!protocolIncluded(url)) {
url = `http://${url}`;
}
const { error, urlObj } = getURLobj(url);
if (error) {
throw new Error("Invalid URL");
}
const typedUrlObj = urlObj as URL;
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL");
}
return { urlObj: typedUrlObj, url: url };
};
export const checkUrl = (url: string) => {
const { error, urlObj } = getURLobj(url);
if (error) {
throw new Error("Invalid URL");
}
const typedUrlObj = urlObj as URL;
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL");
}
if ((url.split(".")[0].match(/:/g) || []).length !== 1) {
throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com
}
return url;
};
/**
* Same domain check
* It checks if the domain of the url is the same as the base url
* It also returns true for subdomains and www-prefixed subdomains
* @param url
* @param baseUrl
* @returns
*/
export function isSameDomain(url: string, baseUrl: string) {
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
if (error1 || error2) {
return false;
}
const typedUrlObj1 = urlObj1 as URL;
const typedUrlObj2 = urlObj2 as URL;
const cleanHostname = (hostname: string) => {
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
};
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
return domain1 === domain2;
}
export function isSameSubdomain(url: string, baseUrl: string) {
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
if (error1 || error2) {
return false;
}
const typedUrlObj1 = urlObj1 as URL;
const typedUrlObj2 = urlObj2 as URL;
const cleanHostname = (hostname: string) => {
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
};
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
const subdomain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(0, -2).join('.');
const subdomain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(0, -2).join('.');
// Check if the domains are the same and the subdomains are the same
return domain1 === domain2 && subdomain1 === subdomain2;
}
export const checkAndUpdateURLForMap = (url: string) => {
if (!protocolIncluded(url)) {
url = `http://${url}`;
}
// remove last slash if present
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
const { error, urlObj } = getURLobj(url);
if (error) {
throw new Error("Invalid URL");
}
const typedUrlObj = urlObj as URL;
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL");
}
// remove any query params
url = url.split("?")[0].trim();
return { urlObj: typedUrlObj, url: url };
};
export function removeDuplicateUrls(urls: string[]): string[] {
const urlMap = new Map<string, string>();
for (const url of urls) {
const parsedUrl = new URL(url);
const protocol = parsedUrl.protocol;
const hostname = parsedUrl.hostname.replace(/^www\./, '');
const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
const key = `${hostname}${path}`;
if (!urlMap.has(key)) {
urlMap.set(key, url);
} else {
const existingUrl = new URL(urlMap.get(key)!);
const existingProtocol = existingUrl.protocol;
if (protocol === 'https:' && existingProtocol === 'http:') {
urlMap.set(key, url);
} else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) {
urlMap.set(key, url);
}
}
}
return [...new Set(Array.from(urlMap.values()))];
}
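A short sketch of checkAndUpdateURLForMap, which the map controller runs on every link: it adds a protocol when missing, strips a trailing slash and any query string, and rejects non-http(s) schemes (inputs are illustrative):
import { checkAndUpdateURLForMap } from "./validateUrl"; // assumed relative path
const a = checkAndUpdateURLForMap("example.com/docs/").url;         // "http://example.com/docs"
const b = checkAndUpdateURLForMap("https://example.com/a?x=1").url; // "https://example.com/a"
// checkAndUpdateURLForMap("ftp://example.com") throws Error("Invalid URL")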

View File

@ -1,4 +1,4 @@
import { Job } from "bull"; import { Job } from "bullmq";
import { import {
CrawlResult, CrawlResult,
WebScraperOptions, WebScraperOptions,
@ -15,15 +15,23 @@ import { ScrapeEvents } from "../lib/scrape-events";
export async function startWebScraperPipeline({ export async function startWebScraperPipeline({
job, job,
token,
}: { }: {
job: Job<WebScraperOptions>; job: Job<WebScraperOptions>;
token: string;
}) { }) {
let partialDocs: Document[] = []; let partialDocs: Document[] = [];
return (await runWebScraper({ return (await runWebScraper({
url: job.data.url, url: job.data.url,
mode: job.data.mode, mode: job.data.mode,
crawlerOptions: job.data.crawlerOptions, crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions, extractorOptions: job.data.extractorOptions,
pageOptions: {
...job.data.pageOptions,
...(job.data.crawl_id ? ({
includeRawHtml: true,
}): {}),
},
inProgress: (progress) => { inProgress: (progress) => {
Logger.debug(`🐂 Job in progress ${job.id}`); Logger.debug(`🐂 Job in progress ${job.id}`);
if (progress.currentDocument) { if (progress.currentDocument) {
@ -31,20 +39,22 @@ export async function startWebScraperPipeline({
if (partialDocs.length > 50) { if (partialDocs.length > 50) {
partialDocs = partialDocs.slice(-50); partialDocs = partialDocs.slice(-50);
} }
job.progress({ ...progress, partialDocs: partialDocs }); // job.updateProgress({ ...progress, partialDocs: partialDocs });
} }
}, },
onSuccess: (result) => { onSuccess: (result, mode) => {
Logger.debug(`🐂 Job completed ${job.id}`); Logger.debug(`🐂 Job completed ${job.id}`);
saveJob(job, result); saveJob(job, result, token, mode);
}, },
onError: (error) => { onError: (error) => {
Logger.error(`🐂 Job failed ${job.id}`); Logger.error(`🐂 Job failed ${job.id}`);
ScrapeEvents.logJobEvent(job, "failed"); ScrapeEvents.logJobEvent(job, "failed");
job.moveToFailed(error); job.moveToFailed(error, token, false);
}, },
team_id: job.data.team_id, team_id: job.data.team_id,
bull_job_id: job.id.toString(), bull_job_id: job.id.toString(),
priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false,
})) as { success: boolean; message: string; docs: Document[] }; })) as { success: boolean; message: string; docs: Document[] };
} }
export async function runWebScraper({ export async function runWebScraper({
@ -52,11 +62,14 @@ export async function runWebScraper({
mode, mode,
crawlerOptions, crawlerOptions,
pageOptions, pageOptions,
extractorOptions,
inProgress, inProgress,
onSuccess, onSuccess,
onError, onError,
team_id, team_id,
bull_job_id, bull_job_id,
priority,
is_scrape=false,
}: RunWebScraperParams): Promise<RunWebScraperResult> { }: RunWebScraperParams): Promise<RunWebScraperResult> {
try { try {
const provider = new WebScraperDataProvider(); const provider = new WebScraperDataProvider();
@ -65,17 +78,22 @@ export async function runWebScraper({
jobId: bull_job_id, jobId: bull_job_id,
mode: mode, mode: mode,
urls: [url], urls: [url],
extractorOptions,
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
pageOptions: pageOptions, pageOptions: pageOptions,
bullJobId: bull_job_id, bullJobId: bull_job_id,
priority,
}); });
} else { } else {
await provider.setOptions({ await provider.setOptions({
jobId: bull_job_id, jobId: bull_job_id,
mode: mode, mode: mode,
urls: url.split(","), urls: url.split(","),
extractorOptions,
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
pageOptions: pageOptions, pageOptions: pageOptions,
priority,
teamId: team_id
}); });
} }
const docs = (await provider.getDocuments(false, (progress: Progress) => { const docs = (await provider.getDocuments(false, (progress: Progress) => {
@ -97,21 +115,24 @@ export async function runWebScraper({
return { url: doc.metadata.sourceURL }; return { url: doc.metadata.sourceURL };
} }
}) })
: docs.filter((doc) => doc.content.trim().length > 0); : docs;
const billingResult = await billTeam(team_id, filteredDocs.length); if(is_scrape === false) {
const billingResult = await billTeam(team_id, filteredDocs.length);
if (!billingResult.success) { if (!billingResult.success) {
// throw new Error("Failed to bill team, no subscription was found"); // throw new Error("Failed to bill team, no subscription was found");
return { return {
success: false, success: false,
message: "Failed to bill team, no subscription was found", message: "Failed to bill team, no subscription was found",
docs: [], docs: [],
}; };
}
} }
// This is where the returnvalue from the job is set // This is where the returnvalue from the job is set
onSuccess(filteredDocs); onSuccess(filteredDocs, mode);
// this return doesn't matter too much for the job completion result // this return doesn't matter too much for the job completion result
return { success: true, message: "", docs: filteredDocs }; return { success: true, message: "", docs: filteredDocs };
@ -121,7 +142,7 @@ export async function runWebScraper({
} }
} }
const saveJob = async (job: Job, result: any) => { const saveJob = async (job: Job, result: any, token: string, mode: string) => {
try { try {
if (process.env.USE_DB_AUTHENTICATION === "true") { if (process.env.USE_DB_AUTHENTICATION === "true") {
const { data, error } = await supabase_service const { data, error } = await supabase_service
@ -130,17 +151,21 @@ const saveJob = async (job: Job, result: any) => {
.eq("job_id", job.id); .eq("job_id", job.id);
if (error) throw new Error(error.message); if (error) throw new Error(error.message);
try { // try {
await job.moveToCompleted(null, false, false); // if (mode === "crawl") {
} catch (error) { // await job.moveToCompleted(null, token, false);
// I think the job won't exist here anymore // } else {
} // await job.moveToCompleted(result, token, false);
} else { // }
try { // } catch (error) {
await job.moveToCompleted(result, false, false); // // I think the job won't exist here anymore
} catch (error) { // }
// I think the job won't exist here anymore // } else {
} // try {
// await job.moveToCompleted(result, token, false);
// } catch (error) {
// // I think the job won't exist here anymore
// }
} }
ScrapeEvents.logJobEvent(job, "completed"); ScrapeEvents.logJobEvent(job, "completed");
} catch (error) { } catch (error) {

View File

@ -1,10 +1,11 @@
import express from "express"; import express from "express";
import { redisHealthController } from "../controllers/admin/redis-health"; import { redisHealthController } from "../controllers/v0/admin/redis-health";
import { import {
autoscalerController,
checkQueuesController, checkQueuesController,
cleanBefore24hCompleteJobsController, cleanBefore24hCompleteJobsController,
queuesController, queuesController,
} from "../controllers/admin/queue"; } from "../controllers/v0/admin/queue";
export const adminRouter = express.Router(); export const adminRouter = express.Router();
@ -27,3 +28,8 @@ adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/queues`, `/admin/${process.env.BULL_AUTH_KEY}/queues`,
queuesController queuesController
); );
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
autoscalerController
);

View File

@ -1,14 +1,14 @@
import express from "express"; import express from "express";
import { crawlController } from "../../src/controllers/crawl"; import { crawlController } from "../../src/controllers/v0/crawl";
import { crawlStatusController } from "../../src/controllers/crawl-status"; import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
import { scrapeController } from "../../src/controllers/scrape"; import { scrapeController } from "../../src/controllers/v0/scrape";
import { crawlPreviewController } from "../../src/controllers/crawlPreview"; import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
import { crawlJobStatusPreviewController } from "../../src/controllers/status"; import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
import { searchController } from "../../src/controllers/search"; import { searchController } from "../../src/controllers/v0/search";
import { crawlCancelController } from "../../src/controllers/crawl-cancel"; import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
import { keyAuthController } from "../../src/controllers/keyAuth"; import { keyAuthController } from "../../src/controllers/v0/keyAuth";
import { livenessController } from "../controllers/liveness"; import { livenessController } from "../controllers/v0/liveness";
import { readinessController } from "../controllers/readiness"; import { readinessController } from "../controllers/v0/readiness";
export const v0Router = express.Router(); export const v0Router = express.Router();

150
apps/api/src/routes/v1.ts Normal file
View File

@ -0,0 +1,150 @@
import express, { NextFunction, Request, Response } from "express";
import { crawlController } from "../controllers/v1/crawl";
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { scrapeController } from "../../src/controllers/v1/scrape";
import { crawlStatusController } from "../controllers/v1/crawl-status";
import { mapController } from "../controllers/v1/map";
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
import { RateLimiterMode } from "../types";
import { authenticateUser } from "../controllers/auth";
import { createIdempotencyKey } from "../services/idempotency/create";
import { validateIdempotencyKey } from "../services/idempotency/validate";
import { checkTeamCredits } from "../services/billing/credit_billing";
import expressWs from "express-ws";
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
import { Logger } from "../lib/logger";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel";
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";
function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
return (req, res, next) => {
(async () => {
if (!minimum && req.body) {
minimum = (req.body as any)?.limit ?? 1;
}
const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
if (!success) {
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
return res.status(402).json({ success: false, error: "Insufficient credits" });
}
req.account = { remainingCredits }
next();
})()
.catch(err => next(err));
};
}
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
return (req, res, next) => {
(async () => {
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
rateLimiterMode,
);
if (!success) {
return res.status(status).json({ success: false, error });
}
req.auth = { team_id, plan };
next();
})()
.catch(err => next(err));
}
}
function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
(async () => {
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ success: false, error: "Idempotency key already used" });
}
createIdempotencyKey(req);
}
next();
})()
.catch(err => next(err));
}
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (req.body.url && isUrlBlocked(req.body.url)) {
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
}
next();
}
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
return (req, res, next) => {
controller(req, res)
.catch(err => next(err))
}
}
expressWs(express());
export const v1Router = express.Router();
v1Router.post(
"/scrape",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
wrap(scrapeController)
);
v1Router.post(
"/crawl",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Crawl),
idempotencyMiddleware,
checkCreditsMiddleware(),
wrap(crawlController)
);
v1Router.post(
"/map",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Map),
checkCreditsMiddleware(1),
wrap(mapController)
);
v1Router.get(
"/crawl/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlStatusController)
);
v1Router.ws(
"/crawl/:jobId",
crawlStatusWSController
);
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
v1Router.delete(
"/crawl/:jobId",
authMiddleware(RateLimiterMode.Crawl),
crawlCancelController
);
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// // Auth route for key based authentication
// v1Router.get("/keyAuth", keyAuthController);
// // Search routes
// v0Router.post("/search", searchController);
// Health/Probe routes
// v1Router.get("/health/liveness", livenessController);
// v1Router.get("/health/readiness", readinessController);

apps/api/src/run-req.ts (new file, 175 lines)

@ -0,0 +1,175 @@
import axios from "axios";
import { promises as fs } from "fs";
import { v4 as uuidV4 } from "uuid";
interface Result {
start_url: string;
job_id?: string;
idempotency_key?: string;
result_data_jsonb?: any;
}
async function sendCrawl(result: Result): Promise<string | undefined> {
const idempotencyKey = uuidV4();
const url = result.start_url;
try {
const response = await axios.post(
"https://staging-firecrawl-scraper-js.fly.dev/v0/crawl",
{
url: url,
crawlerOptions: {
limit: 75,
},
pageOptions: {
includeHtml: true,
replaceAllPathsWithAbsolutePaths: true,
waitFor: 1000,
},
},
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer `,
},
}
);
result.idempotency_key = idempotencyKey;
return response.data.jobId;
} catch (error) {
console.error("Error sending crawl:", error);
return undefined;
}
}
async function getContent(result: Result): Promise<boolean> {
let attempts = 0;
while (attempts < 120) {
// Poll the job status for up to 120 attempts before giving up
try {
const response = await axios.get(
`https://staging-firecrawl-scraper-js.fly.dev/v0/crawl/status/${result.job_id}`,
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer `,
},
}
);
if (response.data.status === "completed") {
result.result_data_jsonb = response.data.data;
// Job actually completed
return true;
}
} catch (error) {
console.error("Error getting content:", error);
}
const randomSleep = Math.floor(Math.random() * 15000) + 5000;
await new Promise((resolve) => setTimeout(resolve, randomSleep)); // Sleep a random 5-20 seconds between status checks
attempts++;
}
// Set result as null if timed out
result.result_data_jsonb = null;
return false;
}
async function processResults(results: Result[]): Promise<void> {
let processedCount = 0;
let starterCount = 0;
const queue: Result[] = [];
const processedUrls = new Set<string>();
// Initialize the queue with the first 100 results
for (let i = 0; i < Math.min(100, results.length); i++) {
queue.push(results[i]);
processedUrls.add(results[i].start_url);
}
// Function to process a single result
const processSingleResult = async (result: Result) => {
const jobId = await sendCrawl(result);
if (jobId) {
console.log(`Job requested count: ${starterCount}`);
starterCount++;
result.job_id = jobId;
processedCount++;
// Save the result to the file
try {
// Save job id along with the start_url
const resultWithJobId = results.map(r => ({
start_url: r.start_url,
job_id: r.job_id,
}));
await fs.writeFile(
"results_with_job_id_4000_6000.json",
JSON.stringify(resultWithJobId, null, 4)
);
} catch (error) {
console.error("Error writing to results_with_content.json:", error);
}
// Add a new result to the queue if there are more results to process
// if (processedCount < results.length) {
// for (let i = queue.length; i < results.length; i++) {
// if (!processedUrls.has(results[i].start_url)) {
// const nextResult = results[i];
// console.log("Next result:", nextResult.start_url);
// queue.push(nextResult);
// processedUrls.add(nextResult.start_url);
// console.log(`Queue length: ${queue.length}`);
// processSingleResult(nextResult);
// break;
// }
// }
// }
}
};
// Start processing the initial queue concurrently
// for (let i = 0; i < queue.length; i++) {
// processSingleResult(queue[i]);
// if ((i + 1) % 500 === 0) {
// console.log(`Processed ${i + 1} results, waiting for 1 minute before adding the next batch...`);
// await new Promise(resolve => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
// }
// }
// Start processing the initial queue concurrently
// await Promise.all(queue.map(result => processSingleResult(result)));
for (let i = 0; i < results.length; i += 100) {
const batch = results.slice(i, i + 100);
Promise.all(batch.map((result) => processSingleResult(result)))
.then(() => {
console.log(`Processed ${i + 100} results.`);
})
.catch((error) => {
console.error(`Error processing batch starting at index ${i}:`, error);
});
await new Promise((resolve) => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
}
}
// Example call
async function getStartUrls(): Promise<Result[]> {
try {
const data = await fs.readFile("starturls.json", "utf-8");
return JSON.parse(data);
} catch (error) {
console.error("Error reading starturls.json:", error);
return [];
}
}
async function main() {
const results: Result[] = (await getStartUrls()).slice(3999, 6000);
// console.log(results.map((r) => r.start_url).slice(0, 3));
processResults(results)
.then(() => {
console.log("All results processed.");
})
.catch((error) => {
console.error("Error processing results:", error);
});
}
main();
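The script above fires batches of 100 crawl requests and then sleeps for a minute before starting the next batch. A generic version of that throttled-batch pattern, with illustrative helper names not taken from this commit, could look like:
```typescript
// Generic throttled batch runner: process items in fixed-size batches,
// waiting a fixed delay between batches, mirroring the loop in run-req.ts.
async function runInBatches<T>(
  items: T[],
  batchSize: number,
  delayMs: number,
  handler: (item: T) => Promise<void>
): Promise<void> {
  for (let i = 0; i < items.length; i += batchSize) {
    const batch = items.slice(i, i + batchSize);
    // Start the whole batch concurrently; log failures but keep going.
    Promise.all(batch.map(handler)).catch((err) =>
      console.error(`Batch starting at index ${i} failed:`, err)
    );
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
}

// Usage sketch: 100 crawls per minute.
// await runInBatches(results, 100, 60_000, processSingleResult);
```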


@ -24,14 +24,15 @@ describe('scrapSingleUrl', () => {
}); });
it('should return a list of links on the firecrawl.ai page', async () => { it('should return a list of links on the firecrawl.ai page', async () => {
const url = 'https://example.com'; const url = 'https://flutterbricks.com';
const pageOptions: PageOptions = { includeHtml: true }; const pageOptions: PageOptions = { includeHtml: true };
const result = await scrapSingleUrl("TEST", url, pageOptions); const result = await scrapSingleUrl("TEST", url, pageOptions);
// Check if the result contains a list of links // Check if the result contains a list of links
expect(result.linksOnPage).toBeDefined(); expect(result.linksOnPage).toBeDefined();
console.log({result});
expect(Array.isArray(result.linksOnPage)).toBe(true); expect(Array.isArray(result.linksOnPage)).toBe(true);
expect(result.linksOnPage.length).toBeGreaterThan(0); expect(result.linksOnPage.length).toBeGreaterThan(0);
expect(result.linksOnPage).toContain('https://www.iana.org/domains/example') expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
}, 10000); }, 15000);


@ -1,4 +1,4 @@
import axios from "axios"; import axios, { AxiosError } from "axios";
import cheerio, { load } from "cheerio"; import cheerio, { load } from "cheerio";
import { URL } from "url"; import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap"; import { getLinksFromSitemap } from "./sitemap";
@ -22,7 +22,7 @@ export class WebCrawler {
private crawledUrls: Map<string, string> = new Map(); private crawledUrls: Map<string, string> = new Map();
private limit: number; private limit: number;
private robotsTxtUrl: string; private robotsTxtUrl: string;
private robots: any; public robots: any;
private generateImgAltText: boolean; private generateImgAltText: boolean;
private allowBackwardCrawling: boolean; private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean; private allowExternalContentLinks: boolean;
@ -53,8 +53,8 @@ export class WebCrawler {
this.jobId = jobId; this.jobId = jobId;
this.initialUrl = initialUrl; this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin; this.baseUrl = new URL(initialUrl).origin;
this.includes = includes ?? []; this.includes = Array.isArray(includes) ? includes : [];
this.excludes = excludes ?? []; this.excludes = Array.isArray(excludes) ? excludes : [];
this.limit = limit; this.limit = limit;
this.robotsTxtUrl = `${this.baseUrl}/robots.txt`; this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
this.robots = robotsParser(this.robotsTxtUrl, ""); this.robots = robotsParser(this.robotsTxtUrl, "");
@ -66,10 +66,16 @@ export class WebCrawler {
this.allowExternalContentLinks = allowExternalContentLinks ?? false; this.allowExternalContentLinks = allowExternalContentLinks ?? false;
} }
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
return sitemapLinks return sitemapLinks
.filter((link) => { .filter((link) => {
const url = new URL(link.trim(), this.baseUrl); let url: URL;
try {
url = new URL(link.trim(), this.baseUrl);
} catch (error) {
Logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
return false;
}
const path = url.pathname; const path = url.pathname;
const depth = getURLDepth(url.toString()); const depth = getURLDepth(url.toString());
@ -102,7 +108,12 @@ export class WebCrawler {
// Normalize the initial URL and the link to account for www and non-www versions // Normalize the initial URL and the link to account for www and non-www versions
const normalizedInitialUrl = new URL(this.initialUrl); const normalizedInitialUrl = new URL(this.initialUrl);
const normalizedLink = new URL(link); let normalizedLink;
try {
normalizedLink = new URL(link);
} catch (_) {
return false;
}
const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
@ -130,6 +141,25 @@ export class WebCrawler {
.slice(0, limit); .slice(0, limit);
} }
public async getRobotsTxt(): Promise<string> {
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
return response.data;
}
public importRobotsTxt(txt: string) {
this.robots = robotsParser(this.robotsTxtUrl, txt);
}
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
return filteredLinks.map(link => ({ url: link, html: "" }));
}
return null;
}
public async start( public async start(
inProgress?: (progress: Progress) => void, inProgress?: (progress: Progress) => void,
pageOptions?: PageOptions, pageOptions?: PageOptions,
@ -142,19 +172,17 @@ export class WebCrawler {
Logger.debug(`Crawler starting with ${this.initialUrl}`); Logger.debug(`Crawler starting with ${this.initialUrl}`);
// Fetch and parse robots.txt // Fetch and parse robots.txt
try { try {
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout }); const txt = await this.getRobotsTxt();
this.robots = robotsParser(this.robotsTxtUrl, response.data); this.importRobotsTxt(txt);
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`); Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
} catch (error) { } catch (error) {
Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
} }
if (!crawlerOptions?.ignoreSitemap){ if (!crawlerOptions?.ignoreSitemap){
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`); const sm = await this.tryGetSitemap();
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sm !== null) {
if (sitemapLinks.length > 0) { return sm;
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks.map(link => ({ url: link, html: "" }));
} }
} }
@ -241,6 +269,63 @@ export class WebCrawler {
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
} }
public filterURL(href: string, url: string): string | null {
let fullUrl = href;
if (!href.startsWith("http")) {
try {
fullUrl = new URL(href, this.baseUrl).toString();
} catch (_) {
return null;
}
}
let urlObj;
try {
urlObj = new URL(fullUrl);
} catch (_) {
return null;
}
const path = urlObj.pathname;
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
if (this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) &&
!this.matchesExcludes(path) &&
this.isRobotsAllowed(fullUrl)
) {
return fullUrl;
}
} else { // EXTERNAL LINKS
if (
this.isInternalLink(url) &&
this.allowExternalContentLinks &&
!this.isSocialMediaOrEmail(fullUrl) &&
!this.matchesExcludes(fullUrl, true) &&
!this.isExternalMainPage(fullUrl)
) {
return fullUrl;
}
}
return null;
}
public extractLinksFromHTML(html: string, url: string) {
let links: string[] = [];
const $ = load(html);
$("a").each((_, element) => {
const href = $(element).attr("href");
if (href) {
const u = this.filterURL(href, url);
if (u !== null) {
links.push(u);
}
}
});
return links;
}
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> { async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) { if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
return []; return [];
@ -284,37 +369,7 @@ export class WebCrawler {
links.push({ url, html: content, pageStatusCode, pageError }); links.push({ url, html: content, pageStatusCode, pageError });
} }
$("a").each((_, element) => { links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
const href = $(element).attr("href");
if (href) {
let fullUrl = href;
if (!href.startsWith("http")) {
fullUrl = new URL(href, this.baseUrl).toString();
}
const urlObj = new URL(fullUrl);
const path = urlObj.pathname;
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
if (this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) &&
!this.matchesExcludes(path) &&
this.isRobotsAllowed(fullUrl)
) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
}
} else { // EXTERNAL LINKS
if (
this.isInternalLink(url) &&
this.allowExternalContentLinks &&
!this.isSocialMediaOrEmail(fullUrl) &&
!this.matchesExcludes(fullUrl, true) &&
!this.isExternalMainPage(fullUrl)
) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
}
}
}
});
if (this.visited.size === 1) { if (this.visited.size === 1) {
return links; return links;
@ -465,9 +520,13 @@ export class WebCrawler {
} }
} catch (error) { } catch (error) {
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }); if (error instanceof AxiosError && error.response?.status === 404) {
if (response) { // ignore 404
sitemapLinks = response; } else {
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
if (response) {
sitemapLinks = response;
}
} }
} }
@ -480,7 +539,11 @@ export class WebCrawler {
} }
} catch (error) { } catch (error) {
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
}
} }
} }
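The crawler changes above make URL handling defensive: filterURL and extractLinksFromHTML are now public, and every new URL(...) call is wrapped so a single malformed href no longer throws out of the crawl. A standalone sketch of that extract-and-normalize pattern (simplified, not the actual class method):
```typescript
import { load } from "cheerio";

// Simplified version of extractLinksFromHTML above: collect <a href> values,
// resolve them against the page URL, and silently skip anything unparseable.
export function extractAbsoluteLinks(html: string, baseUrl: string): string[] {
  const links: string[] = [];
  const $ = load(html);
  $("a").each((_, element) => {
    const href = $(element).attr("href");
    if (!href) return;
    try {
      links.push(new URL(href, baseUrl).toString());
    } catch {
      // Malformed href; skip it instead of aborting the whole page.
    }
  });
  return links;
}
```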


@ -16,7 +16,6 @@ import {
replacePathsWithAbsolutePaths, replacePathsWithAbsolutePaths,
} from "./utils/replacePaths"; } from "./utils/replacePaths";
import { generateCompletions } from "../../lib/LLM-extraction"; import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor"; import { fetchAndProcessDocx } from "./utils/docxProcessor";
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils"; import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
import { Logger } from "../../lib/logger"; import { Logger } from "../../lib/logger";
@ -44,6 +43,8 @@ export class WebScraperDataProvider {
private crawlerMode: string = "default"; private crawlerMode: string = "default";
private allowBackwardCrawling: boolean = false; private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false; private allowExternalContentLinks: boolean = false;
private priority?: number;
private teamId?: string;
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -72,7 +73,9 @@ export class WebScraperDataProvider {
url, url,
this.pageOptions, this.pageOptions,
this.extractorOptions, this.extractorOptions,
existingHTML existingHTML,
this.priority,
this.teamId,
); );
processedUrls++; processedUrls++;
if (inProgress) { if (inProgress) {
@ -88,21 +91,6 @@ export class WebScraperDataProvider {
results[i + index] = result; results[i + index] = result;
}) })
); );
try {
if (this.mode === "crawl" && this.bullJobId) {
const job = await getWebScraperQueue().getJob(this.bullJobId);
const jobStatus = await job.getState();
if (jobStatus === "failed") {
Logger.info(
"Job has failed or has been cancelled by the user. Stopping the job..."
);
return [] as Document[];
}
}
} catch (error) {
Logger.error(error.message);
return [] as Document[];
}
} }
return results.filter((result) => result !== null) as Document[]; return results.filter((result) => result !== null) as Document[];
} }
@ -306,7 +294,16 @@ export class WebScraperDataProvider {
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
} }
documents = this.applyPathReplacements(documents); if (this.pageOptions.includeMarkdown) {
documents = this.applyPathReplacements(documents);
}
if (!this.pageOptions.includeHtml) {
for (let document of documents) {
delete document.html;
}
}
// documents = await this.applyImgAltText(documents); // documents = await this.applyImgAltText(documents);
if ( if (
(this.extractorOptions.mode === "llm-extraction" || (this.extractorOptions.mode === "llm-extraction" ||
@ -359,6 +356,7 @@ export class WebScraperDataProvider {
}); });
return { return {
content: content, content: content,
markdown: content,
metadata: { sourceURL: pdfLink, pageStatusCode, pageError }, metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
provider: "web-scraper", provider: "web-scraper",
}; };
@ -581,12 +579,20 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000; this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText = this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false; options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? { this.pageOptions = {
onlyMainContent: false, onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
includeHtml: false, includeHtml: options.pageOptions?.includeHtml ?? false,
replaceAllPathsWithAbsolutePaths: false, replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
parsePDF: true, parsePDF: options.pageOptions?.parsePDF ?? true,
removeTags: [], onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
removeTags: options.pageOptions?.removeTags ?? [],
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
waitFor: options.pageOptions?.waitFor ?? undefined,
headers: options.pageOptions?.headers ?? undefined,
includeLinks: options.pageOptions?.includeLinks ?? true,
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
screenshot: options.pageOptions?.screenshot ?? false,
}; };
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths = this.replaceAllPathsWithAbsolutePaths =
@ -608,6 +614,8 @@ export class WebScraperDataProvider {
options.crawlerOptions?.allowBackwardCrawling ?? false; options.crawlerOptions?.allowBackwardCrawling ?? false;
this.allowExternalContentLinks = this.allowExternalContentLinks =
options.crawlerOptions?.allowExternalContentLinks ?? false; options.crawlerOptions?.allowExternalContentLinks ?? false;
this.priority = options.priority;
this.teamId = options.teamId ?? null;
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {


@ -5,6 +5,7 @@ import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global"; import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger"; import { Logger } from "../../../lib/logger";
import * as Sentry from "@sentry/node";
/** /**
* Scrapes a URL with Fire-Engine * Scrapes a URL with Fire-Engine
@ -22,19 +23,23 @@ export async function scrapWithFireEngine({
waitFor = 0, waitFor = 0,
screenshot = false, screenshot = false,
fullPageScreenshot = false, fullPageScreenshot = false,
pageOptions = { parsePDF: true }, pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
fireEngineOptions = {}, fireEngineOptions = {},
headers, headers,
options, options,
priority,
teamId,
}: { }: {
url: string; url: string;
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
fullPageScreenshot?: boolean; fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
fireEngineOptions?: FireEngineOptions; fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>; headers?: Record<string, string>;
options?: any; options?: any;
priority?: number;
teamId?: string;
}): Promise<FireEngineResponse> { }): Promise<FireEngineResponse> {
const logParams = { const logParams = {
url, url,
@ -49,11 +54,11 @@ export async function scrapWithFireEngine({
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
const waitParam = reqParams["params"]?.wait ?? waitFor; let waitParam = reqParams["params"]?.wait ?? waitFor;
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
let endpoint = "/scrape"; let endpoint = "/scrape";
@ -68,47 +73,101 @@ export async function scrapWithFireEngine({
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
); );
if (pageOptions?.useFastMode) {
fireEngineOptionsParam.engine = "tlsclient";
engine = "tlsclient";
}
const response = await axios.post( // atsv is only available for beta customers
process.env.FIRE_ENGINE_BETA_URL + endpoint, const betaCustomersString = process.env.BETA_CUSTOMERS;
{ const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
url: url,
wait: waitParam, if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
screenshot: screenshotParam, fireEngineOptionsParam.atsv = true;
fullPageScreenshot: fullPageScreenshotParam, } else {
headers: headers, pageOptions.atsv = false;
pageOptions: pageOptions, }
...fireEngineOptionsParam,
}, const axiosInstance = axios.create({
{ headers: { "Content-Type": "application/json" }
headers: { });
"Content-Type": "application/json",
const startTime = Date.now();
const _response = await Sentry.startSpan({
name: "Call to fire-engine"
}, async span => {
return await axiosInstance.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint,
{
url: url,
wait: waitParam,
screenshot: screenshotParam,
fullPageScreenshot: fullPageScreenshotParam,
headers: headers,
pageOptions: pageOptions,
disableJsDom: pageOptions?.disableJsDom ?? false,
priority,
engine,
instantReturn: true,
...fireEngineOptionsParam,
}, },
timeout: universalTimeout + waitParam, {
} headers: {
); "Content-Type": "application/json",
...(Sentry.isInitialized() ? ({
"sentry-trace": Sentry.spanToTraceHeader(span),
"baggage": Sentry.spanToBaggageHeader(span),
}) : {}),
}
}
);
});
if (response.status !== 200) { let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
}
if (checkStatusResponse.data.processing) {
Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
axiosInstance.delete(
process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`, {
validateStatus: (status) => true
}
).catch((error) => {
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
});
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
logParams.error_message = "Request timed out";
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" };
}
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
Logger.debug( Logger.debug(
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}` `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}`
); );
logParams.error_message = response.data?.pageError; logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
logParams.response_code = response.data?.pageStatusCode; logParams.response_code = checkStatusResponse.data?.pageStatusCode;
if(response.data && response.data?.pageStatusCode !== 200) { if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`); Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
} }
const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;
return { return {
html: "", html: "",
screenshot: "", screenshot: "",
pageStatusCode: response.data?.pageStatusCode, pageStatusCode,
pageError: response.data?.pageError, pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
}; };
} }
const contentType = response.headers["content-type"]; const contentType = checkStatusResponse.data.responseHeaders?.["content-type"];
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
url, url,
@ -119,18 +178,19 @@ export async function scrapWithFireEngine({
logParams.error_message = pageError; logParams.error_message = pageError;
return { html: content, screenshot: "", pageStatusCode, pageError }; return { html: content, screenshot: "", pageStatusCode, pageError };
} else { } else {
const data = response.data; const data = checkStatusResponse.data;
logParams.success = logParams.success =
(data.pageStatusCode >= 200 && data.pageStatusCode < 300) || (data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
data.pageStatusCode === 404; data.pageStatusCode === 404;
logParams.html = data.content ?? ""; logParams.html = data.content ?? "";
logParams.response_code = data.pageStatusCode; logParams.response_code = data.pageStatusCode;
logParams.error_message = data.pageError; logParams.error_message = data.pageError ?? data.error;
return { return {
html: data.content ?? "", html: data.content ?? "",
screenshot: data.screenshot ?? "", screenshot: data.screenshot ?? "",
pageStatusCode: data.pageStatusCode, pageStatusCode: data.pageStatusCode,
pageError: data.pageError, pageError: data.pageError ?? data.error,
}; };
} }
} catch (error) { } catch (error) {
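The fire-engine scraper now submits with instantReturn: true and polls GET /scrape/{jobId} once a second until the job stops processing or the timeout elapses, instead of holding one long-lived POST. A generic sketch of that submit-then-poll flow, loosely mirroring the code above (endpoint paths and response fields are assumptions, not an exact client):
```typescript
import axios from "axios";

// Generic async-job client: POST to enqueue, then poll the status endpoint
// once a second until the job stops processing or the deadline passes.
async function submitAndPoll<T>(
  baseUrl: string,
  payload: Record<string, unknown>,
  timeoutMs: number
): Promise<T | null> {
  const { data } = await axios.post(`${baseUrl}/scrape`, { ...payload, instantReturn: true });
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    const status = await axios.get(`${baseUrl}/scrape/${data.jobId}`);
    if (!status.data.processing) {
      return status.data as T;
    }
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  // Timed out: best-effort cancel, mirroring the DELETE issued in the code above.
  await axios
    .delete(`${baseUrl}/scrape/${data.jobId}`, { validateStatus: () => true })
    .catch(() => {});
  return null;
}
```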


@ -43,6 +43,9 @@ export async function scrapWithScrapingBee(
transparent_status_code: "True", transparent_status_code: "True",
}, },
}); });
Logger.info(
`⛏️ ScrapingBee: Scraping ${url}`
);
const contentType = response.headers["content-type"]; const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
logParams.success = true; logParams.success = true;


@ -24,8 +24,8 @@ import { clientSideError } from "../../strings";
dotenv.config(); dotenv.config();
export const baseScrapers = [ export const baseScrapers = [
"fire-engine",
"fire-engine;chrome-cdp", "fire-engine;chrome-cdp",
"fire-engine",
"scrapingBee", "scrapingBee",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"scrapingBeeLoad", "scrapingBeeLoad",
@ -85,8 +85,8 @@ function getScrapingFallbackOrder(
}); });
let defaultOrder = [ let defaultOrder = [
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp", !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
"scrapingBee", "scrapingBee",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"scrapingBeeLoad", "scrapingBeeLoad",
@ -122,20 +122,38 @@ function getScrapingFallbackOrder(
export async function scrapSingleUrl( export async function scrapSingleUrl(
jobId: string, jobId: string,
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { pageOptions: PageOptions,
onlyMainContent: true, extractorOptions?: ExtractorOptions,
includeHtml: false, existingHtml?: string,
includeRawHtml: false, priority?: number,
waitFor: 0, teamId?: string
screenshot: false,
fullPageScreenshot: false,
headers: undefined,
},
extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown",
},
existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
pageOptions = {
includeMarkdown: pageOptions.includeMarkdown ?? true,
onlyMainContent: pageOptions.onlyMainContent ?? false,
includeHtml: pageOptions.includeHtml ?? false,
includeRawHtml: pageOptions.includeRawHtml ?? false,
waitFor: pageOptions.waitFor ?? undefined,
screenshot: pageOptions.screenshot ?? false,
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
headers: pageOptions.headers ?? undefined,
includeLinks: pageOptions.includeLinks ?? true,
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
parsePDF: pageOptions.parsePDF ?? true,
removeTags: pageOptions.removeTags ?? [],
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
}
if (extractorOptions) {
extractorOptions = {
mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
}
}
if (!existingHtml) {
existingHtml = "";
}
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
const attemptScraping = async ( const attemptScraping = async (
@ -163,7 +181,7 @@ export async function scrapSingleUrl(
case "fire-engine;chrome-cdp": case "fire-engine;chrome-cdp":
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright"; let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
if(method === "fire-engine;chrome-cdp"){ if (method === "fire-engine;chrome-cdp") {
engine = "chrome-cdp"; engine = "chrome-cdp";
} }
@ -177,7 +195,10 @@ export async function scrapSingleUrl(
headers: pageOptions.headers, headers: pageOptions.headers,
fireEngineOptions: { fireEngineOptions: {
engine: engine, engine: engine,
} atsv: pageOptions.atsv,
},
priority,
teamId,
}); });
scraperResponse.text = response.html; scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot; scraperResponse.screenshot = response.screenshot;
@ -336,11 +357,11 @@ export async function scrapSingleUrl(
pageError = undefined; pageError = undefined;
} }
if (text && text.trim().length >= 100) { if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`); Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
break; break;
} }
if (pageStatusCode && pageStatusCode == 404) { if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`); Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
break; break;
} }
@ -359,20 +380,22 @@ export async function scrapSingleUrl(
let linksOnPage: string[] | undefined; let linksOnPage: string[] | undefined;
linksOnPage = extractLinks(rawHtml, urlToScrap); if (pageOptions.includeLinks) {
linksOnPage = extractLinks(rawHtml, urlToScrap);
}
let document: Document; let document: Document;
if (screenshot && screenshot.length > 0) { if (screenshot && screenshot.length > 0) {
document = { document = {
content: text, content: text,
markdown: text, markdown: pageOptions.includeMarkdown ? text : undefined,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: rawHtml:
pageOptions.includeRawHtml || pageOptions.includeRawHtml ||
extractorOptions.mode === "llm-extraction-from-raw-html" extractorOptions?.mode === "llm-extraction-from-raw-html"
? rawHtml ? rawHtml
: undefined, : undefined,
linksOnPage, linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
metadata: { metadata: {
...metadata, ...metadata,
screenshot: screenshot, screenshot: screenshot,
@ -384,11 +407,11 @@ export async function scrapSingleUrl(
} else { } else {
document = { document = {
content: text, content: text,
markdown: text, markdown: pageOptions.includeMarkdown ? text : undefined,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: rawHtml:
pageOptions.includeRawHtml || pageOptions.includeRawHtml ||
extractorOptions.mode === "llm-extraction-from-raw-html" extractorOptions?.mode === "llm-extraction-from-raw-html"
? rawHtml ? rawHtml
: undefined, : undefined,
metadata: { metadata: {
@ -397,7 +420,7 @@ export async function scrapSingleUrl(
pageStatusCode: pageStatusCode, pageStatusCode: pageStatusCode,
pageError: pageError, pageError: pageError,
}, },
linksOnPage, linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
}; };
} }
@ -411,9 +434,9 @@ export async function scrapSingleUrl(
}); });
return { return {
content: "", content: "",
markdown: "", markdown: pageOptions.includeMarkdown ? "" : undefined,
html: "", html: "",
linksOnPage: [], linksOnPage: pageOptions.includeLinks ? [] : undefined,
metadata: { metadata: {
sourceURL: urlToScrap, sourceURL: urlToScrap,
pageStatusCode: pageStatusCode, pageStatusCode: pageStatusCode,


@ -8,7 +8,6 @@ describe('Blocklist Functionality', () => {
'https://twitter.com/home', 'https://twitter.com/home',
'https://instagram.com/explore', 'https://instagram.com/explore',
'https://linkedin.com/in/johndoe', 'https://linkedin.com/in/johndoe',
'https://pinterest.com/pin/create',
'https://snapchat.com/add/johndoe', 'https://snapchat.com/add/johndoe',
'https://tiktok.com/@johndoe', 'https://tiktok.com/@johndoe',
'https://reddit.com/r/funny', 'https://reddit.com/r/funny',


@ -8,7 +8,6 @@ describe('isUrlBlocked', () => {
'https://twitter.com/someuser', 'https://twitter.com/someuser',
'https://instagram.com/someuser', 'https://instagram.com/someuser',
'https://www.linkedin.com/in/someuser', 'https://www.linkedin.com/in/someuser',
'https://pinterest.com/someuser',
'https://snapchat.com/someuser', 'https://snapchat.com/someuser',
'https://tiktok.com/@someuser', 'https://tiktok.com/@someuser',
'https://reddit.com/r/somesubreddit', 'https://reddit.com/r/somesubreddit',


@ -6,7 +6,6 @@ const socialMediaBlocklist = [
'twitter.com', 'twitter.com',
'instagram.com', 'instagram.com',
'linkedin.com', 'linkedin.com',
'pinterest.com',
'snapchat.com', 'snapchat.com',
'tiktok.com', 'tiktok.com',
'reddit.com', 'reddit.com',
@ -15,6 +14,11 @@ const socialMediaBlocklist = [
'whatsapp.com', 'whatsapp.com',
'wechat.com', 'wechat.com',
'telegram.org', 'telegram.org',
'researchhub.com',
'youtube.com',
'corterix.com',
'southwest.com',
'ryanair.com'
]; ];
const allowedKeywords = [ const allowedKeywords = [


@ -234,5 +234,13 @@ export const urlSpecificParams = {
engine: "tlsclient", engine: "tlsclient",
}, },
}, },
},
"zoopla.co.uk":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
engine: "chrome-cdp",
},
},
} }
}; };


@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
description = soup('meta[name="description"]').attr("content") || null; description = soup('meta[name="description"]').attr("content") || null;
// Assuming the language is part of the URL as per the regex pattern // Assuming the language is part of the URL as per the regex pattern
const pattern = /([a-zA-Z]+-[A-Z]{2})/; language = soup('html').attr('lang') || null;
const match = pattern.exec(url);
language = match ? match[1] : null;
keywords = soup('meta[name="keywords"]').attr("content") || null; keywords = soup('meta[name="keywords"]').attr("content") || null;
robots = soup('meta[name="robots"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null;


@ -41,10 +41,10 @@ export function extractLinks(html: string, baseUrl: string): string[] {
links.push(href); links.push(href);
} else if (href.startsWith('/')) { } else if (href.startsWith('/')) {
// Relative URL starting with '/', append to origin // Relative URL starting with '/', append to origin
links.push(`${origin}${href}`); links.push(new URL(href, baseUrl).href);
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) { } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
// Relative URL not starting with '/', append to base URL // Relative URL not starting with '/', append to base URL
links.push(`${baseUrl}/${href}`); links.push(new URL(href, baseUrl).href);
} else if (href.startsWith('mailto:')) { } else if (href.startsWith('mailto:')) {
// mailto: links, add as is // mailto: links, add as is
links.push(href); links.push(href);


@ -0,0 +1,45 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
dotenv.config();
export async function fireEngineMap(q: string, options: {
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
numResults: number;
page?: number;
}): Promise<SearchResult[]> {
let data = JSON.stringify({
query: q,
lang: options.lang,
country: options.country,
location: options.location,
tbs: options.tbs,
numResults: options.numResults,
page: options.page ?? 1,
});
if (!process.env.FIRE_ENGINE_BETA_URL) {
console.warn("(v1/map Beta) Results might differ from cloud offering currently.");
return [];
}
let config = {
method: "POST",
url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
headers: {
"Content-Type": "application/json",
},
data: data,
};
const response = await axios(config);
if (response && response.data) {
return response.data;
} else {
return [];
}
}
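A brief usage sketch of the new fireEngineMap helper (the query and option values here are illustrative):
```typescript
import { fireEngineMap } from "./fireEngine";

async function demo() {
  // Illustrative call: the helper returns [] when FIRE_ENGINE_BETA_URL is not set.
  const results = await fireEngineMap("site:docs.example.com pricing", {
    lang: "en",
    country: "us",
    numResults: 10,
  });
  console.log(`Got ${results.length} results`);
}

demo().catch(console.error);
```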


@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> { export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
let proxies = null; let proxies = null;
if (proxy) { if (proxy) {
if (proxy.startsWith("https")) { if (proxy.startsWith("https")) {


@ -1,11 +1,9 @@
import { Logger } from "../../src/lib/logger"; import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities"; import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch"; import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { serper_search } from "./serper"; import { serper_search } from "./serper";
export async function search({ export async function search({
query, query,
advanced = false, advanced = false,
@ -30,12 +28,20 @@ export async function search({
proxy?: string; proxy?: string;
sleep_interval?: number; sleep_interval?: number;
timeout?: number; timeout?: number;
}) : Promise<SearchResult[]> { }): Promise<SearchResult[]> {
try { try {
if (process.env.SERPER_API_KEY ) {
return await serper_search(query, {num_results, tbs, filter, lang, country, location}); if (process.env.SERPER_API_KEY) {
return await serper_search(query, {
num_results,
tbs,
filter,
lang,
country,
location,
});
} }
return await google_search( return await googleSearch(
query, query,
advanced, advanced,
num_results, num_results,
@ -49,7 +55,6 @@ export async function search({
); );
} catch (error) { } catch (error) {
Logger.error(`Error in search function: ${error}`); Logger.error(`Error in search function: ${error}`);
return [] return [];
} }
// if process.env.SERPER_API_KEY is set, use serper
} }


@ -1,5 +1,5 @@
import { Logger } from "../../../src/lib/logger"; import { Logger } from "../../../src/lib/logger";
import { getWebScraperQueue } from "../queue-service"; import { getScrapeQueue } from "../queue-service";
import { sendSlackWebhook } from "./slack"; import { sendSlackWebhook } from "./slack";
export async function checkAlerts() { export async function checkAlerts() {
@ -13,8 +13,8 @@ export async function checkAlerts() {
Logger.info("Initializing alerts"); Logger.info("Initializing alerts");
const checkActiveJobs = async () => { const checkActiveJobs = async () => {
try { try {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const activeJobs = await webScraperQueue.getActiveCount(); const activeJobs = await scrapeQueue.getActiveCount();
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) { if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
Logger.warn( Logger.warn(
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.` `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
@ -34,11 +34,10 @@ export async function checkAlerts() {
}; };
const checkWaitingQueue = async () => { const checkWaitingQueue = async () => {
const webScraperQueue = getWebScraperQueue(); const scrapeQueue = getScrapeQueue();
const waitingJobs = await webScraperQueue.getWaitingCount(); const waitingJobs = await scrapeQueue.getWaitingCount();
const paused = await webScraperQueue.getPausedCount();
if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) { if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
Logger.warn( Logger.warn(
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.` `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
); );


@ -3,9 +3,9 @@ import { Logger } from "../../../src/lib/logger";
export async function sendSlackWebhook( export async function sendSlackWebhook(
message: string, message: string,
alertEveryone: boolean = false alertEveryone: boolean = false,
webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? ""
) { ) {
const webhookUrl = process.env.SLACK_WEBHOOK_URL;
const messagePrefix = alertEveryone ? "<!channel> " : ""; const messagePrefix = alertEveryone ? "<!channel> " : "";
const payload = { const payload = {
text: `${messagePrefix} ${message}`, text: `${messagePrefix} ${message}`,


@ -168,10 +168,11 @@ export async function supaBillTeam(team_id: string, credits: number) {
export async function checkTeamCredits(team_id: string, credits: number) { export async function checkTeamCredits(team_id: string, credits: number) {
return withAuth(supaCheckTeamCredits)(team_id, credits); return withAuth(supaCheckTeamCredits)(team_id, credits);
} }
// if team has enough credits for the operation, return true, else return false // if team has enough credits for the operation, return true, else return false
export async function supaCheckTeamCredits(team_id: string, credits: number) { export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (team_id === "preview") { if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" }; return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
} }
// Retrieve the team's active subscription and check for available coupons concurrently // Retrieve the team's active subscription and check for available coupons concurrently
@ -202,7 +203,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (subscriptionError || !subscription) { if (subscriptionError || !subscription) {
// If there is no active subscription but there are available coupons // If there is no active subscription but there are available coupons
if (couponCredits >= credits) { if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available" }; return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
} }
const { data: creditUsages, error: creditUsageError } = const { data: creditUsages, error: creditUsageError } =
@ -252,9 +253,10 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
return { return {
success: false, success: false,
message: "Insufficient credits, please upgrade!", message: "Insufficient credits, please upgrade!",
remainingCredits: FREE_CREDITS - totalCreditsUsed
}; };
} }
return { success: true, message: "Sufficient credits available" }; return { success: true, message: "Sufficient credits available", remainingCredits: FREE_CREDITS - totalCreditsUsed };
} }
let totalCreditsUsed = 0; let totalCreditsUsed = 0;
@ -315,24 +317,24 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
// Compare the adjusted total credits used with the credits allowed by the plan // Compare the adjusted total credits used with the credits allowed by the plan
if (adjustedCreditsUsed + credits > price.credits) { if (adjustedCreditsUsed + credits > price.credits) {
await sendNotification( // await sendNotification(
team_id, // team_id,
NotificationType.LIMIT_REACHED, // NotificationType.LIMIT_REACHED,
subscription.current_period_start, // subscription.current_period_start,
subscription.current_period_end // subscription.current_period_end
); // );
return { success: false, message: "Insufficient credits, please upgrade!" }; return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
} else if (creditUsagePercentage >= 0.8) { } else if (creditUsagePercentage >= 0.8) {
// Send email notification for approaching credit limit // Send email notification for approaching credit limit
await sendNotification( // await sendNotification(
team_id, // team_id,
NotificationType.APPROACHING_LIMIT, // NotificationType.APPROACHING_LIMIT,
subscription.current_period_start, // subscription.current_period_start,
subscription.current_period_end // subscription.current_period_end
); // );
} }
return { success: true, message: "Sufficient credits available" }; return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };
} }
// Count the total credits used by a team within the current billing period and return the remaining credits. // Count the total credits used by a team within the current billing period and return the remaining credits.


@ -40,10 +40,11 @@ export async function logJob(job: FirecrawlJob) {
extractor_options: job.extractor_options, extractor_options: job.extractor_options,
num_tokens: job.num_tokens, num_tokens: job.num_tokens,
retry: !!job.retry, retry: !!job.retry,
crawl_id: job.crawl_id,
}, },
]); ]);
if (process.env.POSTHOG_API_KEY) { if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
let phLog = { let phLog = {
distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
...(job.team_id !== "preview" && { ...(job.team_id !== "preview" && {


@ -1,17 +1,71 @@
import { Job, Queue } from "bull"; import { Job, Queue } from "bullmq";
import { import { getScrapeQueue } from "./queue-service";
getWebScraperQueue,
} from "./queue-service";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { WebScraperOptions } from "../types"; import { WebScraperOptions } from "../types";
import * as Sentry from "@sentry/node";
export async function addWebScraperJob( async function addScrapeJobRaw(
webScraperOptions: WebScraperOptions, webScraperOptions: any,
options: any = {} options: any,
jobId: string,
jobPriority: number = 10
): Promise<Job> { ): Promise<Job> {
return await getWebScraperQueue().add(webScraperOptions, { return await getScrapeQueue().add(jobId, webScraperOptions, {
...options, ...options,
jobId: uuidv4(), priority: jobPriority,
jobId,
}); });
} }
export async function addScrapeJob(
webScraperOptions: WebScraperOptions,
options: any = {},
jobId: string = uuidv4(),
jobPriority: number = 10
): Promise<Job> {
if (Sentry.isInitialized()) {
const size = JSON.stringify(webScraperOptions).length;
return await Sentry.startSpan({
name: "Add scrape job",
op: "queue.publish",
attributes: {
"messaging.message.id": jobId,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": size,
},
}, async (span) => {
return await addScrapeJobRaw({
...webScraperOptions,
sentry: {
trace: Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span),
size,
},
}, options, jobId, jobPriority);
});
} else {
return await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority);
}
}
export function waitForJob(jobId: string, timeout: number) {
return new Promise((resolve, reject) => {
const start = Date.now();
const int = setInterval(async () => {
if (Date.now() >= start + timeout) {
clearInterval(int);
reject(new Error("Job wait timed out"));
} else {
const state = await getScrapeQueue().getJobState(jobId);
if (state === "completed") {
clearInterval(int);
resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
} else if (state === "failed") {
clearInterval(int);
reject((await getScrapeQueue().getJob(jobId)).failedReason);
}
}
}, 1000);
})
}
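A hedged usage sketch of the new queue helpers (the payload shape here is illustrative only; the real WebScraperOptions has more fields than appear in this diff):
```typescript
import { v4 as uuidv4 } from "uuid";
import { addScrapeJob, waitForJob } from "./queue-jobs";

// Illustrative only: enqueue a scrape with an explicit job id and priority,
// then poll until the worker marks it completed or failed.
async function scrapeAndWait(url: string, teamId: string): Promise<unknown> {
  const jobId = uuidv4();
  await addScrapeJob({ url, team_id: teamId } as any, {}, jobId, 10);
  return await waitForJob(jobId, 60_000); // rejects on failure or timeout
}
```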


@ -1,23 +1,40 @@
import Queue from "bull"; import { Queue } from "bullmq";
import { Queue as BullQueue } from "bull";
import { Logger } from "../lib/logger"; import { Logger } from "../lib/logger";
import IORedis from "ioredis";
let webScraperQueue: BullQueue; let scrapeQueue: Queue;
export function getWebScraperQueue() { export const redisConnection = new IORedis(process.env.REDIS_URL, {
if (!webScraperQueue) { maxRetriesPerRequest: null,
webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, { });
settings: {
lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds, export const scrapeQueueName = "{scrapeQueue}";
lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
stalledInterval: 30 * 1000, export function getScrapeQueue() {
maxStalledCount: 10, if (!scrapeQueue) {
}, scrapeQueue = new Queue(
defaultJobOptions:{ scrapeQueueName,
attempts: 2 {
connection: redisConnection,
} }
}); // {
// settings: {
// lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
// lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
// stalledInterval: 30 * 1000,
// maxStalledCount: 10,
// },
// defaultJobOptions:{
// attempts: 5
// }
// }
);
Logger.info("Web scraper queue created"); Logger.info("Web scraper queue created");
} }
return webScraperQueue; return scrapeQueue;
} }
// === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE
// import { QueueEvents } from 'bullmq';
// export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() });
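Since the queue moved from Bull to BullMQ, the connection is now an explicit IORedis instance with maxRetriesPerRequest: null (BullMQ workers require that setting), and the queue name is wrapped in braces, presumably so all of its keys hash to the same Redis Cluster slot. A minimal BullMQ producer/worker sketch under those assumptions (job name and payload are hypothetical):
```typescript
import { Queue, Worker } from "bullmq";
import IORedis from "ioredis";

const connection = new IORedis(process.env.REDIS_URL ?? "redis://localhost:6379", {
  maxRetriesPerRequest: null, // required for BullMQ workers
});

const queue = new Queue("{scrapeQueue}", { connection });

async function main() {
  // Producer: job name, payload, and options (priority, explicit jobId).
  await queue.add("example-scrape", { url: "https://example.com" }, { priority: 10, jobId: "example-1" });

  // Consumer: processes jobs from the same queue using the same connection settings.
  const worker = new Worker(
    "{scrapeQueue}",
    async (job) => {
      return { scraped: job.data.url };
    },
    { connection }
  );

  worker.on("completed", (job) => console.log(`Job ${job.id} done`));
}

main().catch(console.error);
```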


@ -1,74 +1,362 @@
import "dotenv/config";
import "./sentry";
import * as Sentry from "@sentry/node";
import { CustomError } from "../lib/custom-error";
import {
getScrapeQueue,
redisConnection,
scrapeQueueName,
} from "./queue-service";
import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job";
import { initSDK } from "@hyperdx/node-opentelemetry";
import { Job } from "bullmq";
import { Logger } from "../lib/logger";
import { Worker } from "bullmq";
import systemMonitor from "./system-monitor";
import { v4 as uuidv4 } from "uuid";
import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, getCrawlJobs, lockURL } from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
import { addJobPriority, deleteJobPriority, getJobPriority } from "../../src/lib/job-priority";
import { PlanType } from "../types";

if (process.env.ENV === "production") {
initSDK({
consoleCapture: true,
additionalInstrumentations: [],
});
}
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
const workerStalledCheckInterval =
Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
const jobLockExtendInterval =
Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
const jobLockExtensionTime =
Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
const cantAcceptConnectionInterval =
Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
const connectionMonitorInterval =
Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
const processJobInternal = async (token: string, job: Job) => {
const extendLockInterval = setInterval(async () => {
Logger.info(`🐂 Worker extending lock on job ${job.id}`);
await job.extendLock(token, jobLockExtensionTime);
}, jobLockExtendInterval);
await addJobPriority(job.data.team_id, job.id );
let err = null;
try {
const result = await processJob(job, token);
try{
if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
await job.moveToCompleted(null, token, false);
} else {
await job.moveToCompleted(result.docs, token, false);
}
}catch(e){
}
} catch (error) {
console.log("Job failed, error:", error);
Sentry.captureException(error);
err = error;
await job.moveToFailed(error, token, false);
} finally {
await deleteJobPriority(job.data.team_id, job.id );
clearInterval(extendLockInterval);
}
return err;
};
let isShuttingDown = false;
process.on("SIGINT", () => {
console.log("Received SIGINT. Shutting down gracefully...");
isShuttingDown = true;
});
const workerFun = async (queueName: string, processJobInternal: (token: string, job: Job) => Promise<any>) => {
const worker = new Worker(queueName, null, {
connection: redisConnection,
lockDuration: 1 * 60 * 1000, // 1 minute
// lockRenewTime: 15 * 1000, // 15 seconds
stalledInterval: 30 * 1000, // 30 seconds
maxStalledCount: 10, // 10 times
});
worker.startStalledCheckTimer();
const monitor = await systemMonitor;
while (true) {
if (isShuttingDown) {
console.log("No longer accepting new jobs. SIGINT");
break;
}
const token = uuidv4();
const canAcceptConnection = await monitor.acceptConnection();
if (!canAcceptConnection) {
console.log("Cant accept connection");
await sleep(cantAcceptConnectionInterval); // more sleep
continue;
}
const job = await worker.getNextJob(token);
if (job) {
if (job.data && job.data.sentry && Sentry.isInitialized()) {
Sentry.continueTrace({ sentryTrace: job.data.sentry.trace, baggage: job.data.sentry.baggage }, () => {
Sentry.startSpan({
name: "Scrape job",
attributes: {
job: job.id,
worker: process.env.FLY_MACHINE_ID ?? worker.id,
},
}, async (span) => {
await Sentry.startSpan({
name: "Process scrape job",
op: "queue.process",
attributes: {
"messaging.message.id": job.id,
"messaging.destination.name": getScrapeQueue().name,
"messaging.message.body.size": job.data.sentry.size,
"messaging.message.receive.latency": Date.now() - (job.processedOn ?? job.timestamp),
"messaging.message.retry.count": job.attemptsMade,
}
}, async () => {
const res = await processJobInternal(token, job);
if (res !== null) {
span.setStatus({ code: 2 }); // ERROR
} else {
span.setStatus({ code: 1 }); // OK
}
});
});
});
} else {
Sentry.startSpan({
name: "Scrape job",
attributes: {
job: job.id,
worker: process.env.FLY_MACHINE_ID ?? worker.id,
},
}, () => {
processJobInternal(token, job);
});
}
await sleep(gotJobInterval);
} else {
await sleep(connectionMonitorInterval);
}
}
};
workerFun(scrapeQueueName, processJobInternal);
async function processJob(job: Job, token: string) {
Logger.info(`🐂 Worker taking job ${job.id}`);
// Check if the job URL is researchhub and block it immediately
// TODO: remove this once solve the root issue
if (job.data.url && (job.data.url.includes("researchhub.com") || job.data.url.includes("ebay.com") || job.data.url.includes("youtube.com") || job.data.url.includes("microsoft.com") )) {
Logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
const data = {
success: false,
docs: [],
project_id: job.data.project_id,
error: "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
};
await job.moveToCompleted(data.docs, token, false);
return data;
}
try {
job.updateProgress({
current: 1,
total: 100,
current_step: "SCRAPING",
current_url: "",
});

const start = Date.now();

const { success, message, docs } = await startWebScraperPipeline({
job,
token,
});

const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000;
const rawHtml = docs[0] ? docs[0].rawHtml : "";
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
if (docs[0] && docs[0].rawHtml) {
delete docs[0].rawHtml;
}
}
const data = {
success,
result: {
links: docs.map((doc) => {
return {
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
};
}),
},
project_id: job.data.project_id,
error: message /* etc... */,
docs,
};

if (job.data.mode === "crawl") {
await callWebhook(job.data.team_id, job.id as string, data, job.data.webhook, job.data.v1);
}
if (job.data.crawl_id) {
await logJob({
job_id: job.id as string,
success: success,
message: message,
num_docs: docs.length,
docs: docs,
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
await addCrawlJobDone(job.data.crawl_id, job.id);
const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
if (!job.data.sitemapped) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
const links = crawler.filterLinks(
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10
)
for (const link of links) {
if (await lockURL(job.data.crawl_id, sc, link)) {
// This seems to work really well
const jobPriority = await getJobPriority({plan:sc.plan as PlanType, team_id: sc.team_id, basePriority: job.data.crawl_id ? 20 : 10})
const jobId = uuidv4();
// console.log("plan: ", sc.plan);
// console.log("team_id: ", sc.team_id)
// console.log("base priority: ", job.data.crawl_id ? 20 : 10)
// console.log("job priority: " , jobPriority, "\n\n\n")
const newJob = await addScrapeJob({
url: link,
mode: "single_urls",
crawlerOptions: sc.crawlerOptions,
team_id: sc.team_id,
pageOptions: sc.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
v1: job.data.v1,
}, {}, jobId, jobPriority);
await addCrawlJob(job.data.crawl_id, newJob.id);
}
}
}
}
if (await finishCrawl(job.data.crawl_id)) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
const jobs = (await Promise.all(jobIDs.map(async x => {
if (x === job.id) {
return {
async getState() {
return "completed"
},
timestamp: Date.now(),
returnvalue: docs,
}
}
const j = await getScrapeQueue().getJob(x);
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobById(j.id);
if (supabaseData) {
j.returnvalue = supabaseData.docs;
}
}
return j;
}))).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled || jobStatuses.some(x => x === "failed") ? "failed" : "completed";
const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
message: sc.cancelled ? "Cancelled" : message,
num_docs: fullDocs.length,
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
});
const data = {
success: jobStatus !== "failed",
result: {
links: fullDocs.map((doc) => {
return {
content: doc,
source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
};
}),
},
project_id: job.data.project_id,
error: message /* etc... */,
docs: fullDocs,
};
await callWebhook(job.data.team_id, job.data.crawl_id, data, job.data.webhook, job.data.v1);
}
}
await logJob({
job_id: job.id as string,
success: success,
message: message,
num_docs: docs.length,
docs: docs,
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: "crawl",
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
origin: job.data.origin,
});
Logger.info(`🐂 Job done ${job.id}`);
return data;
} catch (error) {
Logger.error(`🐂 Job errored ${job.id} - ${error}`);

Sentry.captureException(error, {
data: {
job: job.id
},
});

if (error instanceof CustomError) {
// Here we handle the error, then save the failed job
@@ -81,6 +369,9 @@ async function processJob(job: Job, done) {
});
}

Logger.error(error);
if (error.stack) {
Logger.error(error.stack);
}
logtail.error("Overall error ingesting", { logtail.error("Overall error ingesting", {
job_id: job.id, job_id: job.id,
@ -89,37 +380,69 @@ async function processJob(job: Job, done) {
const data = {
success: false,
docs: [],
project_id: job.data.project_id,
error:
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
};

if (job.data.mode === "crawl" || job.data.crawl_id) {
await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data, job.data.webhook, job.data.v1);
}

if (job.data.crawl_id) {
await logJob({
job_id: job.id as string,
success: false,
message:
typeof error === "string"
? error
: error.message ?? "Something went wrong... Contact help@mendable.ai",
num_docs: 0,
docs: [],
time_taken: 0,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
});
const sc = await getCrawl(job.data.crawl_id);
await logJob({
job_id: job.data.crawl_id,
success: false,
message:
typeof error === "string"
? error
: error.message ?? "Something went wrong... Contact help@mendable.ai",
num_docs: 0,
docs: [],
time_taken: 0,
team_id: job.data.team_id,
mode: "crawl",
url: sc ? sc.originUrl : job.data.url,
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
origin: job.data.origin,
});
}
// done(null, data);
return data;
}
}
// wsq.process(
//   Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
//   processJob
// );

// wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
// wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
// wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
// wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
// wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
// wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));

View File

@@ -65,7 +65,7 @@ describe("Rate Limiter Service", () => {
"test-prefix:someToken",
"standard"
);
expect(limiter2.points).toBe(100);

const limiter3 = getRateLimiter(
"search" as RateLimiterMode,

@@ -79,7 +79,7 @@ describe("Rate Limiter Service", () => {
"test-prefix:someToken",
"growth"
);
expect(limiter4.points).toBe(250);
});

it("should return the default rate limiter if plan is not provided", () => {

@@ -153,7 +153,7 @@ describe("Rate Limiter Service", () => {
"crawlStatus" as RateLimiterMode,
"test-prefix:someToken"
);
expect(limiter2.points).toBe(250);
});

it("should consume points correctly for 'crawl' mode", async () => {

@@ -188,14 +188,13 @@ describe("Rate Limiter Service", () => {
"test-prefix:someTokenXY",
"hobby"
);
expect(limiter.points).toBe(20);

const consumePoints = 5;
const res = await limiter.consume("test-prefix:someTokenXY", consumePoints);
expect(res.consumedPoints).toBe(5);
expect(res.remainingPoints).toBe(15);
});

it("should return the correct rate limiter for 'crawl' mode", () => {

@@ -227,7 +226,7 @@ describe("Rate Limiter Service", () => {
"test-prefix:someToken",
"free"
);
expect(limiter.points).toBe(10);

const limiter2 = getRateLimiter(
"scrape" as RateLimiterMode,

@@ -241,7 +240,14 @@ describe("Rate Limiter Service", () => {
"test-prefix:someToken",
"standard"
);
expect(limiter3.points).toBe(100);

const limiter4 = getRateLimiter(
"scrape" as RateLimiterMode,
"test-prefix:someToken",
"growth"
);
expect(limiter4.points).toBe(1000);
});

it("should return the correct rate limiter for 'search' mode", () => {

@@ -309,7 +315,7 @@ describe("Rate Limiter Service", () => {
"crawlStatus" as RateLimiterMode,
"test-prefix:someToken"
);
expect(limiter2.points).toBe(250);
});

it("should return the correct rate limiter for 'testSuite' mode", () => {

View File

@@ -14,18 +14,20 @@ const RATE_LIMITS = {
standardNew: 10,
standardnew: 10,
growth: 50,
growthdouble: 50,
},
scrape: {
default: 20,
free: 10,
starter: 20,
standard: 100,
standardOld: 40,
scale: 500,
hobby: 20,
standardNew: 100,
standardnew: 100,
growth: 1000,
growthdouble: 1000,
},
search: {
default: 20,

@@ -38,6 +40,20 @@ const RATE_LIMITS = {
standardNew: 50,
standardnew: 50,
growth: 500,
growthdouble: 500,
},
map: {
default: 20,
free: 5,
starter: 20,
standard: 40,
standardOld: 40,
scale: 500,
hobby: 10,
standardNew: 50,
standardnew: 50,
growth: 500,
growthdouble: 500,
},
preview: {
free: 5,

@@ -49,7 +65,7 @@ const RATE_LIMITS = {
},
crawlStatus: {
free: 150,
default: 250,
},
testSuite: {
free: 10000,

@@ -81,16 +97,28 @@ export const testSuiteRateLimiter = new RateLimiterRedis({
duration: 60, // Duration in seconds
});

export const devBRateLimiter = new RateLimiterRedis({
storeClient: redisRateLimitClient,
keyPrefix: "dev-b",
points: 1200,
duration: 60, // Duration in seconds
});

export function getRateLimiter(
mode: RateLimiterMode,
token: string,
plan?: string,
teamId?: string
) {
if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673") || token.includes("23befa1b")) {
return testSuiteRateLimiter;
}

if (teamId && teamId === process.env.DEV_B_TEAM_ID) {
return devBRateLimiter;
}

const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5}
if (!rateLimitConfig) return serverRateLimiter;
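
A consumption sketch for the limiter returned above, assuming rate-limiter-flexible's consume/RateLimiterRes API and a sibling module path; the token and point cost are illustrative:

// Sketch: consume one point per request and surface a retry-after hint when exhausted.
import { RateLimiterRes } from "rate-limiter-flexible";
import { getRateLimiter } from "./rate-limiter";
import { RateLimiterMode } from "../types";

export async function checkScrapeLimit(token: string, plan?: string) {
  const limiter = getRateLimiter("scrape" as RateLimiterMode, token, plan);
  try {
    await limiter.consume(token, 1);
    return { allowed: true };
  } catch (err) {
    // rate-limiter-flexible rejects with a RateLimiterRes when the budget is exhausted
    const res = err as RateLimiterRes;
    return { allowed: false, retryAfterMs: res.msBeforeNext };
  }
}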

View File

@@ -0,0 +1,18 @@
// Import with `import * as Sentry from "@sentry/node"` if you are using ESM
import * as Sentry from "@sentry/node";
import { nodeProfilingIntegration } from "@sentry/profiling-node";
import { Logger } from "../lib/logger";
if (process.env.SENTRY_DSN) {
Logger.info("Setting up Sentry...");
Sentry.init({
dsn: process.env.SENTRY_DSN,
integrations: [
nodeProfilingIntegration(),
],
tracesSampleRate: process.env.SENTRY_ENVIRONMENT === "dev" ? 1.0 : 0.045,
profilesSampleRate: 1.0,
serverName: process.env.FLY_MACHINE_ID,
environment: process.env.SENTRY_ENVIRONMENT ?? "production",
});
}
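
Importing this module for its side effects (as the worker does with import "./sentry") initializes the SDK once per process; afterwards spans and exceptions can be reported from anywhere. A minimal sketch, assuming the startSpan API of the @sentry/node version used here; the names are illustrative:

// Sketch: wrap a unit of work in a span and report failures.
import * as Sentry from "@sentry/node";

export async function instrumented<T>(name: string, fn: () => Promise<T>): Promise<T> {
  return await Sentry.startSpan({ name, op: "task" }, async () => {
    try {
      return await fn();
    } catch (error) {
      Sentry.captureException(error);
      throw error;
    }
  });
}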

View File

@@ -0,0 +1,81 @@
import si from 'systeminformation';
import { Mutex } from "async-mutex";
const MAX_CPU = process.env.MAX_CPU ? parseFloat(process.env.MAX_CPU) : 0.8;
const MAX_RAM = process.env.MAX_RAM ? parseFloat(process.env.MAX_RAM) : 0.8;
const CACHE_DURATION = process.env.SYS_INFO_MAX_CACHE_DURATION ? parseFloat(process.env.SYS_INFO_MAX_CACHE_DURATION) : 150;
class SystemMonitor {
private static instance: SystemMonitor;
private static instanceMutex = new Mutex();
private cpuUsageCache: number | null = null;
private memoryUsageCache: number | null = null;
private lastCpuCheck: number = 0;
private lastMemoryCheck: number = 0;
private constructor() {}
public static async getInstance(): Promise<SystemMonitor> {
if (SystemMonitor.instance) {
return SystemMonitor.instance;
}
await this.instanceMutex.runExclusive(async () => {
if (!SystemMonitor.instance) {
SystemMonitor.instance = new SystemMonitor();
}
});
return SystemMonitor.instance;
}
private async checkMemoryUsage() {
const now = Date.now();
if (this.memoryUsageCache !== null && (now - this.lastMemoryCheck) < CACHE_DURATION) {
return this.memoryUsageCache;
}
const memoryData = await si.mem();
const totalMemory = memoryData.total;
const availableMemory = memoryData.available;
const usedMemory = totalMemory - availableMemory;
const usedMemoryPercentage = (usedMemory / totalMemory);
this.memoryUsageCache = usedMemoryPercentage;
this.lastMemoryCheck = now;
return usedMemoryPercentage;
}
private async checkCpuUsage() {
const now = Date.now();
if (this.cpuUsageCache !== null && (now - this.lastCpuCheck) < CACHE_DURATION) {
return this.cpuUsageCache;
}
const cpuData = await si.currentLoad();
const cpuLoad = cpuData.currentLoad / 100;
this.cpuUsageCache = cpuLoad;
this.lastCpuCheck = now;
return cpuLoad;
}
public async acceptConnection() {
const cpuUsage = await this.checkCpuUsage();
const memoryUsage = await this.checkMemoryUsage();
return cpuUsage < MAX_CPU && memoryUsage < MAX_RAM;
}
public clearCache() {
this.cpuUsageCache = null;
this.memoryUsageCache = null;
this.lastCpuCheck = 0;
this.lastMemoryCheck = 0;
}
}
export default SystemMonitor.getInstance();
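
A usage sketch of the admission check, mirroring how the worker loop consumes the exported promise; the back-off delay is illustrative:

// Sketch: wait until CPU and memory are below the configured ceilings before taking work.
import systemMonitor from "./system-monitor";

const wait = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

export async function whenCapacityAvailable() {
  const monitor = await systemMonitor; // the default export is a promise of the singleton
  while (!(await monitor.acceptConnection())) {
    await wait(2000);
  }
}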

View File

@@ -1,15 +1,16 @@
import { legacyDocumentConverter } from "../../src/controllers/v1/types";
import { Logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase";

export const callWebhook = async (teamId: string, jobId: string, data: any, specified?: string, v1 = false) => {
try {
const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace("{{JOB_ID}}", jobId);
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
let webhookUrl = specified ?? selfHostedUrl;

// Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set
// and the USE_DB_AUTHENTICATION environment variable is set to true
if (!webhookUrl && useDbAuthentication) {
const { data: webhooksData, error } = await supabase_service
.from("webhooks")
.select("url")

@@ -30,11 +31,15 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
let dataToSend = [];
if (data.result.links && data.result.links.length !== 0) {
for (let i = 0; i < data.result.links.length; i++) {
if (v1) {
dataToSend.push(legacyDocumentConverter(data.result.links[i].content))
} else {
dataToSend.push({
content: data.result.links[i].content.content,
markdown: data.result.links[i].content.markdown,
metadata: data.result.links[i].content.metadata,
});
}
}
}
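
For self-hosted deployments, the URL in SELF_HOSTED_WEBHOOK_URL (or the specified URL passed by the caller) receives these payloads. A receiver sketch, assuming an Express app; the route, port, and body handling are illustrative, and the payload shape is whatever this function ends up POSTing (assembled further down, outside this hunk):

// Sketch: a self-hosted endpoint that accepts Firecrawl webhook calls.
// The {{JOB_ID}} placeholder in SELF_HOSTED_WEBHOOK_URL is substituted per job, e.g.
// SELF_HOSTED_WEBHOOK_URL=https://example.internal/webhooks/firecrawl/{{JOB_ID}}
import express from "express";

const app = express();
app.use(express.json({ limit: "10mb" }));

app.post("/webhooks/firecrawl/:jobId", (req, res) => {
  console.log("webhook for job", req.params.jobId, "keys:", Object.keys(req.body ?? {}));
  res.sendStatus(200); // acknowledge quickly; do heavy processing asynchronously
});

app.listen(3002);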

View File

@@ -25,8 +25,14 @@ export interface WebScraperOptions {
mode: Mode;
crawlerOptions: any;
pageOptions: any;
extractorOptions?: any;
team_id: string;
origin?: string;
crawl_id?: string;
sitemapped?: boolean;
webhook?: string;
v1?: boolean;
is_scrape?: boolean;
}

export interface RunWebScraperParams {

@@ -34,11 +40,14 @@ export interface RunWebScraperParams {
mode: Mode;
crawlerOptions: any;
pageOptions?: any;
extractorOptions?: any;
inProgress: (progress: any) => void;
onSuccess: (result: any, mode: string) => void;
onError: (error: Error) => void;
team_id: string;
bull_job_id: string;
priority?: number;
is_scrape?: boolean;
}

export interface RunWebScraperResult {

@@ -63,6 +72,7 @@ export interface FirecrawlJob {
extractor_options?: ExtractorOptions,
num_tokens?: number,
retry?: boolean,
crawl_id?: string;
}

export interface FirecrawlScrapeResponse {

@@ -99,6 +109,7 @@ export enum RateLimiterMode {
Scrape = "scrape",
Preview = "preview",
Search = "search",
Map = "map",
}

@@ -107,7 +118,8 @@ export interface AuthResponse {
team_id?: string;
error?: string;
status?: number;
api_key?: string;
plan?: PlanType;
}

@@ -130,4 +142,15 @@ export type ScrapeLog = {
html?: string;
ipv4_support?: boolean | null;
ipv6_support?: boolean | null;
};
export type PlanType =
| "starter"
| "standard"
| "scale"
| "hobby"
| "standardnew"
| "growth"
| "growthdouble"
| "free"
| "";

View File

@@ -2,16 +2,22 @@
"compilerOptions": { "compilerOptions": {
"rootDir": "./src", "rootDir": "./src",
"lib": ["es6","DOM"], "lib": ["es6","DOM"],
"target": "ES2020", // or higher
// or higher
"target": "ES2020",
"module": "commonjs", "module": "commonjs",
"esModuleInterop": true, "esModuleInterop": true,
"sourceMap": true, "sourceMap": true,
"outDir": "./dist/src", "outDir": "./dist/src",
"moduleResolution": "node", "moduleResolution": "node",
"baseUrl": ".", "baseUrl": ".",
"paths": { "paths": {
"*": ["node_modules/*", "src/types/*"], "*": ["node_modules/*", "src/types/*"],
} },
"inlineSources": true
}, },
"include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
} }

Some files were not shown because too many files have changed in this diff.