Merge branch 'main' into bug/crawl-limit

This commit is contained in:
rafaelsideguide 2024-05-22 14:27:28 -03:00
commit f4a3469b9e
44 changed files with 3497 additions and 309 deletions

View File

@ -25,6 +25,9 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
jobs: jobs:
pre-deploy: pre-deploy:

View File

@ -6,7 +6,7 @@ _This repository is in its early development stages. We are still merging custom
## What is Firecrawl? ## What is Firecrawl?
[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown. We crawl all accessible subpages and give you clean markdown for each. No sitemap required. [Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required.
_Pst. hey, you, join our stargazers :)_ _Pst. hey, you, join our stargazers :)_
@ -114,7 +114,7 @@ Response:
### Search (Beta) ### Search (Beta)
Used to search the web, get the most relevant results, scrap each page and return the markdown. Used to search the web, get the most relevant results, scrape each page and return the markdown.
```bash ```bash
curl -X POST https://api.firecrawl.dev/v0/search \ curl -X POST https://api.firecrawl.dev/v0/search \
@ -296,7 +296,6 @@ npm install @mendable/firecrawl-js
1. Get an API key from [firecrawl.dev](https://firecrawl.dev) 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
### Scraping a URL ### Scraping a URL
To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary. To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

View File

@ -1,6 +1,31 @@
# Self-hosting Firecrawl # Self-hosting Firecrawl
*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.*
Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.
*This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it* ## Getting Started
First, clone this repository and copy the example env file `.env.example` from the `apps/api` folder to `.env` in the repository root.
```bash
git clone https://github.com/mendableai/firecrawl.git
cd firecrawl
cp ./apps/api/.env.example ./.env
```
To run the simplest version of Firecrawl, set `USE_DB_AUTHENTICATION` to `false` in `.env` so that database authentication is not used.
```yml
USE_DB_AUTHENTICATION=false
```
Update the Redis URL in the .env file to align with the Docker configuration:
```yml
REDIS_URL=redis://redis:6379
```
Once that's complete, you can simply run the following commands to get started:
```bash
docker compose up
```
This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.

View File

@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8
PORT=3002 PORT=3002
HOST=0.0.0.0 HOST=0.0.0.0
REDIS_URL=redis://localhost:6379 REDIS_URL=redis://localhost:6379
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000
## To turn on DB authentication, you need to set up supabase. ## To turn on DB authentication, you need to set up supabase.
USE_DB_AUTHENTICATION=true USE_DB_AUTHENTICATION=true
@ -16,14 +17,22 @@ SUPABASE_SERVICE_TOKEN=
# Other Optionals # Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
RATE_LIMIT_TEST_API_KEY_SCRAPE= # set if you'd like to test the scraping rate limit
RATE_LIMIT_TEST_API_KEY_CRAWL= # set if you'd like to test the crawling rate limit
SCRAPING_BEE_API_KEY= #Set if you'd like to use ScrapingBee to handle JS blocking SCRAPING_BEE_API_KEY= #Set if you'd like to use ScrapingBee to handle JS blocking
OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.) OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.)
BULL_AUTH_KEY= # BULL_AUTH_KEY= #
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs POSTHOG_HOST= # set if you'd like to send posthog events like job logs
STRIPE_PRICE_ID_STANDARD=
STRIPE_PRICE_ID_SCALE=
HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta

View File

@ -18,8 +18,8 @@
"paths": { "paths": {
"/scrape": { "/scrape": {
"post": { "post": {
"summary": "Scrape a single URL", "summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeSingleUrl", "operationId": "scrapeAndExtractFromUrl",
"tags": ["Scraping"], "tags": ["Scraping"],
"security": [ "security": [
{ {
@ -45,8 +45,43 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
},
"extractorOptions": {
"type": "object",
"description": "Options for LLM-based extraction of structured information from the page content",
"properties": {
"mode": {
"type": "string",
"enum": ["llm-extraction"],
"description": "The extraction mode to use, currently supports 'llm-extraction'"
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page"
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
} }
}, },
"required": ["url"] "required": ["url"]
@ -126,9 +161,20 @@
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false "default": false
}, },
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may not be as accurate and shouldn't be used on heavily JS-rendered websites.",
"default": "default"
},
"limit": { "limit": {
"type": "integer", "type": "integer",
"description": "Maximum number of pages to crawl" "description": "Maximum number of pages to crawl",
"default": 10000
} }
} }
}, },
@ -139,6 +185,11 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
} }
@ -191,7 +242,7 @@
"query": { "query": {
"type": "string", "type": "string",
"format": "uri", "format": "uri",
"description": "The URL to scrape" "description": "The query to search for"
}, },
"pageOptions": { "pageOptions": {
"type": "object", "type": "object",
@ -205,6 +256,11 @@
"type": "boolean", "type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true "default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
}, },
@ -298,9 +354,66 @@
"data": { "data": {
"type": "array", "type": "array",
"items": { "items": {
"$ref": "#/components/schemas/ScrapeResponse" "$ref": "#/components/schemas/CrawlStatusResponseObj"
}, },
"description": "Data returned from the job (null when it is in progress)" "description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Partial documents returned while the website is being crawled (streaming). When a page is ready it is appended to the partial_data array, so there is no need to wait for the whole website to be crawled."
}
}
}
}
}
},
"402": {
"description": "Payment required"
},
"429": {
"description": "Too many requests"
},
"500": {
"description": "Server error"
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
} }
} }
} }
@ -343,6 +456,11 @@
"content": { "content": {
"type": "string" "type": "string"
}, },
"html": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": { "metadata": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -361,6 +479,51 @@
"format": "uri" "format": "uri"
} }
} }
},
"llm_extraction": {
"type": "object",
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
"nullable": true
},
"warning": {
"type": "string",
"nullable": true,
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
}
}
}
}
},
"CrawlStatusResponseObj": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
} }
} }
} }

View File

@ -33,6 +33,7 @@
"express": "^4.18.2", "express": "^4.18.2",
"jest": "^29.6.3", "jest": "^29.6.3",
"jest-fetch-mock": "^3.0.3", "jest-fetch-mock": "^3.0.3",
"mammoth": "^1.7.2",
"nodemon": "^2.0.20", "nodemon": "^2.0.20",
"supabase": "^1.77.9", "supabase": "^1.77.9",
"supertest": "^6.3.3", "supertest": "^6.3.3",
@ -47,6 +48,7 @@
"@bull-board/express": "^5.8.0", "@bull-board/express": "^5.8.0",
"@devil7softwares/pos": "^1.0.2", "@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.13", "@dqbd/tiktoken": "^1.0.13",
"@hyperdx/node-opentelemetry": "^0.7.0",
"@logtail/node": "^0.4.12", "@logtail/node": "^0.4.12",
"@nangohq/node": "^0.36.33", "@nangohq/node": "^0.36.33",
"@sentry/node": "^7.48.0", "@sentry/node": "^7.48.0",

File diff suppressed because it is too large Load Diff

View File

@ -81,7 +81,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("markdown"); expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data).not.toHaveProperty("html"); expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data.content).toContain("🔥 FireCrawl"); expect(response.body.data.content).toContain("🔥 Firecrawl");
}, 30000); // 30 seconds timeout }, 30000); // 30 seconds timeout
it("should return a successful response with a valid API key and includeHtml set to true", async () => { it("should return a successful response with a valid API key and includeHtml set to true", async () => {
@ -99,10 +99,40 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("markdown"); expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("html"); expect(response.body.data).toHaveProperty("html");
expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain("🔥 FireCrawl"); expect(response.body.data.content).toContain("🔥 Firecrawl");
expect(response.body.data.markdown).toContain("🔥 FireCrawl"); expect(response.body.data.markdown).toContain("🔥 Firecrawl");
expect(response.body.data.html).toContain("<h1"); expect(response.body.data.html).toContain("<h1");
}, 30000); // 30 seconds timeout }, 30000); // 30 seconds timeout
// Scrapes a PDF whose URL ends in `.pdf` and checks that the response carries
// `content` and `metadata`, with `content` containing a known sentence from the paper.
it('should return a successful response for a valid scrape with PDF file', async () => {
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;
  const scrapeRequest = request(TEST_URL)
    .post('/v0/scrape')
    .set('Authorization', authHeader)
    .set('Content-Type', 'application/json');

  const pdfResponse = await scrapeRequest.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });

  // Fixed 6 second pause between receiving the response and asserting on it.
  await new Promise((resolve) => setTimeout(resolve, 6000));

  expect(pdfResponse.statusCode).toBe(200);
  expect(pdfResponse.body).toHaveProperty('data');

  expect(pdfResponse.body.data).toHaveProperty('content');
  expect(pdfResponse.body.data).toHaveProperty('metadata');
  expect(pdfResponse.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds
// Same document as the `.pdf` scrape test, but requested through a URL without
// the `.pdf` suffix; the same `content` and `metadata` assertions apply.
it('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;
  const scrapeRequest = request(TEST_URL)
    .post('/v0/scrape')
    .set('Authorization', authHeader)
    .set('Content-Type', 'application/json');

  const pdfResponse = await scrapeRequest.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });

  // Fixed 6 second pause between receiving the response and asserting on it.
  await new Promise((resolve) => setTimeout(resolve, 6000));

  expect(pdfResponse.statusCode).toBe(200);
  expect(pdfResponse.body).toHaveProperty('data');

  expect(pdfResponse.body.data).toHaveProperty('content');
  expect(pdfResponse.body.data).toHaveProperty('metadata');
  expect(pdfResponse.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds
}); });
describe("POST /v0/crawl", () => { describe("POST /v0/crawl", () => {
@ -146,7 +176,274 @@ describe("E2E Tests for API Routes", () => {
); );
}); });
// Additional tests for insufficient credits? it("should return a successful response with a valid API key and valid includes option", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai",
limit: 10,
crawlerOptions: {
includes: ["blog/*"],
},
});
let response;
let isFinished = false;
while (!isFinished) {
response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
isFinished = response.body.status === "completed";
if (!isFinished) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
const completedResponse = response;
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
expect(urls.length).toBeGreaterThan(5);
urls.forEach((url: string) => {
console.log({url})
expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
});
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
}, 60000); // 60 seconds
// Crawls mendable.ai with `excludes: ["blog/*"]`, polls the status endpoint
// until the job is "completed", and verifies that none of the returned
// documents come from the blog section.
it("should return a successful response with a valid API key and valid excludes option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      // NOTE(review): other tests in this suite pass `limit` inside
      // `crawlerOptions`; confirm that a top-level `limit` is honoured.
      limit: 10,
      crawlerOptions: {
        excludes: ["blog/*"],
      },
    });

  let isFinished = false;
  let response;

  // Poll once per second until the crawl job reports "completed".
  while (!isFinished) {
    response = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("status");
    isFinished = response.body.status === "completed";

    if (!isFinished) {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
    }
  }

  const completedResponse = response;

  const urls = completedResponse.body.data.map(
    (item: any) => item.metadata?.sourceURL
  );
  expect(urls.length).toBeGreaterThan(5);
  urls.forEach((url: string) => {
    // The host must be spelled "www": the previous "wwww" prefix could never
    // match any crawled URL, so the exclusion check always passed.
    expect(url.startsWith("https://www.mendable.ai/blog/")).toBeFalsy();
  });
}, 60000); // 60 seconds
// Starts a crawl of mendable.ai with `crawlerOptions.limit` set to 3, polls the
// status endpoint once per second until the job is "completed", and expects
// exactly three documents in the result.
it("should return a successful response with a valid API key and limit to 3", async () => {
  const authHeader = `Bearer ${process.env.TEST_API_KEY}`;

  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", authHeader)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      crawlerOptions: { limit: 3 },
    });

  let statusResponse;
  for (;;) {
    statusResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", authHeader);
    expect(statusResponse.statusCode).toBe(200);
    expect(statusResponse.body).toHaveProperty("status");

    if (statusResponse.body.status === "completed") {
      break;
    }
    // Not finished yet: wait one second before asking again.
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  const completedResponse = statusResponse;
  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data.length).toBe(3);

  const firstDocument = completedResponse.body.data[0];
  expect(firstDocument).toHaveProperty("content");
  expect(firstDocument).toHaveProperty("markdown");
  expect(firstDocument).toHaveProperty("metadata");
  expect(firstDocument.content).toContain("Mendable");
}, 60000); // 60 seconds
// Starts a crawl of scrapethissite.com with `crawlerOptions.maxDepth` set to 2,
// checks that the job is initially "active", waits a fixed 60 seconds, and then
// asserts on the completed result and on the path depth of every returned URL.
it("should return a successful response with max depth option for a valid crawl job", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com",
crawlerOptions: { maxDepth: 2 },
});
expect(crawlResponse.statusCode).toBe(200);
// First status check is made immediately after the job is created.
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
// NOTE(review): assumes the job has not already finished by the time of this first check — confirm.
expect(response.body.status).toBe("active");
// wait for 60 seconds
// NOTE(review): fixed wait instead of polling; the assertions below presume the crawl completes within this window.
await new Promise((r) => setTimeout(r, 60000));
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
// Collect the source URL of every returned document.
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
expect(urls.length).toBeGreaterThan(1);
// Check if all URLs have a maximum depth of 1
// Depth here is the number of non-empty path segments of the URL.
urls.forEach((url: string) => {
const depth = new URL(url).pathname.split("/").filter(Boolean).length;
expect(depth).toBeLessThanOrEqual(1);
});
// Jest timeout of 120 seconds, which leaves room for the 60 second wait above.
}, 120000);
// it("should return a successful response with a valid API key and valid limit option", async () => {
// const crawlResponse = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({
// url: "https://mendable.ai",
// crawlerOptions: { limit: 10 },
// });
// const response = await request(TEST_URL)
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("status");
// expect(response.body.status).toBe("active");
// let isCompleted = false;
// while (!isCompleted) {
// const statusCheckResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(statusCheckResponse.statusCode).toBe(200);
// isCompleted = statusCheckResponse.body.status === "completed";
// if (!isCompleted) {
// await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
// }
// }
// const completedResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(completedResponse.statusCode).toBe(200);
// expect(completedResponse.body).toHaveProperty("status");
// expect(completedResponse.body.status).toBe("completed");
// expect(completedResponse.body).toHaveProperty("data");
// expect(completedResponse.body.data.length).toBe(10);
// expect(completedResponse.body.data[0]).toHaveProperty("content");
// expect(completedResponse.body.data[0]).toHaveProperty("markdown");
// expect(completedResponse.body.data[0]).toHaveProperty("metadata");
// expect(completedResponse.body.data[0].content).toContain("Mendable");
// expect(completedResponse.body.data[0].content).not.toContain("main menu");
// }, 60000); // 60 seconds
// Starts a crawl of firecrawl.dev with `pageOptions.includeHtml` enabled, polls
// the status endpoint once per second until the job is "completed", then checks
// that the first document exposes `content`, `markdown`, `metadata` and `html`.
it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
  const crawlResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://firecrawl.dev",
      pageOptions: { includeHtml: true },
    });
  expect(crawlResponse.statusCode).toBe(200);

  // First status check is made immediately after the job is created.
  const response = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  expect(response.statusCode).toBe(200);
  expect(response.body).toHaveProperty("status");
  expect(response.body.status).toBe("active");

  let isCompleted = false;
  while (!isCompleted) {
    const statusCheckResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    expect(statusCheckResponse.statusCode).toBe(200);
    isCompleted = statusCheckResponse.body.status === "completed";
    if (!isCompleted) {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
    }
  }

  // Fetch the final status once more and assert on that response.
  const completedResponse = await request(TEST_URL)
    .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

  expect(completedResponse.statusCode).toBe(200);
  expect(completedResponse.body).toHaveProperty("status");
  expect(completedResponse.body.status).toBe("completed");
  expect(completedResponse.body).toHaveProperty("data");
  expect(completedResponse.body.data[0]).toHaveProperty("content");
  expect(completedResponse.body.data[0]).toHaveProperty("markdown");
  expect(completedResponse.body.data[0]).toHaveProperty("metadata");
  // `html` is the property this test is about (requested via includeHtml: true).
  expect(completedResponse.body.data[0]).toHaveProperty("html");
  expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
  expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
  expect(completedResponse.body.data[0].html).toContain("<h1");
}, 60000); // 60 seconds
}); });
describe("POST /v0/crawlWebsitePreview", () => { describe("POST /v0/crawlWebsitePreview", () => {
@ -176,6 +473,16 @@ describe("E2E Tests for API Routes", () => {
// expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); // expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
// }); // });
// Sends a scrape request with a 1000 ms `timeout` and expects the API to
// answer with HTTP 408.
it("should return a timeout error when scraping takes longer than the specified timeout", async () => {
  const payload = { url: "https://firecrawl.dev", timeout: 1000 };

  const timedOutResponse = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(timedOutResponse.statusCode).toBe(408);
}, 3000);
it("should return a successful response with a valid API key", async () => { it("should return a successful response with a valid API key", async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
.post("/v0/crawlWebsitePreview") .post("/v0/crawlWebsitePreview")
@ -238,7 +545,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(404); expect(response.statusCode).toBe(404);
}); });
it("should return a successful response for a valid crawl job", async () => { it("should return a successful crawl status response for a valid crawl job", async () => {
const crawlResponse = await request(TEST_URL) const crawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -246,27 +553,67 @@ describe("E2E Tests for API Routes", () => {
.send({ url: "https://firecrawl.dev" }); .send({ url: "https://firecrawl.dev" });
expect(crawlResponse.statusCode).toBe(200); expect(crawlResponse.statusCode).toBe(200);
const response = await request(TEST_URL) let isCompleted = false;
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) let completedResponse;
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("active");
// wait for 30 seconds while (!isCompleted) {
await new Promise((r) => setTimeout(r, 30000)); const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
const completedResponse = await request(TEST_URL) if (response.body.status === "completed") {
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) isCompleted = true;
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); completedResponse = response;
expect(completedResponse.statusCode).toBe(200); } else {
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
}
}
expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data"); expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
}, 60000); // 60 seconds
it('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
const crawlResponse = await request(TEST_URL)
.post('/v0/crawl')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }});
expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false;
let completedResponse;
while (!isCompleted) {
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('status');
if (response.body.status === 'completed') {
isCompleted = true;
completedResponse = response;
} else {
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
}
}
expect(completedResponse.body.status).toBe('completed');
expect(completedResponse.body).toHaveProperty('data');
expect(completedResponse.body.data.length).toEqual(1);
expect(completedResponse.body.data).toEqual(
expect.arrayContaining([
expect.objectContaining({
content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
})
])
);
}, 60000); // 60 seconds }, 60000); // 60 seconds
it("should return a successful response with max depth option for a valid crawl job", async () => { it("should return a successful response with max depth option for a valid crawl job", async () => {
@ -280,18 +627,21 @@ describe("E2E Tests for API Routes", () => {
}); });
expect(crawlResponse.statusCode).toBe(200); expect(crawlResponse.statusCode).toBe(200);
const response = await request(TEST_URL) let isCompleted = false;
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) let completedResponse;
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("active");
// wait for 60 seconds
await new Promise((r) => setTimeout(r, 60000));
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
while (!isCompleted) {
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
if (response.body.status === "completed") {
isCompleted = true;
completedResponse = response;
}
}
expect(completedResponse.statusCode).toBe(200); expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body.status).toBe("completed");
@ -347,8 +697,8 @@ describe("E2E Tests for API Routes", () => {
// 120 seconds // 120 seconds
expect(completedResponse.body.data[0]).toHaveProperty("html"); expect(completedResponse.body.data[0]).toHaveProperty("html");
expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("🔥 FireCrawl"); expect(completedResponse.body.data[0].content).toContain("🔥 Firecrawl");
expect(completedResponse.body.data[0].markdown).toContain("FireCrawl"); expect(completedResponse.body.data[0].markdown).toContain("Firecrawl");
expect(completedResponse.body.data[0].html).toContain("<h1"); expect(completedResponse.body.data[0].html).toContain("<h1");
}, 60000); }, 60000);
}); // 60 seconds }); // 60 seconds
@ -361,10 +711,8 @@ describe("E2E Tests for API Routes", () => {
.send({ url: "https://jestjs.io" }); .send({ url: "https://jestjs.io" });
expect(crawlResponse.statusCode).toBe(200); expect(crawlResponse.statusCode).toBe(200);
// wait for 30 seconds // wait for 30 seconds
await new Promise((r) => setTimeout(r, 10000)); await new Promise((r) => setTimeout(r, 20000));
const response = await request(TEST_URL) const response = await request(TEST_URL)
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
@ -373,7 +721,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body).toHaveProperty("status"); expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("cancelled"); expect(response.body.status).toBe("cancelled");
await new Promise((r) => setTimeout(r, 20000)); await new Promise((r) => setTimeout(r, 10000));
const completedResponse = await request(TEST_URL) const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
@ -390,8 +738,6 @@ describe("E2E Tests for API Routes", () => {
}, 60000); // 60 seconds }, 60000); // 60 seconds
describe("POST /v0/scrape with LLM Extraction", () => { describe("POST /v0/scrape with LLM Extraction", () => {
it("should extract data using LLM extraction mode", async () => { it("should extract data using LLM extraction mode", async () => {
const response = await request(TEST_URL) const response = await request(TEST_URL)
@ -501,6 +847,107 @@ describe("E2E Tests for API Routes", () => {
// }, 120000); // 120 secs // }, 120000); // 120 secs
// }); // });
describe("POST /v0/crawl with fast mode", () => {
  // Starts a crawl with `crawlerOptions.mode: "fast"`, polls the status
  // endpoint once per second until it reports "completed", and checks the
  // shape and count of the returned documents. The "under 20 seconds" budget
  // is enforced by the Jest timeout passed as the last argument.
  it("should complete the crawl under 20 seconds", async () => {
    const startedAt = Date.now();
    const authorization = `Bearer ${process.env.TEST_API_KEY}`;

    const crawlResponse = await request(TEST_URL)
      .post("/v0/crawl")
      .set("Authorization", authorization)
      .set("Content-Type", "application/json")
      .send({
        url: "https://flutterbricks.com",
        crawlerOptions: {
          mode: "fast"
        }
      });
    expect(crawlResponse.statusCode).toBe(200);

    const { jobId } = crawlResponse.body;
    let statusResponse;
    for (;;) {
      statusResponse = await request(TEST_URL)
        .get(`/v0/crawl/status/${jobId}`)
        .set("Authorization", authorization);
      expect(statusResponse.statusCode).toBe(200);

      if (statusResponse.body.status === "completed") {
        break;
      }
      // Wait for 1 second before checking again
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }

    // Elapsed wall-clock time is only logged, not asserted.
    const elapsedSeconds = (Date.now() - startedAt) / 1000;
    console.log(`Time elapsed: ${elapsedSeconds} seconds`);

    const { body } = statusResponse;
    expect(body.status).toBe("completed");
    expect(body).toHaveProperty("data");
    expect(body.data[0]).toHaveProperty("content");
    expect(body.data[0]).toHaveProperty("markdown");

    const pages = body.data;
    // pages.forEach((page, i) => {
    //   console.log(page.metadata.sourceURL);
    // });
    expect(pages.length).toBeGreaterThanOrEqual(10);
    expect(pages.length).toBeLessThanOrEqual(15);
  }, 20000);

  // Disabled counterpart that runs the same crawl without fast mode.
  // it("should complete the crawl in more than 10 seconds", async () => {
  //   const startTime = Date.now();
  //   const crawlResponse = await request(TEST_URL)
  //     .post("/v0/crawl")
  //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
  //     .set("Content-Type", "application/json")
  //     .send({
  //       url: "https://flutterbricks.com",
  //     });
  //   expect(crawlResponse.statusCode).toBe(200);
  //   const jobId = crawlResponse.body.jobId;
  //   let statusResponse;
  //   let isFinished = false;
  //   while (!isFinished) {
  //     statusResponse = await request(TEST_URL)
  //       .get(`/v0/crawl/status/${jobId}`)
  //       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
  //     expect(statusResponse.statusCode).toBe(200);
  //     isFinished = statusResponse.body.status === "completed";
  //     if (!isFinished) {
  //       await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
  //     }
  //   }
  //   const endTime = Date.now();
  //   const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
  //   console.log(`Time elapsed: ${timeElapsed} seconds`);
  //   expect(statusResponse.body.status).toBe("completed");
  //   expect(statusResponse.body).toHaveProperty("data");
  //   expect(statusResponse.body.data[0]).toHaveProperty("content");
  //   expect(statusResponse.body.data[0]).toHaveProperty("markdown");
  //   const results = statusResponse.body.data;
  //   // results.forEach((result, i) => {
  //   //   console.log(result.metadata.sourceURL);
  //   // });
  //   expect(results.length).toBeGreaterThanOrEqual(10);
  //   expect(results.length).toBeLessThanOrEqual(15);
  // }, 50000); // 50 seconds timeout to account for network delays
});
describe("GET /is-production", () => { describe("GET /is-production", () => {
it("should return the production status", async () => { it("should return the production status", async () => {
const response = await request(TEST_URL).get("/is-production"); const response = await request(TEST_URL).get("/is-production");
@ -508,4 +955,65 @@ describe("E2E Tests for API Routes", () => {
expect(response.body).toHaveProperty("isProduction"); expect(response.body).toHaveProperty("isProduction");
}); });
}); });
describe("Rate Limiter", () => {
  // Sends five scrape requests with the shared preview token, each expected
  // to succeed, then a sixth that is expected to be rejected with HTTP 429.
  it("should return 429 when rate limit is exceeded for preview token", async () => {
    const scrapeWithPreviewToken = () =>
      request(TEST_URL)
        .post("/v0/scrape")
        .set("Authorization", `Bearer this_is_just_a_preview_token`)
        .set("Content-Type", "application/json")
        .send({ url: "https://www.scrapethissite.com" });

    // Requests are issued one after another, never in parallel.
    for (let attempt = 1; attempt <= 5; attempt += 1) {
      const allowed = await scrapeWithPreviewToken();
      expect(allowed.statusCode).toBe(200);
    }

    const rejected = await scrapeWithPreviewToken();
    expect(rejected.statusCode).toBe(429);
  }, 60000);
});
// it("should return 429 when rate limit is exceeded for API key", async () => {
// for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_SCRAPE); i++) {
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
// expect(response.statusCode).toBe(200);
// }
// const response = await request(TEST_URL)
// .post("/v0/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
// expect(response.statusCode).toBe(429);
// }, 60000);
// it("should return 429 when rate limit is exceeded for API key", async () => {
// for (let i = 0; i < parseInt(process.env.RATE_LIMIT_TEST_API_KEY_CRAWL); i++) {
// const response = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
// expect(response.statusCode).toBe(200);
// }
// const response = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({ url: "https://www.scrapethissite.com" });
// expect(response.statusCode).toBe(429);
// }, 60000);
}); });

View File

@ -1,14 +1,25 @@
import { parseApi } from "../../src/lib/parseApi"; import { parseApi } from "../../src/lib/parseApi";
import { getRateLimiter } from "../../src/services/rate-limiter"; import { getRateLimiter, } from "../../src/services/rate-limiter";
import { AuthResponse, RateLimiterMode } from "../../src/types"; import { AuthResponse, RateLimiterMode } from "../../src/types";
import { supabase_service } from "../../src/services/supabase"; import { supabase_service } from "../../src/services/supabase";
import { withAuth } from "../../src/lib/withAuth"; import { withAuth } from "../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise<AuthResponse> { export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise<AuthResponse> {
return withAuth(supaAuthenticateUser)(req, res, mode); return withAuth(supaAuthenticateUser)(req, res, mode);
} }
/**
 * Forwards the caller's team id and API key to HyperDX's
 * `setTraceAttributes`.
 *
 * Best-effort: any error thrown by the call is logged to the console and
 * swallowed, so it never propagates to the caller.
 *
 * @param team_id Team that owns the API key.
 * @param api_key API key used for the request.
 */
function setTrace(team_id: string, api_key: string) {
  const attributes = { team_id, api_key };
  try {
    setTraceAttributes(attributes);
  } catch (err) {
    console.error('Error setting trace attributes:', err);
  }
}
export async function supaAuthenticateUser( export async function supaAuthenticateUser(
req, req,
res, res,
@ -19,7 +30,6 @@ export async function supaAuthenticateUser(
error?: string; error?: string;
status?: number; status?: number;
}> { }> {
const authHeader = req.headers.authorization; const authHeader = req.headers.authorization;
if (!authHeader) { if (!authHeader) {
return { success: false, error: "Unauthorized", status: 401 }; return { success: false, error: "Unauthorized", status: 401 };
@ -33,13 +43,87 @@ export async function supaAuthenticateUser(
}; };
} }
const incomingIP = (req.headers["x-forwarded-for"] ||
req.socket.remoteAddress) as string;
const iptoken = incomingIP + token;
let rateLimiter: RateLimiterRedis;
let subscriptionData: { team_id: string, plan: string } | null = null;
let normalizedApi: string;
if (token == "this_is_just_a_preview_token") {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
} else {
normalizedApi = parseApi(token);
const { data, error } = await supabase_service.rpc(
'get_key_and_price_id_2', { api_key: normalizedApi }
);
// get_key_and_price_id_2 rpc definition:
// create or replace function get_key_and_price_id_2(api_key uuid)
// returns table(key uuid, team_id uuid, price_id text) as $$
// begin
// if api_key is null then
// return query
// select null::uuid as key, null::uuid as team_id, null::text as price_id;
// end if;
// return query
// select ak.key, ak.team_id, s.price_id
// from api_keys ak
// left join subscriptions s on ak.team_id = s.team_id
// where ak.key = api_key;
// end;
// $$ language plpgsql;
if (error) {
console.error('Error fetching key and price_id:', error);
} else {
// console.log('Key and Price ID:', data);
}
if (error || !data || data.length === 0) {
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401,
};
}
const team_id = data[0].team_id;
const plan = getPlanByPriceId(data[0].price_id);
// HyperDX Logging
setTrace(team_id, normalizedApi);
subscriptionData = {
team_id: team_id,
plan: plan
}
switch (mode) {
case RateLimiterMode.Crawl:
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
break;
case RateLimiterMode.Scrape:
rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
break;
case RateLimiterMode.CrawlStatus:
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
break;
case RateLimiterMode.Search:
rateLimiter = getRateLimiter(RateLimiterMode.Search, token);
break;
case RateLimiterMode.Preview:
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
break;
default:
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token);
break;
// case RateLimiterMode.Search:
// rateLimiter = await searchRateLimiter(RateLimiterMode.Search, token);
// break;
}
}
try { try {
const incomingIP = (req.headers["x-forwarded-for"] || await rateLimiter.consume(iptoken);
req.socket.remoteAddress) as string;
const iptoken = incomingIP + token;
await getRateLimiter(
token === "this_is_just_a_preview_token" ? RateLimiterMode.Preview : mode, token
).consume(iptoken);
} catch (rateLimiterRes) { } catch (rateLimiterRes) {
console.error(rateLimiterRes); console.error(rateLimiterRes);
return { return {
@ -66,19 +150,36 @@ export async function supaAuthenticateUser(
// return { success: false, error: "Unauthorized: Invalid token", status: 401 }; // return { success: false, error: "Unauthorized: Invalid token", status: 401 };
} }
const normalizedApi = parseApi(token);
// make sure api key is valid, based on the api_keys table in supabase // make sure api key is valid, based on the api_keys table in supabase
const { data, error } = await supabase_service if (!subscriptionData) {
normalizedApi = parseApi(token);
const { data, error } = await supabase_service
.from("api_keys") .from("api_keys")
.select("*") .select("*")
.eq("key", normalizedApi); .eq("key", normalizedApi);
if (error || !data || data.length === 0) {
return { if (error || !data || data.length === 0) {
success: false, return {
error: "Unauthorized: Invalid token", success: false,
status: 401, error: "Unauthorized: Invalid token",
}; status: 401,
};
}
subscriptionData = data[0];
} }
return { success: true, team_id: data[0].team_id }; return { success: true, team_id: subscriptionData.team_id };
}
/**
 * Maps a Stripe price id to a plan name.
 *
 * The price id is compared against the `STRIPE_PRICE_ID_STANDARD` and
 * `STRIPE_PRICE_ID_SCALE` environment variables; anything else falls back to
 * the default plan.
 *
 * @param price_id Stripe price id of the team's subscription; may be missing
 *   at runtime when the team has no subscription row.
 * @returns `'standard'`, `'scale'`, or `'starter'` (the default).
 */
function getPlanByPriceId(price_id: string) {
  // A missing or empty price id must never match an unset environment
  // variable (`undefined === undefined`), which would otherwise report a paid
  // plan for a team without a subscription.
  if (!price_id) {
    return 'starter';
  }

  switch (price_id) {
    case process.env.STRIPE_PRICE_ID_STANDARD:
      return 'standard';
    case process.env.STRIPE_PRICE_ID_SCALE:
      return 'scale';
    default:
      return 'starter';
  }
}

View File

@ -15,6 +15,7 @@ export async function scrapeHelper(
crawlerOptions: any, crawlerOptions: any,
pageOptions: PageOptions, pageOptions: PageOptions,
extractorOptions: ExtractorOptions, extractorOptions: ExtractorOptions,
timeout: number
): Promise<{ ): Promise<{
success: boolean; success: boolean;
error?: string; error?: string;
@ -30,7 +31,6 @@ export async function scrapeHelper(
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
} }
const a = new WebScraperDataProvider(); const a = new WebScraperDataProvider();
await a.setOptions({ await a.setOptions({
mode: "single_urls", mode: "single_urls",
@ -42,7 +42,19 @@ export async function scrapeHelper(
extractorOptions: extractorOptions, extractorOptions: extractorOptions,
}); });
const docs = await a.getDocuments(false); const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
);
const docsPromise = a.getDocuments(false);
let docs;
try {
docs = await Promise.race([docsPromise, timeoutPromise]);
} catch (error) {
return error;
}
// make sure doc.content is not empty // make sure doc.content is not empty
const filteredDocs = docs.filter( const filteredDocs = docs.filter(
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0 (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
@ -51,12 +63,11 @@ export async function scrapeHelper(
return { success: true, error: "No page found", returnCode: 200 }; return { success: true, error: "No page found", returnCode: 200 };
} }
let creditsToBeBilled = filteredDocs.length;
let creditsToBeBilled = filteredDocs.length;
const creditsPerLLMExtract = 5; const creditsPerLLMExtract = 5;
if (extractorOptions.mode === "llm-extraction"){ if (extractorOptions.mode === "llm-extraction") {
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length) creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
} }
const billingResult = await billTeam( const billingResult = await billTeam(
@ -95,7 +106,11 @@ export async function scrapeController(req: Request, res: Response) {
const extractorOptions = req.body.extractorOptions ?? { const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown" mode: "markdown"
} }
if (extractorOptions.mode === "llm-extraction") {
pageOptions.onlyMainContent = true;
}
const origin = req.body.origin ?? "api"; const origin = req.body.origin ?? "api";
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
try { try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } = const { success: creditsCheckSuccess, message: creditsCheckMessage } =
@ -114,6 +129,7 @@ export async function scrapeController(req: Request, res: Response) {
crawlerOptions, crawlerOptions,
pageOptions, pageOptions,
extractorOptions, extractorOptions,
timeout
); );
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;

View File

@ -5,6 +5,8 @@ import "dotenv/config";
import { getWebScraperQueue } from "./services/queue-service"; import { getWebScraperQueue } from "./services/queue-service";
import { redisClient } from "./services/rate-limiter"; import { redisClient } from "./services/rate-limiter";
import { v0Router } from "./routes/v0"; import { v0Router } from "./routes/v0";
import { initSDK } from '@hyperdx/node-opentelemetry';
const { createBullBoard } = require("@bull-board/api"); const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express"); const { ExpressAdapter } = require("@bull-board/express");
@ -47,6 +49,11 @@ const DEFAULT_PORT = process.env.PORT ?? 3002;
const HOST = process.env.HOST ?? "localhost"; const HOST = process.env.HOST ?? "localhost";
redisClient.connect(); redisClient.connect();
// HyperDX OpenTelemetry
// The SDK is initialised only when ENV is exactly "production"; in any other
// environment no telemetry setup happens here. It is started with
// `consoleCapture` enabled and an empty list of additional instrumentations.
// NOTE(review): `consoleCapture` presumably forwards console output to
// HyperDX; confirm against the SDK documentation.
if(process.env.ENV === 'production') {
  initSDK({ consoleCapture: true, additionalInstrumentations: []});
}
export function startServer(port = DEFAULT_PORT) { export function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => { const server = app.listen(Number(port), HOST, () => {

View File

@ -1,30 +1,43 @@
import OpenAI from "openai"; import OpenAI from "openai";
import { Document } from "../../lib/entities"; import { Document } from "../../lib/entities";
import { numTokensFromString } from "./helpers";
export type ScraperCompletionResult = { export type ScraperCompletionResult = {
data: any | null; data: any | null;
url: string; url: string;
}; };
const maxTokens = 32000;
const modifier = 4;
const defaultPrompt = const defaultPrompt =
"You are a professional web scraper. Extract the contents of the webpage"; "You are a professional web scraper. Extract the contents of the webpage";
function prepareOpenAIDoc( function prepareOpenAIDoc(
document: Document document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] { ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
// Check if the markdown content exists in the document let markdown = document.markdown;
if (!document.markdown) {
// Check if the markdown content exists in the document
if (!markdown) {
throw new Error( throw new Error(
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai" "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai"
); );
} }
return [{ type: "text", text: document.markdown }]; // count number of tokens
const numTokens = numTokensFromString(document.markdown, "gpt-4");
if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters
markdown = markdown.slice(0, (maxTokens * modifier));
}
return [[{ type: "text", text: markdown }], numTokens];
} }
export async function generateOpenAICompletions({ export async function generateOpenAICompletions({
client, client,
model = "gpt-4-turbo", model = "gpt-4o",
document, document,
schema, //TODO - add zod dynamic type checking schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt, prompt = defaultPrompt,
@ -38,7 +51,7 @@ export async function generateOpenAICompletions({
temperature?: number; temperature?: number;
}): Promise<Document> { }): Promise<Document> {
const openai = client as OpenAI; const openai = client as OpenAI;
const content = prepareOpenAIDoc(document); const [content, numTokens] = prepareOpenAIDoc(document);
const completion = await openai.chat.completions.create({ const completion = await openai.chat.completions.create({
model, model,
@ -72,6 +85,7 @@ export async function generateOpenAICompletions({
return { return {
...document, ...document,
llm_extraction: llmExtraction, llm_extraction: llmExtraction,
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
}; };
} }

View File

@ -44,6 +44,7 @@ export type WebScraperOptions = {
limit?: number; limit?: number;
generateImgAltText?: boolean; generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean; replaceAllPathsWithAbsolutePaths?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
}; };
pageOptions?: PageOptions; pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions; extractorOptions?: ExtractorOptions;
@ -71,6 +72,7 @@ export class Document {
}; };
childrenLinks?: string[]; childrenLinks?: string[];
provider?: string; provider?: string;
warning?: string;
constructor(data: Partial<Document>) { constructor(data: Partial<Document>) {
if (!data.content) { if (!data.content) {

View File

@ -0,0 +1,42 @@
import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
// Returns a promise that resolves after `ms` milliseconds.
const delay = (ms: number) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
/**
 * Scrapes `urls` with `scrapWithFireEngine` in consecutive batches and logs
 * how many scrapes succeeded and how many failed.
 *
 * All URLs of one batch are scraped concurrently; the function then waits
 * `delayMs` before starting the next batch. A scrape counts as errored when
 * it returns only whitespace or when its batch rejects.
 *
 * @param urls URLs to scrape.
 * @param batchSize Number of URLs scraped concurrently; must be positive.
 * @param delayMs Pause after each batch, in milliseconds.
 * @throws RangeError when `batchSize` is not a positive number.
 */
const scrapInBatches = async (
  urls: string[],
  batchSize: number,
  delayMs: number
) => {
  // With a zero or negative step the loop index would never reach
  // `urls.length`, so the loop would never terminate.
  if (!(batchSize > 0)) {
    throw new RangeError("batchSize must be a positive number");
  }

  let successCount = 0;
  let errorCount = 0;

  for (let i = 0; i < urls.length; i += batchSize) {
    const batch = urls
      .slice(i, i + batchSize)
      .map((url) => scrapWithFireEngine(url));

    try {
      const results = await Promise.all(batch);
      results.forEach((data, index) => {
        const text = data.trim();
        if (text === "") {
          errorCount++;
        } else {
          successCount++;
          console.log(
            `Scraping result ${i + index + 1}:`,
            text.substring(0, 20) + "..."
          );
        }
      });
    } catch (error) {
      // Promise.all rejects as soon as one scrape rejects, so none of this
      // batch's results are available. Count the whole batch as errored so
      // the two totals still add up to urls.length.
      errorCount += batch.length;
      console.error("Error during scraping:", error);
    }

    await delay(delayMs);
  }

  console.log(`Total successful scrapes: ${successCount}`);
  console.log(`Total errored scrapes: ${errorCount}`);
};
/**
 * Scrapes the same test URL 200 times, in batches of 10 with a one-second
 * pause after each batch. The returned promise is not awaited, so this
 * function returns before scraping finishes.
 */
function run() {
  const targetUrl = "https://scrapethissite.com";
  const urls: string[] = new Array(200).fill(targetUrl);
  scrapInBatches(urls, 10, 1000);
}

View File

@ -17,8 +17,10 @@ export async function startWebScraperPipeline({
crawlerOptions: job.data.crawlerOptions, crawlerOptions: job.data.crawlerOptions,
pageOptions: job.data.pageOptions, pageOptions: job.data.pageOptions,
inProgress: (progress) => { inProgress: (progress) => {
partialDocs.push(progress.currentDocument); if (progress.currentDocument) {
job.progress({...progress, partialDocs: partialDocs}); partialDocs.push(progress.currentDocument);
job.progress({ ...progress, partialDocs: partialDocs });
}
}, },
onSuccess: (result) => { onSuccess: (result) => {
job.moveToCompleted(result); job.moveToCompleted(result);
@ -27,7 +29,7 @@ export async function startWebScraperPipeline({
job.moveToFailed(error); job.moveToFailed(error);
}, },
team_id: job.data.team_id, team_id: job.data.team_id,
bull_job_id: job.id.toString() bull_job_id: job.id.toString(),
})) as { success: boolean; message: string; docs: Document[] }; })) as { success: boolean; message: string; docs: Document[] };
} }
export async function runWebScraper({ export async function runWebScraper({
@ -63,26 +65,25 @@ export async function runWebScraper({
urls: [url], urls: [url],
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
pageOptions: pageOptions, pageOptions: pageOptions,
bullJobId: bull_job_id bullJobId: bull_job_id,
}); });
} else { } else {
await provider.setOptions({ await provider.setOptions({
mode: mode, mode: mode,
urls: url.split(","), urls: url.split(","),
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
pageOptions: pageOptions pageOptions: pageOptions,
}); });
} }
const docs = (await provider.getDocuments(false, (progress: Progress) => { const docs = (await provider.getDocuments(false, (progress: Progress) => {
inProgress(progress); inProgress(progress);
})) as Document[]; })) as Document[];
if (docs.length === 0) { if (docs.length === 0) {
return { return {
success: true, success: true,
message: "No pages found", message: "No pages found",
docs: [] docs: [],
}; };
} }
@ -95,18 +96,14 @@ export async function runWebScraper({
}) })
: docs.filter((doc) => doc.content.trim().length > 0); : docs.filter((doc) => doc.content.trim().length > 0);
const billingResult = await billTeam(team_id, filteredDocs.length);
const billingResult = await billTeam(
team_id,
filteredDocs.length
);
if (!billingResult.success) { if (!billingResult.success) {
// throw new Error("Failed to bill team, no subscription was found"); // throw new Error("Failed to bill team, no subscription was found");
return { return {
success: false, success: false,
message: "Failed to bill team, no subscription was found", message: "Failed to bill team, no subscription was found",
docs: [] docs: [],
}; };
} }

View File

@ -4,7 +4,7 @@ import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap"; import { getLinksFromSitemap } from "./sitemap";
import async from "async"; import async from "async";
import { Progress } from "../../lib/entities"; import { Progress } from "../../lib/entities";
import { scrapWithScrapingBee } from "./single_url"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
import robotsParser from "robots-parser"; import robotsParser from "robots-parser";
export class WebCrawler { export class WebCrawler {
@ -15,7 +15,7 @@ export class WebCrawler {
private maxCrawledLinks: number; private maxCrawledLinks: number;
private maxCrawledDepth: number; private maxCrawledDepth: number;
private visited: Set<string> = new Set(); private visited: Set<string> = new Set();
private crawledUrls: Set<string> = new Set(); private crawledUrls: Map<string, string> = new Map();
private limit: number; private limit: number;
private robotsTxtUrl: string; private robotsTxtUrl: string;
private robots: any; private robots: any;
@ -51,7 +51,6 @@ export class WebCrawler {
this.generateImgAltText = generateImgAltText ?? false; this.generateImgAltText = generateImgAltText ?? false;
} }
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
return sitemapLinks return sitemapLinks
.filter((link) => { .filter((link) => {
@ -77,9 +76,22 @@ export class WebCrawler {
// Check if the link matches the include patterns, if any are specified // Check if the link matches the include patterns, if any are specified
if (this.includes.length > 0 && this.includes[0] !== "") { if (this.includes.length > 0 && this.includes[0] !== "") {
return this.includes.some((includePattern) => if (!this.includes.some((includePattern) =>
new RegExp(includePattern).test(path) new RegExp(includePattern).test(path)
); )) {
return false;
}
}
// Normalize the initial URL and the link to account for www and non-www versions
const normalizedInitialUrl = new URL(this.initialUrl);
const normalizedLink = new URL(link);
const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
return false;
} }
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
@ -99,19 +111,21 @@ export class WebCrawler {
concurrencyLimit: number = 5, concurrencyLimit: number = 5,
limit: number = 10000, limit: number = 10000,
maxDepth: number = 10 maxDepth: number = 10
): Promise<string[]> { ): Promise<{ url: string, html: string }[]> {
// Fetch and parse robots.txt // Fetch and parse robots.txt
try { try {
const response = await axios.get(this.robotsTxtUrl); const response = await axios.get(this.robotsTxtUrl);
this.robots = robotsParser(this.robotsTxtUrl, response.data); this.robots = robotsParser(this.robotsTxtUrl, response.data);
} catch (error) { } catch (error) {
console.error(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
} }
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks; return filteredLinks.map(link => ({ url: link, html: "" }));
} }
const urls = await this.crawlUrls( const urls = await this.crawlUrls(
@ -123,18 +137,20 @@ export class WebCrawler {
urls.length === 0 && urls.length === 0 &&
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
) { ) {
return [this.initialUrl]; return [{ url: this.initialUrl, html: "" }];
} }
// make sure to run include exclude here again // make sure to run include exclude here again
return this.filterLinks(urls, limit, this.maxCrawledDepth); const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
} }
private async crawlUrls( private async crawlUrls(
urls: string[], urls: string[],
concurrencyLimit: number, concurrencyLimit: number,
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
): Promise<string[]> { ): Promise<{ url: string, html: string }[]> {
const queue = async.queue(async (task: string, callback) => { const queue = async.queue(async (task: string, callback) => {
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) { if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
@ -143,13 +159,26 @@ export class WebCrawler {
return; return;
} }
const newUrls = await this.crawl(task); const newUrls = await this.crawl(task);
newUrls.forEach((url) => this.crawledUrls.add(url)); // add the initial url if not already added
// if (this.visited.size === 1) {
// let normalizedInitial = this.initialUrl;
// if (!normalizedInitial.endsWith("/")) {
// normalizedInitial = normalizedInitial + "/";
// }
// if (!newUrls.some(page => page.url === this.initialUrl)) {
// newUrls.push({ url: this.initialUrl, html: "" });
// }
// }
newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
if (inProgress && newUrls.length > 0) { if (inProgress && newUrls.length > 0) {
inProgress({ inProgress({
current: this.crawledUrls.size, current: this.crawledUrls.size,
total: Math.min(this.maxCrawledLinks, this.limit), total: Math.min(this.maxCrawledLinks, this.limit),
status: "SCRAPING", status: "SCRAPING",
currentDocumentUrl: newUrls[newUrls.length - 1], currentDocumentUrl: newUrls[newUrls.length - 1].url,
}); });
} else if (inProgress) { } else if (inProgress) {
inProgress({ inProgress({
@ -159,7 +188,7 @@ export class WebCrawler {
currentDocumentUrl: task, currentDocumentUrl: task,
}); });
} }
await this.crawlUrls(newUrls, concurrencyLimit, inProgress); await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
callback(); callback();
} }
@ -175,34 +204,48 @@ export class WebCrawler {
} }
); );
await queue.drain(); await queue.drain();
return Array.from(this.crawledUrls); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
} }
async crawl(url: string): Promise<string[]> { async crawl(url: string): Promise<{url: string, html: string}[]> {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
return []; return [];
}
this.visited.add(url); this.visited.add(url);
if (!url.startsWith("http")) { if (!url.startsWith("http")) {
url = "https://" + url; url = "https://" + url;
} }
if (url.endsWith("/")) { if (url.endsWith("/")) {
url = url.slice(0, -1); url = url.slice(0, -1);
} }
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
return []; return [];
} }
try { try {
let content; let content : string = "";
// If it is the first link, fetch with scrapingbee // If it is the first link, fetch with single url
if (this.visited.size === 1) { if (this.visited.size === 1) {
content = await scrapWithScrapingBee(url, "load"); const page = await scrapSingleUrl(url, {includeHtml: true});
content = page.html ?? ""
} else { } else {
const response = await axios.get(url); const response = await axios.get(url);
content = response.data; content = response.data ?? "";
} }
const $ = load(content); const $ = load(content);
let links: string[] = []; let links: {url: string, html: string}[] = [];
// Add the initial URL to the list of links
if(this.visited.size === 1)
{
links.push({url, html: content});
}
$("a").each((_, element) => { $("a").each((_, element) => {
const href = $(element).attr("href"); const href = $(element).attr("href");
@ -215,7 +258,6 @@ export class WebCrawler {
const path = url.pathname; const path = url.pathname;
if ( if (
// fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url
this.isInternalLink(fullUrl) && this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) && this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) && this.noSections(fullUrl) &&
@ -223,12 +265,16 @@ export class WebCrawler {
!this.matchesExcludes(path) && !this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent") this.robots.isAllowed(fullUrl, "FireCrawlAgent")
) { ) {
links.push(fullUrl); links.push({url: fullUrl, html: content});
} }
} }
}); });
return links.filter((link) => !this.visited.has(link)); if(this.visited.size === 1){
return links;
}
// Create a new list to return to avoid modifying the visited list
return links.filter((link) => !this.visited.has(link.url));
} catch (error) { } catch (error) {
return []; return [];
} }
@ -275,9 +321,15 @@ export class WebCrawler {
".mp4", ".mp4",
".mp3", ".mp3",
".pptx", ".pptx",
".docx", // ".docx",
".xlsx", ".xlsx",
".xml", ".xml",
".avi",
".flv",
".woff",
".ttf",
".woff2",
".webp"
]; ];
return fileExtensions.some((ext) => url.endsWith(ext)); return fileExtensions.some((ext) => url.endsWith(ext));
} }
@ -294,18 +346,57 @@ export class WebCrawler {
return socialMediaOrEmail.some((ext) => url.includes(ext)); return socialMediaOrEmail.some((ext) => url.includes(ext));
} }
//
private async tryFetchSitemapLinks(url: string): Promise<string[]> { private async tryFetchSitemapLinks(url: string): Promise<string[]> {
const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
return url;
};
const sitemapUrl = url.endsWith("/sitemap.xml") const sitemapUrl = url.endsWith("/sitemap.xml")
? url ? url
: `${url}/sitemap.xml`; : `${url}/sitemap.xml`;
let sitemapLinks: string[] = [];
try { try {
const response = await axios.get(sitemapUrl); const response = await axios.get(sitemapUrl);
if (response.status === 200) { if (response.status === 200) {
return await getLinksFromSitemap(sitemapUrl); sitemapLinks = await getLinksFromSitemap(sitemapUrl);
} }
} catch (error) { } catch (error) {
// Error handling for failed sitemap fetch // Error handling for failed sitemap fetch
// console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
} }
return [];
if (sitemapLinks.length === 0) {
// If the first one doesn't work, try the base URL
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try {
const response = await axios.get(baseUrlSitemap);
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
}
} catch (error) {
// Error handling for failed base URL sitemap fetch
// console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
}
}
// Normalize and check if the URL is present in any of the sitemaps
const normalizedUrl = normalizeUrl(url);
const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
// do not push the normalized url
sitemapLinks.push(url);
}
return sitemapLinks;
} }
} }

View File

@ -17,6 +17,7 @@ import {
} from "./utils/replacePaths"; } from "./utils/replacePaths";
import { generateCompletions } from "../../lib/LLM-extraction"; import { generateCompletions } from "../../lib/LLM-extraction";
import { getWebScraperQueue } from "../../../src/services/queue-service"; import { getWebScraperQueue } from "../../../src/services/queue-service";
import { fetchAndProcessDocx } from "./utils/docxProcessor";
export class WebScraperDataProvider { export class WebScraperDataProvider {
private bullJobId: string; private bullJobId: string;
@ -35,6 +36,7 @@ export class WebScraperDataProvider {
private replaceAllPathsWithAbsolutePaths?: boolean = false; private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
"gpt-4-turbo"; "gpt-4-turbo";
private crawlerMode: string = "default";
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -46,7 +48,8 @@ export class WebScraperDataProvider {
private async convertUrlsToDocuments( private async convertUrlsToDocuments(
urls: string[], urls: string[],
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
allHtmls?: string[]
): Promise<Document[]> { ): Promise<Document[]> {
const totalUrls = urls.length; const totalUrls = urls.length;
let processedUrls = 0; let processedUrls = 0;
@ -56,7 +59,12 @@ export class WebScraperDataProvider {
const batchUrls = urls.slice(i, i + this.concurrentRequests); const batchUrls = urls.slice(i, i + this.concurrentRequests);
await Promise.all( await Promise.all(
batchUrls.map(async (url, index) => { batchUrls.map(async (url, index) => {
const result = await scrapSingleUrl(url, this.pageOptions); const existingHTML = allHtmls ? allHtmls[i + index] : "";
const result = await scrapSingleUrl(
url,
this.pageOptions,
existingHTML
);
processedUrls++; processedUrls++;
if (inProgress) { if (inProgress) {
inProgress({ inProgress({
@ -127,9 +135,30 @@ export class WebScraperDataProvider {
} }
} }
/**
 * Filters `links` down to those on the same site as the first configured URL
 * (`this.urls[0]`) and located under its path.
 *
 * Hostnames are compared with a leading `www.` stripped from both sides; the
 * protocol is NOT compared. A link is kept only when its pathname starts with
 * the initial URL's pathname.
 *
 * Links that cannot be parsed as absolute URLs are dropped instead of
 * aborting the whole filter.
 *
 * @param links Candidate URLs to filter.
 * @returns The links that pass both checks, in their original order.
 * @throws TypeError when `links` is non-empty and `this.urls[0]` is not a valid URL.
 */
private async cleanIrrelevantPath(links: string[]) {
  if (links.length === 0) {
    return [] as string[];
  }

  // Parse the initial URL once; it is the same for every link.
  const normalizedInitialUrl = new URL(this.urls[0]);
  // Normalize the hostname to account for www and non-www versions
  const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, "");
  const initialPathname = normalizedInitialUrl.pathname;

  return links.filter((link) => {
    let normalizedLink: URL;
    try {
      normalizedLink = new URL(link);
    } catch {
      // Malformed or relative entry: it cannot belong to the target path.
      return false;
    }

    const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
    return (
      linkHostname === initialHostname &&
      normalizedLink.pathname.startsWith(initialPathname)
    );
  });
}
private async handleCrawlMode( private async handleCrawlMode(
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
const crawler = new WebCrawler({ const crawler = new WebCrawler({
initialUrl: this.urls[0], initialUrl: this.urls[0],
includes: this.includes, includes: this.includes,
@ -139,19 +168,38 @@ export class WebScraperDataProvider {
limit: this.limit, limit: this.limit,
generateImgAltText: this.generateImgAltText, generateImgAltText: this.generateImgAltText,
}); });
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
let links = await crawler.start(
inProgress,
5,
this.limit,
this.maxCrawledDepth
);
let allLinks = links.map((e) => e.url);
const allHtmls = links.map((e) => e.html);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
return this.returnOnlyUrlsResponse(links, inProgress); return this.returnOnlyUrlsResponse(allLinks, inProgress);
} }
let documents = await this.processLinks(links, inProgress); let documents = [];
return this.cacheAndFinalizeDocuments(documents, links); // check if fast mode is enabled and there is html inside the links
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
documents = await this.processLinks(allLinks, inProgress, allHtmls);
} else {
documents = await this.processLinks(allLinks, inProgress);
}
return this.cacheAndFinalizeDocuments(documents, allLinks);
} }
private async handleSingleUrlsMode( private async handleSingleUrlsMode(
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
let documents = await this.processLinks(this.urls, inProgress); const links = this.urls;
let documents = await this.processLinks(links, inProgress);
return documents; return documents;
} }
@ -159,6 +207,8 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
let links = await getLinksFromSitemap(this.urls[0]); let links = await getLinksFromSitemap(this.urls[0]);
links = await this.cleanIrrelevantPath(links);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
return this.returnOnlyUrlsResponse(links, inProgress); return this.returnOnlyUrlsResponse(links, inProgress);
} }
@ -187,16 +237,26 @@ export class WebScraperDataProvider {
private async processLinks( private async processLinks(
links: string[], links: string[],
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
allHtmls?: string[]
): Promise<Document[]> { ): Promise<Document[]> {
let pdfLinks = links.filter((link) => link.endsWith(".pdf")); const pdfLinks = links.filter(link => link.endsWith(".pdf"));
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
links = links.filter((link) => !link.endsWith(".pdf"));
let documents = await this.convertUrlsToDocuments(links, inProgress); const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
const docxDocuments = await this.fetchDocxDocuments(docLinks);
links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
let documents = await this.convertUrlsToDocuments(
links,
inProgress,
allHtmls
);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
documents = this.applyPathReplacements(documents); documents = this.applyPathReplacements(documents);
documents = await this.applyImgAltText(documents); // documents = await this.applyImgAltText(documents);
if ( if (
this.extractorOptions.mode === "llm-extraction" && this.extractorOptions.mode === "llm-extraction" &&
@ -204,7 +264,7 @@ export class WebScraperDataProvider {
) { ) {
documents = await generateCompletions(documents, this.extractorOptions); documents = await generateCompletions(documents, this.extractorOptions);
} }
return documents.concat(pdfDocuments); return documents.concat(pdfDocuments).concat(docxDocuments);
} }
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> { private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
@ -219,6 +279,18 @@ export class WebScraperDataProvider {
}) })
); );
} }
/**
 * Builds one Document per DOCX link.
 *
 * Every link is handed to `fetchAndProcessDocx`, and all of them are started
 * together through `Promise.all`, so a single failing link rejects the whole
 * call.
 *
 * @param docxLinks URLs of the DOCX files to fetch.
 * @returns Documents whose `content` is the extracted text and whose
 *          `metadata.sourceURL` is the originating link.
 */
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
  return Promise.all(
    docxLinks.map(async (docxUrl) => ({
      content: await fetchAndProcessDocx(docxUrl),
      metadata: { sourceURL: docxUrl },
      provider: "web-scraper",
    }))
  );
}
private applyPathReplacements(documents: Document[]): Document[] { private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths return this.replaceAllPathsWithAbsolutePaths
@ -395,8 +467,9 @@ export class WebScraperDataProvider {
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== ""); this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {

View File

@ -6,9 +6,19 @@ import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown"; import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags"; import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params"; import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
dotenv.config(); dotenv.config();
const baseScrapers = [
"fire-engine",
"scrapingBee",
"playwright",
"scrapingBeeLoad",
"fetch",
] as const;
export async function generateRequestParams( export async function generateRequestParams(
url: string, url: string,
wait_browser: string = "domcontentloaded", wait_browser: string = "domcontentloaded",
@ -32,15 +42,39 @@ export async function generateRequestParams(
return defaultParams; return defaultParams;
} }
} }
export async function scrapWithCustomFirecrawl( export async function scrapWithFireEngine(
url: string, url: string,
options?: any options?: any
): Promise<string> { ): Promise<string> {
try { try {
// TODO: merge the custom firecrawl scraper into mono-repo when ready const reqParams = await generateRequestParams(url);
return null; const wait_playwright = reqParams["params"]?.wait ?? 0;
const response = await fetch(process.env.FIRE_ENGINE_BETA_URL+ "/scrape", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ url: url, wait: wait_playwright }),
});
if (!response.ok) {
console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
);
return "";
}
const contentType = response.headers['content-type'];
if (contentType && contentType.includes('application/pdf')) {
return fetchAndProcessPdf(url);
} else {
const data = await response.json();
const html = data.content;
return html ?? "";
}
} catch (error) { } catch (error) {
console.error(`Error scraping with custom firecrawl-scraper: ${error}`); console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
return ""; return "";
} }
} }
@ -62,15 +96,21 @@ export async function scrapWithScrapingBee(
if (response.status !== 200 && response.status !== 404) { if (response.status !== 200 && response.status !== 404) {
console.error( console.error(
`Scraping bee error in ${url} with status code ${response.status}` `[ScrapingBee] Error fetching url: ${url} with status code ${response.status}`
); );
return ""; return "";
} }
const decoder = new TextDecoder();
const text = decoder.decode(response.data); const contentType = response.headers['content-type'];
return text; if (contentType && contentType.includes('application/pdf')) {
return fetchAndProcessPdf(url);
} else {
const decoder = new TextDecoder();
const text = decoder.decode(response.data);
return text;
}
} catch (error) { } catch (error) {
console.error(`Error scraping with Scraping Bee: ${error}`); console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
return ""; return "";
} }
} }
@ -90,23 +130,80 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
if (!response.ok) { if (!response.ok) {
console.error( console.error(
`Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}` `[Playwright] Error fetching url: ${url} with status: ${response.status}`
); );
return ""; return "";
} }
const data = await response.json(); const contentType = response.headers['content-type'];
const html = data.content; if (contentType && contentType.includes('application/pdf')) {
return html ?? ""; return fetchAndProcessPdf(url);
} else {
const data = await response.json();
const html = data.content;
return html ?? "";
}
} catch (error) { } catch (error) {
console.error(`Error scraping with Puppeteer: ${error}`); console.error(`[Playwright][c] Error fetching url: ${url} -> ${error}`);
return ""; return "";
} }
} }
/**
 * Scrapes a URL with a plain `fetch` request.
 *
 * When the response declares a PDF content type, the URL is handed to
 * `fetchAndProcessPdf` and its result is returned; otherwise the response
 * body is returned as text.
 *
 * @param url The URL to fetch.
 * @returns The page text (or the processed PDF text), or `""` when the
 *          request fails, the status is not OK, or processing throws.
 */
export async function scrapWithFetch(url: string): Promise<string> {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      console.error(
        `[Fetch] Error fetching url: ${url} with status: ${response.status}`
      );
      return "";
    }

    // fetch() exposes headers as a Headers object: bracket access is always
    // undefined, so the value has to be read through get().
    const contentType = response.headers.get("content-type");
    if (contentType && contentType.toLowerCase().includes("application/pdf")) {
      // Awaited so that a PDF processing failure is handled by the catch below.
      return await fetchAndProcessPdf(url);
    }

    return await response.text();
  } catch (error) {
    console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`);
    return "";
  }
}
/**
 * Get the order of scrapers to be used for scraping a URL.
 *
 * A scraper whose environment variable is not set is removed from the order:
 * - "scrapingBee" / "scrapingBeeLoad" need SCRAPING_BEE_API_KEY
 * - "fire-engine" needs FIRE_ENGINE_BETA_URL
 * - "playwright" needs PLAYWRIGHT_MICROSERVICE_URL
 * - any other scraper (currently "fetch") is always available
 *
 * The same rule applies to `defaultScraper`: it is placed first only when it
 * is a known scraper that is currently available. Otherwise it is ignored, so
 * the returned list never contains an entry that cannot run.
 *
 * @param defaultScraper The scraper to try first for this URL, if any
 * @returns The available scrapers, de-duplicated, in the order they should be tried
 */
function getScrapingFallbackOrder(defaultScraper?: string) {
  const availableScrapers = baseScrapers.filter((scraper) => {
    switch (scraper) {
      case "scrapingBee":
      case "scrapingBeeLoad":
        return !!process.env.SCRAPING_BEE_API_KEY;
      case "fire-engine":
        return !!process.env.FIRE_ENGINE_BETA_URL;
      case "playwright":
        return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
      default:
        return true;
    }
  });
  const available = new Set<string>(availableScrapers);

  const defaultOrder = ["scrapingBee", "fire-engine", "playwright", "scrapingBeeLoad", "fetch"];

  // A Set keeps first-insertion order, which gives de-duplication for free.
  const ordered = new Set<string>();
  if (defaultScraper && available.has(defaultScraper)) {
    ordered.add(defaultScraper);
  }
  for (const scraper of defaultOrder) {
    if (available.has(scraper)) {
      ordered.add(scraper);
    }
  }
  // Safety net: any available scraper missing from defaultOrder goes last.
  for (const scraper of availableScrapers) {
    ordered.add(scraper);
  }

  return Array.from(ordered) as typeof baseScrapers[number][];
}
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false } pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@ -124,17 +221,14 @@ export async function scrapSingleUrl(
const attemptScraping = async ( const attemptScraping = async (
url: string, url: string,
method: method: typeof baseScrapers[number]
| "firecrawl-scraper"
| "scrapingBee"
| "playwright"
| "scrapingBeeLoad"
| "fetch"
) => { ) => {
let text = ""; let text = "";
switch (method) { switch (method) {
case "firecrawl-scraper": case "fire-engine":
text = await scrapWithCustomFirecrawl(url); if (process.env.FIRE_ENGINE_BETA_URL) {
text = await scrapWithFireEngine(url);
}
break; break;
case "scrapingBee": case "scrapingBee":
if (process.env.SCRAPING_BEE_API_KEY) { if (process.env.SCRAPING_BEE_API_KEY) {
@ -156,19 +250,7 @@ export async function scrapSingleUrl(
} }
break; break;
case "fetch": case "fetch":
try { text = await scrapWithFetch(url);
const response = await fetch(url);
if (!response.ok) {
console.error(
`Error fetching URL: ${url} with status: ${response.status}`
);
return "";
}
text = await response.text();
} catch (error) {
console.error(`Error scraping URL: ${error}`);
return "";
}
break; break;
} }
@ -186,20 +268,22 @@ export async function scrapSingleUrl(
console.error(`Invalid URL key, trying: ${urlToScrap}`); console.error(`Invalid URL key, trying: ${urlToScrap}`);
} }
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? ""; const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
const scrapersInOrder = defaultScraper const scrapersInOrder = getScrapingFallbackOrder(defaultScraper)
? [
defaultScraper,
"scrapingBee",
"playwright",
"scrapingBeeLoad",
"fetch",
]
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
for (const scraper of scrapersInOrder) { for (const scraper of scrapersInOrder) {
// If exists text coming from crawler, use it
if (existingHtml && existingHtml.trim().length >= 100) {
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
text = await parseMarkdown(cleanedHtml);
html = existingHtml;
break;
}
[text, html] = await attemptScraping(urlToScrap, scraper); [text, html] = await attemptScraping(urlToScrap, scraper);
if (text && text.length >= 100) break; if (text && text.trim().length >= 100) break;
console.log(`Falling back to ${scraper}`); const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
if (nextScraperIndex < scrapersInOrder.length) {
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
}
} }
if (!text) { if (!text) {

View File

@ -0,0 +1,13 @@
import * as docxProcessor from "../docxProcessor";
// Integration test for the DOCX processor: it passes a remote .docx URL to
// fetchAndProcessDocx and checks the extracted text for a known heading.
// NOTE(review): presumably needs outbound network access to nvca.org — confirm
// before running in an isolated CI environment.
describe("DOCX Processing Module - Integration Test", () => {
it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
// Remove the key so the run happens without LLAMAPARSE_API_KEY set.
// The variable is not restored afterwards.
delete process.env.LLAMAPARSE_API_KEY;
const docxContent = await docxProcessor.fetchAndProcessDocx(
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
);
// The document title is expected to appear somewhere in the extracted text.
expect(docxContent.trim()).toContain(
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
);
});
});

View File

@ -63,7 +63,7 @@ export const urlSpecificParams = {
}, },
}, },
"ycombinator.com":{ "ycombinator.com":{
defaultScraper: "playwright", defaultScraper: "fire-engine",
params: { params: {
wait_browser: "networkidle2", wait_browser: "networkidle2",
block_resources: false, block_resources: false,
@ -121,5 +121,25 @@ export const urlSpecificParams = {
accept: accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}, },
},
"help.salesforce.com":{
defaultScraper: "playwright",
params: {
wait_browser: "networkidle2",
block_resources: false,
wait: 2000,
},
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
} }
}; };

View File

@ -0,0 +1,41 @@
import axios from "axios";
import fs from "fs";
import { createWriteStream } from "node:fs";
import path from "path";
import os from "os";
import mammoth from "mammoth";
/**
 * Downloads the DOCX file at `url` to a temporary file and returns its text.
 *
 * @param url Location of the DOCX file.
 * @returns The text extracted from the document.
 * @throws Whatever the download or the text extraction throws; the temporary
 *         file is removed in both the success and the failure case.
 */
export async function fetchAndProcessDocx(url: string): Promise<string> {
  const tempFilePath = await downloadDocx(url);
  try {
    return await processDocxToText(tempFilePath);
  } finally {
    // Clean up the temporary file even when extraction fails.
    fs.unlinkSync(tempFilePath);
  }
}
/**
 * Streams the file at `url` into a uniquely named file in the OS temp
 * directory.
 *
 * @param url Location of the DOCX file.
 * @returns Path of the temporary file once it has been fully written.
 * @throws When the request fails, or when the download stream or the file
 *         writer reports an error.
 */
async function downloadDocx(url: string): Promise<string> {
  const response = await axios({
    url,
    method: "GET",
    responseType: "stream",
  });

  // Date.now() alone collides when several downloads start in the same
  // millisecond, so the process id and a random component are added.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random()
    .toString(36)
    .slice(2)}`;
  const tempFilePath = path.join(os.tmpdir(), `tempDocx-${uniqueSuffix}.docx`);
  const writer = createWriteStream(tempFilePath);

  return new Promise((resolve, reject) => {
    // pipe() does not forward source errors to the destination; without this
    // handler a broken download would leave the promise pending forever.
    response.data.on("error", (err: Error) => {
      writer.destroy();
      reject(err);
    });
    writer.on("finish", () => resolve(tempFilePath));
    writer.on("error", reject);

    response.data.pipe(writer);
  });
}
/**
 * Returns the text content of the DOCX file stored at `filePath`.
 *
 * Thin public wrapper that delegates to `extractTextFromDocx`.
 *
 * @param filePath Path of a DOCX file on disk.
 * @returns The text extracted from the file.
 */
export async function processDocxToText(filePath: string): Promise<string> {
  return extractTextFromDocx(filePath);
}
/**
 * Runs mammoth's raw-text extraction on the DOCX file at `filePath`.
 *
 * @param filePath Path of a DOCX file on disk.
 * @returns The `value` field of mammoth's extraction result.
 */
async function extractTextFromDocx(filePath: string): Promise<string> {
  const { value: rawText } = await mammoth.extractRawText({ path: filePath });
  return rawText;
}

View File

@ -34,8 +34,6 @@ export const excludeNonMainTags = [
"#nav", "#nav",
".breadcrumbs", ".breadcrumbs",
"#breadcrumbs", "#breadcrumbs",
".form",
"form",
"#search-form", "#search-form",
".search", ".search",
"#search", "#search",
@ -51,10 +49,6 @@ export const excludeNonMainTags = [
"#tag", "#tag",
".category", ".category",
"#category", "#category",
".comment", ".cookie",
"#comment", "#cookie"
".reply",
"#reply",
".author",
"#author",
]; ];

View File

@ -19,8 +19,8 @@ export async function fetchAndProcessPdf(url: string): Promise<string> {
async function downloadPdf(url: string): Promise<string> { async function downloadPdf(url: string): Promise<string> {
const response = await axios({ const response = await axios({
url, url,
method: 'GET', method: "GET",
responseType: 'stream', responseType: "stream",
}); });
const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
@ -29,8 +29,8 @@ async function downloadPdf(url: string): Promise<string> {
response.data.pipe(writer); response.data.pipe(writer);
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
writer.on('finish', () => resolve(tempFilePath)); writer.on("finish", () => resolve(tempFilePath));
writer.on('error', reject); writer.on("error", reject);
}); });
} }
@ -77,12 +77,12 @@ export async function processPdfToText(filePath: string): Promise<string> {
} else { } else {
// If the status code is not 200, increment the attempt counter and wait // If the status code is not 200, increment the attempt counter and wait
attempt++; attempt++;
await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
} }
} catch (error) { } catch (error) {
console.error("Error fetching result:", error); console.error("Error fetching result:", error || '');
attempt++; attempt++;
await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
// You may want to handle specific errors differently // You may want to handle specific errors differently
} }
} }
@ -101,7 +101,7 @@ export async function processPdfToText(filePath: string): Promise<string> {
return content; return content;
} }
async function processPdf(file: string){ async function processPdf(file: string) {
const fileContent = fs.readFileSync(file); const fileContent = fs.readFileSync(file);
const data = await pdf(fileContent); const data = await pdf(fileContent);
return data.text; return data.text;

View File

@ -227,10 +227,11 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (creditUsages && creditUsages.length > 0) { if (creditUsages && creditUsages.length > 0) {
totalCreditsUsed = creditUsages[0].total_credits_used; totalCreditsUsed = creditUsages[0].total_credits_used;
console.log("Total Credits Used:", totalCreditsUsed); // console.log("Total Credits Used:", totalCreditsUsed);
} }
} catch (error) { } catch (error) {
console.error("Error calculating credit usage:", error); console.error("Error calculating credit usage:", error);
} }
// Adjust total credits used by subtracting coupon value // Adjust total credits used by subtracting coupon value
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits); const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);

View File

@ -5,6 +5,11 @@ import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper"; import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook"; import { callWebhook } from "./webhook";
import { logJob } from "./logging/log_job"; import { logJob } from "./logging/log_job";
import { initSDK } from '@hyperdx/node-opentelemetry';
if(process.env.ENV === 'production') {
initSDK({ consoleCapture: true, additionalInstrumentations: []});
}
getWebScraperQueue().process( getWebScraperQueue().process(
Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)), Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
@ -26,7 +31,7 @@ getWebScraperQueue().process(
success: success, success: success,
result: { result: {
links: docs.map((doc) => { links: docs.map((doc) => {
return { content: doc, source: doc.metadata.sourceURL }; return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
}), }),
}, },
project_id: job.data.project_id, project_id: job.data.project_id,

View File

@ -2,17 +2,21 @@ import { RateLimiterRedis } from "rate-limiter-flexible";
import * as redis from "redis"; import * as redis from "redis";
import { RateLimiterMode } from "../../src/types"; import { RateLimiterMode } from "../../src/types";
const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5; const MAX_CRAWLS_PER_MINUTE_STARTER = 3;
const MAX_CRAWLS_PER_MINUTE_STARTER = 2; const MAX_CRAWLS_PER_MINUTE_STANDARD = 5;
const MAX_CRAWLS_PER_MINUTE_STANDARD = 4;
const MAX_CRAWLS_PER_MINUTE_SCALE = 20; const MAX_CRAWLS_PER_MINUTE_SCALE = 20;
const MAX_SCRAPES_PER_MINUTE_STARTER = 20;
const MAX_SCRAPES_PER_MINUTE_STANDARD = 40;
const MAX_SCRAPES_PER_MINUTE_SCALE = 50;
const MAX_SEARCHES_PER_MINUTE_STARTER = 20;
const MAX_SEARCHES_PER_MINUTE_STANDARD = 40;
const MAX_SEARCHES_PER_MINUTE_SCALE = 50;
const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5;
const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20; const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20;
const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 150;
const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 120;
export const redisClient = redis.createClient({ export const redisClient = redis.createClient({
url: process.env.REDIS_URL, url: process.env.REDIS_URL,
@ -21,71 +25,109 @@ export const redisClient = redis.createClient({
export const previewRateLimiter = new RateLimiterRedis({ export const previewRateLimiter = new RateLimiterRedis({
storeClient: redisClient, storeClient: redisClient,
keyPrefix: "middleware", keyPrefix: "preview",
points: MAX_REQUESTS_PER_MINUTE_PREVIEW, points: MAX_REQUESTS_PER_MINUTE_PREVIEW,
duration: 60, // Duration in seconds duration: 60, // Duration in seconds
}); });
export const serverRateLimiter = new RateLimiterRedis({ export const serverRateLimiter = new RateLimiterRedis({
storeClient: redisClient, storeClient: redisClient,
keyPrefix: "middleware", keyPrefix: "server",
points: MAX_REQUESTS_PER_MINUTE_ACCOUNT, points: MAX_REQUESTS_PER_MINUTE_ACCOUNT,
duration: 60, // Duration in seconds duration: 60, // Duration in seconds
}); });
export const crawlStatusRateLimiter = new RateLimiterRedis({ export const crawlStatusRateLimiter = new RateLimiterRedis({
storeClient: redisClient, storeClient: redisClient,
keyPrefix: "middleware", keyPrefix: "crawl-status",
points: MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS, points: MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS,
duration: 60, // Duration in seconds duration: 60, // Duration in seconds
}); });
export const testSuiteRateLimiter = new RateLimiterRedis({ export const testSuiteRateLimiter = new RateLimiterRedis({
storeClient: redisClient, storeClient: redisClient,
keyPrefix: "middleware", keyPrefix: "test-suite",
points: 1000, points: 10000,
duration: 60, // Duration in seconds duration: 60, // Duration in seconds
}); });
export function crawlRateLimit(plan: string){ export function getRateLimiter(mode: RateLimiterMode, token: string, plan?: string){
if(plan === "standard"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "middleware",
points: MAX_CRAWLS_PER_MINUTE_STANDARD,
duration: 60, // Duration in seconds
});
}else if(plan === "scale"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "middleware",
points: MAX_CRAWLS_PER_MINUTE_SCALE,
duration: 60, // Duration in seconds
});
}
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "middleware",
points: MAX_CRAWLS_PER_MINUTE_STARTER,
duration: 60, // Duration in seconds
});
}
export function getRateLimiter(mode: RateLimiterMode, token: string){
// Special test suite case. TODO: Change this later. // Special test suite case. TODO: Change this later.
if(token.includes("5089cefa58")){ if (token.includes("5089cefa58") || token.includes("6254cf9")){
return testSuiteRateLimiter; return testSuiteRateLimiter;
} }
switch(mode) { switch (mode) {
case RateLimiterMode.Preview: case RateLimiterMode.Preview:
return previewRateLimiter; return previewRateLimiter;
case RateLimiterMode.CrawlStatus: case RateLimiterMode.CrawlStatus:
return crawlStatusRateLimiter; return crawlStatusRateLimiter;
case RateLimiterMode.Crawl:
if (plan === "standard"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "crawl-standard",
points: MAX_CRAWLS_PER_MINUTE_STANDARD,
duration: 60, // Duration in seconds
});
} else if (plan === "scale"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "crawl-scale",
points: MAX_CRAWLS_PER_MINUTE_SCALE,
duration: 60, // Duration in seconds
});
}
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "crawl-starter",
points: MAX_CRAWLS_PER_MINUTE_STARTER,
duration: 60, // Duration in seconds
});
case RateLimiterMode.Scrape:
if (plan === "standard"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "scrape-standard",
points: MAX_SCRAPES_PER_MINUTE_STANDARD,
duration: 60, // Duration in seconds
});
} else if (plan === "scale"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "scrape-scale",
points: MAX_SCRAPES_PER_MINUTE_SCALE,
duration: 60, // Duration in seconds
});
}
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "scrape-starter",
points: MAX_SCRAPES_PER_MINUTE_STARTER,
duration: 60, // Duration in seconds
});
case RateLimiterMode.Search:
if (plan === "standard"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "search-standard",
points: MAX_SEARCHES_PER_MINUTE_STANDARD,
duration: 60, // Duration in seconds
});
} else if (plan === "scale"){
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "search-scale",
points: MAX_SEARCHES_PER_MINUTE_SCALE,
duration: 60, // Duration in seconds
});
}
return new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "search-starter",
points: MAX_SEARCHES_PER_MINUTE_STARTER,
duration: 60, // Duration in seconds
});
default: default:
return serverRateLimiter; return serverRateLimiter;
} }

View File

@ -240,7 +240,7 @@ export default class FirecrawlApp {
* @param {string} action - The action being performed when the error occurred. * @param {string} action - The action being performed when the error occurred.
*/ */
handleError(response, action) { handleError(response, action) {
if ([402, 409, 500].includes(response.status)) { if ([402, 408, 409, 500].includes(response.status)) {
const errorMessage = response.data.error || "Unknown error occurred"; const errorMessage = response.data.error || "Unknown error occurred";
throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
} }

View File

@ -1,6 +1,6 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "0.0.20", "version": "0.0.21",
"description": "JavaScript SDK for Firecrawl API", "description": "JavaScript SDK for Firecrawl API",
"main": "build/index.js", "main": "build/index.js",
"types": "types/index.d.ts", "types": "types/index.d.ts",

View File

@ -109,7 +109,7 @@ export default class FirecrawlApp {
const response: AxiosResponse = await axios.post( const response: AxiosResponse = await axios.post(
"https://api.firecrawl.dev/v0/scrape", "https://api.firecrawl.dev/v0/scrape",
jsonData, jsonData,
{ headers } { headers },
); );
if (response.status === 200) { if (response.status === 200) {
const responseData = response.data; const responseData = response.data;
@ -324,7 +324,7 @@ export default class FirecrawlApp {
* @param {string} action - The action being performed when the error occurred. * @param {string} action - The action being performed when the error occurred.
*/ */
handleError(response: AxiosResponse, action: string): void { handleError(response: AxiosResponse, action: string): void {
if ([402, 409, 500].includes(response.status)) { if ([402, 408, 409, 500].includes(response.status)) {
const errorMessage: string = const errorMessage: string =
response.data.error || "Unknown error occurred"; response.data.error || "Unknown error occurred";
throw new Error( throw new Error(

View File

@ -4,10 +4,11 @@ import requests
import time import time
class FirecrawlApp: class FirecrawlApp:
def __init__(self, api_key=None): def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
if self.api_key is None: if self.api_key is None:
raise ValueError('No API key provided') raise ValueError('No API key provided')
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
@ -38,7 +39,7 @@ class FirecrawlApp:
scrape_params[key] = value scrape_params[key] = value
# Make the POST request with the prepared headers and JSON data # Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/scrape', f'{self.api_url}/v0/scrape',
headers=headers, headers=headers,
json=scrape_params json=scrape_params
) )
@ -48,7 +49,7 @@ class FirecrawlApp:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}') raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 408, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else: else:
@ -63,7 +64,7 @@ class FirecrawlApp:
if params: if params:
json_data.update(params) json_data.update(params)
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/search', f'{self.api_url}/v0/search',
headers=headers, headers=headers,
json=json_data json=json_data
) )
@ -85,7 +86,7 @@ class FirecrawlApp:
json_data = {'url': url} json_data = {'url': url}
if params: if params:
json_data.update(params) json_data.update(params)
response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers) response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
if response.status_code == 200: if response.status_code == 200:
job_id = response.json().get('jobId') job_id = response.json().get('jobId')
if wait_until_done: if wait_until_done:
@ -97,7 +98,7 @@ class FirecrawlApp:
def check_crawl_status(self, job_id): def check_crawl_status(self, job_id):
headers = self._prepare_headers() headers = self._prepare_headers()
response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
if response.status_code == 200: if response.status_code == 200:
return response.json() return response.json()
else: else:
@ -130,7 +131,7 @@ class FirecrawlApp:
def _monitor_job_status(self, job_id, headers, timeout): def _monitor_job_status(self, job_id, headers, timeout):
import time import time
while True: while True:
status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
if status_response.status_code == 200: if status_response.status_code == 200:
status_data = status_response.json() status_data = status_response.json()
if status_data['status'] == 'completed': if status_data['status'] == 'completed':
@ -148,7 +149,7 @@ class FirecrawlApp:
self._handle_error(status_response, 'check crawl status') self._handle_error(status_response, 'check crawl status')
def _handle_error(self, response, action): def _handle_error(self, response, action):
if response.status_code in [402, 409, 500]: if response.status_code in [402, 408, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
else: else:

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -4,10 +4,11 @@ import requests
import time import time
class FirecrawlApp: class FirecrawlApp:
def __init__(self, api_key=None): def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
if self.api_key is None: if self.api_key is None:
raise ValueError('No API key provided') raise ValueError('No API key provided')
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
@ -38,7 +39,7 @@ class FirecrawlApp:
scrape_params[key] = value scrape_params[key] = value
# Make the POST request with the prepared headers and JSON data # Make the POST request with the prepared headers and JSON data
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/scrape', f'{self.api_url}/v0/scrape',
headers=headers, headers=headers,
json=scrape_params json=scrape_params
) )
@ -48,7 +49,7 @@ class FirecrawlApp:
return response['data'] return response['data']
else: else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}') raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]: elif response.status_code in [402, 408, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else: else:
@ -63,7 +64,7 @@ class FirecrawlApp:
if params: if params:
json_data.update(params) json_data.update(params)
response = requests.post( response = requests.post(
'https://api.firecrawl.dev/v0/search', f'{self.api_url}/v0/search',
headers=headers, headers=headers,
json=json_data json=json_data
) )
@ -85,7 +86,7 @@ class FirecrawlApp:
json_data = {'url': url} json_data = {'url': url}
if params: if params:
json_data.update(params) json_data.update(params)
response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers) response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
if response.status_code == 200: if response.status_code == 200:
job_id = response.json().get('jobId') job_id = response.json().get('jobId')
if wait_until_done: if wait_until_done:
@ -97,7 +98,7 @@ class FirecrawlApp:
def check_crawl_status(self, job_id): def check_crawl_status(self, job_id):
headers = self._prepare_headers() headers = self._prepare_headers()
response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
if response.status_code == 200: if response.status_code == 200:
return response.json() return response.json()
else: else:
@ -130,7 +131,7 @@ class FirecrawlApp:
def _monitor_job_status(self, job_id, headers, timeout): def _monitor_job_status(self, job_id, headers, timeout):
import time import time
while True: while True:
status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
if status_response.status_code == 200: if status_response.status_code == 200:
status_data = status_response.json() status_data = status_response.json()
if status_data['status'] == 'completed': if status_data['status'] == 'completed':
@ -148,7 +149,7 @@ class FirecrawlApp:
self._handle_error(status_response, 'check crawl status') self._handle_error(status_response, 'check crawl status')
def _handle_error(self, response, action): def _handle_error(self, response, action):
if response.status_code in [402, 409, 500]: if response.status_code in [402, 408, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred') error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
else: else:

View File

@ -1,6 +1,6 @@
Metadata-Version: 2.1 Metadata-Version: 2.1
Name: firecrawl-py Name: firecrawl-py
Version: 0.0.8 Version: 0.0.9
Summary: Python SDK for Firecrawl API Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai Author: Mendable.ai

View File

@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup( setup(
name='firecrawl-py', name='firecrawl-py',
version='0.0.8', version='0.0.9',
url='https://github.com/mendableai/firecrawl', url='https://github.com/mendableai/firecrawl',
author='Mendable.ai', author='Mendable.ai',
author_email='nick@mendable.ai', author_email='nick@mendable.ai',

View File

@ -0,0 +1,178 @@
[
{
"website": "https://www.vellum.ai/llm-leaderboard",
"expected_min_num_of_pages": 1,
"expected_crawled_pages": ["https://www.vellum.ai/llm-leaderboard"]
},
{
"website": "https://openai.com/news",
"expected_min_num_of_pages": 4,
"expected_crawled_pages": [
"https://openai.com/news/company/",
"https://openai.com/news/research/",
"https://openai.com/news/safety-and-alignment/",
"https://openai.com/news/stories/"
]
},
{
"website": "https://www.framer.com/pricing",
"expected_min_num_of_pages": 1,
"expected_not_crawled_pages": [
"https://www.framer.com/features/navigation/",
"https://www.framer.com/contact/",
"https://www.framer.com/add-ons/",
"https://www.framer.com/free-saas-ui-kit/",
"https://www.framer.com/help/",
"https://www.framer.com/features/effects/",
"https://www.framer.com/enterprise/",
"https://www.framer.com/templates/"
]
},
{
"website": "https://mendable.ai/pricing",
"expected_min_num_of_pages": 1,
"expected_not_crawled_pages": [
"https://mendable.ai/",
"https://mendable.ai/blog",
"https://mendable.ai/signin",
"https://mendable.ai/signup",
"https://mendable.ai",
"https://mendable.ai/usecases/sales-enablement",
"https://mendable.ai/usecases/documentation",
"https://mendable.ai/usecases/cs-enablement",
"https://mendable.ai/usecases/productcopilot",
"https://mendable.ai/security"
],
"notes": "This one should not go backwards, but it does!"
},
{
"website": "https://agentops.ai/blog",
"expected_min_num_of_pages": 6,
"expected_crawled_pages": [
"https://www.agentops.ai/blog/effortless-hr-management-with-saas",
"https://www.agentops.ai/blog/streamlining-hr-with-saas",
"https://www.agentops.ai/blog/simplify-hr-with-modern-saas-solutions",
"https://www.agentops.ai/blog/efficient-hr-operations-with-saas",
"https://www.agentops.ai/blog/hr-made-simple-with-saas",
"https://agentops.ai/blog"
],
"expected_not_crawled_pages": [
"https://agentops.ai/about-us",
"https://agentops.ai/contact-us"
]
},
{
"website": "https://en.wikipedia.org/wiki/T._N._Seshan",
"expected_min_num_of_pages": 1,
"expected_not_crawled_pages": [
"https://en.wikipedia.org/wiki/Wikipedia:Contents",
"https://en.wikipedia.org/wiki/Wikipedia:Contact_us",
"https://en.wikipedia.org/wiki/V._S._Ramadevi",
"https://en.wikipedia.org/wiki/Wikipedia:About",
"https://en.wikipedia.org/wiki/Help:Introduction",
"https://en.wikipedia.org/wiki/H._D._Deve_Gowda",
"https://en.wikipedia.org/wiki/File:T.N._Seshan_in_1994.jpg"
]
},
{
"website": "https://ycombinator.com/companies",
"expected_min_num_of_pages": 20,
"expected_crawled_pages": [
"https://www.ycombinator.com/companies/industry/elearning",
"https://www.ycombinator.com/companies/industry/computer-vision",
"https://www.ycombinator.com/companies/industry/health-tech",
"https://www.ycombinator.com/companies/industry/education",
"https://www.ycombinator.com/companies/industry/robotics",
"https://www.ycombinator.com/companies/industry/hardware",
"https://www.ycombinator.com/companies/industry/saas",
"https://www.ycombinator.com/companies/industry/hard-tech",
"https://www.ycombinator.com/companies/industry/developer-tools",
"https://www.ycombinator.com/companies/industry/entertainment",
"https://www.ycombinator.com/companies/industry/finance",
"https://www.ycombinator.com/companies/industry/generative-ai",
"https://www.ycombinator.com/companies/industry/machine-learning"
]
},
{
"website": "https://firecrawl.dev",
"expected_min_num_of_pages": 2,
"expected_crawled_pages": [
"https://firecrawl.dev/",
"https://firecrawl.dev/pricing"
]
},
{
"website": "https://fly.io/docs/gpus/gpu-quickstart",
"expected_min_num_of_pages": 1,
"expected_not_crawled_pages": [
"https://fly.io/docs/getting-started/",
"https://fly.io/docs/hands-on/",
"https://fly.io/docs/about/support/",
"https://fly.io/docs/blueprints/going-to-production-with-healthcare-apps/",
"https://fly.io/docs/machines/flyctl/fly-machine-update/",
"https://fly.io/docs/blueprints/review-apps-guide/",
"https://fly.io/docs/blueprints/supercronic/"
],
"notes": "This one should not go backwards, but it does!"
},
{
"website": "https://www.instructables.com/circuits",
"expected_min_num_of_pages": 12,
"expected_crawled_pages": [
"https://www.instructables.com/circuits/",
"https://www.instructables.com/circuits/apple/projects/",
"https://www.instructables.com/circuits/art/projects/",
"https://www.instructables.com/circuits/electronics/projects/",
"https://www.instructables.com/circuits/microsoft/projects/",
"https://www.instructables.com/circuits/microcontrollers/projects/",
"https://www.instructables.com/circuits/community/",
"https://www.instructables.com/circuits/leds/projects/",
"https://www.instructables.com/circuits/gadgets/projects/",
"https://www.instructables.com/circuits/arduino/projects/",
"https://www.instructables.com/circuits/lasers/projects/",
"https://www.instructables.com/circuits/clocks/projects/"
]
},
{
"website": "https://richmondconfidential.org",
"expected_min_num_of_pages": 20,
"expected_crawled_pages": [
"https://richmondconfidential.org/2009/10/13/salesians-star-guard-has-a-big-impact/",
"https://richmondconfidential.org/2009/10/13/on-team-of-beginners-oilers-old-hand-stands-out/",
"https://richmondconfidential.org/2009/10/19/point-richmond-clockmaker-turns-clutter-into-crafts/",
"https://richmondconfidential.org/2009/10/13/profile-maurice-cathy/",
"https://richmondconfidential.org/2009/10/13/soul-food-rescue-mission-rebuilds-diets-and-lives/",
"https://richmondconfidential.org/2009/10/21/in-tough-economy-pain-trickles-to-the-bottom/",
"https://richmondconfidential.org/2009/10/19/richmond-homicide-map/",
"https://richmondconfidential.org/2009/10/13/rough-roads-for-richmonds-cab-drivers/",
"https://richmondconfidential.org/2009/10/13/before-napa-there-was-winehaven/",
"https://richmondconfidential.org/2009/10/13/family-calls-for-end-to-violence-at-memorial-for-slain-woman-friend/"
]
},
{
"website": "https://www.boardgamegeek.com",
"expected_min_num_of_pages": 15,
"expected_crawled_pages": [
"https://www.boardgamegeek.com/browse/boardgameartist",
"https://www.boardgamegeek.com/browse/boardgamehonor",
"https://www.boardgamegeek.com/browse/boardgamepublisher",
"https://www.boardgamegeek.com/browse/boardgamepodcast",
"https://www.boardgamegeek.com/wiki/page/Index",
"https://www.boardgamegeek.com/browse/boardgamecategory",
"https://www.boardgamegeek.com/boardgame/random",
"https://www.boardgamegeek.com/browse/boardgamemechanic",
"https://www.boardgamegeek.com/forums",
"https://www.boardgamegeek.com/gonecardboard",
"https://www.boardgamegeek.com/browse/boardgameaccessory",
"https://www.boardgamegeek.com/browse/boardgamedesigner",
"https://www.boardgamegeek.com/",
"https://www.boardgamegeek.com/previews",
"https://www.boardgamegeek.com/browse/boardgame"
]
}
]

View File

@ -3,7 +3,9 @@
"version": "1.0.0", "version": "1.0.0",
"description": "", "description": "",
"scripts": { "scripts": {
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false" "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false",
"test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts",
"test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts"
}, },
"author": "", "author": "",
"license": "ISC", "license": "ISC",

View File

@ -0,0 +1,150 @@
import request from "supertest";
import dotenv from "dotenv";
import { WebsiteScrapeError } from "../utils/types";
import { logErrors } from "../utils/log";
import websitesData from "../data/crawl.json";
import "dotenv/config";
import fs from 'fs';
dotenv.config();
/**
 * Shape of one entry in the crawl dataset (`../data/crawl.json`).
 *
 * Every entry names the site to crawl and the minimum number of pages the
 * crawl must return. The page lists are optional because dataset entries
 * carry either an allow-list, a deny-list, or both.
 */
interface WebsiteData {
  /** URL the crawl is started from. */
  website: string;
  /** Minimum number of pages the crawl must return to pass. */
  expected_min_num_of_pages: number;
  /** URLs that must all appear in the crawl result. */
  expected_crawled_pages?: string[];
  /** URLs that must not appear in the crawl result. */
  expected_not_crawled_pages?: string[];
  /** Free-form remark about the entry; not read by the test. */
  notes?: string;
}
// Base URL (loopback address, port 3002) that every supertest request in this file is sent to.
const TEST_URL = "http://127.0.0.1:3002";
/**
 * End-to-end crawl check.
 *
 * For every entry of the crawl dataset this suite starts a crawl through the
 * HTTP API, polls the status endpoint until the job finishes, and validates
 * the returned URLs against the entry's expectations. The suite passes when
 * at least 90% of the dataset entries pass.
 */
describe("Crawling Checkup (E2E)", () => {
  // Jest timeout for the dataset test, in milliseconds (350 seconds).
  const TEST_TIMEOUT_MS = 350000;
  // Stop polling shortly before Jest would abort the test, so that the
  // collected failures are still scored, reported and written to disk.
  const POLLING_BUDGET_MS = TEST_TIMEOUT_MS - 10000;
  // Delay between two status requests for the same job.
  const POLL_INTERVAL_MS = 1000;

  beforeAll(() => {
    if (!process.env.TEST_API_KEY) {
      throw new Error("TEST_API_KEY is not set");
    }
  });

  describe("Crawling website tests with a dataset", () => {
    it("Should crawl the website and verify the response", async () => {
      let passedTests = 0;
      const startTime = new Date().getTime();
      const date = new Date();
      const logsDir = `logs/${date.getMonth() + 1}-${date.getDate()}-${date.getFullYear()}`;
      const errorLogFileName = `${logsDir}/run.log_${new Date().toTimeString().split(' ')[0]}`;
      const errorLog: WebsiteScrapeError[] = [];

      // Appends one failure to the error log and prints only that entry
      // (printing the whole log on every failure makes the output unreadable).
      const recordFailure = (website: string, expected: string, actual: string, error: string) => {
        const entry: WebsiteScrapeError = {
          website,
          prompt: 'CRAWL',
          expected_output: expected,
          actual_output: actual,
          error,
        };
        errorLog.push(entry);
        console.log('Error: ', entry);
      };

      const budgetExhausted = () => Date.now() - startTime >= POLLING_BUDGET_MS;

      for (const websiteData of websitesData) {
        // Do not start new crawl jobs once the time budget is used up.
        if (budgetExhausted()) {
          recordFailure(
            websiteData.website,
            'SUCCESS',
            'FAILURE',
            `Skipped: the time budget of ${POLLING_BUDGET_MS} ms was exhausted before this website could be crawled.`
          );
          continue;
        }

        try {
          const crawlResponse = await request(TEST_URL)
            .post("/v0/crawl")
            .set("Content-Type", "application/json")
            .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
            .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100, returnOnlyUrls: true }});

          // Without a job id there is nothing to poll; polling
          // `/v0/crawl/status/undefined` would never report "completed".
          const jobId = crawlResponse.body?.jobId;
          if (!jobId) {
            recordFailure(
              websiteData.website,
              'SUCCESS',
              'FAILURE',
              `Crawl request returned no jobId (HTTP ${crawlResponse.statusCode}): ${JSON.stringify(crawlResponse.body)}`
            );
            continue;
          }

          let completedResponse: any;
          let status: string | undefined;
          while (true) {
            completedResponse = await request(TEST_URL)
              .get(`/v0/crawl/status/${jobId}`)
              .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
            status = completedResponse.body?.status;

            // NOTE(review): "failed" is assumed to be the status reported for
            // a job that errored - confirm against the status endpoint.
            if (status === "completed" || status === "failed" || budgetExhausted()) {
              break;
            }
            await new Promise(resolve => setTimeout(resolve, POLL_INTERVAL_MS));
          }

          if (status !== "completed") {
            recordFailure(
              websiteData.website,
              'SUCCESS',
              'FAILURE',
              `Crawl job did not complete successfully (last status: ${status}).`
            );
            continue;
          }

          // URLs actually returned by the crawl; used for all checks and for
          // readable failure messages (the raw entries are objects).
          const crawledUrls: string[] = (completedResponse.body.data ?? []).map(
            (d: { url: string }) => d.url
          );

          // check how many webpages were crawled successfully
          // compares with expected_min_num_of_pages
          if (crawledUrls.length < websiteData.expected_min_num_of_pages) {
            recordFailure(
              websiteData.website,
              `SUCCESS: ${websiteData.expected_min_num_of_pages}`,
              `FAILURE: ${crawledUrls.length}`,
              `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${crawledUrls.length}`
            );
            continue;
          }

          // checks if crawled pages contain expected_crawled_pages
          const expectedPages: string[] = websiteData.expected_crawled_pages ?? [];
          const missingPages = expectedPages.filter(page => !crawledUrls.includes(page));
          if (missingPages.length > 0) {
            recordFailure(
              websiteData.website,
              `SUCCESS: ${expectedPages.join(", ")}`,
              `FAILURE: ${crawledUrls.join(", ")}`,
              `Expected crawled pages to contain ${expectedPages.join(", ")}, but these were missing: ${missingPages.join(", ")}`
            );
            continue;
          }

          // checks if crawled pages do not contain expected_not_crawled_pages
          const forbiddenPages: string[] = websiteData.expected_not_crawled_pages ?? [];
          const unexpectedPages = forbiddenPages.filter(page => crawledUrls.includes(page));
          if (unexpectedPages.length > 0) {
            recordFailure(
              websiteData.website,
              `SUCCESS: ${forbiddenPages.join(", ")}`,
              `FAILURE: ${crawledUrls.join(", ")}`,
              `Expected crawled pages to not contain ${forbiddenPages.join(", ")}, but these were crawled: ${unexpectedPages.join(", ")}`
            );
            continue;
          }

          passedTests++;
        } catch (error) {
          console.error(`Error processing ${websiteData.website}: ${error}`);
          errorLog.push({
            website: websiteData.website,
            prompt: 'CRAWL',
            expected_output: 'SUCCESS',
            actual_output: 'FAILURE',
            error: `Error processing ${websiteData.website}: ${error}`
          });
          continue;
        }
      }

      const score = (passedTests / websitesData.length) * 100;
      const endTime = new Date().getTime();
      const timeTaken = (endTime - startTime) / 1000;
      console.log(`Score: ${score}%`);

      await logErrors(errorLog, timeTaken, 0, score, websitesData.length);

      // Only local runs keep a copy of the failures on disk.
      if (process.env.ENV === "local" && errorLog.length > 0) {
        if (!fs.existsSync(logsDir)){
          fs.mkdirSync(logsDir, { recursive: true });
        }
        fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2));
      }

      expect(score).toBeGreaterThanOrEqual(90);
    }, TEST_TIMEOUT_MS); // 350 seconds timeout
  });
});

View File

@ -1,16 +1,14 @@
import request from "supertest"; import request from "supertest";
import dotenv from "dotenv"; import dotenv from "dotenv";
import Anthropic from "@anthropic-ai/sdk"; import { numTokensFromString } from "../utils/tokens";
import { numTokensFromString } from "./utils/tokens";
import OpenAI from "openai"; import OpenAI from "openai";
import { WebsiteScrapeError } from "./utils/types"; import { WebsiteScrapeError } from "../utils/types";
import { logErrors } from "./utils/log"; import { logErrors } from "../utils/log";
const websitesData = require("./data/websites.json"); import websitesData from "../data/scrape.json";
import "dotenv/config"; import "dotenv/config";
const fs = require('fs'); import fs from 'fs';
dotenv.config(); dotenv.config();
interface WebsiteData { interface WebsiteData {
@ -21,8 +19,7 @@ interface WebsiteData {
const TEST_URL = "http://127.0.0.1:3002"; const TEST_URL = "http://127.0.0.1:3002";
describe("Scraping Checkup (E2E)", () => {
describe("Scraping/Crawling Checkup (E2E)", () => {
beforeAll(() => { beforeAll(() => {
if (!process.env.TEST_API_KEY) { if (!process.env.TEST_API_KEY) {
throw new Error("TEST_API_KEY is not set"); throw new Error("TEST_API_KEY is not set");
@ -72,10 +69,6 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
return null; return null;
} }
const anthropic = new Anthropic({
apiKey: process.env.ANTHROPIC_API_KEY,
});
const openai = new OpenAI({ const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY, apiKey: process.env.OPENAI_API_KEY,
}); });
@ -183,7 +176,7 @@ describe("Scraping/Crawling Checkup (E2E)", () => {
} }
expect(score).toBeGreaterThanOrEqual(75); expect(score).toBeGreaterThanOrEqual(70);
}, 350000); // 350 seconds timeout }, 350000); // 350 seconds timeout
}); });
}); });

View File

@ -39,7 +39,7 @@
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */ "resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */ // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */

78
docker-compose.yaml Normal file
View File

@ -0,0 +1,78 @@
# Docker Compose stack for Firecrawl: a Playwright microservice, the HTTP API,
# a worker built from the same build context as the API, and Redis. All
# services share the single `backend` bridge network.
#
# `${VAR}` and `${VAR:-default}` are interpolated by Compose from the invoking
# shell or a `.env` file; the `:-` form falls back to the default when the
# variable is unset or empty.
#
# The top-level `version` attribute is deliberately omitted: it is obsolete in
# the Compose Specification and only triggers a warning when present.
name: firecrawl

# Environment shared verbatim by the `api` and `worker` services. It is
# declared once as an extension field (top-level `x-*` keys are ignored by
# Compose) and aliased below so the two lists cannot drift apart.
x-api-environment: &api-environment
  - REDIS_URL=${REDIS_URL:-redis://redis:6379}
  - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
  - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
  - PORT=${PORT:-3002}
  - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
  - OPENAI_API_KEY=${OPENAI_API_KEY}
  - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
  - SERPER_API_KEY=${SERPER_API_KEY}
  - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
  - LOGTAIL_KEY=${LOGTAIL_KEY}
  - BULL_AUTH_KEY=${BULL_AUTH_KEY}
  - TEST_API_KEY=${TEST_API_KEY}
  - POSTHOG_API_KEY=${POSTHOG_API_KEY}
  - POSTHOG_HOST=${POSTHOG_HOST}
  - SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
  - SUPABASE_URL=${SUPABASE_URL}
  - SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
  - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
  - HOST=${HOST:-0.0.0.0}

services:
  # Not published to the host; other services reach it on the backend network
  # at http://playwright-service:3000 (the PLAYWRIGHT_MICROSERVICE_URL default).
  playwright-service:
    build: apps/playwright-service
    environment:
      - PORT=3000
    networks:
      - backend

  api:
    build: apps/api
    environment: *api-environment
    depends_on:
      - redis
      - playwright-service
    ports:
      # Follow the same PORT value that is passed to the container, so that
      # overriding PORT does not leave the published mapping pointing at 3002.
      # NOTE(review): assumes the API listens on $PORT - confirm in apps/api.
      - "${PORT:-3002}:${PORT:-3002}"
    command: ["pnpm", "run", "start:production"]
    networks:
      - backend

  # Same build context and environment as `api`, with no published ports.
  # There is no `command` override, so the image's default command runs.
  # NOTE(review): confirm that default starts the queue worker rather than a
  # second API server.
  worker:
    build: apps/api
    environment: *api-environment
    depends_on:
      - redis
      - playwright-service
      - api
    networks:
      - backend

  # No `ports` entry, so Redis is reachable only from containers attached to
  # the backend network; `--bind 0.0.0.0` makes it listen on all interfaces
  # inside its container.
  redis:
    image: redis:alpine
    networks:
      - backend
    command: redis-server --bind 0.0.0.0

networks:
  backend:
    driver: bridge