Merge branch 'feat/maxDepthRelative' of https://github.com/mendableai/firecrawl into feat/maxDepthRelative

This commit is contained in:
Eric Ciarla 2024-06-15 16:52:00 -04:00
commit f0d4146b42
34 changed files with 58443 additions and 25 deletions

3
.gitignore vendored
View File

@ -13,4 +13,5 @@ apps/test-suite/node_modules/
apps/test-suite/.env
apps/test-suite/logs
apps/test-suite/logs
apps/test-suite/load-test-results/test-run-report.json

65
apps/api/fly.staging.toml Normal file
View File

@ -0,0 +1,65 @@
# fly.toml app configuration file generated for firecrawl-scraper-js on 2024-04-07T21:09:59-03:00
#
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
#
app = 'staging-firecrawl-scraper-js'
primary_region = 'mia'
kill_signal = 'SIGINT'
kill_timeout = '5s'
[build]
[processes]
app = 'npm run start:production'
worker = 'npm run worker:production'
[http_service]
internal_port = 8080
force_https = true
auto_stop_machines = true
auto_start_machines = true
min_machines_running = 2
processes = ['app']
[http_service.concurrency]
type = "requests"
hard_limit = 100
soft_limit = 50
[[http_service.checks]]
grace_period = "10s"
interval = "30s"
method = "GET"
timeout = "5s"
path = "/"
[[services]]
protocol = 'tcp'
internal_port = 8080
processes = ['worker']
[[services.ports]]
port = 80
handlers = ['http']
force_https = true
[[services.ports]]
port = 443
handlers = ['tls', 'http']
[services.concurrency]
type = 'connections'
hard_limit = 25
soft_limit = 20
[[vm]]
size = 'performance-1x'
processes = ['app','worker']

View File

@ -17,7 +17,8 @@
"worker:production": "node dist/src/services/queue-worker.js",
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
"mongo-docker-console": "docker exec -it mongodb mongosh",
"run-example": "npx ts-node src/example.ts"
"run-example": "npx ts-node src/example.ts",
"deploy:fly:staging": "fly deploy -c fly.staging.toml"
},
"author": "",
"license": "ISC",

View File

@ -677,7 +677,7 @@ describe("E2E Tests for API Routes", () => {
});
}, 180000);
@ -726,7 +726,7 @@ describe("E2E Tests for API Routes", () => {
// expect(completedResponse.body.data[0].content).not.toContain("main menu");
// }, 60000); // 60 seconds
it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (1)", async () => {
it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -770,7 +770,7 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
// 120 seconds
// 120 seconds
expect(completedResponse.body.data[0]).toHaveProperty("html");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("_Roast_");

View File

@ -230,11 +230,10 @@ export class WebCrawler {
}
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
const normalizedUrl = this.normalizeCrawlUrl(url);
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
return [];
}
this.visited.add(normalizedUrl);
this.visited.add(url);
if (!url.startsWith("http")) {
url = "https://" + url;
@ -282,15 +281,16 @@ export class WebCrawler {
const urlObj = new URL(fullUrl);
const path = urlObj.pathname;
if (
this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) &&
// The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
// this.matchesIncludes(path) &&
!this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
this.isRobotsAllowed(fullUrl)
) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
}
}
@ -300,12 +300,15 @@ export class WebCrawler {
return links;
}
// Create a new list to return to avoid modifying the visited list
return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
return links.filter((link) => !this.visited.has(link.url));
} catch (error) {
return [];
}
}
private isRobotsAllowed(url: string): boolean {
return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
}
private normalizeCrawlUrl(url: string): string {
try{
const urlObj = new URL(url);
@ -332,12 +335,10 @@ export class WebCrawler {
private isInternalLink(link: string): boolean {
const urlObj = new URL(link, this.baseUrl);
const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, "");
return urlObj.hostname === domainWithoutProtocol;
}
private matchesPattern(link: string): boolean {
return true; // Placeholder for future pattern matching implementation
const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
return linkDomain === baseDomain;
}
private isFile(url: string): boolean {

View File

@ -292,6 +292,7 @@ function getScrapingFallbackOrder(
? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
: [...filteredDefaultOrder, ...availableScrapers]
);
const scrapersInOrder = Array.from(uniqueScrapers);
return scrapersInOrder as (typeof baseScrapers)[number][];
}

View File

@ -122,6 +122,36 @@ export const urlSpecificParams = {
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
},
"scrapethissite.com":{
defaultScraper: "fetch",
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
},
"rsseau.fr":{
defaultScraper: "fetch",
headers: {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
referer: "https://www.google.com/",
"accept-language": "en-US,en;q=0.9",
"accept-encoding": "gzip, deflate, br",
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
},
},
"help.salesforce.com":{
defaultScraper: "playwright",
params: {

View File

@ -41,8 +41,8 @@ const RATE_LIMITS = {
default: 5,
},
account: {
free: 20,
default: 20,
free: 100,
default: 100,
},
crawlStatus: {
free: 150,
@ -72,21 +72,25 @@ export const serverRateLimiter = createRateLimiter(
RATE_LIMITS.account.default
);
export const testSuiteRateLimiter = createRateLimiter(
"test-suite",
RATE_LIMITS.testSuite.default
);
export const testSuiteRateLimiter = new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: "test-suite",
points: 10000,
duration: 60, // Duration in seconds
});
export function getRateLimiter(
mode: RateLimiterMode,
token: string,
plan?: string
) {
if (token.includes("a01ccae") || token.includes("6254cf9")) {
return testSuiteRateLimiter;
}
const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5}
if (!rateLimitConfig) return serverRateLimiter;
const planKey = plan ? plan.replace("-", "") : "default"; // "default"

View File

@ -45,7 +45,7 @@ export interface FirecrawlJob {
export enum RateLimiterMode {
Crawl = "crawl",
CrawlStatus = "crawl-status",
CrawlStatus = "crawlStatus",
Scrape = "scrape",
Preview = "preview",
Search = "search",

View File

@ -16,6 +16,22 @@ npx playwright install
npm run test
```
## Running Load Tests with Artillery
To run load tests using Artillery, follow these steps:
1. Install Artillery globally if you haven't already:
```bash
npm install -g artillery
```
2. Run the load test:
```bash
artillery run load-test.yml
```
## Test Results
The tests are designed to cover various aspects of the system, including:

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 201 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 77 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 105 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,98 @@
# Scraping Load Testing - Test #1
## Summary
The load test successfully processed 600 requests in 60 seconds with all requests returning HTTP 200 status codes. The average response time was 1380.1 ms, with CPU utilization peaking at around 50% on both machines, indicating sufficient CPU resources. However, there was a significant increase in memory usage post-test, which did not return to pre-test levels, suggesting a potential memory leak. Further investigation and additional load tests are recommended to address this issue and optimize the system's performance.
## Table of Contents
- [Scraping Load Testing - Test #1](#scraping-load-testing---test-1)
- [Summary](#summary)
- [Table of Contents](#table-of-contents)
- [Test environment](#test-environment)
- [Machines](#machines)
- [Load #1 - 600 reqs 60 secs (initial load only)](#load-1---600-reqs-60-secs-initial-load-only)
- [Archillery Report](#archillery-report)
- [CPU Utilization](#cpu-utilization)
- [Memory Utilization](#memory-utilization)
- [Conclusions and Next Steps](#conclusions-and-next-steps)
- [Conclusions](#conclusions)
- [Next Steps](#next-steps)
## Test environment
### Machines
| Machine | Size/CPU |
|---|---|
| e286de4f711e86 mia (app) | performance-cpu-1x@2048MB |
| 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB |
---
## Load #1 - 600 reqs 60 secs (initial load only)
```yml
# load-test.yml
- duration: 60
arrivalRate: 10 # Initial load
```
### Archillery Report
Date: 10:49:39(-0300)
| Metric | Value |
|---------------------------------------------|---------|
| http.codes.200 | 600 |
| http.downloaded_bytes | 0 |
| http.request_rate | 10/sec |
| http.requests | 600 |
| http.response_time.min | 984 |
| http.response_time.max | 2267 |
| http.response_time.mean | 1380.1 |
| http.response_time.median | 1353.1 |
| http.response_time.p95 | 1755 |
| http.response_time.p99 | 2059.5 |
| http.responses | 600 |
| vusers.completed | 600 |
| vusers.created | 600 |
| vusers.created_by_name.Scrape a URL | 600 |
| vusers.failed | 0 |
| vusers.session_length.min | 1053.7 |
| vusers.session_length.max | 2332.6 |
| vusers.session_length.mean | 1447.4 |
| vusers.session_length.median | 1436.8 |
| vusers.session_length.p95 | 1863.5 |
| vusers.session_length.p99 | 2143.5 |
### CPU Utilization
![](./assets/CPU-utilization-report-test-1.png)
Both machines peaked at around 50% CPU utilization.
### Memory Utilization
![](./assets/memory-utilization-report-test-1.png)
| Machine | Before | After Load Test |
|---|---|---|
| e286de4f711e86 | 295 MiB | 358 MiB |
| 73d8dd909c1189 | 296 MiB | 355 MiB |
Notice that the memory utilization has not re-stabilished to the pre-test values during the check window, which may indicate a memory leak problem.
---
## Conclusions and Next Steps
### Conclusions
1. **Performance:** The system handled 600 requests in 60 seconds with a mean response time of 1380.1 ms. All requests were successful (HTTP 200).
2. **CPU Utilization:** Both machines peaked at around 50% CPU utilization, indicating that the CPU resources were sufficient for the load.
3. **Memory Utilization:** There was a noticeable increase in memory usage on both machines post-test, and the memory did not re-stabilize to pre-test levels, suggesting a potential memory leak.
### Next Steps
1. **Investigate Memory Leak:** Conduct a detailed analysis to identify and fix the potential memory leak. This may involve profiling the application and reviewing the code for memory management issues.
2. **Additional Load Tests:** Perform additional load tests with varying request rates and durations to further assess the system's performance and stability.
3. **Optimize Performance:** Based on the findings, optimize the application to improve response times and resource utilization.
4. **Monitor in Production:** Implement monitoring in the production environment to ensure that similar issues do not occur under real-world conditions.
5. **Documentation:** Update the documentation with the findings and any changes made to the system as a result of this test.
By following these steps, we can ensure that the system is robust, efficient, and ready to handle production workloads.

View File

@ -0,0 +1,93 @@
# Scraping Load Testing - Test #2
## Summary
The load test encountered significant issues, processing 9000 requests with 5473 timeouts and a 61.6% failure rate. The average response time was 3682.1 ms, with a peak response time of 9919 ms. Both machines reached 100% CPU utilization, leading to severe performance bottlenecks and high failure rates. This indicates the need for substantial optimizations, autoscaling, and further investigation.
## Table of Contents
- [Scraping Load Testing - Test #2](#scraping-load-testing---test-2)
- [Summary](#summary)
- [Table of Contents](#table-of-contents)
- [Test environment](#test-environment)
- [Machines](#machines)
- [Load #2 - 9000 reqs 7 mins 11 secs (4 phases)](#load-2---9000-reqs-7-mins-11-secs-4-phases)
- [Archillery Report](#archillery-report)
- [Metrics](#metrics)
- [Conclusions and Next Steps](#conclusions-and-next-steps)
- [Conclusions](#conclusions)
- [Next Steps](#next-steps)
## Test environment
### Machines
| Machine | Size/CPU |
|---|---|
| e286de4f711e86 mia (app) | performance-cpu-1x@2048MB |
| 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB |
---
## Load #2 - 9000 reqs 7 mins 11 secs (4 phases)
```yml
# load-test.yml
- duration: 60
arrivalRate: 10 # Initial load
- duration: 120
arrivalRate: 20 # Increased load
- duration: 180
arrivalRate: 30 # Peak load
- duration: 60
arrivalRate: 10 # Cool down
```
### Archillery Report
Date: 13:50:08(-0300)
| Metric | Value |
|---------------------------------------------|---------|
| errors.ETIMEDOUT | 5473 |
| errors.Failed capture or match | 73 |
| http.codes.200 | 3454 |
| http.codes.401 | 64 |
| http.codes.402 | 9 |
| http.downloaded_bytes | 0 |
| http.request_rate | 21/sec |
| http.requests | 9000 |
| http.response_time.min | 929 |
| http.response_time.max | 9919 |
| http.response_time.mean | 3682.1 |
| http.response_time.median | 3395.5 |
| http.response_time.p95 | 8024.5 |
| http.response_time.p99 | 9607.1 |
| http.responses | 3527 |
| vusers.completed | 3454 |
| vusers.created | 9000 |
| vusers.created_by_name.Scrape a URL | 9000 |
| vusers.failed | 5546 |
| vusers.session_length.min | 1127.6 |
| vusers.session_length.max | 9982.2 |
| vusers.session_length.mean | 3730.6 |
| vusers.session_length.median | 3464.1 |
| vusers.session_length.p95 | 7865.6 |
| vusers.session_length.p99 | 9607.1 |
### Metrics
![](./assets/metrics-test-2.png)
Both machines reached 100% CPU utilization, which led to a significant number of request failures (61.6% failure rate).
---
## Conclusions and Next Steps
### Conclusions
1. **Performance:** The system struggled with 9000 requests, resulting in 5473 timeouts and a mean response time of 3682.1 ms.
2. **CPU Utilization:** Both machines experienced 100% CPU utilization, causing severe performance degradation and high failure rates.
### Next Steps
Implement an autoscaling solution on Fly.io and conduct tests using the same configurations.

View File

@ -0,0 +1,107 @@
# Scraping Load Testing - Test #3
## Summary
The load test involved setting up an autoscaling option and adjusting the hard and soft limits for the Fly.io configuration. The test environment consisted of 5 machines, with 3 machines automatically scaling up during the test. Despite the scaling, there were 653 timeouts (7.3%) and 2 HTTP 502 responses (0.02%). The average response time was 3037.2 ms, with a peak response time of 9941 ms. Further adjustments to the soft limit are recommended to improve performance and reduce errors.
## Table of Contents
- [Scraping Load Testing - Test #3](#scraping-load-testing---test-3)
- [Summary](#summary)
- [Table of Contents](#table-of-contents)
- [Test environment](#test-environment)
- [Machines](#machines)
- [Load Test Phases](#load-test-phases)
- [Configuration](#configuration)
- [Results](#results)
- [Metrics](#metrics)
- [Conclusions and Next Steps](#conclusions-and-next-steps)
- [Conclusions](#conclusions)
- [Next Steps](#next-steps)
## Test environment
### Machines
| Machine | Size/CPU | Status |
|---|---|---|
| e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on |
| 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on |
| 6e82050c726358 mia (app) | performance-cpu-1x@2048MB | paused |
| 4d89505a6e5038 mia (app) | performance-cpu-1x@2048MB | paused |
| 48ed6e6b74e378 mia (app) | performance-cpu-1x@2048MB | paused |
---
## Load Test Phases
### Configuration
```toml
# fly.staging.toml
[http_service.concurrency]
type = "requests"
hard_limit = 100
soft_limit = 75
```
```yml
# load-test.yml
- duration: 60
arrivalRate: 10 # Initial load
- duration: 120
arrivalRate: 20 # Increased load
- duration: 180
arrivalRate: 30 # Peak load
- duration: 60
arrivalRate: 10 # Cool down
```
### Results
Date: 14:53:32(-0300)
| Metric | Value |
|---------------------------------------------|---------|
| errors.ETIMEDOUT | 653 |
| errors.Failed capture or match | 2 |
| http.codes.200 | 8345 |
| http.codes.502 | 2 |
| http.downloaded_bytes | 0 |
| http.request_rate | 11/sec |
| http.requests | 9000 |
| http.response_time.min | 979 |
| http.response_time.max | 9941 |
| http.response_time.mean | 3037.2 |
| http.response_time.median | 2059.5 |
| http.response_time.p95 | 7709.8 |
| http.response_time.p99 | 9416.8 |
| http.responses | 8347 |
| vusers.completed | 8345 |
| vusers.created | 9000 |
| vusers.created_by_name.Scrape a URL | 9000 |
| vusers.failed | 655 |
| vusers.session_length.min | 1044.5 |
| vusers.session_length.max | 9998.8 |
| vusers.session_length.mean | 3109.7 |
| vusers.session_length.median | 2143.5 |
| vusers.session_length.p95 | 7709.8 |
| vusers.session_length.p99 | 9416.8 |
### Metrics
![](./assets/metrics-test-3.png)
---
## Conclusions and Next Steps
### Conclusions
1. **Performance:** The system handled 9000 requests with a mean response time of 3037.2 ms. There were 653 timeouts and 2 HTTP 502 responses.
2. **Autoscaling:** Three machines automatically scaled up during the test, but the scaling was not sufficient to prevent all errors.
3. **Response Times:** The peak response time was 9941 ms, indicating that the system struggled under peak load conditions.
### Next Steps
1. **Adjust Soft Limit:** Change the soft limit to 100 and the hard limit to 50 to test if machines will start faster and reduce the number of 502 errors.
2. **Further Load Tests:** Conduct additional load tests with the new configuration to assess improvements.
By following these steps, we can enhance the system's performance and reliability under varying load conditions.

View File

@ -0,0 +1,103 @@
# Scraping Load Testing - Test #4
## Summary
The load test was conducted with the Fly.io configuration set to a hard limit of 100 and a soft limit of 50. The test involved four phases with varying arrival rates. Despite the adjustments, there were 1329 timeouts (14.8%) but no HTTP 502 responses. The average response time was 3547.9 ms, with a peak response time of 9935 ms. Further adjustments to the artillery timeout configuration are recommended to improve performance.
## Table of Contents
- [Scraping Load Testing - Test #4](#scraping-load-testing---test-4)
- [Summary](#summary)
- [Table of Contents](#table-of-contents)
- [Test environment](#test-environment)
- [Machines](#machines)
- [Load Test Phases](#load-test-phases)
- [Configuration](#configuration)
- [Results](#results)
- [Metrics](#metrics)
- [Conclusions and Next Steps](#conclusions-and-next-steps)
- [Conclusions](#conclusions)
- [Next Steps](#next-steps)
## Test environment
### Machines
| Machine | Size/CPU | Status |
|---|---|---|
| e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on |
| 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on |
| 6e82050c726358 mia (app) | performance-cpu-1x@2048MB | paused |
| 4d89505a6e5038 mia (app) | performance-cpu-1x@2048MB | paused |
| 48ed6e6b74e378 mia (app) | performance-cpu-1x@2048MB | paused |
---
## Load Test Phases
### Configuration
```toml
# fly.staging.toml
[http_service.concurrency]
type = "requests"
hard_limit = 100
soft_limit = 50
```
```yml
# load-test.yml
- duration: 60
arrivalRate: 10 # Initial load
- duration: 120
arrivalRate: 20 # Increased load
- duration: 180
arrivalRate: 30 # Peak load
- duration: 60
arrivalRate: 10 # Cool down
```
### Results
Date: 15:43:26(-0300)
| Metric | Value |
|---------------------------------------------|---------|
| errors.ETIMEDOUT | 1329 |
| http.codes.200 | 7671 |
| http.downloaded_bytes | 0 |
| http.request_rate | 23/sec |
| http.requests | 9000 |
| http.response_time.min | 999 |
| http.response_time.max | 9935 |
| http.response_time.mean | 3547.9 |
| http.response_time.median | 2836.2 |
| http.response_time.p95 | 8352 |
| http.response_time.p99 | 9607.1 |
| http.responses | 7671 |
| vusers.completed | 7671 |
| vusers.created | 9000 |
| vusers.created_by_name.Scrape a URL | 9000 |
| vusers.failed | 1329 |
| vusers.session_length.min | 1063.4 |
| vusers.session_length.max | 10006.8 |
| vusers.session_length.mean | 3616 |
| vusers.session_length.median | 2893.5 |
| vusers.session_length.p95 | 8352 |
| vusers.session_length.p99 | 9607.1 |
## Metrics
![](./assets/metrics-test-4.png)
---
## Conclusions and Next Steps
### Conclusions
1. **Performance:** The system handled 9000 requests with a mean response time of 3547.9 ms. There were 1329 timeouts but no HTTP 502 responses.
2. **Response Times:** The peak response time was 9935 ms, indicating that the system struggled under peak load conditions.
### Next Steps
1. **Adjust Timeout Configuration:** Change the artillery timeout configuration to reduce the number of timeouts.
2. **Further Load Tests:** Conduct additional load tests with the new timeout configuration to assess improvements.
By following these steps, we can enhance the system's performance and reliability under varying load conditions.

View File

@ -0,0 +1,94 @@
# Scraping Load Testing - Test #5
## Summary
The load test was conducted with a higher timeout configuration to address previous timeout issues. The test involved 9000 requests with a timeout set to 30 seconds. The system handled the load well, with only 4 HTTP 502 responses (0.04%). The average response time was 5661.8 ms, with a peak response time of 18924 ms. Further analysis is recommended to optimize response times.
## Table of Contents
- [Scraping Load Testing - Test #5](#scraping-load-testing---test-5)
- [Summary](#summary)
- [Table of Contents](#table-of-contents)
- [Test environment](#test-environment)
- [Machines](#machines)
- [Load Test Configuration](#load-test-configuration)
- [Configuration](#configuration)
- [Results](#results)
- [Metrics](#metrics)
- [Conclusions and Next Steps](#conclusions-and-next-steps)
- [Conclusions](#conclusions)
- [Next Steps](#next-steps)
## Test environment
### Machines
| Machine | Size/CPU | Status |
|---|---|---|
| e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on |
| 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on |
| 6e82050c726358 mia (app) | performance-cpu-1x@2048MB | paused |
| 4d89505a6e5038 mia (app) | performance-cpu-1x@2048MB | paused |
| 48ed6e6b74e378 mia (app) | performance-cpu-1x@2048MB | paused |
---
## Load Test Configuration
### Configuration
```yml
http:
timeout: 30
```
### Results
Date: 15:59:50(-0300)
| Metric | Value |
|---------------------------------------------|---------|
| errors.Failed capture or match | 4 |
| http.codes.200 | 8996 |
| http.codes.502 | 4 |
| http.downloaded_bytes | 0 |
| http.request_rate | 23/sec |
| http.requests | 9000 |
| http.response_time.min | 62 |
| http.response_time.max | 18924 |
| http.response_time.mean | 5661.8 |
| http.response_time.median | 5378.9 |
| http.response_time.p95 | 11050.8 |
| http.response_time.p99 | 12968.3 |
| http.responses | 9000 |
| vusers.completed | 8996 |
| vusers.created | 9000 |
| vusers.created_by_name.Scrape a URL | 9000 |
| vusers.failed | 4 |
| vusers.session_length.min | 1079.2 |
| vusers.session_length.max | 18980.3 |
| vusers.session_length.mean | 5734.4 |
| vusers.session_length.median | 5487.5 |
| vusers.session_length.p95 | 11050.8 |
| vusers.session_length.p99 | 12968.3 |
### Metrics
![](./assets/metrics-test-5.png)
---
## Conclusions and Next Steps
### Conclusions
1. **Performance:** The system handled 9000 requests with a mean response time of 5661.8 ms. There were only 4 HTTP 502 responses which represent a 0.04% failure rate.
2. **Response Times:** The peak response time was 18924 ms, indicating that while the system handled the load, there is room for optimization.
### Next Steps
2. **Testing Scraping Strategies:** Conduct further testing on the Playwright instance to ensure it can handle increased load and identify any potential bottlenecks.
3. **Load Testing Other Functionalities:** Evaluate the performance of other critical routes, such as the crawl route, through additional load tests to ensure comprehensive system reliability.
4. **Optimize Response Times:** Investigate and implement strategies to reduce the peak response time from 18924 ms. This could involve optimizing database queries, improving server configurations, or enhancing caching mechanisms.
5. **Error Handling Improvements:** Analyze the causes of the 4 HTTP 502 responses and implement robust error handling and recovery mechanisms to minimize such occurrences in future tests.
6. **Scalability Assessment:** Assess the system's scalability by gradually increasing the load beyond 9000 requests to determine its breaking point and plan for necessary infrastructure upgrades.
By following these steps, we can further enhance the system's performance and reliability under varying load conditions.

Binary file not shown.

After

Width:  |  Height:  |  Size: 124 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 78 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 97 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 216 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 87 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 91 KiB

View File

@ -0,0 +1,104 @@
# Load Testing Crawl Routes - Test #6
## Summary
The load test was conducted with a duration of 10 minutes and an arrival rate of 10 requests per second. The system handled the load well, with no failed requests. The average response time was 838.1 ms, with a peak response time of 1416 ms. Further analysis is recommended to optimize response times and assess the impact of higher loads.
## Table of Contents
- [Load Testing Crawl Routes - Test #6](#load-testing-crawl-routes---test-6)
- [Summary](#summary)
- [Table of Contents](#table-of-contents)
- [Test environment](#test-environment)
- [Machines](#machines)
- [Load Test Configuration](#load-test-configuration)
- [Configuration](#configuration)
- [Results](#results)
- [Metrics](#metrics)
- [Conclusions and Next Steps](#conclusions-and-next-steps)
- [Conclusions](#conclusions)
- [Next Steps](#next-steps)
## Test environment
### Machines
| Machine | Size/CPU | Status |
|---|---|---|
| 06e825d0da2387 mia (worker) | performance-cpu-1x@2048MB | always on |
| 178134db566489 mia (worker) | performance-cpu-1x@2048MB | always on |
| 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on |
| e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on |
Other app machines with autoscaling shouldn't start during crawl tests.
---
## Load Test Configuration
### Configuration
```yml
# load-test.yml
- duration: 10
arrivalRate: 10
```
### Results
Date: 16:00:06(-0300)
| Metric | Value |
|---------------------------------------------|---------|
| http.codes.200 | 200 |
| http.downloaded_bytes | 0 |
| http.request_rate | 10/sec |
| http.requests | 200 |
| http.response_time.min | 687 |
| http.response_time.max | 1416 |
| http.response_time.mean | 838.1 |
| http.response_time.median | 788.5 |
| http.response_time.p95 | 1085.9 |
| http.response_time.p99 | 1274.3 |
| http.responses | 200 |
| vusers.completed | 100 |
| vusers.created | 100 |
| vusers.created_by_name.Crawl a URL | 100 |
| vusers.failed | 0 |
| vusers.session_length.min | 11647.5 |
| vusers.session_length.max | 12310 |
| vusers.session_length.mean | 11812.7 |
| vusers.session_length.median | 11734.2 |
| vusers.session_length.p95 | 11971.2 |
| vusers.session_length.p99 | 12213.1 |
### Metrics
![](./assets/metrics-test-6.png)
**CPU Utilization:**
- **App machines:** Less than 2.3% CPU utilization with no changes in memory utilization.
- **Worker machines:** High CPU utilization for over 4 minutes and 45 seconds, with 56% (peaking at 75.8%) on 178134db566489 and 40% (peaking at 62.7%) on 06e825d0da2387.
**Memory Utilization:**
- **App machines:** No relevant changes during the tests.
- **Worker machines:**
- 06e825d0da2387: From 359MiB to over 388MiB during 4 minutes and 45 seconds (peaking at 461MiB).
- 178134db566489: From 366MiB to over 449MiB during 4 minutes and 45 seconds (peaking at 523MiB).
---
## Conclusions and Next Steps
### Conclusions
1. **Performance:** The system handled 200 requests with a mean response time of 838.1 ms. There were no failed requests.
2. **Response Times:** The peak response time was 1416 ms, indicating that while the system handled the load, there is room for optimization.
### Next Steps
1. **Higher Load Testing:** Conduct further testing with higher loads to assess the system's performance under increased stress.
2. **Optimize Response Times:** Investigate and implement strategies to reduce the peak response time from 1416 ms. This could involve optimizing database queries, improving server configurations, or enhancing caching mechanisms.
3. **Scalability Assessment:** Assess the system's scalability by gradually increasing the load beyond the current configuration to determine its breaking point and plan for necessary infrastructure upgrades.
By following these steps, we can further enhance the system's performance and reliability under varying load conditions.

View File

@ -0,0 +1,127 @@
# Load Testing Crawl Routes - Test #7
## Summary
This load test, conducted over a period of 7 minutes with an extended observation, aimed to evaluate the system's performance under variable loads. Although the system was able to queue all requests successfully and no requests failed, the test was prematurely terminated due to a critical failure in the fire-engine machines after 22 minutes. This incident revealed significant vulnerabilities in handling sustained loads, specifically related to resource management.
## Table of Contents
- [Load Testing Crawl Routes - Test #7](#load-testing-crawl-routes---test-7)
- [Summary](#summary)
- [Table of Contents](#table-of-contents)
- [Test environment](#test-environment)
- [Machines](#machines)
- [Load Test Configuration](#load-test-configuration)
- [Configuration](#configuration)
- [Results](#results)
- [Metrics](#metrics)
- [Conclusions and Next Steps](#conclusions-and-next-steps)
- [Conclusions](#conclusions)
- [Next Steps](#next-steps)
## Test environment
### Machines
| Machine | Size/CPU | Status |
|---|---|---|
| 06e825d0da2387 mia (worker) | performance-cpu-1x@2048MB | always on |
| 178134db566489 mia (worker) | performance-cpu-1x@2048MB | always on |
| 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on |
| e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on |
fire-engine machines:
| Machine | Size/CPU | Status |
|---|---|---|
| 2874d0db0e5258 mia app | performance-cpu-2x@4096MB | always on |
| 48ed194f7de258 mia app | performance-cpu-2x@4096MB | always on |
| 56830d45f70218 sjc app | performance-cpu-2x@4096MB | initialized during the test |
---
## Load Test Configuration
### Configuration
```yml
phases:
- duration: 60
arrivalRate: 1 # Initial load
- duration: 120
arrivalRate: 2 # Increased load
- duration: 180
arrivalRate: 3 # Peak load
- duration: 60
arrivalRate: 1 # Cool down
```
using fire-engine as default scraping strategy
```yml
NUM_WORKERS_PER_QUEUE=8
```
### Results
Date: 17:31:33(-0300)
| Metric | Value |
|---------------------------------------------|---------|
| http.codes.200 | 1800 |
| http.downloaded_bytes | 0 |
| http.request_rate | 3/sec |
| http.requests | 1800 |
| http.response_time.min | 711 |
| http.response_time.max | 5829 |
| http.response_time.mean | 849.2 |
| http.response_time.median | 804.5 |
| http.response_time.p95 | 1043.3 |
| http.response_time.p99 | 1274.3 |
| http.responses | 1800 |
| vusers.completed | 900 |
| vusers.created | 900 |
| vusers.created_by_name.Crawl a URL | 900 |
| vusers.failed | 0 |
| vusers.session_length.min | 11637 |
| vusers.session_length.max | 16726.1 |
| vusers.session_length.mean | 11829.5 |
| vusers.session_length.median | 11734.2 |
| vusers.session_length.p95 | 12213.1 |
| vusers.session_length.p99 | 12213.1 |
### Metrics
![](./assets/metrics-fire-engine-test-7.png)
![](./assets/metrics-fire-engine-test-7-2.png)
![](./assets/metrics-test-7.png)
**CPU Utilization:**
- **Fire-engine mia machines:** Reached 100% after 22 minutes of processing the queue. The sjc machine was not requested during the test.
- **Worker machines:** Maintained CPU utilization above 71% during the load testing time.
**Memory Utilization:**
- **Fire-engine mia machines:** utilization reached 100% after 22 minutes of processing the queue.
- **Worker machines:** Maintained Memory utilization above 700MiB during the test.
---
## Conclusions and Next Steps
### Conclusions
1. **Request Handling:** The system effectively managed to queue all requests, demonstrating its capability to handle the initial setup of traffic without any failures.
2. **Critical Failures:** The abrupt failure of the fire-engine machines part-way through the test underscores a significant stability issue, directly impacting the ability to continue operations under load.
3. **Resource Management Deficiencies:** The failure was linked to insufficient resource management, particularly memory handling, which necessitates immediate attention to prevent future disruptions.
### Next Steps
1. **Increase Workers per Machine:** The number of workers per worker machine will be increased from 8 to 12. This change aims to enhance the processing capability of each machine, potentially reducing response times and handling larger volumes of requests more efficiently.
2. **Implement Autoscaling:** Introduce autoscaling capabilities to dynamically adjust the number of active machines based on the current load. This will help in maintaining optimal performance and prevent system overloads by automatically scaling resources up during peak demands and down during low usage periods.
3. **Enhanced Resource Management:** With the increase in workers and the implementation of autoscaling, it is crucial to optimize resource management strategies. This involves improving memory handling and cleanup processes to ensure that resource allocation and recovery are efficient and effective, particularly under sustained high loads.
4. **Extended Duration Testing:** Conduct further tests with extended durations to evaluate the impact of the increased number of workers and autoscaling on system stability and performance. These tests should focus on assessing how well the system sustains operational efficiency over longer periods and under varying load conditions.
5. **Monitor and Optimize:** Continuously monitor system performance during the new tests, particularly focusing on the effects of the increased worker count and autoscaling. Use the gathered data to optimize configurations and troubleshoot any new issues that arise, ensuring the system is fine-tuned for both high performance and reliability.
By following these steps, we can further enhance the system's performance and reliability under varying load conditions.

View File

@ -0,0 +1,116 @@
# Load Testing Crawl Routes - Test #8
## Summary
This load test, conducted over a period of 7 minutes with an extended observation, aimed to evaluate the system's performance under variable loads. The test revealed that while the system managed to handle the initial load effectively, there were issues with autoscaling and resource management that need to be addressed.
## Table of Contents
- [Load Testing Crawl Routes - Test #8](#load-testing-crawl-routes---test-8)
- [Summary](#summary)
- [Table of Contents](#table-of-contents)
- [Load Test Configuration](#load-test-configuration)
- [Configuration](#configuration)
- [Results](#results)
- [Metrics](#metrics)
- [Conclusions and Next Steps](#conclusions-and-next-steps)
- [Conclusions](#conclusions)
- [Next Steps](#next-steps)
## Load Test Configuration
### Configuration
| Machine | Size/CPU | Status |
|---|---|---|
| 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on |
| e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on |
| 178134db566489 mia (worker) | performance-cpu-1x@2048MB | always on |
| 7811020c91d138 mia (worker) | performance-cpu-1x@2048MB | always on
178134db566489 mia (worker) | performance-cpu-1x@2048MB | stopped
| 06e825d0da2387 mia (worker) | performance-cpu-1x@2048MB | stopped |
fire-engine machines:
| Machine | Size/CPU | Status |
|---|---|---|
| 2874d0db0e5258 mia app | performance-cpu-2x@4096MB | always on |
| 48ed194f7de258 mia app | performance-cpu-2x@4096MB | always on |
| 56830d45f70218 sjc app | performance-cpu-2x@4096MB | always on |
```yml
phases:
- duration: 60
arrivalRate: 1 # Initial load
- duration: 120
arrivalRate: 2 # Increased load
- duration: 180
arrivalRate: 3 # Peak load
- duration: 60
arrivalRate: 1 # Cool down
```
using fire-engine as default scraping strategy
```yml
NUM_WORKERS_PER_QUEUE=12
```
### Results
Date: 14:42:27(-0300)
| Metric | Value |
|---------------------------------------------|---------|
| errors.Failed capture or match | 43 |
| http.codes.200 | 1757 |
| http.codes.404 | 43 |
| http.downloaded_bytes | 0 |
| http.request_rate | 3/sec |
| http.requests | 1800 |
| http.response_time.min | 363 |
| http.response_time.max | 6065 |
| http.response_time.mean | 847.8 |
| http.response_time.median | 804.5 |
| http.response_time.p95 | 1130.2 |
| http.response_time.p99 | 1353.1 |
| http.responses | 1800 |
| vusers.completed | 857 |
| vusers.created | 900 |
| vusers.created_by_name.Crawl a URL | 900 |
| vusers.failed | 43 |
| vusers.session_length.min | 11598.4 |
| vusers.session_length.max | 17005.3 |
| vusers.session_length.mean | 11854.1 |
| vusers.session_length.median | 11734.2 |
| vusers.session_length.p95 | 12213.1 |
| vusers.session_length.p99 | 12459.8 |
### Metrics
![](./assets/metrics-fire-engine-test-8.png)
![](./assets/metrics-test-8.png)
**CPU Utilization:**
- **Fire-engine mia machines:** Reached 99% after 16 minutes of processing the queue, but dropped to 0% after the queue was fully processed. The sjc machine was not requested during the
- **Worker machines:** Maintained CPU utilization above 89% during the load testing time. The higher CPU value indicates an accurate maximum number of workers/machine (12). Other worker machines were not autoscaled for solving the queue as spected.
**Memory Utilization:**
- **Fire-engine mia machines:** utilization reached 92% after 16 minutes of processing the queue.
- **Worker machines:** Maintained Memory utilization above 650MiB during the test.
## Conclusions and Next Steps
### Conclusions
1. **Request Handling:** The system effectively managed to queue all requests, demonstrating its capability to handle the initial setup of traffic without any failures.
2. **Autoscaling Issues:** The other worker machines should have been turned on, but the autoscaling strategy did not work as expected.
3. **Resource Management:** The system maintained high CPU and memory utilization, indicating efficient use of resources, but the autoscaling failure needs to be addressed.
### Next Steps
1. **Investigate Autoscaling:** Investigate why the autoscaling strategy did not work properly and ensure that additional worker machines are turned on as needed.
2. **Optimize Autoscaling:** Implement and test improvements to the autoscaling strategy to ensure it dynamically adjusts the number of active machines based on the current load.
3. **Extended Duration Testing:** Conduct further tests with extended durations to evaluate the impact of the improved autoscaling strategy on system stability and performance.
4. **Monitor and Optimize:** Continuously monitor system performance during the new tests, focusing on the effects of the autoscaling improvements. Use the gathered data to optimize configurations and troubleshoot any new issues that arise.
By following these steps, we can further enhance the system's performance and reliability under varying load conditions.

View File

@ -0,0 +1,77 @@
config:
target: "https://staging-firecrawl-scraper-js.fly.dev/v0"
http:
timeout: 30
phases:
# /scrape
# - duration: 60
# arrivalRate: 10 # Initial load
# - duration: 120
# arrivalRate: 20 # Increased load
# - duration: 180
# arrivalRate: 30 # Peak load
# - duration: 60
# arrivalRate: 10 # Cool down
# /crawl
- duration: 60
arrivalRate: 1 # Initial load
- duration: 120
arrivalRate: 2 # Increased load
- duration: 180
arrivalRate: 3 # Peak load
- duration: 60
arrivalRate: 1 # Cool down
defaults:
headers:
Authorization: "Bearer YOUR_API_KEY"
scenarios:
# - name: Scrape a URL
# flow:
# - post:
# url: "/scrape"
# json:
# url: "https://www.scrapethissite.com"
# pageOptions:
# onlyMainContent: true
# capture:
# - json: "$.data.markdown"
# as: markdown_content
- name: Crawl a URL
flow:
- post:
url: "/crawl"
json:
url: "https://rsseau.fr"
crawlerOptions:
limit: 100
pageOptions:
onlyMainContent: true
capture:
- json: "$.jobId"
as: job_id
- think: 10
- get:
url: "/crawl/status/{{ job_id }}"
capture:
- json: "$.status"
as: crawl_status
until:
- condition: "equals"
value: "completed"
variable: "crawl_status"
retry:
count: 20
wait: 10
# - name: Search for a query
# flow:
# - post:
# url: "/search"
# json:
# query: "firecrawl"
# pageOptions:
# fetchPageContent: true
# capture:
# - json: "$.data[0].markdown"
# as: search_markdown_content

View File

@ -3,7 +3,8 @@
"version": "1.0.0",
"description": "",
"scripts": {
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false",
"test:suite": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false",
"test:load": "artillery run --output ./load-test-results/test-run-report.json load-test.yml",
"test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts",
"test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts"
},