mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00

Compare commits: d9c4bb5aaf...2b26c725a4 (1 commit)

Commit 2b26c725a4

.github/archive/js-sdk.yml (vendored, 2 changes)

@@ -8,6 +8,7 @@ env:
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -20,6 +21,7 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1

jobs:

.github/archive/python-sdk.yml (vendored, 2 changes)

@@ -8,6 +8,7 @@ env:
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -20,6 +21,7 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1

jobs:

.github/archive/rust-sdk.yml (vendored, 2 changes)

@@ -8,6 +8,7 @@ env:
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -20,6 +21,7 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1

.github/workflows/ci.yml (vendored, 2 changes)

@@ -12,6 +12,7 @@ env:
  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  HOST: ${{ secrets.HOST }}
  LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
  LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
  NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@@ -24,6 +25,7 @@ env:
  SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
  SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
  TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
  HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
  HDX_NODE_BETA_MODE: 1
  FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
  USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}

.github/workflows/deploy-image-staging.yml (vendored, 32 changes)

@@ -1,32 +0,0 @@
name: STAGING Deploy Images to GHCR

env:
  DOTNET_VERSION: '6.0.x'

on:
  push:
    branches:
      - mog/webscraper-refactor
  workflow_dispatch:

jobs:
  push-app-image:
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: './apps/api'
    steps:
      - name: 'Checkout GitHub Action'
        uses: actions/checkout@main

      - name: 'Login to GitHub Container Registry'
        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{github.actor}}
          password: ${{secrets.GITHUB_TOKEN}}

      - name: 'Build Inventory Image'
        run: |
          docker build . --tag ghcr.io/mendableai/firecrawl-staging:latest
          docker push ghcr.io/mendableai/firecrawl-staging:latest

.github/workflows/deploy-image.yml (vendored, 3 changes)

@@ -2,7 +2,6 @@ name: Deploy Images to GHCR

env:
  DOTNET_VERSION: '6.0.x'
  SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}

on:
  push:
@@ -29,5 +28,5 @@ jobs:
      - name: 'Build Inventory Image'
        run: |
          docker build . --tag ghcr.io/mendableai/firecrawl:latest --secret id=SENTRY_AUTH_TOKEN
          docker build . --tag ghcr.io/mendableai/firecrawl:latest
          docker push ghcr.io/mendableai/firecrawl:latest

@@ -41,6 +41,7 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages

@@ -62,6 +62,7 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages

@@ -33,6 +33,8 @@ SCRAPING_BEE_API_KEY=
# add for LLM dependednt features (image alt generation, etc.)
OPENAI_API_KEY=
BULL_AUTH_KEY=@
# use if you're configuring basic logging with logtail
LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you'd like to send slack server health status messages
@@ -52,6 +54,9 @@ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
STRIPE_PRICE_ID_GROWTH=
STRIPE_PRICE_ID_GROWTH_YEARLY=

HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1

# set if you'd like to use the fire engine closed beta
FIRE_ENGINE_BETA_URL=

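The two hunks above touch sample environment-variable listings. As a rough illustration only (not part of the diff), a minimal TypeScript sketch of reading these variables at startup, assuming dotenv's configDotenv as used elsewhere in this compare; the config object and its field names are hypothetical:

// Hypothetical config loader for the variables shown above (illustrative only).
import { configDotenv } from "dotenv";

configDotenv();

const config = {
  openaiApiKey: process.env.OPENAI_API_KEY ?? "",            // LLM-dependent features
  bullAuthKey: process.env.BULL_AUTH_KEY ?? "",
  logtailKey: process.env.LOGTAIL_KEY ?? "",                 // basic logging with logtail
  llamaparseApiKey: process.env.LLAMAPARSE_API_KEY ?? "",    // PDF parsing
  hyperdxApiKey: process.env.HYPERDX_API_KEY ?? "",
  fireEngineBetaUrl: process.env.FIRE_ENGINE_BETA_URL ?? "", // fire engine closed beta
};

export default config;
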
@@ -1 +1 @@
// global.fetch = require('jest-fetch-mock');
global.fetch = require('jest-fetch-mock');

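The one-line hunk above toggles whether jest-fetch-mock is wired into the global fetch. A minimal sketch of a test that would rely on that setup, assuming jest-fetch-mock's standard mockResponseOnce API; the test name and stubbed payload are illustrative:

// Illustrative only: stubbing fetch in a test once the setup file assigns
// global.fetch = require('jest-fetch-mock').
const fetchMock = require("jest-fetch-mock");

test("reads a stubbed scrape response", async () => {
  // Queue a fake JSON body for the next fetch call.
  fetchMock.mockResponseOnce(JSON.stringify({ success: true, data: { markdown: "# hi" } }));

  const res = await fetch("http://localhost:3002/v1/scrape", { method: "POST" });
  const body = await res.json();

  expect(body.success).toBe(true);
});
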
@@ -32,11 +32,9 @@
    "@tsconfig/recommended": "^1.0.3",
    "@types/body-parser": "^1.19.2",
    "@types/cors": "^2.8.13",
    "@types/escape-html": "^1.0.4",
    "@types/express": "^4.17.17",
    "@types/jest": "^29.5.12",
    "@types/node": "^20.14.1",
    "@types/pdf-parse": "^1.1.4",
    "body-parser": "^1.20.1",
    "express": "^4.18.2",
    "jest": "^29.6.3",
@@ -55,7 +53,9 @@
    "@bull-board/api": "^5.20.5",
    "@bull-board/express": "^5.20.5",
    "@devil7softwares/pos": "^1.0.2",
    "@dqbd/tiktoken": "^1.0.16",
    "@dqbd/tiktoken": "^1.0.13",
    "@hyperdx/node-opentelemetry": "^0.8.1",
    "@logtail/node": "^0.4.12",
    "@nangohq/node": "^0.40.8",
    "@sentry/cli": "^2.33.1",
    "@sentry/node": "^8.26.0",
@@ -78,9 +78,9 @@
    "date-fns": "^3.6.0",
    "dotenv": "^16.3.1",
    "dotenv-cli": "^7.4.2",
    "escape-html": "^1.0.3",
    "express-rate-limit": "^7.3.1",
    "express-ws": "^5.0.2",
    "form-data": "^4.0.0",
    "glob": "^10.4.2",
    "gpt3-tokenizer": "^1.1.5",
    "ioredis": "^5.4.1",
@@ -92,7 +92,6 @@
    "languagedetect": "^2.0.0",
    "logsnag": "^1.0.0",
    "luxon": "^3.4.3",
    "marked": "^14.1.2",
    "md5": "^2.3.0",
    "moment": "^2.29.4",
    "mongoose": "^8.4.4",
@@ -113,11 +112,8 @@
    "turndown": "^7.1.3",
    "turndown-plugin-gfm": "^1.0.2",
    "typesense": "^1.5.4",
    "undici": "^6.20.1",
    "unstructured-client": "^0.11.3",
    "uuid": "^10.0.0",
    "winston": "^3.14.2",
    "winston-transport": "^4.8.0",
    "wordpos": "^2.1.0",
    "ws": "^8.18.0",
    "xml2js": "^0.6.2",

File diff suppressed because it is too large.

@@ -1,49 +1,82 @@
# Pick your baseUrl here:
@baseUrl = http://localhost:3002
# @baseUrl = https://api.firecrawl.dev

### Scrape Website
# @name scrape
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Crawl Website
POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer fc-
content-type: application/json

{
    "url":"firecrawl.dev"
    "url":"corterix.com"
}

### Crawl Website
# @name crawl
POST {{baseUrl}}/v1/crawl HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Check Job Status
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
Authorization: Bearer fc-


### Check Job Status
GET http://localhost:3002/v0/jobs/active HTTP/1.1


### Scrape Website
POST http://localhost:3002/v0/crawl HTTP/1.1
Authorization: Bearer fc-
content-type: application/json

{
    "url": "firecrawl.dev"
}

### Check Crawl Status
@crawlId = {{crawl.response.body.$.id}}
# @name crawlStatus
GET {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
## "reoveTags": [],
# "mode": "crawl",
# "crawlerOptions": {
#   "allowBackwardCrawling": false
# },
# "pageOptions": {
#   "onlyMainContent": false,
#   "includeHtml": false,
#   "parsePDF": true
# }


### Batch Scrape Websites
# @name batchScrape
POST {{baseUrl}}/v1/batch/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}


### Scrape Website
POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer
content-type: application/json

{
    "urls": [
        "firecrawl.dev",
        "mendable.ai"
    ]
    "url":"https://mendable.ai"
}

### Check Batch Scrape Status
@batchScrapeId = {{batchScrape.response.body.$.id}}
# @name batchScrapeStatus
GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}


### Check Job Status
GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
Authorization: Bearer

### Get Job Result

POST https://api.firecrawl.dev/v0/crawl HTTP/1.1
Authorization: Bearer
content-type: application/json

{
    "url":"https://mendable.ai"
}

### Check Job Status
GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
Authorization: Bearer

### Get Active Jobs Count
GET http://localhost:3002/serverHealthCheck
content-type: application/json

### Notify Server Health Check
GET http://localhost:3002/serverHealthCheck/notify
content-type: application/json

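The requests above are written for an editor-based REST client. As a rough TypeScript equivalent of the v1 scrape call (a sketch only, not part of the diff), assuming the same local baseUrl and a TEST_API_KEY environment variable; the scrape helper name is hypothetical:

// Illustrative fetch-based equivalent of "POST {{baseUrl}}/v1/scrape" above.
const baseUrl = "http://localhost:3002";

async function scrape(url: string) {
  const res = await fetch(`${baseUrl}/v1/scrape`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "content-type": "application/json",
    },
    body: JSON.stringify({ url }),
  });
  return res.json();
}

// Example usage: scrape("firecrawl.dev").then(console.log);
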
apps/api/sharedLibs/go-html-to-md/.gitignore (vendored, 2 changes)

@@ -1,2 +0,0 @@
html-to-markdown.so
html-to-markdown.h

@@ -2,7 +2,7 @@ package main

import (
    "C"
    // "log"
    "log"

    md "github.com/tomkosm/html-to-markdown"
    "github.com/tomkosm/html-to-markdown/plugin"
@@ -15,7 +15,7 @@ func ConvertHTMLToMarkdown(html *C.char) *C.char {

    markdown, err := converter.ConvertString(C.GoString(html))
    if err != nil {
        // log.Fatal(err)
        log.Fatal(err)
    }
    return C.CString(markdown)
}

@@ -844,7 +844,7 @@ describe("E2E Tests for API Routes", () => {
      expect(crawlInitResponse.statusCode).toBe(200);
      expect(crawlInitResponse.body).toHaveProperty("jobId");

      let crawlStatus: string = "scraping";
      let crawlStatus: string;
      let crawlData = [];
      while (crawlStatus !== "completed") {
        const statusResponse = await request(TEST_URL)

@@ -20,6 +20,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
    process.env.SCRAPING_BEE_API_KEY = "";
    process.env.OPENAI_API_KEY = "";
    process.env.BULL_AUTH_KEY = "";
    process.env.LOGTAIL_KEY = "";
    process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
    process.env.LLAMAPARSE_API_KEY = "";
    process.env.TEST_API_KEY = "";

@@ -1,7 +1,7 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import {
  ScrapeRequestInput,
  ScrapeRequest,
  ScrapeResponseRequestTest,
} from "../../controllers/v1/types";

@@ -44,7 +44,7 @@ describe("E2E Tests for v1 API Routes", () => {
  });

  it.concurrent("should throw error for blocklisted URL", async () => {
    const scrapeRequest: ScrapeRequestInput = {
    const scrapeRequest: ScrapeRequest = {
      url: "https://facebook.com/fake-test",
    };

@@ -73,7 +73,7 @@ describe("E2E Tests for v1 API Routes", () => {
  it.concurrent(
    "should return a successful response with a valid API key",
    async () => {
      const scrapeRequest: ScrapeRequestInput = {
      const scrapeRequest: ScrapeRequest = {
        url: "https://roastmywebsite.ai",
      };

@@ -125,7 +125,7 @@ describe("E2E Tests for v1 API Routes", () => {
  it.concurrent(
    "should return a successful response with a valid API key",
    async () => {
      const scrapeRequest: ScrapeRequestInput = {
      const scrapeRequest: ScrapeRequest = {
        url: "https://arxiv.org/abs/2410.04840",
      };

@@ -167,7 +167,7 @@ describe("E2E Tests for v1 API Routes", () => {
  it.concurrent(
    "should return a successful response with a valid API key and includeHtml set to true",
    async () => {
      const scrapeRequest: ScrapeRequestInput = {
      const scrapeRequest: ScrapeRequest = {
        url: "https://roastmywebsite.ai",
        formats: ["markdown", "html"],
      };

@@ -194,7 +194,7 @@ describe("E2E Tests for v1 API Routes", () => {
    30000
  );
  it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
    const scrapeRequest: ScrapeRequestInput = {
    const scrapeRequest: ScrapeRequest = {
      url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
      // formats: ["markdown", "html"],
    };

@@ -217,7 +217,7 @@ describe("E2E Tests for v1 API Routes", () => {
  }, 60000);

  it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
    const scrapeRequest: ScrapeRequestInput = {
    const scrapeRequest: ScrapeRequest = {
      url: "https://arxiv.org/pdf/astro-ph/9301001"
    };
    const response: ScrapeResponseRequestTest = await request(TEST_URL)

@@ -240,7 +240,7 @@ describe("E2E Tests for v1 API Routes", () => {
  }, 60000);

  it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
    const scrapeRequest: ScrapeRequestInput = {
    const scrapeRequest: ScrapeRequest = {
      url: "https://www.scrapethissite.com/",
      onlyMainContent: false // default is true
    };

@@ -261,7 +261,7 @@ describe("E2E Tests for v1 API Routes", () => {
    expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
    expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer

    const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
    const scrapeRequestWithRemoveTags: ScrapeRequest = {
      url: "https://www.scrapethissite.com/",
      excludeTags: ['.nav', '#footer', 'strong'],
      onlyMainContent: false // default is true

@@ -407,7 +407,7 @@ describe("E2E Tests for v1 API Routes", () => {
  it.concurrent(
    "should return a successful response with a valid API key and includeHtml set to true",
    async () => {
      const scrapeRequest: ScrapeRequestInput = {
      const scrapeRequest: ScrapeRequest = {
        url: "https://roastmywebsite.ai",
        formats: ["html","rawHtml"],
      };

@@ -438,7 +438,7 @@ describe("E2E Tests for v1 API Routes", () => {
  it.concurrent(
    "should return a successful response with waitFor",
    async () => {
      const scrapeRequest: ScrapeRequestInput = {
      const scrapeRequest: ScrapeRequest = {
        url: "https://ycombinator.com/companies",
        formats: ["markdown"],
        waitFor: 8000

@@ -471,7 +471,7 @@ describe("E2E Tests for v1 API Routes", () => {
  it.concurrent(
    "should return a successful response with a valid links on page",
    async () => {
      const scrapeRequest: ScrapeRequestInput = {
      const scrapeRequest: ScrapeRequest = {
        url: "https://roastmywebsite.ai",
        formats: ["links"],
      };

@@ -672,7 +672,7 @@ describe("POST /v1/crawl", () => {
  });

  it.concurrent("should throw error for blocklisted URL", async () => {
    const scrapeRequest: ScrapeRequestInput = {
    const scrapeRequest: ScrapeRequest = {
      url: "https://facebook.com/fake-test",
    };

@@ -868,7 +868,7 @@ describe("POST /v1/crawl", () => {
      const urls = completedResponse.body.data.map(
        (item: any) => item.metadata?.sourceURL
      );
      expect(urls.length).toBeGreaterThan(1);
      expect(urls.length).toBeGreaterThanOrEqual(1);

      // Check if all URLs have a maximum depth of 1
      urls.forEach((url: string) => {

@@ -1,603 +0,0 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import {
  ScrapeRequest,
  ScrapeResponseRequestTest,
} from "../../controllers/v1/types";

configDotenv();
const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test

describe("E2E Tests for v1 API Routes", () => {

  it.concurrent('should return a successful response for a scrape with 403 page', async () => {
    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post('/v1/scrape')
      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
      .set('Content-Type', 'application/json')
      .send({ url: 'https://httpstat.us/403' });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty('data');
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data).toHaveProperty('markdown');
    expect(response.body.data).toHaveProperty('metadata');
    expect(response.body.data.metadata.statusCode).toBe(403);
  }, 30000);

  it.concurrent("should handle 'formats:markdown (default)' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }

      expect(response.body.data).toHaveProperty("markdown");

      expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
      expect(response.body.data.markdown).toContain("Content with id #content-1");
      // expect(response.body.data.markdown).toContain("Loading...");
      expect(response.body.data.markdown).toContain("Click me!");
      expect(response.body.data.markdown).toContain("Power your AI apps with clean data crawled from any website. It's also open-source."); // firecrawl.dev inside an iframe
      expect(response.body.data.markdown).toContain("This content loads only when you see it. Don't blink! 👼"); // the browser always scroll to the bottom
      expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
      expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
      expect(response.body.data.markdown).not.toContain("This content is only visible on mobile");
    },
    30000);

  it.concurrent("should handle 'formats:html' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        formats: ["html"]
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }

      expect(response.body.data).not.toHaveProperty("markdown");
      expect(response.body.data).toHaveProperty("html");

      expect(response.body.data.html).not.toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
      expect(response.body.data.html).toContain("<p style=\"\">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
    },
    30000);

  it.concurrent("should handle 'rawHtml' in 'formats' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        formats: ["rawHtml"]
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }

      expect(response.body.data).not.toHaveProperty("markdown");
      expect(response.body.data).toHaveProperty("rawHtml");

      expect(response.body.data.rawHtml).toContain(">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
      expect(response.body.data.rawHtml).toContain(">Header</header>");
    },
    30000);

  // - TODO: tests for links
  // - TODO: tests for screenshot
  // - TODO: tests for screenshot@fullPage

  it.concurrent("should handle 'headers' parameter correctly", async () => {
    // @ts-ignore
    const scrapeRequest = {
      url: E2E_TEST_SERVER_URL,
      headers: { "e2e-header-test": "firecrawl" }
    } as ScrapeRequest;

    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    expect(response.body.data.markdown).toContain("e2e-header-test: firecrawl");
  }, 30000);

  it.concurrent("should handle 'includeTags' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        includeTags: ['#content-1']
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }

      expect(response.body.data.markdown).not.toContain("<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
      expect(response.body.data.markdown).toContain("Content with id #content-1");
    },
    30000);

  it.concurrent("should handle 'excludeTags' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        excludeTags: ['#content-1']
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }

      expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
      expect(response.body.data.markdown).not.toContain("Content with id #content-1");
    },
    30000);

  it.concurrent("should handle 'onlyMainContent' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        formats: ["html", "markdown"],
        onlyMainContent: false
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("data");
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }

      expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
      expect(response.body.data.html).toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
    },
    30000);

  it.concurrent("should handle 'timeout' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        timeout: 500
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(408);

      if (!("error" in response.body)) {
        throw new Error("Expected response body to have 'error' property");
      }
      expect(response.body.error).toBe("Request timed out");
      expect(response.body.success).toBe(false);
    }, 30000);


  it.concurrent("should handle 'mobile' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        mobile: true
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);

      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      expect(response.body.data.markdown).toContain("This content is only visible on mobile");
    },
    30000);

  it.concurrent("should handle 'parsePDF' parameter correctly",
    async () => {
      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf'});
      await new Promise((r) => setTimeout(r, 6000));

      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty('data');
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }

      expect(response.body.data.markdown).toContain('arXiv:astro-ph/9301001v1 7 Jan 1993');
      expect(response.body.data.markdown).not.toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');

      const responseNoParsePDF: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', parsePDF: false });
      await new Promise((r) => setTimeout(r, 6000));

      expect(responseNoParsePDF.statusCode).toBe(200);
      expect(responseNoParsePDF.body).toHaveProperty('data');
      if (!("data" in responseNoParsePDF.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      expect(responseNoParsePDF.body.data.markdown).toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');
    },
    30000);

  // it.concurrent("should handle 'location' parameter correctly",
  //   async () => {
  //     const scrapeRequest: ScrapeRequest = {
  //       url: "https://roastmywebsite.ai",
  //       location: {
  //         country: "US",
  //         languages: ["en"]
  //       }
  //     };

  //     const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
  //       .post("/v1/scrape")
  //       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
  //       .set("Content-Type", "application/json")
  //       .send(scrapeRequest);

  //     expect(response.statusCode).toBe(200);
  //     // Add assertions to verify location is handled correctly
  //   },
  //   30000);

  it.concurrent("should handle 'skipTlsVerification' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: "https://expired.badssl.com/",
        timeout: 120000
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);
      console.log("Error1a")
      // console.log(response.body)
      expect(response.statusCode).toBe(200);
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      expect(response.body.data.metadata.pageStatusCode).toBe(500);
      console.log("Error?")

      const scrapeRequestWithSkipTlsVerification = {
        url: "https://expired.badssl.com/",
        skipTlsVerification: true,
        timeout: 120000

      } as ScrapeRequest;

      const responseWithSkipTlsVerification: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequestWithSkipTlsVerification);

      console.log("Error1b")
      // console.log(responseWithSkipTlsVerification.body)
      expect(responseWithSkipTlsVerification.statusCode).toBe(200);
      if (!("data" in responseWithSkipTlsVerification.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      // console.log(responseWithSkipTlsVerification.body.data)
      expect(responseWithSkipTlsVerification.body.data.markdown).toContain("badssl.com");
    },
    60000);

  it.concurrent("should handle 'removeBase64Images' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        removeBase64Images: true
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      // console.log(response.body.data.markdown)
      // - TODO: not working for every image
      // expect(response.body.data.markdown).toContain("Image-Removed");
    },
    30000);

  it.concurrent("should handle 'action wait' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        actions: [{
          type: "wait",
          milliseconds: 10000
        }]
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      expect(response.body.data.markdown).not.toContain("Loading...");
      expect(response.body.data.markdown).toContain("Content loaded after 5 seconds!");
    },
    30000);

  // screenshot
  it.concurrent("should handle 'action screenshot' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        actions: [{
          type: "screenshot"
        }]
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      if (!response.body.data.actions?.screenshots) {
        throw new Error("Expected response body to have screenshots array");
      }
      expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0);
      expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");

      // TODO compare screenshot with expected screenshot
    },
    30000);

  it.concurrent("should handle 'action screenshot@fullPage' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        actions: [{
          type: "screenshot",
          fullPage: true
        },
        {
          type:"scrape"
        }]
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      // console.log(response.body.data.actions?.screenshots[0])
      if (!response.body.data.actions?.screenshots) {
        throw new Error("Expected response body to have screenshots array");
      }
      expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0);
      expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");

      if (!response.body.data.actions?.scrapes) {
        throw new Error("Expected response body to have scrapes array");
      }
      expect(response.body.data.actions.scrapes[0].url).toBe("https://firecrawl-e2e-test.vercel.app/");
      expect(response.body.data.actions.scrapes[0].html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
      // TODO compare screenshot with expected full page screenshot
    },
    30000);

  it.concurrent("should handle 'action click' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        actions: [{
          type: "click",
          selector: "#click-me"
        }]
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }
      expect(response.body.data.markdown).not.toContain("Click me!");
      expect(response.body.data.markdown).toContain("Text changed after click!");
    },
    30000);

  it.concurrent("should handle 'action write' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        formats: ["html"],
        actions: [{
          type: "click",
          selector: "#input-1"
        },
        {
          type: "write",
          text: "Hello, world!"
        }
      ]} as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      expect(response.statusCode).toBe(200);
      if (!("data" in response.body)) {
        throw new Error("Expected response body to have 'data' property");
      }

      // TODO: fix this test (need to fix fire-engine first)
      // uncomment the following line:
      // expect(response.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
    },
    30000);

  // TODO: fix this test (need to fix fire-engine first)
  it.concurrent("should handle 'action pressKey' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        formats: ["markdown"],
        actions: [
          {
            type: "press",
            key: "ArrowDown"
          }
        ]
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      // // TODO: fix this test (need to fix fire-engine first)
      // // right now response.body is: { success: false, error: '(Internal server error) - null' }
      // expect(response.statusCode).toBe(200);
      // if (!("data" in response.body)) {
      //   throw new Error("Expected response body to have 'data' property");
      // }
      // expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
    },
    30000);

  // TODO: fix this test (need to fix fire-engine first)
  it.concurrent("should handle 'action scroll' parameter correctly",
    async () => {
      const scrapeRequest = {
        url: E2E_TEST_SERVER_URL,
        formats: ["markdown"],
        actions: [
          {
            type: "click",
            selector: "#scroll-bottom-loader"
          },
          {
            type: "scroll",
            direction: "down",
            amount: 2000
          }
        ]
      } as ScrapeRequest;

      const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
        .post("/v1/scrape")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send(scrapeRequest);

      // TODO: uncomment this tests
      // expect(response.statusCode).toBe(200);
      // if (!("data" in response.body)) {
      //   throw new Error("Expected response body to have 'data' property");
      // }
      //
      // expect(response.body.data.markdown).toContain("You have reached the bottom!")
    },
    30000);

  // TODO: test scrape action

});

@@ -538,7 +538,7 @@ describe("E2E Tests for v0 API Routes", () => {
      const urls = completedResponse.body.data.map(
        (item: any) => item.metadata?.sourceURL
      );
      expect(urls.length).toBeGreaterThan(1);
      expect(urls.length).toBeGreaterThanOrEqual(1);

      // Check if all URLs have a maximum depth of 1
      urls.forEach((url: string) => {

@@ -776,8 +776,7 @@ describe("E2E Tests for v0 API Routes", () => {
      await new Promise((r) => setTimeout(r, 10000));
      const completedResponse = await request(TEST_URL)
        .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .maxResponseSize(4000000000);
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

      expect(completedResponse.statusCode).toBe(200);
      expect(completedResponse.body).toHaveProperty("status");

@@ -9,8 +9,9 @@ import {
import { supabase_service } from "../services/supabase";
import { withAuth } from "../lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification";
import { logger } from "../lib/logger";
import { Logger } from "../lib/logger";
import { redlock } from "../services/redlock";
import { deleteKey, getValue } from "../services/redis";
import { setValue } from "../services/redis";

@@ -39,8 +40,8 @@ function normalizedApiIsUuid(potentialUuid: string): boolean {
export async function setCachedACUC(
  api_key: string,
  acuc:
    | AuthCreditUsageChunk | null
    | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
    | AuthCreditUsageChunk
    | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)
) {
  const cacheKeyACUC = `acuc_${api_key}`;
  const redLockKey = `lock_${cacheKeyACUC}`;

@@ -48,7 +49,7 @@ export async function setCachedACUC(
  try {
    await redlock.using([redLockKey], 10000, {}, async (signal) => {
      if (typeof acuc === "function") {
        acuc = acuc(JSON.parse(await getValue(cacheKeyACUC) ?? "null"));
        acuc = acuc(JSON.parse(await getValue(cacheKeyACUC)));

        if (acuc === null) {
          if (signal.aborted) {

@@ -68,7 +69,7 @@ export async function setCachedACUC(
      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
    });
  } catch (error) {
    logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
    Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
  }
}

@@ -102,7 +103,7 @@ export async function getACUC(
      break;
    }

    logger.warn(
    Logger.warn(
      `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
    );
    retries++;

@@ -145,14 +146,33 @@ export async function authenticateUser(
  res,
  mode?: RateLimiterMode
): Promise<AuthResponse> {
  return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode);
  return withAuth(supaAuthenticateUser)(req, res, mode);
}

function setTrace(team_id: string, api_key: string) {
  try {
    setTraceAttributes({
      team_id,
      api_key,
    });
  } catch (error) {
    Sentry.captureException(error);
    Logger.error(`Error setting trace attributes: ${error.message}`);
  }
}

export async function supaAuthenticateUser(
  req,
  res,
  mode?: RateLimiterMode
): Promise<AuthResponse> {
): Promise<{
  success: boolean;
  team_id?: string;
  error?: string;
  status?: number;
  plan?: PlanType;
  chunk?: AuthCreditUsageChunk;
}> {
  const authHeader =
    req.headers.authorization ??
    (req.headers["sec-websocket-protocol"]

@@ -180,7 +200,7 @@ export async function supaAuthenticateUser(

  let teamId: string | null = null;
  let priceId: string | null = null;
  let chunk: AuthCreditUsageChunk | null = null;
  let chunk: AuthCreditUsageChunk;

  if (token == "this_is_just_a_preview_token") {
    if (mode == RateLimiterMode.CrawlStatus) {

@@ -213,6 +233,8 @@ export async function supaAuthenticateUser(
    priceId = chunk.price_id;

    const plan = getPlanByPriceId(priceId);
    // HyperDX Logging
    setTrace(teamId, normalizedApi);
    subscriptionData = {
      team_id: teamId,
      plan,

@@ -269,7 +291,7 @@ export async function supaAuthenticateUser(
  try {
    await rateLimiter.consume(team_endpoint_token);
  } catch (rateLimiterRes) {
    logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
    Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);

@@ -296,7 +318,7 @@ export async function supaAuthenticateUser(
      mode === RateLimiterMode.CrawlStatus ||
      mode === RateLimiterMode.Search)
  ) {
    return { success: true, team_id: "preview", chunk: null };
    return { success: true, team_id: "preview" };
    // check the origin of the request and make sure its from firecrawl.dev
    // const origin = req.headers.origin;
    // if (origin && origin.includes("firecrawl.dev")){

@@ -311,12 +333,12 @@ export async function supaAuthenticateUser(

  return {
    success: true,
    team_id: teamId ?? undefined,
    plan: (subscriptionData?.plan ?? "") as PlanType,
    team_id: subscriptionData.team_id,
    plan: (subscriptionData.plan ?? "") as PlanType,
    chunk,
  };
}
function getPlanByPriceId(price_id: string | null): PlanType {
function getPlanByPriceId(price_id: string): PlanType {
  switch (price_id) {
    case process.env.STRIPE_PRICE_ID_STARTER:
      return "starter";

@@ -1,7 +1,7 @@
import { Request, Response } from "express";
import { supabase_service } from "../../../services/supabase";
import { clearACUC } from "../../auth";
import { logger } from "../../../lib/logger";
import { Logger } from "../../../lib/logger";

export async function acucCacheClearController(req: Request, res: Response) {
  try {

@@ -12,11 +12,11 @@ export async function acucCacheClearController(req: Request, res: Response) {
      .select("*")
      .eq("team_id", team_id);

    await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key)));
    await Promise.all(keys.data.map((x) => clearACUC(x.key)));

    res.json({ ok: true });
  } catch (error) {
    logger.error(`Error clearing ACUC cache via API route: ${error}`);
    Logger.error(`Error clearing ACUC cache via API route: ${error}`);
    res.status(500).json({ error: "Internal server error" });
  }
}

@@ -1,7 +1,7 @@
import { Request, Response } from "express";

import { Job } from "bullmq";
import { logger } from "../../../lib/logger";
import { Logger } from "../../../lib/logger";
import { getScrapeQueue } from "../../../services/queue-service";
import { checkAlerts } from "../../../services/alerts";
import { sendSlackWebhook } from "../../../services/alerts/slack";

@@ -10,7 +10,7 @@ export async function cleanBefore24hCompleteJobsController(
  req: Request,
  res: Response
) {
  logger.info("🐂 Cleaning jobs older than 24h");
  Logger.info("🐂 Cleaning jobs older than 24h");
  try {
    const scrapeQueue = getScrapeQueue();
    const batchSize = 10;

@@ -31,7 +31,7 @@ export async function cleanBefore24hCompleteJobsController(
    ).flat();
    const before24hJobs =
      completedJobs.filter(
        (job) => job.finishedOn !== undefined && job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
        (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
      ) || [];

    let count = 0;

@@ -45,12 +45,12 @@ export async function cleanBefore24hCompleteJobsController(
        await job.remove();
        count++;
      } catch (jobError) {
        logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
        Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
      }
    }
    return res.status(200).send(`Removed ${count} completed jobs.`);
  } catch (error) {
    logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
    Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
    return res.status(500).send("Failed to clean jobs");
  }
}

@@ -60,7 +60,7 @@ export async function checkQueuesController(req: Request, res: Response) {
    await checkAlerts();
    return res.status(200).send("Alerts initialized");
  } catch (error) {
    logger.debug(`Failed to initialize alerts: ${error}`);
    Logger.debug(`Failed to initialize alerts: ${error}`);
    return res.status(500).send("Failed to initialize alerts");
  }
}

@@ -81,7 +81,7 @@ export async function queuesController(req: Request, res: Response) {
      noActiveJobs,
    });
  } catch (error) {
    logger.error(error);
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

@@ -165,7 +165,7 @@ export async function autoscalerController(req: Request, res: Response) {
    }

    if (targetMachineCount !== activeMachines) {
      logger.info(
      Logger.info(
        `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
      );

@@ -193,7 +193,7 @@ export async function autoscalerController(req: Request, res: Response) {
      count: activeMachines,
    });
  } catch (error) {
    logger.error(error);
    Logger.error(error);
    return res.status(500).send("Failed to initialize autoscaler");
  }
}

@@ -1,6 +1,6 @@
import { Request, Response } from "express";
import Redis from "ioredis";
import { logger } from "../../../lib/logger";
import { Logger } from "../../../lib/logger";
import { redisRateLimitClient } from "../../../services/rate-limiter";

export async function redisHealthController(req: Request, res: Response) {

@@ -10,14 +10,14 @@ export async function redisHealthController(req: Request, res: Response) {
        return await operation();
      } catch (error) {
        if (attempt === retries) throw error;
        logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
        Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
        await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
      }
    }
  };

  try {
    const queueRedis = new Redis(process.env.REDIS_URL!);
    const queueRedis = new Redis(process.env.REDIS_URL);

    const testKey = "test";
    const testValue = "test";

@@ -29,7 +29,7 @@ export async function redisHealthController(req: Request, res: Response) {
      queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
      await retryOperation(() => queueRedis.del(testKey));
    } catch (error) {
      logger.error(`queueRedis health check failed: ${error}`);
      Logger.error(`queueRedis health check failed: ${error}`);
      queueRedisHealth = null;
    }

@@ -42,7 +42,7 @@ export async function redisHealthController(req: Request, res: Response) {
      );
      await retryOperation(() => redisRateLimitClient.del(testKey));
    } catch (error) {
      logger.error(`redisRateLimitClient health check failed: ${error}`);
      Logger.error(`redisRateLimitClient health check failed: ${error}`);
      redisRateLimitHealth = null;
    }

@@ -56,10 +56,10 @@ export async function redisHealthController(req: Request, res: Response) {
      healthStatus.queueRedis === "healthy" &&
      healthStatus.redisRateLimitClient === "healthy"
    ) {
      logger.info("Both Redis instances are healthy");
      Logger.info("Both Redis instances are healthy");
      return res.status(200).json({ status: "healthy", details: healthStatus });
    } else {
      logger.info(
      Logger.info(
        `Redis instances health check: ${JSON.stringify(healthStatus)}`
      );
      // await sendSlackWebhook(

@@ -73,7 +73,7 @@ export async function redisHealthController(req: Request, res: Response) {
        .json({ status: "unhealthy", details: healthStatus });
    }
  } catch (error) {
    logger.error(`Redis health check failed: ${error}`);
    Logger.error(`Redis health check failed: ${error}`);
    // await sendSlackWebhook(
    //   `[REDIS DOWN] Redis instances health check: ${error.message}`,
    //   true

@@ -2,7 +2,7 @@ import { Request, Response } from "express";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase";
import { logger } from "../../../src/lib/logger";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";

@@ -12,17 +12,15 @@ export async function crawlCancelController(req: Request, res: Response) {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';

    const auth = await authenticateUser(
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.CrawlStatus
    );
    if (!auth.success) {
      return res.status(auth.status).json({ error: auth.error });
    if (!success) {
      return res.status(status).json({ error });
    }

    const { team_id } = auth;

    const sc = await getCrawl(req.params.jobId);
    if (!sc) {
      return res.status(404).json({ error: "Job not found" });

@@ -48,7 +46,7 @@ export async function crawlCancelController(req: Request, res: Response) {
      sc.cancelled = true;
      await saveCrawl(req.params.jobId, sc);
    } catch (error) {
      logger.error(error);
      Logger.error(error);
    }

    res.json({

@@ -56,7 +54,7 @@ export async function crawlCancelController(req: Request, res: Response) {
    });
  } catch (error) {
    Sentry.captureException(error);
    logger.error(error);
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}

@@ -2,17 +2,15 @@ import { Request, Response } from "express";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { logger } from "../../../src/lib/logger";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
import { Job } from "bullmq";
import { toLegacyDocument } from "../v1/types";
configDotenv();

export async function getJobs(crawlId: string, ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as Job[];
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);

if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);

@@ -34,17 +32,15 @@ export async function getJobs(crawlId: string, ids: string[]) {

export async function crawlStatusController(req: Request, res: Response) {
try {
const auth = await authenticateUser(
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
if (!auth.success) {
return res.status(auth.status).json({ error: auth.error });
if (!success) {
return res.status(status).json({ error });
}

const { team_id } = auth;

const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ error: "Job not found" });

@@ -75,7 +71,7 @@ export async function crawlStatusController(req: Request, res: Response) {

const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active";

const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null).map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit").map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);

if (
jobs.length > 0 &&

@@ -94,12 +90,12 @@ export async function crawlStatusController(req: Request, res: Response) {
status: jobStatus,
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
total: jobs.length,
data: jobStatus === "completed" ? data.map(x => toLegacyDocument(x, sc.internalOptions)) : null,
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null).map(x => toLegacyDocument(x, sc.internalOptions)),
data: jobStatus === "completed" ? data : null,
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
});
} catch (error) {
Sentry.captureException(error);
logger.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -9,28 +9,24 @@ import { validateIdempotencyKey } from "../../../src/services/idempotency/valida
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../../src/lib/logger";
import { Logger } from "../../../src/lib/logger";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
import { ZodError } from "zod";

export async function crawlController(req: Request, res: Response) {
try {
const auth = await authenticateUser(
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!auth.success) {
return res.status(auth.status).json({ error: auth.error });
if (!success) {
return res.status(status).json({ error });
}

const { team_id, plan, chunk } = auth;

if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {

@@ -39,7 +35,7 @@ export async function crawlController(req: Request, res: Response) {
try {
createIdempotencyKey(req);
} catch (error) {
logger.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -75,13 +71,13 @@ export async function crawlController(req: Request, res: Response) {
await checkTeamCredits(chunk, team_id, limitCheck);

if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" });
return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" });
}

// TODO: need to do this to v1
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);

let url = urlSchema.parse(req.body.url);
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}

@@ -127,7 +123,7 @@ export async function crawlController(req: Request, res: Response) {
// documents: docs,
// });
// } catch (error) {
// logger.error(error);
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }

@@ -136,15 +132,10 @@ export async function crawlController(req: Request, res: Response) {

await logCrawl(id, team_id);

const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);

delete (scrapeOptions as any).timeout;

const sc: StoredCrawl = {
originUrl: url,
crawlerOptions,
scrapeOptions,
internalOptions,
pageOptions,
team_id,
plan,
createdAt: Date.now(),

@@ -179,11 +170,10 @@ export async function crawlController(req: Request, res: Response) {
data: {
url,
mode: "single_urls",
crawlerOptions,
scrapeOptions,
internalOptions,
crawlerOptions: crawlerOptions,
team_id,
plan,
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true,

@@ -197,7 +187,6 @@ export async function crawlController(req: Request, res: Response) {

await lockURLs(
id,
sc,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(

@@ -219,11 +208,10 @@ export async function crawlController(req: Request, res: Response) {
{
url,
mode: "single_urls",
crawlerOptions,
scrapeOptions,
internalOptions,
crawlerOptions: crawlerOptions,
team_id,
plan: plan!,
plan,
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
},

@@ -238,9 +226,7 @@ export async function crawlController(req: Request, res: Response) {
res.json({ jobId: id });
} catch (error) {
Sentry.captureException(error);
logger.error(error);
return res.status(500).json({ error: error instanceof ZodError
? "Invalid URL"
: error.message });
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -3,16 +3,15 @@ import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../../src/lib/logger";
import { Logger } from "../../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { fromLegacyScrapeOptions } from "../v1/types";

export async function crawlPreviewController(req: Request, res: Response) {
try {
const auth = await authenticateUser(
const { success, error, status, team_id:a, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Preview

@@ -20,12 +19,10 @@ export async function crawlPreviewController(req: Request, res: Response) {

const team_id = "preview";

if (!auth.success) {
return res.status(auth.status).json({ error: auth.error });
if (!success) {
return res.status(status).json({ error });
}

const { plan } = auth;

let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });

@@ -74,7 +71,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
// documents: docs,
// });
// } catch (error) {
// logger.error(error);
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }

@@ -87,13 +84,10 @@ export async function crawlPreviewController(req: Request, res: Response) {
robots = await this.getRobotsTxt();
} catch (_) {}

const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);

const sc: StoredCrawl = {
originUrl: url,
crawlerOptions,
scrapeOptions,
internalOptions,
pageOptions,
team_id,
plan,
robots,

@@ -113,11 +107,10 @@ export async function crawlPreviewController(req: Request, res: Response) {
await addScrapeJob({
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id,
plan: plan!,
crawlerOptions,
scrapeOptions,
internalOptions,
plan,
pageOptions: pageOptions,
origin: "website-preview",
crawl_id: id,
sitemapped: true,

@@ -130,11 +123,10 @@ export async function crawlPreviewController(req: Request, res: Response) {
await addScrapeJob({
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id,
plan: plan!,
crawlerOptions,
scrapeOptions,
internalOptions,
plan,
pageOptions: pageOptions,
origin: "website-preview",
crawl_id: id,
}, {}, jobId);

@@ -144,7 +136,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
res.json({ jobId: id });
} catch (error) {
Sentry.captureException(error);
logger.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -8,14 +8,13 @@ import { authenticateUser } from "../auth";
export const keyAuthController = async (req: Request, res: Response) => {
try {
// make sure to authenticate user first, Bearer <token>
const auth = await authenticateUser(
const { success, team_id, error, status } = await authenticateUser(
req,
res
);
if (!auth.success) {
return res.status(auth.status).json({ error: auth.error });
if (!success) {
return res.status(status).json({ error });
}

// if success, return success: true
return res.status(200).json({ success: true });
} catch (error) {

@@ -7,7 +7,7 @@ import {
import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import { Document, fromLegacyCombo, toLegacyDocument, url as urlSchema } from "../v1/types";
import { Document } from "../../lib/entities";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import {

@@ -19,11 +19,9 @@ import {
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getScrapeQueue } from "../../services/queue-service";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../lib/logger";
import { Logger } from "../../lib/logger";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions } from "../v1/types";
import { ZodError } from "zod";

export async function scrapeHelper(
jobId: string,

@@ -37,10 +35,10 @@ export async function scrapeHelper(
): Promise<{
success: boolean;
error?: string;
data?: Document | { url: string };
data?: Document;
returnCode: number;
}> {
const url = urlSchema.parse(req.body.url);
const url = req.body.url;
if (typeof url !== "string") {
return { success: false, error: "Url is required", returnCode: 400 };
}

@@ -56,16 +54,15 @@ export async function scrapeHelper(

const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });

const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions);

await addScrapeJob(
{
url,
mode: "single_urls",
crawlerOptions,
team_id,
scrapeOptions,
internalOptions,
plan: plan!,
pageOptions,
plan,
extractorOptions,
origin: req.body.origin ?? defaultOrigin,
is_scrape: true,
},

@@ -84,9 +81,9 @@ export async function scrapeHelper(
},
async (span) => {
try {
doc = (await waitForJob<Document>(jobId, timeout));
doc = (await waitForJob(jobId, timeout))[0];
} catch (e) {
if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
if (e instanceof Error && e.message.startsWith("Job wait")) {
span.setAttribute("timedOut", true);
return {
success: false,

@@ -152,7 +149,7 @@ export async function scrapeHelper(

return {
success: true,
data: toLegacyDocument(doc, internalOptions),
data: doc,
returnCode: 200,
};
}

@@ -161,17 +158,15 @@ export async function scrapeController(req: Request, res: Response) {
try {
let earlyReturn = false;
// make sure to authenticate user first, Bearer <token>
const auth = await authenticateUser(
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
req,
res,
RateLimiterMode.Scrape
);
if (!auth.success) {
return res.status(auth.status).json({ error: auth.error });
if (!success) {
return res.status(status).json({ error });
}

const { team_id, plan, chunk } = auth;

const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
const extractorOptions = {

@@ -205,11 +200,11 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
}
} catch (error) {
logger.error(error);
Logger.error(error);
earlyReturn = true;
return res.status(500).json({
error:
"Error checking team credits. Please contact help@firecrawl.com for help.",
"Error checking team credits. Please contact hello@firecrawl.com for help.",
});
}

@@ -229,8 +224,8 @@ export async function scrapeController(req: Request, res: Response) {
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
result.data && (result.data as Document).markdown
? numTokensFromString((result.data as Document).markdown!, "gpt-3.5-turbo")
result.data && result.data.markdown
? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
: 0;

if (result.success) {

@@ -251,7 +246,7 @@ export async function scrapeController(req: Request, res: Response) {
if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => {
logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}

@@ -259,19 +254,17 @@ export async function scrapeController(req: Request, res: Response) {

let doc = result.data;
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && (doc as Document).rawHtml) {
delete (doc as Document).rawHtml;
if (doc && doc.rawHtml) {
delete doc.rawHtml;
}
}

if(pageOptions && pageOptions.includeExtract) {
if(!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
delete (doc as Document).markdown;
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
delete doc.markdown;
}
}

const { scrapeOptions } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);

logJob({
job_id: jobId,
success: result.success,

@@ -283,22 +276,21 @@ export async function scrapeController(req: Request, res: Response) {
mode: "scrape",
url: req.body.url,
crawlerOptions: crawlerOptions,
scrapeOptions,
pageOptions: pageOptions,
origin: origin,
extractor_options: extractorOptions,
num_tokens: numTokens,
});

return res.status(result.returnCode).json(result);
} catch (error) {
Sentry.captureException(error);
logger.error(error);
Logger.error(error);
return res.status(500).json({
error:
error instanceof ZodError
? "Invalid URL"
: typeof error === "string"
? error
: error?.message ?? "Internal Server Error",
typeof error === "string"
? error
: error?.message ?? "Internal Server Error",
});
}
}

@@ -1,4 +1,5 @@
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";

@@ -7,23 +8,21 @@ import { PageOptions, SearchOptions } from "../../lib/entities";
import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../lib/logger";
import { Logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { Job } from "bullmq";
import { Document, fromLegacyCombo, fromLegacyScrapeOptions, toLegacyDocument } from "../v1/types";

export async function searchHelper(
jobId: string,
req: Request,
team_id: string,
subscription_id: string | null | undefined,
subscription_id: string,
crawlerOptions: any,
pageOptions: PageOptions,
searchOptions: SearchOptions,
plan: PlanType | undefined
plan: PlanType
): Promise<{
success: boolean;
error?: string;

@@ -36,8 +35,8 @@ export async function searchHelper(
return { success: false, error: "Query is required", returnCode: 400 };
}

const tbs = searchOptions.tbs ?? undefined;
const filter = searchOptions.filter ?? undefined;
const tbs = searchOptions.tbs ?? null;
const filter = searchOptions.filter ?? null;
let num_results = Math.min(searchOptions.limit ?? 7, 10);

if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {

@@ -58,12 +57,11 @@ export async function searchHelper(
});

let justSearch = pageOptions.fetchPageContent === false;

const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, undefined, 60000, crawlerOptions);

if (justSearch) {
billTeam(team_id, subscription_id, res.length).catch(error => {
logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
return { success: true, data: res, returnCode: 200 };

@@ -90,9 +88,9 @@ export async function searchHelper(
data: {
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: team_id,
scrapeOptions,
internalOptions,
pageOptions: pageOptions,
},
opts: {
jobId: uuid,

@@ -106,7 +104,7 @@ export async function searchHelper(
await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority)
}

const docs = (await Promise.all(jobDatas.map(x => waitForJob<Document>(x.opts.jobId, 60000)))).map(x => toLegacyDocument(x, internalOptions));
const docs = (await Promise.all(jobDatas.map(x => waitForJob(x.opts.jobId, 60000)))).map(x => x[0]);

if (docs.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 };

@@ -117,7 +115,7 @@ export async function searchHelper(

// make sure doc.content is not empty
const filteredDocs = docs.filter(
(doc: any) => doc && doc.content && doc.content.trim().length > 0
(doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0
);

if (filteredDocs.length === 0) {

@@ -134,15 +132,14 @@ export async function searchHelper(
export async function searchController(req: Request, res: Response) {
try {
// make sure to authenticate user first, Bearer <token>
const auth = await authenticateUser(
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
req,
res,
RateLimiterMode.Search
);
if (!auth.success) {
return res.status(auth.status).json({ error: auth.error });
if (!success) {
return res.status(status).json({ error });
}
const { team_id, plan, chunk } = auth;
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? {
includeHtml: req.body.pageOptions?.includeHtml ?? false,

@@ -165,7 +162,7 @@ export async function searchController(req: Request, res: Response) {
}
} catch (error) {
Sentry.captureException(error);
logger.error(error);
Logger.error(error);
return res.status(500).json({ error: "Internal server error" });
}
const startTime = new Date().getTime();

@@ -192,16 +189,17 @@ export async function searchController(req: Request, res: Response) {
mode: "search",
url: req.body.query,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
origin: origin,
});
return res.status(result.returnCode).json(result);
} catch (error) {
if (error instanceof Error && (error.message.startsWith("Job wait") || error.message === "timeout")) {
if (error instanceof Error && error.message.startsWith("Job wait")) {
return res.status(408).json({ error: "Request timed out" });
}

Sentry.captureException(error);
logger.error("Unhandled error occurred in search", { error });
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -1,5 +1,5 @@
import { Request, Response } from "express";
import { logger } from "../../../src/lib/logger";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";

@@ -37,7 +37,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
});
} catch (error) {
Sentry.captureException(error);
logger.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -4,6 +4,8 @@ import {
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {

@@ -16,7 +18,6 @@ import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
import { addScrapeJobs } from "../../services/queue-jobs";
import { callWebhook } from "../../services/webhook";

export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,

@@ -28,16 +29,19 @@ export async function batchScrapeController(

await logCrawl(id, req.auth.team_id);

let { remainingCredits } = req.account!;
let { remainingCredits } = req.account;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){
remainingCredits = Infinity;
}

const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;

const sc: StoredCrawl = {
crawlerOptions: null,
scrapeOptions: req.body,
internalOptions: {},
pageOptions,
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,

@@ -60,14 +64,14 @@ export async function batchScrapeController(
url: x,
mode: "single_urls" as const,
team_id: req.auth.team_id,
plan: req.auth.plan!,
plan: req.auth.plan,
crawlerOptions: null,
scrapeOptions: req.body,
pageOptions,
extractorOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
v1: true,
webhook: req.body.webhook,
},
opts: {
jobId: uuidv4(),

@@ -78,7 +82,6 @@ export async function batchScrapeController(

await lockURLs(
id,
sc,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(

@@ -87,10 +90,6 @@ export async function batchScrapeController(
);
await addScrapeJobs(jobs);

if(req.body.webhook) {
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started");
}

const protocol = process.env.ENV === "local" ? req.protocol : "https";

return res.status(200).json({

@@ -1,6 +1,6 @@
import { Response } from "express";
import { supabase_service } from "../../services/supabase";
import { logger } from "../../lib/logger";
import { Logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";

@@ -36,7 +36,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
sc.cancelled = true;
await saveCrawl(req.params.jobId, sc);
} catch (error) {
logger.error(error);
Logger.error(error);
}

res.json({

@@ -44,7 +44,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
});
} catch (error) {
Sentry.captureException(error);
logger.error(error);
Logger.error(error);
return res.status(500).json({ error: error.message });
}
}

@@ -1,15 +1,14 @@
import { authMiddleware } from "../../routes/v1";
import { RateLimiterMode } from "../../types";
import { authenticateUser } from "../auth";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../lib/logger";
import { Logger } from "../../lib/logger";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { getJob, getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
import { Job, JobState } from "bullmq";

type ErrorMessage = {
type: "error",

@@ -57,7 +56,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
return close(ws, 3003, { type: "error", error: "Forbidden" });
}

let doneJobIDs: string[] = [];
let doneJobIDs = [];
let finished = false;

const loop = async () => {

@@ -71,14 +70,15 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara

const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
const newlyDoneJobIDs: string[] = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
const newlyDoneJobs: Job[] = (await Promise.all(newlyDoneJobIDs.map(x => getJob(x)))).filter(x => x !== undefined) as Job[]
const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);

for (const jobID of newlyDoneJobIDs) {
const job = await getJob(jobID);

for (const job of newlyDoneJobs) {
if (job.returnvalue) {
send(ws, {
type: "document",
data: job.returnvalue,
data: legacyDocumentConverter(job.returnvalue),
})
} else {
return close(ws, 3000, { type: "error", error: job.failedReason });

@@ -100,8 +100,8 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara

const throttledJobsSet = new Set(throttledJobs);

const validJobStatuses: [string, JobState | "unknown"][] = [];
const validJobIDs: string[] = [];
const validJobStatuses = [];
const validJobIDs = [];

for (const [id, status] of jobStatuses) {
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {

@@ -126,7 +126,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
completed: doneJobIDs.length,
creditsUsed: jobIDs.length,
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
data: data,
data: data.map(x => legacyDocumentConverter(x)),
}
});

@@ -139,21 +139,19 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
// Basically just middleware and error wrapping
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
try {
const auth = await authenticateUser(
const { success, team_id, error, status, plan } = await authenticateUser(
req,
null,
RateLimiterMode.CrawlStatus,
);

if (!auth.success) {
if (!success) {
return close(ws, 3000, {
type: "error",
error: auth.error,
error,
});
}

const { team_id, plan } = auth;

req.auth = { team_id, plan };

await crawlStatusWS(ws, req);

@@ -172,10 +170,10 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
}
}

logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
return close(ws, 1011, {
type: "error",
error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
});
}
}

@@ -1,10 +1,9 @@
import { Response } from "express";
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types";
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
import { Job, JobState } from "bullmq";
configDotenv();

export async function getJob(id: string) {

@@ -25,7 +24,7 @@ export async function getJob(id: string) {
}

export async function getJobs(ids: string[]) {
const jobs: (Job & { id: string })[] = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as (Job & {id: string})[];
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);

if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsById(ids);

@@ -64,8 +63,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara

const throttledJobsSet = new Set(throttledJobs);

const validJobStatuses: [string, JobState | "unknown"][] = [];
const validJobIDs: string[] = [];
const validJobStatuses = [];
const validJobIDs = [];

for (const [id, status] of jobStatuses) {
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {

@@ -82,7 +81,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);

let doneJobs: Job[] = [];
let doneJobs = [];

if (end === undefined) { // determine 10 megabyte limit
let bytes = 0;

@@ -99,7 +98,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
const job = jobs[ii];
doneJobs.push(job);
bytes += JSON.stringify(job.returnvalue).length;
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
}
}

@@ -123,7 +122,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
}

if (data.length > 0) {
if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) {
if (!doneJobs[0].data.pageOptions.includeRawHtml) {
for (let ii = 0; ii < doneJobs.length; ii++) {
if (data[ii]) {
delete data[ii].rawHtml;

@@ -143,7 +142,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
? undefined
: nextURL.href,
data: data,
data: data.map(x => legacyDocumentConverter(x)),
});
}

@@ -4,8 +4,9 @@ import {
CrawlRequest,
crawlRequestSchema,
CrawlResponse,
legacyCrawlerOptions,
legacyScrapeOptions,
RequestWithAuth,
toLegacyCrawlerOptions,
} from "./types";
import {
addCrawlJob,

@@ -19,10 +20,9 @@ import {
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
import { logger } from "../../lib/logger";
import { Logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";
import { scrapeOptions as scrapeOptionsSchema } from "./types";

export async function crawlController(
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,

@@ -34,22 +34,18 @@ export async function crawlController(

await logCrawl(id, req.auth.team_id);

let { remainingCredits } = req.account!;
let { remainingCredits } = req.account;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){
remainingCredits = Infinity;
}

const crawlerOptions = {
...req.body,
url: undefined,
scrapeOptions: undefined,
};
const scrapeOptions = req.body.scrapeOptions;
const crawlerOptions = legacyCrawlerOptions(req.body);
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);

// TODO: @rafa, is this right? copied from v0
if (Array.isArray(crawlerOptions.includePaths)) {
for (const x of crawlerOptions.includePaths) {
if (Array.isArray(crawlerOptions.includes)) {
for (const x of crawlerOptions.includes) {
try {
new RegExp(x);
} catch (e) {

@@ -58,8 +54,8 @@ export async function crawlController(
}
}

if (Array.isArray(crawlerOptions.excludePaths)) {
for (const x of crawlerOptions.excludePaths) {
if (Array.isArray(crawlerOptions.excludes)) {
for (const x of crawlerOptions.excludes) {
try {
new RegExp(x);
} catch (e) {

@@ -72,9 +68,8 @@ export async function crawlController(

const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
scrapeOptions,
internalOptions: {},
crawlerOptions,
pageOptions,
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,

@@ -83,9 +78,9 @@ export async function crawlController(
const crawler = crawlToCrawler(id, sc);

try {
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
} catch (e) {
logger.debug(
Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
e
)}`

@@ -117,7 +112,7 @@ export async function crawlController(
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions,
scrapeOptions,
pageOptions,
origin: "api",
crawl_id: id,
sitemapped: true,

@@ -133,7 +128,6 @@ export async function crawlController(

await lockURLs(
id,
sc,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(

@@ -148,10 +142,10 @@ export async function crawlController(
{
url: req.body.url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: req.auth.team_id,
crawlerOptions,
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
plan: req.auth.plan!,
plan: req.auth.plan,
pageOptions: pageOptions,
origin: "api",
crawl_id: id,
webhook: req.body.webhook,

@@ -1,6 +1,10 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import { mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
import {
legacyCrawlerOptions,
mapRequestSchema,
RequestWithAuth,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";

@@ -14,11 +18,11 @@ import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { performCosineSimilarity } from "../../lib/map-cosine";
import { logger } from "../../lib/logger";
import { Logger } from "../../lib/logger";
import Redis from "ioredis";

configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
const redis = new Redis(process.env.REDIS_URL);

// Max Links that /map can return
const MAX_MAP_LIMIT = 5000;

@@ -40,13 +44,8 @@ export async function mapController(

const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions: {
...req.body,
limit: req.body.sitemapOnly ? 10000000 : limit,
scrapeOptions: undefined,
},
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
crawlerOptions: legacyCrawlerOptions(req.body),
pageOptions: {},
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,

@@ -54,93 +53,77 @@ export async function mapController(

const crawler = crawlToCrawler(id, sc);

// If sitemapOnly is true, only get links from sitemap
if (req.body.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
links = links.slice(1, limit);
}
let urlWithoutWww = req.body.url.replace("www.", "");

let mapUrl = req.body.search
? `"${req.body.search}" site:${urlWithoutWww}`
: `site:${req.body.url}`;

const resultsPerPage = 100;
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);

const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;

let allResults: any[];
let pagePromises: Promise<any>[];

if (cachedResult) {
allResults = JSON.parse(cachedResult);
} else {
let urlWithoutWww = req.body.url.replace("www.", "");
const fetchPage = async (page: number) => {
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page,
});
};

let mapUrl = req.body.search
? `"${req.body.search}" site:${urlWithoutWww}`
: `site:${req.body.url}`;
pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
allResults = await Promise.all(pagePromises);

const resultsPerPage = 100;
const maxPages = Math.ceil(
Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
);
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
}

const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
...(cachedResult ? [] : pagePromises),
]);

let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
if (!cachedResult) {
allResults = searchResults;
}

if (cachedResult) {
allResults = JSON.parse(cachedResult);
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}

let mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);

const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
if (mapResults.length > minumumCutoff) {
mapResults = mapResults.slice(0, minumumCutoff);
}

if (mapResults.length > 0) {
if (req.body.search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
} else {
const fetchPage = async (page: number) => {
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page,
});
};

pagePromises = Array.from({ length: maxPages }, (_, i) =>
fetchPage(i + 1)
);
allResults = await Promise.all(pagePromises);

await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
}

// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true),
...(cachedResult ? [] : pagePromises),
]);

if (!cachedResult) {
allResults = searchResults;
}

if (sitemap !== null) {
sitemap.forEach((x) => {
mapResults.map((x) => {
links.push(x.url);
});
}

let mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);

const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
if (mapResults.length > minumumCutoff) {
mapResults = mapResults.slice(0, minumumCutoff);
}

if (mapResults.length > 0) {
if (req.body.search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
} else {
mapResults.map((x) => {
links.push(x.url);
});
}
}

}

// Perform cosine similarity between the search query and the list of links
if (req.body.search) {
const searchQuery = req.body.search.toLowerCase();

@@ -156,7 +139,7 @@ export async function mapController(
return null;
}
})
.filter((x) => x !== null) as string[];
.filter((x) => x !== null);

// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.url));

@@ -170,7 +153,7 @@ export async function mapController(
links = removeDuplicateUrls(links);

billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
logger.error(
Logger.error(
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
);
// Optionally, you could notify an admin or add to a retry queue here

@@ -192,8 +175,9 @@ export async function mapController(
mode: "map",
url: req.body.url,
crawlerOptions: {},
scrapeOptions: {},
pageOptions: {},
origin: req.body.origin,
extractor_options: { mode: "markdown" },
num_tokens: 0,
});

@@ -12,7 +12,7 @@ export async function scrapeStatusController(req: any, res: any) {

const job = await supabaseGetJobByIdOnlyData(req.params.jobId);

if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
return res.status(403).json({
success: false,
error: "You are not allowed to access this resource.",

@@ -1,7 +1,10 @@
import { Response } from "express";
import { logger } from "../../lib/logger";
import { Request, Response } from "express";
import { Logger } from "../../lib/logger";
import {
Document,
legacyDocumentConverter,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth,
ScrapeRequest,
scrapeRequestSchema,

@@ -9,6 +12,7 @@ import {
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";

@@ -24,6 +28,8 @@ export async function scrapeController(

const origin = req.body.origin;
const timeout = req.body.timeout;
const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const jobId = uuidv4();

const startTime = new Date().getTime();

@@ -37,10 +43,11 @@ export async function scrapeController(
{
url: req.body.url,
mode: "single_urls",
crawlerOptions: {},
team_id: req.auth.team_id,
scrapeOptions: req.body,
internalOptions: {},
plan: req.auth.plan!,
plan: req.auth.plan,
pageOptions,
extractorOptions,
origin: req.body.origin,
is_scrape: true,
},

@@ -49,14 +56,14 @@ export async function scrapeController(
jobPriority
);

const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);

let doc: Document;
let doc: any | undefined;
try {
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
doc = (await waitForJob(jobId, timeout + totalWait))[0];
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
Logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && e.message.startsWith("Job wait")) {
return res.status(408).json({
success: false,
error: "Request timed out",

@@ -64,19 +71,34 @@ export async function scrapeController(
} else {
return res.status(500).json({
success: false,
error: `(Internal server error) - ${(e && e.message) ? e.message : e}`,
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
extractorOptions && extractorOptions.mode !== "markdown"
? " - Could be due to LLM parsing issues"
: ""
}`,
});
}
}

await getScrapeQueue().remove(jobId);

if (!doc) {
console.error("!!! PANIC DOC IS", doc);
return res.status(200).json({
success: true,
warning: "No page found",
data: doc,
});
}

delete doc.index;
delete doc.provider;

const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
doc && doc.extract
// ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
? 0 // TODO: fix
doc && doc.markdown
? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
: 0;

let creditsToBeBilled = 1; // Assuming 1 credit per document

@@ -89,16 +111,22 @@ export async function scrapeController(
}

billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});

if (!req.body.formats.includes("rawHtml")) {
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && doc.rawHtml) {
delete doc.rawHtml;
}
}

if(pageOptions && pageOptions.includeExtract) {
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
delete doc.markdown;
}
}

logJob({
job_id: jobId,
success: true,

@@ -109,14 +137,16 @@ export async function scrapeController(
team_id: req.auth.team_id,
mode: "scrape",
url: req.body.url,
scrapeOptions: req.body,
crawlerOptions: {},
pageOptions: pageOptions,
origin: origin,
extractor_options: extractorOptions,
num_tokens: numTokens,
});

return res.status(200).json({
success: true,
data: doc,
data: legacyDocumentConverter(doc),
scrape_id: origin?.includes("website") ? jobId : undefined,
});
}

@ -1,11 +1,10 @@
|
|||
import { Request, Response } from "express";
|
||||
import { z } from "zod";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
|
||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||
import { PlanType } from "../../types";
|
||||
import { countries } from "../../lib/validate-country";
|
||||
import { ExtractorOptions, PageOptions, ScrapeActionContent, Document as V0Document } from "../../lib/entities";
|
||||
import { InternalOptions } from "../../scraper/scrapeURL";
|
||||
|
||||
export type Format =
|
||||
| "markdown"
|
||||
|
@ -53,7 +52,7 @@ const strictMessage = "Unrecognized key in body -- please review the v1 API docu
|
|||
export const extractOptions = z.object({
|
||||
mode: z.enum(["llm"]).default("llm"),
|
||||
schema: z.any().optional(),
|
||||
systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."),
|
||||
systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."),
|
||||
prompt: z.string().optional()
|
||||
}).strict(strictMessage);
|
||||
|
||||
|
@ -88,16 +87,11 @@ export const actionsSchema = z.array(z.union([
|
|||
}),
|
||||
z.object({
|
||||
type: z.literal("scroll"),
|
||||
direction: z.enum(["up", "down"]).optional().default("down"),
|
||||
selector: z.string().optional(),
|
||||
direction: z.enum(["up", "down"]),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("scrape"),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal("executeJavascript"),
|
||||
script: z.string()
|
||||
}),
|
||||
]));
|
||||
|
||||
export const scrapeOptions = z.object({
|
||||
|
@ -173,23 +167,10 @@ export const scrapeRequestSchema = scrapeOptions.extend({
|
|||
});
|
||||
|
||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
|
||||
|
||||
export const webhookSchema = z.preprocess(x => {
  if (typeof x === "string") {
    return { url: x };
  } else {
    return x;
  }
}, z.object({
  url: z.string().url(),
  headers: z.record(z.string(), z.string()).default({}),
}).strict(strictMessage))
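For context, the z.preprocess wrapper above lets callers pass either a bare URL string or a { url, headers } object; a minimal usage sketch, assuming the schema exactly as shown (the URLs and header names here are illustrative):

// Illustrative only: both call shapes normalize to the same object.
const a = webhookSchema.parse("https://example.com/hook");
// -> { url: "https://example.com/hook", headers: {} }
const b = webhookSchema.parse({ url: "https://example.com/hook", headers: { "x-secret": "abc" } });
// -> headers are kept; unrecognized keys are rejected by .strict(strictMessage)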
|
||||
export const batchScrapeRequestSchema = scrapeOptions.extend({
|
||||
urls: url.array(),
|
||||
origin: z.string().optional().default("api"),
|
||||
webhook: webhookSchema.optional(),
|
||||
}).strict(strictMessage).refine(
|
||||
(obj) => {
|
||||
const hasExtractFormat = obj.formats?.includes("extract");
|
||||
|
@ -216,8 +197,6 @@ const crawlerOptions = z.object({
|
|||
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
|
||||
allowExternalLinks: z.boolean().default(false),
|
||||
ignoreSitemap: z.boolean().default(true),
|
||||
deduplicateSimilarURLs: z.boolean().default(true),
|
||||
ignoreQueryParameters: z.boolean().default(false),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type CrawlerOptions = {
|
||||
|
@ -236,7 +215,7 @@ export const crawlRequestSchema = crawlerOptions.extend({
|
|||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
|
||||
webhook: webhookSchema.optional(),
|
||||
webhook: z.string().url().optional(),
|
||||
limit: z.number().default(10000),
|
||||
}).strict(strictMessage);
|
||||
|
||||
|
@ -261,8 +240,7 @@ export const mapRequestSchema = crawlerOptions.extend({
|
|||
includeSubdomains: z.boolean().default(true),
|
||||
search: z.string().optional(),
|
||||
ignoreSitemap: z.boolean().default(false),
|
||||
sitemapOnly: z.boolean().default(false),
|
||||
limit: z.number().min(1).max(5000).default(5000),
|
||||
limit: z.number().min(1).max(5000).default(5000).optional(),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type MapRequest = {
|
||||
|
@ -274,14 +252,13 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;
|
|||
|
||||
export type Document = {
|
||||
markdown?: string;
|
||||
extract?: any;
|
||||
extract?: string;
|
||||
html?: string;
|
||||
rawHtml?: string;
|
||||
links?: string[];
|
||||
screenshot?: string;
|
||||
actions?: {
|
||||
screenshots?: string[];
|
||||
scrapes?: ScrapeActionContent[];
|
||||
screenshots: string[];
|
||||
};
|
||||
warning?: string;
|
||||
metadata: {
|
||||
|
@ -314,11 +291,11 @@ export type Document = {
|
|||
publishedTime?: string;
|
||||
articleTag?: string;
|
||||
articleSection?: string;
|
||||
url?: string;
|
||||
sourceURL?: string;
|
||||
statusCode?: number;
|
||||
error?: string;
|
||||
[key: string]: string | string[] | number | undefined;
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -389,7 +366,7 @@ export type CrawlStatusResponse =
|
|||
|
||||
type AuthObject = {
|
||||
team_id: string;
|
||||
plan: PlanType | undefined;
|
||||
plan: PlanType;
|
||||
};
|
||||
|
||||
type Account = {
|
||||
|
@ -462,7 +439,7 @@ export interface ResponseWithSentry<
|
|||
sentry?: string,
|
||||
}
|
||||
|
||||
export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
||||
export function legacyCrawlerOptions(x: CrawlerOptions) {
|
||||
return {
|
||||
includes: x.includePaths,
|
||||
excludes: x.excludePaths,
|
||||
|
@ -473,96 +450,71 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
|||
allowBackwardCrawling: x.allowBackwardLinks,
|
||||
allowExternalContentLinks: x.allowExternalLinks,
|
||||
ignoreSitemap: x.ignoreSitemap,
|
||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||
ignoreQueryParameters: x.ignoreQueryParameters,
|
||||
};
|
||||
}
|
||||
|
||||
export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } {
|
||||
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
||||
return {
|
||||
crawlOptions: crawlerOptions.parse({
|
||||
includePaths: x.includes,
|
||||
excludePaths: x.excludes,
|
||||
limit: x.maxCrawledLinks ?? x.limit,
|
||||
maxDepth: x.maxDepth,
|
||||
allowBackwardLinks: x.allowBackwardCrawling,
|
||||
allowExternalLinks: x.allowExternalContentLinks,
|
||||
ignoreSitemap: x.ignoreSitemap,
|
||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||
ignoreQueryParameters: x.ignoreQueryParameters,
|
||||
}),
|
||||
internalOptions: {
|
||||
v0CrawlOnlyUrls: x.returnOnlyUrls,
|
||||
},
|
||||
includeMarkdown: x.formats.includes("markdown"),
|
||||
includeHtml: x.formats.includes("html"),
|
||||
includeRawHtml: x.formats.includes("rawHtml"),
|
||||
includeExtract: x.formats.includes("extract"),
|
||||
onlyIncludeTags: x.includeTags,
|
||||
removeTags: x.excludeTags,
|
||||
onlyMainContent: x.onlyMainContent,
|
||||
waitFor: x.waitFor,
|
||||
headers: x.headers,
|
||||
includeLinks: x.formats.includes("links"),
|
||||
screenshot: x.formats.includes("screenshot"),
|
||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||
parsePDF: x.parsePDF,
|
||||
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
||||
geolocation: x.location ?? x.geolocation,
|
||||
skipTlsVerification: x.skipTlsVerification,
|
||||
removeBase64Images: x.removeBase64Images,
|
||||
mobile: x.mobile,
|
||||
};
|
||||
}
|
||||
|
||||
export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
|
||||
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
|
||||
return {
|
||||
scrapeOptions: scrapeOptions.parse({
|
||||
formats: [
|
||||
(pageOptions.includeMarkdown ?? true) ? "markdown" as const : null,
|
||||
(pageOptions.includeHtml ?? false) ? "html" as const : null,
|
||||
(pageOptions.includeRawHtml ?? false) ? "rawHtml" as const : null,
|
||||
(pageOptions.screenshot ?? false) ? "screenshot" as const : null,
|
||||
(pageOptions.fullPageScreenshot ?? false) ? "screenshot@fullPage" as const : null,
|
||||
(extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction")) ? "extract" as const : null,
|
||||
"links"
|
||||
].filter(x => x !== null),
|
||||
waitFor: pageOptions.waitFor,
|
||||
headers: pageOptions.headers,
|
||||
includeTags: (typeof pageOptions.onlyIncludeTags === "string" ? [pageOptions.onlyIncludeTags] : pageOptions.onlyIncludeTags),
|
||||
excludeTags: (typeof pageOptions.removeTags === "string" ? [pageOptions.removeTags] : pageOptions.removeTags),
|
||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||
timeout: timeout,
|
||||
parsePDF: pageOptions.parsePDF,
|
||||
actions: pageOptions.actions,
|
||||
location: pageOptions.geolocation,
|
||||
skipTlsVerification: pageOptions.skipTlsVerification,
|
||||
removeBase64Images: pageOptions.removeBase64Images,
|
||||
extract: extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction") ? {
|
||||
systemPrompt: extractorOptions.extractionPrompt,
|
||||
prompt: extractorOptions.userPrompt,
|
||||
schema: extractorOptions.extractionSchema,
|
||||
} : undefined,
|
||||
mobile: pageOptions.mobile,
|
||||
}),
|
||||
internalOptions: {
|
||||
atsv: pageOptions.atsv,
|
||||
v0DisableJsDom: pageOptions.disableJsDom,
|
||||
v0UseFastMode: pageOptions.useFastMode,
|
||||
},
|
||||
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
|
||||
}
|
||||
mode: x.mode ? "llm-extraction" : "markdown",
|
||||
extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
|
||||
extractionSchema: x.schema,
|
||||
userPrompt: x.prompt ?? "",
|
||||
};
|
||||
}
|
||||
|
||||
export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} {
|
||||
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
|
||||
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
|
||||
return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
|
||||
}
|
||||
export function legacyDocumentConverter(doc: any): Document {
|
||||
if (doc === null || doc === undefined) return null;
|
||||
|
||||
export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } {
|
||||
if (internalOptions.v0CrawlOnlyUrls) {
|
||||
return { url: document.metadata.sourceURL! };
|
||||
if (doc.metadata) {
|
||||
if (doc.metadata.screenshot) {
|
||||
doc.screenshot = doc.metadata.screenshot;
|
||||
delete doc.metadata.screenshot;
|
||||
}
|
||||
|
||||
if (doc.metadata.fullPageScreenshot) {
|
||||
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
|
||||
delete doc.metadata.fullPageScreenshot;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
content: document.markdown!,
|
||||
markdown: document.markdown!,
|
||||
html: document.html,
|
||||
rawHtml: document.rawHtml,
|
||||
linksOnPage: document.links,
|
||||
llm_extraction: document.extract,
|
||||
markdown: doc.markdown,
|
||||
links: doc.linksOnPage,
|
||||
rawHtml: doc.rawHtml,
|
||||
html: doc.html,
|
||||
extract: doc.llm_extraction,
|
||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
||||
actions: doc.actions ?? undefined,
|
||||
warning: doc.warning ?? undefined,
|
||||
metadata: {
|
||||
...document.metadata,
|
||||
error: undefined,
|
||||
statusCode: undefined,
|
||||
pageError: document.metadata.error,
|
||||
pageStatusCode: document.metadata.statusCode,
|
||||
screenshot: document.screenshot,
|
||||
...doc.metadata,
|
||||
pageError: undefined,
|
||||
pageStatusCode: undefined,
|
||||
error: doc.metadata?.pageError,
|
||||
statusCode: doc.metadata?.pageStatusCode,
|
||||
},
|
||||
actions: document.actions ,
|
||||
warning: document.warning,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
apps/api/src/example.ts (new file, 19 lines)
|
@@ -0,0 +1,19 @@
import { WebScraperDataProvider } from "./scraper/WebScraper";

async function example() {
  const example = new WebScraperDataProvider();

  await example.setOptions({
    jobId: "TEST",
    mode: "crawl",
    urls: ["https://mendable.ai"],
    crawlerOptions: {},
  });
  const docs = await example.getDocuments(false);
  docs.map((doc) => {
    console.log(doc.metadata.sourceURL);
  });
  console.log(docs.length);
}

// example();
|
@ -6,24 +6,28 @@ import bodyParser from "body-parser";
|
|||
import cors from "cors";
|
||||
import { getScrapeQueue } from "./services/queue-service";
|
||||
import { v0Router } from "./routes/v0";
|
||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||
import os from "os";
|
||||
import { logger } from "./lib/logger";
|
||||
import { Logger } from "./lib/logger";
|
||||
import { adminRouter } from "./routes/admin";
|
||||
import { ScrapeEvents } from "./lib/scrape-events";
|
||||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import CacheableLookup from 'cacheable-lookup';
|
||||
import { v1Router } from "./routes/v1";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
|
||||
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||
import { ZodError } from "zod";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import dns from 'node:dns';
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
const { ExpressAdapter } = require("@bull-board/express");
|
||||
|
||||
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
||||
logger.info(`Number of CPUs: ${numCPUs} available`);
|
||||
Logger.info(`Number of CPUs: ${numCPUs} available`);
|
||||
|
||||
const cacheable = new CacheableLookup()
|
||||
|
||||
|
@ -51,6 +55,7 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
|||
serverAdapter: serverAdapter,
|
||||
});
|
||||
|
||||
|
||||
app.use(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
||||
serverAdapter.getRouter()
|
||||
|
@ -73,24 +78,18 @@ app.use(adminRouter);
|
|||
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
const HOST = process.env.HOST ?? "localhost";
|
||||
|
||||
// HyperDX OpenTelemetry
|
||||
if (process.env.ENV === "production") {
|
||||
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
|
||||
}
|
||||
|
||||
function startServer(port = DEFAULT_PORT) {
|
||||
const server = app.listen(Number(port), HOST, () => {
|
||||
logger.info(`Worker ${process.pid} listening on port ${port}`);
|
||||
logger.info(
|
||||
Logger.info(`Worker ${process.pid} listening on port ${port}`);
|
||||
Logger.info(
|
||||
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
||||
);
|
||||
});
|
||||
|
||||
const exitHandler = () => {
|
||||
logger.info('SIGTERM signal received: closing HTTP server')
|
||||
server.close(() => {
|
||||
logger.info("Server closed.");
|
||||
process.exit(0);
|
||||
});
|
||||
};
|
||||
|
||||
process.on('SIGTERM', exitHandler);
|
||||
process.on('SIGINT', exitHandler);
|
||||
return server;
|
||||
}
|
||||
|
||||
|
@ -104,6 +103,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
|
|||
const [waitingJobs] = await Promise.all([
|
||||
scrapeQueue.getWaitingCount(),
|
||||
]);
|
||||
|
||||
const noWaitingJobs = waitingJobs === 0;
|
||||
// 200 if no active jobs, 503 if there are active jobs
|
||||
return res.status(noWaitingJobs ? 200 : 500).json({
|
||||
|
@ -111,7 +111,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
|
|||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
logger.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
|
@ -140,7 +140,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
|||
// Re-check the waiting jobs count after the timeout
|
||||
waitingJobsCount = await getWaitingJobsCount();
|
||||
if (waitingJobsCount >= treshold) {
|
||||
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL!;
|
||||
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
|
||||
const message = {
|
||||
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
|
||||
timeout / 60000
|
||||
|
@ -156,14 +156,14 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
|||
});
|
||||
|
||||
if (!response.ok) {
|
||||
logger.error("Failed to send Slack notification");
|
||||
Logger.error("Failed to send Slack notification");
|
||||
}
|
||||
}
|
||||
}, timeout);
|
||||
}
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
logger.debug(error);
|
||||
Logger.debug(error);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -178,7 +178,7 @@ app.get("/is-production", (req, res) => {
|
|||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
|
||||
if (err instanceof ZodError) {
|
||||
if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) {
|
||||
logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
|
||||
Logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
|
||||
}
|
||||
|
||||
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
|
||||
|
@ -206,11 +206,11 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
|
|||
}
|
||||
}
|
||||
|
||||
logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id });
|
||||
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
|
||||
});
|
||||
|
||||
logger.info(`Worker ${process.pid} started`);
|
||||
Logger.info(`Worker ${process.pid} started`);
|
||||
|
||||
// const sq = getScrapeQueue();
|
||||
|
||||
|
|
|
@ -4,19 +4,19 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
|||
|
||||
import { generateOpenAICompletions } from "./models";
|
||||
import { Document, ExtractorOptions } from "../entities";
|
||||
import { logger } from "../logger";
|
||||
import { Logger } from "../logger";
|
||||
|
||||
// Generate completion using OpenAI
|
||||
export async function generateCompletions(
|
||||
documents: Document[],
|
||||
extractionOptions: ExtractorOptions | undefined,
|
||||
extractionOptions: ExtractorOptions,
|
||||
mode: "markdown" | "raw-html"
|
||||
): Promise<Document[]> {
|
||||
// const schema = zodToJsonSchema(options.schema)
|
||||
|
||||
const schema = extractionOptions?.extractionSchema;
|
||||
const systemPrompt = extractionOptions?.extractionPrompt;
|
||||
const prompt = extractionOptions?.userPrompt;
|
||||
const schema = extractionOptions.extractionSchema;
|
||||
const systemPrompt = extractionOptions.extractionPrompt;
|
||||
const prompt = extractionOptions.userPrompt;
|
||||
|
||||
const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider
|
||||
|
||||
|
@ -51,7 +51,7 @@ export async function generateCompletions(
|
|||
|
||||
return completionResult;
|
||||
} catch (error) {
|
||||
logger.error(`Error generating completions: ${error}`);
|
||||
Logger.error(`Error generating completions: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
default:
|
||||
|
|
|
@ -95,7 +95,7 @@ export async function generateOpenAICompletions({
|
|||
|
||||
try {
|
||||
llmExtraction = JSON.parse(
|
||||
(jsonCompletion.choices[0].message.content ?? "").trim()
|
||||
jsonCompletion.choices[0].message.content.trim()
|
||||
);
|
||||
} catch (e) {
|
||||
throw new Error("Invalid JSON");
|
||||
|
|
|
@@ -3,7 +3,7 @@ export async function batchProcess<T>(
  batchSize: number,
  asyncFunction: (item: T, index: number) => Promise<void>
): Promise<void> {
  const batches: T[][] = [];
  const batches = [];
  for (let i = 0; i < array.length; i += batchSize) {
    const batch = array.slice(i, i + batchSize);
    batches.push(batch);
||||
|
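A minimal usage sketch of batchProcess as declared in the hunk above (the URL list and handler body are illustrative, not from the source):

// Illustrative only: process 100 URLs, 10 at a time.
const urls = Array.from({ length: 100 }, (_, i) => `https://example.com/${i}`);
await batchProcess(urls, 10, async (url, index) => {
  console.log(`scraping #${index}: ${url}`);
});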
|
|
@@ -1,33 +0,0 @@
import { generateURLPermutations } from "./crawl-redis";

describe("generateURLPermutations", () => {
  it("generates permutations correctly", () => {
    const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href);
    expect(bareHttps.length).toBe(4);
    expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
    expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
    expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
    expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);

    const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href);
    expect(bareHttp.length).toBe(4);
    expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
    expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
    expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
    expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);

    const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href);
    expect(wwwHttps.length).toBe(4);
    expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
    expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
    expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
    expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);

    const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href);
    expect(wwwHttp.length).toBe(4);
    expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
    expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
    expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
    expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
  })
});
|
|
@ -1,17 +1,13 @@
|
|||
import { InternalOptions } from "../scraper/scrapeURL";
|
||||
import { ScrapeOptions } from "../controllers/v1/types";
|
||||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||
import { redisConnection } from "../services/queue-service";
|
||||
import { logger } from "./logger";
|
||||
import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
export type StoredCrawl = {
|
||||
originUrl?: string;
|
||||
crawlerOptions: any;
|
||||
scrapeOptions: Omit<ScrapeOptions, "timeout">;
|
||||
internalOptions: InternalOptions;
|
||||
pageOptions: any;
|
||||
team_id: string;
|
||||
plan?: string;
|
||||
plan: string;
|
||||
robots?: string;
|
||||
cancelled?: boolean;
|
||||
createdAt: number;
|
||||
|
@@ -91,74 +87,40 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
  return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
}

export function normalizeURL(url: string, sc: StoredCrawl): string {
  const urlO = new URL(url);
  if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {
    urlO.search = "";
  }
  urlO.hash = "";
  return urlO.href;
}

export function generateURLPermutations(url: string | URL): URL[] {
  const urlO = new URL(url);

  // Construct two versions, one with www., one without
  const urlWithWWW = new URL(urlO);
  const urlWithoutWWW = new URL(urlO);
  if (urlO.hostname.startsWith("www.")) {
    urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
  } else {
    urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
  }

  let permutations = [urlWithWWW, urlWithoutWWW];

  // Construct more versions for http/https
  permutations = permutations.flatMap(urlO => {
    if (!["http:", "https:"].includes(urlO.protocol)) {
      return [urlO];
    }

    const urlWithHTTP = new URL(urlO);
    const urlWithHTTPS = new URL(urlO);
    urlWithHTTP.protocol = "http:";
    urlWithHTTPS.protocol = "https:";

    return [urlWithHTTP, urlWithHTTPS];
  });

  return permutations;
}
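The behaviour of generateURLPermutations is pinned down by the test file deleted earlier in this diff: each http(s) URL expands to four hrefs. A small illustrative check, using only names shown above:

// Illustrative only: permutations of a bare https URL.
const perms = generateURLPermutations("https://firecrawl.dev").map(u => u.href);
// perms contains exactly these four entries (order not guaranteed):
// "https://firecrawl.dev/", "https://www.firecrawl.dev/",
// "http://firecrawl.dev/",  "http://www.firecrawl.dev/"
console.assert(perms.length === 4);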
|
||||
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
|
||||
if (typeof sc.crawlerOptions?.limit === "number") {
|
||||
if (await redisConnection.scard("crawl:" + id + ":visited_unique") >= sc.crawlerOptions.limit) {
|
||||
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
url = normalizeURL(url, sc);
|
||||
|
||||
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
|
||||
await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
|
||||
|
||||
let res: boolean;
|
||||
if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
|
||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||
} else {
|
||||
const permutations = generateURLPermutations(url);
|
||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
|
||||
try {
|
||||
const urlO = new URL(url);
|
||||
urlO.search = "";
|
||||
urlO.hash = "";
|
||||
url = urlO.href;
|
||||
} catch (error) {
|
||||
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||
}
|
||||
|
||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||
return res;
|
||||
}
|
||||
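One detail worth noting in the new lockURL above: Redis SADD returns the number of members actually added, so the deduplicateSimilarURLs path only treats a URL as new when every permutation was previously unseen. A hedged sketch of that check, reusing the key name and client from the code above:

// Illustrative only: "new" iff all permutations were added to the visited set.
const hrefs = generateURLPermutations(url).map(x => x.href);
const added = await redisConnection.sadd("crawl:" + id + ":visited", ...hrefs);
const isNew = added === hrefs.length;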
|
||||
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
|
||||
export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
|
||||
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
|
||||
urls = urls.map(url => {
|
||||
return normalizeURL(url, sc);
|
||||
try {
|
||||
const urlO = new URL(url);
|
||||
urlO.search = "";
|
||||
urlO.hash = "";
|
||||
return urlO.href;
|
||||
} catch (error) {
|
||||
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||
}
|
||||
|
||||
return url;
|
||||
});
|
||||
|
||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
|
||||
|
@ -166,15 +128,14 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
|
|||
return res;
|
||||
}
|
||||
|
||||
export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler {
|
||||
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
||||
const crawler = new WebCrawler({
|
||||
jobId: id,
|
||||
initialUrl: sc.originUrl!,
|
||||
baseUrl: newBase ? new URL(newBase).origin : undefined,
|
||||
initialUrl: sc.originUrl,
|
||||
includes: sc.crawlerOptions?.includes ?? [],
|
||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||
maxCrawledDepth: getAdjustedMaxDepth(sc.originUrl!, sc.crawlerOptions?.maxDepth ?? 10),
|
||||
maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10,
|
||||
limit: sc.crawlerOptions?.limit ?? 10000,
|
||||
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
||||
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
import type { Document as V1Document } from "../controllers/v1/types";
|
||||
|
||||
export interface Progress {
|
||||
current: number;
|
||||
total: number;
|
||||
|
@ -30,13 +28,9 @@ export type Action = {
|
|||
key: string,
|
||||
} | {
|
||||
type: "scroll",
|
||||
direction?: "up" | "down",
|
||||
selector?: string,
|
||||
direction: "up" | "down"
|
||||
} | {
|
||||
type: "scrape",
|
||||
} | {
|
||||
type: "executeJavascript",
|
||||
script: string,
|
||||
}
|
||||
|
||||
export type PageOptions = {
|
||||
|
@ -135,8 +129,7 @@ export class Document {
|
|||
provider?: string;
|
||||
warning?: string;
|
||||
actions?: {
|
||||
screenshots?: string[];
|
||||
scrapes?: ScrapeActionContent[];
|
||||
screenshots: string[];
|
||||
}
|
||||
|
||||
index?: number;
|
||||
|
|
|
@ -5,29 +5,23 @@ import "../services/sentry"
|
|||
import * as Sentry from "@sentry/node";
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import { logger } from './logger';
|
||||
import { stat } from 'fs/promises';
|
||||
import { Logger } from './logger';
|
||||
dotenv.config();
|
||||
|
||||
// TODO: add a timeout to the Go parser
|
||||
const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
|
||||
|
||||
class GoMarkdownConverter {
|
||||
private static instance: GoMarkdownConverter;
|
||||
private convert: any;
|
||||
|
||||
private constructor() {
|
||||
const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
|
||||
const lib = koffi.load(goExecutablePath);
|
||||
this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
|
||||
}
|
||||
|
||||
public static async getInstance(): Promise<GoMarkdownConverter> {
|
||||
public static getInstance(): GoMarkdownConverter {
|
||||
if (!GoMarkdownConverter.instance) {
|
||||
try {
|
||||
await stat(goExecutablePath);
|
||||
} catch (_) {
|
||||
throw Error("Go shared library not found");
|
||||
}
|
||||
GoMarkdownConverter.instance = new GoMarkdownConverter();
|
||||
}
|
||||
return GoMarkdownConverter.instance;
|
||||
|
@ -46,28 +40,24 @@ class GoMarkdownConverter {
|
|||
}
|
||||
}
|
||||
|
||||
export async function parseMarkdown(html: string | null | undefined): Promise<string> {
|
||||
export async function parseMarkdown(html: string): Promise<string> {
|
||||
if (!html) {
|
||||
return '';
|
||||
}
|
||||
|
||||
try {
|
||||
if (process.env.USE_GO_MARKDOWN_PARSER == "true") {
|
||||
const converter = await GoMarkdownConverter.getInstance();
|
||||
const converter = GoMarkdownConverter.getInstance();
|
||||
let markdownContent = await converter.convertHTMLToMarkdown(html);
|
||||
|
||||
markdownContent = processMultiLineLinks(markdownContent);
|
||||
markdownContent = removeSkipToContentLinks(markdownContent);
|
||||
logger.info(`HTML to Markdown conversion using Go parser successful`);
|
||||
Logger.info(`HTML to Markdown conversion using Go parser successful`);
|
||||
return markdownContent;
|
||||
}
|
||||
} catch (error) {
|
||||
if (!(error instanceof Error) || error.message !== "Go shared library not found") {
|
||||
Sentry.captureException(error);
|
||||
logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
|
||||
} else {
|
||||
logger.warn("Tried to use Go parser, but it doesn't exist in the file system.", { goExecutablePath });
|
||||
}
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
|
||||
}
|
||||
|
||||
// Fallback to TurndownService if Go parser fails or is not enabled
|
||||
|
@ -99,7 +89,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
|
|||
|
||||
return markdownContent;
|
||||
} catch (error) {
|
||||
logger.error("Error converting HTML to Markdown", {error});
|
||||
console.error("Error converting HTML to Markdown: ", error);
|
||||
return ""; // Optionally return an empty string or handle the error as needed
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import { redisConnection } from "../../src/services/queue-service";
|
||||
import { PlanType } from "../../src/types";
|
||||
import { logger } from "./logger";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
const SET_KEY_PREFIX = "limit_team_id:";
|
||||
export async function addJobPriority(team_id, job_id) {
|
||||
|
@ -13,7 +13,7 @@ export async function addJobPriority(team_id, job_id) {
|
|||
// This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
|
||||
await redisConnection.expire(setKey, 60);
|
||||
} catch (e) {
|
||||
logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
|
||||
Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -24,7 +24,7 @@ export async function deleteJobPriority(team_id, job_id) {
|
|||
// remove job_id from the set
|
||||
await redisConnection.srem(setKey, job_id);
|
||||
} catch (e) {
|
||||
logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
|
||||
Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -33,7 +33,7 @@ export async function getJobPriority({
|
|||
team_id,
|
||||
basePriority = 10,
|
||||
}: {
|
||||
plan: PlanType | undefined;
|
||||
plan: PlanType;
|
||||
team_id: string;
|
||||
basePriority?: number;
|
||||
}): Promise<number> {
|
||||
|
@ -95,7 +95,7 @@ export async function getJobPriority({
|
|||
);
|
||||
}
|
||||
} catch (e) {
|
||||
logger.error(
|
||||
Logger.error(
|
||||
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
|
||||
);
|
||||
return basePriority;
|
||||
|
|
apps/api/src/lib/load-testing-example.ts (new file, 42 lines)
@@ -0,0 +1,42 @@
|
|||
// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
|
||||
|
||||
// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
|
||||
|
||||
// const scrapInBatches = async (
|
||||
// urls: string[],
|
||||
// batchSize: number,
|
||||
// delayMs: number
|
||||
// ) => {
|
||||
// let successCount = 0;
|
||||
// let errorCount = 0;
|
||||
|
||||
// for (let i = 0; i < urls.length; i += batchSize) {
|
||||
// const batch = urls
|
||||
// .slice(i, i + batchSize)
|
||||
// .map((url) => scrapWithFireEngine(url));
|
||||
// try {
|
||||
// const results = await Promise.all(batch);
|
||||
// results.forEach((data, index) => {
|
||||
// if (data.trim() === "") {
|
||||
// errorCount++;
|
||||
// } else {
|
||||
// successCount++;
|
||||
// console.log(
|
||||
// `Scraping result ${i + index + 1}:`,
|
||||
// data.trim().substring(0, 20) + "..."
|
||||
// );
|
||||
// }
|
||||
// });
|
||||
// } catch (error) {
|
||||
// console.error("Error during scraping:", error);
|
||||
// }
|
||||
// await delay(delayMs);
|
||||
// }
|
||||
|
||||
// console.log(`Total successful scrapes: ${successCount}`);
|
||||
// console.log(`Total errored scrapes: ${errorCount}`);
|
||||
// };
|
||||
// function run() {
|
||||
// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
|
||||
// scrapInBatches(urls, 10, 1000);
|
||||
// }
|
|
@ -1,51 +1,57 @@
|
|||
import * as winston from "winston";
|
||||
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
const logFormat = winston.format.printf(info =>
|
||||
`${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${info.level.includes("error") || info.level.includes("warn") ? JSON.stringify(
|
||||
info.metadata,
|
||||
(_, value) => {
|
||||
if (value instanceof Error) {
|
||||
return {
|
||||
...value,
|
||||
name: value.name,
|
||||
message: value.message,
|
||||
stack: value.stack,
|
||||
cause: value.cause,
|
||||
}
|
||||
} else {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
) : ""}`
|
||||
)
|
||||
enum LogLevel {
|
||||
NONE = 'NONE', // No logs will be output.
|
||||
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
|
||||
WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors.
|
||||
INFO = 'INFO', // For logging informational messages that highlight the progress of the application.
|
||||
DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging.
|
||||
TRACE = 'TRACE' // For logging more detailed information than the DEBUG level.
|
||||
}
|
||||
export class Logger {
|
||||
static colors = {
|
||||
ERROR: '\x1b[31m%s\x1b[0m', // Red
|
||||
WARN: '\x1b[33m%s\x1b[0m', // Yellow
|
||||
INFO: '\x1b[34m%s\x1b[0m', // Blue
|
||||
DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
|
||||
TRACE: '\x1b[35m%s\x1b[0m' // Magenta
|
||||
};
|
||||
|
||||
export const logger = winston.createLogger({
|
||||
level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug",
|
||||
format: winston.format.json({
|
||||
replacer(key, value) {
|
||||
if (value instanceof Error) {
|
||||
return {
|
||||
...value,
|
||||
name: value.name,
|
||||
message: value.message,
|
||||
stack: value.stack,
|
||||
cause: value.cause,
|
||||
}
|
||||
} else {
|
||||
return value;
|
||||
}
|
||||
static log (message: string, level: LogLevel) {
|
||||
const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE;
|
||||
const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
|
||||
const currentLevelIndex = levels.indexOf(logLevel);
|
||||
const messageLevelIndex = levels.indexOf(level);
|
||||
|
||||
if (currentLevelIndex >= messageLevelIndex) {
|
||||
const color = Logger.colors[level];
|
||||
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
|
||||
|
||||
// const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
// if (useDbAuthentication) {
|
||||
// save to supabase? another place?
|
||||
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
|
||||
// }
|
||||
}
|
||||
}),
|
||||
transports: [
|
||||
new winston.transports.Console({
|
||||
format: winston.format.combine(
|
||||
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
||||
winston.format.metadata({ fillExcept: ["message", "level", "timestamp"] }),
|
||||
...(((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || (process.env.ENV !== "production")) ? [winston.format.colorize(), logFormat] : []),
|
||||
),
|
||||
}),
|
||||
],
|
||||
});
|
||||
}
|
||||
static error(message: string | any) {
|
||||
Logger.log(message, LogLevel.ERROR);
|
||||
}
|
||||
|
||||
static warn(message: string) {
|
||||
Logger.log(message, LogLevel.WARN);
|
||||
}
|
||||
|
||||
static info(message: string) {
|
||||
Logger.log(message, LogLevel.INFO);
|
||||
}
|
||||
|
||||
static debug(message: string) {
|
||||
Logger.log(message, LogLevel.DEBUG);
|
||||
}
|
||||
|
||||
static trace(message: string) {
|
||||
Logger.log(message, LogLevel.TRACE);
|
||||
}
|
||||
}
|
||||
|
|
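The winston-based logger above gathers everything except message, level, and timestamp into a metadata object and prints module/method from it; a minimal usage sketch under that assumption (the import path and field values are illustrative):

// Illustrative only: structured metadata read by the printf format above.
import { logger } from "./lib/logger";

logger.info("Worker started", { module: "index", method: "startServer" });
logger.error("Scrape failed", { module: "scrapeController", method: "scrape", error: new Error("timeout") });
// Error values are serialized (name, message, stack, cause) by the replacer shown above.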
|
@ -1,4 +1,4 @@
|
|||
import { logger } from "./logger";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
export function performCosineSimilarity(links: string[], searchQuery: string) {
|
||||
try {
|
||||
|
@ -40,7 +40,7 @@ export function performCosineSimilarity(links: string[], searchQuery: string) {
|
|||
links = a.map((item) => item.link);
|
||||
return links;
|
||||
} catch (error) {
|
||||
logger.error(`Error performing cosine similarity: ${error}`);
|
||||
Logger.error(`Error performing cosine similarity: ${error}`);
|
||||
return links;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import { Job } from "bullmq";
|
||||
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
||||
import { supabase_service as supabase } from "../services/supabase";
|
||||
import { logger } from "./logger";
|
||||
import { Logger } from "./logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { Engine } from "../scraper/scrapeURL/engines";
|
||||
configDotenv();
|
||||
|
||||
export type ScrapeErrorEvent = {
|
||||
|
@ -15,7 +15,7 @@ export type ScrapeScrapeEvent = {
|
|||
type: "scrape",
|
||||
url: string,
|
||||
worker?: string,
|
||||
method: Engine,
|
||||
method: (typeof baseScrapers)[number],
|
||||
result: null | {
|
||||
success: boolean,
|
||||
response_code?: number,
|
||||
|
@ -49,7 +49,7 @@ export class ScrapeEvents {
|
|||
}).select().single();
|
||||
return (result.data as any).id;
|
||||
} catch (error) {
|
||||
// logger.error(`Error inserting scrape event: ${error}`);
|
||||
// Logger.error(`Error inserting scrape event: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
@ -69,7 +69,7 @@ export class ScrapeEvents {
|
|||
}
|
||||
}).eq("id", logId);
|
||||
} catch (error) {
|
||||
logger.error(`Error updating scrape result: ${error}`);
|
||||
Logger.error(`Error updating scrape result: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -81,7 +81,7 @@ export class ScrapeEvents {
|
|||
worker: process.env.FLY_MACHINE_ID,
|
||||
});
|
||||
} catch (error) {
|
||||
logger.error(`Error logging job event: ${error}`);
|
||||
Logger.error(`Error logging job event: ${error}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import { supabase_service } from "../services/supabase";
|
||||
import { logger } from "./logger";
|
||||
import { Logger } from "./logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
/**
|
||||
|
@ -37,7 +37,7 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
|
|||
.in("job_id", jobIds);
|
||||
|
||||
if (error) {
|
||||
logger.error(`Error in supabaseGetJobsById: ${error}`);
|
||||
Logger.error(`Error in supabaseGetJobsById: ${error}`);
|
||||
Sentry.captureException(error);
|
||||
return [];
|
||||
}
|
||||
|
@ -61,7 +61,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
|
|||
.eq("crawl_id", crawlId)
|
||||
|
||||
if (error) {
|
||||
logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
|
||||
Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
|
||||
Sentry.captureException(error);
|
||||
return [];
|
||||
}
|
||||
|
|
|
@ -1,25 +1,30 @@
|
|||
import { AuthResponse } from "../../src/types";
|
||||
import { logger } from "./logger";
|
||||
import { Logger } from "./logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { configDotenv } from "dotenv";
|
||||
configDotenv();
|
||||
|
||||
let warningCount = 0;
|
||||
|
||||
export function withAuth<T, U extends any[]>(
|
||||
originalFunction: (...args: U) => Promise<T>,
|
||||
mockSuccess: T,
|
||||
export function withAuth<T extends AuthResponse, U extends any[]>(
|
||||
originalFunction: (...args: U) => Promise<T>
|
||||
) {
|
||||
return async function (...args: U): Promise<T> {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (!useDbAuthentication) {
|
||||
if (warningCount < 5) {
|
||||
logger.warn("You're bypassing authentication");
|
||||
Logger.warn("You're bypassing authentication");
|
||||
warningCount++;
|
||||
}
|
||||
return { success: true } as T;
|
||||
} else {
|
||||
return await originalFunction(...args);
|
||||
try {
|
||||
return await originalFunction(...args);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(`Error in withAuth function: ${error}`);
|
||||
return { success: false, error: error.message } as T;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -1,127 +1,151 @@
|
|||
import { Job } from "bullmq";
|
||||
import {
|
||||
CrawlResult,
|
||||
WebScraperOptions,
|
||||
RunWebScraperParams,
|
||||
RunWebScraperResult,
|
||||
} from "../types";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { DocumentUrl, Progress } from "../lib/entities";
|
||||
import { billTeam } from "../services/billing/credit_billing";
|
||||
import { Document } from "../controllers/v1/types";
|
||||
import { Document } from "../lib/entities";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { logger } from "../lib/logger";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
|
||||
import { Engine } from "../scraper/scrapeURL/engines";
|
||||
configDotenv();
|
||||
|
||||
export async function startWebScraperPipeline({
|
||||
job,
|
||||
token,
|
||||
}: {
|
||||
job: Job<WebScraperOptions> & { id: string };
|
||||
job: Job<WebScraperOptions>;
|
||||
token: string;
|
||||
}) {
|
||||
let partialDocs: Document[] = [];
|
||||
return (await runWebScraper({
|
||||
url: job.data.url,
|
||||
mode: job.data.mode,
|
||||
scrapeOptions: {
|
||||
...job.data.scrapeOptions,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
extractorOptions: job.data.extractorOptions,
|
||||
pageOptions: {
|
||||
...job.data.pageOptions,
|
||||
...(job.data.crawl_id ? ({
|
||||
formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
|
||||
includeRawHtml: true,
|
||||
}): {}),
|
||||
},
|
||||
internalOptions: job.data.internalOptions,
|
||||
// onSuccess: (result, mode) => {
|
||||
// logger.debug(`🐂 Job completed ${job.id}`);
|
||||
// saveJob(job, result, token, mode);
|
||||
// },
|
||||
// onError: (error) => {
|
||||
// logger.error(`🐂 Job failed ${job.id}`);
|
||||
// ScrapeEvents.logJobEvent(job, "failed");
|
||||
// },
|
||||
inProgress: (progress) => {
|
||||
Logger.debug(`🐂 Job in progress ${job.id}`);
|
||||
if (progress.currentDocument) {
|
||||
partialDocs.push(progress.currentDocument);
|
||||
if (partialDocs.length > 50) {
|
||||
partialDocs = partialDocs.slice(-50);
|
||||
}
|
||||
// job.updateProgress({ ...progress, partialDocs: partialDocs });
|
||||
}
|
||||
},
|
||||
onSuccess: (result, mode) => {
|
||||
Logger.debug(`🐂 Job completed ${job.id}`);
|
||||
saveJob(job, result, token, mode);
|
||||
},
|
||||
onError: (error) => {
|
||||
Logger.error(`🐂 Job failed ${job.id}`);
|
||||
ScrapeEvents.logJobEvent(job, "failed");
|
||||
job.moveToFailed(error, token, false);
|
||||
},
|
||||
team_id: job.data.team_id,
|
||||
bull_job_id: job.id.toString(),
|
||||
priority: job.opts.priority,
|
||||
is_scrape: job.data.is_scrape ?? false,
|
||||
}));
|
||||
})) as { success: boolean; message: string; docs: Document[] };
|
||||
}
|
||||
|
||||
export async function runWebScraper({
|
||||
url,
|
||||
mode,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
// onSuccess,
|
||||
// onError,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
inProgress,
|
||||
onSuccess,
|
||||
onError,
|
||||
team_id,
|
||||
bull_job_id,
|
||||
priority,
|
||||
is_scrape=false,
|
||||
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
|
||||
let response: ScrapeUrlResponse | undefined = undefined;
|
||||
let engines: EngineResultsTracker = {};
|
||||
}: RunWebScraperParams): Promise<RunWebScraperResult> {
|
||||
try {
|
||||
response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
|
||||
if (!response.success) {
|
||||
if (response.error instanceof Error) {
|
||||
throw response.error;
|
||||
} else {
|
||||
throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error));
|
||||
}
|
||||
const provider = new WebScraperDataProvider();
|
||||
if (mode === "crawl") {
|
||||
await provider.setOptions({
|
||||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: [url],
|
||||
extractorOptions,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
bullJobId: bull_job_id,
|
||||
priority,
|
||||
});
|
||||
} else {
|
||||
await provider.setOptions({
|
||||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: url.split(","),
|
||||
extractorOptions,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
priority,
|
||||
teamId: team_id
|
||||
});
|
||||
}
|
||||
const docs = (await provider.getDocuments(false, (progress: Progress) => {
|
||||
inProgress(progress);
|
||||
})) as Document[];
|
||||
|
||||
if (docs.length === 0) {
|
||||
return {
|
||||
success: true,
|
||||
message: "No pages found",
|
||||
docs: [],
|
||||
};
|
||||
}
|
||||
|
||||
// remove docs with empty content
|
||||
const filteredDocs = crawlerOptions?.returnOnlyUrls
|
||||
? docs.map((doc) => {
|
||||
if (doc.metadata.sourceURL) {
|
||||
return { url: doc.metadata.sourceURL };
|
||||
}
|
||||
})
|
||||
: docs;
|
||||
|
||||
if(is_scrape === false) {
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
if (scrapeOptions.extract) {
|
||||
if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
|
||||
creditsToBeBilled = 5;
|
||||
}
|
||||
|
||||
billTeam(team_id, undefined, creditsToBeBilled).catch(error => {
|
||||
logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
||||
billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
// This is where the returnvalue from the job is set
|
||||
// onSuccess(response.document, mode);
|
||||
onSuccess(filteredDocs, mode);
|
||||
|
||||
engines = response.engines;
|
||||
return response;
|
||||
// this return doesn't matter too much for the job completion result
|
||||
return { success: true, message: "", docs: filteredDocs };
|
||||
} catch (error) {
|
||||
engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));
|
||||
|
||||
if (response !== undefined) {
|
||||
return {
|
||||
...response,
|
||||
success: false,
|
||||
error,
|
||||
}
|
||||
} else {
|
||||
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
|
||||
}
|
||||
// onError(error);
|
||||
} finally {
|
||||
const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];
|
||||
|
||||
for (const engine of engineOrder) {
|
||||
const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
|
||||
ScrapeEvents.insert(bull_job_id, {
|
||||
type: "scrape",
|
||||
url,
|
||||
method: engine,
|
||||
result: {
|
||||
success: result.state === "success",
|
||||
response_code: (result.state === "success" ? result.result.statusCode : undefined),
|
||||
response_size: (result.state === "success" ? result.result.html.length : undefined),
|
||||
error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
|
||||
time_taken: result.finishedAt - result.startedAt,
|
||||
},
|
||||
});
|
||||
}
|
||||
onError(error);
|
||||
return { success: false, message: error.message, docs: [] };
|
||||
}
|
||||
}
|
||||
|
||||
const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
|
||||
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
|
||||
try {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if (useDbAuthentication) {
|
||||
|
@ -149,6 +173,6 @@ const saveJob = async (job: Job, result: any, token: string, mode: string, engin
|
|||
}
|
||||
ScrapeEvents.logJobEvent(job, "completed");
|
||||
} catch (error) {
|
||||
logger.error(`🐂 Failed to update job status: ${error}`);
|
||||
Logger.error(`🐂 Failed to update job status: ${error}`);
|
||||
}
|
||||
};
|
||||
|
|
|
@ -6,8 +6,8 @@ import {
|
|||
cleanBefore24hCompleteJobsController,
|
||||
queuesController,
|
||||
} from "../controllers/v0/admin/queue";
|
||||
import { wrap } from "./v1";
|
||||
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
|
||||
import { wrap } from "./v1";
|
||||
|
||||
export const adminRouter = express.Router();
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ import expressWs from "express-ws";
|
|||
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
|
||||
import { logger } from "../lib/logger";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
||||
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
|
||||
import { batchScrapeController } from "../controllers/v1/batch-scrape";
|
||||
|
@ -32,12 +32,10 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
|
|||
if (!minimum && req.body) {
|
||||
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
|
||||
}
|
||||
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum ?? 1);
|
||||
if (chunk) {
|
||||
req.acuc = chunk;
|
||||
}
|
||||
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
|
||||
req.acuc = chunk;
|
||||
if (!success) {
|
||||
logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||
if (!res.headersSent) {
|
||||
return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
|
||||
}
|
||||
|
@ -52,27 +50,20 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
|
|||
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
||||
return (req, res, next) => {
|
||||
(async () => {
|
||||
const auth = await authenticateUser(
|
||||
const { success, team_id, error, status, plan, chunk } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
rateLimiterMode,
|
||||
);
|
||||
|
||||
if (!auth.success) {
|
||||
if (!success) {
|
||||
if (!res.headersSent) {
|
||||
return res.status(auth.status).json({ success: false, error: auth.error });
|
||||
} else {
|
||||
return;
|
||||
return res.status(status).json({ success: false, error });
|
||||
}
|
||||
}
|
||||
|
||||
const { team_id, plan, chunk } = auth;
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
req.acuc = chunk ?? undefined;
|
||||
if (chunk) {
|
||||
req.account = { remainingCredits: chunk.remaining_credits };
|
||||
}
|
||||
req.acuc = chunk;
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
import { WebCrawler } from '../crawler';
|
||||
import axios from 'axios';
|
||||
import robotsParser from 'robots-parser';
|
||||
import { getAdjustedMaxDepth } from '../utils/maxDepthUtils';
|
||||
|
||||
jest.mock('axios');
|
||||
jest.mock('robots-parser');
|
||||
|
@ -34,6 +35,165 @@ describe('WebCrawler', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => {
|
||||
const initialUrl = 'http://example.com'; // Set initial URL for this test
|
||||
const enteredMaxCrawledDepth = 2;
|
||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
||||
});
|
||||
|
||||
// Mock sitemap fetching function to return controlled links
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl, // depth 0
|
||||
initialUrl + '/page1', // depth 1
|
||||
initialUrl + '/page1/page2', // depth 2
|
||||
initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
|
||||
]);
|
||||
|
||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
{ url: initialUrl + '/page1', html: '' },
|
||||
{ url: initialUrl + '/page1/page2', html: '' }
|
||||
]);
|
||||
|
||||
|
||||
// Ensure that the link with depth 3 is not included
|
||||
expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false);
|
||||
});
|
||||
|
||||
it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => {
|
||||
const initialUrl = 'http://example.com'; // Set initial URL for this test
|
||||
const enteredMaxCrawledDepth = 0;
|
||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
||||
});
|
||||
|
||||
// Mock sitemap fetching function to return controlled links
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl, // depth 0
|
||||
initialUrl + '/page1', // depth 1
|
||||
initialUrl + '/page1/page2', // depth 2
|
||||
initialUrl + '/page1/page2/page3' // depth 3, should be filtered out
|
||||
]);
|
||||
|
||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
]);
|
||||
});
|
||||
|
||||
it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => {
|
||||
const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
|
||||
const enteredMaxCrawledDepth = 1;
|
||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
||||
});
|
||||
|
||||
// Mock sitemap fetching function to return controlled links
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl, // depth 0
|
||||
initialUrl + '/page2', // depth 1
|
||||
initialUrl + '/page2/page3', // depth 2
|
||||
initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
|
||||
]);
|
||||
|
||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
{ url: initialUrl + '/page2', html: '' }
|
||||
]);
|
||||
});
|
||||
|
||||
it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 2 ', async () => {
|
||||
const initialUrl = 'http://example.com/page1'; // Set initial URL for this test
|
||||
const enteredMaxCrawledDepth = 2;
|
||||
maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);
|
||||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing
|
||||
});
|
||||
|
||||
// Mock sitemap fetching function to return controlled links
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl, // depth 0
|
||||
initialUrl + '/page2', // depth 1
|
||||
initialUrl + '/page2/page3', // depth 2
|
||||
initialUrl + '/page2/page3/page4' // depth 3, should be filtered out
|
||||
]);
|
||||
|
||||
const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
{ url: initialUrl + '/page2', html: '' },
|
||||
{ url: initialUrl + '/page2/page3', html: '' }
|
||||
]);
|
||||
});
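// Sketch (not part of the diff): the four depth tests above imply that
// getAdjustedMaxDepth offsets the entered maxDepth by the depth of the entry URL,
// roughly getAdjustedMaxDepth(url, entered) === getURLDepth(url) + entered, e.g.
//   getURLDepth("http://example.com")                   // -> 0
//   getURLDepth("http://example.com/page1")             // -> 1
//   getAdjustedMaxDepth("http://example.com/page1", 2)  // -> 3
// filterLinks then keeps only links whose own path depth is <= the adjusted value,
// which is why '/page1/page2/page3/page4' (depth 4) is dropped in the last test.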
|
||||
|
||||
it('should handle allowBackwardCrawling option correctly', async () => {
|
||||
const initialUrl = 'https://mendable.ai/blog';
|
||||
|
||||
// Setup the crawler with the specific test case options
|
||||
const crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
limit: 100,
|
||||
maxCrawledDepth: 3, // Example depth
|
||||
allowBackwardCrawling: true
|
||||
});
|
||||
|
||||
// Mock the sitemap fetching function to simulate backward crawling
|
||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
||||
initialUrl,
|
||||
'https://mendable.ai', // backward link
|
||||
initialUrl + '/page1',
|
||||
initialUrl + '/page1/page2'
|
||||
]);
|
||||
|
||||
const results = await crawler.start();
|
||||
expect(results).toEqual([
|
||||
{ url: initialUrl, html: '' },
|
||||
{ url: 'https://mendable.ai', html: '' }, // Expect the backward link to be included
|
||||
{ url: initialUrl + '/page1', html: '' },
|
||||
{ url: initialUrl + '/page1/page2', html: '' }
|
||||
]);
|
||||
|
||||
// Check that the backward link is included if allowBackwardCrawling is true
|
||||
expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true);
|
||||
});
|
||||
|
||||
it('should respect the limit parameter by not returning more links than specified', async () => {
|
||||
const initialUrl = 'http://example.com';
|
||||
const limit = 2; // Set a limit for the number of links
|
||||
|
37  apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts  Normal file
@ -0,0 +1,37 @@
|
|||
import { scrapSingleUrl } from '../single_url';
|
||||
import { PageOptions } from '../../../lib/entities';
|
||||
|
||||
|
||||
jest.mock('../single_url', () => {
|
||||
const originalModule = jest.requireActual('../single_url');
|
||||
originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>');
|
||||
|
||||
return originalModule;
|
||||
});
|
||||
|
||||
describe('scrapSingleUrl', () => {
|
||||
it('should handle includeHtml option correctly', async () => {
|
||||
const url = 'https://roastmywebsite.ai';
|
||||
const pageOptionsWithHtml: PageOptions = { includeHtml: true };
|
||||
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
|
||||
|
||||
const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
|
||||
const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
|
||||
|
||||
expect(resultWithHtml.html).toBeDefined();
|
||||
expect(resultWithoutHtml.html).toBeUndefined();
|
||||
}, 10000);
|
||||
});
|
||||
|
||||
it('should return a list of links on the flutterbricks.com page', async () => {
|
||||
const url = 'https://flutterbricks.com';
|
||||
const pageOptions: PageOptions = { includeHtml: true };
|
||||
|
||||
const result = await scrapSingleUrl("TEST", url, pageOptions);
|
||||
|
||||
// Check if the result contains a list of links
|
||||
expect(result.linksOnPage).toBeDefined();
|
||||
expect(Array.isArray(result.linksOnPage)).toBe(true);
|
||||
expect(result.linksOnPage.length).toBeGreaterThan(0);
|
||||
expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
|
||||
}, 15000);
|
|
@ -2,10 +2,13 @@ import axios, { AxiosError } from "axios";
|
|||
import cheerio, { load } from "cheerio";
|
||||
import { URL } from "url";
|
||||
import { getLinksFromSitemap } from "./sitemap";
|
||||
import async from "async";
|
||||
import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
|
||||
import { scrapSingleUrl } from "./single_url";
|
||||
import robotsParser from "robots-parser";
|
||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||
import { logger } from "../../../src/lib/logger";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import https from "https";
|
||||
export class WebCrawler {
|
||||
private jobId: string;
|
||||
|
@ -27,7 +30,6 @@ export class WebCrawler {
|
|||
constructor({
|
||||
jobId,
|
||||
initialUrl,
|
||||
baseUrl,
|
||||
includes,
|
||||
excludes,
|
||||
maxCrawledLinks = 10000,
|
||||
|
@ -39,7 +41,6 @@ export class WebCrawler {
|
|||
}: {
|
||||
jobId: string;
|
||||
initialUrl: string;
|
||||
baseUrl?: string;
|
||||
includes?: string[];
|
||||
excludes?: string[];
|
||||
maxCrawledLinks?: number;
|
||||
|
@ -51,7 +52,7 @@ export class WebCrawler {
|
|||
}) {
|
||||
this.jobId = jobId;
|
||||
this.initialUrl = initialUrl;
|
||||
this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
|
||||
this.baseUrl = new URL(initialUrl).origin;
|
||||
this.includes = Array.isArray(includes) ? includes : [];
|
||||
this.excludes = Array.isArray(excludes) ? excludes : [];
|
||||
this.limit = limit;
|
||||
|
@ -65,19 +66,14 @@ export class WebCrawler {
|
|||
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
|
||||
}
|
||||
|
||||
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
|
||||
// If the initial URL is a sitemap.xml, skip filtering
|
||||
if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
|
||||
return sitemapLinks.slice(0, limit);
|
||||
}
|
||||
|
||||
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
return sitemapLinks
|
||||
.filter((link) => {
|
||||
let url: URL;
|
||||
try {
|
||||
url = new URL(link.trim(), this.baseUrl);
|
||||
} catch (error) {
|
||||
logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
|
||||
Logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
|
||||
return false;
|
||||
}
|
||||
const path = url.pathname;
|
||||
|
@ -136,7 +132,7 @@ export class WebCrawler {
|
|||
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
||||
// Check if the link is disallowed by robots.txt
|
||||
if (!isAllowed) {
|
||||
logger.debug(`Link disallowed by robots.txt: ${link}`);
|
||||
Logger.debug(`Link disallowed by robots.txt: ${link}`);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -164,24 +160,130 @@ export class WebCrawler {
|
|||
this.robots = robotsParser(this.robotsTxtUrl, txt);
|
||||
}
|
||||
|
||||
public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
|
||||
logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
|
||||
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
|
||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||
if(fromMap && onlySitemap) {
|
||||
return sitemapLinks.map(link => ({ url: link, html: "" }));
|
||||
}
|
||||
if (sitemapLinks.length > 0) {
|
||||
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
|
||||
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
|
||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public async start(
|
||||
inProgress?: (progress: Progress) => void,
|
||||
pageOptions?: PageOptions,
|
||||
crawlerOptions?: CrawlerOptions,
|
||||
concurrencyLimit: number = 5,
|
||||
limit: number = 10000,
|
||||
maxDepth: number = 10
|
||||
): Promise<{ url: string, html: string }[]> {
|
||||
|
||||
Logger.debug(`Crawler starting with ${this.initialUrl}`);
|
||||
// Fetch and parse robots.txt
|
||||
try {
|
||||
const txt = await this.getRobotsTxt();
|
||||
this.importRobotsTxt(txt);
|
||||
Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
|
||||
}
|
||||
|
||||
if (!crawlerOptions?.ignoreSitemap){
|
||||
const sm = await this.tryGetSitemap();
|
||||
if (sm !== null) {
|
||||
return sm;
|
||||
}
|
||||
}
|
||||
|
||||
const urls = await this.crawlUrls(
|
||||
[this.initialUrl],
|
||||
pageOptions,
|
||||
concurrencyLimit,
|
||||
inProgress
|
||||
);
|
||||
|
||||
if (
|
||||
urls.length === 0 &&
|
||||
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
|
||||
) {
|
||||
return [{ url: this.initialUrl, html: "" }];
|
||||
}
|
||||
|
||||
// make sure to run include exclude here again
|
||||
const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
|
||||
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
|
||||
}
|
||||
|
||||
private async crawlUrls(
|
||||
urls: string[],
|
||||
pageOptions: PageOptions,
|
||||
concurrencyLimit: number,
|
||||
inProgress?: (progress: Progress) => void,
|
||||
): Promise<{ url: string, html: string }[]> {
|
||||
const queue = async.queue(async (task: string, callback) => {
|
||||
Logger.debug(`Crawling ${task}`);
|
||||
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
|
||||
if (callback && typeof callback === "function") {
|
||||
callback();
|
||||
}
|
||||
return;
|
||||
}
|
||||
const newUrls = await this.crawl(task, pageOptions);
|
||||
// add the initial url if not already added
|
||||
// if (this.visited.size === 1) {
|
||||
// let normalizedInitial = this.initialUrl;
|
||||
// if (!normalizedInitial.endsWith("/")) {
|
||||
// normalizedInitial = normalizedInitial + "/";
|
||||
// }
|
||||
// if (!newUrls.some(page => page.url === this.initialUrl)) {
|
||||
// newUrls.push({ url: this.initialUrl, html: "" });
|
||||
// }
|
||||
// }
|
||||
|
||||
newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
|
||||
|
||||
if (inProgress && newUrls.length > 0) {
|
||||
inProgress({
|
||||
current: this.crawledUrls.size,
|
||||
total: Math.min(this.maxCrawledLinks, this.limit),
|
||||
status: "SCRAPING",
|
||||
currentDocumentUrl: newUrls[newUrls.length - 1].url,
|
||||
});
|
||||
} else if (inProgress) {
|
||||
inProgress({
|
||||
current: this.crawledUrls.size,
|
||||
total: Math.min(this.maxCrawledLinks, this.limit),
|
||||
status: "SCRAPING",
|
||||
currentDocumentUrl: task,
|
||||
});
|
||||
}
|
||||
await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
|
||||
if (callback && typeof callback === "function") {
|
||||
callback();
|
||||
}
|
||||
}, concurrencyLimit);
|
||||
|
||||
Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
|
||||
queue.push(
|
||||
urls.filter(
|
||||
(url) =>
|
||||
!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
|
||||
),
|
||||
(err) => {
|
||||
if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
|
||||
}
|
||||
);
|
||||
await queue.drain();
|
||||
Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
|
||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||
}
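// Sketch (not part of the diff): crawlUrls builds on async.queue — an async worker
// plus a concurrency limit, with queue.drain() awaited once every pushed task has
// settled. A minimal standalone form of the same pattern (async v3 API):
import async from "async";

async function crawlWithQueue(seed: string, concurrencyLimit = 5): Promise<void> {
  const queue = async.queue(async (task: string) => {
    // fetch `task` here; newly discovered URLs can be pushed back onto the queue
  }, concurrencyLimit);
  queue.push([seed]);
  await queue.drain(); // resolves when the queue is empty and all workers are idle
}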
|
||||
|
||||
public filterURL(href: string, url: string): string | null {
|
||||
let fullUrl = href;
|
||||
if (!href.startsWith("http")) {
|
||||
try {
|
||||
fullUrl = new URL(href, url).toString();
|
||||
fullUrl = new URL(href, this.baseUrl).toString();
|
||||
} catch (_) {
|
||||
return null;
|
||||
}
|
||||
|
@ -244,9 +346,79 @@ export class WebCrawler {
|
|||
return links;
|
||||
}
|
||||
|
||||
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
|
||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
||||
return [];
|
||||
}
|
||||
this.visited.add(url);
|
||||
|
||||
if (!url.startsWith("http")) {
|
||||
url = "https://" + url;
|
||||
}
|
||||
if (url.endsWith("/")) {
|
||||
url = url.slice(0, -1);
|
||||
}
|
||||
|
||||
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
let content: string = "";
|
||||
let pageStatusCode: number;
|
||||
let pageError: string | undefined = undefined;
|
||||
|
||||
// If it is the first link, fetch with single url
|
||||
if (this.visited.size === 1) {
|
||||
const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
|
||||
content = page.html ?? "";
|
||||
pageStatusCode = page.metadata?.pageStatusCode;
|
||||
pageError = page.metadata?.pageError || undefined;
|
||||
} else {
|
||||
const response = await axios.get(url, { timeout: axiosTimeout });
|
||||
content = response.data ?? "";
|
||||
pageStatusCode = response.status;
|
||||
pageError = response.statusText != "OK" ? response.statusText : undefined;
|
||||
}
|
||||
|
||||
const $ = load(content);
|
||||
let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
|
||||
|
||||
// Add the initial URL to the list of links
|
||||
if (this.visited.size === 1) {
|
||||
links.push({ url, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
|
||||
links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
|
||||
|
||||
if (this.visited.size === 1) {
|
||||
return links;
|
||||
}
|
||||
|
||||
// Create a new list to return to avoid modifying the visited list
|
||||
return links.filter((link) => !this.visited.has(link.url));
|
||||
} catch (error) {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
private isRobotsAllowed(url: string): boolean {
|
||||
return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
|
||||
}
|
||||
private normalizeCrawlUrl(url: string): string {
|
||||
try{
|
||||
const urlObj = new URL(url);
|
||||
urlObj.searchParams.sort(); // Sort query parameters to normalize
|
||||
return urlObj.toString();
|
||||
} catch (error) {
|
||||
return url;
|
||||
}
|
||||
}
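// Example (sketch) of the normalization above — sorting the query string lets two
// spellings of the same URL dedupe to one crawl target:
//   normalizeCrawlUrl("https://example.com/docs?b=2&a=1")
//   // -> "https://example.com/docs?a=1&b=2"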
|
||||
|
||||
private matchesIncludes(url: string): boolean {
|
||||
if (this.includes.length === 0 || this.includes[0] == "") return true;
|
||||
return this.includes.some((pattern) => new RegExp(pattern).test(url));
|
||||
}
|
||||
|
||||
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
|
||||
return this.excludes.some((pattern) => {
|
||||
|
@ -331,7 +503,7 @@ export class WebCrawler {
|
|||
const urlWithoutQuery = url.split('?')[0].toLowerCase();
|
||||
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
|
||||
} catch (error) {
|
||||
logger.error(`Error processing URL in isFile: ${error}`);
|
||||
Logger.error(`Error processing URL in isFile: ${error}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -352,6 +524,7 @@ export class WebCrawler {
|
|||
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
||||
}
|
||||
|
||||
//
|
||||
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
|
||||
const normalizeUrl = (url: string) => {
|
||||
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
|
||||
|
@ -361,7 +534,6 @@ export class WebCrawler {
|
|||
return url;
|
||||
};
|
||||
|
||||
|
||||
const sitemapUrl = url.endsWith("/sitemap.xml")
|
||||
? url
|
||||
: `${url}/sitemap.xml`;
|
||||
|
@ -374,7 +546,7 @@ export class WebCrawler {
|
|||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
|
||||
}
|
||||
} catch (error) {
|
||||
logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
||||
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||
// ignore 404
|
||||
} else {
|
||||
|
@ -393,7 +565,7 @@ export class WebCrawler {
|
|||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||
}
|
||||
} catch (error) {
|
||||
logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||
// ignore 404
|
||||
} else {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import { logger } from "../../../lib/logger";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
export async function handleCustomScraping(
|
||||
text: string,
|
||||
|
@ -6,7 +6,7 @@ export async function handleCustomScraping(
|
|||
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
||||
// Check for Readme Docs special case
|
||||
if (text.includes('<meta name="readme-deploy"') && !url.includes('developers.notion.com')) {
|
||||
logger.debug(
|
||||
Logger.debug(
|
||||
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
||||
);
|
||||
return {
|
||||
|
@ -21,7 +21,7 @@ export async function handleCustomScraping(
|
|||
|
||||
// Check for Vanta security portals
|
||||
if (text.includes('<link href="https://static.vanta.com')) {
|
||||
logger.debug(
|
||||
Logger.debug(
|
||||
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
|
||||
);
|
||||
return {
|
||||
|
@ -36,7 +36,7 @@ export async function handleCustomScraping(
|
|||
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
||||
if (googleDriveMetaMatch) {
|
||||
const url = googleDriveMetaMatch[1];
|
||||
logger.debug(`Google Drive PDF link detected: ${url}`);
|
||||
Logger.debug(`Google Drive PDF link detected: ${url}`);
|
||||
|
||||
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
|
||||
if (fileIdMatch) {
|
||||
|
|
1  apps/api/src/scraper/WebScraper/global.ts  Normal file
@ -0,0 +1 @@
export const universalTimeout = 15000;
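// universalTimeout is consumed by the new scrapers below (fetch.ts and fireEngine.ts)
// as the per-request axios deadline, e.g. axios.get(url, { timeout: universalTimeout }).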
|
743  apps/api/src/scraper/WebScraper/index.ts  Normal file
@ -0,0 +1,743 @@
import {
|
||||
Document,
|
||||
ExtractorOptions,
|
||||
PageOptions,
|
||||
WebScraperOptions,
|
||||
} from "../../lib/entities";
|
||||
import { Progress } from "../../lib/entities";
|
||||
import { scrapSingleUrl } from "./single_url";
|
||||
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
|
||||
import { WebCrawler } from "./crawler";
|
||||
import { getValue, setValue } from "../../services/redis";
|
||||
import { getImageDescription } from "./utils/imageDescription";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
import {
|
||||
replaceImgPathsWithAbsolutePaths,
|
||||
replacePathsWithAbsolutePaths,
|
||||
} from "./utils/replacePaths";
|
||||
import { generateCompletions } from "../../lib/LLM-extraction";
|
||||
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
||||
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||
|
||||
export class WebScraperDataProvider {
|
||||
private jobId: string;
|
||||
private bullJobId: string;
|
||||
private urls: string[] = [""];
|
||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||
private includes: string | string[];
|
||||
private excludes: string | string[];
|
||||
private maxCrawledLinks: number;
|
||||
private maxCrawledDepth: number = 10;
|
||||
private returnOnlyUrls: boolean;
|
||||
private limit: number = 10000;
|
||||
private concurrentRequests: number = 20;
|
||||
private generateImgAltText: boolean = false;
|
||||
private ignoreSitemap: boolean = false;
|
||||
private pageOptions?: PageOptions;
|
||||
private extractorOptions?: ExtractorOptions;
|
||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
||||
"gpt-4-turbo";
|
||||
private crawlerMode: string = "default";
|
||||
private allowBackwardCrawling: boolean = false;
|
||||
private allowExternalContentLinks: boolean = false;
|
||||
private priority?: number;
|
||||
private teamId?: string;
|
||||
|
||||
authorize(): void {
|
||||
throw new Error("Method not implemented.");
|
||||
}
|
||||
|
||||
authorizeNango(): Promise<void> {
|
||||
throw new Error("Method not implemented.");
|
||||
}
|
||||
|
||||
private async convertUrlsToDocuments(
|
||||
urls: string[],
|
||||
inProgress?: (progress: Progress) => void,
|
||||
allHtmls?: string[]
|
||||
): Promise<Document[]> {
|
||||
const totalUrls = urls.length;
|
||||
let processedUrls = 0;
|
||||
|
||||
const results: (Document | null)[] = new Array(urls.length).fill(null);
|
||||
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
|
||||
const batchUrls = urls.slice(i, i + this.concurrentRequests);
|
||||
await Promise.all(
|
||||
batchUrls.map(async (url, index) => {
|
||||
const existingHTML = allHtmls ? allHtmls[i + index] : "";
|
||||
const result = await scrapSingleUrl(
|
||||
this.jobId,
|
||||
url,
|
||||
this.pageOptions,
|
||||
this.extractorOptions,
|
||||
existingHTML,
|
||||
this.priority,
|
||||
this.teamId,
|
||||
);
|
||||
processedUrls++;
|
||||
if (inProgress) {
|
||||
inProgress({
|
||||
current: processedUrls,
|
||||
total: totalUrls,
|
||||
status: "SCRAPING",
|
||||
currentDocumentUrl: url,
|
||||
currentDocument: { ...result, index: processedUrls },
|
||||
});
|
||||
}
|
||||
|
||||
results[i + index] = result;
|
||||
})
|
||||
);
|
||||
}
|
||||
return results.filter((result) => result !== null) as Document[];
|
||||
}
|
||||
|
||||
async getDocuments(
|
||||
useCaching: boolean = false,
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
this.validateInitialUrl();
|
||||
if (!useCaching) {
|
||||
return this.processDocumentsWithoutCache(inProgress);
|
||||
}
|
||||
|
||||
return this.processDocumentsWithCache(inProgress);
|
||||
}
|
||||
|
||||
private validateInitialUrl(): void {
|
||||
if (this.urls[0].trim() === "") {
|
||||
throw new Error("Url is required");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process documents without cache handling each mode
|
||||
* @param inProgress inProgress
|
||||
* @returns documents
|
||||
*/
|
||||
private async processDocumentsWithoutCache(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
switch (this.mode) {
|
||||
case "crawl":
|
||||
return this.handleCrawlMode(inProgress);
|
||||
case "single_urls":
|
||||
return this.handleSingleUrlsMode(inProgress);
|
||||
case "sitemap":
|
||||
return this.handleSitemapMode(inProgress);
|
||||
default:
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
private async cleanIrrelevantPath(links: string[]) {
|
||||
return links.filter((link) => {
|
||||
const normalizedInitialUrl = new URL(this.urls[0]);
|
||||
const normalizedLink = new URL(link);
|
||||
|
||||
// Normalize the hostname to account for www and non-www versions
|
||||
const initialHostname = normalizedInitialUrl.hostname.replace(
|
||||
/^www\./,
|
||||
""
|
||||
);
|
||||
const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
|
||||
|
||||
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
|
||||
return (
|
||||
linkHostname === initialHostname &&
|
||||
normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
|
||||
);
|
||||
});
|
||||
}
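// Example (sketch) of the filter above with this.urls[0] = "https://example.com/blog":
//   kept:    "https://www.example.com/blog/post-1"  (www stripped, same path prefix)
//   dropped: "https://example.com/pricing"          (path outside /blog)
//   dropped: "https://other.com/blog"               (different hostname)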
|
||||
|
||||
private async handleCrawlMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let includes: string[];
|
||||
if (Array.isArray(this.includes)) {
|
||||
if (this.includes[0] != "") {
|
||||
includes = this.includes;
|
||||
}
|
||||
} else {
|
||||
includes = this.includes.split(',');
|
||||
}
|
||||
|
||||
let excludes: string[];
|
||||
if (Array.isArray(this.excludes)) {
|
||||
if (this.excludes[0] != "") {
|
||||
excludes = this.excludes;
|
||||
}
|
||||
} else {
|
||||
excludes = this.excludes.split(',');
|
||||
}
|
||||
|
||||
const crawler = new WebCrawler({
|
||||
jobId: this.jobId,
|
||||
initialUrl: this.urls[0],
|
||||
includes,
|
||||
excludes,
|
||||
maxCrawledLinks: this.maxCrawledLinks,
|
||||
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
|
||||
limit: this.limit,
|
||||
generateImgAltText: this.generateImgAltText,
|
||||
allowBackwardCrawling: this.allowBackwardCrawling,
|
||||
allowExternalContentLinks: this.allowExternalContentLinks,
|
||||
});
|
||||
|
||||
let links = await crawler.start(
|
||||
inProgress,
|
||||
this.pageOptions,
|
||||
{
|
||||
ignoreSitemap: this.ignoreSitemap,
|
||||
},
|
||||
5,
|
||||
this.limit,
|
||||
this.maxCrawledDepth
|
||||
);
|
||||
|
||||
let allLinks = links.map((e) => e.url);
|
||||
const allHtmls = links.map((e) => e.html);
|
||||
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(allLinks, inProgress);
|
||||
}
|
||||
|
||||
let documents = [];
|
||||
// check if fast mode is enabled and there is html inside the links
|
||||
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
|
||||
documents = await this.processLinks(allLinks, inProgress, allHtmls);
|
||||
} else {
|
||||
documents = await this.processLinks(allLinks, inProgress);
|
||||
}
|
||||
|
||||
return this.cacheAndFinalizeDocuments(documents, allLinks);
|
||||
}
|
||||
|
||||
private async handleSingleUrlsMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
const links = this.urls;
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return documents;
|
||||
}
|
||||
|
||||
private async handleSitemapMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
|
||||
links = await this.cleanIrrelevantPath(links);
|
||||
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
||||
private async returnOnlyUrlsResponse(
|
||||
links: string[],
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
inProgress?.({
|
||||
current: links.length,
|
||||
total: links.length,
|
||||
status: "COMPLETED",
|
||||
currentDocumentUrl: this.urls[0],
|
||||
});
|
||||
return links.map((url) => ({
|
||||
content: "",
|
||||
html: this.pageOptions?.includeHtml ? "" : undefined,
|
||||
markdown: "",
|
||||
metadata: { sourceURL: url, pageStatusCode: 200 },
|
||||
}));
|
||||
}
|
||||
|
||||
private async processLinks(
|
||||
links: string[],
|
||||
inProgress?: (progress: Progress) => void,
|
||||
allHtmls?: string[]
|
||||
): Promise<Document[]> {
|
||||
const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
const docLinks = links.filter(
|
||||
(link) => link.endsWith(".doc") || link.endsWith(".docx")
|
||||
);
|
||||
|
||||
const [pdfDocuments, docxDocuments] = await Promise.all([
|
||||
this.fetchPdfDocuments(pdfLinks),
|
||||
this.fetchDocxDocuments(docLinks),
|
||||
]);
|
||||
|
||||
links = links.filter(
|
||||
(link) => !pdfLinks.includes(link) && !docLinks.includes(link)
|
||||
);
|
||||
|
||||
let [documents, sitemapData] = await Promise.all([
|
||||
this.convertUrlsToDocuments(links, inProgress, allHtmls),
|
||||
this.mode === "single_urls" && links.length > 0
|
||||
? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
|
||||
(error) => {
|
||||
Logger.debug(`Failed to fetch sitemap data: ${error}`);
|
||||
return null;
|
||||
}
|
||||
)
|
||||
: Promise.resolve(null),
|
||||
]);
|
||||
|
||||
if (this.mode === "single_urls" && documents.length > 0) {
|
||||
documents[0].metadata.sitemap = sitemapData ?? undefined;
|
||||
} else {
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
}
|
||||
|
||||
if (this.pageOptions.includeMarkdown) {
|
||||
documents = this.applyPathReplacements(documents);
|
||||
}
|
||||
|
||||
if (!this.pageOptions.includeHtml) {
|
||||
for (let document of documents) {
|
||||
delete document.html;
|
||||
}
|
||||
}
|
||||
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
|
||||
const extractionMode = this.extractorOptions?.mode ?? "markdown";
|
||||
const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
|
||||
|
||||
if (
|
||||
extractionMode === "llm-extraction" ||
|
||||
extractionMode === "llm-extraction-from-markdown" ||
|
||||
extractionMode === "llm-extraction-from-raw-html"
|
||||
) {
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions,
|
||||
completionMode
|
||||
);
|
||||
}
|
||||
}
|
||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
||||
}
|
||||
|
||||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
pdfLinks.map(async (pdfLink) => {
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
|
||||
type: "scrape",
|
||||
url: pdfLink,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method: "pdf-scrape",
|
||||
result: null,
|
||||
});
|
||||
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
|
||||
pdfLink,
|
||||
this.pageOptions.parsePDF
|
||||
);
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: content.length,
|
||||
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
|
||||
error: pageError,
|
||||
response_code: pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
return {
|
||||
content: content,
|
||||
markdown: content,
|
||||
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
})
|
||||
);
|
||||
}
|
||||
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
docxLinks.map(async (docxLink) => {
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
|
||||
type: "scrape",
|
||||
url: docxLink,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method: "docx-scrape",
|
||||
result: null,
|
||||
});
|
||||
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
|
||||
docxLink
|
||||
);
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: content.length,
|
||||
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
|
||||
error: pageError,
|
||||
response_code: pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
|
||||
return {
|
||||
content,
|
||||
metadata: { sourceURL: docxLink, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
private applyPathReplacements(documents: Document[]): Document[] {
|
||||
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||
documents = replacePathsWithAbsolutePaths(documents);
|
||||
}
|
||||
return replaceImgPathsWithAbsolutePaths(documents);
|
||||
}
|
||||
|
||||
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
||||
return this.generateImgAltText
|
||||
? this.generatesImgAltText(documents)
|
||||
: documents;
|
||||
}
|
||||
|
||||
private async cacheAndFinalizeDocuments(
|
||||
documents: Document[],
|
||||
links: string[]
|
||||
): Promise<Document[]> {
|
||||
// await this.setCachedDocuments(documents, links);
|
||||
documents = this.removeChildLinks(documents);
|
||||
return documents.splice(0, this.limit);
|
||||
}
|
||||
|
||||
private async processDocumentsWithCache(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let documents = await this.getCachedDocuments(
|
||||
this.urls.slice(0, this.limit)
|
||||
);
|
||||
if (documents.length < this.limit) {
|
||||
const newDocuments: Document[] = await this.getDocuments(
|
||||
false,
|
||||
inProgress
|
||||
);
|
||||
documents = this.mergeNewDocuments(documents, newDocuments);
|
||||
}
|
||||
documents = this.filterDocsExcludeInclude(documents);
|
||||
documents = this.filterDepth(documents);
|
||||
documents = this.removeChildLinks(documents);
|
||||
return documents.splice(0, this.limit);
|
||||
}
|
||||
|
||||
private mergeNewDocuments(
|
||||
existingDocuments: Document[],
|
||||
newDocuments: Document[]
|
||||
): Document[] {
|
||||
newDocuments.forEach((doc) => {
|
||||
if (
|
||||
!existingDocuments.some(
|
||||
(d) =>
|
||||
this.normalizeUrl(d.metadata.sourceURL) ===
|
||||
this.normalizeUrl(doc.metadata?.sourceURL)
|
||||
)
|
||||
) {
|
||||
existingDocuments.push(doc);
|
||||
}
|
||||
});
|
||||
return existingDocuments;
|
||||
}
|
||||
|
||||
private filterDocsExcludeInclude(documents: Document[]): Document[] {
|
||||
return documents.filter((document) => {
|
||||
const url = new URL(document.metadata.sourceURL);
|
||||
const path = url.pathname;
|
||||
|
||||
if (!Array.isArray(this.excludes)) {
|
||||
this.excludes = this.excludes.split(',');
|
||||
}
|
||||
|
||||
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
||||
// Check if the link should be excluded
|
||||
if (
|
||||
this.excludes.some((excludePattern) =>
|
||||
new RegExp(excludePattern).test(path)
|
||||
)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!Array.isArray(this.includes)) {
|
||||
this.includes = this.includes.split(',');
|
||||
}
|
||||
|
||||
if (this.includes.length > 0 && this.includes[0] !== "") {
|
||||
// Check if the link matches the include patterns, if any are specified
|
||||
if (this.includes.length > 0) {
|
||||
return this.includes.some((includePattern) =>
|
||||
new RegExp(includePattern).test(path)
|
||||
);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
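// Example (sketch): with includes = ["^/docs/.*"] and excludes = ["^/docs/private.*"],
// a document whose sourceURL path is "/docs/api" is kept, "/docs/private/keys" is
// dropped by the exclude check, and "/blog/post" fails the include check.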
|
||||
|
||||
private normalizeUrl(url: string): string {
|
||||
if (url.includes("//www.")) {
|
||||
return url.replace("//www.", "//");
|
||||
}
|
||||
return url;
|
||||
}
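// Example (sketch): normalizeUrl("https://www.example.com/docs") -> "https://example.com/docs",
// so www and non-www variants of a page collapse to the same cache key and sitemap match.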
|
||||
|
||||
private removeChildLinks(documents: Document[]): Document[] {
|
||||
for (let document of documents) {
|
||||
if (document?.childrenLinks) delete document.childrenLinks;
|
||||
}
|
||||
return documents;
|
||||
}
|
||||
|
||||
async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {
|
||||
for (const document of documents) {
|
||||
if (document.content.trim().length === 0) {
|
||||
continue;
|
||||
}
|
||||
const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
|
||||
await setValue(
|
||||
"web-scraper-cache:" + normalizedUrl,
|
||||
JSON.stringify({
|
||||
...document,
|
||||
childrenLinks: childrenLinks || [],
|
||||
}),
|
||||
60 * 60
|
||||
); // 1 hour
|
||||
}
|
||||
}
|
||||
|
||||
async getCachedDocuments(urls: string[]): Promise<Document[]> {
|
||||
let documents: Document[] = [];
|
||||
for (const url of urls) {
|
||||
const normalizedUrl = this.normalizeUrl(url);
|
||||
Logger.debug(
|
||||
"Getting cached document for web-scraper-cache:" + normalizedUrl
|
||||
);
|
||||
const cachedDocumentString = await getValue(
|
||||
"web-scraper-cache:" + normalizedUrl
|
||||
);
|
||||
if (cachedDocumentString) {
|
||||
const cachedDocument = JSON.parse(cachedDocumentString);
|
||||
documents.push(cachedDocument);
|
||||
|
||||
// get children documents
|
||||
for (const childUrl of cachedDocument.childrenLinks || []) {
|
||||
const normalizedChildUrl = this.normalizeUrl(childUrl);
|
||||
const childCachedDocumentString = await getValue(
|
||||
"web-scraper-cache:" + normalizedChildUrl
|
||||
);
|
||||
if (childCachedDocumentString) {
|
||||
const childCachedDocument = JSON.parse(childCachedDocumentString);
|
||||
if (
|
||||
!documents.find(
|
||||
(doc) =>
|
||||
doc.metadata.sourceURL ===
|
||||
childCachedDocument.metadata.sourceURL
|
||||
)
|
||||
) {
|
||||
documents.push(childCachedDocument);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return documents;
|
||||
}
|
||||
|
||||
setOptions(options: WebScraperOptions): void {
|
||||
if (!options.urls) {
|
||||
throw new Error("Urls are required");
|
||||
}
|
||||
|
||||
this.jobId = options.jobId;
|
||||
this.bullJobId = options.bullJobId;
|
||||
this.urls = options.urls;
|
||||
this.mode = options.mode;
|
||||
this.concurrentRequests = options.concurrentRequests ?? 20;
|
||||
this.includes = options.crawlerOptions?.includes ?? [];
|
||||
this.excludes = options.crawlerOptions?.excludes ?? [];
|
||||
this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
|
||||
this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
|
||||
this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
|
||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||
this.generateImgAltText =
|
||||
options.crawlerOptions?.generateImgAltText ?? false;
|
||||
this.pageOptions = {
|
||||
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
|
||||
includeHtml: options.pageOptions?.includeHtml ?? false,
|
||||
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
parsePDF: options.pageOptions?.parsePDF ?? true,
|
||||
onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
|
||||
removeTags: options.pageOptions?.removeTags ?? [],
|
||||
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
|
||||
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
|
||||
includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
|
||||
waitFor: options.pageOptions?.waitFor ?? undefined,
|
||||
headers: options.pageOptions?.headers ?? undefined,
|
||||
includeLinks: options.pageOptions?.includeLinks ?? true,
|
||||
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
|
||||
screenshot: options.pageOptions?.screenshot ?? false,
|
||||
useFastMode: options.pageOptions?.useFastMode ?? false,
|
||||
disableJsDom: options.pageOptions?.disableJsDom ?? false,
|
||||
atsv: options.pageOptions?.atsv ?? false,
|
||||
actions: options.pageOptions?.actions ?? undefined,
|
||||
geolocation: options.pageOptions?.geolocation ?? undefined,
|
||||
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
|
||||
removeBase64Images: options.pageOptions?.removeBase64Images ?? true,
|
||||
mobile: options.pageOptions?.mobile ?? false,
|
||||
};
|
||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||
this.replaceAllPathsWithAbsolutePaths =
|
||||
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||
options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||
false;
|
||||
|
||||
if (typeof options.crawlerOptions?.excludes === 'string') {
|
||||
this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
|
||||
}
|
||||
|
||||
if (typeof options.crawlerOptions?.includes === 'string') {
|
||||
this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
|
||||
}
|
||||
|
||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
|
||||
this.allowBackwardCrawling =
|
||||
options.crawlerOptions?.allowBackwardCrawling ?? false;
|
||||
this.allowExternalContentLinks =
|
||||
options.crawlerOptions?.allowExternalContentLinks ?? false;
|
||||
this.priority = options.priority;
|
||||
this.teamId = options.teamId ?? null;
|
||||
|
||||
|
||||
|
||||
// make sure all urls start with https://
|
||||
this.urls = this.urls.map((url) => {
|
||||
if (!url.trim().startsWith("http")) {
|
||||
return `https://${url}`;
|
||||
}
|
||||
return url;
|
||||
});
|
||||
}
|
||||
|
||||
private async getSitemapData(baseUrl: string, documents: Document[]) {
|
||||
const sitemapData = await fetchSitemapData(baseUrl);
|
||||
if (sitemapData) {
|
||||
for (let i = 0; i < documents.length; i++) {
|
||||
const docInSitemapData = sitemapData.find(
|
||||
(data) =>
|
||||
this.normalizeUrl(data.loc) ===
|
||||
this.normalizeUrl(documents[i].metadata.sourceURL)
|
||||
);
|
||||
if (docInSitemapData) {
|
||||
let sitemapDocData: Partial<SitemapEntry> = {};
|
||||
if (docInSitemapData.changefreq) {
|
||||
sitemapDocData.changefreq = docInSitemapData.changefreq;
|
||||
}
|
||||
if (docInSitemapData.priority) {
|
||||
sitemapDocData.priority = Number(docInSitemapData.priority);
|
||||
}
|
||||
if (docInSitemapData.lastmod) {
|
||||
sitemapDocData.lastmod = docInSitemapData.lastmod;
|
||||
}
|
||||
if (Object.keys(sitemapDocData).length !== 0) {
|
||||
documents[i].metadata.sitemap = sitemapDocData;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return documents;
|
||||
}
|
||||
private async getSitemapDataForSingleUrl(
|
||||
baseUrl: string,
|
||||
url: string,
|
||||
timeout?: number
|
||||
) {
|
||||
const sitemapData = await fetchSitemapData(baseUrl, timeout);
|
||||
if (sitemapData) {
|
||||
const docInSitemapData = sitemapData.find(
|
||||
(data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
|
||||
);
|
||||
if (docInSitemapData) {
|
||||
let sitemapDocData: Partial<SitemapEntry> = {};
|
||||
if (docInSitemapData.changefreq) {
|
||||
sitemapDocData.changefreq = docInSitemapData.changefreq;
|
||||
}
|
||||
if (docInSitemapData.priority) {
|
||||
sitemapDocData.priority = Number(docInSitemapData.priority);
|
||||
}
|
||||
if (docInSitemapData.lastmod) {
|
||||
sitemapDocData.lastmod = docInSitemapData.lastmod;
|
||||
}
|
||||
if (Object.keys(sitemapDocData).length !== 0) {
|
||||
return sitemapDocData;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
|
||||
await Promise.all(
|
||||
documents.map(async (document) => {
|
||||
const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
|
||||
|
||||
await Promise.all(
|
||||
images.map(async (image: string) => {
|
||||
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
||||
let altText = image.match(/\[(.*?)\]/)[1];
|
||||
|
||||
if (
|
||||
!altText &&
|
||||
!imageUrl.startsWith("data:image") &&
|
||||
/\.(png|jpeg|gif|webp)$/.test(imageUrl)
|
||||
) {
|
||||
const imageIndex = document.content.indexOf(image);
|
||||
const contentLength = document.content.length;
|
||||
let backText = document.content.substring(
|
||||
imageIndex + image.length,
|
||||
Math.min(imageIndex + image.length + 1000, contentLength)
|
||||
);
|
||||
let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
|
||||
let frontText = document.content.substring(
|
||||
frontTextStartIndex,
|
||||
imageIndex
|
||||
);
|
||||
altText = await getImageDescription(
|
||||
imageUrl,
|
||||
backText,
|
||||
frontText,
|
||||
this.generateImgAltTextModel
|
||||
);
|
||||
}
|
||||
|
||||
document.content = document.content.replace(
|
||||
image,
|
||||
`![${altText}](${imageUrl})`
|
||||
);
|
||||
})
|
||||
);
|
||||
})
|
||||
);
|
||||
|
||||
return documents;
|
||||
};
|
||||
|
||||
filterDepth(documents: Document[]): Document[] {
|
||||
return documents.filter((document) => {
|
||||
const url = new URL(document.metadata.sourceURL);
|
||||
return getURLDepth(url.toString()) <= this.maxCrawledDepth;
|
||||
});
|
||||
}
|
||||
}
|
89  apps/api/src/scraper/WebScraper/scrapers/fetch.ts  Normal file
@ -0,0 +1,89 @@
import axios from "axios";
|
||||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
|
||||
* Scrapes a URL with Axios
|
||||
* @param url The URL to scrape
|
||||
* @param pageOptions The options for the page
|
||||
* @returns The scraped content
|
||||
*/
|
||||
export async function scrapWithFetch(
|
||||
url: string,
|
||||
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
|
||||
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
|
||||
const logParams = {
|
||||
url,
|
||||
scraper: "fetch",
|
||||
success: false,
|
||||
response_code: null,
|
||||
time_taken_seconds: null,
|
||||
error_message: null,
|
||||
html: "",
|
||||
startTime: Date.now(),
|
||||
};
|
||||
|
||||
try {
|
||||
const response = await axios.get(url, {
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
timeout: universalTimeout,
|
||||
transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
|
||||
});
|
||||
|
||||
if (response.status !== 200) {
|
||||
Logger.debug(
|
||||
`⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`
|
||||
);
|
||||
logParams.error_message = response.statusText;
|
||||
logParams.response_code = response.status;
|
||||
return {
|
||||
content: "",
|
||||
pageStatusCode: response.status,
|
||||
pageError: response.statusText,
|
||||
};
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
logParams.success = true;
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
|
||||
url,
|
||||
pageOptions?.parsePDF
|
||||
);
|
||||
logParams.response_code = pageStatusCode;
|
||||
logParams.error_message = pageError;
|
||||
return { content, pageStatusCode: response.status, pageError };
|
||||
} else {
|
||||
const text = response.data;
|
||||
logParams.success = true;
|
||||
logParams.html = text;
|
||||
logParams.response_code = response.status;
|
||||
return {
|
||||
content: text,
|
||||
pageStatusCode: response.status,
|
||||
pageError: null,
|
||||
};
|
||||
}
|
||||
} catch (error) {
|
||||
if (error.code === "ECONNABORTED") {
|
||||
logParams.error_message = "Request timed out";
|
||||
Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
|
||||
} else {
|
||||
logParams.error_message = error.message || error;
|
||||
Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
|
||||
}
|
||||
return {
|
||||
content: "",
|
||||
pageStatusCode: error.response?.status ?? null,
|
||||
pageError: logParams.error_message,
|
||||
};
|
||||
} finally {
|
||||
const endTime = Date.now();
|
||||
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||
await logScrape(logParams);
|
||||
}
|
||||
}
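// Usage sketch (assumption — mirrors the signature above):
//   const { content, pageStatusCode, pageError } = await scrapWithFetch("https://example.com");
//   // `content` is the raw response body for HTML pages, or extracted text when the
//   // response is a PDF and parsePDF is left at its default of true.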
|
230  apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts  Normal file
@ -0,0 +1,230 @@
import axios from "axios";
|
||||
import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
||||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { generateRequestParams } from "../single_url";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import axiosRetry from 'axios-retry';
|
||||
|
||||
axiosRetry(axios, { retries: 3 , onRetry:()=>{
|
||||
console.log("Retrying (fire-engine)...");
|
||||
}, retryDelay: axiosRetry.exponentialDelay});
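// axiosRetry.exponentialDelay backs off roughly as 2^attempt * 100 ms plus jitter,
// so the three retries configured above are spaced out instead of fired back-to-back.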
|
||||
/**
|
||||
* Scrapes a URL with Fire-Engine
|
||||
* @param url The URL to scrape
|
||||
* @param waitFor The time to wait for the page to load
|
||||
* @param screenshot Whether to take a screenshot
|
||||
* @param fullPageScreenshot Whether to take a full page screenshot
|
||||
* @param pageOptions The options for the page
|
||||
* @param headers The headers to send with the request
|
||||
* @param options The options for the request
|
||||
* @returns The scraped content
|
||||
*/
|
||||
export async function scrapWithFireEngine({
|
||||
url,
|
||||
actions,
|
||||
waitFor = 0,
|
||||
screenshot = false,
|
||||
fullPageScreenshot = false,
|
||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, removeBase64Images: true, mobile: false },
|
||||
fireEngineOptions = {},
|
||||
headers,
|
||||
options,
|
||||
priority,
|
||||
teamId,
|
||||
}: {
|
||||
url: string;
|
||||
actions?: Action[];
|
||||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, removeBase64Images?: boolean, mobile?: boolean };
|
||||
fireEngineOptions?: FireEngineOptions;
|
||||
headers?: Record<string, string>;
|
||||
options?: any;
|
||||
priority?: number;
|
||||
teamId?: string;
|
||||
}): Promise<FireEngineResponse> {
|
||||
const logParams = {
|
||||
url,
|
||||
scraper: "fire-engine",
|
||||
success: false,
|
||||
response_code: null,
|
||||
time_taken_seconds: null,
|
||||
error_message: null,
|
||||
html: "",
|
||||
startTime: Date.now(),
|
||||
};
|
||||
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
let waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
|
||||
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||
|
||||
|
||||
let endpoint = "/scrape";
|
||||
|
||||
if(options?.endpoint === "request") {
|
||||
endpoint = "/request";
|
||||
}
|
||||
|
||||
let engine = engineParam; // do we want fireEngineOptions as first choice?
|
||||
|
||||
if (pageOptions?.useFastMode) {
|
||||
fireEngineOptionsParam.engine = "tlsclient";
|
||||
engine = "tlsclient";
|
||||
}
|
||||
|
||||
Logger.info(
|
||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||
);
|
||||
|
||||
// atsv is only available for beta customers
|
||||
const betaCustomersString = process.env.BETA_CUSTOMERS;
|
||||
const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
|
||||
|
||||
if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
|
||||
fireEngineOptionsParam.atsv = true;
|
||||
} else {
|
||||
pageOptions.atsv = false;
|
||||
}
|
||||
|
||||
const axiosInstance = axios.create({
|
||||
headers: { "Content-Type": "application/json" }
|
||||
});
|
||||
|
||||
const startTime = Date.now();
|
||||
const _response = await Sentry.startSpan({
|
||||
name: "Call to fire-engine"
|
||||
}, async span => {
|
||||
|
||||
return await axiosInstance.post(
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
{
|
||||
url: url,
|
||||
headers: headers,
|
||||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
disableJsDom: pageOptions?.disableJsDom ?? false,
|
||||
priority,
|
||||
engine,
|
||||
instantReturn: true,
|
||||
mobile: pageOptions?.mobile ?? false,
|
||||
...fireEngineOptionsParam,
|
||||
atsv: pageOptions?.atsv ?? false,
|
||||
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
||||
geolocation: pageOptions?.geolocation,
|
||||
skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
|
||||
removeBase64Images: pageOptions?.removeBase64Images ?? true,
|
||||
actions: actions,
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
...(Sentry.isInitialized() ? ({
|
||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||
"baggage": Sentry.spanToBaggageHeader(span),
|
||||
}) : {}),
|
||||
}
|
||||
}
|
||||
);
|
||||
});
|
||||
|
||||
const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);
|
||||
|
||||
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
|
||||
// added 5 seconds to the timeout to account for 'smart wait'
|
||||
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal + 5000) {
|
||||
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
|
||||
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||
}
|
||||
|
||||
if (checkStatusResponse.data.processing) {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
|
||||
axiosInstance.delete(
|
||||
process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`, {
|
||||
validateStatus: (status) => true
|
||||
}
|
||||
).catch((error) => {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
|
||||
});
|
||||
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
|
||||
logParams.error_message = "Request timed out";
|
||||
return { html: "", pageStatusCode: null, pageError: "" };
|
||||
}
|
||||
|
||||
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
|
||||
Logger.debug(
|
||||
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}`
|
||||
);
|
||||
|
||||
logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
|
||||
logParams.response_code = checkStatusResponse.data?.pageStatusCode;
|
||||
|
||||
if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
|
||||
}
|
||||
|
||||
const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;
|
||||
|
||||
return {
|
||||
html: "",
|
||||
pageStatusCode,
|
||||
pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
|
||||
};
|
||||
}
|
||||
|
||||
const contentType = checkStatusResponse.data.responseHeaders?.["content-type"];
|
||||
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
|
||||
url,
|
||||
pageOptions?.parsePDF
|
||||
);
|
||||
logParams.success = true;
|
||||
logParams.response_code = pageStatusCode;
|
||||
logParams.error_message = pageError;
|
||||
return { html: content, pageStatusCode, pageError };
|
||||
} else {
|
||||
const data = checkStatusResponse.data;
|
||||
|
||||
logParams.success =
|
||||
(data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
|
||||
data.pageStatusCode === 404;
|
||||
logParams.html = data.content ?? "";
|
||||
logParams.response_code = data.pageStatusCode;
|
||||
logParams.error_message = data.pageError ?? data.error;
|
||||
|
||||
return {
|
||||
html: data.content ?? "",
|
||||
screenshots: data.screenshots ?? (data.screenshot ? [data.screenshot] : []), // an array literal is never nullish, so guard the single-screenshot case explicitly
|
||||
pageStatusCode: data.pageStatusCode,
|
||||
pageError: data.pageError ?? data.error,
|
||||
scrapeActionContent: data?.actionContent ?? [],
|
||||
};
|
||||
}
|
||||
} catch (error) {
|
||||
if (error.code === "ECONNABORTED") {
|
||||
Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`);
|
||||
logParams.error_message = "Request timed out";
|
||||
} else {
|
||||
Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`);
|
||||
logParams.error_message = error.message || error;
|
||||
}
|
||||
return { html: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||
} finally {
|
||||
const endTime = Date.now();
|
||||
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||
await logScrape(logParams, pageOptions);
|
||||
}
|
||||
}
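The block above submits the job with instantReturn and then polls the status endpoint every 250 ms until the job stops processing or the deadline (universalTimeout plus the accumulated wait-action time plus a 5-second grace period) passes, cancelling the job on timeout. A minimal sketch of that polling pattern, assuming the same POST /scrape returning { jobId } and GET /scrape/:jobId returning { processing, ... } contract; pollJobUntilDone is a hypothetical helper and is not part of this diff:

// Illustrative sketch only: the poll-until-done pattern used by scrapWithFireEngine above.
import axios from "axios";

async function pollJobUntilDone<T>(
  baseUrl: string,
  jobId: string,
  timeoutMs: number,
  intervalMs: number = 250
): Promise<T | null> {
  const startTime = Date.now();
  let status = await axios.get(`${baseUrl}/scrape/${jobId}`);
  while (status.data.processing && Date.now() - startTime < timeoutMs) {
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
    status = await axios.get(`${baseUrl}/scrape/${jobId}`);
  }
  if (status.data.processing) {
    // Still running after the deadline: best-effort cancel, mirroring the delete call above.
    await axios
      .delete(`${baseUrl}/scrape/${jobId}`, { validateStatus: () => true })
      .catch(() => undefined);
    return null;
  }
  return status.data as T;
}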
111 apps/api/src/scraper/WebScraper/scrapers/playwright.ts Normal file
@@ -0,0 +1,111 @@
import axios from "axios";
|
||||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { generateRequestParams } from "../single_url";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
|
||||
* Scrapes a URL with Playwright
|
||||
* @param url The URL to scrape
|
||||
* @param waitFor The time to wait for the page to load
|
||||
* @param headers The headers to send with the request
|
||||
* @param pageOptions The options for the page
|
||||
* @returns The scraped content
|
||||
*/
|
||||
export async function scrapWithPlaywright(
|
||||
url: string,
|
||||
waitFor: number = 0,
|
||||
headers?: Record<string, string>,
|
||||
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
|
||||
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
|
||||
const logParams = {
|
||||
url,
|
||||
scraper: "playwright",
|
||||
success: false,
|
||||
response_code: null,
|
||||
time_taken_seconds: null,
|
||||
error_message: null,
|
||||
html: "",
|
||||
startTime: Date.now(),
|
||||
};
|
||||
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
// If the user has passed a wait parameter in the request, use that
|
||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
|
||||
const response = await axios.post(
|
||||
process.env.PLAYWRIGHT_MICROSERVICE_URL,
|
||||
{
|
||||
url: url,
|
||||
wait_after_load: waitParam,
|
||||
timeout: universalTimeout + waitParam,
|
||||
headers: headers,
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
|
||||
transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
|
||||
}
|
||||
);
|
||||
|
||||
if (response.status !== 200) {
|
||||
Logger.debug(
|
||||
`⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
|
||||
);
|
||||
logParams.error_message = response.data?.pageError;
|
||||
logParams.response_code = response.data?.pageStatusCode;
|
||||
return {
|
||||
content: "",
|
||||
pageStatusCode: response.data?.pageStatusCode,
|
||||
pageError: response.data?.pageError,
|
||||
};
|
||||
}
|
||||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
logParams.success = true;
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||
logParams.response_code = pageStatusCode;
|
||||
logParams.error_message = pageError;
|
||||
return { content, pageStatusCode, pageError };
|
||||
} else {
|
||||
const textData = response.data;
|
||||
try {
|
||||
const data = JSON.parse(textData);
|
||||
const html = data.content;
|
||||
logParams.success = true;
|
||||
logParams.html = html;
|
||||
logParams.response_code = data.pageStatusCode;
|
||||
logParams.error_message = data.pageError;
|
||||
return {
|
||||
content: html ?? "",
|
||||
pageStatusCode: data.pageStatusCode,
|
||||
pageError: data.pageError,
|
||||
};
|
||||
} catch (jsonError) {
|
||||
logParams.error_message = jsonError.message || jsonError;
|
||||
Logger.debug(
|
||||
`⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
|
||||
);
|
||||
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
if (error.code === "ECONNABORTED") {
|
||||
logParams.error_message = "Request timed out";
|
||||
Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
|
||||
} else {
|
||||
logParams.error_message = error.message || error;
|
||||
Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
|
||||
}
|
||||
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||
} finally {
|
||||
const endTime = Date.now();
|
||||
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||
await logScrape(logParams);
|
||||
}
|
||||
}
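A brief usage sketch for scrapWithPlaywright, assuming PLAYWRIGHT_MICROSERVICE_URL points at a running Playwright service; the URL, wait time, and headers below are placeholders, not values from this diff:

// Illustrative sketch only: calling scrapWithPlaywright directly.
async function examplePlaywrightScrape(): Promise<void> {
  const { content, pageStatusCode, pageError } = await scrapWithPlaywright(
    "https://example.com",          // placeholder URL
    2000,                           // wait 2 s after load
    { "Accept-Language": "en-US" }, // optional headers
    { parsePDF: true }
  );
  if (pageError || (pageStatusCode && pageStatusCode >= 400)) {
    console.error(`Playwright scrape failed (${pageStatusCode}): ${pageError}`);
    return;
  }
  console.log(`Got ${content.length} characters of HTML`);
}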
92 apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts Normal file
@@ -0,0 +1,92 @@
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { generateRequestParams } from "../single_url";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
|
||||
* Scrapes a URL with ScrapingBee
|
||||
* @param url The URL to scrape
|
||||
* @param wait_browser The browser event to wait for
|
||||
* @param timeout The timeout for the scrape
|
||||
* @param pageOptions The options for the page
|
||||
* @returns The scraped content
|
||||
*/
|
||||
export async function scrapWithScrapingBee(
|
||||
url: string,
|
||||
wait_browser: string = "domcontentloaded",
|
||||
timeout: number = universalTimeout,
|
||||
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
|
||||
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
|
||||
const logParams = {
|
||||
url,
|
||||
scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee",
|
||||
success: false,
|
||||
response_code: null,
|
||||
time_taken_seconds: null,
|
||||
error_message: null,
|
||||
html: "",
|
||||
startTime: Date.now(),
|
||||
};
|
||||
try {
|
||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
|
||||
const clientParams = await generateRequestParams(
|
||||
url,
|
||||
wait_browser,
|
||||
timeout
|
||||
);
|
||||
const response = await client.get({
|
||||
...clientParams,
|
||||
params: {
|
||||
...clientParams.params,
|
||||
transparent_status_code: "True",
|
||||
},
|
||||
});
|
||||
Logger.info(
|
||||
`⛏️ ScrapingBee: Scraping ${url}`
|
||||
);
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
logParams.success = true;
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||
logParams.response_code = pageStatusCode;
|
||||
logParams.error_message = pageError;
|
||||
return { content, pageStatusCode, pageError };
|
||||
} else {
|
||||
let text = "";
|
||||
try {
|
||||
const decoder = new TextDecoder();
|
||||
text = decoder.decode(response.data);
|
||||
logParams.success = true;
|
||||
} catch (decodeError) {
|
||||
Logger.debug(
|
||||
`⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
|
||||
);
|
||||
logParams.error_message = decodeError.message || decodeError;
|
||||
}
|
||||
logParams.response_code = response.status;
|
||||
logParams.html = text;
|
||||
logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
|
||||
logParams.error_message = response.statusText !== "OK" ? response.statusText : undefined;
|
||||
return {
|
||||
content: text,
|
||||
pageStatusCode: response.status,
|
||||
pageError: response.statusText !== "OK" ? response.statusText : undefined,
|
||||
};
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
|
||||
logParams.error_message = error.message || error;
|
||||
logParams.response_code = error.response?.status;
|
||||
return {
|
||||
content: "",
|
||||
pageStatusCode: error.response?.status,
|
||||
pageError: error.response?.statusText,
|
||||
};
|
||||
} finally {
|
||||
const endTime = Date.now();
|
||||
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||
await logScrape(logParams);
|
||||
}
|
||||
}
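A brief usage sketch for scrapWithScrapingBee, assuming SCRAPING_BEE_API_KEY is set; the target URL and timeout are placeholders:

// Illustrative sketch only: a direct call to scrapWithScrapingBee.
// Passing "networkidle2" instead of "domcontentloaded" would log under "scrapingBeeLoad", as above.
async function exampleScrapingBeeScrape(): Promise<void> {
  const { content, pageStatusCode, pageError } = await scrapWithScrapingBee(
    "https://example.com", // placeholder URL
    "domcontentloaded",
    15000,
    { parsePDF: true }
  );
  console.log(pageStatusCode, pageError, content.slice(0, 200));
}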
506 apps/api/src/scraper/WebScraper/single_url.ts Normal file
@@ -0,0 +1,506 @@
import * as cheerio from "cheerio";
|
||||
import { extractMetadata } from "./utils/metadata";
|
||||
import dotenv from "dotenv";
|
||||
import {
|
||||
Document,
|
||||
PageOptions,
|
||||
FireEngineResponse,
|
||||
ExtractorOptions,
|
||||
Action,
|
||||
} from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
import { handleCustomScraping } from "./custom/handleCustomScraping";
|
||||
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
|
||||
import { scrapWithFetch } from "./scrapers/fetch";
|
||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
import { scrapWithPlaywright } from "./scrapers/playwright";
|
||||
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
|
||||
import { extractLinks } from "./utils/utils";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||
import { clientSideError } from "../../strings";
|
||||
import { ScrapeActionContent } from "../../lib/entities";
|
||||
import { removeBase64Images } from "./utils/removeBase64Images";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
|
||||
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||
|
||||
export const baseScrapers = [
|
||||
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useScrapingBee ? "scrapingBee" : undefined,
|
||||
useFireEngine ? undefined : "playwright",
|
||||
useScrapingBee ? "scrapingBeeLoad" : undefined,
|
||||
"fetch",
|
||||
].filter(Boolean);
|
||||
|
||||
export async function generateRequestParams(
|
||||
url: string,
|
||||
wait_browser: string = "domcontentloaded",
|
||||
timeout: number = 15000
|
||||
): Promise<any> {
|
||||
const defaultParams = {
|
||||
url: url,
|
||||
params: { timeout: timeout, wait_browser: wait_browser },
|
||||
headers: { "ScrapingService-Request": "TRUE" },
|
||||
};
|
||||
|
||||
try {
|
||||
const urlKey = new URL(url).hostname.replace(/^www\./, "");
|
||||
if (urlSpecificParams.hasOwnProperty(urlKey)) {
|
||||
return { ...defaultParams, ...urlSpecificParams[urlKey] };
|
||||
} else {
|
||||
return defaultParams;
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error generating URL key: ${error}`);
|
||||
return defaultParams;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the order of scrapers to be used for scraping a URL
|
||||
* If the user doesn't have envs set for a specific scraper, it will be removed from the order.
|
||||
* @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
|
||||
* @returns The order of scrapers to be used for scraping a URL
|
||||
*/
|
||||
function getScrapingFallbackOrder(
|
||||
defaultScraper?: string,
|
||||
isWaitPresent: boolean = false,
|
||||
isScreenshotPresent: boolean = false,
|
||||
isHeadersPresent: boolean = false,
|
||||
isActionsPresent: boolean = false,
|
||||
) {
|
||||
if (isActionsPresent) {
|
||||
return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
|
||||
}
|
||||
|
||||
const availableScrapers = baseScrapers.filter((scraper) => {
|
||||
switch (scraper) {
|
||||
case "scrapingBee":
|
||||
case "scrapingBeeLoad":
|
||||
return !!process.env.SCRAPING_BEE_API_KEY;
|
||||
case "fire-engine":
|
||||
return !!process.env.FIRE_ENGINE_BETA_URL;
|
||||
case "fire-engine;chrome-cdp":
|
||||
return !!process.env.FIRE_ENGINE_BETA_URL;
|
||||
case "playwright":
|
||||
return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
||||
let defaultOrder = [
|
||||
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
|
||||
useFireEngine ? "fire-engine" : undefined,
|
||||
useScrapingBee ? "scrapingBee" : undefined,
|
||||
useScrapingBee ? "scrapingBeeLoad" : undefined,
|
||||
useFireEngine ? undefined : "playwright",
|
||||
"fetch",
|
||||
].filter(Boolean);
|
||||
|
||||
// if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
|
||||
// defaultOrder = [
|
||||
// "fire-engine",
|
||||
// useFireEngine ? undefined : "playwright",
|
||||
// ...defaultOrder.filter(
|
||||
// (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
|
||||
// ),
|
||||
// ].filter(Boolean);
|
||||
// }
|
||||
|
||||
const filteredDefaultOrder = defaultOrder.filter(
|
||||
(scraper: (typeof baseScrapers)[number]) =>
|
||||
availableScrapers.includes(scraper)
|
||||
);
|
||||
const uniqueScrapers = new Set(
|
||||
defaultScraper
|
||||
? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
|
||||
: [...filteredDefaultOrder, ...availableScrapers]
|
||||
);
|
||||
|
||||
const scrapersInOrder = Array.from(uniqueScrapers);
|
||||
return scrapersInOrder as (typeof baseScrapers)[number][];
|
||||
}
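// Illustrative worked example (not part of this diff): what the fallback order above
// resolves to under common configurations, derived from the logic in this function.
//
//   // Both FIRE_ENGINE_BETA_URL and SCRAPING_BEE_API_KEY set, no host-specific default:
//   getScrapingFallbackOrder();
//   // => ["fire-engine;chrome-cdp", "fire-engine", "scrapingBee", "scrapingBeeLoad", "fetch"]
//
//   // Only PLAYWRIGHT_MICROSERVICE_URL set:
//   getScrapingFallbackOrder();
//   // => ["playwright", "fetch"]
//
//   // Actions requested but fire-engine not configured:
//   getScrapingFallbackOrder(undefined, false, false, false, true);
//   // => [], so scrapSingleUrl below falls through to "All scraping methods failed".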
|
||||
|
||||
|
||||
|
||||
export async function scrapSingleUrl(
|
||||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions?: ExtractorOptions,
|
||||
existingHtml?: string,
|
||||
priority?: number,
|
||||
teamId?: string
|
||||
): Promise<Document> {
|
||||
pageOptions = {
|
||||
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
||||
includeExtract: pageOptions.includeExtract ?? false,
|
||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||
includeHtml: pageOptions.includeHtml ?? false,
|
||||
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
||||
waitFor: pageOptions.waitFor ?? undefined,
|
||||
screenshot: pageOptions.screenshot ?? false,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
|
||||
headers: pageOptions.headers ?? undefined,
|
||||
includeLinks: pageOptions.includeLinks ?? true,
|
||||
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
parsePDF: pageOptions.parsePDF ?? true,
|
||||
removeTags: pageOptions.removeTags ?? [],
|
||||
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
|
||||
useFastMode: pageOptions.useFastMode ?? false,
|
||||
disableJsDom: pageOptions.disableJsDom ?? false,
|
||||
atsv: pageOptions.atsv ?? false,
|
||||
actions: pageOptions.actions ?? undefined,
|
||||
geolocation: pageOptions.geolocation ?? undefined,
|
||||
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
|
||||
removeBase64Images: pageOptions.removeBase64Images ?? true,
|
||||
mobile: pageOptions.mobile ?? false,
|
||||
}
|
||||
|
||||
if (extractorOptions) {
|
||||
extractorOptions = {
|
||||
mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
|
||||
}
|
||||
}
|
||||
|
||||
if (!existingHtml) {
|
||||
existingHtml = "";
|
||||
}
|
||||
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
const attemptScraping = async (
|
||||
url: string,
|
||||
method: (typeof baseScrapers)[number]
|
||||
) => {
|
||||
let scraperResponse: {
|
||||
text: string;
|
||||
screenshot: string;
|
||||
actions?: {
|
||||
screenshots?: string[];
|
||||
scrapes?: ScrapeActionContent[];
|
||||
};
|
||||
metadata: { pageStatusCode?: number; pageError?: string | null };
|
||||
} = { text: "", screenshot: "", metadata: {} };
|
||||
let screenshot = "";
|
||||
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(jobId, {
|
||||
type: "scrape",
|
||||
url,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method,
|
||||
result: null,
|
||||
});
|
||||
|
||||
switch (method) {
|
||||
case "fire-engine":
|
||||
case "fire-engine;chrome-cdp":
|
||||
|
||||
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
|
||||
if (method === "fire-engine;chrome-cdp") {
|
||||
engine = "chrome-cdp";
|
||||
}
|
||||
|
||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||
const processedActions: Action[] = pageOptions.actions?.flatMap((action: Action, index: number, array: Action[]) => {
|
||||
if (action.type === "click" || action.type === "write" || action.type === "press") {
|
||||
const result: Action[] = [];
|
||||
// Don't add a wait if the previous action is a wait
|
||||
// if (index === 0 || array[index - 1].type !== "wait") {
|
||||
// result.push({ type: "wait", milliseconds: 1200 } as Action);
|
||||
// }
|
||||
// Fire-engine now handles wait times automatically, leaving the code here for now
|
||||
result.push(action);
|
||||
// Don't add a wait if the next action is a wait
|
||||
// if (index === array.length - 1 || array[index + 1].type !== "wait") {
|
||||
// result.push({ type: "wait", milliseconds: 1200 } as Action);
|
||||
// }
|
||||
return result;
|
||||
}
|
||||
return [action as Action];
|
||||
}) ?? [] as Action[];
|
||||
|
||||
const response = await scrapWithFireEngine({
|
||||
url,
|
||||
...(engine === "chrome-cdp" ? ({
|
||||
actions: [
|
||||
...(pageOptions.waitFor ? [{
|
||||
type: "wait" as const,
|
||||
milliseconds: pageOptions.waitFor,
|
||||
}] : []),
|
||||
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
|
||||
type: "screenshot" as const,
|
||||
fullPage: !!pageOptions.fullPageScreenshot,
|
||||
}] : []),
|
||||
...processedActions,
|
||||
],
|
||||
}) : ({
|
||||
waitFor: pageOptions.waitFor,
|
||||
screenshot: pageOptions.screenshot,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
||||
})),
|
||||
pageOptions: pageOptions,
|
||||
headers: pageOptions.headers,
|
||||
fireEngineOptions: {
|
||||
engine: engine,
|
||||
atsv: pageOptions.atsv,
|
||||
disableJsDom: pageOptions.disableJsDom,
|
||||
},
|
||||
priority,
|
||||
teamId,
|
||||
});
|
||||
scraperResponse.text = response.html;
|
||||
if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
|
||||
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
|
||||
}
|
||||
if (pageOptions.actions) {
|
||||
scraperResponse.actions = {
|
||||
screenshots: response.screenshots ?? [],
|
||||
scrapes: response.scrapeActionContent ?? [],
|
||||
};
|
||||
}
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "scrapingBee":
|
||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
||||
const response = await scrapWithScrapingBee(
|
||||
url,
|
||||
"domcontentloaded",
|
||||
pageOptions.fallback === false ? 7000 : 15000
|
||||
);
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "playwright":
|
||||
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
|
||||
const response = await scrapWithPlaywright(
|
||||
url,
|
||||
pageOptions.waitFor,
|
||||
pageOptions.headers
|
||||
);
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "scrapingBeeLoad":
|
||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
||||
const response = await scrapWithScrapingBee(url, "networkidle2");
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
}
|
||||
break;
|
||||
case "fetch":
|
||||
const response = await scrapWithFetch(url);
|
||||
scraperResponse.text = response.content;
|
||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||
scraperResponse.metadata.pageError = response.pageError;
|
||||
break;
|
||||
}
|
||||
|
||||
let customScrapedContent: FireEngineResponse | null = null;
|
||||
|
||||
// Check for custom scraping conditions
|
||||
const customScraperResult = await handleCustomScraping(
|
||||
scraperResponse.text,
|
||||
url
|
||||
);
|
||||
|
||||
if (customScraperResult) {
|
||||
switch (customScraperResult.scraper) {
|
||||
case "fire-engine":
|
||||
customScrapedContent = await scrapWithFireEngine({
|
||||
url: customScraperResult.url,
|
||||
actions: customScraperResult.waitAfterLoad ? ([
|
||||
{
|
||||
type: "wait",
|
||||
milliseconds: customScraperResult.waitAfterLoad,
|
||||
}
|
||||
]) : ([]),
|
||||
pageOptions: customScraperResult.pageOptions,
|
||||
});
|
||||
break;
|
||||
case "pdf":
|
||||
const { content, pageStatusCode, pageError } =
|
||||
await fetchAndProcessPdf(
|
||||
customScraperResult.url,
|
||||
pageOptions?.parsePDF
|
||||
);
|
||||
customScrapedContent = {
|
||||
html: content,
|
||||
pageStatusCode,
|
||||
pageError,
|
||||
};
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (customScrapedContent) {
|
||||
scraperResponse.text = customScrapedContent.html;
|
||||
}
|
||||
//* TODO: add an optional to return markdown or structured/extracted content
|
||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||
let text = await parseMarkdown(cleanedHtml);
|
||||
if (pageOptions.removeBase64Images) {
|
||||
text = await removeBase64Images(text);
|
||||
}
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: scraperResponse.text.length,
|
||||
success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
|
||||
error: scraperResponse.metadata.pageError,
|
||||
response_code: scraperResponse.metadata.pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
|
||||
return {
|
||||
text,
|
||||
html: cleanedHtml,
|
||||
rawHtml: scraperResponse.text,
|
||||
screenshot: scraperResponse.screenshot,
|
||||
actions: scraperResponse.actions,
|
||||
pageStatusCode: scraperResponse.metadata.pageStatusCode,
|
||||
pageError: scraperResponse.metadata.pageError || undefined,
|
||||
};
|
||||
};
|
||||
|
||||
let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
|
||||
text: "",
|
||||
html: "",
|
||||
rawHtml: "",
|
||||
screenshot: "",
|
||||
actions: undefined,
|
||||
pageStatusCode: 200,
|
||||
pageError: undefined,
|
||||
};
|
||||
try {
|
||||
let urlKey = urlToScrap;
|
||||
try {
|
||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
||||
} catch (error) {
|
||||
Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
|
||||
}
|
||||
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
|
||||
const scrapersInOrder = getScrapingFallbackOrder(
|
||||
defaultScraper,
|
||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
|
||||
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
|
||||
);
|
||||
|
||||
for (const scraper of scrapersInOrder) {
|
||||
// If exists text coming from crawler, use it
|
||||
if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) {
|
||||
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
|
||||
text = await parseMarkdown(cleanedHtml);
|
||||
html = cleanedHtml;
|
||||
break;
|
||||
}
|
||||
|
||||
const attempt = await attemptScraping(urlToScrap, scraper);
|
||||
text = attempt.text ?? "";
|
||||
html = attempt.html ?? "";
|
||||
rawHtml = attempt.rawHtml ?? "";
|
||||
screenshot = attempt.screenshot ?? "";
|
||||
actions = attempt.actions ?? undefined;
|
||||
|
||||
if (attempt.pageStatusCode) {
|
||||
pageStatusCode = attempt.pageStatusCode;
|
||||
}
|
||||
|
||||
if (attempt.pageError && (attempt.pageStatusCode >= 400 || scrapersInOrder.indexOf(scraper) === scrapersInOrder.length - 1)) { // force pageError if it's the last scraper and it failed too
|
||||
pageError = attempt.pageError;
|
||||
|
||||
if (attempt.pageStatusCode < 400 || !attempt.pageStatusCode) {
|
||||
pageStatusCode = 500;
|
||||
}
|
||||
} else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
|
||||
pageError = undefined;
|
||||
}
|
||||
|
||||
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
|
||||
break;
|
||||
}
|
||||
if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`);
|
||||
break;
|
||||
}
|
||||
// const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
// if (nextScraperIndex < scrapersInOrder.length) {
|
||||
// Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
||||
// }
|
||||
}
|
||||
|
||||
if (!text) {
|
||||
throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
|
||||
}
|
||||
|
||||
const soup = cheerio.load(rawHtml);
|
||||
const metadata = extractMetadata(soup, urlToScrap);
|
||||
|
||||
let linksOnPage: string[] | undefined;
|
||||
|
||||
if (pageOptions.includeLinks) {
|
||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||
}
|
||||
|
||||
let document: Document = {
|
||||
content: text,
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||
? rawHtml
|
||||
: undefined,
|
||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||
actions,
|
||||
metadata: {
|
||||
...metadata,
|
||||
...(screenshot && screenshot.length > 0 ? ({
|
||||
screenshot,
|
||||
}) : {}),
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
};
|
||||
|
||||
return document;
|
||||
} catch (error) {
|
||||
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
|
||||
ScrapeEvents.insert(jobId, {
|
||||
type: "error",
|
||||
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
|
||||
stack: error.stack,
|
||||
});
|
||||
|
||||
return {
|
||||
content: "",
|
||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
|
||||
html: "",
|
||||
linksOnPage: pageOptions.includeLinks ? [] : undefined,
|
||||
metadata: {
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
} as Document;
|
||||
}
|
||||
}
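A hedged usage sketch for scrapSingleUrl; the job id, URL, and option values are placeholders and not taken from this diff:

// Illustrative sketch only: invoking scrapSingleUrl with a minimal set of options.
async function exampleSingleScrape(): Promise<void> {
  const doc = await scrapSingleUrl(
    "job-1234",            // placeholder job id
    "https://example.com", // placeholder URL
    { onlyMainContent: true, includeHtml: true, waitFor: 1000 } as PageOptions
  );
  console.log(doc.metadata.pageStatusCode, doc.content.slice(0, 200));
}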
@@ -1,10 +1,9 @@
import axios from "axios";
|
||||
import { axiosTimeout } from "../../lib/timeout";
|
||||
import { parseStringPromise } from "xml2js";
|
||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
import { WebCrawler } from "./crawler";
|
||||
import { logger } from "../../lib/logger";
|
||||
import { scrapeURL } from "../scrapeURL";
|
||||
import { scrapeOptions } from "../../controllers/v1/types";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
export async function getLinksFromSitemap(
|
||||
{
|
||||
@@ -18,20 +17,17 @@ export async function getLinksFromSitemap(
}
|
||||
): Promise<string[]> {
|
||||
try {
|
||||
let content: string = "";
|
||||
let content: string;
|
||||
try {
|
||||
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = response.data;
|
||||
} else if (mode === 'fire-engine') {
|
||||
const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true });
|
||||
if (!response.success) {
|
||||
throw response.error;
|
||||
}
|
||||
content = response.document.rawHtml!;
|
||||
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"playwright" } });
|
||||
content = response.html;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
|
||||
Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
|
||||
|
||||
return allUrls;
|
||||
}
|
||||
@@ -51,7 +47,7 @@ export async function getLinksFromSitemap(
allUrls.push(...validUrls);
|
||||
}
|
||||
} catch (error) {
|
||||
logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
|
||||
Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
|
||||
}
|
||||
|
||||
return allUrls;
|
||||
@@ -0,0 +1,15 @@
import * as docxProcessor from "../docxProcessor";
|
||||
|
||||
describe("DOCX Processing Module - Integration Test", () => {
|
||||
it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
|
||||
delete process.env.LLAMAPARSE_API_KEY;
|
||||
const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
|
||||
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
|
||||
);
|
||||
expect(content.trim()).toContain(
|
||||
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
|
||||
);
|
||||
expect(pageStatusCode).toBe(200);
|
||||
expect(pageError).toBeUndefined();
|
||||
});
|
||||
});
@@ -0,0 +1,128 @@
import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
describe('parseTablesToMarkdown', () => {
|
||||
it('converts a simple HTML table to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th>Header 1</th><th>Header 2</th></tr>
|
||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
||||
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
it('converts a table with a single row to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th>Header 1</th><th>Header 2</th></tr>
|
||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
it('converts a table with a single column to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th>Header 1</th></tr>
|
||||
<tr><td>Row 1 Col 1</td></tr>
|
||||
<tr><td>Row 2 Col 1</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
it('converts a table with a single cell to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th>Header 1</th></tr>
|
||||
<tr><td>Row 1 Col 1</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
it('converts a table with no header to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
||||
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div>| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
it('converts a table with no rows to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div></div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
it('converts a table with no cells to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div></div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
it('converts a table with no columns to Markdown', async () => {
|
||||
const html = `
|
||||
<table>
|
||||
<tr><th></th></tr>
|
||||
</table>
|
||||
`;
|
||||
const expectedMarkdown = `<div></div>`;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
it('converts a table with no table to Markdown', async () => {
|
||||
const html = ``;
|
||||
const expectedMarkdown = ``;
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
it('converts a table inside of a bunch of html noise', async () => {
|
||||
const html = `
|
||||
<div>
|
||||
<p>Some text before</p>
|
||||
<table>
|
||||
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
|
||||
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
|
||||
</table>
|
||||
<p>Some text after</p>
|
||||
</div>
|
||||
`;
|
||||
const expectedMarkdown = `<div>
|
||||
<p>Some text before</p>
|
||||
<div>| Row 1 Col 1 | Row 1 Col 2 |
|
||||
| Row 2 Col 1 | Row 2 Col 2 |</div>
|
||||
<p>Some text after</p>
|
||||
</div>`;
|
||||
|
||||
const markdown = await parseTablesToMarkdown(html);
|
||||
expect(markdown).toBe(expectedMarkdown);
|
||||
});
|
||||
|
||||
});
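As the tests above suggest, parseTablesToMarkdown takes an HTML string and returns it with each <table> replaced by a pipe-delimited Markdown table wrapped in a <div>. A small usage sketch; the input HTML is an arbitrary example:

// Illustrative sketch only: converting tables ahead of HTML-to-markdown parsing.
async function exampleTableConversion(): Promise<string> {
  const html = `<table><tr><th>Name</th><th>Qty</th></tr><tr><td>Widget</td><td>3</td></tr></table>`;
  // Expected shape: "<div>| Name | Qty |\n| --- | --- |\n| Widget | 3 |</div>"
  return await parseTablesToMarkdown(html);
}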
@@ -0,0 +1,19 @@
import * as pdfProcessor from '../pdfProcessor';
|
||||
|
||||
describe('PDF Processing Module - Integration Test', () => {
|
||||
it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
|
||||
delete process.env.LLAMAPARSE_API_KEY;
|
||||
const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
|
||||
expect(content.trim()).toEqual("Dummy PDF file");
|
||||
expect(pageStatusCode).toEqual(200);
|
||||
expect(pageError).toBeUndefined();
|
||||
});
|
||||
|
||||
it('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
|
||||
const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/astro-ph/9301001.pdf', false);
|
||||
expect(pageStatusCode).toBe(200);
|
||||
expect(pageError).toBeUndefined();
|
||||
expect(content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
});
|
File diff suppressed because one or more lines are too long
@@ -0,0 +1,127 @@
import { Document } from "../../../../lib/entities";
|
||||
import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths";
|
||||
|
||||
describe('replacePaths', () => {
|
||||
describe('replacePathsWithAbsolutePaths', () => {
|
||||
it('should replace relative paths with absolute paths', () => {
|
||||
const documents: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'This is a [link](/path/to/resource).',
|
||||
markdown: 'This is a [link](/path/to/resource).'
|
||||
}];
|
||||
|
||||
const expectedDocuments: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'This is a [link](https://example.com/path/to/resource).',
|
||||
markdown: 'This is a [link](https://example.com/path/to/resource).'
|
||||
}];
|
||||
|
||||
const result = replacePathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
|
||||
it('should not alter absolute URLs', () => {
|
||||
const documents: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'This is an [external link](https://external.com/path).',
|
||||
markdown: 'This is an [external link](https://external.com/path).'
|
||||
}];
|
||||
|
||||
const result = replacePathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(documents); // Expect no change
|
||||
});
|
||||
|
||||
it('should not alter data URLs for images', () => {
|
||||
const documents: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).',
|
||||
markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
|
||||
}];
|
||||
|
||||
const result = replacePathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(documents); // Expect no change
|
||||
});
|
||||
|
||||
it('should handle multiple links and images correctly', () => {
|
||||
const documents: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'Here are two links: [link1](/path1) and [link2](/path2).',
|
||||
markdown: 'Here are two links: [link1](/path1) and [link2](/path2).'
|
||||
}];
|
||||
|
||||
const expectedDocuments: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).',
|
||||
markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).'
|
||||
}];
|
||||
|
||||
const result = replacePathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
|
||||
it('should correctly handle a mix of absolute and relative paths', () => {
|
||||
const documents: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
|
||||
markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
|
||||
}];
|
||||
|
||||
const expectedDocuments: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
|
||||
markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
|
||||
}];
|
||||
|
||||
const result = replacePathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
describe('replaceImgPathsWithAbsolutePaths', () => {
|
||||
it('should replace relative image paths with absolute paths', () => {
|
||||
const documents: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'Here is an image: ![alt text](/path/to/image.jpg).',
|
||||
markdown: 'Here is an image: ![alt text](/path/to/image.jpg).'
|
||||
}];
|
||||
|
||||
const expectedDocuments: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).',
|
||||
markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
|
||||
}];
|
||||
|
||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
|
||||
it('should not alter data:image URLs', () => {
|
||||
const documents: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).',
|
||||
markdown: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
|
||||
}];
|
||||
|
||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(documents); // Expect no change
|
||||
});
|
||||
|
||||
it('should handle multiple images with a mix of data and relative URLs', () => {
|
||||
const documents: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).',
|
||||
markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
|
||||
}];
|
||||
|
||||
const expectedDocuments: Document[] = [{
|
||||
metadata: { sourceURL: 'https://example.com' },
|
||||
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).',
|
||||
markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
|
||||
}];
|
||||
|
||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
||||
expect(result).toEqual(expectedDocuments);
|
||||
});
|
||||
});
|
||||
});
@@ -0,0 +1,66 @@
import { Logger } from '../../../../lib/logger';
|
||||
import { isUrlBlocked } from '../blocklist';
|
||||
|
||||
describe('isUrlBlocked', () => {
|
||||
it('should return true for blocked social media URLs', () => {
|
||||
const blockedUrls = [
|
||||
'https://www.facebook.com',
|
||||
'https://twitter.com/someuser',
|
||||
'https://instagram.com/someuser',
|
||||
'https://www.linkedin.com/in/someuser',
|
||||
'https://snapchat.com/someuser',
|
||||
'https://tiktok.com/@someuser',
|
||||
'https://reddit.com/r/somesubreddit',
|
||||
'https://flickr.com/photos/someuser',
|
||||
'https://whatsapp.com/someuser',
|
||||
'https://wechat.com/someuser',
|
||||
'https://telegram.org/someuser',
|
||||
];
|
||||
|
||||
blockedUrls.forEach(url => {
|
||||
if (!isUrlBlocked(url)) {
|
||||
Logger.debug(`URL not blocked: ${url}`);
|
||||
}
|
||||
expect(isUrlBlocked(url)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
it('should return false for URLs containing allowed keywords', () => {
|
||||
const allowedUrls = [
|
||||
'https://www.facebook.com/privacy',
|
||||
'https://twitter.com/terms',
|
||||
'https://instagram.com/legal',
|
||||
'https://www.linkedin.com/help',
|
||||
'https://pinterest.com/about',
|
||||
'https://snapchat.com/support',
|
||||
'https://tiktok.com/contact',
|
||||
'https://reddit.com/user-agreement',
|
||||
'https://tumblr.com/policy',
|
||||
'https://flickr.com/blog',
|
||||
'https://whatsapp.com/press',
|
||||
'https://wechat.com/careers',
|
||||
'https://telegram.org/conditions',
|
||||
'https://wix.com/careers',
|
||||
];
|
||||
|
||||
allowedUrls.forEach(url => {
|
||||
expect(isUrlBlocked(url)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
it('should return false for non-blocked URLs', () => {
|
||||
const nonBlockedUrls = [
|
||||
'https://www.example.com',
|
||||
'https://www.somewebsite.org',
|
||||
'https://subdomain.example.com',
|
||||
'firecrawl.dev',
|
||||
'amazon.com',
|
||||
'wix.com',
|
||||
'https://wix.com'
|
||||
];
|
||||
|
||||
nonBlockedUrls.forEach(url => {
|
||||
expect(isUrlBlocked(url)).toBe(false);
|
||||
});
|
||||
});
|
||||
});
@@ -1,4 +1,4 @@
import { logger } from "../../../lib/logger";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
const socialMediaBlocklist = [
|
||||
'facebook.com',
|
||||
@@ -68,7 +68,7 @@ export function isUrlBlocked(url: string): boolean {
return isBlocked;
|
||||
} catch (e) {
|
||||
// If an error occurs (e.g., invalid URL), return false
|
||||
logger.error(`Error parsing the following URL: ${url}`);
|
||||
Logger.error(`Error parsing the following URL: ${url}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
198 apps/api/src/scraper/WebScraper/utils/custom/website_params.ts Normal file
@@ -0,0 +1,198 @@
export const urlSpecificParams = {
|
||||
|
||||
"support.greenpay.me":{
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 2000,
|
||||
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"docs.pdw.co":{
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 3000,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"developers.notion.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 2000,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"docs2.hubitat.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 2000,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"scrapethissite.com":{
|
||||
defaultScraper: "fetch",
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"rsseau.fr":{
|
||||
defaultScraper: "fetch",
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"help.salesforce.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 2000,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
},
|
||||
"ir.veeva.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
},
|
||||
"eonhealth.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
mobileProxy: true,
|
||||
method: "get",
|
||||
engine: "request",
|
||||
},
|
||||
},
|
||||
},
|
||||
"notion.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 2000,
|
||||
engine: "playwright",
|
||||
}
|
||||
},
|
||||
"developer.apple.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
engine: "playwright",
|
||||
wait: 2000,
|
||||
fireEngineOptions: {
|
||||
blockMedia: false,
|
||||
}
|
||||
},
|
||||
},
|
||||
"amazon.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "chrome-cdp",
|
||||
},
|
||||
},
|
||||
},
|
||||
"digikey.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "tlsclient",
|
||||
},
|
||||
},
|
||||
},
|
||||
"zoopla.co.uk":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "chrome-cdp",
|
||||
},
|
||||
},
|
||||
},
|
||||
"lorealparis.hu":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "tlsclient",
|
||||
},
|
||||
},
|
||||
}
|
||||
};
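These per-host overrides are merged over the defaults by generateRequestParams shown earlier in this diff. A minimal sketch of that lookup; the example URL is arbitrary:

// Illustrative sketch only: how a URL maps onto an entry in urlSpecificParams.
function exampleHostLookup(url: string) {
  const urlKey = new URL(url).hostname.replace(/^www\./, "");
  // e.g. "https://www.notion.com/product" -> key "notion.com" -> fire-engine with the playwright engine
  return urlSpecificParams.hasOwnProperty(urlKey) ? urlSpecificParams[urlKey] : null;
}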
79 apps/api/src/scraper/WebScraper/utils/docxProcessor.ts Normal file
@@ -0,0 +1,79 @@
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import { createWriteStream } from "node:fs";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import mammoth from "mammoth";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
|
||||
let tempFilePath = '';
|
||||
let pageStatusCode = 200;
|
||||
let pageError = '';
|
||||
let content = '';
|
||||
|
||||
try {
|
||||
const downloadResult = await downloadDocx(url);
|
||||
tempFilePath = downloadResult.tempFilePath;
|
||||
pageStatusCode = downloadResult.pageStatusCode;
|
||||
pageError = downloadResult.pageError;
|
||||
content = await processDocxToText(tempFilePath);
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to fetch and process DOCX: ${error.message}`);
|
||||
pageStatusCode = 500;
|
||||
pageError = error.message;
|
||||
content = '';
|
||||
} finally {
|
||||
if (tempFilePath) {
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
}
|
||||
}
|
||||
|
||||
return { content, pageStatusCode, pageError };
|
||||
}
|
||||
|
||||
async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
|
||||
try {
|
||||
const response = await axios({
|
||||
url,
|
||||
method: "GET",
|
||||
responseType: "stream",
|
||||
});
|
||||
|
||||
const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
|
||||
const writer = createWriteStream(tempFilePath);
|
||||
|
||||
response.data.pipe(writer);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
|
||||
writer.on("error", () => {
|
||||
Logger.error('Failed to write DOCX file to disk');
|
||||
reject(new Error('Failed to write DOCX file to disk'));
|
||||
});
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to download DOCX: ${error.message}`);
|
||||
return { tempFilePath: "", pageStatusCode: 500, pageError: error.message };
|
||||
}
|
||||
}
|
||||
|
||||
export async function processDocxToText(filePath: string): Promise<string> {
|
||||
try {
|
||||
const content = await extractTextFromDocx(filePath);
|
||||
return content;
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to process DOCX to text: ${error.message}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
async function extractTextFromDocx(filePath: string): Promise<string> {
|
||||
try {
|
||||
const result = await mammoth.extractRawText({ path: filePath });
|
||||
return result.value;
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to extract text from DOCX: ${error.message}`);
|
||||
return "";
|
||||
}
|
||||
}
|
apps/api/src/scraper/WebScraper/utils/excludeTags.ts (new file, 42 lines)
@@ -0,0 +1,42 @@
export const excludeNonMainTags = [
  "header",
  "footer",
  "nav",
  "aside",
  ".top",
  ".navbar",
  ".footer",
  ".bottom",
  "#footer",
  ".sidebar",
  ".side",
  ".aside",
  "#sidebar",
  ".modal",
  ".popup",
  "#modal",
  ".overlay",
  ".ad",
  ".ads",
  ".advert",
  "#ad",
  ".lang-selector",
  ".language",
  "#language-selector",
  ".social",
  ".social-media",
  ".social-links",
  "#social",
  ".menu",
  ".navigation",
  "#nav",
  ".breadcrumbs",
  "#breadcrumbs",
  "#search-form",
  ".search",
  "#search",
  ".share",
  "#share",
  ".cookie",
  "#cookie"
];
apps/api/src/scraper/WebScraper/utils/imageDescription.ts (new file, 89 lines)
@@ -0,0 +1,89 @@
import Anthropic from '@anthropic-ai/sdk';
import axios from 'axios';
import { Logger } from '../../../lib/logger';

export async function getImageDescription(
  imageUrl: string,
  backText: string,
  frontText: string,
  model: string = "gpt-4-turbo"
): Promise<string> {
  try {
    const prompt = "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " +
      backText +
      " and the following text: " +
      frontText +
      ". Be super concise."

    switch (model) {
      case 'claude-3-opus': {
        if (!process.env.ANTHROPIC_API_KEY) {
          throw new Error("No Anthropic API key provided");
        }
        const imageRequest = await axios.get(imageUrl, { responseType: 'arraybuffer' });
        const imageMediaType = 'image/png';
        const imageData = Buffer.from(imageRequest.data, 'binary').toString('base64');

        const anthropic = new Anthropic();
        const response = await anthropic.messages.create({
          model: "claude-3-opus-20240229",
          max_tokens: 1024,
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "image",
                  source: {
                    type: "base64",
                    media_type: imageMediaType,
                    data: imageData,
                  },
                },
                {
                  type: "text",
                  text: prompt
                }
              ],
            }
          ]
        });

        return response[0].content.text;
      }
      default: {
        if (!process.env.OPENAI_API_KEY) {
          throw new Error("No OpenAI API key provided");
        }

        const { OpenAI } = require("openai");
        const openai = new OpenAI();

        const response = await openai.chat.completions.create({
          model: "gpt-4-turbo",
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: prompt,
                },
                {
                  type: "image_url",
                  image_url: {
                    url: imageUrl,
                  },
                },
              ],
            },
          ],
        });
        return response.choices[0].message.content;
      }
    }
  } catch (error) {
    Logger.error(`Error generating image alt text: ${error}`);
    return "";
  }
}
@@ -7,6 +7,6 @@ export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number
}

export function getURLDepth(url: string): number {
  const pathSplits = new URL(url).pathname.split('/').filter(x => x !== "" && x !== "index.php" && x !== "index.html");
  return pathSplits.length;
  const pathSplits = new URL(url).pathname.split('/');
  return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
}
apps/api/src/scraper/WebScraper/utils/metadata.ts (new file, 185 lines)
@@ -0,0 +1,185 @@
import { CheerioAPI } from "cheerio";
import { Logger } from "../../../lib/logger";

interface Metadata {
  title?: string;
  description?: string;
  language?: string;
  keywords?: string;
  robots?: string;
  ogTitle?: string;
  ogDescription?: string;
  ogUrl?: string;
  ogImage?: string;
  ogAudio?: string;
  ogDeterminer?: string;
  ogLocale?: string;
  ogLocaleAlternate?: string[];
  ogSiteName?: string;
  ogVideo?: string;
  dctermsCreated?: string;
  dcDateCreated?: string;
  dcDate?: string;
  dctermsType?: string;
  dcType?: string;
  dctermsAudience?: string;
  dctermsSubject?: string;
  dcSubject?: string;
  dcDescription?: string;
  dctermsKeywords?: string;
  modifiedTime?: string;
  publishedTime?: string;
  articleTag?: string;
  articleSection?: string;
  sourceURL?: string;
  pageStatusCode?: number;
  pageError?: string;
  [key: string]: string | string[] | number | undefined;
}

export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
  let title: string | null = null;
  let description: string | null = null;
  let language: string | null = null;
  let keywords: string | null = null;
  let robots: string | null = null;
  let ogTitle: string | null = null;
  let ogDescription: string | null = null;
  let ogUrl: string | null = null;
  let ogImage: string | null = null;
  let ogAudio: string | null = null;
  let ogDeterminer: string | null = null;
  let ogLocale: string | null = null;
  let ogLocaleAlternate: string[] | null = null;
  let ogSiteName: string | null = null;
  let ogVideo: string | null = null;
  let dctermsCreated: string | null = null;
  let dcDateCreated: string | null = null;
  let dcDate: string | null = null;
  let dctermsType: string | null = null;
  let dcType: string | null = null;
  let dctermsAudience: string | null = null;
  let dctermsSubject: string | null = null;
  let dcSubject: string | null = null;
  let dcDescription: string | null = null;
  let dctermsKeywords: string | null = null;
  let modifiedTime: string | null = null;
  let publishedTime: string | null = null;
  let articleTag: string | null = null;
  let articleSection: string | null = null;
  let sourceURL: string | null = null;
  let pageStatusCode: number | null = null;
  let pageError: string | null = null;

  const customMetadata: Record<string, string | string[]> = {};

  try {
    // TODO: remove this as it is redundant with the below implementation
    title = soup("title").text() || null;
    description = soup('meta[name="description"]').attr("content") || null;

    language = soup("html").attr("lang") || null;

    keywords = soup('meta[name="keywords"]').attr("content") || null;
    robots = soup('meta[name="robots"]').attr("content") || null;
    ogTitle = soup('meta[property="og:title"]').attr("content") || null;
    ogDescription =
      soup('meta[property="og:description"]').attr("content") || null;
    ogUrl = soup('meta[property="og:url"]').attr("content") || null;
    ogImage = soup('meta[property="og:image"]').attr("content") || null;
    ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
    ogDeterminer =
      soup('meta[property="og:determiner"]').attr("content") || null;
    ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
    ogLocaleAlternate =
      soup('meta[property="og:locale:alternate"]')
        .map((i, el) => soup(el).attr("content"))
        .get() || null;
    ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
    ogVideo = soup('meta[property="og:video"]').attr("content") || null;
    articleSection =
      soup('meta[name="article:section"]').attr("content") || null;
    articleTag = soup('meta[name="article:tag"]').attr("content") || null;
    publishedTime =
      soup('meta[property="article:published_time"]').attr("content") || null;
    modifiedTime =
      soup('meta[property="article:modified_time"]').attr("content") || null;
    dctermsKeywords =
      soup('meta[name="dcterms.keywords"]').attr("content") || null;
    dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
    dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
    dctermsSubject =
      soup('meta[name="dcterms.subject"]').attr("content") || null;
    dctermsAudience =
      soup('meta[name="dcterms.audience"]').attr("content") || null;
    dcType = soup('meta[name="dc.type"]').attr("content") || null;
    dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null;
    dcDate = soup('meta[name="dc.date"]').attr("content") || null;
    dcDateCreated =
      soup('meta[name="dc.date.created"]').attr("content") || null;
    dctermsCreated =
      soup('meta[name="dcterms.created"]').attr("content") || null;

    try {
      // Extract all meta tags for custom metadata
      soup("meta").each((i, elem) => {
        try {
          const name = soup(elem).attr("name") || soup(elem).attr("property");
          const content = soup(elem).attr("content");

          if (name && content) {
            if (customMetadata[name] === undefined) {
              customMetadata[name] = content;
            } else if (Array.isArray(customMetadata[name])) {
              (customMetadata[name] as string[]).push(content);
            } else {
              customMetadata[name] = [customMetadata[name] as string, content];
            }
          }
        } catch (error) {
          Logger.error(`Error extracting custom metadata (in): ${error}`);
        }
      });
    } catch (error) {
      Logger.error(`Error extracting custom metadata: ${error}`);
    }
  } catch (error) {
    Logger.error(`Error extracting metadata: ${error}`);
  }

  return {
    ...(title ? { title } : {}),
    ...(description ? { description } : {}),
    ...(language ? { language } : {}),
    ...(keywords ? { keywords } : {}),
    ...(robots ? { robots } : {}),
    ...(ogTitle ? { ogTitle } : {}),
    ...(ogDescription ? { ogDescription } : {}),
    ...(ogUrl ? { ogUrl } : {}),
    ...(ogImage ? { ogImage } : {}),
    ...(ogAudio ? { ogAudio } : {}),
    ...(ogDeterminer ? { ogDeterminer } : {}),
    ...(ogLocale ? { ogLocale } : {}),
    ...(ogLocaleAlternate ? { ogLocaleAlternate } : {}),
    ...(ogSiteName ? { ogSiteName } : {}),
    ...(ogVideo ? { ogVideo } : {}),
    ...(dctermsCreated ? { dctermsCreated } : {}),
    ...(dcDateCreated ? { dcDateCreated } : {}),
    ...(dcDate ? { dcDate } : {}),
    ...(dctermsType ? { dctermsType } : {}),
    ...(dcType ? { dcType } : {}),
    ...(dctermsAudience ? { dctermsAudience } : {}),
    ...(dctermsSubject ? { dctermsSubject } : {}),
    ...(dcSubject ? { dcSubject } : {}),
    ...(dcDescription ? { dcDescription } : {}),
    ...(dctermsKeywords ? { dctermsKeywords } : {}),
    ...(modifiedTime ? { modifiedTime } : {}),
    ...(publishedTime ? { publishedTime } : {}),
    ...(articleTag ? { articleTag } : {}),
    ...(articleSection ? { articleSection } : {}),
    ...(sourceURL ? { sourceURL } : {}),
    ...(pageStatusCode ? { pageStatusCode } : {}),
    ...(pageError ? { pageError } : {}),
    ...customMetadata,
  };
}
apps/api/src/scraper/WebScraper/utils/parseTable.ts (new file, 74 lines)
@@ -0,0 +1,74 @@
import cheerio, { CheerioAPI } from "cheerio";

interface Replacement {
  start: number;
  end: number;
  markdownTable: string;
}

export const parseTablesToMarkdown = async (html: string): Promise<string> => {
  const soup: CheerioAPI = cheerio.load(html, {
    xmlMode: true,
    withStartIndices: true,
    withEndIndices: true
  });
  let tables = soup("table");
  let replacements: Replacement[] = [];

  if (tables.length) {
    tables.each((_, tableElement) => {
      const start: number = tableElement.startIndex;
      const end: number = tableElement.endIndex + 1; // Include the closing tag properly
      let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement));
      const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0;
      if (isTableEmpty) {
        markdownTable = '';
      }
      replacements.push({ start, end, markdownTable });
    });
  }

  replacements.sort((a, b) => b.start - a.start);

  let modifiedHtml: string = html;
  replacements.forEach(({ start, end, markdownTable }) => {
    modifiedHtml = modifiedHtml.slice(0, start) + `<div>${markdownTable}</div>` + modifiedHtml.slice(end);
  });

  return modifiedHtml.trim();
};

export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
  let rows: string[] = [];
  let headerRowFound: boolean = false;
  tableSoup("tr").each((i, tr) => {
    const cells: string = tableSoup(tr).find("th, td").map((_, cell) => {
      let cellText: string = tableSoup(cell).text().trim();
      if (tableSoup(cell).is("th") && !headerRowFound) {
        headerRowFound = true;
      }
      return ` ${cellText} |`;
    }).get().join("");
    if (cells) {
      rows.push(`|${cells}`);
    }
    if (headerRowFound && i === 0) { // Header row
      rows.push(createMarkdownDividerRow(tableSoup(tr).find("th, td").length));
    }
  });

  return rows.join('\n').trim();
};

export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
  const cells: string = rowSoup("td, th").map((_, cell) => {
    let cellText: string = rowSoup(cell).text().trim();
    return ` ${cellText} |`;
  }).get().join("");

  return `|${cells}`;
};

export function createMarkdownDividerRow(cellCount: number): string {
  return '| ' + Array(cellCount).fill('---').join(' | ') + ' |';
}
apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts (new file, 140 lines)
@@ -0,0 +1,140 @@
import axios, { AxiosResponse } from "axios";
import fs from "fs/promises";
import { createReadStream, createWriteStream } from "node:fs";
import FormData from "form-data";
import dotenv from "dotenv";
import pdf from "pdf-parse";
import path from "path";
import os from "os";
import { axiosTimeout } from "../../../lib/timeout";
import { Logger } from "../../../lib/logger";

dotenv.config();

export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  try {
    const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
    const content = await processPdfToText(tempFilePath, parsePDF);
    await fs.unlink(tempFilePath); // Clean up the temporary file
    return { content, pageStatusCode, pageError };
  } catch (error) {
    Logger.error(`Failed to fetch and process PDF: ${error.message}`);
    return { content: "", pageStatusCode: 500, pageError: error.message };
  }
}

async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
  const response = await axios({
    url,
    method: "GET",
    responseType: "stream",
  });

  const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
  const writer = createWriteStream(tempFilePath);

  response.data.pipe(writer);

  return new Promise((resolve, reject) => {
    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
    writer.on("error", reject);
  });
}

export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
  let content = "";

  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
    Logger.debug("Processing pdf document w/ LlamaIndex");
    const apiKey = process.env.LLAMAPARSE_API_KEY;
    const headers = {
      Authorization: `Bearer ${apiKey}`,
    };
    const base_url = "https://api.cloud.llamaindex.ai/api/parsing";
    const fileType2 = "application/pdf";

    try {
      const formData = new FormData();
      formData.append("file", createReadStream(filePath), {
        filename: filePath,
        contentType: fileType2,
      });

      const uploadUrl = `${base_url}/upload`;
      const uploadResponse = await axios.post(uploadUrl, formData, {
        headers: {
          ...headers,
          ...formData.getHeaders(),
        },
      });

      const jobId = uploadResponse.data.id;
      const resultType = "text";
      const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`;

      let resultResponse: AxiosResponse;
      let attempt = 0;
      const maxAttempts = 10; // Maximum number of attempts
      let resultAvailable = false;
      while (attempt < maxAttempts && !resultAvailable) {
        try {
          resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
          if (resultResponse.status === 200) {
            resultAvailable = true; // Exit condition met
          } else {
            // If the status code is not 200, increment the attempt counter and wait
            attempt++;
            await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
          }
        } catch (error) {
          Logger.debug("Error fetching result w/ LlamaIndex");
          attempt++;
          if (attempt >= maxAttempts) {
            Logger.error("Max attempts reached, unable to fetch result.");
            break; // Exit the loop if max attempts are reached
          }
          await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
          // You may want to handle specific errors differently
        }
      }

      if (!resultAvailable) {
        try {
          content = await processPdf(filePath);
        } catch (error) {
          Logger.error(`Failed to process PDF: ${error}`);
          content = "";
        }
      }
      content = resultResponse.data[resultType];
    } catch (error) {
      Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
      content = await processPdf(filePath);
    }
  } else if (parsePDF) {
    try {
      content = await processPdf(filePath);
    } catch (error) {
      Logger.error(`Failed to process PDF: ${error}`);
      content = "";
    }
  } else {
    try {
      content = await fs.readFile(filePath, "utf-8");
    } catch (error) {
      Logger.error(`Failed to read PDF file: ${error}`);
      content = "";
    }
  }
  return content;
}

async function processPdf(file: string) {
  try {
    const fileContent = await fs.readFile(file);
    const data = await pdf(fileContent);
    return data.text;
  } catch (error) {
    throw error;
  }
}
@@ -0,0 +1,82 @@
import { AnyNode, Cheerio, load } from "cheerio";
import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags";

export const removeUnwantedElements = (
  html: string,
  pageOptions: PageOptions,
) => {
  let soup = load(html);

  if (
    pageOptions.onlyIncludeTags &&
    pageOptions.onlyIncludeTags.length > 0 &&
    pageOptions.onlyIncludeTags[0] !== ""
  ) {
    if (typeof pageOptions.onlyIncludeTags === "string") {
      pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
    }
    if (pageOptions.onlyIncludeTags.length !== 0) {
      // Create a new root element to hold the tags to keep
      const newRoot = load("<div></div>")("div");
      pageOptions.onlyIncludeTags.forEach((tag) => {
        soup(tag).each((index, element) => {
          newRoot.append(soup(element).clone());
        });
      });

      soup = load(newRoot.html());
    }
  }

  soup("script, style, noscript, meta, head").remove();

  if (
    pageOptions.removeTags &&
    pageOptions.removeTags.length > 0 &&
    pageOptions.removeTags[0] !== ""
  ) {
    if (typeof pageOptions.removeTags === "string") {
      pageOptions.removeTags = [pageOptions.removeTags];
    }

    if (Array.isArray(pageOptions.removeTags)) {
      pageOptions.removeTags.forEach((tag) => {
        let elementsToRemove: Cheerio<AnyNode>;
        if (tag.startsWith("*") && tag.endsWith("*")) {
          let classMatch = false;

          const regexPattern = new RegExp(tag.slice(1, -1), "i");
          elementsToRemove = soup("*").filter((i, element) => {
            if (element.type === "tag") {
              const attributes = element.attribs;
              const tagNameMatches = regexPattern.test(element.name);
              const attributesMatch = Object.keys(attributes).some((attr) =>
                regexPattern.test(`${attr}="${attributes[attr]}"`),
              );
              if (tag.startsWith("*.")) {
                classMatch = Object.keys(attributes).some((attr) =>
                  regexPattern.test(`class="${attributes[attr]}"`),
                );
              }
              return tagNameMatches || attributesMatch || classMatch;
            }
            return false;
          });
        } else {
          elementsToRemove = soup(tag);
        }
        elementsToRemove.remove();
      });
    }
  }

  if (pageOptions.onlyMainContent) {
    excludeNonMainTags.forEach((tag) => {
      const elementsToRemove = soup(tag);
      elementsToRemove.remove();
    });
  }
  const cleanedHtml = soup.html();
  return cleanedHtml;
};
apps/api/src/scraper/WebScraper/utils/replacePaths.ts (new file, 85 lines)
@@ -0,0 +1,85 @@
import { Logger } from "../../../lib/logger";
import { Document } from "../../../lib/entities";

export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  try {
    documents.forEach((document) => {
      const baseUrl = new URL(document.metadata.sourceURL).origin;
      const paths =
        document.content.match(
          /!?\[.*?\]\(.*?\)|href=".+?"/g
        ) || [];

      paths.forEach((path: string) => {
        try {
          const isImage = path.startsWith("!");
          let matchedUrl = path.match(/\((.*?)\)/) || path.match(/href="([^"]+)"/);
          let url = matchedUrl[1];

          if (!url.startsWith("data:") && !url.startsWith("http")) {
            if (url.startsWith("/")) {
              url = url.substring(1);
            }
            url = new URL(url, baseUrl).toString();
          }

          const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
          // Image is handled afterwards
          if (!isImage) {
            document.content = document.content.replace(
              path,
              `${markdownLinkOrImageText}(${url})`
            );
          }
        } catch (error) {

        }
      });
      document.markdown = document.content;
    });

    return documents;
  } catch (error) {
    Logger.debug(`Error replacing paths with absolute paths: ${error}`);
    return documents;
  }
};

export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  try {
    documents.forEach((document) => {
      const baseUrl = new URL(document.metadata.sourceURL).origin;
      const images =
        document.content.match(
          /!\[.*?\]\(.*?\)/g
        ) || [];

      images.forEach((image: string) => {
        let imageUrl = image.match(/\((.*?)\)/)[1];
        let altText = image.match(/\[(.*?)\]/)[1];

        if (!imageUrl.startsWith("data:image")) {
          if (!imageUrl.startsWith("http")) {
            if (imageUrl.startsWith("/")) {
              imageUrl = imageUrl.substring(1);
              imageUrl = new URL(imageUrl, baseUrl).toString();
            } else {
              imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
            }
          }
        }

        document.content = document.content.replace(
          image,
          `![${altText}](${imageUrl})`
        );
      });
      document.markdown = document.content;
    });

    return documents;
  } catch (error) {
    Logger.error(`Error replacing img paths with absolute paths: ${error}`);
    return documents;
  }
};
apps/api/src/scraper/WebScraper/utils/utils.ts (new file, 59 lines)
@@ -0,0 +1,59 @@
import axios from "axios";
import * as cheerio from "cheerio";
import { Logger } from "../../../lib/logger";


export async function attemptScrapWithRequests(
  urlToScrap: string
): Promise<string | null> {
  try {
    const response = await axios.get(urlToScrap, { timeout: 15000 });

    if (!response.data) {
      Logger.debug("Failed normal requests as well");
      return null;
    }

    return response.data;
  } catch (error) {
    Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
    return null;
  }
}

export function sanitizeText(text: string): string {
  return text.replace("\u0000", "");
}

export function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  const links: string[] = [];

  $('a').each((_, element) => {
    const href = $(element).attr('href');
    if (href) {
      try {
        if (href.startsWith('http://') || href.startsWith('https://')) {
          // Absolute URL, add as is
          links.push(href);
        } else if (href.startsWith('/')) {
          // Relative URL starting with '/', append to base URL
          links.push(new URL(href, baseUrl).href);
        } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
          // Relative URL not starting with '/', append to base URL
          links.push(new URL(href, baseUrl).href);
        } else if (href.startsWith('mailto:')) {
          // mailto: links, add as is
          links.push(href);
        }
        // Fragment-only links (#) are ignored
      } catch (error) {
        // Log the error and continue
        console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
      }
    }
  });

  // Remove duplicates and return
  return [...new Set(links)];
}
@@ -1,25 +0,0 @@
# `scrapeURL`
New URL scraper for Firecrawl

## Signal flow
```mermaid
flowchart TD;
    scrapeURL-.->buildFallbackList;
    buildFallbackList-.->scrapeURLWithEngine;
    scrapeURLWithEngine-.->parseMarkdown;
    parseMarkdown-.->wasScrapeSuccessful{{Was scrape successful?}};
    wasScrapeSuccessful-."No".->areEnginesLeft{{Are there engines left to try?}};
    areEnginesLeft-."Yes, try next engine".->scrapeURLWithEngine;
    areEnginesLeft-."No".->NoEnginesLeftError[/NoEnginesLeftError/]
    wasScrapeSuccessful-."Yes".->asd;
```

## Differences from `WebScraperDataProvider`
- The job of `WebScraperDataProvider.validateInitialUrl` has been delegated to the zod layer above `scrapeUrl`.
- `WebScraperDataProvider.mode` has no equivalent, only `scrape_url` is supported.
- You may no longer specify multiple URLs.
- Built on `v1` definitions, instead of `v0`.
- PDFs are now converted straight to markdown using LlamaParse, instead of converting to just plaintext.
- DOCXs are now converted straight to HTML (and then later to markdown) using mammoth, instead of converting to just plaintext.
- Uses the new JSON Schema OpenAI API -- schema failures with LLM Extract will be basically non-existent.
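For orientation, the loop described by the signal-flow chart above can be read roughly as the sketch below. This is illustrative only: `buildFallbackList` and `scrapeURLWithEngine` are the functions that appear later in this compare view, while `parseMarkdown` and the exact error type are assumed placeholders.

```ts
// Rough sketch of the scrapeURL fallback loop (illustrative only).
// buildFallbackList / scrapeURLWithEngine are shown later in this compare view;
// parseMarkdown is an assumed markdown-conversion step.
async function scrapeURLSketch(meta: Meta): Promise<EngineScrapeResult & { markdown: string }> {
  const errors: unknown[] = [];

  for (const { engine } of buildFallbackList(meta)) {
    try {
      const result = await scrapeURLWithEngine(meta, engine);
      const markdown = await parseMarkdown(result.html); // assumed helper
      if (markdown.trim().length > 0) {
        return { ...result, markdown }; // "Was scrape successful?" -> Yes
      }
      errors.push(new Error(`engine ${engine} returned an empty document`));
    } catch (error) {
      errors.push(error); // "Are there engines left to try?" -> try the next one
    }
  }

  throw new Error("NoEnginesLeftError", { cause: { errors } }); // no engines left
}
```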
@@ -1,15 +0,0 @@
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import { downloadFile } from "../utils/downloadFile";
import mammoth from "mammoth";

export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
  const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

  return {
    url: response.url,
    statusCode: response.status,

    html: (await mammoth.convertToHtml({ path: tempFilePath })).value,
  }
}
@@ -1,28 +0,0 @@
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";

export async function scrapeURLWithFetch(meta: Meta): Promise<EngineScrapeResult> {
  const timeout = 20000;

  const response = await Promise.race([
    fetch(meta.url, {
      redirect: "follow",
      headers: meta.options.headers,
    }),
    (async () => {
      await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
      throw new TimeoutError("Fetch was unable to scrape the page before timing out", { cause: { timeout } });
    })()
  ]);

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), Object.fromEntries(response.headers as any));

  return {
    url: response.url,
    html: await response.text(),
    statusCode: response.status,
    // TODO: error?
  };
}
@@ -1,106 +0,0 @@
import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { z } from "zod";

import { robustFetch } from "../../lib/fetch";
import { EngineError } from "../../error";

const successSchema = z.object({
  jobId: z.string(),
  state: z.literal("completed"),
  processing: z.literal(false),

  // timeTaken: z.number(),
  content: z.string(),
  url: z.string().optional(),

  pageStatusCode: z.number(),
  pageError: z.string().optional(),

  // TODO: this needs to be non-optional, might need fixes on f-e side to ensure reliability
  responseHeaders: z.record(z.string(), z.string()).optional(),

  // timeTakenCookie: z.number().optional(),
  // timeTakenRequest: z.number().optional(),

  // legacy: playwright only
  screenshot: z.string().optional(),

  // new: actions
  screenshots: z.string().array().optional(),
  actionContent: z.object({
    url: z.string(),
    html: z.string(),
  }).array().optional(),
})

export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;

const processingSchema = z.object({
  jobId: z.string(),
  state: z.enum(["delayed", "active", "waiting", "waiting-children", "unknown", "prioritized"]),
  processing: z.boolean(),
});

const failedSchema = z.object({
  jobId: z.string(),
  state: z.literal("failed"),
  processing: z.literal(false),
  error: z.string(),
});

export class StillProcessingError extends Error {
  constructor(jobId: string) {
    super("Job is still under processing", { cause: { jobId } })
  }
}

export async function fireEngineCheckStatus(logger: Logger, jobId: string): Promise<FireEngineCheckStatusSuccess> {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  const status = await Sentry.startSpan({
    name: "fire-engine: Check status",
    attributes: {
      jobId,
    }
  }, async span => {
    return await robustFetch(
      {
        url: `${fireEngineURL}/scrape/${jobId}`,
        method: "GET",
        logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }),
        headers: {
          ...(Sentry.isInitialized() ? ({
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }) : {}),
        },
      }
    )
  });

  const successParse = successSchema.safeParse(status);
  const processingParse = processingSchema.safeParse(status);
  const failedParse = failedSchema.safeParse(status);

  if (successParse.success) {
    logger.debug("Scrape succeeded!", { jobId });
    return successParse.data;
  } else if (processingParse.success) {
    throw new StillProcessingError(jobId);
  } else if (failedParse.success) {
    logger.debug("Scrape job failed", { status, jobId });
    throw new EngineError("Scrape job failed", {
      cause: {
        status, jobId
      }
    });
  } else {
    logger.debug("Check status returned response not matched by any schema", { status, jobId });
    throw new Error("Check status returned response not matched by any schema", {
      cause: {
        status, jobId
      }
    });
  }
}
@@ -1,33 +0,0 @@
import { Logger } from "winston";
import * as Sentry from "@sentry/node";

import { robustFetch } from "../../lib/fetch";

export async function fireEngineDelete(logger: Logger, jobId: string) {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  await Sentry.startSpan({
    name: "fire-engine: Delete scrape",
    attributes: {
      jobId,
    }
  }, async span => {
    await robustFetch(
      {
        url: `${fireEngineURL}/scrape/${jobId}`,
        method: "DELETE",
        headers: {
          ...(Sentry.isInitialized() ? ({
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }) : {}),
        },
        ignoreResponse: true,
        ignoreFailure: true,
        logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
      }
    )
  });

  // We do not care whether this fails or not.
}
@@ -1,202 +0,0 @@
import { Logger } from "winston";
import { Meta } from "../..";
import { fireEngineScrape, FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, FireEngineScrapeRequestPlaywright, FireEngineScrapeRequestTLSClient } from "./scrape";
import { EngineScrapeResult } from "..";
import { fireEngineCheckStatus, FireEngineCheckStatusSuccess, StillProcessingError } from "./checkStatus";
import { EngineError, TimeoutError } from "../../error";
import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";

export const defaultTimeout = 10000;

// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the
// `scrapeURLWithFireEngine*` functions.
async function performFireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient>(
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
  timeout = defaultTimeout,
): Promise<FireEngineCheckStatusSuccess> {
  const scrape = await fireEngineScrape(logger.child({ method: "fireEngineScrape" }), request);

  const startTime = Date.now();
  const errorLimit = 3;
  let errors: any[] = [];
  let status: FireEngineCheckStatusSuccess | undefined = undefined;

  while (status === undefined) {
    if (errors.length >= errorLimit) {
      logger.error("Error limit hit.", { errors });
      throw new Error("Error limit hit. See e.cause.errors for errors.", { cause: { errors } });
    }

    if (Date.now() - startTime > timeout) {
      logger.info("Fire-engine was unable to scrape the page before timing out.", { errors, timeout });
      throw new TimeoutError("Fire-engine was unable to scrape the page before timing out", { cause: { errors, timeout } });
    }

    try {
      status = await fireEngineCheckStatus(logger.child({ method: "fireEngineCheckStatus" }), scrape.jobId)
    } catch (error) {
      if (error instanceof StillProcessingError) {
        // nop
      } else if (error instanceof EngineError) {
        logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId });
        throw error;
      } else {
        Sentry.captureException(error);
        errors.push(error);
        logger.debug(`An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`, { error, jobId: scrape.jobId });
      }
    }

    await new Promise((resolve) => setTimeout(resolve, 250));
  }

  return status;
}

export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<EngineScrapeResult> {
  const actions: Action[] = [
    // Transform waitFor option into an action (unsupported by chrome-cdp)
    ...(meta.options.waitFor !== 0 ? [{
      type: "wait" as const,
      milliseconds: meta.options.waitFor,
    }] : []),

    // Transform screenshot format into an action (unsupported by chrome-cdp)
    ...(meta.options.formats.includes("screenshot") || meta.options.formats.includes("screenshot@fullPage") ? [{
      type: "screenshot" as const,
      fullPage: meta.options.formats.includes("screenshot@fullPage"),
    }] : []),

    // Include specified actions
    ...(meta.options.actions ?? []),
  ];

  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = {
    url: meta.url,
    engine: "chrome-cdp",
    instantReturn: true,
    skipTlsVerification: meta.options.skipTlsVerification,
    headers: meta.options.headers,
    ...(actions.length > 0 ? ({
      actions,
    }) : {}),
    priority: meta.internalOptions.priority,
    geolocation: meta.options.geolocation,
    mobile: meta.options.mobile,
    // TODO: scrollXPaths
  };

  const totalWait = actions.reduce((a,x) => x.type === "wait" ? (x.milliseconds ?? 1000) + a : a, 0);

  let response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
    defaultTimeout + totalWait,
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);

  if (meta.options.formats.includes("screenshot") || meta.options.formats.includes("screenshot@fullPage")) {
    meta.logger.debug("Transforming screenshots from actions into screenshot field", { screenshots: response.screenshots });
    response.screenshot = (response.screenshots ?? [])[0];
    (response.screenshots ?? []).splice(0, 1);
    meta.logger.debug("Screenshot transformation done", { screenshots: response.screenshots, screenshot: response.screenshot });
  }

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  return {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,

    screenshot: response.screenshot,
    ...(actions.length > 0 ? {
      actions: {
        screenshots: response.screenshots ?? [],
        scrapes: response.actionContent ?? [],
      }
    } : {}),
  };
}

export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<EngineScrapeResult> {
  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = {
    url: meta.url,
    engine: "playwright",
    instantReturn: true,

    headers: meta.options.headers,
    priority: meta.internalOptions.priority,
    screenshot: meta.options.formats.includes("screenshot"),
    fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
    wait: meta.options.waitFor,
    geolocation: meta.options.geolocation,
  };

  let response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
    defaultTimeout + meta.options.waitFor
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  return {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,

    ...(response.screenshots !== undefined && response.screenshots.length > 0 ? ({
      screenshot: response.screenshots[0],
    }) : {}),
  };
}

export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<EngineScrapeResult> {
  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = {
    url: meta.url,
    engine: "tlsclient",
    instantReturn: true,

    headers: meta.options.headers,
    priority: meta.internalOptions.priority,

    atsv: meta.internalOptions.atsv,
    geolocation: meta.options.geolocation,
    disableJsDom: meta.internalOptions.v0DisableJsDom,
  };

  let response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  return {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,
  };
}
@@ -1,94 +0,0 @@
import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { z } from "zod";

import { Action } from "../../../../lib/entities";
import { robustFetch } from "../../lib/fetch";

export type FireEngineScrapeRequestCommon = {
  url: string;

  headers?: { [K: string]: string };

  blockMedia?: boolean; // default: true
  blockAds?: boolean; // default: true
  // pageOptions?: any; // unused, .scrollXPaths is considered on FE side

  // useProxy?: boolean; // unused, default: true
  // customProxy?: string; // unused

  // disableSmartWaitCache?: boolean; // unused, default: false
  // skipDnsCheck?: boolean; // unused, default: false

  priority?: number; // default: 1
  // team_id?: string; // unused
  logRequest?: boolean; // default: true
  instantReturn?: boolean; // default: false
  geolocation?: { country?: string; languages?: string[]; };
}

export type FireEngineScrapeRequestChromeCDP = {
  engine: "chrome-cdp";
  skipTlsVerification?: boolean;
  actions?: Action[];
  blockMedia?: true; // cannot be false
  mobile?: boolean;
};

export type FireEngineScrapeRequestPlaywright = {
  engine: "playwright";
  blockAds?: boolean; // default: true

  // mutually exclusive, default: false
  screenshot?: boolean;
  fullPageScreenshot?: boolean;

  wait?: number; // default: 0
};

export type FireEngineScrapeRequestTLSClient = {
  engine: "tlsclient";
  atsv?: boolean; // v0 only, default: false
  disableJsDom?: boolean; // v0 only, default: false
  // blockAds?: boolean; // default: true
};

const schema = z.object({
  jobId: z.string(),
  processing: z.boolean(),
});

export async function fireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient> (
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
): Promise<z.infer<typeof schema>> {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  // TODO: retries

  const scrapeRequest = await Sentry.startSpan({
    name: "fire-engine: Scrape",
    attributes: {
      url: request.url,
    },
  }, async span => {
    return await robustFetch(
      {
        url: `${fireEngineURL}/scrape`,
        method: "POST",
        headers: {
          ...(Sentry.isInitialized() ? ({
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }) : {}),
        },
        body: request,
        logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
        schema,
        tryCount: 3,
      }
    );
  });

  return scrapeRequest;
}
@ -1,295 +0,0 @@
|
|||
import { ScrapeActionContent } from "../../../lib/entities";
|
||||
import { Meta } from "..";
|
||||
import { scrapeDOCX } from "./docx";
|
||||
import { scrapeURLWithFireEngineChromeCDP, scrapeURLWithFireEnginePlaywright, scrapeURLWithFireEngineTLSClient } from "./fire-engine";
|
||||
import { scrapePDF } from "./pdf";
|
||||
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
||||
import { scrapeURLWithFetch } from "./fetch";
|
||||
import { scrapeURLWithPlaywright } from "./playwright";
|
||||
|
||||
export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
|
||||
|
||||
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
|
||||
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||
const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
|
||||
|
||||
export const engines: Engine[] = [
|
||||
...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
|
||||
...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
|
||||
...(usePlaywright ? [ "playwright" as const ] : []),
|
||||
"fetch",
|
||||
"pdf",
|
||||
"docx",
|
||||
];
|
||||
|
||||
export const featureFlags = [
|
||||
"actions",
|
||||
"waitFor",
|
||||
"screenshot",
|
||||
"screenshot@fullScreen",
|
||||
"pdf",
|
||||
"docx",
|
||||
"atsv",
|
||||
"location",
|
||||
"mobile",
|
||||
"skipTlsVerification",
|
||||
"useFastMode",
|
||||
] as const;
|
||||
|
||||
export type FeatureFlag = typeof featureFlags[number];
|
||||
|
||||
export const featureFlagOptions: {
|
||||
[F in FeatureFlag]: {
|
||||
priority: number;
|
||||
}
|
||||
} = {
|
||||
"actions": { priority: 20 },
|
||||
"waitFor": { priority: 1 },
|
||||
"screenshot": { priority: 10 },
|
||||
"screenshot@fullScreen": { priority: 10 },
|
||||
"pdf": { priority: 100 },
|
||||
"docx": { priority: 100 },
|
||||
"atsv": { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
|
||||
"useFastMode": { priority: 90 },
|
||||
"location": { priority: 10 },
|
||||
"mobile": { priority: 10 },
|
||||
"skipTlsVerification": { priority: 10 },
|
||||
} as const;
|
||||
|
||||
export type EngineScrapeResult = {
|
||||
url: string;
|
||||
|
||||
html: string;
|
||||
markdown?: string;
|
||||
statusCode: number;
|
||||
error?: string;
|
||||
|
||||
screenshot?: string;
|
||||
actions?: {
|
||||
screenshots: string[];
|
||||
scrapes: ScrapeActionContent[];
|
||||
};
|
||||
}
|
||||
|
||||
const engineHandlers: {
|
||||
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
|
||||
} = {
|
||||
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
||||
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
|
||||
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
|
||||
"scrapingbee": scrapeURLWithScrapingBee("domcontentloaded"),
|
||||
"scrapingbeeLoad": scrapeURLWithScrapingBee("networkidle2"),
|
||||
"playwright": scrapeURLWithPlaywright,
|
||||
"fetch": scrapeURLWithFetch,
|
||||
"pdf": scrapePDF,
|
||||
"docx": scrapeDOCX,
|
||||
};
|
||||
|
||||
export const engineOptions: {
|
||||
[E in Engine]: {
|
||||
// A list of feature flags the engine supports.
|
||||
features: { [F in FeatureFlag]: boolean },
|
||||
|
||||
// This defines the order of engines in general. The engine with the highest quality will be used the most.
|
||||
// Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
|
||||
quality: number,
|
||||
}
|
||||
} = {
|
||||
"fire-engine;chrome-cdp": {
|
||||
features: {
|
||||
"actions": true,
|
||||
"waitFor": true, // through actions transform
|
||||
"screenshot": true, // through actions transform
|
||||
"screenshot@fullScreen": true, // through actions transform
|
||||
"pdf": false,
|
||||
"docx": false,
|
||||
"atsv": false,
|
||||
"location": true,
|
||||
"mobile": true,
|
||||
"skipTlsVerification": true,
|
||||
"useFastMode": false,
|
||||
},
|
||||
quality: 50,
|
||||
},
|
||||
"fire-engine;playwright": {
|
||||
features: {
|
||||
"actions": false,
|
||||
"waitFor": true,
|
||||
"screenshot": true,
|
||||
"screenshot@fullScreen": true,
|
||||
"pdf": false,
|
||||
"docx": false,
|
||||
"atsv": false,
|
||||
"location": false,
|
||||
"mobile": false,
|
||||
"skipTlsVerification": false,
|
||||
"useFastMode": false,
|
||||
},
|
||||
quality: 40,
|
||||
},
|
||||
"scrapingbee": {
|
||||
features: {
|
||||
"actions": false,
|
||||
"waitFor": true,
|
||||
"screenshot": true,
|
||||
"screenshot@fullScreen": true,
|
||||
"pdf": false,
|
||||
"docx": false,
|
||||
"atsv": false,
|
||||
"location": false,
|
||||
"mobile": false,
|
||||
"skipTlsVerification": false,
|
||||
"useFastMode": false,
|
||||
},
|
||||
quality: 30,
|
||||
},
|
||||
"scrapingbeeLoad": {
|
||||
features: {
|
||||
"actions": false,
|
||||
"waitFor": true,
|
||||
"screenshot": true,
|
||||
"screenshot@fullScreen": true,
|
||||
"pdf": false,
|
||||
"docx": false,
|
||||
"atsv": false,
|
||||
"location": false,
|
||||
"mobile": false,
|
||||
"skipTlsVerification": false,
|
||||
"useFastMode": false,
|
||||
},
|
||||
quality: 29,
|
||||
},
|
||||
"playwright": {
|
||||
features: {
|
||||
"actions": false,
|
||||
"waitFor": true,
|
||||
"screenshot": false,
|
||||
"screenshot@fullScreen": false,
|
||||
"pdf": false,
|
||||
"docx": false,
|
||||
"atsv": false,
|
||||
"location": false,
|
||||
"mobile": false,
|
||||
"skipTlsVerification": false,
|
||||
"useFastMode": false,
|
||||
},
|
||||
quality: 20,
|
||||
},
|
||||
"fire-engine;tlsclient": {
|
||||
features: {
|
||||
"actions": false,
|
||||
"waitFor": false,
|
||||
"screenshot": false,
|
||||
"screenshot@fullScreen": false,
|
||||
"pdf": false,
|
||||
"docx": false,
|
||||
"atsv": true,
|
||||
"location": true,
|
||||
"mobile": false,
|
||||
"skipTlsVerification": false,
|
||||
"useFastMode": true,
|
||||
},
|
||||
quality: 10,
|
||||
},
|
||||
"fetch": {
|
||||
features: {
|
||||
"actions": false,
|
||||
"waitFor": false,
|
||||
"screenshot": false,
|
||||
"screenshot@fullScreen": false,
|
||||
"pdf": false,
|
||||
"docx": false,
|
||||
"atsv": false,
|
||||
"location": false,
|
||||
"mobile": false,
|
||||
"skipTlsVerification": false,
|
||||
"useFastMode": true,
|
||||
},
|
||||
quality: 5,
|
||||
},
|
||||
"pdf": {
|
||||
features: {
|
||||
"actions": false,
|
||||
"waitFor": false,
|
||||
"screenshot": false,
|
||||
"screenshot@fullScreen": false,
|
||||
"pdf": true,
|
||||
"docx": false,
|
||||
"atsv": false,
|
||||
"location": false,
|
||||
"mobile": false,
|
||||
"skipTlsVerification": false,
|
||||
"useFastMode": true,
|
||||
},
|
||||
quality: -10,
|
||||
},
|
||||
"docx": {
|
||||
features: {
|
||||
"actions": false,
|
||||
"waitFor": false,
|
||||
"screenshot": false,
|
||||
"screenshot@fullScreen": false,
|
||||
"pdf": false,
|
||||
"docx": true,
|
||||
"atsv": false,
|
||||
"location": false,
|
||||
"mobile": false,
|
||||
"skipTlsVerification": false,
|
||||
"useFastMode": true,
|
||||
},
|
||||
quality: -10,
|
||||
},
|
||||
};
|
||||
|
||||
export function buildFallbackList(meta: Meta): {
  engine: Engine,
  unsupportedFeatures: Set<FeatureFlag>,
}[] {
  const prioritySum = [...meta.featureFlags].reduce((a, x) => a + featureFlagOptions[x].priority, 0);
  const priorityThreshold = Math.floor(prioritySum / 2);
  let selectedEngines: {
    engine: Engine,
    supportScore: number,
    unsupportedFeatures: Set<FeatureFlag>,
  }[] = [];

  const currentEngines = meta.internalOptions.forceEngine !== undefined ? [meta.internalOptions.forceEngine] : engines;

  for (const engine of currentEngines) {
    const supportedFlags = new Set([...Object.entries(engineOptions[engine].features).filter(([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true).map(([k, _]) => k)]);
    const supportScore = [...supportedFlags].reduce((a, x) => a + featureFlagOptions[x].priority, 0);

    const unsupportedFeatures = new Set([...meta.featureFlags]);
    for (const flag of meta.featureFlags) {
      if (supportedFlags.has(flag)) {
        unsupportedFeatures.delete(flag);
      }
    }

    if (supportScore >= priorityThreshold) {
      selectedEngines.push({ engine, supportScore, unsupportedFeatures });
      meta.logger.debug(`Engine ${engine} meets feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures });
    } else {
      meta.logger.debug(`Engine ${engine} does not meet feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures });
    }
  }

  if (selectedEngines.some(x => engineOptions[x.engine].quality > 0)) {
    selectedEngines = selectedEngines.filter(x => engineOptions[x.engine].quality > 0);
  }

  selectedEngines.sort((a, b) => b.supportScore - a.supportScore || engineOptions[b.engine].quality - engineOptions[a.engine].quality);

  return selectedEngines;
}

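buildFallbackList keeps an engine only when the priorities of the flags it supports add up to at least half of the total requested priority, then ranks the survivors by support score and, on ties, by engine quality. A worked sketch of the threshold math, using hypothetical priorities since featureFlagOptions is defined elsewhere in the file:

// Hedged sketch of the threshold math, with hypothetical flag priorities
// (waitFor = 10, screenshot = 20); the real values live in featureFlagOptions.
const prioritySum = 10 + 20;                            // both flags requested -> 30
const priorityThreshold = Math.floor(prioritySum / 2);  // -> 15
const waitForOnlyScore = 10;                            // e.g. playwright, waitFor only -> dropped (10 < 15)
const bothFlagsScore = 30;                              // an engine supporting both -> kept, ranked by quality on ties
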
export async function scrapeURLWithEngine(meta: Meta, engine: Engine): Promise<EngineScrapeResult> {
  const fn = engineHandlers[engine];
  const logger = meta.logger.child({ method: fn.name ?? "scrapeURLWithEngine", engine });
  const _meta = {
    ...meta,
    logger,
  };

  return await fn(_meta);
}

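Together, these two exports back the scrape pipeline's fallback behavior: try each selected engine in order until one succeeds. A minimal sketch of such a loop, not the actual orchestration code, assuming any thrown error simply means "try the next engine":

// Hypothetical fallback loop; the real pipeline also inspects unsupportedFeatures
// and distinguishes error types before deciding whether to continue.
async function scrapeWithFallback(meta: Meta): Promise<EngineScrapeResult> {
  for (const { engine, unsupportedFeatures } of buildFallbackList(meta)) {
    try {
      return await scrapeURLWithEngine(meta, engine);
    } catch (error) {
      meta.logger.warn("Engine failed, falling back", { engine, error, unsupportedFeatures: [...unsupportedFeatures] });
    }
  }
  throw new Error("All engines failed to scrape the URL");
}
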
@ -1,131 +0,0 @@
import { createReadStream, promises as fs } from "node:fs";
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import * as marked from "marked";
import { robustFetch } from "../../lib/fetch";
import { z } from "zod";
import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";

type PDFProcessorResult = { html: string, markdown?: string };

async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });

  const uploadForm = new FormData();

  // This is utterly stupid but it works! - mogery
  uploadForm.append("file", {
    [Symbol.toStringTag]: "Blob",
    name: tempFilePath,
    stream() {
      return createReadStream(tempFilePath) as unknown as ReadableStream<Uint8Array>;
    },
    arrayBuffer() {
      throw Error("Unimplemented in mock Blob: arrayBuffer");
    },
    size: (await fs.stat(tempFilePath)).size,
    text() {
      throw Error("Unimplemented in mock Blob: text");
    },
    slice(start, end, contentType) {
      throw Error("Unimplemented in mock Blob: slice");
    },
    type: "application/pdf",
  } as Blob);

  const upload = await robustFetch({
    url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
    method: "POST",
    headers: {
      "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
    },
    body: uploadForm,
    logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/upload/robustFetch" }),
    schema: z.object({
      id: z.string(),
    }),
  });

  const jobId = upload.id;

  // TODO: timeout, retries
  const result = await robustFetch({
    url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
    method: "GET",
    headers: {
      "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
    },
    logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
    schema: z.object({
      markdown: z.string(),
    }),
    tryCount: 32,
    tryCooldown: 250,
  });

  return {
    markdown: result.markdown,
    html: await marked.parse(result.markdown, { async: true }),
  };
}

async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });

  const result = await PdfParse(await fs.readFile(tempFilePath));
  const escaped = escapeHtml(result.text);

  return {
    markdown: escaped,
    html: escaped,
  };
}

export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
  if (!meta.options.parsePDF) {
    const file = await fetchFileToBuffer(meta.url);
    const content = file.buffer.toString("base64");
    return {
      url: file.response.url,
      statusCode: file.response.status,

      html: content,
      markdown: content,
    };
  }

  const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

  let result: PDFProcessorResult | null = null;
  if (process.env.LLAMAPARSE_API_KEY) {
    try {
      result = await scrapePDFWithLlamaParse({
        ...meta,
        logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
      }, tempFilePath);
    } catch (error) {
      meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
      Sentry.captureException(error);
    }
  }

  if (result === null) {
    result = await scrapePDFWithParsePDF({
      ...meta,
      logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }),
    }, tempFilePath);
  }

  await fs.unlink(tempFilePath);

  return {
    url: response.url,
    statusCode: response.status,

    html: result.html,
    markdown: result.markdown,
  };
}

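Two details are easy to miss when reading scrapePDF: with parsePDF disabled the raw file is returned base64-encoded in both html and markdown, and the LlamaParse result poll retries up to 32 times with a 250 ms cooldown, roughly eight seconds of waiting on top of request time. A hedged sketch of how a caller might branch on the two output shapes (handlePdfResult is hypothetical, not part of the codebase):

// Hypothetical consumer of scrapePDF's two output shapes.
function handlePdfResult(result: EngineScrapeResult, parsePDF: boolean): Buffer | string | undefined {
  if (!parsePDF) {
    // html and markdown both hold the base64-encoded PDF bytes.
    return Buffer.from(result.html, "base64");
  }
  // Otherwise markdown holds LlamaParse output, or escaped pdf-parse text as a fallback.
  return result.markdown;
}
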
@ -1,42 +0,0 @@
import { z } from "zod";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { robustFetch } from "../../lib/fetch";

export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeResult> {
  const timeout = 20000 + meta.options.waitFor;

  const response = await Promise.race([
    // The fetch must not be awaited inside the array literal, otherwise the
    // timeout branch never starts until the fetch has already settled.
    robustFetch({
      url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
      headers: {
        "Content-Type": "application/json",
      },
      body: {
        url: meta.url,
        wait_after_load: meta.options.waitFor,
        timeout,
        headers: meta.options.headers,
      },
      method: "POST",
      logger: meta.logger.child({ method: "scrapeURLWithPlaywright/robustFetch" }),
      schema: z.object({
        content: z.string(),
        pageStatusCode: z.number(),
        pageError: z.string().optional(),
      }),
    }),
    (async () => {
      // Local deadline mirrors the timeout passed to the Playwright microservice.
      await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
      throw new TimeoutError("Playwright was unable to scrape the page before timing out", { cause: { timeout } });
    })(),
  ]);

  return {
    url: meta.url, // TODO: improve redirect following
    html: response.content,
    statusCode: response.pageStatusCode,
    error: response.pageError,
  };
}

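The Playwright engine races the microservice call against a local timer so a hung browser session cannot stall the pipeline indefinitely. The same pattern generalizes to any promise that should be abandoned after a deadline; withTimeout below is a hypothetical helper for illustration, not part of the codebase:

// Hypothetical helper illustrating the race-against-a-timer pattern used above.
async function withTimeout<T>(promise: Promise<T>, ms: number, message: string): Promise<T> {
  return await Promise.race([
    promise,
    new Promise<never>((_, reject) =>
      setTimeout(() => reject(new Error(message)), ms),
    ),
  ]);
}

// Usage: await withTimeout(fetch(url), 20000, "scrape timed out");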