Gergő Móricz 8d467c8ca7
WebScraper refactor into scrapeURL (#714)
* feat: use strictNullChecking

* feat: switch logger to Winston

* feat(scrapeURL): first batch

* fix(scrapeURL): error swallow

* fix(scrapeURL): add timeout to EngineResultsTracker

* fix(scrapeURL): report unexpected error to sentry

* chore: remove unused modules

* feat(transformers/coerce): warn when a format's response is missing

* feat(scrapeURL): feature flag priorities, engine quality sorting, PDF and DOCX support

* (add note)

* feat(scrapeURL): wip readme

* feat(scrapeURL): LLM extract

* feat(scrapeURL): better warnings

* fix(scrapeURL/engines/fire-engine;playwright): fix screenshot

* feat(scrapeURL): add forceEngine internal option

* feat(scrapeURL/engines): scrapingbee

* feat(scrapeURL/transformers): uploadScreenshot

* feat(scrapeURL): more intense tests

* bunch of stuff

* get rid of WebScraper (mostly)

* adapt batch scrape

* add staging deploy workflow

* fix yaml

* fix logger issues

* fix v1 test schema

* feat(scrapeURL/fire-engine/chrome-cdp): remove wait inserts on actions

* scrapeURL: v0 backwards compat

* logger fixes

* feat(scrapeurl): v0 returnOnlyUrls support

* fix(scrapeURL/v0): URL leniency

* fix(batch-scrape): ts non-nullable

* fix(scrapeURL/fire-engine/chromecdp): fix wait action

* fix(logger): remove error debug key

* feat(requests.http): use dotenv expression

* fix(scrapeURL/extractMetadata): extract custom metadata

* fix crawl option conversion

* feat(scrapeURL): Add retry logic to robustFetch

* fix(scrapeURL): crawl stuff

* fix(scrapeURL): LLM extract

* fix(scrapeURL/v0): search fix

* fix(tests/v0): grant larger response size to v0 crawl status

* feat(scrapeURL): basic fetch engine

* feat(scrapeURL): playwright engine

* feat(scrapeURL): add url-specific parameters

* Update readme and examples

* added e2e tests for most parameters. Still a few actions, location and iframes to be done.

* fixed type

* Nick:

* Update scrape.ts

* Update index.ts

* added actions and base64 check

* Nick: skipTls feature flag?

* 403

* todo

* todo

* fixes

* yeet headers from url specific params

* add warning when final engine has feature deficit

* expose engine results tracker for ScrapeEvents implementation

* ingest scrape events

* fixed some tests

* comment

* Update index.test.ts

* fixed rawHtml

* Update index.test.ts

* update comments

* move geolocation to global f-e option, fix removeBase64Images

* Nick:

* trim url-specific params

* Update index.ts


Co-authored-by: Eric Ciarla <>
Co-authored-by: rafaelmmiller <>
Co-authored-by: Nicolas <>
2024-11-07 20:57:33 +01:00

134 lines
4.7 KiB

{
"name": "firecrawl-scraper-js",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"start": "nodemon --exec ts-node src/index.ts",
"start:production": "tsc && node dist/src/index.js",
"format": "prettier --write \"src/**/*.(js|ts)\"",
"flyio": "node dist/src/index.js",
"start:dev": "nodemon --exec ts-node src/index.ts",
"build": "tsc && pnpm sentry:sourcemaps",
"build:nosentry": "tsc",
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
"test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
"workers": "nodemon --exec ts-node src/services/queue-worker.ts",
"worker:production": "node dist/src/services/queue-worker.js",
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
"mongo-docker-console": "docker exec -it mongodb mongosh",
"run-example": "npx ts-node src/example.ts",
"deploy:fly": "flyctl deploy --build-secret SENTRY_AUTH_TOKEN=$(dotenv -p SENTRY_AUTH_TOKEN) --depot=false",
"deploy:fly:staging": "fly deploy -c fly.staging.toml --depot=false",
"sentry:sourcemaps": "sentry-cli sourcemaps inject --org caleb-peffer --project firecrawl-scraper-js ./dist && sentry-cli sourcemaps upload --org caleb-peffer --project firecrawl-scraper-js ./dist"
},
"author": "",
"license": "ISC",
"devDependencies": {
"@flydotio/dockerfile": "^0.4.10",
"@jest/globals": "^29.7.0",
"@tsconfig/recommended": "^1.0.3",
"@types/body-parser": "^1.19.2",
"@types/cors": "^2.8.13",
"@types/escape-html": "^1.0.4",
"@types/express": "^4.17.17",
"@types/jest": "^29.5.12",
"@types/node": "^20.14.1",
"@types/pdf-parse": "^1.1.4",
"body-parser": "^1.20.1",
"express": "^4.18.2",
"jest": "^29.6.3",
"jest-fetch-mock": "^3.0.3",
"mammoth": "^1.7.2",
"nodemon": "^2.0.20",
"supabase": "^1.77.9",
"supertest": "^6.3.3",
"ts-jest": "^29.1.1",
"ts-node": "^10.9.1",
"typescript": "^5.4.2"
},
"dependencies": {
"@anthropic-ai/sdk": "^0.24.3",
"@brillout/import": "^0.2.2",
"@bull-board/api": "^5.20.5",
"@bull-board/express": "^5.20.5",
"@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.16",
"@nangohq/node": "^0.40.8",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",
"@sentry/profiling-node": "^8.26.0",
"@supabase/supabase-js": "^2.44.2",
"@types/express-ws": "^3.0.4",
"@types/ws": "^8.5.12",
"ajv": "^8.16.0",
"async": "^3.2.5",
"async-mutex": "^0.5.0",
"axios": "^1.3.4",
"axios-retry": "^4.5.0",
"bottleneck": "^2.19.5",
"bullmq": "^5.11.0",
"cacheable-lookup": "^6.1.0",
"cheerio": "^1.0.0-rc.12",
"cohere": "^1.1.1",
"cors": "^2.8.5",
"cron-parser": "^4.9.0",
"date-fns": "^3.6.0",
"dotenv": "^16.3.1",
"dotenv-cli": "^7.4.2",
"escape-html": "^1.0.3",
"express-rate-limit": "^7.3.1",
"express-ws": "^5.0.2",
"form-data": "^4.0.0",
"glob": "^10.4.2",
"gpt3-tokenizer": "^1.1.5",
"ioredis": "^5.4.1",
"joplin-turndown-plugin-gfm": "^1.0.12",
"json-schema-to-zod": "^2.3.0",
"keyword-extractor": "^0.0.28",
"koffi": "^2.9.0",
"langchain": "^0.2.8",
"languagedetect": "^2.0.0",
"logsnag": "^1.0.0",
"luxon": "^3.4.3",
"marked": "^14.1.2",
"md5": "^2.3.0",
"moment": "^2.29.4",
"mongoose": "^8.4.4",
"natural": "^7.0.7",
"openai": "^4.57.0",
"pdf-parse": "^1.1.1",
"pos": "^0.4.2",
"posthog-node": "^4.0.1",
"promptable": "^0.0.10",
"puppeteer": "^22.12.1",
"rate-limiter-flexible": "2.4.2",
"redlock": "5.0.0-beta.2",
"resend": "^3.4.0",
"robots-parser": "^3.0.1",
"scrapingbee": "^1.7.4",
"stripe": "^16.1.0",
"systeminformation": "^5.22.11",
"turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2",
"typesense": "^1.5.4",
"unstructured-client": "^0.11.3",
"uuid": "^10.0.0",
"winston": "^3.14.2",
"winston-transport": "^4.8.0",
"wordpos": "^2.1.0",
"ws": "^8.18.0",
"xml2js": "^0.6.2",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.1"
},
"nodemonConfig": {
"ignore": [