mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
8d467c8ca7
* feat: use strictNullChecking * feat: switch logger to Winston * feat(scrapeURL): first batch * fix(scrapeURL): error swallow * fix(scrapeURL): add timeout to EngineResultsTracker * fix(scrapeURL): report unexpected error to sentry * chore: remove unused modules * feat(transfomers/coerce): warn when a format's response is missing * feat(scrapeURL): feature flag priorities, engine quality sorting, PDF and DOCX support * (add note) * feat(scrapeURL): wip readme * feat(scrapeURL): LLM extract * feat(scrapeURL): better warnings * fix(scrapeURL/engines/fire-engine;playwright): fix screenshot * feat(scrapeURL): add forceEngine internal option * feat(scrapeURL/engines): scrapingbee * feat(scrapeURL/transformars): uploadScreenshot * feat(scrapeURL): more intense tests * bunch of stuff * get rid of WebScraper (mostly) * adapt batch scrape * add staging deploy workflow * fix yaml * fix logger issues * fix v1 test schema * feat(scrapeURL/fire-engine/chrome-cdp): remove wait inserts on actions * scrapeURL: v0 backwards compat * logger fixes * feat(scrapeurl): v0 returnOnlyUrls support * fix(scrapeURL/v0): URL leniency * fix(batch-scrape): ts non-nullable * fix(scrapeURL/fire-engine/chromecdp): fix wait action * fix(logger): remove error debug key * feat(requests.http): use dotenv expression * fix(scrapeURL/extractMetadata): extract custom metadata * fix crawl option conversion * feat(scrapeURL): Add retry logic to robustFetch * fix(scrapeURL): crawl stuff * fix(scrapeURL): LLM extract * fix(scrapeURL/v0): search fix * fix(tests/v0): grant larger response size to v0 crawl status * feat(scrapeURL): basic fetch engine * feat(scrapeURL): playwright engine * feat(scrapeURL): add url-specific parameters * Update readme and examples * added e2e tests for most parameters. Still a few actions, location and iframes to be done. * fixed type * Nick: * Update scrape.ts * Update index.ts * added actions and base64 check * Nick: skipTls feature flag? * 403 * todo * todo * fixes * yeet headers from url specific params * add warning when final engine has feature deficit * expose engine results tracker for ScrapeEvents implementation * ingest scrape events * fixed some tests * comment * Update index.test.ts * fixed rawHtml * Update index.test.ts * update comments * move geolocation to global f-e option, fix removeBase64Images * Nick: * trim url-specific params * Update index.ts --------- Co-authored-by: Eric Ciarla <ericciarla@yahoo.com> Co-authored-by: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com> Co-authored-by: Nicolas <nicolascamara29@gmail.com>
134 lines
4.7 KiB
JSON
134 lines
4.7 KiB
JSON
{
|
|
"name": "firecrawl-scraper-js",
|
|
"version": "1.0.0",
|
|
"description": "",
|
|
"main": "index.js",
|
|
"scripts": {
|
|
"start": "nodemon --exec ts-node src/index.ts",
|
|
"start:production": "tsc && node dist/src/index.js",
|
|
"format": "prettier --write \"src/**/*.(js|ts)\"",
|
|
"flyio": "node dist/src/index.js",
|
|
"start:dev": "nodemon --exec ts-node src/index.ts",
|
|
"build": "tsc && pnpm sentry:sourcemaps",
|
|
"build:nosentry": "tsc",
|
|
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
|
|
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
|
|
"test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
|
|
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
|
|
"workers": "nodemon --exec ts-node src/services/queue-worker.ts",
|
|
"worker:production": "node dist/src/services/queue-worker.js",
|
|
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
|
|
"mongo-docker-console": "docker exec -it mongodb mongosh",
|
|
"run-example": "npx ts-node src/example.ts",
|
|
"deploy:fly": "flyctl deploy --build-secret SENTRY_AUTH_TOKEN=$(dotenv -p SENTRY_AUTH_TOKEN) --depot=false",
|
|
"deploy:fly:staging": "fly deploy -c fly.staging.toml --depot=false",
|
|
"sentry:sourcemaps": "sentry-cli sourcemaps inject --org caleb-peffer --project firecrawl-scraper-js ./dist && sentry-cli sourcemaps upload --org caleb-peffer --project firecrawl-scraper-js ./dist"
|
|
},
|
|
"author": "",
|
|
"license": "ISC",
|
|
"devDependencies": {
|
|
"@flydotio/dockerfile": "^0.4.10",
|
|
"@jest/globals": "^29.7.0",
|
|
"@tsconfig/recommended": "^1.0.3",
|
|
"@types/body-parser": "^1.19.2",
|
|
"@types/cors": "^2.8.13",
|
|
"@types/escape-html": "^1.0.4",
|
|
"@types/express": "^4.17.17",
|
|
"@types/jest": "^29.5.12",
|
|
"@types/node": "^20.14.1",
|
|
"@types/pdf-parse": "^1.1.4",
|
|
"body-parser": "^1.20.1",
|
|
"express": "^4.18.2",
|
|
"jest": "^29.6.3",
|
|
"jest-fetch-mock": "^3.0.3",
|
|
"mammoth": "^1.7.2",
|
|
"nodemon": "^2.0.20",
|
|
"supabase": "^1.77.9",
|
|
"supertest": "^6.3.3",
|
|
"ts-jest": "^29.1.1",
|
|
"ts-node": "^10.9.1",
|
|
"typescript": "^5.4.2"
|
|
},
|
|
"dependencies": {
|
|
"@anthropic-ai/sdk": "^0.24.3",
|
|
"@brillout/import": "^0.2.2",
|
|
"@bull-board/api": "^5.20.5",
|
|
"@bull-board/express": "^5.20.5",
|
|
"@devil7softwares/pos": "^1.0.2",
|
|
"@dqbd/tiktoken": "^1.0.16",
|
|
"@nangohq/node": "^0.40.8",
|
|
"@sentry/cli": "^2.33.1",
|
|
"@sentry/node": "^8.26.0",
|
|
"@sentry/profiling-node": "^8.26.0",
|
|
"@supabase/supabase-js": "^2.44.2",
|
|
"@types/express-ws": "^3.0.4",
|
|
"@types/ws": "^8.5.12",
|
|
"ajv": "^8.16.0",
|
|
"async": "^3.2.5",
|
|
"async-mutex": "^0.5.0",
|
|
"axios": "^1.3.4",
|
|
"axios-retry": "^4.5.0",
|
|
"bottleneck": "^2.19.5",
|
|
"bullmq": "^5.11.0",
|
|
"cacheable-lookup": "^6.1.0",
|
|
"cheerio": "^1.0.0-rc.12",
|
|
"cohere": "^1.1.1",
|
|
"cors": "^2.8.5",
|
|
"cron-parser": "^4.9.0",
|
|
"date-fns": "^3.6.0",
|
|
"dotenv": "^16.3.1",
|
|
"dotenv-cli": "^7.4.2",
|
|
"escape-html": "^1.0.3",
|
|
"express-rate-limit": "^7.3.1",
|
|
"express-ws": "^5.0.2",
|
|
"form-data": "^4.0.0",
|
|
"glob": "^10.4.2",
|
|
"gpt3-tokenizer": "^1.1.5",
|
|
"ioredis": "^5.4.1",
|
|
"joplin-turndown-plugin-gfm": "^1.0.12",
|
|
"json-schema-to-zod": "^2.3.0",
|
|
"keyword-extractor": "^0.0.28",
|
|
"koffi": "^2.9.0",
|
|
"langchain": "^0.2.8",
|
|
"languagedetect": "^2.0.0",
|
|
"logsnag": "^1.0.0",
|
|
"luxon": "^3.4.3",
|
|
"marked": "^14.1.2",
|
|
"md5": "^2.3.0",
|
|
"moment": "^2.29.4",
|
|
"mongoose": "^8.4.4",
|
|
"natural": "^7.0.7",
|
|
"openai": "^4.57.0",
|
|
"pdf-parse": "^1.1.1",
|
|
"pos": "^0.4.2",
|
|
"posthog-node": "^4.0.1",
|
|
"promptable": "^0.0.10",
|
|
"puppeteer": "^22.12.1",
|
|
"rate-limiter-flexible": "2.4.2",
|
|
"redlock": "5.0.0-beta.2",
|
|
"resend": "^3.4.0",
|
|
"robots-parser": "^3.0.1",
|
|
"scrapingbee": "^1.7.4",
|
|
"stripe": "^16.1.0",
|
|
"systeminformation": "^5.22.11",
|
|
"turndown": "^7.1.3",
|
|
"turndown-plugin-gfm": "^1.0.2",
|
|
"typesense": "^1.5.4",
|
|
"unstructured-client": "^0.11.3",
|
|
"uuid": "^10.0.0",
|
|
"winston": "^3.14.2",
|
|
"winston-transport": "^4.8.0",
|
|
"wordpos": "^2.1.0",
|
|
"ws": "^8.18.0",
|
|
"xml2js": "^0.6.2",
|
|
"zod": "^3.23.8",
|
|
"zod-to-json-schema": "^3.23.1"
|
|
},
|
|
"nodemonConfig": {
|
|
"ignore": [
|
|
"*.docx",
|
|
"*.json",
|
|
"temp"
|
|
]
|
|
}
|
|
} |