Compare commits

...

22 Commits

Author SHA1 Message Date
dependabot[bot]
18352bc153
Merge 5453539fb4 into 3c1b1909f8 2024-11-15 15:34:43 +08:00
Nicolas
3c1b1909f8 Update map.ts
2024-11-14 17:52:15 -05:00
Nicolas
9519897102 Merge branch 'nsc/sitemap-only' 2024-11-14 17:44:39 -05:00
Nicolas
7f084c6c43 Nick: 2024-11-14 17:44:32 -05:00
Nicolas
e8bd089c8a
Merge pull request #901 from mendableai/nsc/sitemap-only
Allows `/map` to only return links present in the sitemap
2024-11-14 17:32:37 -05:00
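A minimal sketch of what this PR enables, assuming the v1 `/map` endpoint and a placeholder API key (the `sitemapOnly` flag comes from the `mapRequestSchema` change further down in this diff):

```ts
// Hypothetical client call; endpoint URL, API key, and target site are placeholders.
async function mapSitemapOnly() {
  const res = await fetch("https://api.firecrawl.dev/v1/map", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer fc-YOUR-API-KEY",
    },
    body: JSON.stringify({
      url: "https://example.com",
      sitemapOnly: true, // new flag; defaults to false in mapRequestSchema
    }),
  });
  console.log(await res.json()); // expected to contain only links found in the sitemap
}
```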
Nicolas
3fcdf57d2f Update fireEngine.ts 2024-11-14 17:31:30 -05:00
Nicolas
d62f12c9d9 Nick: moved away from axios 2024-11-14 17:31:23 -05:00
Nicolas
f155449458 Nick: sitemap only 2024-11-14 17:29:53 -05:00
Móricz Gergő
431e64e752 fix(batch/scrape/webhook): add batch_scrape.started 2024-11-14 22:40:03 +01:00
Nicolas
7bca4486b4 Update package.json 2024-11-14 16:37:53 -05:00
Móricz Gergő
df05124ef5 feat(v1/batch/scrape): webhooks 2024-11-14 22:36:28 +01:00
Gergő Móricz
86a78a03cb fix(sitemap): scrape with tlsclient 2024-11-14 18:56:32 +01:00
Eric Ciarla
62c8b63b84 Create README.md 2024-11-14 11:55:00 -05:00
Móricz Gergő
5519f077aa fix(scrapeURL): adjust error message for clarity
2024-11-14 10:13:48 +01:00
Móricz Gergő
0a1c99074f fix(html-to-markdown): make error reporting less intrusive 2024-11-14 08:58:00 +01:00
Nicolas
bd928b1512 Nick: changed email from hello to help
2024-11-13 20:27:20 -05:00
Gergő Móricz
0310cd2afa fix(crawl): redirect rebase
2024-11-13 21:38:44 +01:00
Nicolas
0d1c4e4e09 Update package.json 2024-11-13 13:54:22 -05:00
Gergő Móricz
32be2cf786
feat(v1/webhook): complex webhook object w/ headers (#899)
* feat(v1/webhook): complex webhook object w/ headers

* feat(js-sdk/crawl): add complex webhook support
2024-11-13 19:36:44 +01:00
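This PR replaces the plain webhook URL string with an object that can carry custom headers (see `webhookSchema` in the types.ts diff below). A minimal sketch of the new request shape, assuming the v1 crawl endpoint and placeholder credentials:

```ts
// Hypothetical request; endpoint, API key, and webhook target are placeholders.
async function startCrawlWithWebhook() {
  const res = await fetch("https://api.firecrawl.dev/v1/crawl", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer fc-YOUR-API-KEY",
    },
    body: JSON.stringify({
      url: "https://example.com",
      limit: 50,
      webhook: {
        url: "https://example.com/hooks/firecrawl",
        headers: { Authorization: "Bearer shared-secret" }, // forwarded on every webhook delivery
      },
    }),
  });
  console.log(await res.json());
}
```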
Nicolas
ea1302960f
Merge pull request #895 from mendableai/nsc/redlock-email
Redlock for sending email notifications
2024-11-13 12:45:55 -05:00
Nicolas
1a636b4e59 Update email_notification.ts 2024-11-12 20:09:01 -05:00
dependabot[bot]
5453539fb4
apps/api(deps): bump the prod-deps group across 1 directory with 40 updates
Bumps the prod-deps group with 40 updates in the /apps/api directory:

| Package | From | To |
| --- | --- | --- |
| [@anthropic-ai/sdk](https://github.com/anthropics/anthropic-sdk-typescript) | `0.24.3` | `0.32.1` |
| [@bull-board/api](https://github.com/felixmosh/bull-board/tree/HEAD/packages/api) | `5.20.5` | `6.4.0` |
| [@bull-board/express](https://github.com/felixmosh/bull-board/tree/HEAD/packages/express) | `5.20.5` | `6.4.0` |
| [@dqbd/tiktoken](https://github.com/dqbd/tiktoken) | `1.0.16` | `1.0.17` |
| [@nangohq/node](https://github.com/NangoHQ/nango/tree/HEAD/packages/node-client) | `0.40.8` | `0.42.22` |
| [@sentry/cli](https://github.com/getsentry/sentry-cli) | `2.33.1` | `2.38.2` |
| [@sentry/node](https://github.com/getsentry/sentry-javascript) | `8.26.0` | `8.38.0` |
| [@sentry/profiling-node](https://github.com/getsentry/sentry-javascript) | `8.26.0` | `8.38.0` |
| [@supabase/supabase-js](https://github.com/supabase/supabase-js) | `2.44.2` | `2.46.1` |
| [@types/express-ws](https://github.com/DefinitelyTyped/DefinitelyTyped/tree/HEAD/types/express-ws) | `3.0.4` | `3.0.5` |
| [@types/ws](https://github.com/DefinitelyTyped/DefinitelyTyped/tree/HEAD/types/ws) | `8.5.12` | `8.5.13` |
| [ajv](https://github.com/ajv-validator/ajv) | `8.16.0` | `8.17.1` |
| [async](https://github.com/caolan/async) | `3.2.5` | `3.2.6` |
| [axios](https://github.com/axios/axios) | `1.7.2` | `1.7.7` |
| [bullmq](https://github.com/taskforcesh/bullmq) | `5.11.0` | `5.25.6` |
| [cacheable-lookup](https://github.com/szmarczak/cacheable-lookup) | `6.1.0` | `7.0.0` |
| [cheerio](https://github.com/cheeriojs/cheerio) | `1.0.0-rc.12` | `1.0.0` |
| [date-fns](https://github.com/date-fns/date-fns) | `3.6.0` | `4.1.0` |
| [express-rate-limit](https://github.com/express-rate-limit/express-rate-limit) | `7.3.1` | `7.4.1` |
| [glob](https://github.com/isaacs/node-glob) | `10.4.2` | `11.0.0` |
| [json-schema-to-zod](https://github.com/StefanTerdell/json-schema-to-zod) | `2.3.0` | `2.4.1` |
| [koffi](https://github.com/Koromix/koffi) | `2.9.0` | `2.9.2` |
| [langchain](https://github.com/langchain-ai/langchainjs) | `0.2.8` | `0.3.5` |
| [luxon](https://github.com/moment/luxon) | `3.4.4` | `3.5.0` |
| [marked](https://github.com/markedjs/marked) | `14.1.2` | `15.0.0` |
| [mongoose](https://github.com/Automattic/mongoose) | `8.4.4` | `8.8.1` |
| [natural](https://github.com/NaturalNode/natural) | `7.0.7` | `8.0.1` |
| [openai](https://github.com/openai/openai-node) | `4.57.0` | `4.72.0` |
| [posthog-node](https://github.com/PostHog/posthog-js-lite/tree/HEAD/posthog-node) | `4.0.1` | `4.2.1` |
| [puppeteer](https://github.com/puppeteer/puppeteer) | `22.12.1` | `23.7.1` |
| [rate-limiter-flexible](https://github.com/animir/node-rate-limiter-flexible) | `2.4.2` | `5.0.4` |
| [resend](https://github.com/resendlabs/resend-node) | `3.4.0` | `4.0.0` |
| [scrapingbee](https://github.com/ScrapingBee/scrapingbee-node) | `1.7.4` | `1.7.5` |
| [stripe](https://github.com/stripe/stripe-node) | `16.1.0` | `17.3.1` |
| [systeminformation](https://github.com/sebhildebrandt/systeminformation) | `5.22.11` | `5.23.5` |
| [unstructured-client](https://github.com/Unstructured-IO/unstructured-js-client) | `0.11.3` | `0.18.2` |
| [uuid](https://github.com/uuidjs/uuid) | `10.0.0` | `11.0.3` |
| [winston](https://github.com/winstonjs/winston) | `3.14.2` | `3.17.0` |
| [winston-transport](https://github.com/winstonjs/winston-transport) | `4.8.0` | `4.9.0` |
| [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) | `3.23.1` | `3.23.5` |



Updates `@anthropic-ai/sdk` from 0.24.3 to 0.32.1
- [Release notes](https://github.com/anthropics/anthropic-sdk-typescript/releases)
- [Changelog](https://github.com/anthropics/anthropic-sdk-typescript/blob/main/CHANGELOG.md)
- [Commits](https://github.com/anthropics/anthropic-sdk-typescript/compare/sdk-v0.24.3...sdk-v0.32.1)

Updates `@bull-board/api` from 5.20.5 to 6.4.0
- [Release notes](https://github.com/felixmosh/bull-board/releases)
- [Changelog](https://github.com/felixmosh/bull-board/blob/master/CHANGELOG.md)
- [Commits](https://github.com/felixmosh/bull-board/commits/v6.4.0/packages/api)

Updates `@bull-board/express` from 5.20.5 to 6.4.0
- [Release notes](https://github.com/felixmosh/bull-board/releases)
- [Changelog](https://github.com/felixmosh/bull-board/blob/master/CHANGELOG.md)
- [Commits](https://github.com/felixmosh/bull-board/commits/v6.4.0/packages/express)

Updates `@dqbd/tiktoken` from 1.0.16 to 1.0.17
- [Release notes](https://github.com/dqbd/tiktoken/releases)
- [Changelog](https://github.com/dqbd/tiktoken/blob/main/CHANGELOG.md)
- [Commits](https://github.com/dqbd/tiktoken/compare/@dqbd/tiktoken@1.0.16...@dqbd/tiktoken@1.0.17)

Updates `@nangohq/node` from 0.40.8 to 0.42.22
- [Release notes](https://github.com/NangoHQ/nango/releases)
- [Changelog](https://github.com/NangoHQ/nango/blob/master/CHANGELOG.md)
- [Commits](https://github.com/NangoHQ/nango/commits/v0.42.22/packages/node-client)

Updates `@sentry/cli` from 2.33.1 to 2.38.2
- [Release notes](https://github.com/getsentry/sentry-cli/releases)
- [Changelog](https://github.com/getsentry/sentry-cli/blob/master/CHANGELOG.md)
- [Commits](https://github.com/getsentry/sentry-cli/compare/2.33.1...2.38.2)

Updates `@sentry/node` from 8.26.0 to 8.38.0
- [Release notes](https://github.com/getsentry/sentry-javascript/releases)
- [Changelog](https://github.com/getsentry/sentry-javascript/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/getsentry/sentry-javascript/compare/8.26.0...8.38.0)

Updates `@sentry/profiling-node` from 8.26.0 to 8.38.0
- [Release notes](https://github.com/getsentry/sentry-javascript/releases)
- [Changelog](https://github.com/getsentry/sentry-javascript/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/getsentry/sentry-javascript/compare/8.26.0...8.38.0)

Updates `@supabase/supabase-js` from 2.44.2 to 2.46.1
- [Release notes](https://github.com/supabase/supabase-js/releases)
- [Changelog](https://github.com/supabase/supabase-js/blob/master/RELEASE.md)
- [Commits](https://github.com/supabase/supabase-js/compare/v2.44.2...v2.46.1)

Updates `@types/express-ws` from 3.0.4 to 3.0.5
- [Release notes](https://github.com/DefinitelyTyped/DefinitelyTyped/releases)
- [Commits](https://github.com/DefinitelyTyped/DefinitelyTyped/commits/HEAD/types/express-ws)

Updates `@types/ws` from 8.5.12 to 8.5.13
- [Release notes](https://github.com/DefinitelyTyped/DefinitelyTyped/releases)
- [Commits](https://github.com/DefinitelyTyped/DefinitelyTyped/commits/HEAD/types/ws)

Updates `ajv` from 8.16.0 to 8.17.1
- [Release notes](https://github.com/ajv-validator/ajv/releases)
- [Commits](https://github.com/ajv-validator/ajv/compare/v8.16.0...v8.17.1)

Updates `async` from 3.2.5 to 3.2.6
- [Release notes](https://github.com/caolan/async/releases)
- [Changelog](https://github.com/caolan/async/blob/master/CHANGELOG.md)
- [Commits](https://github.com/caolan/async/compare/v3.2.5...v3.2.6)

Updates `axios` from 1.7.2 to 1.7.7
- [Release notes](https://github.com/axios/axios/releases)
- [Changelog](https://github.com/axios/axios/blob/v1.x/CHANGELOG.md)
- [Commits](https://github.com/axios/axios/compare/v1.7.2...v1.7.7)

Updates `bullmq` from 5.11.0 to 5.25.6
- [Release notes](https://github.com/taskforcesh/bullmq/releases)
- [Commits](https://github.com/taskforcesh/bullmq/compare/v5.11.0...v5.25.6)

Updates `cacheable-lookup` from 6.1.0 to 7.0.0
- [Release notes](https://github.com/szmarczak/cacheable-lookup/releases)
- [Commits](https://github.com/szmarczak/cacheable-lookup/compare/v6.1.0...v7.0.0)

Updates `cheerio` from 1.0.0-rc.12 to 1.0.0
- [Release notes](https://github.com/cheeriojs/cheerio/releases)
- [Commits](https://github.com/cheeriojs/cheerio/compare/v1.0.0-rc.12...v1.0.0)

Updates `date-fns` from 3.6.0 to 4.1.0
- [Release notes](https://github.com/date-fns/date-fns/releases)
- [Changelog](https://github.com/date-fns/date-fns/blob/main/CHANGELOG.md)
- [Commits](https://github.com/date-fns/date-fns/compare/v3.6.0...v4.1.0)

Updates `express-rate-limit` from 7.3.1 to 7.4.1
- [Release notes](https://github.com/express-rate-limit/express-rate-limit/releases)
- [Commits](https://github.com/express-rate-limit/express-rate-limit/compare/v7.3.1...v7.4.1)

Updates `glob` from 10.4.2 to 11.0.0
- [Changelog](https://github.com/isaacs/node-glob/blob/main/changelog.md)
- [Commits](https://github.com/isaacs/node-glob/compare/v10.4.2...v11.0.0)

Updates `json-schema-to-zod` from 2.3.0 to 2.4.1
- [Commits](https://github.com/StefanTerdell/json-schema-to-zod/commits)

Updates `koffi` from 2.9.0 to 2.9.2
- [Commits](https://github.com/Koromix/koffi/commits)

Updates `langchain` from 0.2.8 to 0.3.5
- [Release notes](https://github.com/langchain-ai/langchainjs/releases)
- [Changelog](https://github.com/langchain-ai/langchainjs/blob/main/release_workspace.js)
- [Commits](https://github.com/langchain-ai/langchainjs/compare/0.2.8...0.3.5)

Updates `luxon` from 3.4.4 to 3.5.0
- [Changelog](https://github.com/moment/luxon/blob/master/CHANGELOG.md)
- [Commits](https://github.com/moment/luxon/compare/3.4.4...3.5.0)

Updates `marked` from 14.1.2 to 15.0.0
- [Release notes](https://github.com/markedjs/marked/releases)
- [Changelog](https://github.com/markedjs/marked/blob/master/.releaserc.json)
- [Commits](https://github.com/markedjs/marked/compare/v14.1.2...v15.0.0)

Updates `mongoose` from 8.4.4 to 8.8.1
- [Release notes](https://github.com/Automattic/mongoose/releases)
- [Changelog](https://github.com/Automattic/mongoose/blob/master/CHANGELOG.md)
- [Commits](https://github.com/Automattic/mongoose/compare/8.4.4...8.8.1)

Updates `natural` from 7.0.7 to 8.0.1
- [Release notes](https://github.com/NaturalNode/natural/releases)
- [Commits](https://github.com/NaturalNode/natural/compare/v7.0.7...v8.0.1)

Updates `openai` from 4.57.0 to 4.72.0
- [Release notes](https://github.com/openai/openai-node/releases)
- [Changelog](https://github.com/openai/openai-node/blob/master/CHANGELOG.md)
- [Commits](https://github.com/openai/openai-node/compare/v4.57.0...v4.72.0)

Updates `posthog-node` from 4.0.1 to 4.2.1
- [Release notes](https://github.com/PostHog/posthog-js-lite/releases)
- [Changelog](https://github.com/PostHog/posthog-js-lite/blob/main/posthog-node/CHANGELOG.md)
- [Commits](https://github.com/PostHog/posthog-js-lite/commits/posthog-node-v4.2.1/posthog-node)

Updates `puppeteer` from 22.12.1 to 23.7.1
- [Release notes](https://github.com/puppeteer/puppeteer/releases)
- [Changelog](https://github.com/puppeteer/puppeteer/blob/main/release-please-config.json)
- [Commits](https://github.com/puppeteer/puppeteer/compare/puppeteer-v22.12.1...puppeteer-v23.7.1)

Updates `rate-limiter-flexible` from 2.4.2 to 5.0.4
- [Release notes](https://github.com/animir/node-rate-limiter-flexible/releases)
- [Commits](https://github.com/animir/node-rate-limiter-flexible/commits)

Updates `resend` from 3.4.0 to 4.0.0
- [Release notes](https://github.com/resendlabs/resend-node/releases)
- [Commits](https://github.com/resendlabs/resend-node/compare/3.4.0...v4.0.0)

Updates `scrapingbee` from 1.7.4 to 1.7.5
- [Release notes](https://github.com/ScrapingBee/scrapingbee-node/releases)
- [Changelog](https://github.com/ScrapingBee/scrapingbee-node/blob/master/CHANGELOG.md)
- [Commits](https://github.com/ScrapingBee/scrapingbee-node/compare/v1.7.4...v1.7.5)

Updates `stripe` from 16.1.0 to 17.3.1
- [Release notes](https://github.com/stripe/stripe-node/releases)
- [Changelog](https://github.com/stripe/stripe-node/blob/master/CHANGELOG.md)
- [Commits](https://github.com/stripe/stripe-node/compare/v16.1.0...v17.3.1)

Updates `systeminformation` from 5.22.11 to 5.23.5
- [Changelog](https://github.com/sebhildebrandt/systeminformation/blob/master/CHANGELOG.md)
- [Commits](https://github.com/sebhildebrandt/systeminformation/compare/v5.22.11...v5.23.5)

Updates `unstructured-client` from 0.11.3 to 0.18.2
- [Release notes](https://github.com/Unstructured-IO/unstructured-js-client/releases)
- [Changelog](https://github.com/Unstructured-IO/unstructured-js-client/blob/main/CHANGELOG.md)
- [Commits](https://github.com/Unstructured-IO/unstructured-js-client/compare/v0.11.3...v0.18.2)

Updates `uuid` from 10.0.0 to 11.0.3
- [Release notes](https://github.com/uuidjs/uuid/releases)
- [Changelog](https://github.com/uuidjs/uuid/blob/main/CHANGELOG.md)
- [Commits](https://github.com/uuidjs/uuid/compare/v10.0.0...v11.0.3)

Updates `winston` from 3.14.2 to 3.17.0
- [Release notes](https://github.com/winstonjs/winston/releases)
- [Changelog](https://github.com/winstonjs/winston/blob/master/CHANGELOG.md)
- [Commits](https://github.com/winstonjs/winston/compare/v3.14.2...v3.17.0)

Updates `winston-transport` from 4.8.0 to 4.9.0
- [Release notes](https://github.com/winstonjs/winston-transport/releases)
- [Changelog](https://github.com/winstonjs/winston-transport/blob/master/CHANGELOG.md)
- [Commits](https://github.com/winstonjs/winston-transport/compare/v4.8.0...v4.9.0)

Updates `zod-to-json-schema` from 3.23.1 to 3.23.5
- [Release notes](https://github.com/StefanTerdell/zod-to-json-schema/releases)
- [Changelog](https://github.com/StefanTerdell/zod-to-json-schema/blob/master/changelog.md)
- [Commits](https://github.com/StefanTerdell/zod-to-json-schema/commits)

---
updated-dependencies:
- dependency-name: "@anthropic-ai/sdk"
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: "@bull-board/api"
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: "@bull-board/express"
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: "@dqbd/tiktoken"
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-deps
- dependency-name: "@nangohq/node"
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: "@sentry/cli"
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: "@sentry/node"
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: "@sentry/profiling-node"
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: "@supabase/supabase-js"
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: "@types/express-ws"
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-deps
- dependency-name: "@types/ws"
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-deps
- dependency-name: ajv
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: async
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-deps
- dependency-name: axios
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-deps
- dependency-name: bullmq
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: cacheable-lookup
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: cheerio
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-deps
- dependency-name: date-fns
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: express-rate-limit
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: glob
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: json-schema-to-zod
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: koffi
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-deps
- dependency-name: langchain
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: luxon
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: marked
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: mongoose
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: natural
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: openai
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: posthog-node
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: puppeteer
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: rate-limiter-flexible
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: resend
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: scrapingbee
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-deps
- dependency-name: stripe
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: systeminformation
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: unstructured-client
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: uuid
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: prod-deps
- dependency-name: winston
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: winston-transport
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-deps
- dependency-name: zod-to-json-schema
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-deps
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-12 21:54:15 +00:00
22 changed files with 1394 additions and 1236 deletions

View File

@ -50,79 +50,79 @@
"typescript": "^5.4.2"
},
"dependencies": {
"@anthropic-ai/sdk": "^0.24.3",
"@anthropic-ai/sdk": "^0.32.1",
"@brillout/import": "^0.2.2",
"@bull-board/api": "^5.20.5",
"@bull-board/express": "^5.20.5",
"@bull-board/api": "^6.4.0",
"@bull-board/express": "^6.4.0",
"@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.16",
"@nangohq/node": "^0.40.8",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",
"@sentry/profiling-node": "^8.26.0",
"@supabase/supabase-js": "^2.44.2",
"@types/express-ws": "^3.0.4",
"@types/ws": "^8.5.12",
"ajv": "^8.16.0",
"async": "^3.2.5",
"@dqbd/tiktoken": "^1.0.17",
"@nangohq/node": "^0.42.22",
"@sentry/cli": "^2.38.2",
"@sentry/node": "^8.38.0",
"@sentry/profiling-node": "^8.38.0",
"@supabase/supabase-js": "^2.46.1",
"@types/express-ws": "^3.0.5",
"@types/ws": "^8.5.13",
"ajv": "^8.17.1",
"async": "^3.2.6",
"async-mutex": "^0.5.0",
"axios": "^1.3.4",
"axios": "^1.7.7",
"axios-retry": "^4.5.0",
"bottleneck": "^2.19.5",
"bullmq": "^5.11.0",
"cacheable-lookup": "^6.1.0",
"cheerio": "^1.0.0-rc.12",
"bullmq": "^5.25.6",
"cacheable-lookup": "^7.0.0",
"cheerio": "^1.0.0",
"cohere": "^1.1.1",
"cors": "^2.8.5",
"cron-parser": "^4.9.0",
"date-fns": "^3.6.0",
"date-fns": "^4.1.0",
"dotenv": "^16.3.1",
"dotenv-cli": "^7.4.2",
"escape-html": "^1.0.3",
"express-rate-limit": "^7.3.1",
"express-rate-limit": "^7.4.1",
"express-ws": "^5.0.2",
"glob": "^10.4.2",
"glob": "^11.0.0",
"gpt3-tokenizer": "^1.1.5",
"ioredis": "^5.4.1",
"joplin-turndown-plugin-gfm": "^1.0.12",
"json-schema-to-zod": "^2.3.0",
"json-schema-to-zod": "^2.4.1",
"keyword-extractor": "^0.0.28",
"koffi": "^2.9.0",
"langchain": "^0.2.8",
"koffi": "^2.9.2",
"langchain": "^0.3.5",
"languagedetect": "^2.0.0",
"logsnag": "^1.0.0",
"luxon": "^3.4.3",
"marked": "^14.1.2",
"luxon": "^3.5.0",
"marked": "^15.0.0",
"md5": "^2.3.0",
"moment": "^2.29.4",
"mongoose": "^8.4.4",
"natural": "^7.0.7",
"openai": "^4.57.0",
"mongoose": "^8.8.1",
"natural": "^8.0.1",
"openai": "^4.72.0",
"pdf-parse": "^1.1.1",
"pos": "^0.4.2",
"posthog-node": "^4.0.1",
"posthog-node": "^4.2.1",
"promptable": "^0.0.10",
"puppeteer": "^22.12.1",
"rate-limiter-flexible": "2.4.2",
"puppeteer": "^23.7.1",
"rate-limiter-flexible": "5.0.4",
"redlock": "5.0.0-beta.2",
"resend": "^3.4.0",
"resend": "^4.0.0",
"robots-parser": "^3.0.1",
"scrapingbee": "^1.7.4",
"stripe": "^16.1.0",
"systeminformation": "^5.22.11",
"scrapingbee": "^1.7.5",
"stripe": "^17.3.1",
"systeminformation": "^5.23.5",
"turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2",
"typesense": "^1.5.4",
"undici": "^6.20.1",
"unstructured-client": "^0.11.3",
"uuid": "^10.0.0",
"winston": "^3.14.2",
"winston-transport": "^4.8.0",
"unstructured-client": "^0.18.2",
"uuid": "^11.0.3",
"winston": "^3.17.0",
"winston-transport": "^4.9.0",
"wordpos": "^2.1.0",
"ws": "^8.18.0",
"xml2js": "^0.6.2",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.1"
"zod-to-json-schema": "^3.23.5"
},
"nodemonConfig": {
"ignore": [

File diff suppressed because it is too large.

View File

@ -75,7 +75,7 @@ export async function crawlController(req: Request, res: Response) {
await checkTeamCredits(chunk, team_id, limitCheck);
if (!creditsCheckSuccess) {
return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" });
return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" });
}
// TODO: need to do this to v1

View File

@ -209,7 +209,7 @@ export async function scrapeController(req: Request, res: Response) {
earlyReturn = true;
return res.status(500).json({
error:
"Error checking team credits. Please contact hello@firecrawl.com for help.",
"Error checking team credits. Please contact help@firecrawl.com for help.",
});
}

View File

@ -16,6 +16,7 @@ import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
import { addScrapeJobs } from "../../services/queue-jobs";
import { callWebhook } from "../../services/webhook";
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
@ -66,6 +67,7 @@ export async function batchScrapeController(
crawl_id: id,
sitemapped: true,
v1: true,
webhook: req.body.webhook,
},
opts: {
jobId: uuidv4(),
@ -85,6 +87,10 @@ export async function batchScrapeController(
);
await addScrapeJobs(jobs);
if(req.body.webhook) {
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started");
}
const protocol = process.env.ENV === "local" ? req.protocol : "https";
return res.status(200).json({

View File

@ -175,7 +175,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
return close(ws, 1011, {
type: "error",
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id
});
}
}

View File

@ -1,10 +1,6 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
mapRequestSchema,
RequestWithAuth,
scrapeOptions,
} from "./types";
import { mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
@ -46,6 +42,7 @@ export async function mapController(
originUrl: req.body.url,
crawlerOptions: {
...req.body,
limit: req.body.sitemapOnly ? 10000000 : limit,
scrapeOptions: undefined,
},
scrapeOptions: scrapeOptions.parse({}),
@ -57,77 +54,93 @@ export async function mapController(
const crawler = crawlToCrawler(id, sc);
let urlWithoutWww = req.body.url.replace("www.", "");
let mapUrl = req.body.search
? `"${req.body.search}" site:${urlWithoutWww}`
: `site:${req.body.url}`;
const resultsPerPage = 100;
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;
let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
if (cachedResult) {
allResults = JSON.parse(cachedResult);
} else {
const fetchPage = async (page: number) => {
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page,
// If sitemapOnly is true, only get links from sitemap
if (req.body.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
};
links = links.slice(1, limit);
}
} else {
let urlWithoutWww = req.body.url.replace("www.", "");
pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
allResults = await Promise.all(pagePromises);
let mapUrl = req.body.search
? `"${req.body.search}" site:${urlWithoutWww}`
: `site:${req.body.url}`;
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
}
const resultsPerPage = 100;
const maxPages = Math.ceil(
Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
);
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
...(cachedResult ? [] : pagePromises),
]);
const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;
if (!cachedResult) {
allResults = searchResults;
}
let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}
let mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
if (mapResults.length > minumumCutoff) {
mapResults = mapResults.slice(0, minumumCutoff);
}
if (mapResults.length > 0) {
if (req.body.search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
if (cachedResult) {
allResults = JSON.parse(cachedResult);
} else {
mapResults.map((x) => {
const fetchPage = async (page: number) => {
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page,
});
};
pagePromises = Array.from({ length: maxPages }, (_, i) =>
fetchPage(i + 1)
);
allResults = await Promise.all(pagePromises);
await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
}
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true),
...(cachedResult ? [] : pagePromises),
]);
if (!cachedResult) {
allResults = searchResults;
}
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}
}
let mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
if (mapResults.length > minumumCutoff) {
mapResults = mapResults.slice(0, minumumCutoff);
}
if (mapResults.length > 0) {
if (req.body.search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
} else {
mapResults.map((x) => {
links.push(x.url);
});
}
}
}
// Perform cosine similarity between the search query and the list of links
if (req.body.search) {
const searchQuery = req.body.search.toLowerCase();

View File

@ -175,9 +175,21 @@ export const scrapeRequestSchema = scrapeOptions.extend({
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
export const webhookSchema = z.preprocess(x => {
if (typeof x === "string") {
return { url: x };
} else {
return x;
}
}, z.object({
url: z.string().url(),
headers: z.record(z.string(), z.string()).default({}),
}).strict(strictMessage))
export const batchScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(),
}).strict(strictMessage).refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
@ -224,7 +236,7 @@ export const crawlRequestSchema = crawlerOptions.extend({
url,
origin: z.string().optional().default("api"),
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
webhook: z.string().url().optional(),
webhook: webhookSchema.optional(),
limit: z.number().default(10000),
}).strict(strictMessage);
@ -249,6 +261,7 @@ export const mapRequestSchema = crawlerOptions.extend({
includeSubdomains: z.boolean().default(true),
search: z.string().optional(),
ignoreSitemap: z.boolean().default(false),
sitemapOnly: z.boolean().default(false),
limit: z.number().min(1).max(5000).default(5000),
}).strict(strictMessage);
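
For illustration, a self-contained sketch of how the new `webhookSchema` normalizes both accepted forms; the schema is re-declared here with a plain `.strict()` because `strictMessage` is defined elsewhere in types.ts:

```ts
import { z } from "zod";

// Re-declaration of the new webhookSchema for illustration
// (plain .strict() here; strictMessage lives elsewhere in types.ts).
const webhookSchema = z.preprocess(
  (x) => (typeof x === "string" ? { url: x } : x),
  z
    .object({
      url: z.string().url(),
      headers: z.record(z.string(), z.string()).default({}),
    })
    .strict()
);

// A bare URL string is upgraded to the object form with empty headers...
console.log(webhookSchema.parse("https://example.com/hooks/firecrawl"));
// => { url: "https://example.com/hooks/firecrawl", headers: {} }

// ...while the object form carries custom headers through unchanged.
console.log(
  webhookSchema.parse({
    url: "https://example.com/hooks/firecrawl",
    headers: { Authorization: "Bearer shared-secret" },
  })
);
```

Either way, downstream consumers can rely on `webhookUrl.url` and `webhookUrl.headers` being present, which is what the webhook service diff below takes advantage of.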

View File

@ -207,7 +207,7 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
}
logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id });
});
logger.info(`Worker ${process.pid} started`);

View File

@ -166,10 +166,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
return res;
}
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler {
const crawler = new WebCrawler({
jobId: id,
initialUrl: sc.originUrl!,
baseUrl: newBase ? new URL(newBase).origin : undefined,
includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,

View File

@ -6,22 +6,28 @@ import * as Sentry from "@sentry/node";
import dotenv from 'dotenv';
import { logger } from './logger';
import { stat } from 'fs/promises';
dotenv.config();
// TODO: add a timeout to the Go parser
const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
class GoMarkdownConverter {
private static instance: GoMarkdownConverter;
private convert: any;
private constructor() {
const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
const lib = koffi.load(goExecutablePath);
this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
}
public static getInstance(): GoMarkdownConverter {
public static async getInstance(): Promise<GoMarkdownConverter> {
if (!GoMarkdownConverter.instance) {
try {
await stat(goExecutablePath);
} catch (_) {
throw Error("Go shared library not found");
}
GoMarkdownConverter.instance = new GoMarkdownConverter();
}
return GoMarkdownConverter.instance;
@ -47,7 +53,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
try {
if (process.env.USE_GO_MARKDOWN_PARSER == "true") {
const converter = GoMarkdownConverter.getInstance();
const converter = await GoMarkdownConverter.getInstance();
let markdownContent = await converter.convertHTMLToMarkdown(html);
markdownContent = processMultiLineLinks(markdownContent);
@ -56,8 +62,12 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
return markdownContent;
}
} catch (error) {
Sentry.captureException(error);
logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
if (!(error instanceof Error) || error.message !== "Go shared library not found") {
Sentry.captureException(error);
logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
} else {
logger.warn("Tried to use Go parser, but it doesn't exist in the file system.", { goExecutablePath });
}
}
// Fallback to TurndownService if Go parser fails or is not enabled
@ -89,7 +99,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
return markdownContent;
} catch (error) {
console.error("Error converting HTML to Markdown: ", error);
logger.error("Error converting HTML to Markdown", {error});
return ""; // Optionally return an empty string or handle the error as needed
}
}

View File

@ -27,6 +27,7 @@ export class WebCrawler {
constructor({
jobId,
initialUrl,
baseUrl,
includes,
excludes,
maxCrawledLinks = 10000,
@ -38,6 +39,7 @@ export class WebCrawler {
}: {
jobId: string;
initialUrl: string;
baseUrl?: string;
includes?: string[];
excludes?: string[];
maxCrawledLinks?: number;
@ -49,7 +51,7 @@ export class WebCrawler {
}) {
this.jobId = jobId;
this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin;
this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
this.includes = Array.isArray(includes) ? includes : [];
this.excludes = Array.isArray(excludes) ? excludes : [];
this.limit = limit;
@ -63,7 +65,12 @@ export class WebCrawler {
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
}
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
// If the initial URL is a sitemap.xml, skip filtering
if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
return sitemapLinks.slice(0, limit);
}
return sitemapLinks
.filter((link) => {
let url: URL;
@ -157,11 +164,14 @@ export class WebCrawler {
this.robots = robotsParser(this.robotsTxtUrl, txt);
}
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if(fromMap && onlySitemap) {
return sitemapLinks.map(link => ({ url: link, html: "" }));
}
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
return filteredLinks.map(link => ({ url: link, html: "" }));
}
return null;
@ -351,6 +361,7 @@ export class WebCrawler {
return url;
};
const sitemapUrl = url.endsWith("/sitemap.xml")
? url
: `${url}/sitemap.xml`;

View File

@ -24,7 +24,7 @@ export async function getLinksFromSitemap(
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === 'fire-engine') {
const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });;
const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true });
if (!response.success) {
throw response.error;
}

View File

@ -18,7 +18,7 @@ export class NoEnginesLeftError extends Error {
public results: EngineResultsTracker;
constructor(fallbackList: Engine[], results: EngineResultsTracker) {
super("All scraping engines failed!");
super("All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com.");
this.fallbackList = fallbackList;
this.results = results;
}

View File

@ -1,4 +1,3 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
import * as Sentry from "@sentry/node";
@ -6,7 +5,6 @@ import { logger } from "../lib/logger";
dotenv.config();
export async function fireEngineMap(
q: string,
options: {
@ -37,18 +35,18 @@ export async function fireEngineMap(
return [];
}
let config = {
const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
method: "POST",
url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
headers: {
"Content-Type": "application/json",
"X-Disable-Cache": "true"
"X-Disable-Cache": "true",
},
data: data,
};
const response = await axios(config);
if (response && response.data) {
return response.data;
body: data,
});
if (response.ok) {
const responseData = await response.json();
return responseData;
} else {
return [];
}

View File

@ -6,6 +6,7 @@ import { logger } from "../../../src/lib/logger";
import { sendSlackWebhook } from "../alerts/slack";
import { getNotificationString } from "./notification_string";
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
import { redlock } from "../redlock";
const emailTemplates: Record<
NotificationType,
@ -22,7 +23,7 @@ const emailTemplates: Record<
},
[NotificationType.RATE_LIMIT_REACHED]: {
subject: "Rate Limit Reached - Firecrawl",
html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
},
[NotificationType.AUTO_RECHARGE_SUCCESS]: {
subject: "Auto recharge successful - Firecrawl",
@ -30,7 +31,7 @@ const emailTemplates: Record<
},
[NotificationType.AUTO_RECHARGE_FAILED]: {
subject: "Auto recharge failed - Firecrawl",
html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
},
};
@ -62,7 +63,7 @@ export async function sendEmailNotification(
const { data, error } = await resend.emails.send({
from: "Firecrawl <firecrawl@getmendableai.com>",
to: [email],
reply_to: "hello@firecrawl.com",
reply_to: "help@firecrawl.com",
subject: emailTemplates[notificationType].subject,
html: emailTemplates[notificationType].html,
});
@ -88,6 +89,7 @@ export async function sendNotificationInternal(
if (team_id === "preview") {
return { success: true };
}
return await redlock.using([`notification-lock:${team_id}:${notificationType}`], 5000, async () => {
if (!bypassRecentChecks) {
const fifteenDaysAgo = new Date();
@ -171,5 +173,6 @@ export async function sendNotificationInternal(
return { success: false };
}
return { success: true };
return { success: true };
});
}

View File

@ -262,7 +262,7 @@ async function processJob(job: Job & { id: string }, token: string) {
document: null,
project_id: job.data.project_id,
error:
"URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
"URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
};
return data;
}
@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) {
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
const links = crawler.filterLinks(
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),

View File

@ -1,15 +1,17 @@
import axios from "axios";
import { logger } from "../../src/lib/logger";
import { logger } from "../lib/logger";
import { supabase_service } from "./supabase";
import { WebhookEventType } from "../types";
import { configDotenv } from "dotenv";
import { z } from "zod";
import { webhookSchema } from "../controllers/v1/types";
configDotenv();
export const callWebhook = async (
teamId: string,
id: string,
data: any | null,
specified?: string,
specified?: z.infer<typeof webhookSchema>,
v1 = false,
eventType: WebhookEventType = "crawl.page",
awaitWebhook: boolean = false
@ -20,7 +22,7 @@ export const callWebhook = async (
id
);
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
let webhookUrl = specified ?? selfHostedUrl;
let webhookUrl = specified ?? (selfHostedUrl ? webhookSchema.parse({ url: selfHostedUrl }) : undefined);
// Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set
// and the USE_DB_AUTHENTICATION environment variable is set to true
@ -73,7 +75,7 @@ export const callWebhook = async (
if (awaitWebhook) {
try {
await axios.post(
webhookUrl,
webhookUrl.url,
{
success: !v1
? data.success
@ -92,6 +94,7 @@ export const callWebhook = async (
{
headers: {
"Content-Type": "application/json",
...webhookUrl.headers,
},
timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
}
@ -104,7 +107,7 @@ export const callWebhook = async (
} else {
axios
.post(
webhookUrl,
webhookUrl.url,
{
success: !v1
? data.success
@ -123,6 +126,7 @@ export const callWebhook = async (
{
headers: {
"Content-Type": "application/json",
...webhookUrl.headers,
},
timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
}

View File

@ -1,4 +1,5 @@
import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document } from "./controllers/v1/types";
import { z } from "zod";
import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document, webhookSchema } from "./controllers/v1/types";
import { ExtractorOptions, Document } from "./lib/entities";
import { InternalOptions } from "./scraper/scrapeURL";
@ -33,7 +34,7 @@ export interface WebScraperOptions {
origin?: string;
crawl_id?: string;
sitemapped?: boolean;
webhook?: string;
webhook?: z.infer<typeof webhookSchema>;
v1?: boolean;
is_scrape?: boolean;
}
@ -165,4 +166,4 @@ export type PlanType =
| "";
export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";
export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "batch_scrape.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.8.1",
"version": "1.8.4",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -153,7 +153,10 @@ export interface CrawlParams {
allowExternalLinks?: boolean;
ignoreSitemap?: boolean;
scrapeOptions?: CrawlScrapeOptions;
webhook?: string;
webhook?: string | {
url: string;
headers?: Record<string, string>;
};
deduplicateSimilarURLs?: boolean;
ignoreQueryParameters?: boolean;
}
@ -218,6 +221,7 @@ export interface MapParams {
search?: string;
ignoreSitemap?: boolean;
includeSubdomains?: boolean;
sitemapOnly?: boolean;
limit?: number;
}
@ -540,16 +544,18 @@ export default class FirecrawlApp {
* @param params - Additional parameters for the scrape request.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @param webhook - Optional webhook for the batch scrape.
* @returns The response from the crawl operation.
*/
async batchScrapeUrls(
urls: string[],
params?: ScrapeParams,
pollInterval: number = 2,
idempotencyKey?: string
idempotencyKey?: string,
webhook?: CrawlParams["webhook"],
): Promise<BatchScrapeStatusResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
let jsonData: any = { urls, ...(params ?? {}), webhook };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,

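Given the updated `batchScrapeUrls` signature above, a usage sketch with a hypothetical webhook target (API key, URLs, and the logged response shape are placeholders):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

async function runBatch() {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

  // The webhook is the fifth positional argument in the new signature.
  const result = await app.batchScrapeUrls(
    ["https://example.com/a", "https://example.com/b"],
    { formats: ["markdown"] },
    2,         // pollInterval in seconds
    undefined, // idempotencyKey
    {
      url: "https://example.com/hooks/firecrawl",
      headers: { Authorization: "Bearer shared-secret" },
    }
  );

  console.log(result); // BatchScrapeStatusResponse on success, ErrorResponse otherwise
}
```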
View File

@ -0,0 +1,6 @@
# AGI News ✨
AGI News is a daily AI newsletter that's completely sourced by autonomous AI agents. It is live at [https://www.aginews.io/](https://www.aginews.io/)
Here is a link to the repo:
[https://github.com/ericciarla/aginews](https://github.com/ericciarla/aginews)