Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)

Commit 697501cc8a: Merge remote-tracking branch 'origin/main' into f/rust-sdk
30  .github/dependabot.yml (vendored)

@@ -5,11 +5,8 @@ updates:
    directory: "/apps/playwright-service"
    schedule:
      interval: "weekly"
    groups:
      prod-deps:
        dependency-type: "production"
      dev-deps:
        dependency-type: "development"
    open-pull-requests-limit: 0 # Disable version updates
    security-updates: "all"
    commit-message:
      prefix: "apps/playwright-service"
      include: "scope"

@@ -19,11 +16,8 @@ updates:
    directory: "/apps/python-sdk"
    schedule:
      interval: "weekly"
    groups:
      prod-deps:
        dependency-type: "production"
      dev-deps:
        dependency-type: "development"
    open-pull-requests-limit: 0 # Disable version updates
    security-updates: "all"
    commit-message:
      prefix: "apps/python-sdk"
      include: "scope"

@@ -33,11 +27,8 @@ updates:
    directory: "/apps/api"
    schedule:
      interval: "weekly"
    groups:
      prod-deps:
        dependency-type: "production"
      dev-deps:
        dependency-type: "development"
    open-pull-requests-limit: 0 # Disable version updates
    security-updates: "all"
    commit-message:
      prefix: "apps/api"
      include: "scope"

@@ -47,11 +38,8 @@ updates:
    directory: "/apps/test-suite"
    schedule:
      interval: "weekly"
    groups:
      prod-deps:
        dependency-type: "production"
      dev-deps:
        dependency-type: "development"
    open-pull-requests-limit: 0 # Disable version updates
    security-updates: "all"
    commit-message:
      prefix: "apps/test-suite"
      include: "scope"
20  .github/workflows/check-queues.yml (vendored, new file)

@@ -0,0 +1,20 @@
name: Check Queues
on:
  schedule:
    - cron: '*/5 * * * *'

env:
  BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}

jobs:
  clean-jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Send GET request to check queues
        run: |
          response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/check-queues)
          if [ "$response" -ne 200 ]; then
            echo "Failed to check queues. Response: $response"
            exit 1
          fi
          echo "Successfully checked queues. Response: $response"
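The new workflow is a cron-driven health check against the admin endpoint. The same check can be run from a shell when debugging; this is just the workflow's curl call lifted out, assuming BULL_AUTH_KEY is exported in your environment:

```bash
# Mirrors the workflow step above; BULL_AUTH_KEY is assumed to be set locally.
response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 \
  "https://api.firecrawl.dev/admin/${BULL_AUTH_KEY}/check-queues")
if [ "$response" -ne 200 ]; then
  echo "Failed to check queues. Response: $response"
  exit 1
fi
echo "Successfully checked queues. Response: $response"
```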
6  .github/workflows/fly-direct.yml (vendored)

@@ -29,9 +29,9 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Change directory
        run: cd apps/api
      - uses: superfly/flyctl-actions/setup-flyctl@master
      - run: flyctl deploy ./apps/api --remote-only -a firecrawl-scraper-js
      - run: flyctl deploy --remote-only -a firecrawl-scraper-js
        working-directory: ./apps/api
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
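The deploy step now runs with `working-directory: ./apps/api` instead of passing the path to `flyctl deploy` (a separate `run: cd apps/api` step does not carry over anyway, since each `run` executes in a fresh shell). A rough manual equivalent, assuming flyctl is installed and authenticated via an exported FLY_API_TOKEN:

```bash
# Manual equivalent of the updated deploy step, run from the repository root.
# Assumes flyctl is installed and FLY_API_TOKEN is exported.
cd apps/api
flyctl deploy --remote-only -a firecrawl-scraper-js
```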
42  .github/workflows/fly.yml (vendored)

@@ -169,6 +169,41 @@ jobs:
        run: npm run test
        working-directory: ./apps/js-sdk/firecrawl

  go-sdk-tests:
    name: Go SDK Tests
    needs: pre-deploy-e2e-tests
    runs-on: ubuntu-latest
    services:
      redis:
        image: redis
        ports:
          - 6379:6379
    steps:
      - uses: actions/checkout@v3
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: "go.mod"
      - name: Install pnpm
        run: npm install -g pnpm
      - name: Install dependencies
        run: pnpm install
        working-directory: ./apps/api
      - name: Start the application
        run: npm start &
        working-directory: ./apps/api
        id: start_app
      - name: Start workers
        run: npm run workers &
        working-directory: ./apps/api
        id: start_workers
      - name: Install dependencies for Go SDK
        run: go mod tidy
        working-directory: ./apps/go-sdk
      - name: Run tests for Go SDK
        run: go test -v ./... -timeout 180s
        working-directory: ./apps/go-sdk/firecrawl

  rust-sdk-tests:
    name: Rust SDK Tests
    needs: pre-deploy-e2e-tests

@@ -210,12 +245,12 @@ jobs:
    needs: [pre-deploy-test-suite, python-sdk-tests, js-sdk-tests, rust-sdk-tests]
    steps:
      - uses: actions/checkout@v3
      - name: Change directory
        run: cd apps/api
      - uses: superfly/flyctl-actions/setup-flyctl@master
      - run: flyctl deploy ./apps/api --remote-only -a firecrawl-scraper-js
      - run: flyctl deploy --remote-only -a firecrawl-scraper-js
        working-directory: ./apps/api
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}

  build-and-publish-python-sdk:
    name: Build and publish Python SDK

@@ -297,7 +332,6 @@ jobs:
        run: |
          npm run build-and-publish
        working-directory: ./apps/js-sdk/firecrawl

  build-and-publish-rust-sdk:
    name: Build and publish Rust SDK
    runs-on: ubuntu-latest
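The new go-sdk-tests job starts the API and its workers from apps/api and then runs the Go SDK test suite against them. The same flow can be reproduced locally; a sketch, assuming Docker stands in for the job's redis service container and that Go, Node, and pnpm are installed:

```bash
# Stand-in for the workflow's redis service container.
docker run -d -p 6379:6379 redis

# Start the API and its workers, as the job does.
cd apps/api
npm install -g pnpm
pnpm install
npm start &
npm run workers &

# Run the Go SDK tests.
cd ../go-sdk
go mod tidy
cd firecrawl
go test -v ./... -timeout 180s
```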
3  .gitignore (vendored)

@@ -17,4 +17,5 @@ apps/test-suite/logs
apps/test-suite/load-test-results/test-run-report.json

apps/playwright-service-ts/node_modules/
apps/playwright-service-ts/package-lock.json
6  .gitmodules (vendored, new file)

@@ -0,0 +1,6 @@
[submodule "apps/go-sdk/firecrawl"]
	path = apps/go-sdk/firecrawl
	url = https://github.com/mendableai/firecrawl-go
[submodule "apps/go-sdk/examples"]
	path = apps/go-sdk/examples
	url = https://github.com/mendableai/firecrawl-go-examples
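Since the Go SDK and its examples are pulled in as submodules, a fresh checkout needs them initialized; the standard git commands are:

```bash
# Clone with submodules in one step...
git clone --recurse-submodules https://github.com/mendableai/firecrawl.git

# ...or initialize them in an existing checkout.
git submodule update --init --recursive
```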
@@ -24,6 +24,7 @@ NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://localhost:6379
REDIS_RATE_LIMIT_URL=redis://localhost:6379

## To turn on DB authentication, you need to set up supabase.
USE_DB_AUTHENTICATION=false
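The added REDIS_RATE_LIMIT_URL points at the same local Redis as REDIS_URL. The name of the env file is not shown in this excerpt, so as a sketch the values can simply be exported, with a single local Redis instance backing both URLs:

```bash
# One local Redis instance serves both REDIS_URL and REDIS_RATE_LIMIT_URL.
docker run -d -p 6379:6379 redis

# Values taken from the hunk above.
export PORT=3002
export HOST=0.0.0.0
export REDIS_URL=redis://localhost:6379
export REDIS_RATE_LIMIT_URL=redis://localhost:6379
export USE_DB_AUTHENTICATION=false
```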
329  LICENSE

The LICENSE hunks reproduce the standard GNU AGPL-3.0 text, with each changed line shown in two identically worded copies (formatting-only changes to the license text). The substantive change is the notice appended at the end of the file:

@@ -647,15 +647,34 @@ the "copyright" line and a pointer to where the full notice is found.

Firecrawl - Web scraping and crawling tool
Copyright (c) 2024 Sideguide Technologies Inc.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

For more information, please contact:
Sideguide Technologies Inc.
97
README.md
97
README.md
|
@ -20,8 +20,14 @@ We provide an easy to use API with our hosted version. You can find the playgrou
|
|||
- [x] [Python SDK](https://github.com/mendableai/firecrawl/tree/main/apps/python-sdk)
|
||||
- [x] [Node SDK](https://github.com/mendableai/firecrawl/tree/main/apps/js-sdk)
|
||||
- [x] [Langchain Integration 🦜🔗](https://python.langchain.com/docs/integrations/document_loaders/firecrawl/)
|
||||
- [x] [Langchain JS Integration 🦜🔗](https://js.langchain.com/docs/integrations/document_loaders/web_loaders/firecrawl)
|
||||
- [x] [Llama Index Integration 🦙](https://docs.llamaindex.ai/en/latest/examples/data_connectors/WebPageDemo/#using-firecrawl-reader)
|
||||
- [X] [Langchain JS Integration 🦜🔗](https://js.langchain.com/docs/integrations/document_loaders/web_loaders/firecrawl)
|
||||
- [x] [Dify Integration](https://dify.ai/blog/dify-ai-blog-integrated-with-firecrawl)
|
||||
- [x] [Langflow Integration](https://docs.langflow.org/)
|
||||
- [x] [Crew.ai Integration](https://docs.crewai.com/)
|
||||
- [x] [Flowise AI Integration](https://docs.flowiseai.com/integrations/langchain/document-loaders/firecrawl)
|
||||
- [x] [PraisonAI Integration](https://docs.praison.ai/firecrawl/)
|
||||
- [x] [Zapier Integration](https://zapier.com/apps/firecrawl/integrations)
|
||||
- [ ] Want an SDK or Integration? Let us know by opening an issue.
|
||||
|
||||
To run locally, refer to guide [here](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md).
|
||||
|
@ -189,30 +195,29 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \
|
|||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"data": {
|
||||
"content": "Raw Content",
|
||||
"metadata": {
|
||||
"title": "Mendable",
|
||||
"description": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide",
|
||||
"robots": "follow, index",
|
||||
"ogTitle": "Mendable",
|
||||
"ogDescription": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide",
|
||||
"ogUrl": "https://mendable.ai/",
|
||||
"ogImage": "https://mendable.ai/mendable_new_og1.png",
|
||||
"ogLocaleAlternate": [],
|
||||
"ogSiteName": "Mendable",
|
||||
"sourceURL": "https://mendable.ai/"
|
||||
},
|
||||
"llm_extraction": {
|
||||
"company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
|
||||
"supports_sso": true,
|
||||
"is_open_source": false,
|
||||
"is_in_yc": true
|
||||
}
|
||||
"success": true,
|
||||
"data": {
|
||||
"content": "Raw Content",
|
||||
"metadata": {
|
||||
"title": "Mendable",
|
||||
"description": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide",
|
||||
"robots": "follow, index",
|
||||
"ogTitle": "Mendable",
|
||||
"ogDescription": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide",
|
||||
"ogUrl": "https://mendable.ai/",
|
||||
"ogImage": "https://mendable.ai/mendable_new_og1.png",
|
||||
"ogLocaleAlternate": [],
|
||||
"ogSiteName": "Mendable",
|
||||
"sourceURL": "https://mendable.ai/"
|
||||
},
|
||||
"llm_extraction": {
|
||||
"company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
|
||||
"supports_sso": true,
|
||||
"is_open_source": false,
|
||||
"is_in_yc": true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Using Python SDK
|
||||
|
@ -253,7 +258,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup
|
|||
```python
|
||||
class ArticleSchema(BaseModel):
|
||||
title: str
|
||||
points: int
|
||||
points: int
|
||||
by: str
|
||||
commentsURL: str
|
||||
|
||||
|
@ -302,34 +307,29 @@ To scrape a single URL with error handling, use the `scrapeUrl` method. It takes
|
|||
|
||||
```js
|
||||
try {
|
||||
const url = 'https://example.com';
|
||||
const url = "https://example.com";
|
||||
const scrapedData = await app.scrapeUrl(url);
|
||||
console.log(scrapedData);
|
||||
|
||||
} catch (error) {
|
||||
console.error(
|
||||
'Error occurred while scraping:',
|
||||
error.message
|
||||
);
|
||||
console.error("Error occurred while scraping:", error.message);
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### Crawling a Website
|
||||
|
||||
To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
|
||||
|
||||
```js
|
||||
const crawlUrl = 'https://example.com';
|
||||
const crawlUrl = "https://example.com";
|
||||
const params = {
|
||||
crawlerOptions: {
|
||||
excludes: ['blog/'],
|
||||
excludes: ["blog/"],
|
||||
includes: [], // leave empty for all pages
|
||||
limit: 1000,
|
||||
},
|
||||
pageOptions: {
|
||||
onlyMainContent: true
|
||||
}
|
||||
onlyMainContent: true,
|
||||
},
|
||||
};
|
||||
const waitUntilDone = true;
|
||||
const timeout = 5;
|
||||
|
@ -339,10 +339,8 @@ const crawlResult = await app.crawlUrl(
|
|||
waitUntilDone,
|
||||
timeout
|
||||
);
|
||||
|
||||
```
|
||||
|
||||
|
||||
### Checking Crawl Status
|
||||
|
||||
To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
|
||||
|
@ -352,12 +350,10 @@ const status = await app.checkCrawlStatus(jobId);
|
|||
console.log(status);
|
||||
```
|
||||
|
||||
|
||||
|
||||
### Extracting structured data from a URL
|
||||
|
||||
With LLM extraction, you can easily extract structured data from any URL. We support zod schemas to make it even easier for you. Here is how to use it:
|
||||
|
||||
|
||||
```js
|
||||
import FirecrawlApp from "@mendable/firecrawl-js";
|
||||
import { z } from "zod";
|
||||
|
@ -393,17 +389,28 @@ console.log(scrapeResult.data["llm_extraction"]);
|
|||
With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.
|
||||
|
||||
```js
|
||||
const query = 'what is mendable?';
|
||||
const query = "what is mendable?";
|
||||
const searchResults = await app.search(query, {
|
||||
pageOptions: {
|
||||
fetchPageContent: true // Fetch the page content for each search result
|
||||
}
|
||||
fetchPageContent: true, // Fetch the page content for each search result
|
||||
},
|
||||
});
|
||||
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.
|
||||
|
||||
*It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions.*
|
||||
_It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions._
|
||||
|
||||
## License Disclaimer
|
||||
|
||||
This project is primarily licensed under the GNU Affero General Public License v3.0 (AGPL-3.0), as specified in the LICENSE file in the root directory of this repository. However, certain components of this project are licensed under the MIT License. Refer to the LICENSE files in these specific directories for details.
|
||||
|
||||
Please note:
|
||||
|
||||
- The AGPL-3.0 license applies to all parts of the project unless otherwise specified.
|
||||
- The SDKs and some UI components are licensed under the MIT License. Refer to the LICENSE files in these specific directories for details.
|
||||
- When using or contributing to this project, ensure you comply with the appropriate license terms for the specific component you are working with.
|
||||
|
||||
For more details on the licensing of specific components, please refer to the LICENSE files in the respective directories or contact the project maintainers.
|
||||
|
|
176 SELF_HOST.md
|
@ -1,36 +1,77 @@
|
|||
## Self-hosting Firecrawl
|
||||
# Self-hosting Firecrawl
|
||||
|
||||
_We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version._
|
||||
#### Contributor?
|
||||
|
||||
Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.
|
||||
Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project locally so you can run it on your own and contribute.
|
||||
|
||||
## Getting Started
|
||||
If you're contributing, note that the process is similar to other open-source repos, i.e., fork Firecrawl, make changes, run tests, PR.
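Roughly, that workflow looks like the sketch below, assuming the standard GitHub fork model (`<your-username>` and the branch name are placeholders):

```bash
# Fork the repo on GitHub first, then clone your fork and branch off main
git clone https://github.com/<your-username>/firecrawl.git
cd firecrawl
git checkout -b my-change

# ...make your changes and run the relevant tests...

git commit -am "Describe the change"
git push origin my-change
# Finally, open a pull request against mendableai/firecrawl on GitHub
```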
|
||||
|
||||
First, clone this repository and copy the example env file from the API folder `.env.example` to `.env`.
|
||||
If you have any questions or would like help getting on board, join our Discord community [here](https://discord.gg/gSmWdAkdwd) for more information or submit an issue on Github [here](https://github.com/mendableai/firecrawl/issues/new/choose)!
|
||||
|
||||
### Steps
|
||||
## Why?
|
||||
|
||||
1. Clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/mendableai/firecrawl.git
|
||||
cd firecrawl
|
||||
cp ./apps/api/.env.example ./.env
|
||||
```
|
||||
|
||||
2. For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` in `.env` to not use the database authentication:
|
||||
|
||||
```plaintext
|
||||
USE_DB_AUTHENTICATION=false
|
||||
```
|
||||
|
||||
3. Update the Redis URL in the .env file to align with the Docker configuration:
|
||||
|
||||
```plaintext
|
||||
REDIS_URL=redis://redis:6379
|
||||
```
|
||||
|
||||
4. #### Option: Running with TypeScript Playwright Service
|
||||
Self-hosting Firecrawl is particularly beneficial for organizations with stringent security policies that require data to remain within controlled environments. Here are some key reasons to consider self-hosting:
|
||||
|
||||
- **Enhanced Security and Compliance:** By self-hosting, you ensure that all data handling and processing complies with internal and external regulations, keeping sensitive information within your secure infrastructure. Note that Firecrawl is a Mendable product and adheres to SOC 2 Type II standards, meaning the platform follows high industry standards for managing data security.
|
||||
- **Customizable Services:** Self-hosting allows you to tailor the services, such as the Playwright service, to meet specific needs or handle particular use cases that may not be supported by the standard cloud offering.
|
||||
- **Learning and Community Contribution:** By setting up and maintaining your own instance, you gain a deeper understanding of how Firecrawl works, which can also lead to more meaningful contributions to the project.
|
||||
|
||||
### Considerations
|
||||
|
||||
However, there are some limitations and additional responsibilities to be aware of:
|
||||
|
||||
1. **Limited Access to Fire-engine:** Currently, self-hosted instances of Firecrawl do not have access to Fire-engine, which includes advanced features for handling IP blocks, robot detection mechanisms, and more. This means that while you can manage basic scraping tasks, more complex scenarios might require additional configuration or might not be supported.
|
||||
2. **Manual Configuration Required:** If you need to use scraping methods beyond the basic fetch and Playwright options, you will need to manually configure these in the `.env` file. This requires a deeper understanding of the technologies and might involve more setup time.
|
||||
|
||||
Self-hosting Firecrawl is ideal for those who need full control over their scraping and data processing environments but comes with the trade-off of additional maintenance and configuration efforts.
|
||||
|
||||
## Steps
|
||||
|
||||
1. First, start by installing the dependencies
|
||||
|
||||
- Docker [instructions](https://docs.docker.com/get-docker/) (a quick way to verify the install is shown below)
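To confirm the install before moving on, a quick check (these are standard Docker CLI commands and should work on any recent install):

```bash
# Both commands should print a version; if either fails, revisit the Docker install docs above
docker --version
docker compose version
```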
|
||||
|
||||
|
||||
2. Set environment variables
|
||||
|
||||
Create a `.env` file in the root directory; you can copy over the template from `apps/api/.env.example`.
|
||||
|
||||
To start, we won't set up authentication or any optional sub-services (PDF parsing, JS blocking support, AI features).
|
||||
|
||||
`.env:`
|
||||
```
|
||||
# ===== Required ENVS ======
|
||||
NUM_WORKERS_PER_QUEUE=8
|
||||
PORT=3002
|
||||
HOST=0.0.0.0
|
||||
REDIS_URL=redis://redis:6379
|
||||
REDIS_RATE_LIMIT_URL=redis://redis:6379
|
||||
|
||||
## To turn on DB authentication, you need to set up supabase.
|
||||
USE_DB_AUTHENTICATION=false
|
||||
|
||||
# ===== Optional ENVS ======
|
||||
|
||||
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
|
||||
SUPABASE_ANON_TOKEN=
|
||||
SUPABASE_URL=
|
||||
SUPABASE_SERVICE_TOKEN=
|
||||
|
||||
# Other Optionals
|
||||
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
|
||||
SCRAPING_BEE_API_KEY= # Set if you'd like to use ScrapingBee to handle JS blocking
|
||||
OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
|
||||
BULL_AUTH_KEY= @
|
||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
LLAMAPARSE_API_KEY= # Set if you have a LlamaParse key you'd like to use to parse PDFs
|
||||
SERPER_API_KEY= # Set if you have a Serper key you'd like to use as a search API
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
```
|
||||
|
||||
3. *(Optional) Running with TypeScript Playwright Service*
|
||||
|
||||
* Update the `docker-compose.yml` file to change the Playwright service:
|
||||
|
||||
|
@ -49,16 +90,91 @@ First, clone this repository and copy the example env file from the API folder `
|
|||
```
|
||||
|
||||
* Don't forget to set the proxy server in your `.env` file as needed.
|
||||
5. Build and run the Docker containers:
|
||||
|
||||
4. Build and run the Docker containers:
|
||||
|
||||
```bash
|
||||
docker compose build
|
||||
docker compose up
|
||||
```
|
||||
|
||||
|
||||
This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.
|
||||
|
||||
You should be able to see the Bull Queue Manager UI on `http://localhost:3002/admin/@/queues`.
|
||||
|
||||
5. *(Optional)* Test the API
|
||||
|
||||
If you’d like to test the crawl endpoint, you can run this:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:3002/v0/crawl \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"url": "https://mendable.ai"
|
||||
}'
|
||||
```
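The crawl endpoint responds with a `jobId`, which you can poll via the crawl status endpoint to watch the job progress. A minimal sketch, assuming the local instance above with authentication disabled and `jq` installed (if you enabled auth, add an `Authorization: Bearer <your key>` header to both requests):

```bash
# Start a crawl and capture the returned job id
JOB_ID=$(curl -s -X POST http://localhost:3002/v0/crawl \
  -H 'Content-Type: application/json' \
  -d '{"url": "https://mendable.ai"}' | jq -r '.jobId')

# Check on the job; repeat until the status reports "completed"
curl -s http://localhost:3002/v0/crawl/status/$JOB_ID | jq '.status'
```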
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl.
|
||||
|
||||
### Supabase client is not configured
|
||||
|
||||
**Symptom:**
|
||||
```bash
|
||||
[YYYY-MM-DDTHH:MM:SS.SSSz]ERROR - Attempted to access Supabase client when it's not configured.
|
||||
[YYYY-MM-DDTHH:MM:SS.SSSz]ERROR - Error inserting scrape event: Error: Supabase client is not configured.
|
||||
```
|
||||
|
||||
**Explanation:**
|
||||
This message appears because the Supabase client is not configured, which is expected for self-hosted setups: you should still be able to scrape and crawl without problems. Configuring Supabase is not currently possible in self-hosted instances.
|
||||
|
||||
### You're bypassing authentication
|
||||
|
||||
**Symptom:**
|
||||
```bash
|
||||
[YYYY-MM-DDTHH:MM:SS.SSSz]WARN - You're bypassing authentication
|
||||
```
|
||||
|
||||
**Explanation:**
|
||||
This warning appears because the Supabase client is not configured, which is expected for self-hosted setups: you should still be able to scrape and crawl without problems. Configuring Supabase is not currently possible in self-hosted instances.
|
||||
|
||||
### Docker containers fail to start
|
||||
|
||||
**Symptom:**
|
||||
Docker containers exit unexpectedly or fail to start.
|
||||
|
||||
**Solution:**
|
||||
Check the Docker logs for any error messages using the command:
|
||||
```bash
|
||||
docker logs [container_name]
|
||||
```
|
||||
|
||||
- Ensure all required environment variables are set correctly in the .env file.
|
||||
- Verify that all Docker services defined in docker-compose.yml are correctly configured and that the necessary images are available (see the commands after this list).
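For a quick look at what is running and why a container exited, the Compose-level commands below are usually enough (the `api` service name is an assumption; use the service names from your `docker-compose.yaml`):

```bash
# List services and their current state
docker compose ps

# Tail recent logs for everything, or narrow down to a single service
docker compose logs --tail=100
docker compose logs --tail=100 api
```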
|
||||
|
||||
### Connection issues with Redis
|
||||
|
||||
**Symptom:**
|
||||
Errors related to connecting to Redis, such as timeouts or "Connection refused".
|
||||
|
||||
**Solution:**
|
||||
- Ensure that the Redis service is up and running in your Docker environment.
|
||||
- Verify that the REDIS_URL and REDIS_RATE_LIMIT_URL in your .env file point to the correct Redis instance and match the URL used in the `docker-compose.yaml` file (`redis://redis:6379`); a quick connectivity check is shown after this list.
|
||||
- Check network settings and firewall rules that may block the connection to the Redis port.
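A quick connectivity check, assuming the Docker Compose setup above with the Redis service named `redis`:

```bash
# A healthy Redis instance replies with PONG
docker compose exec redis redis-cli ping

# Double-check that the URLs the API will use match the compose service name
grep -E 'REDIS_URL|REDIS_RATE_LIMIT_URL' .env
```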
|
||||
|
||||
### API endpoint does not respond
|
||||
|
||||
**Symptom:**
|
||||
API requests to the Firecrawl instance timeout or return no response.
|
||||
|
||||
**Solution:**
|
||||
- Ensure that the Firecrawl service is running by checking the Docker container status; a basic reachability check is shown after this list.
|
||||
- Verify that the PORT and HOST settings in your .env file are correct and that no other service is using the same port.
|
||||
- Check the network configuration to ensure that the host is accessible from the client making the API request.
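A basic reachability check from the host running the containers, assuming the default `PORT=3002` from the example `.env` (any HTTP response at all, even an error status, means the service is reachable):

```bash
# Expect some HTTP response; "connection refused" means nothing is listening on the port
curl -i http://localhost:3002/

# If the connection is refused, confirm the api container is actually up
docker compose ps
```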
|
||||
|
||||
By addressing these common issues, you can ensure a smoother setup and operation of your self-hosted Firecrawl instance.
|
||||
|
||||
## Install Firecrawl on a Kubernetes Cluster (Simple Version)
|
||||
|
||||
Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
|
||||
Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
|
|
@ -2,7 +2,8 @@
|
|||
NUM_WORKERS_PER_QUEUE=8
|
||||
PORT=3002
|
||||
HOST=0.0.0.0
|
||||
REDIS_URL=redis://localhost:6379
|
||||
REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
|
||||
REDIS_RATE_LIMIT_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
|
||||
PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html
|
||||
|
||||
## To turn on DB authentication, you need to set up supabase.
|
||||
|
@ -16,18 +17,29 @@ SUPABASE_URL=
|
|||
SUPABASE_SERVICE_TOKEN=
|
||||
|
||||
# Other Optionals
|
||||
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
|
||||
RATE_LIMIT_TEST_API_KEY_SCRAPE= # set if you'd like to test the scraping rate limit
|
||||
RATE_LIMIT_TEST_API_KEY_CRAWL= # set if you'd like to test the crawling rate limit
|
||||
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
|
||||
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
|
||||
BULL_AUTH_KEY= @
|
||||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
# use if you've set up authentication and want to test with a real API key
|
||||
TEST_API_KEY=
|
||||
# set if you'd like to test the scraping rate limit
|
||||
RATE_LIMIT_TEST_API_KEY_SCRAPE=
|
||||
# set if you'd like to test the crawling rate limit
|
||||
RATE_LIMIT_TEST_API_KEY_CRAWL=
|
||||
# set if you'd like to use ScrapingBee to handle JS blocking
|
||||
SCRAPING_BEE_API_KEY=
|
||||
# add for LLM-dependent features (image alt generation, etc.)
|
||||
OPENAI_API_KEY=
|
||||
BULL_AUTH_KEY=@
|
||||
# use if you're configuring basic logging with logtail
|
||||
LOGTAIL_KEY=
|
||||
# set if you have a LlamaParse key you'd like to use to parse PDFs
|
||||
LLAMAPARSE_API_KEY=
|
||||
# set if you have a Serper key you'd like to use as a search API
|
||||
SERPER_API_KEY=
|
||||
# set if you'd like to send slack server health status messages
|
||||
SLACK_WEBHOOK_URL=
|
||||
# set if you'd like to send posthog events like job logs
|
||||
POSTHOG_API_KEY=
|
||||
# set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST=
|
||||
|
||||
STRIPE_PRICE_ID_STANDARD=
|
||||
STRIPE_PRICE_ID_SCALE=
|
||||
|
@ -42,7 +54,8 @@ STRIPE_PRICE_ID_GROWTH_YEARLY=
|
|||
HYPERDX_API_KEY=
|
||||
HDX_NODE_BETA_MODE=1
|
||||
|
||||
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
|
||||
# set if you'd like to use the fire engine closed beta
|
||||
FIRE_ENGINE_BETA_URL=
|
||||
|
||||
# Proxy Settings for Playwright (Alternatively, you can use a proxy service like Oxylabs, which rotates IPs for you on every request)
|
||||
PROXY_SERVER=
|
||||
|
@ -56,3 +69,14 @@ SELF_HOSTED_WEBHOOK_URL=
|
|||
|
||||
# Resend API Key for transactional emails
|
||||
RESEND_API_KEY=
|
||||
|
||||
# LOGGING_LEVEL determines the verbosity of logs that the system will output.
|
||||
# Available levels are:
|
||||
# NONE - No logs will be output.
|
||||
# ERROR - For logging error messages that indicate a failure in a specific operation.
|
||||
# WARN - For logging potentially harmful situations that are not necessarily errors.
|
||||
# INFO - For logging informational messages that highlight the progress of the application.
|
||||
# DEBUG - For logging detailed information on the flow through the system, primarily used for debugging.
|
||||
# TRACE - For logging more detailed information than the DEBUG level.
|
||||
# Set LOGGING_LEVEL to one of the above options to control logging output.
|
||||
LOGGING_LEVEL=INFO
|
||||
|
|
|
@ -5,6 +5,7 @@ SUPABASE_ANON_TOKEN=
|
|||
SUPABASE_URL=
|
||||
SUPABASE_SERVICE_TOKEN=
|
||||
REDIS_URL=
|
||||
REDIS_RATE_LIMIT_URL=
|
||||
SCRAPING_BEE_API_KEY=
|
||||
OPENAI_API_KEY=
|
||||
ANTHROPIC_API_KEY=
|
||||
|
|
4 apps/api/.gitignore vendored
|
@ -3,4 +3,6 @@
|
|||
.env
|
||||
*.csv
|
||||
dump.rdb
|
||||
/mongo-data
|
||||
/mongo-data
|
||||
|
||||
/.next/
|
||||
|
|
|
@ -31,6 +31,3 @@ COPY --from=build /app /app
|
|||
# Start the server by default, this can be overwritten at runtime
|
||||
EXPOSE 8080
|
||||
ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium"
|
||||
CMD [ "pnpm", "run", "start:production" ]
|
||||
CMD [ "pnpm", "run", "worker:production" ]
|
||||
|
||||
|
|
|
@ -6,13 +6,13 @@
|
|||
app = 'staging-firecrawl-scraper-js'
|
||||
primary_region = 'mia'
|
||||
kill_signal = 'SIGINT'
|
||||
kill_timeout = '5s'
|
||||
kill_timeout = '30s'
|
||||
|
||||
[build]
|
||||
|
||||
[processes]
|
||||
app = 'npm run start:production'
|
||||
worker = 'npm run worker:production'
|
||||
app = 'node dist/src/index.js'
|
||||
worker = 'node dist/src/services/queue-worker.js'
|
||||
|
||||
[http_service]
|
||||
internal_port = 8080
|
||||
|
|
|
@ -4,15 +4,15 @@
|
|||
#
|
||||
|
||||
app = 'firecrawl-scraper-js'
|
||||
primary_region = 'mia'
|
||||
primary_region = 'iad'
|
||||
kill_signal = 'SIGINT'
|
||||
kill_timeout = '5s'
|
||||
kill_timeout = '30s'
|
||||
|
||||
[build]
|
||||
|
||||
[processes]
|
||||
app = 'npm run start:production'
|
||||
worker = 'npm run worker:production'
|
||||
app = 'node --max-old-space-size=8192 dist/src/index.js'
|
||||
worker = 'node --max-old-space-size=8192 dist/src/services/queue-worker.js'
|
||||
|
||||
[http_service]
|
||||
internal_port = 8080
|
||||
|
@ -24,8 +24,8 @@ kill_timeout = '5s'
|
|||
|
||||
[http_service.concurrency]
|
||||
type = "requests"
|
||||
hard_limit = 100
|
||||
soft_limit = 50
|
||||
hard_limit = 200
|
||||
soft_limit = 75
|
||||
|
||||
[[http_service.checks]]
|
||||
grace_period = "20s"
|
||||
|
|
|
@ -41,33 +41,20 @@
|
|||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
@ -75,34 +62,58 @@
|
|||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"extractorOptions": {
|
||||
"type": "object",
|
||||
"description": "Options for LLM-based extraction of structured information from the page content",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["llm-extraction", "llm-extraction-from-raw-html"],
|
||||
"description": "The extraction mode to use. llm-extraction: Extracts information from the cleaned and parsed content. llm-extraction-from-raw-html: Extracts information directly from the raw HTML."
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page"
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted",
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
|
@ -134,13 +145,52 @@
|
|||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required"
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests"
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error"
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -216,7 +266,12 @@
|
|||
},
|
||||
"allowBackwardCrawling": {
|
||||
"type": "boolean",
|
||||
"description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
|
||||
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
|
||||
"default": false
|
||||
},
|
||||
"allowExternalContentLinks": {
|
||||
"type": "boolean",
|
||||
"description": "Allows the crawler to follow links to external websites.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
|
@ -224,25 +279,32 @@
|
|||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
@ -254,6 +316,21 @@
|
|||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -275,13 +352,52 @@
|
|||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required"
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests"
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error"
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -323,7 +439,12 @@
|
|||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
|
@ -355,13 +476,52 @@
|
|||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required"
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests"
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error"
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -403,14 +563,6 @@
|
|||
"type": "integer",
|
||||
"description": "Current page number"
|
||||
},
|
||||
"current_url": {
|
||||
"type": "string",
|
||||
"description": "Current URL being scraped"
|
||||
},
|
||||
"current_step": {
|
||||
"type": "string",
|
||||
"description": "Current step in the process"
|
||||
},
|
||||
"total": {
|
||||
"type": "integer",
|
||||
"description": "Total number of pages"
|
||||
|
@ -427,7 +579,7 @@
|
|||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
|
||||
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -435,13 +587,52 @@
|
|||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required"
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests"
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error"
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -485,13 +676,52 @@
|
|||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required"
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests"
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error"
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -523,7 +753,12 @@
|
|||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
|
@ -583,7 +818,12 @@
|
|||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
|
|
|
@ -19,12 +19,14 @@
|
|||
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
|
||||
"mongo-docker-console": "docker exec -it mongodb mongosh",
|
||||
"run-example": "npx ts-node src/example.ts",
|
||||
"deploy:fly": "flyctl deploy",
|
||||
"deploy:fly:staging": "fly deploy -c fly.staging.toml"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"devDependencies": {
|
||||
"@flydotio/dockerfile": "^0.4.10",
|
||||
"@jest/globals": "^29.7.0",
|
||||
"@tsconfig/recommended": "^1.0.3",
|
||||
"@types/body-parser": "^1.19.2",
|
||||
"@types/bull": "^4.10.0",
|
||||
|
@ -62,6 +64,7 @@
|
|||
"axios": "^1.3.4",
|
||||
"bottleneck": "^2.19.5",
|
||||
"bull": "^4.15.0",
|
||||
"cacheable-lookup": "^6.1.0",
|
||||
"cheerio": "^1.0.0-rc.12",
|
||||
"cohere": "^1.1.1",
|
||||
"cors": "^2.8.5",
|
||||
|
@ -72,7 +75,7 @@
|
|||
"form-data": "^4.0.0",
|
||||
"glob": "^10.4.2",
|
||||
"gpt3-tokenizer": "^1.1.5",
|
||||
"ioredis": "^5.3.2",
|
||||
"ioredis": "^5.4.1",
|
||||
"joplin-turndown-plugin-gfm": "^1.0.12",
|
||||
"json-schema-to-zod": "^2.3.0",
|
||||
"keyword-extractor": "^0.0.28",
|
||||
|
@ -91,7 +94,7 @@
|
|||
"promptable": "^0.0.10",
|
||||
"puppeteer": "^22.12.1",
|
||||
"rate-limiter-flexible": "2.4.2",
|
||||
"redis": "^4.6.7",
|
||||
"redlock": "5.0.0-beta.2",
|
||||
"resend": "^3.4.0",
|
||||
"robots-parser": "^3.0.1",
|
||||
"scrapingbee": "^1.7.4",
|
||||
|
|
|
@ -59,6 +59,9 @@ importers:
|
|||
bull:
|
||||
specifier: ^4.15.0
|
||||
version: 4.15.0
|
||||
cacheable-lookup:
|
||||
specifier: ^6.1.0
|
||||
version: 6.1.0
|
||||
cheerio:
|
||||
specifier: ^1.0.0-rc.12
|
||||
version: 1.0.0-rc.12
|
||||
|
@ -90,7 +93,7 @@ importers:
|
|||
specifier: ^1.1.5
|
||||
version: 1.1.5
|
||||
ioredis:
|
||||
specifier: ^5.3.2
|
||||
specifier: ^5.4.1
|
||||
version: 5.4.1
|
||||
joplin-turndown-plugin-gfm:
|
||||
specifier: ^1.0.12
|
||||
|
@ -146,9 +149,9 @@ importers:
|
|||
rate-limiter-flexible:
|
||||
specifier: 2.4.2
|
||||
version: 2.4.2
|
||||
redis:
|
||||
specifier: ^4.6.7
|
||||
version: 4.6.14
|
||||
redlock:
|
||||
specifier: 5.0.0-beta.2
|
||||
version: 5.0.0-beta.2
|
||||
resend:
|
||||
specifier: ^3.4.0
|
||||
version: 3.4.0
|
||||
|
@ -192,6 +195,9 @@ importers:
|
|||
'@flydotio/dockerfile':
|
||||
specifier: ^0.4.10
|
||||
version: 0.4.11
|
||||
'@jest/globals':
|
||||
specifier: ^29.7.0
|
||||
version: 29.7.0
|
||||
'@tsconfig/recommended':
|
||||
specifier: ^1.0.3
|
||||
version: 1.0.6
|
||||
|
@ -1940,6 +1946,10 @@ packages:
|
|||
resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==}
|
||||
engines: {node: '>= 0.8'}
|
||||
|
||||
cacheable-lookup@6.1.0:
|
||||
resolution: {integrity: sha512-KJ/Dmo1lDDhmW2XDPMo+9oiy/CeqosPguPCrgcVzKyZrL6pM1gU2GmPY/xo6OQPTUaA/c0kwHuywB4E6nmT9ww==}
|
||||
engines: {node: '>=10.6.0'}
|
||||
|
||||
call-bind@1.0.7:
|
||||
resolution: {integrity: sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
@ -3526,6 +3536,9 @@ packages:
|
|||
resolution: {integrity: sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==}
|
||||
engines: {node: '>= 0.4.0'}
|
||||
|
||||
node-abort-controller@3.1.1:
|
||||
resolution: {integrity: sha512-AGK2yQKIjRuqnc6VkX2Xj5d+QW8xZ87pa1UK6yA6ouUyuxfHuMP6umE5QK7UmTeOAymo+Zx1Fxiuw9rVx8taHQ==}
|
||||
|
||||
node-domexception@1.0.0:
|
||||
resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==}
|
||||
engines: {node: '>=10.5.0'}
|
||||
|
@ -3949,6 +3962,10 @@ packages:
|
|||
redis@4.6.14:
|
||||
resolution: {integrity: sha512-GrNg/e33HtsQwNXL7kJT+iNFPSwE1IPmd7wzV3j4f2z0EYxZfZE7FVTmUysgAtqQQtg5NXF5SNLR9OdO/UHOfw==}
|
||||
|
||||
redlock@5.0.0-beta.2:
|
||||
resolution: {integrity: sha512-2RDWXg5jgRptDrB1w9O/JgSZC0j7y4SlaXnor93H/UJm/QyDiFgBKNtrh0TI6oCXqYSaSoXxFh6Sd3VtYfhRXw==}
|
||||
engines: {node: '>=12'}
|
||||
|
||||
regenerator-runtime@0.14.1:
|
||||
resolution: {integrity: sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==}
|
||||
|
||||
|
@ -4372,8 +4389,8 @@ packages:
|
|||
engines: {node: '>=14.17'}
|
||||
hasBin: true
|
||||
|
||||
typescript@5.5.3:
|
||||
resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==}
|
||||
typescript@5.5.4:
|
||||
resolution: {integrity: sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==}
|
||||
engines: {node: '>=14.17'}
|
||||
hasBin: true
|
||||
|
||||
|
@ -6920,6 +6937,8 @@ snapshots:
|
|||
|
||||
bytes@3.1.2: {}
|
||||
|
||||
cacheable-lookup@6.1.0: {}
|
||||
|
||||
call-bind@1.0.7:
|
||||
dependencies:
|
||||
es-define-property: 1.0.0
|
||||
|
@ -8596,6 +8615,8 @@ snapshots:
|
|||
|
||||
netmask@2.0.2: {}
|
||||
|
||||
node-abort-controller@3.1.1: {}
|
||||
|
||||
node-domexception@1.0.0: {}
|
||||
|
||||
node-ensure@0.0.0: {}
|
||||
|
@ -8930,7 +8951,7 @@ snapshots:
|
|||
csv-parse: 5.5.6
|
||||
gpt3-tokenizer: 1.1.5
|
||||
openai: 3.3.0
|
||||
typescript: 5.5.3
|
||||
typescript: 5.5.4
|
||||
uuid: 9.0.1
|
||||
zod: 3.23.8
|
||||
transitivePeerDependencies:
|
||||
|
@ -9099,6 +9120,10 @@ snapshots:
|
|||
'@redis/search': 1.1.6(@redis/client@1.5.16)
|
||||
'@redis/time-series': 1.0.5(@redis/client@1.5.16)
|
||||
|
||||
redlock@5.0.0-beta.2:
|
||||
dependencies:
|
||||
node-abort-controller: 3.1.1
|
||||
|
||||
regenerator-runtime@0.14.1: {}
|
||||
|
||||
require-directory@2.1.1: {}
|
||||
|
@ -9522,7 +9547,7 @@ snapshots:
|
|||
|
||||
typescript@5.4.5: {}
|
||||
|
||||
typescript@5.5.3: {}
|
||||
typescript@5.5.4: {}
|
||||
|
||||
typesense@1.8.2(@babel/runtime@7.24.6):
|
||||
dependencies:
|
||||
|
|
|
@ -858,7 +858,6 @@ describe("E2E Tests for API Routes", () => {
|
|||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
console.log(crawlData)
|
||||
expect(crawlData.length).toBeGreaterThan(0);
|
||||
expect(crawlData).toEqual(expect.arrayContaining([
|
||||
expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
|
||||
|
|
|
@ -311,7 +311,10 @@ describe("E2E Tests for API Routes", () => {
|
|||
}
|
||||
}
|
||||
|
||||
const completedResponse = response;
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
|
@ -331,7 +334,7 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
}, 180000); // 180 seconds
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
|
||||
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
|
||||
|
@ -363,7 +366,10 @@ describe("E2E Tests for API Routes", () => {
|
|||
}
|
||||
}
|
||||
|
||||
const completedResponse: FirecrawlCrawlStatusResponse = response;
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
|
@ -481,7 +487,7 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.body).toHaveProperty("success");
|
||||
expect(response.body.success).toBe(true);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
}, 30000); // 30 seconds timeout
|
||||
}, 60000); // 60 seconds timeout
|
||||
});
|
||||
|
||||
describe("GET /v0/crawl/status/:jobId", () => {
|
||||
|
@ -513,7 +519,6 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
let completedResponse;
|
||||
|
||||
while (!isCompleted) {
|
||||
const response = await request(TEST_URL)
|
||||
|
@ -524,11 +529,16 @@ describe("E2E Tests for API Routes", () => {
|
|||
|
||||
if (response.body.status === "completed") {
|
||||
isCompleted = true;
|
||||
completedResponse = response;
|
||||
} else {
|
||||
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
|
@ -619,7 +629,13 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("failed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data).toBeNull();
|
||||
|
||||
let isNullOrEmptyArray = false;
|
||||
if (completedResponse.body.data === null || completedResponse.body.data.length === 0) {
|
||||
isNullOrEmptyArray = true;
|
||||
}
|
||||
expect(isNullOrEmptyArray).toBe(true);
|
||||
expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
|
||||
expect(completedResponse.body).toHaveProperty("partial_data");
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
|
||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
|
||||
|
@ -679,61 +695,4 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(typeof llmExtraction.is_open_source).toBe("boolean");
|
||||
}, 60000); // 60 secs
|
||||
});
|
||||
|
||||
describe("POST /v0/crawl with fast mode", () => {
|
||||
it.concurrent("should complete the crawl under 20 seconds", async () => {
|
||||
const startTime = Date.now();
|
||||
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v0/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://flutterbricks.com",
|
||||
crawlerOptions: {
|
||||
mode: "fast"
|
||||
}
|
||||
});
|
||||
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const jobId = crawlResponse.body.jobId;
|
||||
let statusResponse;
|
||||
let isFinished = false;
|
||||
|
||||
while (!isFinished) {
|
||||
statusResponse = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(statusResponse.statusCode).toBe(200);
|
||||
isFinished = statusResponse.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
// const endTime = Date.now();
|
||||
// const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
|
||||
|
||||
// console.log(`Time elapsed: ${timeElapsed} seconds`);
|
||||
|
||||
expect(statusResponse.body.status).toBe("completed");
|
||||
expect(statusResponse.body).toHaveProperty("data");
|
||||
expect(statusResponse.body.data[0]).toHaveProperty("content");
|
||||
expect(statusResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(statusResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
||||
expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined();
|
||||
|
||||
const results = statusResponse.body.data;
|
||||
// results.forEach((result, i) => {
|
||||
// console.log(result.metadata.sourceURL);
|
||||
// });
|
||||
expect(results.length).toBeGreaterThanOrEqual(10);
|
||||
expect(results.length).toBeLessThanOrEqual(15);
|
||||
|
||||
}, 20000);
|
||||
});
|
||||
});
|
||||
|
|
87 apps/api/src/controllers/admin/queue.ts Normal file
|
@ -0,0 +1,87 @@
|
|||
import { Request, Response } from "express";
|
||||
|
||||
import { Job } from "bull";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getWebScraperQueue } from "../../services/queue-service";
|
||||
import { checkAlerts } from "../../services/alerts";
|
||||
|
||||
export async function cleanBefore24hCompleteJobsController(
|
||||
req: Request,
|
||||
res: Response
|
||||
) {
|
||||
Logger.info("🐂 Cleaning jobs older than 24h");
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const batchSize = 10;
|
||||
const numberOfBatches = 9; // Adjust based on your needs
|
||||
const completedJobsPromises: Promise<Job[]>[] = [];
|
||||
for (let i = 0; i < numberOfBatches; i++) {
|
||||
completedJobsPromises.push(
|
||||
webScraperQueue.getJobs(
|
||||
["completed"],
|
||||
i * batchSize,
|
||||
i * batchSize + batchSize,
|
||||
true
|
||||
)
|
||||
);
|
||||
}
|
||||
const completedJobs: Job[] = (
|
||||
await Promise.all(completedJobsPromises)
|
||||
).flat();
|
||||
const before24hJobs =
|
||||
completedJobs.filter(
|
||||
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||
) || [];
|
||||
|
||||
let count = 0;
|
||||
|
||||
if (!before24hJobs) {
|
||||
return res.status(200).send(`No jobs to remove.`);
|
||||
}
|
||||
|
||||
for (const job of before24hJobs) {
|
||||
try {
|
||||
await job.remove();
|
||||
count++;
|
||||
} catch (jobError) {
|
||||
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
|
||||
}
|
||||
}
|
||||
return res.status(200).send(`Removed ${count} completed jobs.`);
|
||||
} catch (error) {
|
||||
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
|
||||
return res.status(500).send("Failed to clean jobs");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export async function checkQueuesController(req: Request, res: Response) {
|
||||
try {
|
||||
await checkAlerts();
|
||||
return res.status(200).send("Alerts initialized");
|
||||
} catch (error) {
|
||||
Logger.debug(`Failed to initialize alerts: ${error}`);
|
||||
return res.status(500).send("Failed to initialize alerts");
|
||||
}
|
||||
}
|
||||
|
||||
// Use this as a "health check" so that we don't bring down the server
|
||||
export async function queuesController(req: Request, res: Response) {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
|
||||
const [webScraperActive] = await Promise.all([
|
||||
webScraperQueue.getActiveCount(),
|
||||
]);
|
||||
|
||||
const noActiveJobs = webScraperActive === 0;
|
||||
// 200 if no active jobs, 500 if there are active jobs
|
||||
return res.status(noActiveJobs ? 200 : 500).json({
|
||||
webScraperActive,
|
||||
noActiveJobs,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
85 apps/api/src/controllers/admin/redis-health.ts Normal file
|
@ -0,0 +1,85 @@
|
|||
import { Request, Response } from "express";
|
||||
import Redis from "ioredis";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { redisRateLimitClient } from "../../services/rate-limiter";
|
||||
|
||||
export async function redisHealthController(req: Request, res: Response) {
|
||||
const retryOperation = async (operation, retries = 3) => {
|
||||
for (let attempt = 1; attempt <= retries; attempt++) {
|
||||
try {
|
||||
return await operation();
|
||||
} catch (error) {
|
||||
if (attempt === retries) throw error;
|
||||
Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
|
||||
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
const queueRedis = new Redis(process.env.REDIS_URL);
|
||||
|
||||
const testKey = "test";
|
||||
const testValue = "test";
|
||||
|
||||
// Test queueRedis
|
||||
let queueRedisHealth;
|
||||
try {
|
||||
await retryOperation(() => queueRedis.set(testKey, testValue));
|
||||
queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
|
||||
await retryOperation(() => queueRedis.del(testKey));
|
||||
} catch (error) {
|
||||
Logger.error(`queueRedis health check failed: ${error}`);
|
||||
queueRedisHealth = null;
|
||||
}
|
||||
|
||||
// Test redisRateLimitClient
|
||||
let redisRateLimitHealth;
|
||||
try {
|
||||
await retryOperation(() => redisRateLimitClient.set(testKey, testValue));
|
||||
redisRateLimitHealth = await retryOperation(() =>
|
||||
redisRateLimitClient.get(testKey)
|
||||
);
|
||||
await retryOperation(() => redisRateLimitClient.del(testKey));
|
||||
} catch (error) {
|
||||
Logger.error(`redisRateLimitClient health check failed: ${error}`);
|
||||
redisRateLimitHealth = null;
|
||||
}
|
||||
|
||||
const healthStatus = {
|
||||
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
|
||||
redisRateLimitClient:
|
||||
redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
|
||||
};
|
||||
|
||||
if (
|
||||
healthStatus.queueRedis === "healthy" &&
|
||||
healthStatus.redisRateLimitClient === "healthy"
|
||||
) {
|
||||
Logger.info("Both Redis instances are healthy");
|
||||
return res.status(200).json({ status: "healthy", details: healthStatus });
|
||||
} else {
|
||||
Logger.info(
|
||||
`Redis instances health check: ${JSON.stringify(healthStatus)}`
|
||||
);
|
||||
// await sendSlackWebhook(
|
||||
// `[REDIS DOWN] Redis instances health check: ${JSON.stringify(
|
||||
// healthStatus
|
||||
// )}`,
|
||||
// true
|
||||
// );
|
||||
return res
|
||||
.status(500)
|
||||
.json({ status: "unhealthy", details: healthStatus });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Redis health check failed: ${error}`);
|
||||
// await sendSlackWebhook(
|
||||
// `[REDIS DOWN] Redis instances health check: ${error.message}`,
|
||||
// true
|
||||
// );
|
||||
return res
|
||||
.status(500)
|
||||
.json({ status: "unhealthy", message: error.message });
|
||||
}
|
||||
}
|
|
@ -1,25 +1,77 @@
|
|||
import { parseApi } from "../../src/lib/parseApi";
|
||||
import { getRateLimiter, } from "../../src/services/rate-limiter";
|
||||
import { AuthResponse, NotificationType, RateLimiterMode } from "../../src/types";
|
||||
import { getRateLimiter } from "../../src/services/rate-limiter";
|
||||
import {
|
||||
AuthResponse,
|
||||
NotificationType,
|
||||
RateLimiterMode,
|
||||
} from "../../src/types";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { withAuth } from "../../src/lib/withAuth";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
|
||||
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
|
||||
import { sendNotification } from "../services/notification/email_notification";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { redlock } from "../../src/services/redlock";
|
||||
import { getValue } from "../../src/services/redis";
|
||||
import { setValue } from "../../src/services/redis";
|
||||
import { validate } from "uuid";
|
||||
|
||||
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
|
||||
function normalizedApiIsUuid(potentialUuid: string): boolean {
|
||||
// Check if the string is a valid UUID
|
||||
return validate(potentialUuid);
|
||||
}
|
||||
export async function authenticateUser(
|
||||
req,
|
||||
res,
|
||||
mode?: RateLimiterMode
|
||||
): Promise<AuthResponse> {
|
||||
return withAuth(supaAuthenticateUser)(req, res, mode);
|
||||
}
|
||||
function setTrace(team_id: string, api_key: string) {
|
||||
try {
|
||||
setTraceAttributes({
|
||||
team_id,
|
||||
api_key
|
||||
api_key,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Error setting trace attributes:', error);
|
||||
Logger.error(`Error setting trace attributes: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
async function getKeyAndPriceId(normalizedApi: string): Promise<{
|
||||
success: boolean;
|
||||
teamId?: string;
|
||||
priceId?: string;
|
||||
error?: string;
|
||||
status?: number;
|
||||
}> {
|
||||
const { data, error } = await supabase_service.rpc("get_key_and_price_id_2", {
|
||||
api_key: normalizedApi,
|
||||
});
|
||||
if (error) {
|
||||
Logger.error(`RPC ERROR (get_key_and_price_id_2): ${error.message}`);
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"The server seems overloaded. Please contact hello@firecrawl.com if you aren't sending too many requests at once.",
|
||||
status: 500,
|
||||
};
|
||||
}
|
||||
if (!data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: data is empty`);
|
||||
// TODO: change this error code ?
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
} else {
|
||||
return {
|
||||
success: true,
|
||||
teamId: data[0].team_id,
|
||||
priceId: data[0].price_id,
|
||||
};
|
||||
}
|
||||
}
|
||||
export async function supaAuthenticateUser(
|
||||
req,
|
||||
|
@ -50,20 +102,83 @@ export async function supaAuthenticateUser(
|
|||
const iptoken = incomingIP + token;
|
||||
|
||||
let rateLimiter: RateLimiterRedis;
|
||||
let subscriptionData: { team_id: string, plan: string } | null = null;
|
||||
let subscriptionData: { team_id: string; plan: string } | null = null;
|
||||
let normalizedApi: string;
|
||||
|
||||
let team_id: string;
|
||||
let cacheKey = "";
|
||||
let redLockKey = "";
|
||||
const lockTTL = 15000; // 15 seconds
|
||||
let teamId: string | null = null;
|
||||
let priceId: string | null = null;
|
||||
|
||||
if (token == "this_is_just_a_preview_token") {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
team_id = "preview";
|
||||
teamId = "preview";
|
||||
} else {
|
||||
normalizedApi = parseApi(token);
|
||||
if (!normalizedApiIsUuid(normalizedApi)) {
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
}
|
||||
|
||||
cacheKey = `api_key:${normalizedApi}`;
|
||||
|
||||
try {
|
||||
const teamIdPriceId = await getValue(cacheKey);
|
||||
if (teamIdPriceId) {
|
||||
const { team_id, price_id } = JSON.parse(teamIdPriceId);
|
||||
teamId = team_id;
|
||||
priceId = price_id;
|
||||
} else {
|
||||
const {
|
||||
success,
|
||||
teamId: tId,
|
||||
priceId: pId,
|
||||
error,
|
||||
status,
|
||||
} = await getKeyAndPriceId(normalizedApi);
|
||||
if (!success) {
|
||||
return { success, error, status };
|
||||
}
|
||||
teamId = tId;
|
||||
priceId = pId;
|
||||
await setValue(
|
||||
cacheKey,
|
||||
JSON.stringify({ team_id: teamId, price_id: priceId }),
|
||||
10
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error with auth function: ${error.message}`);
|
||||
// const {
|
||||
// success,
|
||||
// teamId: tId,
|
||||
// priceId: pId,
|
||||
// error: e,
|
||||
// status,
|
||||
// } = await getKeyAndPriceId(normalizedApi);
|
||||
// if (!success) {
|
||||
// return { success, error: e, status };
|
||||
// }
|
||||
// teamId = tId;
|
||||
// priceId = pId;
|
||||
// const {
|
||||
// success,
|
||||
// teamId: tId,
|
||||
// priceId: pId,
|
||||
// error: e,
|
||||
// status,
|
||||
// } = await getKeyAndPriceId(normalizedApi);
|
||||
// if (!success) {
|
||||
// return { success, error: e, status };
|
||||
// }
|
||||
// teamId = tId;
|
||||
// priceId = pId;
|
||||
}
|
||||
|
||||
const { data, error } = await supabase_service.rpc(
|
||||
'get_key_and_price_id_2', { api_key: normalizedApi }
|
||||
);
|
||||
// get_key_and_price_id_2 rpc definition:
|
||||
// create or replace function get_key_and_price_id_2(api_key uuid)
|
||||
// returns table(key uuid, team_id uuid, price_id text) as $$
|
||||
|
@ -81,43 +196,39 @@ export async function supaAuthenticateUser(
|
|||
// end;
|
||||
// $$ language plpgsql;
|
||||
|
||||
if (error) {
|
||||
console.error('Error fetching key and price_id:', error);
|
||||
} else {
|
||||
// console.log('Key and Price ID:', data);
|
||||
}
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
}
|
||||
const internal_team_id = data[0].team_id;
|
||||
team_id = internal_team_id;
|
||||
|
||||
const plan = getPlanByPriceId(data[0].price_id);
|
||||
const plan = getPlanByPriceId(priceId);
|
||||
// HyperDX Logging
|
||||
setTrace(team_id, normalizedApi);
|
||||
setTrace(teamId, normalizedApi);
|
||||
subscriptionData = {
|
||||
team_id: team_id,
|
||||
plan: plan
|
||||
}
|
||||
team_id: teamId,
|
||||
plan: plan,
|
||||
};
|
||||
switch (mode) {
|
||||
case RateLimiterMode.Crawl:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Crawl,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.Scrape:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Scrape,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.Search:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan);
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Search,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.CrawlStatus:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
break;
|
||||
|
||||
|
||||
case RateLimiterMode.Preview:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
break;
|
||||
|
@ -130,12 +241,13 @@ export async function supaAuthenticateUser(
|
|||
}
|
||||
}
|
||||
|
||||
const team_endpoint_token = token === "this_is_just_a_preview_token" ? iptoken : team_id;
|
||||
const team_endpoint_token =
|
||||
token === "this_is_just_a_preview_token" ? iptoken : teamId;
|
||||
|
||||
try {
|
||||
await rateLimiter.consume(team_endpoint_token);
|
||||
} catch (rateLimiterRes) {
|
||||
console.error(rateLimiterRes);
|
||||
Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
|
||||
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
||||
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
||||
|
||||
|
@ -143,7 +255,17 @@ export async function supaAuthenticateUser(
|
|||
const startDate = new Date();
|
||||
const endDate = new Date();
|
||||
endDate.setDate(endDate.getDate() + 7);
|
||||
|
||||
// await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
|
||||
// Cache longer for 429s
|
||||
if (teamId && priceId && mode !== RateLimiterMode.Preview) {
|
||||
await setValue(
|
||||
cacheKey,
|
||||
JSON.stringify({ team_id: teamId, price_id: priceId }),
|
||||
60 // 60 seconds, cache for everything
|
||||
);
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
||||
|
@ -153,7 +275,9 @@ export async function supaAuthenticateUser(
|
|||
|
||||
if (
|
||||
token === "this_is_just_a_preview_token" &&
|
||||
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
|
||||
(mode === RateLimiterMode.Scrape ||
|
||||
mode === RateLimiterMode.Preview ||
|
||||
mode === RateLimiterMode.Search)
|
||||
) {
|
||||
return { success: true, team_id: "preview" };
|
||||
// check the origin of the request and make sure its from firecrawl.dev
|
||||
|
@ -178,6 +302,7 @@ export async function supaAuthenticateUser(
|
|||
.eq("key", normalizedApi);
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
|
@ -188,24 +313,30 @@ export async function supaAuthenticateUser(
|
|||
subscriptionData = data[0];
|
||||
}
|
||||
|
||||
return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
|
||||
return {
|
||||
success: true,
|
||||
team_id: subscriptionData.team_id,
|
||||
plan: subscriptionData.plan ?? "",
|
||||
};
|
||||
}
|
||||
|
||||
function getPlanByPriceId(price_id: string) {
|
||||
switch (price_id) {
|
||||
case process.env.STRIPE_PRICE_ID_STARTER:
|
||||
return 'starter';
|
||||
return "starter";
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD:
|
||||
return 'standard';
|
||||
return "standard";
|
||||
case process.env.STRIPE_PRICE_ID_SCALE:
|
||||
return 'scale';
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY || process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
|
||||
return 'hobby';
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW || process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
|
||||
return 'standardnew';
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH || process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
|
||||
return 'growth';
|
||||
return "scale";
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY:
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
|
||||
return "hobby";
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
|
||||
return "standardnew";
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH:
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
|
||||
return "growth";
|
||||
default:
|
||||
return 'free';
|
||||
return "free";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
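Note: the authentication path above is a cache-aside lookup — the Redis key api_key:<normalized key> is tried first, then the get_key_and_price_id_2 RPC, and the result is written back with a 10-second TTL (bumped to 60 seconds once a rate limit is hit). A condensed sketch of that flow, reusing the helpers imported above; error handling is omitted:

async function resolveTeamAndPrice(normalizedApi: string) {
  const cacheKey = `api_key:${normalizedApi}`;
  const cached = await getValue(cacheKey);
  if (cached) {
    return JSON.parse(cached) as { team_id: string; price_id: string };
  }
  const { success, teamId, priceId } = await getKeyAndPriceId(normalizedApi);
  if (!success) return null;
  // short TTL so revoked keys do not linger; the 429 branch extends this to 60s
  await setValue(cacheKey, JSON.stringify({ team_id: teamId, price_id: priceId }), 10);
  return { team_id: teamId, price_id: priceId };
}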
|
|
@ -5,9 +5,12 @@ import { addWebScraperJob } from "../../src/services/queue-jobs";
|
|||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { billTeam } from "../../src/services/billing/credit_billing";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
|
@ -22,41 +25,47 @@ export async function crawlCancelController(req: Request, res: Response) {
|
|||
}
|
||||
|
||||
// check if the job belongs to the team
|
||||
const { data, error: supaError } = await supabase_service
|
||||
.from("bulljobs_teams")
|
||||
.select("*")
|
||||
.eq("job_id", req.params.jobId)
|
||||
.eq("team_id", team_id);
|
||||
if (supaError) {
|
||||
return res.status(500).json({ error: supaError.message });
|
||||
if (useDbAuthentication) {
|
||||
const { data, error: supaError } = await supabase_service
|
||||
.from("bulljobs_teams")
|
||||
.select("*")
|
||||
.eq("job_id", req.params.jobId)
|
||||
.eq("team_id", team_id);
|
||||
if (supaError) {
|
||||
return res.status(500).json({ error: supaError.message });
|
||||
}
|
||||
|
||||
if (data.length === 0) {
|
||||
return res.status(403).json({ error: "Unauthorized" });
|
||||
}
|
||||
}
|
||||
|
||||
if (data.length === 0) {
|
||||
return res.status(403).json({ error: "Unauthorized" });
|
||||
}
|
||||
const jobState = await job.getState();
|
||||
const { partialDocs } = await job.progress();
|
||||
|
||||
if (partialDocs && partialDocs.length > 0 && jobState === "active") {
|
||||
console.log("Billing team for partial docs...");
|
||||
Logger.info("Billing team for partial docs...");
|
||||
// Note: the credits that we will bill them here might be lower than the actual
|
||||
// due to promises that are not yet resolved
|
||||
await billTeam(team_id, partialDocs.length);
|
||||
}
|
||||
|
||||
try {
|
||||
await getWebScraperQueue().client.del(job.lockKey());
|
||||
await job.takeLock();
|
||||
await job.discard();
|
||||
await job.moveToFailed(Error("Job cancelled by user"), true);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
}
|
||||
|
||||
const newJobState = await job.getState();
|
||||
|
||||
res.json({
|
||||
status: newJobState === "failed" ? "cancelled" : "Cancelling...",
|
||||
status: "cancelled"
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,6 +3,8 @@ import { authenticateUser } from "./auth";
|
|||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlStatusController(req: Request, res: Response) {
|
||||
try {
|
||||
|
@ -20,18 +22,30 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||
}
|
||||
|
||||
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||
|
||||
let data = job.returnvalue;
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
|
||||
if (supabaseData) {
|
||||
data = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
const jobStatus = await job.getState();
|
||||
|
||||
res.json({
|
||||
status: await job.getState(),
|
||||
status: jobStatus,
|
||||
// progress: job.progress(),
|
||||
current: current,
|
||||
current_url: current_url,
|
||||
current_step: current_step,
|
||||
total: total,
|
||||
data: job.returnvalue,
|
||||
partial_data: partialDocs ?? [],
|
||||
current,
|
||||
current_url,
|
||||
current_step,
|
||||
total,
|
||||
data: data ? data : null,
|
||||
partial_data: jobStatus == 'completed' ? [] : partialDocs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
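Note: the status payload above is { status, current, current_url, current_step, total, data, partial_data }, with data read from Supabase when USE_DB_AUTHENTICATION is enabled and partial_data emptied once the job completes. A client-side polling sketch against that shape; the /v0/crawl/status/:jobId path and the helper names are assumptions, not taken from this diff:

type CrawlStatus = {
  status: string;
  current?: number;
  current_url?: string;
  current_step?: string;
  total?: number;
  data: unknown[] | null;
  partial_data: unknown[];
};

async function waitForCrawl(baseUrl: string, apiKey: string, jobId: string): Promise<CrawlStatus> {
  while (true) {
    const res = await fetch(`${baseUrl}/v0/crawl/status/${jobId}`, { // assumed route
      headers: { Authorization: `Bearer ${apiKey}` },
    });
    const body = (await res.json()) as CrawlStatus;
    if (body.status === "completed" || body.status === "failed") return body;
    await new Promise((r) => setTimeout(r, 2000)); // poll every 2 seconds
  }
}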
|
@ -10,6 +10,8 @@ import { logCrawl } from "../../src/services/logging/crawl_log";
|
|||
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
|
@ -30,7 +32,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||
try {
|
||||
createIdempotencyKey(req);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -60,10 +62,11 @@ export async function crawlController(req: Request, res: Response) {
|
|||
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
if (mode === "single_urls" && !url.includes(",")) {
|
||||
if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
try {
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
jobId: uuidv4(),
|
||||
mode: "single_urls",
|
||||
urls: [url],
|
||||
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
|
@ -83,7 +86,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||
documents: docs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
@ -101,7 +104,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||
|
||||
res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ import { authenticateUser } from "./auth";
|
|||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addWebScraperJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
|
@ -39,7 +40,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||
|
||||
res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
6
apps/api/src/controllers/liveness.ts
Normal file
6
apps/api/src/controllers/liveness.ts
Normal file
|
@ -0,0 +1,6 @@
|
|||
import { Request, Response } from "express";

export async function livenessController(req: Request, res: Response) {
  // TODO: add checks if the application is live and healthy like checking the redis connection
  res.status(200).json({ status: "ok" });
}
|
6
apps/api/src/controllers/readiness.ts
Normal file
6
apps/api/src/controllers/readiness.ts
Normal file
|
@ -0,0 +1,6 @@
|
|||
import { Request, Response } from "express";

export async function readinessController(req: Request, res: Response) {
  // TODO: add checks when the application is ready to serve traffic
  res.status(200).json({ status: "ok" });
}
|
|
@ -9,8 +9,11 @@ import { Document } from "../lib/entities";
|
|||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from '../lib/logger';
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
|
@ -35,6 +38,7 @@ export async function scrapeHelper(
|
|||
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
jobId,
|
||||
mode: "single_urls",
|
||||
urls: [url],
|
||||
crawlerOptions: {
|
||||
|
@ -73,28 +77,6 @@ export async function scrapeHelper(
|
|||
});
|
||||
}
|
||||
|
||||
let creditsToBeBilled = filteredDocs.length;
|
||||
const creditsPerLLMExtract = 50;
|
||||
|
||||
|
||||
|
||||
if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
|
||||
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
}
|
||||
|
||||
const billingResult = await billTeam(
|
||||
team_id,
|
||||
creditsToBeBilled
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
returnCode: 402,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: filteredDocs[0],
|
||||
|
@ -104,6 +86,7 @@ export async function scrapeHelper(
|
|||
|
||||
export async function scrapeController(req: Request, res: Response) {
|
||||
try {
|
||||
let earlyReturn = false;
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
|
@ -113,27 +96,40 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
|
||||
if (extractorOptions.mode === "llm-extraction") {
|
||||
pageOptions.onlyMainContent = true;
|
||||
}
|
||||
const origin = req.body.origin ?? defaultOrigin;
|
||||
const timeout = req.body.timeout ?? defaultTimeout;
|
||||
let timeout = req.body.timeout ?? defaultTimeout;
|
||||
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
return res.status(500).json({ error: "Internal server error" });
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
pageOptions.onlyMainContent = true;
|
||||
timeout = req.body.timeout ?? 90000;
|
||||
}
|
||||
|
||||
const checkCredits = async () => {
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
earlyReturn = true;
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
earlyReturn = true;
|
||||
return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
await checkCredits();
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const result = await scrapeHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
|
@ -146,7 +142,35 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
|
||||
|
||||
if (result.success) {
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
const creditsPerLLMExtract = 50;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
creditsToBeBilled += creditsPerLLMExtract;
|
||||
}
|
||||
|
||||
let startTimeBilling = new Date().getTime();
|
||||
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
const billingResult = await billTeam(
|
||||
team_id,
|
||||
creditsToBeBilled
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error: "Failed to bill team. Insufficient credits or subscription not found.",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: 1,
|
||||
|
@ -161,9 +185,12 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
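Note: billing in the scrape path is one credit per returned document plus a flat 50 when any llm-extraction mode ran, and it is skipped entirely when the credit check already sent an early response. A small sketch of that calculation, mirroring the constants above:

const CREDITS_PER_LLM_EXTRACT = 50; // same value as creditsPerLLMExtract above

function creditsForScrape(extractorMode: string): number {
  let credits = 1; // one credit for the scraped document
  if (extractorMode.includes("llm-extraction")) {
    credits += CREDITS_PER_LLM_EXTRACT;
  }
  return credits;
}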
|
@ -7,8 +7,11 @@ import { logJob } from "../services/logging/log_job";
|
|||
import { PageOptions, SearchOptions } from "../lib/entities";
|
||||
import { search } from "../search";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
export async function searchHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
|
@ -75,8 +78,9 @@ export async function searchHelper(
|
|||
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
jobId,
|
||||
mode: "single_urls",
|
||||
urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7),
|
||||
urls: res.map((r) => r.url).slice(0, Math.min(searchOptions.limit ?? 5, 5)),
|
||||
crawlerOptions: {
|
||||
...crawlerOptions,
|
||||
},
|
||||
|
@ -146,7 +150,10 @@ export async function searchController(req: Request, res: Response) {
|
|||
};
|
||||
const origin = req.body.origin ?? "api";
|
||||
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 7 };
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 5 };
|
||||
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
|
@ -155,11 +162,12 @@ export async function searchController(req: Request, res: Response) {
|
|||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: "Internal server error" });
|
||||
}
|
||||
const startTime = new Date().getTime();
|
||||
const result = await searchHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
|
@ -169,6 +177,7 @@ export async function searchController(req: Request, res: Response) {
|
|||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: result.data ? result.data.length : 0,
|
||||
|
@ -183,7 +192,7 @@ export async function searchController(req: Request, res: Response) {
|
|||
});
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
import { Request, Response } from "express";
|
||||
import { getWebScraperQueue } from "../../src/services/queue-service";
|
||||
import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
|
||||
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
|
||||
try {
|
||||
|
@ -9,18 +11,32 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
|||
}
|
||||
|
||||
const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||
let data = job.returnvalue;
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
|
||||
if (supabaseData) {
|
||||
data = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
let jobStatus = await job.getState();
|
||||
if (jobStatus === 'waiting' || jobStatus === 'stuck') {
|
||||
jobStatus = 'active';
|
||||
}
|
||||
|
||||
res.json({
|
||||
status: await job.getState(),
|
||||
status: jobStatus,
|
||||
// progress: job.progress(),
|
||||
current: current,
|
||||
current_url: current_url,
|
||||
current_step: current_step,
|
||||
total: total,
|
||||
data: job.returnvalue,
|
||||
partial_data: partialDocs ?? [],
|
||||
current,
|
||||
current_url,
|
||||
current_step,
|
||||
total,
|
||||
data: data ? data : null,
|
||||
partial_data: jobStatus == 'completed' ? [] : partialDocs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ async function example() {
|
|||
const example = new WebScraperDataProvider();
|
||||
|
||||
await example.setOptions({
|
||||
jobId: "TEST",
|
||||
mode: "crawl",
|
||||
urls: ["https://mendable.ai"],
|
||||
crawlerOptions: {},
|
||||
|
|
|
@ -3,22 +3,34 @@ import bodyParser from "body-parser";
|
|||
import cors from "cors";
|
||||
import "dotenv/config";
|
||||
import { getWebScraperQueue } from "./services/queue-service";
|
||||
import { redisClient } from "./services/rate-limiter";
|
||||
import { v0Router } from "./routes/v0";
|
||||
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||
import cluster from "cluster";
|
||||
import os from "os";
|
||||
import { Job } from "bull";
|
||||
import { Logger } from "./lib/logger";
|
||||
import { adminRouter } from "./routes/admin";
|
||||
import { ScrapeEvents } from "./lib/scrape-events";
|
||||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import CacheableLookup from 'cacheable-lookup';
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
const { ExpressAdapter } = require("@bull-board/express");
|
||||
|
||||
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
||||
console.log(`Number of CPUs: ${numCPUs} available`);
|
||||
Logger.info(`Number of CPUs: ${numCPUs} available`);
|
||||
|
||||
const cacheable = new CacheableLookup({
|
||||
// this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme
|
||||
lookup:false
|
||||
});
|
||||
|
||||
cacheable.install(http.globalAgent);
|
||||
cacheable.install(https.globalAgent)
|
||||
|
||||
if (cluster.isMaster) {
|
||||
console.log(`Master ${process.pid} is running`);
|
||||
Logger.info(`Master ${process.pid} is running`);
|
||||
|
||||
// Fork workers.
|
||||
for (let i = 0; i < numCPUs; i++) {
|
||||
|
@ -26,9 +38,11 @@ if (cluster.isMaster) {
|
|||
}
|
||||
|
||||
cluster.on("exit", (worker, code, signal) => {
|
||||
console.log(`Worker ${worker.process.pid} exited`);
|
||||
console.log("Starting a new worker");
|
||||
cluster.fork();
|
||||
if (code !== null) {
|
||||
Logger.info(`Worker ${worker.process.pid} exited`);
|
||||
Logger.info("Starting a new worker");
|
||||
cluster.fork();
|
||||
}
|
||||
});
|
||||
} else {
|
||||
const app = express();
|
||||
|
@ -64,10 +78,10 @@ if (cluster.isMaster) {
|
|||
|
||||
// register router
|
||||
app.use(v0Router);
|
||||
app.use(adminRouter);
|
||||
|
||||
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
const HOST = process.env.HOST ?? "localhost";
|
||||
redisClient.connect();
|
||||
|
||||
// HyperDX OpenTelemetry
|
||||
if (process.env.ENV === "production") {
|
||||
|
@ -76,14 +90,9 @@ if (cluster.isMaster) {
|
|||
|
||||
function startServer(port = DEFAULT_PORT) {
|
||||
const server = app.listen(Number(port), HOST, () => {
|
||||
console.log(`Worker ${process.pid} listening on port ${port}`);
|
||||
console.log(
|
||||
`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
||||
);
|
||||
console.log("");
|
||||
console.log("1. Make sure Redis is running on port 6379 by default");
|
||||
console.log(
|
||||
"2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 "
|
||||
Logger.info(`Worker ${process.pid} listening on port ${port}`);
|
||||
Logger.info(
|
||||
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
||||
);
|
||||
});
|
||||
return server;
|
||||
|
@ -93,26 +102,6 @@ if (cluster.isMaster) {
|
|||
startServer();
|
||||
}
|
||||
|
||||
// Use this as a "health check" that way we dont destroy the server
|
||||
app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const [webScraperActive] = await Promise.all([
|
||||
webScraperQueue.getActiveCount(),
|
||||
]);
|
||||
|
||||
const noActiveJobs = webScraperActive === 0;
|
||||
// 200 if no active jobs, 500 if there are active jobs
|
||||
return res.status(noActiveJobs ? 200 : 500).json({
|
||||
webScraperActive,
|
||||
noActiveJobs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
app.get(`/serverHealthCheck`, async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
|
@ -126,7 +115,7 @@ if (cluster.isMaster) {
|
|||
waitingJobs,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
|
@ -171,13 +160,13 @@ if (cluster.isMaster) {
|
|||
});
|
||||
|
||||
if (!response.ok) {
|
||||
console.error("Failed to send Slack notification");
|
||||
Logger.error("Failed to send Slack notification");
|
||||
}
|
||||
}
|
||||
}, timeout);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.debug(error);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -185,52 +174,20 @@ if (cluster.isMaster) {
|
|||
}
|
||||
});
|
||||
|
||||
app.get(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
|
||||
async (req, res) => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const batchSize = 10;
|
||||
const numberOfBatches = 9; // Adjust based on your needs
|
||||
const completedJobsPromises: Promise<Job[]>[] = [];
|
||||
for (let i = 0; i < numberOfBatches; i++) {
|
||||
completedJobsPromises.push(webScraperQueue.getJobs(
|
||||
["completed"],
|
||||
i * batchSize,
|
||||
i * batchSize + batchSize,
|
||||
true
|
||||
));
|
||||
}
|
||||
const completedJobs: Job[] = (await Promise.all(completedJobsPromises)).flat();
|
||||
const before24hJobs = completedJobs.filter(
|
||||
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||
) || [];
|
||||
|
||||
let count = 0;
|
||||
|
||||
if (!before24hJobs) {
|
||||
return res.status(200).send(`No jobs to remove.`);
|
||||
}
|
||||
|
||||
for (const job of before24hJobs) {
|
||||
try {
|
||||
await job.remove()
|
||||
count++;
|
||||
} catch (jobError) {
|
||||
console.error(`Failed to remove job with ID ${job.id}:`, jobError);
|
||||
}
|
||||
}
|
||||
return res.status(200).send(`Removed ${count} completed jobs.`);
|
||||
} catch (error) {
|
||||
console.error("Failed to clean last 24h complete jobs:", error);
|
||||
return res.status(500).send("Failed to clean jobs");
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
app.get("/is-production", (req, res) => {
|
||||
res.send({ isProduction: global.isProduction });
|
||||
});
|
||||
|
||||
console.log(`Worker ${process.pid} started`);
|
||||
Logger.info(`Worker ${process.pid} started`);
|
||||
}
|
||||
|
||||
const wsq = getWebScraperQueue();
|
||||
|
||||
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
|||
|
||||
import { generateOpenAICompletions } from "./models";
|
||||
import { Document, ExtractorOptions } from "../entities";
|
||||
import { Logger } from "../logger";
|
||||
|
||||
// Generate completion using OpenAI
|
||||
export async function generateCompletions(
|
||||
|
@ -44,7 +45,7 @@ export async function generateCompletions(
|
|||
|
||||
return completionResult;
|
||||
} catch (error) {
|
||||
console.error(`Error generating completions: ${error}`);
|
||||
Logger.error(`Error generating completions: ${error}`);
|
||||
throw new Error(`Error generating completions: ${error.message}`);
|
||||
}
|
||||
default:
|
||||
|
|
|
@ -48,7 +48,7 @@ function prepareOpenAIDoc(
|
|||
|
||||
export async function generateOpenAICompletions({
|
||||
client,
|
||||
model = "gpt-4o",
|
||||
model = process.env.MODEL_NAME || "gpt-4o",
|
||||
document,
|
||||
schema, //TODO - add zod dynamic type checking
|
||||
prompt = defaultPrompt,
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
export const defaultOrigin = "api";
|
||||
|
||||
export const defaultTimeout = 30000; // 30 seconds
|
||||
export const defaultTimeout = 45000; // 45 seconds
|
||||
|
||||
export const defaultPageOptions = {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
parsePDF: true
|
||||
};
|
||||
|
||||
|
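Note: these defaults are spread underneath request-supplied options, so callers only override the fields they actually send. The same composition appears in the controllers above:

const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; // request values win
const timeout = req.body.timeout ?? defaultTimeout; // 45s unless the caller sets one (90s for llm-extraction)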
|
|
@ -18,6 +18,7 @@ export type PageOptions = {
|
|||
fetchPageContent?: boolean;
|
||||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
headers?: Record<string, string>;
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
parsePDF?: boolean;
|
||||
|
@ -42,8 +43,8 @@ export type SearchOptions = {
|
|||
|
||||
export type CrawlerOptions = {
|
||||
returnOnlyUrls?: boolean;
|
||||
includes?: string[];
|
||||
excludes?: string[];
|
||||
includes?: string | string[];
|
||||
excludes?: string | string[];
|
||||
maxCrawledLinks?: number;
|
||||
maxDepth?: number;
|
||||
limit?: number;
|
||||
|
@ -56,6 +57,7 @@ export type CrawlerOptions = {
|
|||
}
|
||||
|
||||
export type WebScraperOptions = {
|
||||
jobId: string;
|
||||
urls: string[];
|
||||
mode: "single_urls" | "sitemap" | "crawl";
|
||||
crawlerOptions?: CrawlerOptions;
|
||||
|
@ -89,7 +91,8 @@ export class Document {
|
|||
warning?: string;
|
||||
|
||||
index?: number;
|
||||
|
||||
linksOnPage?: string[]; // Add this new field as a separate property
|
||||
|
||||
constructor(data: Partial<Document>) {
|
||||
if (!data.content) {
|
||||
throw new Error("Missing required fields");
|
||||
|
@ -102,6 +105,7 @@ export class Document {
|
|||
this.markdown = data.markdown || "";
|
||||
this.childrenLinks = data.childrenLinks || undefined;
|
||||
this.provider = data.provider || undefined;
|
||||
this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -129,3 +133,12 @@ export interface FireEngineResponse {
|
|||
pageError?: string;
|
||||
}
|
||||
|
||||
|
||||
export interface FireEngineOptions {
  mobileProxy?: boolean;
  method?: string;
  engine?: string;
  blockMedia?: boolean;
  blockAds?: boolean;
  disableJsDom?: boolean;
}
|
||||
|
|
53
apps/api/src/lib/logger.ts
Normal file
53
apps/api/src/lib/logger.ts
Normal file
|
@ -0,0 +1,53 @@
|
|||
enum LogLevel {
  NONE = 'NONE', // No logs will be output.
  ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
  WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors.
  INFO = 'INFO', // For logging informational messages that highlight the progress of the application.
  DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging.
  TRACE = 'TRACE' // For logging more detailed information than the DEBUG level.
}
export class Logger {
  static colors = {
    ERROR: '\x1b[31m%s\x1b[0m', // Red
    WARN: '\x1b[33m%s\x1b[0m', // Yellow
    INFO: '\x1b[34m%s\x1b[0m', // Blue
    DEBUG: '\x1b[36m%s\x1b[0m', // Cyan
    TRACE: '\x1b[35m%s\x1b[0m' // Magenta
  };

  static log (message: string, level: LogLevel) {
    const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO;
    const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
    const currentLevelIndex = levels.indexOf(logLevel);
    const messageLevelIndex = levels.indexOf(level);

    if (currentLevelIndex >= messageLevelIndex) {
      const color = Logger.colors[level];
      console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);

      // if (process.env.USE_DB_AUTH) {
      //   save to supabase? another place?
      //   supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
      // }
    }
  }
  static error(message: string | any) {
    Logger.log(message, LogLevel.ERROR);
  }

  static warn(message: string) {
    Logger.log(message, LogLevel.WARN);
  }

  static info(message: string) {
    Logger.log(message, LogLevel.INFO);
  }

  static debug(message: string) {
    Logger.log(message, LogLevel.DEBUG);
  }

  static trace(message: string) {
    Logger.log(message, LogLevel.TRACE);
  }
}
|
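Note: verbosity is driven by the LOGGING_LEVEL environment variable (default INFO); a message prints when it is at least as severe as the configured level, using the ANSI color table above. A short usage sketch:

// LOGGING_LEVEL=DEBUG also surfaces Logger.debug; TRACE surfaces everything
Logger.info(`Worker ${process.pid} started`);
Logger.debug("only printed when LOGGING_LEVEL is DEBUG or TRACE");
Logger.error(`health check failed: ${new Error("boom").message}`);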
84
apps/api/src/lib/scrape-events.ts
Normal file
84
apps/api/src/lib/scrape-events.ts
Normal file
|
@ -0,0 +1,84 @@
|
|||
import { Job, JobId } from "bull";
|
||||
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
||||
import { supabase_service as supabase } from "../services/supabase";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
export type ScrapeErrorEvent = {
|
||||
type: "error",
|
||||
message: string,
|
||||
stack?: string,
|
||||
}
|
||||
|
||||
export type ScrapeScrapeEvent = {
|
||||
type: "scrape",
|
||||
url: string,
|
||||
worker?: string,
|
||||
method: (typeof baseScrapers)[number],
|
||||
result: null | {
|
||||
success: boolean,
|
||||
response_code?: number,
|
||||
response_size?: number,
|
||||
error?: string | object,
|
||||
// proxy?: string,
|
||||
time_taken: number,
|
||||
},
|
||||
}
|
||||
|
||||
export type ScrapeQueueEvent = {
|
||||
type: "queue",
|
||||
event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed",
|
||||
worker?: string,
|
||||
}
|
||||
|
||||
export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent;
|
||||
|
||||
export class ScrapeEvents {
|
||||
static async insert(jobId: string, content: ScrapeEvent) {
|
||||
if (jobId === "TEST") return null;
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION) {
|
||||
try {
|
||||
const result = await supabase.from("scrape_events").insert({
|
||||
job_id: jobId,
|
||||
type: content.type,
|
||||
content: content,
|
||||
// created_at
|
||||
}).select().single();
|
||||
return (result.data as any).id;
|
||||
} catch (error) {
|
||||
// Logger.error(`Error inserting scrape event: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) {
|
||||
if (logId === null) return;
|
||||
|
||||
try {
|
||||
const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any;
|
||||
await supabase.from("scrape_events").update({
|
||||
content: {
|
||||
...previousLog.content,
|
||||
result,
|
||||
}
|
||||
}).eq("id", logId);
|
||||
} catch (error) {
|
||||
Logger.error(`Error updating scrape result: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) {
|
||||
try {
|
||||
await this.insert(((job as any).id ? (job as any).id : job) as string, {
|
||||
type: "queue",
|
||||
event,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Error logging job event: ${error}`);
|
||||
}
|
||||
}
|
||||
}
|
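Note: ScrapeEvents only persists when USE_DB_AUTHENTICATION is set; insert() returns the scrape_events row id (or null), which updateScrapeResult() later uses to attach the outcome. A usage sketch; jobId, url and the "fetch" method literal are illustrative:

const logId = await ScrapeEvents.insert(jobId, {
  type: "scrape",
  url,
  worker: process.env.FLY_MACHINE_ID,
  method: "fetch", // assumed to be one of baseScrapers
  result: null,
});

// ...after the scrape attempt finishes
await ScrapeEvents.updateScrapeResult(logId, {
  success: true,
  response_code: 200,
  time_taken: 1230, // unit is not stated in this diff
});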
19
apps/api/src/lib/supabase-jobs.ts
Normal file
19
apps/api/src/lib/supabase-jobs.ts
Normal file
|
@ -0,0 +1,19 @@
|
|||
import { supabase_service } from "../services/supabase";

export const supabaseGetJobById = async (jobId: string) => {
  const { data, error } = await supabase_service
    .from('firecrawl_jobs')
    .select('*')
    .eq('job_id', jobId)
    .single();

  if (error) {
    return null;
  }

  if (!data) {
    return null;
  }

  return data;
}
|
|
@ -1,4 +1,5 @@
|
|||
import { AuthResponse } from "../../src/types";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
let warningCount = 0;
|
||||
|
||||
|
@ -8,7 +9,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
|||
return async function (...args: U): Promise<T> {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
if (warningCount < 5) {
|
||||
console.warn("WARNING - You're bypassing authentication");
|
||||
Logger.warn("You're bypassing authentication");
|
||||
warningCount++;
|
||||
}
|
||||
return { success: true } as T;
|
||||
|
@ -16,7 +17,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
|
|||
try {
|
||||
return await originalFunction(...args);
|
||||
} catch (error) {
|
||||
console.error("Error in withAuth function: ", error);
|
||||
Logger.error(`Error in withAuth function: ${error}`);
|
||||
return { success: false, error: error.message } as T;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,9 +1,17 @@
|
|||
import { Job } from "bull";
|
||||
import { CrawlResult, WebScraperOptions, RunWebScraperParams, RunWebScraperResult } from "../types";
|
||||
import {
|
||||
CrawlResult,
|
||||
WebScraperOptions,
|
||||
RunWebScraperParams,
|
||||
RunWebScraperResult,
|
||||
} from "../types";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { DocumentUrl, Progress } from "../lib/entities";
|
||||
import { billTeam } from "../services/billing/credit_billing";
|
||||
import { Document } from "../lib/entities";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
|
||||
export async function startWebScraperPipeline({
|
||||
job,
|
||||
|
@ -17,6 +25,7 @@ export async function startWebScraperPipeline({
|
|||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
inProgress: (progress) => {
|
||||
Logger.debug(`🐂 Job in progress ${job.id}`);
|
||||
if (progress.currentDocument) {
|
||||
partialDocs.push(progress.currentDocument);
|
||||
if (partialDocs.length > 50) {
|
||||
|
@ -26,9 +35,12 @@ export async function startWebScraperPipeline({
|
|||
}
|
||||
},
|
||||
onSuccess: (result) => {
|
||||
job.moveToCompleted(result);
|
||||
Logger.debug(`🐂 Job completed ${job.id}`);
|
||||
saveJob(job, result);
|
||||
},
|
||||
onError: (error) => {
|
||||
Logger.error(`🐂 Job failed ${job.id}`);
|
||||
ScrapeEvents.logJobEvent(job, "failed");
|
||||
job.moveToFailed(error);
|
||||
},
|
||||
team_id: job.data.team_id,
|
||||
|
@ -50,6 +62,7 @@ export async function runWebScraper({
|
|||
const provider = new WebScraperDataProvider();
|
||||
if (mode === "crawl") {
|
||||
await provider.setOptions({
|
||||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: [url],
|
||||
crawlerOptions: crawlerOptions,
|
||||
|
@ -58,6 +71,7 @@ export async function runWebScraper({
|
|||
});
|
||||
} else {
|
||||
await provider.setOptions({
|
||||
jobId: bull_job_id,
|
||||
mode: mode,
|
||||
urls: url.split(","),
|
||||
crawlerOptions: crawlerOptions,
|
||||
|
@ -102,8 +116,34 @@ export async function runWebScraper({
|
|||
// this return doesn't matter too much for the job completion result
|
||||
return { success: true, message: "", docs: filteredDocs };
|
||||
} catch (error) {
|
||||
console.error("Error running web scraper", error);
|
||||
onError(error);
|
||||
return { success: false, message: error.message, docs: [] };
|
||||
}
|
||||
}
|
||||
|
||||
const saveJob = async (job: Job, result: any) => {
|
||||
try {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const { data, error } = await supabase_service
|
||||
.from("firecrawl_jobs")
|
||||
.update({ docs: result })
|
||||
.eq("job_id", job.id);
|
||||
|
||||
if (error) throw new Error(error.message);
|
||||
try {
|
||||
await job.moveToCompleted(null, false, false);
|
||||
} catch (error) {
|
||||
// I think the job won't exist here anymore
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
await job.moveToCompleted(result, false, false);
|
||||
} catch (error) {
|
||||
// I think the job won't exist here anymore
|
||||
}
|
||||
}
|
||||
ScrapeEvents.logJobEvent(job, "completed");
|
||||
} catch (error) {
|
||||
Logger.error(`🐂 Failed to update job status: ${error}`);
|
||||
}
|
||||
};
|
||||
|
|
29
apps/api/src/routes/admin.ts
Normal file
29
apps/api/src/routes/admin.ts
Normal file
|
@ -0,0 +1,29 @@
|
|||
import express from "express";
|
||||
import { redisHealthController } from "../controllers/admin/redis-health";
|
||||
import {
|
||||
checkQueuesController,
|
||||
cleanBefore24hCompleteJobsController,
|
||||
queuesController,
|
||||
} from "../controllers/admin/queue";
|
||||
|
||||
export const adminRouter = express.Router();
|
||||
|
||||
adminRouter.get(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
|
||||
redisHealthController
|
||||
);
|
||||
|
||||
adminRouter.get(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
|
||||
cleanBefore24hCompleteJobsController
|
||||
);
|
||||
|
||||
adminRouter.get(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
|
||||
checkQueuesController
|
||||
);
|
||||
|
||||
adminRouter.get(
|
||||
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
||||
queuesController
|
||||
);
|
|
@ -7,6 +7,8 @@ import { crawlJobStatusPreviewController } from "../../src/controllers/status";
|
|||
import { searchController } from "../../src/controllers/search";
|
||||
import { crawlCancelController } from "../../src/controllers/crawl-cancel";
|
||||
import { keyAuthController } from "../../src/controllers/keyAuth";
|
||||
import { livenessController } from "../controllers/liveness";
|
||||
import { readinessController } from "../controllers/readiness";
|
||||
|
||||
export const v0Router = express.Router();
|
||||
|
||||
|
@ -23,3 +25,6 @@ v0Router.get("/v0/keyAuth", keyAuthController);
|
|||
// Search routes
|
||||
v0Router.post("/v0/search", searchController);
|
||||
|
||||
// Health/Probe routes
|
||||
v0Router.get("/v0/health/liveness", livenessController);
|
||||
v0Router.get("/v0/health/readiness", readinessController);
|
||||
|
|
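Note: both probes above currently return 200 unconditionally (see the TODOs in liveness.ts and readiness.ts), so they only prove the process is serving HTTP. A sketch of wiring the readiness route into a deployment check, assuming a runtime with global fetch and a caller-supplied base URL:

async function waitUntilReady(baseUrl: string, attempts = 30): Promise<void> {
  for (let i = 0; i < attempts; i++) {
    try {
      const res = await fetch(`${baseUrl}/v0/health/readiness`);
      if (res.status === 200) return;
    } catch {
      // not up yet; fall through and retry
    }
    await new Promise((r) => setTimeout(r, 1000));
  }
  throw new Error("readiness probe never returned 200");
}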
|
@ -42,6 +42,7 @@ describe('WebCrawler', () => {
|
|||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
|
@ -76,6 +77,7 @@ describe('WebCrawler', () => {
|
|||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
|
@ -104,6 +106,7 @@ describe('WebCrawler', () => {
|
|||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
|
@ -133,6 +136,7 @@ describe('WebCrawler', () => {
|
|||
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
|
@ -161,6 +165,7 @@ describe('WebCrawler', () => {
|
|||
|
||||
// Setup the crawler with the specific test case options
|
||||
const crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
|
@ -194,6 +199,7 @@ describe('WebCrawler', () => {
|
|||
const limit = 2; // Set a limit for the number of links
|
||||
|
||||
crawler = new WebCrawler({
|
||||
jobId: "TEST",
|
||||
initialUrl: initialUrl,
|
||||
includes: [],
|
||||
excludes: [],
|
||||
|
|
15
apps/api/src/scraper/WebScraper/__tests__/dns.test.ts
Normal file
15
apps/api/src/scraper/WebScraper/__tests__/dns.test.ts
Normal file
|
@ -0,0 +1,15 @@
|
|||
import CacheableLookup from 'cacheable-lookup';
|
||||
import https from 'node:https';
|
||||
import axios from "axios";
|
||||
|
||||
describe("DNS", () => {
|
||||
it("cached dns", async () => {
|
||||
const cachedDns = new CacheableLookup();
|
||||
cachedDns.install(https.globalAgent);
|
||||
jest.spyOn(cachedDns, "lookupAsync");
|
||||
|
||||
const res = await axios.get("https://example.com");
|
||||
expect(res.status).toBe(200);
|
||||
expect(cachedDns.lookupAsync).toHaveBeenCalled();
|
||||
});
|
||||
});
|
|
@ -1,3 +1,7 @@
|
|||
import { scrapSingleUrl } from '../single_url';
|
||||
import { PageOptions } from '../../../lib/entities';
|
||||
|
||||
|
||||
jest.mock('../single_url', () => {
|
||||
const originalModule = jest.requireActual('../single_url');
|
||||
originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>');
|
||||
|
@ -5,20 +9,29 @@ jest.mock('../single_url', () => {
|
|||
return originalModule;
|
||||
});
|
||||
|
||||
import { scrapSingleUrl } from '../single_url';
|
||||
import { PageOptions } from '../../../lib/entities';
|
||||
|
||||
describe('scrapSingleUrl', () => {
|
||||
it('should handle includeHtml option correctly', async () => {
|
||||
const url = 'https://roastmywebsite.ai';
|
||||
const pageOptionsWithHtml: PageOptions = { includeHtml: true };
|
||||
const pageOptionsWithoutHtml: PageOptions = { includeHtml: false };
|
||||
|
||||
const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml);
|
||||
const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml);
|
||||
const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml);
|
||||
const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml);
|
||||
|
||||
expect(resultWithHtml.html).toBeDefined();
|
||||
expect(resultWithoutHtml.html).toBeUndefined();
|
||||
}, 10000);
|
||||
});
|
||||
|
||||
it('should return a list of links on the firecrawl.ai page', async () => {
|
||||
const url = 'https://example.com';
|
||||
const pageOptions: PageOptions = { includeHtml: true };
|
||||
|
||||
const result = await scrapSingleUrl("TEST", url, pageOptions);
|
||||
|
||||
// Check if the result contains a list of links
|
||||
expect(result.linksOnPage).toBeDefined();
|
||||
expect(Array.isArray(result.linksOnPage)).toBe(true);
|
||||
expect(result.linksOnPage.length).toBeGreaterThan(0);
|
||||
expect(result.linksOnPage).toContain('https://www.iana.org/domains/example')
|
||||
}, 10000);
|
||||
|
|
|
@ -8,8 +8,10 @@ import { scrapSingleUrl } from "./single_url";
|
|||
import robotsParser from "robots-parser";
|
||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
export class WebCrawler {
|
||||
private jobId: string;
|
||||
private initialUrl: string;
|
||||
private baseUrl: string;
|
||||
private includes: string[];
|
||||
|
@ -26,6 +28,7 @@ export class WebCrawler {
|
|||
private allowExternalContentLinks: boolean;
|
||||
|
||||
constructor({
|
||||
jobId,
|
||||
initialUrl,
|
||||
includes,
|
||||
excludes,
|
||||
|
@ -36,6 +39,7 @@ export class WebCrawler {
|
|||
allowBackwardCrawling = false,
|
||||
allowExternalContentLinks = false
|
||||
}: {
|
||||
jobId: string;
|
||||
initialUrl: string;
|
||||
includes?: string[];
|
||||
excludes?: string[];
|
||||
|
@ -46,6 +50,7 @@ export class WebCrawler {
|
|||
allowBackwardCrawling?: boolean;
|
||||
allowExternalContentLinks?: boolean;
|
||||
}) {
|
||||
this.jobId = jobId;
|
||||
this.initialUrl = initialUrl;
|
||||
this.baseUrl = new URL(initialUrl).origin;
|
||||
this.includes = includes ?? [];
|
||||
|
@@ -64,7 +69,7 @@ export class WebCrawler {
  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
    return sitemapLinks
      .filter((link) => {
        const url = new URL(link);
        const url = new URL(link.trim(), this.baseUrl);
        const path = url.pathname;

        const depth = getURLDepth(url.toString());

@@ -116,7 +121,7 @@ export class WebCrawler {
    const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
    // Check if the link is disallowed by robots.txt
    if (!isAllowed) {
      console.log(`Link disallowed by robots.txt: ${link}`);
      Logger.debug(`Link disallowed by robots.txt: ${link}`);
      return false;
    }

@@ -133,15 +138,19 @@ export class WebCrawler {
    limit: number = 10000,
    maxDepth: number = 10
  ): Promise<{ url: string, html: string }[]> {

    Logger.debug(`Crawler starting with ${this.initialUrl}`);
    // Fetch and parse robots.txt
    try {
      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
      this.robots = robotsParser(this.robotsTxtUrl, response.data);
      Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
    } catch (error) {
      console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
      Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
    }

    if(!crawlerOptions?.ignoreSitemap){
    if (!crawlerOptions?.ignoreSitemap){
      Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
      const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
      if (sitemapLinks.length > 0) {
        let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);

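Note: the robots.txt handling introduced above reduces to the robots-parser pattern sketched below; a minimal sketch, with the user agent taken from this diff, an illustrative timeout value, and error handling omitted:

import axios from "axios";
import robotsParser from "robots-parser";

async function isCrawlAllowed(link: string): Promise<boolean> {
  const robotsTxtUrl = new URL("/robots.txt", link).toString();
  const response = await axios.get(robotsTxtUrl, { timeout: 10000 });
  const robots = robotsParser(robotsTxtUrl, response.data);
  // Default to "allowed" when robots.txt has no matching rule, as the crawler does.
  return robots.isAllowed(link, "FireCrawlAgent") ?? true;
}
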
@@ -155,7 +164,7 @@ export class WebCrawler {
      concurrencyLimit,
      inProgress
    );


    if (
      urls.length === 0 &&
      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0

@@ -175,6 +184,7 @@ export class WebCrawler {
    inProgress?: (progress: Progress) => void,
  ): Promise<{ url: string, html: string }[]> {
    const queue = async.queue(async (task: string, callback) => {
      Logger.debug(`Crawling ${task}`);
      if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
        if (callback && typeof callback === "function") {
          callback();

@@ -216,16 +226,18 @@ export class WebCrawler {
      }
    }, concurrencyLimit);

    Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
    queue.push(
      urls.filter(
        (url) =>
          !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
      ),
      (err) => {
        if (err) console.error(err);
        if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
      }
    );
    await queue.drain();
    Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
  }

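Note: the queue logic above follows the async library's queue/drain pattern; a condensed sketch, not the crawler's exact code (the worker body and error handling are simplified):

import async from "async";

async function crawlAll(
  urls: string[],
  concurrencyLimit: number,
  handle: (url: string) => Promise<void>
): Promise<void> {
  const queue = async.queue(async (task: string) => {
    await handle(task); // scrape the URL, record its html, enqueue discovered links
  }, concurrencyLimit);

  queue.push(urls, (err) => {
    if (err) console.error(`Error pushing URL to the queue: ${err}`);
  });
  await queue.drain();
}
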
@ -253,7 +265,7 @@ export class WebCrawler {
|
|||
|
||||
// If it is the first link, fetch with single url
|
||||
if (this.visited.size === 1) {
|
||||
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
|
||||
const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
|
||||
content = page.html ?? "";
|
||||
pageStatusCode = page.metadata?.pageStatusCode;
|
||||
pageError = page.metadata?.pageError || undefined;
|
||||
|
@ -282,7 +294,6 @@ export class WebCrawler {
|
|||
const urlObj = new URL(fullUrl);
|
||||
const path = urlObj.pathname;
|
||||
|
||||
|
||||
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
||||
if (this.isInternalLink(fullUrl) &&
|
||||
this.noSections(fullUrl) &&
|
||||
|
@ -383,7 +394,7 @@ export class WebCrawler {
|
|||
return linkDomain === baseDomain;
|
||||
}
|
||||
|
||||
private isFile(url: string): boolean {
|
||||
public isFile(url: string): boolean {
|
||||
const fileExtensions = [
|
||||
".png",
|
||||
".jpg",
|
||||
|
@ -393,6 +404,7 @@ export class WebCrawler {
|
|||
".js",
|
||||
".ico",
|
||||
".svg",
|
||||
".tiff",
|
||||
// ".pdf",
|
||||
".zip",
|
||||
".exe",
|
||||
|
@ -408,9 +420,10 @@ export class WebCrawler {
|
|||
".woff",
|
||||
".ttf",
|
||||
".woff2",
|
||||
".webp"
|
||||
".webp",
|
||||
".inc"
|
||||
];
|
||||
return fileExtensions.some((ext) => url.endsWith(ext));
|
||||
return fileExtensions.some((ext) => url.toLowerCase().endsWith(ext));
|
||||
}
|
||||
|
||||
private isSocialMediaOrEmail(url: string): boolean {
|
||||
|
@@ -448,10 +461,14 @@ export class WebCrawler {
    try {
      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
      if (response.status === 200) {
        sitemapLinks = await getLinksFromSitemap(sitemapUrl);
        sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
      }
    } catch (error) {
      Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
      const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
      if (response) {
        sitemapLinks = response;
      }
    } catch (error) {
      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
    }

    if (sitemapLinks.length === 0) {

@@ -459,10 +476,11 @@ export class WebCrawler {
      try {
        const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
        if (response.status === 200) {
          sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
          sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
        }
      } catch (error) {
        console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
        Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
        sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
      }
    }

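Note: the sitemap hunks above change getLinksFromSitemap to take an options object with an optional mode. A usage sketch mirroring the fallback shown in this diff; the import path is an assumption:

import { getLinksFromSitemap } from "./sitemap";

async function fetchSitemapLinks(sitemapUrl: string): Promise<string[]> {
  // Plain axios fetch first; fall back to fire-engine if it yields nothing.
  let links = await getLinksFromSitemap({ sitemapUrl });
  if (links.length === 0) {
    links = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
  }
  return links;
}
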
@ -1,10 +1,12 @@
|
|||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
export async function handleCustomScraping(
|
||||
text: string,
|
||||
url: string
|
||||
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
||||
// Check for Readme Docs special case
|
||||
if (text.includes('<meta name="readme-deploy"')) {
|
||||
console.log(
|
||||
Logger.debug(
|
||||
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
||||
);
|
||||
return {
|
||||
|
@ -19,7 +21,7 @@ export async function handleCustomScraping(
|
|||
|
||||
// Check for Vanta security portals
|
||||
if (text.includes('<link href="https://static.vanta.com')) {
|
||||
console.log(
|
||||
Logger.debug(
|
||||
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
|
||||
);
|
||||
return {
|
||||
|
@ -34,7 +36,7 @@ export async function handleCustomScraping(
|
|||
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
||||
if (googleDriveMetaMatch) {
|
||||
const url = googleDriveMetaMatch[1];
|
||||
console.log(`Google Drive PDF link detected: ${url}`);
|
||||
Logger.debug(`Google Drive PDF link detected: ${url}`);
|
||||
|
||||
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
|
||||
if (fileIdMatch) {
|
||||
|
|
|
@ -19,13 +19,16 @@ import { generateCompletions } from "../../lib/LLM-extraction";
|
|||
import { getWebScraperQueue } from "../../../src/services/queue-service";
|
||||
import { fetchAndProcessDocx } from "./utils/docxProcessor";
|
||||
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||
|
||||
export class WebScraperDataProvider {
|
||||
private jobId: string;
|
||||
private bullJobId: string;
|
||||
private urls: string[] = [""];
|
||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||
private includes: string[];
|
||||
private excludes: string[];
|
||||
private includes: string | string[];
|
||||
private excludes: string | string[];
|
||||
private maxCrawledLinks: number;
|
||||
private maxCrawledDepth: number = 10;
|
||||
private returnOnlyUrls: boolean;
|
||||
|
@ -65,6 +68,7 @@ export class WebScraperDataProvider {
|
|||
batchUrls.map(async (url, index) => {
|
||||
const existingHTML = allHtmls ? allHtmls[i + index] : "";
|
||||
const result = await scrapSingleUrl(
|
||||
this.jobId,
|
||||
url,
|
||||
this.pageOptions,
|
||||
this.extractorOptions,
|
||||
|
@ -89,14 +93,14 @@ export class WebScraperDataProvider {
|
|||
const job = await getWebScraperQueue().getJob(this.bullJobId);
|
||||
const jobStatus = await job.getState();
|
||||
if (jobStatus === "failed") {
|
||||
console.error(
|
||||
Logger.info(
|
||||
"Job has failed or has been cancelled by the user. Stopping the job..."
|
||||
);
|
||||
return [] as Document[];
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
Logger.error(error.message);
|
||||
return [] as Document[];
|
||||
}
|
||||
}
|
||||
|
@ -164,11 +168,29 @@ export class WebScraperDataProvider {
|
|||
private async handleCrawlMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let includes: string[];
|
||||
if (Array.isArray(this.includes)) {
|
||||
if (this.includes[0] != "") {
|
||||
includes = this.includes;
|
||||
}
|
||||
} else {
|
||||
includes = this.includes.split(',');
|
||||
}
|
||||
|
||||
let excludes: string[];
|
||||
if (Array.isArray(this.excludes)) {
|
||||
if (this.excludes[0] != "") {
|
||||
excludes = this.excludes;
|
||||
}
|
||||
} else {
|
||||
excludes = this.excludes.split(',');
|
||||
}
|
||||
|
||||
const crawler = new WebCrawler({
|
||||
jobId: this.jobId,
|
||||
initialUrl: this.urls[0],
|
||||
includes: this.includes,
|
||||
excludes: this.excludes,
|
||||
includes,
|
||||
excludes,
|
||||
maxCrawledLinks: this.maxCrawledLinks,
|
||||
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
|
||||
limit: this.limit,
|
||||
|
@ -218,14 +240,13 @@ export class WebScraperDataProvider {
|
|||
private async handleSitemapMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let links = await getLinksFromSitemap(this.urls[0]);
|
||||
let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
|
||||
links = await this.cleanIrrelevantPath(links);
|
||||
|
||||
if (this.returnOnlyUrls) {
|
||||
return this.returnOnlyUrlsResponse(links, inProgress);
|
||||
}
|
||||
|
||||
|
||||
let documents = await this.processLinks(links, inProgress);
|
||||
return this.cacheAndFinalizeDocuments(documents, links);
|
||||
}
|
||||
|
@ -253,35 +274,60 @@ export class WebScraperDataProvider {
|
|||
inProgress?: (progress: Progress) => void,
|
||||
allHtmls?: string[]
|
||||
): Promise<Document[]> {
|
||||
const pdfLinks = links.filter(link => link.endsWith(".pdf"));
|
||||
const docLinks = links.filter(link => link.endsWith(".doc") || link.endsWith(".docx"));
|
||||
|
||||
const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
|
||||
const docxDocuments = await this.fetchDocxDocuments(docLinks);
|
||||
|
||||
links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
|
||||
|
||||
let documents = await this.convertUrlsToDocuments(
|
||||
links,
|
||||
inProgress,
|
||||
allHtmls
|
||||
const pdfLinks = links.filter((link) => link.endsWith(".pdf"));
|
||||
const docLinks = links.filter(
|
||||
(link) => link.endsWith(".doc") || link.endsWith(".docx")
|
||||
);
|
||||
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
const [pdfDocuments, docxDocuments] = await Promise.all([
|
||||
this.fetchPdfDocuments(pdfLinks),
|
||||
this.fetchDocxDocuments(docLinks),
|
||||
]);
|
||||
|
||||
links = links.filter(
|
||||
(link) => !pdfLinks.includes(link) && !docLinks.includes(link)
|
||||
);
|
||||
|
||||
let [documents, sitemapData] = await Promise.all([
|
||||
this.convertUrlsToDocuments(links, inProgress, allHtmls),
|
||||
this.mode === "single_urls" && links.length > 0
|
||||
? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch(
|
||||
(error) => {
|
||||
Logger.debug(`Failed to fetch sitemap data: ${error}`);
|
||||
return null;
|
||||
}
|
||||
)
|
||||
: Promise.resolve(null),
|
||||
]);
|
||||
|
||||
if (this.mode === "single_urls" && documents.length > 0) {
|
||||
documents[0].metadata.sitemap = sitemapData ?? undefined;
|
||||
} else {
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
}
|
||||
|
||||
documents = this.applyPathReplacements(documents);
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
|
||||
if (
|
||||
(this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
|
||||
(this.extractorOptions.mode === "llm-extraction" ||
|
||||
this.extractorOptions.mode === "llm-extraction-from-markdown") &&
|
||||
this.mode === "single_urls"
|
||||
) {
|
||||
documents = await generateCompletions(documents, this.extractorOptions, "markdown");
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions,
|
||||
"markdown"
|
||||
);
|
||||
}
|
||||
if (
|
||||
(this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
|
||||
this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
|
||||
this.mode === "single_urls"
|
||||
) {
|
||||
documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
|
||||
documents = await generateCompletions(
|
||||
documents,
|
||||
this.extractorOptions,
|
||||
"raw-html"
|
||||
);
|
||||
}
|
||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
||||
}
|
||||
|
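Note: the fetchPdfDocuments and fetchDocxDocuments hunks below wrap each fetch in the scrape-event bookkeeping used throughout this commit. A sketch of that pattern reduced to its essentials; the method name and success threshold mirror the diff, while the remaining fields and call shapes are simplified assumptions:

import { ScrapeEvents } from "../../lib/scrape-events";

async function timedScrape(jobId: string, url: string, doScrape: () => Promise<string>) {
  const timer = Date.now();
  // Insert the event first and resolve its id only after the scrape finishes.
  const logInsertPromise = ScrapeEvents.insert(jobId, {
    type: "scrape",
    url,
    worker: process.env.FLY_MACHINE_ID,
    method: "pdf-scrape",
    result: null,
  });

  const content = await doScrape();

  const insertedLogId = await logInsertPromise;
  ScrapeEvents.updateScrapeResult(insertedLogId, {
    response_size: content.length,
    success: !!content && content.trim().length >= 100,
    time_taken: Date.now() - timer,
  });
  return content;
}
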
@ -289,7 +335,28 @@ export class WebScraperDataProvider {
|
|||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
pdfLinks.map(async (pdfLink) => {
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
|
||||
type: "scrape",
|
||||
url: pdfLink,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method: "pdf-scrape",
|
||||
result: null,
|
||||
});
|
||||
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
|
||||
pdfLink,
|
||||
this.pageOptions.parsePDF
|
||||
);
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: content.length,
|
||||
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
|
||||
error: pageError,
|
||||
response_code: pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
return {
|
||||
content: content,
|
||||
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
|
||||
|
@ -300,11 +367,32 @@ export class WebScraperDataProvider {
|
|||
}
|
||||
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
docxLinks.map(async (p) => {
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(p);
|
||||
docxLinks.map(async (docxLink) => {
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(this.jobId, {
|
||||
type: "scrape",
|
||||
url: docxLink,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method: "docx-scrape",
|
||||
result: null,
|
||||
});
|
||||
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
|
||||
docxLink
|
||||
);
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: content.length,
|
||||
success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
|
||||
error: pageError,
|
||||
response_code: pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
|
||||
return {
|
||||
content,
|
||||
metadata: { sourceURL: p, pageStatusCode, pageError },
|
||||
metadata: { sourceURL: docxLink, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
})
|
||||
|
@ -328,7 +416,7 @@ export class WebScraperDataProvider {
|
|||
documents: Document[],
|
||||
links: string[]
|
||||
): Promise<Document[]> {
|
||||
await this.setCachedDocuments(documents, links);
|
||||
// await this.setCachedDocuments(documents, links);
|
||||
documents = this.removeChildLinks(documents);
|
||||
return documents.splice(0, this.limit);
|
||||
}
|
||||
|
@ -375,6 +463,10 @@ export class WebScraperDataProvider {
|
|||
const url = new URL(document.metadata.sourceURL);
|
||||
const path = url.pathname;
|
||||
|
||||
if (!Array.isArray(this.excludes)) {
|
||||
this.excludes = this.excludes.split(',');
|
||||
}
|
||||
|
||||
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
||||
// Check if the link should be excluded
|
||||
if (
|
||||
|
@ -386,6 +478,10 @@ export class WebScraperDataProvider {
|
|||
}
|
||||
}
|
||||
|
||||
if (!Array.isArray(this.includes)) {
|
||||
this.includes = this.includes.split(',');
|
||||
}
|
||||
|
||||
if (this.includes.length > 0 && this.includes[0] !== "") {
|
||||
// Check if the link matches the include patterns, if any are specified
|
||||
if (this.includes.length > 0) {
|
||||
|
@ -424,7 +520,7 @@ export class WebScraperDataProvider {
|
|||
...document,
|
||||
childrenLinks: childrenLinks || [],
|
||||
}),
|
||||
60 * 60 * 24 * 10
|
||||
60 * 60
|
||||
); // 10 days
|
||||
}
|
||||
}
|
||||
|
@ -433,7 +529,7 @@ export class WebScraperDataProvider {
|
|||
let documents: Document[] = [];
|
||||
for (const url of urls) {
|
||||
const normalizedUrl = this.normalizeUrl(url);
|
||||
console.log(
|
||||
Logger.debug(
|
||||
"Getting cached document for web-scraper-cache:" + normalizedUrl
|
||||
);
|
||||
const cachedDocumentString = await getValue(
|
||||
|
@ -472,6 +568,7 @@ export class WebScraperDataProvider {
|
|||
throw new Error("Urls are required");
|
||||
}
|
||||
|
||||
this.jobId = options.jobId;
|
||||
this.bullJobId = options.bullJobId;
|
||||
this.urls = options.urls;
|
||||
this.mode = options.mode;
|
||||
|
@ -489,16 +586,28 @@ export class WebScraperDataProvider {
|
|||
includeHtml: false,
|
||||
replaceAllPathsWithAbsolutePaths: false,
|
||||
parsePDF: true,
|
||||
removeTags: []
|
||||
removeTags: [],
|
||||
};
|
||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||
this.excludes = this.excludes.filter((item) => item !== "");
|
||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||
this.replaceAllPathsWithAbsolutePaths =
|
||||
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||
options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||
false;
|
||||
|
||||
if (typeof options.crawlerOptions?.excludes === 'string') {
|
||||
this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
|
||||
}
|
||||
|
||||
if (typeof options.crawlerOptions?.includes === 'string') {
|
||||
this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
|
||||
}
|
||||
|
||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
|
||||
this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
|
||||
this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;
|
||||
this.allowBackwardCrawling =
|
||||
options.crawlerOptions?.allowBackwardCrawling ?? false;
|
||||
this.allowExternalContentLinks =
|
||||
options.crawlerOptions?.allowExternalContentLinks ?? false;
|
||||
|
||||
// make sure all urls start with https://
|
||||
this.urls = this.urls.map((url) => {
|
||||
|
@ -537,6 +646,34 @@ export class WebScraperDataProvider {
|
|||
}
|
||||
return documents;
|
||||
}
|
||||
private async getSitemapDataForSingleUrl(
|
||||
baseUrl: string,
|
||||
url: string,
|
||||
timeout?: number
|
||||
) {
|
||||
const sitemapData = await fetchSitemapData(baseUrl, timeout);
|
||||
if (sitemapData) {
|
||||
const docInSitemapData = sitemapData.find(
|
||||
(data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
|
||||
);
|
||||
if (docInSitemapData) {
|
||||
let sitemapDocData: Partial<SitemapEntry> = {};
|
||||
if (docInSitemapData.changefreq) {
|
||||
sitemapDocData.changefreq = docInSitemapData.changefreq;
|
||||
}
|
||||
if (docInSitemapData.priority) {
|
||||
sitemapDocData.priority = Number(docInSitemapData.priority);
|
||||
}
|
||||
if (docInSitemapData.lastmod) {
|
||||
sitemapDocData.lastmod = docInSitemapData.lastmod;
|
||||
}
|
||||
if (Object.keys(sitemapDocData).length !== 0) {
|
||||
return sitemapDocData;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
|
||||
await Promise.all(
|
||||
documents.map(async (document) => {
|
||||
|
|
|
@ -2,6 +2,7 @@ import axios from "axios";
|
|||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
|
||||
* Scrapes a URL with Axios
|
||||
|
@ -34,9 +35,7 @@ export async function scrapWithFetch(
|
|||
});
|
||||
|
||||
if (response.status !== 200) {
|
||||
console.error(
|
||||
`[Axios] Error fetching url: ${url} with status: ${response.status}`
|
||||
);
|
||||
Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`);
|
||||
logParams.error_message = response.statusText;
|
||||
logParams.response_code = response.status;
|
||||
return {
|
||||
|
@ -63,10 +62,10 @@ export async function scrapWithFetch(
|
|||
} catch (error) {
|
||||
if (error.code === "ECONNABORTED") {
|
||||
logParams.error_message = "Request timed out";
|
||||
console.log(`[Axios] Request timed out for ${url}`);
|
||||
Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
|
||||
} else {
|
||||
logParams.error_message = error.message || error;
|
||||
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
|
||||
Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
|
||||
}
|
||||
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||
} finally {
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
import axios from "axios";
|
||||
import { FireEngineResponse } from "../../../lib/entities";
|
||||
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
|
||||
import { logScrape } from "../../../services/logging/scrape_log";
|
||||
import { generateRequestParams } from "../single_url";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
|
||||
* Scrapes a URL with Fire-Engine
|
||||
* @param url The URL to scrape
|
||||
* @param waitFor The time to wait for the page to load
|
||||
* @param screenshot Whether to take a screenshot
|
||||
* @param fullPageScreenshot Whether to take a full page screenshot
|
||||
* @param pageOptions The options for the page
|
||||
* @param headers The headers to send with the request
|
||||
* @param options The options for the request
|
||||
|
@ -19,14 +21,18 @@ export async function scrapWithFireEngine({
|
|||
url,
|
||||
waitFor = 0,
|
||||
screenshot = false,
|
||||
fullPageScreenshot = false,
|
||||
pageOptions = { parsePDF: true },
|
||||
fireEngineOptions = {},
|
||||
headers,
|
||||
options,
|
||||
}: {
|
||||
url: string;
|
||||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
||||
fireEngineOptions?: FireEngineOptions;
|
||||
headers?: Record<string, string>;
|
||||
options?: any;
|
||||
}): Promise<FireEngineResponse> {
|
||||
|
@ -44,19 +50,35 @@ export async function scrapWithFireEngine({
|
|||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||
console.log(
|
||||
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
|
||||
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||
|
||||
|
||||
let endpoint = "/scrape";
|
||||
|
||||
if(options?.endpoint === "request") {
|
||||
endpoint = "/request";
|
||||
}
|
||||
|
||||
let engine = engineParam; // do we want fireEngineOptions as first choice?
|
||||
|
||||
Logger.info(
|
||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||
);
|
||||
|
||||
|
||||
const response = await axios.post(
|
||||
process.env.FIRE_ENGINE_BETA_URL + "/scrape",
|
||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||
{
|
||||
url: url,
|
||||
wait: waitParam,
|
||||
screenshot: screenshotParam,
|
||||
fullPageScreenshot: fullPageScreenshotParam,
|
||||
headers: headers,
|
||||
pageOptions: pageOptions,
|
||||
...fireEngineOptionsParam,
|
||||
},
|
||||
{
|
||||
headers: {
|
||||
|
@ -67,11 +89,17 @@ export async function scrapWithFireEngine({
|
|||
);
|
||||
|
||||
if (response.status !== 200) {
|
||||
console.error(
|
||||
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
||||
Logger.debug(
|
||||
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`
|
||||
);
|
||||
|
||||
logParams.error_message = response.data?.pageError;
|
||||
logParams.response_code = response.data?.pageStatusCode;
|
||||
|
||||
if(response.data && response.data?.pageStatusCode !== 200) {
|
||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`);
|
||||
}
|
||||
|
||||
return {
|
||||
html: "",
|
||||
screenshot: "",
|
||||
|
@ -107,10 +135,10 @@ export async function scrapWithFireEngine({
|
|||
}
|
||||
} catch (error) {
|
||||
if (error.code === "ECONNABORTED") {
|
||||
console.log(`[Fire-Engine] Request timed out for ${url}`);
|
||||
Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`);
|
||||
logParams.error_message = "Request timed out";
|
||||
} else {
|
||||
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
|
||||
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
|
||||
logParams.error_message = error.message || error;
|
||||
}
|
||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||
|
|
|
@ -3,6 +3,7 @@ import { logScrape } from "../../../services/logging/scrape_log";
|
|||
import { generateRequestParams } from "../single_url";
|
||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
|
||||
* Scrapes a URL with Playwright
|
||||
|
@ -51,8 +52,8 @@ export async function scrapWithPlaywright(
|
|||
);
|
||||
|
||||
if (response.status !== 200) {
|
||||
console.error(
|
||||
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
|
||||
Logger.debug(
|
||||
`⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
|
||||
);
|
||||
logParams.error_message = response.data?.pageError;
|
||||
logParams.response_code = response.data?.pageStatusCode;
|
||||
|
@ -86,8 +87,8 @@ export async function scrapWithPlaywright(
|
|||
};
|
||||
} catch (jsonError) {
|
||||
logParams.error_message = jsonError.message || jsonError;
|
||||
console.error(
|
||||
`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
|
||||
Logger.debug(
|
||||
`⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
|
||||
);
|
||||
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||
}
|
||||
|
@ -95,10 +96,10 @@ export async function scrapWithPlaywright(
|
|||
} catch (error) {
|
||||
if (error.code === "ECONNABORTED") {
|
||||
logParams.error_message = "Request timed out";
|
||||
console.log(`[Playwright] Request timed out for ${url}`);
|
||||
Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
|
||||
} else {
|
||||
logParams.error_message = error.message || error;
|
||||
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
|
||||
Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
|
||||
}
|
||||
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||
} finally {
|
||||
|
|
|
@ -3,6 +3,7 @@ import { generateRequestParams } from "../single_url";
|
|||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
import { universalTimeout } from "../global";
|
||||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
/**
|
||||
* Scrapes a URL with ScrapingBee
|
||||
|
@ -56,8 +57,8 @@ export async function scrapWithScrapingBee(
|
|||
text = decoder.decode(response.data);
|
||||
logParams.success = true;
|
||||
} catch (decodeError) {
|
||||
console.error(
|
||||
`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
|
||||
Logger.debug(
|
||||
`⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
|
||||
);
|
||||
logParams.error_message = decodeError.message || decodeError;
|
||||
}
|
||||
|
@ -72,7 +73,7 @@ export async function scrapWithScrapingBee(
|
|||
};
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
|
||||
Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
|
||||
logParams.error_message = error.message || error;
|
||||
logParams.response_code = error.response?.status;
|
||||
return {
|
||||
|
|
|
@ -16,16 +16,21 @@ import { scrapWithFetch } from "./scrapers/fetch";
|
|||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
import { scrapWithPlaywright } from "./scrapers/playwright";
|
||||
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
|
||||
import { extractLinks } from "./utils/utils";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||
import { clientSideError } from "../../strings";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const baseScrapers = [
|
||||
export const baseScrapers = [
|
||||
"fire-engine",
|
||||
"fire-engine;chrome-cdp",
|
||||
"scrapingBee",
|
||||
"playwright",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
"scrapingBeeLoad",
|
||||
"fetch",
|
||||
] as const;
|
||||
].filter(Boolean);
|
||||
|
||||
export async function generateRequestParams(
|
||||
url: string,
|
||||
|
@ -46,7 +51,7 @@ export async function generateRequestParams(
|
|||
return defaultParams;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error generating URL key: ${error}`);
|
||||
Logger.error(`Error generating URL key: ${error}`);
|
||||
return defaultParams;
|
||||
}
|
||||
}
|
||||
|
@ -70,6 +75,8 @@ function getScrapingFallbackOrder(
|
|||
return !!process.env.SCRAPING_BEE_API_KEY;
|
||||
case "fire-engine":
|
||||
return !!process.env.FIRE_ENGINE_BETA_URL;
|
||||
case "fire-engine;chrome-cdp":
|
||||
return !!process.env.FIRE_ENGINE_BETA_URL;
|
||||
case "playwright":
|
||||
return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
|
||||
default:
|
||||
|
@ -78,21 +85,22 @@ function getScrapingFallbackOrder(
|
|||
});
|
||||
|
||||
let defaultOrder = [
|
||||
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
|
||||
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
|
||||
"scrapingBee",
|
||||
"fire-engine",
|
||||
"playwright",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
"scrapingBeeLoad",
|
||||
"fetch",
|
||||
];
|
||||
].filter(Boolean);
|
||||
|
||||
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
|
||||
defaultOrder = [
|
||||
"fire-engine",
|
||||
"playwright",
|
||||
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
|
||||
...defaultOrder.filter(
|
||||
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
|
||||
),
|
||||
];
|
||||
].filter(Boolean);
|
||||
}
|
||||
|
||||
const filteredDefaultOrder = defaultOrder.filter(
|
||||
|
@ -109,7 +117,10 @@ function getScrapingFallbackOrder(
|
|||
return scrapersInOrder as (typeof baseScrapers)[number][];
|
||||
}
|
||||
|
||||
|
||||
|
||||
export async function scrapSingleUrl(
|
||||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = {
|
||||
onlyMainContent: true,
|
||||
|
@ -117,6 +128,7 @@ export async function scrapSingleUrl(
|
|||
includeRawHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
headers: undefined,
|
||||
},
|
||||
extractorOptions: ExtractorOptions = {
|
||||
|
@ -136,16 +148,36 @@ export async function scrapSingleUrl(
|
|||
metadata: { pageStatusCode?: number; pageError?: string | null };
|
||||
} = { text: "", screenshot: "", metadata: {} };
|
||||
let screenshot = "";
|
||||
|
||||
const timer = Date.now();
|
||||
const logInsertPromise = ScrapeEvents.insert(jobId, {
|
||||
type: "scrape",
|
||||
url,
|
||||
worker: process.env.FLY_MACHINE_ID,
|
||||
method,
|
||||
result: null,
|
||||
});
|
||||
|
||||
switch (method) {
|
||||
case "fire-engine":
|
||||
case "fire-engine;chrome-cdp":
|
||||
|
||||
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
|
||||
if(method === "fire-engine;chrome-cdp"){
|
||||
engine = "chrome-cdp";
|
||||
}
|
||||
|
||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||
console.log(`Scraping ${url} with Fire Engine`);
|
||||
const response = await scrapWithFireEngine({
|
||||
url,
|
||||
waitFor: pageOptions.waitFor,
|
||||
screenshot: pageOptions.screenshot,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
||||
pageOptions: pageOptions,
|
||||
headers: pageOptions.headers,
|
||||
fireEngineOptions: {
|
||||
engine: engine,
|
||||
}
|
||||
});
|
||||
scraperResponse.text = response.html;
|
||||
scraperResponse.screenshot = response.screenshot;
|
||||
|
@ -234,11 +266,21 @@ export async function scrapSingleUrl(
|
|||
scraperResponse.text = customScrapedContent.html;
|
||||
screenshot = customScrapedContent.screenshot;
|
||||
}
|
||||
|
||||
//* TODO: add an optional to return markdown or structured/extracted content
|
||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||
const text = await parseMarkdown(cleanedHtml);
|
||||
|
||||
const insertedLogId = await logInsertPromise;
|
||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||
response_size: scraperResponse.text.length,
|
||||
success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
|
||||
error: scraperResponse.metadata.pageError,
|
||||
response_code: scraperResponse.metadata.pageStatusCode,
|
||||
time_taken: Date.now() - timer,
|
||||
});
|
||||
|
||||
return {
|
||||
text: await parseMarkdown(cleanedHtml),
|
||||
text,
|
||||
html: cleanedHtml,
|
||||
rawHtml: scraperResponse.text,
|
||||
screenshot: scraperResponse.screenshot,
|
||||
|
@ -260,19 +302,19 @@ export async function scrapSingleUrl(
|
|||
try {
|
||||
urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
|
||||
} catch (error) {
|
||||
console.error(`Invalid URL key, trying: ${urlToScrap}`);
|
||||
Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
|
||||
}
|
||||
const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
|
||||
const scrapersInOrder = getScrapingFallbackOrder(
|
||||
defaultScraper,
|
||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||
pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
|
||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
||||
);
|
||||
|
||||
for (const scraper of scrapersInOrder) {
|
||||
// If exists text coming from crawler, use it
|
||||
if (existingHtml && existingHtml.trim().length >= 100) {
|
||||
if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) {
|
||||
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
|
||||
text = await parseMarkdown(cleanedHtml);
|
||||
html = cleanedHtml;
|
||||
|
@ -294,12 +336,18 @@ export async function scrapSingleUrl(
|
|||
pageError = undefined;
|
||||
}
|
||||
|
||||
if (text && text.trim().length >= 100) break;
|
||||
if (pageStatusCode && pageStatusCode == 404) break;
|
||||
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
if (nextScraperIndex < scrapersInOrder.length) {
|
||||
console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
||||
if (text && text.trim().length >= 100) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
|
||||
break;
|
||||
}
|
||||
if (pageStatusCode && pageStatusCode == 404) {
|
||||
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
|
||||
break;
|
||||
}
|
||||
// const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
|
||||
// if (nextScraperIndex < scrapersInOrder.length) {
|
||||
// Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
|
||||
// }
|
||||
}
|
||||
|
||||
if (!text) {
|
||||
|
@ -309,6 +357,10 @@ export async function scrapSingleUrl(
|
|||
const soup = cheerio.load(rawHtml);
|
||||
const metadata = extractMetadata(soup, urlToScrap);
|
||||
|
||||
let linksOnPage: string[] | undefined;
|
||||
|
||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||
|
||||
let document: Document;
|
||||
if (screenshot && screenshot.length > 0) {
|
||||
document = {
|
||||
|
@ -317,9 +369,10 @@ export async function scrapSingleUrl(
|
|||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||
? rawHtml
|
||||
: undefined,
|
||||
linksOnPage,
|
||||
metadata: {
|
||||
...metadata,
|
||||
screenshot: screenshot,
|
||||
|
@ -335,7 +388,7 @@ export async function scrapSingleUrl(
|
|||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||
? rawHtml
|
||||
: undefined,
|
||||
metadata: {
|
||||
|
@ -344,16 +397,23 @@ export async function scrapSingleUrl(
|
|||
pageStatusCode: pageStatusCode,
|
||||
pageError: pageError,
|
||||
},
|
||||
linksOnPage,
|
||||
};
|
||||
}
|
||||
|
||||
return document;
|
||||
} catch (error) {
|
||||
console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
|
||||
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
|
||||
ScrapeEvents.insert(jobId, {
|
||||
type: "error",
|
||||
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
|
||||
stack: error.stack,
|
||||
});
|
||||
return {
|
||||
content: "",
|
||||
markdown: "",
|
||||
html: "",
|
||||
linksOnPage: [],
|
||||
metadata: {
|
||||
sourceURL: urlToScrap,
|
||||
pageStatusCode: pageStatusCode,
|
||||
|
|
|
@ -1,18 +1,33 @@
|
|||
import axios from "axios";
|
||||
import { axiosTimeout } from "../../lib/timeout";
|
||||
import { parseStringPromise } from "xml2js";
|
||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
import { WebCrawler } from "./crawler";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
export async function getLinksFromSitemap(
|
||||
sitemapUrl: string,
|
||||
allUrls: string[] = []
|
||||
{
|
||||
sitemapUrl,
|
||||
allUrls = [],
|
||||
mode = 'axios'
|
||||
}: {
|
||||
sitemapUrl: string,
|
||||
allUrls?: string[],
|
||||
mode?: 'axios' | 'fire-engine'
|
||||
}
|
||||
): Promise<string[]> {
|
||||
try {
|
||||
let content: string;
|
||||
try {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = response.data;
|
||||
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = response.data;
|
||||
} else if (mode === 'fire-engine') {
|
||||
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"tlsclient", disableJsDom: true, mobileProxy: true } });
|
||||
content = response.html;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Request failed for ${sitemapUrl}: ${error}`);
|
||||
Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
|
||||
|
||||
return allUrls;
|
||||
}
|
||||
|
@ -23,27 +38,27 @@ export async function getLinksFromSitemap(
|
|||
if (root && root.sitemap) {
|
||||
for (const sitemap of root.sitemap) {
|
||||
if (sitemap.loc && sitemap.loc.length > 0) {
|
||||
await getLinksFromSitemap(sitemap.loc[0], allUrls);
|
||||
await getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode });
|
||||
}
|
||||
}
|
||||
} else if (root && root.url) {
|
||||
for (const url of root.url) {
|
||||
if (url.loc && url.loc.length > 0) {
|
||||
if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) {
|
||||
allUrls.push(url.loc[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error processing ${sitemapUrl}: ${error}`);
|
||||
Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
|
||||
}
|
||||
|
||||
return allUrls;
|
||||
}
|
||||
|
||||
export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
|
||||
export const fetchSitemapData = async (url: string, timeout?: number): Promise<SitemapEntry[] | null> => {
|
||||
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
|
||||
try {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
const response = await axios.get(sitemapUrl, { timeout: timeout || axiosTimeout });
|
||||
if (response.status === 200) {
|
||||
const xml = response.data;
|
||||
const parsedXml = await parseStringPromise(xml);
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import { Logger } from '../../../../lib/logger';
|
||||
import { isUrlBlocked } from '../blocklist';
|
||||
|
||||
describe('isUrlBlocked', () => {
|
||||
|
@ -19,7 +20,7 @@ describe('isUrlBlocked', () => {
|
|||
|
||||
blockedUrls.forEach(url => {
|
||||
if (!isUrlBlocked(url)) {
|
||||
console.log(`URL not blocked: ${url}`);
|
||||
Logger.debug(`URL not blocked: ${url}`);
|
||||
}
|
||||
expect(isUrlBlocked(url)).toBe(true);
|
||||
});
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
const socialMediaBlocklist = [
|
||||
'facebook.com',
|
||||
'x.com',
|
||||
|
@ -59,7 +61,7 @@ export function isUrlBlocked(url: string): boolean {
|
|||
return isBlocked;
|
||||
} catch (e) {
|
||||
// If an error occurs (e.g., invalid URL), return false
|
||||
console.error(`Error processing URL: ${url}`, e);
|
||||
Logger.error(`Error parsing the following URL: ${url}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,28 +1,15 @@
|
|||
export const urlSpecificParams = {
|
||||
"platform.openai.com": {
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
},
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
referer: "https://www.google.com/",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"accept-encoding": "gzip, deflate, br",
|
||||
accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
},
|
||||
cookies: {
|
||||
__cf_bm:
|
||||
"mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
wait: 3000,
|
||||
fireEngineOptions:{
|
||||
engine: "chrome-cdp"
|
||||
},
|
||||
},
|
||||
},
|
||||
"support.greenpay.me":{
|
||||
defaultScraper: "playwright",
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
|
@ -43,7 +30,7 @@ export const urlSpecificParams = {
|
|||
},
|
||||
},
|
||||
"docs.pdw.co":{
|
||||
defaultScraper: "playwright",
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
|
@ -83,7 +70,7 @@ export const urlSpecificParams = {
|
|||
},
|
||||
},
|
||||
"developers.notion.com":{
|
||||
defaultScraper: "playwright",
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
|
@ -103,7 +90,7 @@ export const urlSpecificParams = {
|
|||
},
|
||||
},
|
||||
"docs2.hubitat.com":{
|
||||
defaultScraper: "playwright",
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
|
@ -153,7 +140,7 @@ export const urlSpecificParams = {
|
|||
},
|
||||
},
|
||||
"help.salesforce.com":{
|
||||
defaultScraper: "playwright",
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
|
@ -175,6 +162,7 @@ export const urlSpecificParams = {
|
|||
"firecrawl.dev":{
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
engine: "playwright",
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
|
@ -192,4 +180,59 @@ export const urlSpecificParams = {
|
|||
"ir.veeva.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
},
|
||||
"eonhealth.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
mobileProxy: true,
|
||||
method: "get",
|
||||
engine: "request",
|
||||
},
|
||||
},
|
||||
},
|
||||
"notion.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params: {
|
||||
wait_browser: "networkidle2",
|
||||
block_resources: false,
|
||||
wait: 2000,
|
||||
engine: "playwright",
|
||||
}
|
||||
},
|
||||
"mendable.ai":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
mobileProxy: true,
|
||||
method: "get",
|
||||
engine: "chrome-cdp",
|
||||
},
|
||||
},
|
||||
},
|
||||
"developer.apple.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
engine: "playwright",
|
||||
wait: 2000,
|
||||
fireEngineOptions: {
|
||||
blockMedia: false,
|
||||
}
|
||||
},
|
||||
},
|
||||
"amazon.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "chrome-cdp",
|
||||
},
|
||||
},
|
||||
},
|
||||
"digikey.com":{
|
||||
defaultScraper: "fire-engine",
|
||||
params:{
|
||||
fireEngineOptions:{
|
||||
engine: "tlsclient",
|
||||
},
|
||||
},
|
||||
}
|
||||
};
|
||||
|
|
|
@ -4,38 +4,76 @@ import { createWriteStream } from "node:fs";
|
|||
import path from "path";
|
||||
import os from "os";
|
||||
import mammoth from "mammoth";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
|
||||
const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
|
||||
const content = await processDocxToText(tempFilePath);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
let tempFilePath = '';
|
||||
let pageStatusCode = 200;
|
||||
let pageError = '';
|
||||
let content = '';
|
||||
|
||||
try {
|
||||
const downloadResult = await downloadDocx(url);
|
||||
tempFilePath = downloadResult.tempFilePath;
|
||||
pageStatusCode = downloadResult.pageStatusCode;
|
||||
pageError = downloadResult.pageError;
|
||||
content = await processDocxToText(tempFilePath);
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to fetch and process DOCX: ${error.message}`);
|
||||
pageStatusCode = 500;
|
||||
pageError = error.message;
|
||||
content = '';
|
||||
} finally {
|
||||
if (tempFilePath) {
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
}
|
||||
}
|
||||
|
||||
return { content, pageStatusCode, pageError };
|
||||
}
|
||||
|
||||
async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
|
||||
const response = await axios({
|
||||
url,
|
||||
method: "GET",
|
||||
responseType: "stream",
|
||||
});
|
||||
try {
|
||||
const response = await axios({
|
||||
url,
|
||||
method: "GET",
|
||||
responseType: "stream",
|
||||
});
|
||||
|
||||
const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
|
||||
const writer = createWriteStream(tempFilePath);
|
||||
const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`);
|
||||
const writer = createWriteStream(tempFilePath);
|
||||
|
||||
response.data.pipe(writer);
|
||||
response.data.pipe(writer);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
|
||||
writer.on("error", reject);
|
||||
});
|
||||
return new Promise((resolve, reject) => {
|
||||
writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
|
||||
writer.on("error", () => {
|
||||
Logger.error('Failed to write DOCX file to disk');
|
||||
reject(new Error('Failed to write DOCX file to disk'));
|
||||
});
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to download DOCX: ${error.message}`);
|
||||
return { tempFilePath: "", pageStatusCode: 500, pageError: error.message };
|
||||
}
|
||||
}
|
||||
|
||||
export async function processDocxToText(filePath: string): Promise<string> {
|
||||
const content = await extractTextFromDocx(filePath);
|
||||
return content;
|
||||
try {
|
||||
const content = await extractTextFromDocx(filePath);
|
||||
return content;
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to process DOCX to text: ${error.message}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
async function extractTextFromDocx(filePath: string): Promise<string> {
|
||||
const result = await mammoth.extractRawText({ path: filePath });
|
||||
return result.value;
|
||||
try {
|
||||
const result = await mammoth.extractRawText({ path: filePath });
|
||||
return result.value;
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to extract text from DOCX: ${error.message}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import Anthropic from '@anthropic-ai/sdk';
|
||||
import axios from 'axios';
|
||||
import { Logger } from '../../../lib/logger';
|
||||
|
||||
export async function getImageDescription(
|
||||
imageUrl: string,
|
||||
|
@ -82,7 +83,7 @@ export async function getImageDescription(
|
|||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error generating image alt text:", error?.message);
|
||||
Logger.error(`Error generating image alt text: ${error}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
import { CheerioAPI } from "cheerio";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
|
||||
interface Metadata {
|
||||
title?: string;
|
||||
description?: string;
|
||||
|
@ -105,7 +107,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
|||
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
|
||||
|
||||
} catch (error) {
|
||||
console.error("Error extracting metadata:", error);
|
||||
Logger.error(`Error extracting metadata: ${error}`);
|
||||
}
|
||||
|
||||
return {
|
||||
|
|
|
@ -7,14 +7,20 @@ import pdf from "pdf-parse";
import path from "path";
import os from "os";
import { axiosTimeout } from "../../../lib/timeout";
import { Logger } from "../../../lib/logger";

dotenv.config();

export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
  const content = await processPdfToText(tempFilePath, parsePDF);
  fs.unlinkSync(tempFilePath); // Clean up the temporary file
  return { content, pageStatusCode, pageError };
  try {
    const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
    const content = await processPdfToText(tempFilePath, parsePDF);
    fs.unlinkSync(tempFilePath); // Clean up the temporary file
    return { content, pageStatusCode, pageError };
  } catch (error) {
    Logger.error(`Failed to fetch and process PDF: ${error.message}`);
    return { content: "", pageStatusCode: 500, pageError: error.message };
  }
}

async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {

@ -39,6 +45,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
  let content = "";

  if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
    Logger.debug("Processing pdf document w/ LlamaIndex");
    const apiKey = process.env.LLAMAPARSE_API_KEY;
    const headers = {
      Authorization: `Bearer ${apiKey}`,

@ -69,7 +76,6 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
      let attempt = 0;
      const maxAttempts = 10; // Maximum number of attempts
      let resultAvailable = false;

      while (attempt < maxAttempts && !resultAvailable) {
        try {
          resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });

@ -81,31 +87,54 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
            await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds
          }
        } catch (error) {
          console.error("Error fetching result w/ LlamaIndex");
          Logger.debug("Error fetching result w/ LlamaIndex");
          attempt++;
          if (attempt >= maxAttempts) {
            Logger.error("Max attempts reached, unable to fetch result.");
            break; // Exit the loop if max attempts are reached
          }
          await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
          // You may want to handle specific errors differently
        }
      }

      if (!resultAvailable) {
        content = await processPdf(filePath);
        try {
          content = await processPdf(filePath);
        } catch (error) {
          Logger.error(`Failed to process PDF: ${error}`);
          content = "";
        }
      }
      content = resultResponse.data[resultType];
    } catch (error) {
      console.error("Error processing pdf document w/ LlamaIndex(2)");
      Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
      content = await processPdf(filePath);
    }
  } else if (parsePDF) {
    content = await processPdf(filePath);
    try {
      content = await processPdf(filePath);
    } catch (error) {
      Logger.error(`Failed to process PDF: ${error}`);
      content = "";
    }
  } else {
    content = fs.readFileSync(filePath, "utf-8");
    try {
      content = fs.readFileSync(filePath, "utf-8");
    } catch (error) {
      Logger.error(`Failed to read PDF file: ${error}`);
      content = "";
    }
  }
  return content;
}

async function processPdf(file: string) {
  const fileContent = fs.readFileSync(file);
  const data = await pdf(fileContent);
  return data.text;
  try {
    const fileContent = fs.readFileSync(file);
    const data = await pdf(fileContent);
    return data.text;
  } catch (error) {
    throw error;
  }
}
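For reference, the LlamaParse result fetching above boils down to a bounded polling pattern: try, wait 0.5 seconds, retry up to ten times, then fall back to local pdf-parse. A minimal, self-contained sketch of that pattern follows; the helper name `pollForResult` and the `isReady` callback are illustrative, not part of the diff.

import axios from "axios";

// Hypothetical helper illustrating the bounded polling used for LlamaParse results.
// `resultUrl`, `headers`, and the 500 ms delay mirror the diff; `isReady` is an assumption.
async function pollForResult<T>(
  resultUrl: string,
  headers: Record<string, string>,
  isReady: (data: T) => boolean,
  maxAttempts = 10,
  delayMs = 500
): Promise<T | null> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      const res = await axios.get<T>(resultUrl, { headers, timeout: 10000 });
      if (isReady(res.data)) return res.data; // result is available
    } catch {
      // Transient failure: fall through and retry after the delay.
    }
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  return null; // caller falls back to local pdf-parse, as the diff does
}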
@ -8,7 +8,11 @@ export const removeUnwantedElements = (
) => {
  const soup = cheerio.load(html);

  if (pageOptions.onlyIncludeTags) {
  if (
    pageOptions.onlyIncludeTags &&
    pageOptions.onlyIncludeTags.length > 0 &&
    pageOptions.onlyIncludeTags[0] !== ''
  ) {
    if (typeof pageOptions.onlyIncludeTags === "string") {
      pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
    }

@ -26,7 +30,11 @@ export const removeUnwantedElements = (

  soup("script, style, iframe, noscript, meta, head").remove();

  if (pageOptions.removeTags) {
  if (
    pageOptions.removeTags &&
    pageOptions.removeTags.length > 0 &&
    pageOptions.removeTags[0] !== ''
  ) {
    if (typeof pageOptions.removeTags === "string") {
      pageOptions.removeTags = [pageOptions.removeTags];
    }
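Both hunks apply the same guard: treat a missing, empty, or blank-first-element tag option as "no filtering", and coerce a bare string into an array. A minimal sketch of that normalization as a standalone helper; the name `normalizeTagOption` is illustrative and not part of the diff.

// Hypothetical helper mirroring the guard used for onlyIncludeTags / removeTags.
function normalizeTagOption(option?: string | string[]): string[] {
  if (!option || option.length === 0 || option[0] === "") {
    return []; // nothing to include or remove
  }
  return typeof option === "string" ? [option] : option;
}

// e.g. normalizeTagOption("article") -> ["article"], normalizeTagOption([""]) -> []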
@ -1,3 +1,4 @@
import { Logger } from "../../../lib/logger";
import { Document } from "../../../lib/entities";

export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {

@ -6,13 +7,13 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
      const baseUrl = new URL(document.metadata.sourceURL).origin;
      const paths =
        document.content.match(
          /(!?\[.*?\])\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)|href="([^"]+)"/g
          /!?\[.*?\]\(.*?\)|href=".+?"/g
        ) || [];

      paths.forEach((path: string) => {
        try {
          const isImage = path.startsWith("!");
          let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
          let matchedUrl = path.match(/\((.*?)\)/) || path.match(/href="([^"]+)"/);
          let url = matchedUrl[1];

          if (!url.startsWith("data:") && !url.startsWith("http")) {

@ -39,7 +40,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]

    return documents;
  } catch (error) {
    console.error("Error replacing paths with absolute paths", error);
    Logger.debug(`Error replacing paths with absolute paths: ${error}`);
    return documents;
  }
};

@ -50,11 +51,11 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
      const baseUrl = new URL(document.metadata.sourceURL).origin;
      const images =
        document.content.match(
          /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
          /!\[.*?\]\(.*?\)/g
        ) || [];

      images.forEach((image: string) => {
        let imageUrl = image.match(/\(([^)]+)\)/)[1];
        let imageUrl = image.match(/\((.*?)\)/)[1];
        let altText = image.match(/\[(.*?)\]/)[1];

        if (!imageUrl.startsWith("data:image")) {

@ -78,7 +79,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen

    return documents;
  } catch (error) {
    console.error("Error replacing img paths with absolute paths", error);
    Logger.error(`Error replacing img paths with absolute paths: ${error}`);
    return documents;
  }
};
@ -1,4 +1,7 @@
import axios from "axios";
import * as cheerio from "cheerio";
import { Logger } from "../../../lib/logger";

export async function attemptScrapWithRequests(
  urlToScrap: string

@ -7,13 +10,13 @@ export async function attemptScrapWithRequests(
    const response = await axios.get(urlToScrap, { timeout: 15000 });

    if (!response.data) {
      console.log("Failed normal requests as well");
      Logger.debug("Failed normal requests as well");
      return null;
    }

    return response.data;
  } catch (error) {
    console.error(`Error in attemptScrapWithRequests: ${error}`);
    Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
    return null;
  }
}

@ -21,3 +24,35 @@ export async function attemptScrapWithRequests(
export function sanitizeText(text: string): string {
  return text.replace("\u0000", "");
}

export function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  const links: string[] = [];

  // Parse the base URL to get the origin
  const urlObject = new URL(baseUrl);
  const origin = urlObject.origin;

  $('a').each((_, element) => {
    const href = $(element).attr('href');
    if (href) {
      if (href.startsWith('http://') || href.startsWith('https://')) {
        // Absolute URL, add as is
        links.push(href);
      } else if (href.startsWith('/')) {
        // Relative URL starting with '/', append to origin
        links.push(`${origin}${href}`);
      } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
        // Relative URL not starting with '/', append to base URL
        links.push(`${baseUrl}/${href}`);
      } else if (href.startsWith('mailto:')) {
        // mailto: links, add as is
        links.push(href);
      }
      // Fragment-only links (#) are ignored
    }
  });

  // Remove duplicates and return
  return [...new Set(links)];
}
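A quick usage sketch for the new extractLinks helper, assuming it is imported from the module above; the sample HTML and base URL are invented for illustration.

// Illustrative only: exercises extractLinks with absolute, root-relative,
// relative, mailto:, and fragment-only hrefs.
const html = `
  <a href="https://example.com/a">a</a>
  <a href="/docs">docs</a>
  <a href="pricing">pricing</a>
  <a href="mailto:hi@example.com">mail</a>
  <a href="#top">top</a>
`;
const links = extractLinks(html, "https://example.com");
// Expected: ["https://example.com/a", "https://example.com/docs",
//            "https://example.com/pricing", "mailto:hi@example.com"]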
@ -2,6 +2,7 @@ import axios from 'axios';
import * as cheerio from 'cheerio';
import * as querystring from 'querystring';
import { SearchResult } from '../../src/lib/entities';
import { Logger } from '../../src/lib/logger';

const _useragent_list = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',

@ -96,7 +97,7 @@ export async function google_search(term: string, advanced = false, num_results
        await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
      } catch (error) {
        if (error.message === 'Too many requests') {
          console.warn('Too many requests, breaking the loop');
          Logger.warn('Too many requests, breaking the loop');
          break;
        }
        throw error;

@ -107,7 +108,7 @@ export async function google_search(term: string, advanced = false, num_results
    }
  }
  if (attempts >= maxAttempts) {
    console.warn('Max attempts reached, breaking the loop');
    Logger.warn('Max attempts reached, breaking the loop');
  }
  return results
}
@ -1,3 +1,4 @@
import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";

@ -47,7 +48,7 @@ export async function search({
      timeout
    );
  } catch (error) {
    console.error("Error in search function: ", error);
    Logger.error(`Error in search function: ${error}`);
    return []
  }
  // if process.env.SERPER_API_KEY is set, use serper
63
apps/api/src/services/alerts/index.ts
Normal file
63
apps/api/src/services/alerts/index.ts
Normal file
|
@ -0,0 +1,63 @@
|
|||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getWebScraperQueue } from "../queue-service";
|
||||
import { sendSlackWebhook } from "./slack";
|
||||
|
||||
export async function checkAlerts() {
|
||||
try {
|
||||
if (
|
||||
process.env.SLACK_WEBHOOK_URL &&
|
||||
process.env.ENV === "production" &&
|
||||
process.env.ALERT_NUM_ACTIVE_JOBS &&
|
||||
process.env.ALERT_NUM_WAITING_JOBS
|
||||
) {
|
||||
Logger.info("Initializing alerts");
|
||||
const checkActiveJobs = async () => {
|
||||
try {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const activeJobs = await webScraperQueue.getActiveCount();
|
||||
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
|
||||
Logger.warn(
|
||||
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
|
||||
);
|
||||
sendSlackWebhook(
|
||||
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`,
|
||||
true
|
||||
);
|
||||
} else {
|
||||
Logger.info(
|
||||
`Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to check active jobs: ${error}`);
|
||||
}
|
||||
};
|
||||
|
||||
const checkWaitingQueue = async () => {
|
||||
const webScraperQueue = getWebScraperQueue();
|
||||
const waitingJobs = await webScraperQueue.getWaitingCount();
|
||||
const paused = await webScraperQueue.getPausedCount();
|
||||
|
||||
if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
|
||||
Logger.warn(
|
||||
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
|
||||
);
|
||||
sendSlackWebhook(
|
||||
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}. Scale up the number of workers with fly scale count worker=20`,
|
||||
true
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
const checkAll = async () => {
|
||||
await checkActiveJobs();
|
||||
await checkWaitingQueue();
|
||||
};
|
||||
|
||||
await checkAll();
|
||||
// setInterval(checkAll, 10000); // Run every
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to initialize alerts: ${error}`);
|
||||
}
|
||||
}
|
24 apps/api/src/services/alerts/slack.ts Normal file
@ -0,0 +1,24 @@
import axios from "axios";
import { Logger } from "../../../src/lib/logger";

export async function sendSlackWebhook(
  message: string,
  alertEveryone: boolean = false
) {
  const webhookUrl = process.env.SLACK_WEBHOOK_URL;
  const messagePrefix = alertEveryone ? "<!channel> " : "";
  const payload = {
    text: `${messagePrefix} ${message}`,
  };

  try {
    const response = await axios.post(webhookUrl, payload, {
      headers: {
        "Content-Type": "application/json",
      },
    });
    Logger.log("Webhook sent successfully:", response.data);
  } catch (error) {
    Logger.debug(`Error sending webhook: ${error}`);
  }
}
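For context, checkAlerts in services/alerts/index.ts calls this helper when queue thresholds are exceeded. A minimal usage sketch; the job count and threshold default below are invented for illustration, and SLACK_WEBHOOK_URL must be set in the environment.

// Illustrative only: channel-wide alert when active jobs exceed a threshold.
async function alertIfBusy(activeJobs: number) {
  const threshold = Number(process.env.ALERT_NUM_ACTIVE_JOBS ?? 20); // default is an assumption
  if (activeJobs > threshold) {
    await sendSlackWebhook(
      `Alert: Number of active jobs is over ${threshold}. Current active jobs: ${activeJobs}`,
      true // alertEveryone -> prefixes the message with <!channel>
    );
  }
}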
|
@ -2,9 +2,14 @@ import { NotificationType } from "../../types";
|
|||
import { withAuth } from "../../lib/withAuth";
|
||||
import { sendNotification } from "../notification/email_notification";
|
||||
import { supabase_service } from "../supabase";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getValue, setValue } from "../redis";
|
||||
import { redlock } from "../redlock";
|
||||
|
||||
|
||||
const FREE_CREDITS = 500;
|
||||
|
||||
|
||||
export async function billTeam(team_id: string, credits: number) {
|
||||
return withAuth(supaBillTeam)(team_id, credits);
|
||||
}
|
||||
|
@ -12,27 +17,27 @@ export async function supaBillTeam(team_id: string, credits: number) {
|
|||
if (team_id === "preview") {
|
||||
return { success: true, message: "Preview team, no credits used" };
|
||||
}
|
||||
console.log(`Billing team ${team_id} for ${credits} credits`);
|
||||
Logger.info(`Billing team ${team_id} for ${credits} credits`);
|
||||
// When the API is used, you can log the credit usage in the credit_usage table:
|
||||
// team_id: The ID of the team using the API.
|
||||
// subscription_id: The ID of the team's active subscription.
|
||||
// credits_used: The number of credits consumed by the API call.
|
||||
// created_at: The timestamp of the API usage.
|
||||
|
||||
// 1. get the subscription
|
||||
const { data: subscription } = await supabase_service
|
||||
.from("subscriptions")
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
.eq("status", "active")
|
||||
.single();
|
||||
|
||||
// 2. Check for available coupons
|
||||
const { data: coupons } = await supabase_service
|
||||
.from("coupons")
|
||||
.select("id, credits")
|
||||
.eq("team_id", team_id)
|
||||
.eq("status", "active");
|
||||
// 1. get the subscription and check for available coupons concurrently
|
||||
const [{ data: subscription }, { data: coupons }] = await Promise.all([
|
||||
supabase_service
|
||||
.from("subscriptions")
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
.eq("status", "active")
|
||||
.single(),
|
||||
supabase_service
|
||||
.from("coupons")
|
||||
.select("id, credits")
|
||||
.eq("team_id", team_id)
|
||||
.eq("status", "active"),
|
||||
]);
|
||||
|
||||
let couponCredits = 0;
|
||||
if (coupons && coupons.length > 0) {
|
||||
|
@ -169,21 +174,21 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
return { success: true, message: "Preview team, no credits used" };
|
||||
}
|
||||
|
||||
// Retrieve the team's active subscription
|
||||
const { data: subscription, error: subscriptionError } =
|
||||
await supabase_service
|
||||
.from("subscriptions")
|
||||
.select("id, price_id, current_period_start, current_period_end")
|
||||
.eq("team_id", team_id)
|
||||
.eq("status", "active")
|
||||
.single();
|
||||
|
||||
// Check for available coupons
|
||||
const { data: coupons } = await supabase_service
|
||||
.from("coupons")
|
||||
.select("credits")
|
||||
.eq("team_id", team_id)
|
||||
.eq("status", "active");
|
||||
// Retrieve the team's active subscription and check for available coupons concurrently
|
||||
const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
|
||||
await Promise.all([
|
||||
supabase_service
|
||||
.from("subscriptions")
|
||||
.select("id, price_id, current_period_start, current_period_end")
|
||||
.eq("team_id", team_id)
|
||||
.eq("status", "active")
|
||||
.single(),
|
||||
supabase_service
|
||||
.from("coupons")
|
||||
.select("credits")
|
||||
.eq("team_id", team_id)
|
||||
.eq("status", "active"),
|
||||
]);
|
||||
|
||||
let couponCredits = 0;
|
||||
if (coupons && coupons.length > 0) {
|
||||
|
@ -218,7 +223,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
0
|
||||
);
|
||||
|
||||
console.log("totalCreditsUsed", totalCreditsUsed);
|
||||
Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
|
||||
|
||||
const end = new Date();
|
||||
end.setDate(end.getDate() + 30);
|
||||
|
@ -238,7 +243,6 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
// 5. Compare the total credits used with the credits allowed by the plan.
|
||||
if (totalCreditsUsed + credits > FREE_CREDITS) {
|
||||
// Send email notification for insufficient credits
|
||||
|
||||
await sendNotification(
|
||||
team_id,
|
||||
NotificationType.LIMIT_REACHED,
|
||||
|
@ -254,28 +258,45 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
|
|||
}
|
||||
|
||||
let totalCreditsUsed = 0;
|
||||
const cacheKey = `credit_usage_${subscription.id}_${subscription.current_period_start}_${subscription.current_period_end}_lc`;
|
||||
const redLockKey = `lock_${cacheKey}`;
|
||||
const lockTTL = 10000; // 10 seconds
|
||||
|
||||
try {
|
||||
const { data: creditUsages, error: creditUsageError } =
|
||||
await supabase_service.rpc("get_credit_usage_2", {
|
||||
sub_id: subscription.id,
|
||||
start_time: subscription.current_period_start,
|
||||
end_time: subscription.current_period_end,
|
||||
});
|
||||
const lock = await redlock.acquire([redLockKey], lockTTL);
|
||||
|
||||
if (creditUsageError) {
|
||||
console.error("Error calculating credit usage:", creditUsageError);
|
||||
}
|
||||
try {
|
||||
const cachedCreditUsage = await getValue(cacheKey);
|
||||
|
||||
if (creditUsages && creditUsages.length > 0) {
|
||||
totalCreditsUsed = creditUsages[0].total_credits_used;
|
||||
if (cachedCreditUsage) {
|
||||
totalCreditsUsed = parseInt(cachedCreditUsage);
|
||||
} else {
|
||||
const { data: creditUsages, error: creditUsageError } =
|
||||
await supabase_service.rpc("get_credit_usage_2", {
|
||||
sub_id: subscription.id,
|
||||
start_time: subscription.current_period_start,
|
||||
end_time: subscription.current_period_end,
|
||||
});
|
||||
|
||||
if (creditUsageError) {
|
||||
Logger.error(`Error calculating credit usage: ${creditUsageError}`);
|
||||
}
|
||||
|
||||
if (creditUsages && creditUsages.length > 0) {
|
||||
totalCreditsUsed = creditUsages[0].total_credits_used;
|
||||
await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
|
||||
// Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
await lock.release();
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error calculating credit usage:", error);
|
||||
Logger.error(`Error acquiring lock or calculating credit usage: ${error}`);
|
||||
}
|
||||
|
||||
// Adjust total credits used by subtracting coupon value
|
||||
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
|
||||
|
||||
// Get the price details
|
||||
const { data: price, error: priceError } = await supabase_service
|
||||
.from("prices")
|
||||
|
|
|
@ -1,5 +1,6 @@
import { Request } from "express";
import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";

export async function createIdempotencyKey(
  req: Request,

@ -14,7 +15,7 @@ export async function createIdempotencyKey(
    .insert({ key: idempotencyKey });

  if (error) {
    console.error("Failed to create idempotency key:", error);
    Logger.error(`Failed to create idempotency key: ${error}`);
    throw error;
  }
|
@ -1,6 +1,7 @@
import { Request } from "express";
import { supabase_service } from "../supabase";
import { validate as isUuid } from 'uuid';
import { Logger } from "../../../src/lib/logger";

export async function validateIdempotencyKey(
  req: Request,

@ -13,7 +14,7 @@ export async function validateIdempotencyKey(
  // Ensure idempotencyKey is treated as a string
  const key = Array.isArray(idempotencyKey) ? idempotencyKey[0] : idempotencyKey;
  if (!isUuid(key)) {
    console.error("Invalid idempotency key provided in the request headers.");
    Logger.debug("Invalid idempotency key provided in the request headers.");
    return false;
  }

@ -23,7 +24,7 @@ export async function validateIdempotencyKey(
    .eq("key", idempotencyKey);

  if (error) {
    console.error(error);
    Logger.error(`Error validating idempotency key: ${error}`);
  }

  if (!data || data.length === 0) {
@ -1,9 +1,11 @@
import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";
import "dotenv/config";

export async function logCrawl(job_id: string, team_id: string) {
  try {
    const { data, error } = await supabase_service
  if (process.env.USE_DB_AUTHENTICATION === 'true') {
    try {
      const { data, error } = await supabase_service
        .from("bulljobs_teams")
        .insert([
          {

@ -11,7 +13,8 @@ export async function logCrawl(job_id: string, team_id: string) {
            team_id: team_id,
          },
        ]);
  } catch (error) {
    console.error("Error logging crawl job:\n", error);
    } catch (error) {
      Logger.error(`Error logging crawl job to supabase:\n${error}`);
    }
  }
}
@ -3,11 +3,11 @@ import { supabase_service } from "../supabase";
|
|||
import { FirecrawlJob } from "../../types";
|
||||
import { posthog } from "../posthog";
|
||||
import "dotenv/config";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
export async function logJob(job: FirecrawlJob) {
|
||||
try {
|
||||
// Only log jobs in production
|
||||
if (process.env.ENV !== "production") {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -25,6 +25,7 @@ export async function logJob(job: FirecrawlJob) {
|
|||
.from("firecrawl_jobs")
|
||||
.insert([
|
||||
{
|
||||
job_id: job.job_id ? job.job_id : null,
|
||||
success: job.success,
|
||||
message: job.message,
|
||||
num_docs: job.num_docs,
|
||||
|
@ -38,6 +39,7 @@ export async function logJob(job: FirecrawlJob) {
|
|||
origin: job.origin,
|
||||
extractor_options: job.extractor_options,
|
||||
num_tokens: job.num_tokens,
|
||||
retry: !!job.retry,
|
||||
},
|
||||
]);
|
||||
|
||||
|
@ -61,14 +63,15 @@ export async function logJob(job: FirecrawlJob) {
|
|||
origin: job.origin,
|
||||
extractor_options: job.extractor_options,
|
||||
num_tokens: job.num_tokens,
|
||||
retry: job.retry,
|
||||
},
|
||||
};
|
||||
posthog.capture(phLog);
|
||||
}
|
||||
if (error) {
|
||||
console.error("Error logging job:\n", error);
|
||||
Logger.error(`Error logging job: ${error.message}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error logging job:\n", error);
|
||||
Logger.error(`Error logging job: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,11 +2,16 @@ import "dotenv/config";
|
|||
import { ScrapeLog } from "../../types";
|
||||
import { supabase_service } from "../supabase";
|
||||
import { PageOptions } from "../../lib/entities";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
export async function logScrape(
|
||||
scrapeLog: ScrapeLog,
|
||||
pageOptions?: PageOptions
|
||||
) {
|
||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
||||
Logger.debug("Skipping logging scrape to Supabase");
|
||||
return;
|
||||
}
|
||||
try {
|
||||
// Only log jobs in production
|
||||
// if (process.env.ENV !== "production") {
|
||||
|
@ -32,16 +37,16 @@ export async function logScrape(
|
|||
retried: scrapeLog.retried,
|
||||
error_message: scrapeLog.error_message,
|
||||
date_added: new Date().toISOString(),
|
||||
html: scrapeLog.html,
|
||||
html: "Removed to save db space",
|
||||
ipv4_support: scrapeLog.ipv4_support,
|
||||
ipv6_support: scrapeLog.ipv6_support,
|
||||
},
|
||||
]);
|
||||
|
||||
if (error) {
|
||||
console.error("Error logging proxy:\n", error);
|
||||
Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error logging proxy:\n", error);
|
||||
Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,19 +1,20 @@
|
|||
import { Logtail } from "@logtail/node";
|
||||
import "dotenv/config";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
// A mock Logtail class to handle cases where LOGTAIL_KEY is not provided
|
||||
class MockLogtail {
|
||||
info(message: string, context?: Record<string, any>): void {
|
||||
console.log(message, context);
|
||||
Logger.debug(`${message} - ${context}`);
|
||||
}
|
||||
error(message: string, context: Record<string, any> = {}): void {
|
||||
console.error(message, context);
|
||||
Logger.error(`${message} - ${context}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class
|
||||
// Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided
|
||||
export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => {
|
||||
console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
|
||||
Logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
|
||||
return new MockLogtail();
|
||||
})();
|
||||
|
|
|
@ -2,6 +2,7 @@ import { supabase_service } from "../supabase";
|
|||
import { withAuth } from "../../lib/withAuth";
|
||||
import { Resend } from "resend";
|
||||
import { NotificationType } from "../../types";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
const emailTemplates: Record<
|
||||
NotificationType,
|
||||
|
@ -52,11 +53,11 @@ async function sendEmailNotification(
|
|||
});
|
||||
|
||||
if (error) {
|
||||
console.error("Error sending email: ", error);
|
||||
Logger.debug(`Error sending email: ${error}`);
|
||||
return { success: false };
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error sending email (2): ", error);
|
||||
Logger.debug(`Error sending email (2): ${error}`);
|
||||
return { success: false };
|
||||
}
|
||||
}
|
||||
|
@ -70,7 +71,28 @@ export async function sendNotificationInternal(
|
|||
if (team_id === "preview") {
|
||||
return { success: true };
|
||||
}
|
||||
|
||||
const fifteenDaysAgo = new Date();
|
||||
fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15);
|
||||
|
||||
const { data, error } = await supabase_service
|
||||
.from("user_notifications")
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
.eq("notification_type", notificationType)
|
||||
.gte("sent_date", fifteenDaysAgo.toISOString());
|
||||
|
||||
if (error) {
|
||||
Logger.debug(`Error fetching notifications: ${error}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
if (data.length !== 0) {
|
||||
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} in the last 15 days`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
const { data: recentData, error: recentError } = await supabase_service
|
||||
.from("user_notifications")
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
|
@ -78,14 +100,16 @@ export async function sendNotificationInternal(
|
|||
.gte("sent_date", startDateString)
|
||||
.lte("sent_date", endDateString);
|
||||
|
||||
if (error) {
|
||||
console.error("Error fetching notifications: ", error);
|
||||
if (recentError) {
|
||||
Logger.debug(`Error fetching recent notifications: ${recentError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
if (data.length !== 0) {
|
||||
if (recentData.length !== 0) {
|
||||
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} within the specified date range`);
|
||||
return { success: false };
|
||||
} else {
|
||||
console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`);
|
||||
// get the emails from the user with the team_id
|
||||
const { data: emails, error: emailsError } = await supabase_service
|
||||
.from("users")
|
||||
|
@ -93,7 +117,7 @@ export async function sendNotificationInternal(
|
|||
.eq("team_id", team_id);
|
||||
|
||||
if (emailsError) {
|
||||
console.error("Error fetching emails: ", emailsError);
|
||||
Logger.debug(`Error fetching emails: ${emailsError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
|
@ -112,7 +136,7 @@ export async function sendNotificationInternal(
|
|||
]);
|
||||
|
||||
if (insertError) {
|
||||
console.error("Error inserting notification record: ", insertError);
|
||||
Logger.debug(`Error inserting notification record: ${insertError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import { PostHog } from 'posthog-node';
|
||||
import "dotenv/config";
|
||||
import { Logger } from '../../src/lib/logger';
|
||||
|
||||
export default function PostHogClient() {
|
||||
const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, {
|
||||
|
@ -19,7 +20,7 @@ class MockPostHog {
|
|||
export const posthog = process.env.POSTHOG_API_KEY
|
||||
? PostHogClient()
|
||||
: (() => {
|
||||
console.warn(
|
||||
Logger.warn(
|
||||
"POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more."
|
||||
);
|
||||
return new MockPostHog();
|
||||
|
|
|
@ -1,5 +1,6 @@
import Queue from "bull";
import { Queue as BullQueue } from "bull";
import { Logger } from "../lib/logger";

let webScraperQueue: BullQueue;

@ -7,11 +8,16 @@ export function getWebScraperQueue() {
  if (!webScraperQueue) {
    webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
      settings: {
        lockDuration: 2 * 60 * 60 * 1000, // 2 hours in milliseconds,
        lockRenewTime: 30 * 60 * 1000, // 30 minutes in milliseconds
        lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
        lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
        stalledInterval: 30 * 1000,
        maxStalledCount: 10,
      },
      defaultJobOptions: {
        attempts: 2
      }
    });
    console.log("Web scraper queue created");
    Logger.info("Web scraper queue created");
  }
  return webScraperQueue;
}
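The tighter lock settings above make stalled-job detection much faster: a worker must renew its lock every 15 seconds, and a job whose lock lapses is re-queued (up to maxStalledCount times) instead of sitting locked for hours. A minimal sketch of the same tuning in isolation, assuming Bull v3 and a REDIS_URL environment variable; the queue name is illustrative.

import Queue from "bull";

// Sketch only: same lock/retry tuning as the diff, on a throwaway queue name.
const exampleQueue = new Queue("example-queue", process.env.REDIS_URL, {
  settings: {
    lockDuration: 60 * 1000,    // a worker "owns" a job for 1 minute at a time
    lockRenewTime: 15 * 1000,   // and renews that lock every 15 seconds
    stalledInterval: 30 * 1000, // check for stalled jobs every 30 seconds
    maxStalledCount: 10,        // re-queue a stalled job up to 10 times
  },
  defaultJobOptions: {
    attempts: 2, // failed jobs get one automatic retry
  },
});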
|
@ -6,93 +6,120 @@ import { startWebScraperPipeline } from "../main/runWebScraper";
|
|||
import { callWebhook } from "./webhook";
|
||||
import { logJob } from "./logging/log_job";
|
||||
import { initSDK } from '@hyperdx/node-opentelemetry';
|
||||
import { Job } from "bull";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { ScrapeEvents } from "../lib/scrape-events";
|
||||
|
||||
if(process.env.ENV === 'production') {
|
||||
initSDK({ consoleCapture: true, additionalInstrumentations: []});
|
||||
if (process.env.ENV === 'production') {
|
||||
initSDK({
|
||||
consoleCapture: true,
|
||||
additionalInstrumentations: [],
|
||||
});
|
||||
}
|
||||
|
||||
getWebScraperQueue().process(
|
||||
Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
|
||||
async function (job, done) {
|
||||
const wsq = getWebScraperQueue();
|
||||
|
||||
try {
|
||||
job.progress({
|
||||
current: 1,
|
||||
total: 100,
|
||||
current_step: "SCRAPING",
|
||||
current_url: "",
|
||||
});
|
||||
const start = Date.now();
|
||||
const { success, message, docs } = await startWebScraperPipeline({ job });
|
||||
const end = Date.now();
|
||||
const timeTakenInSeconds = (end - start) / 1000;
|
||||
async function processJob(job: Job, done) {
|
||||
Logger.info(`🐂 Worker taking job ${job.id}`);
|
||||
|
||||
const data = {
|
||||
success: success,
|
||||
result: {
|
||||
links: docs.map((doc) => {
|
||||
return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
|
||||
}),
|
||||
},
|
||||
project_id: job.data.project_id,
|
||||
error: message /* etc... */,
|
||||
};
|
||||
try {
|
||||
job.progress({
|
||||
current: 1,
|
||||
total: 100,
|
||||
current_step: "SCRAPING",
|
||||
current_url: "",
|
||||
});
|
||||
const start = Date.now();
|
||||
const { success, message, docs } = await startWebScraperPipeline({ job });
|
||||
const end = Date.now();
|
||||
const timeTakenInSeconds = (end - start) / 1000;
|
||||
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
const data = {
|
||||
success: success,
|
||||
result: {
|
||||
links: docs.map((doc) => {
|
||||
return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
|
||||
}),
|
||||
},
|
||||
project_id: job.data.project_id,
|
||||
error: message /* etc... */,
|
||||
};
|
||||
|
||||
await logJob({
|
||||
success: success,
|
||||
message: message,
|
||||
num_docs: docs.length,
|
||||
docs: docs,
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
done(null, data);
|
||||
} catch (error) {
|
||||
if (error instanceof CustomError) {
|
||||
// Here we handle the error, then save the failed job
|
||||
console.error(error.message); // or any other error handling
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
|
||||
logtail.error("Custom error while ingesting", {
|
||||
job_id: job.id,
|
||||
error: error.message,
|
||||
dataIngestionJob: error.dataIngestionJob,
|
||||
});
|
||||
}
|
||||
console.log(error);
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: success,
|
||||
message: message,
|
||||
num_docs: docs.length,
|
||||
docs: docs,
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
Logger.info(`🐂 Job done ${job.id}`);
|
||||
done(null, data);
|
||||
} catch (error) {
|
||||
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
|
||||
if (await getWebScraperQueue().isPaused(false)) {
|
||||
Logger.debug("🐂Queue is paused, ignoring");
|
||||
return;
|
||||
}
|
||||
|
||||
logtail.error("Overall error ingesting", {
|
||||
if (error instanceof CustomError) {
|
||||
// Here we handle the error, then save the failed job
|
||||
Logger.error(error.message); // or any other error handling
|
||||
|
||||
logtail.error("Custom error while ingesting", {
|
||||
job_id: job.id,
|
||||
error: error.message,
|
||||
dataIngestionJob: error.dataIngestionJob,
|
||||
});
|
||||
|
||||
const data = {
|
||||
success: false,
|
||||
project_id: job.data.project_id,
|
||||
error:
|
||||
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
|
||||
};
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
await logJob({
|
||||
success: false,
|
||||
message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
done(null, data);
|
||||
}
|
||||
Logger.error(error);
|
||||
|
||||
logtail.error("Overall error ingesting", {
|
||||
job_id: job.id,
|
||||
error: error.message,
|
||||
});
|
||||
|
||||
const data = {
|
||||
success: false,
|
||||
project_id: job.data.project_id,
|
||||
error:
|
||||
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
|
||||
};
|
||||
await callWebhook(job.data.team_id, job.id as string, data);
|
||||
await logJob({
|
||||
job_id: job.id as string,
|
||||
success: false,
|
||||
message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
|
||||
num_docs: 0,
|
||||
docs: [],
|
||||
time_taken: 0,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: job.data.url,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
origin: job.data.origin,
|
||||
});
|
||||
done(null, data);
|
||||
}
|
||||
}
|
||||
|
||||
wsq.process(
|
||||
Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
|
||||
processJob
|
||||
);
|
||||
|
||||
wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
|
||||
wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
|
||||
wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
|
||||
wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||
wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||
wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||
|
|
|
@ -1,48 +1,98 @@
|
|||
import { getRateLimiter, serverRateLimiter, testSuiteRateLimiter, redisClient } from "./rate-limiter";
|
||||
import {
|
||||
getRateLimiter,
|
||||
serverRateLimiter,
|
||||
testSuiteRateLimiter,
|
||||
redisRateLimitClient,
|
||||
} from "./rate-limiter";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
|
||||
describe("Rate Limiter Service", () => {
|
||||
beforeAll(async () => {
|
||||
await redisClient.connect();
|
||||
try {
|
||||
await redisRateLimitClient.connect();
|
||||
// if (process.env.REDIS_RATE_LIMIT_URL === "redis://localhost:6379") {
|
||||
// console.log("Erasing all keys");
|
||||
// // erase all the keys that start with "test-prefix"
|
||||
// const keys = await redisRateLimitClient.keys("test-prefix:*");
|
||||
// if (keys.length > 0) {
|
||||
// await redisRateLimitClient.del(...keys);
|
||||
// }
|
||||
// }
|
||||
} catch (error) {}
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await redisClient.disconnect();
|
||||
try {
|
||||
// if (process.env.REDIS_RATE_LIMIT_URL === "redis://localhost:6379") {
|
||||
await redisRateLimitClient.disconnect();
|
||||
// }
|
||||
} catch (error) {}
|
||||
});
|
||||
|
||||
it("should return the testSuiteRateLimiter for specific tokens", () => {
|
||||
const limiter = getRateLimiter("crawl" as RateLimiterMode, "a01ccae");
|
||||
const limiter = getRateLimiter(
|
||||
"crawl" as RateLimiterMode,
|
||||
"test-prefix:a01ccae"
|
||||
);
|
||||
expect(limiter).toBe(testSuiteRateLimiter);
|
||||
|
||||
const limiter2 = getRateLimiter("scrape" as RateLimiterMode, "6254cf9");
|
||||
const limiter2 = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:6254cf9"
|
||||
);
|
||||
expect(limiter2).toBe(testSuiteRateLimiter);
|
||||
});
|
||||
|
||||
it("should return the serverRateLimiter if mode is not found", () => {
|
||||
const limiter = getRateLimiter("nonexistent" as RateLimiterMode, "someToken");
|
||||
const limiter = getRateLimiter(
|
||||
"nonexistent" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter).toBe(serverRateLimiter);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter based on mode and plan", () => {
|
||||
const limiter = getRateLimiter("crawl" as RateLimiterMode, "someToken", "free");
|
||||
const limiter = getRateLimiter(
|
||||
"crawl" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(2);
|
||||
|
||||
const limiter2 = getRateLimiter("scrape" as RateLimiterMode, "someToken", "standard");
|
||||
const limiter2 = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"standard"
|
||||
);
|
||||
expect(limiter2.points).toBe(50);
|
||||
|
||||
const limiter3 = getRateLimiter("search" as RateLimiterMode, "someToken", "growth");
|
||||
const limiter3 = getRateLimiter(
|
||||
"search" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"growth"
|
||||
);
|
||||
expect(limiter3.points).toBe(500);
|
||||
|
||||
const limiter4 = getRateLimiter("crawlStatus" as RateLimiterMode, "someToken", "growth");
|
||||
const limiter4 = getRateLimiter(
|
||||
"crawlStatus" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"growth"
|
||||
);
|
||||
expect(limiter4.points).toBe(150);
|
||||
});
|
||||
|
||||
it("should return the default rate limiter if plan is not provided", () => {
|
||||
const limiter = getRateLimiter("crawl" as RateLimiterMode, "someToken");
|
||||
const limiter = getRateLimiter(
|
||||
"crawl" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter.points).toBe(3);
|
||||
|
||||
const limiter2 = getRateLimiter("scrape" as RateLimiterMode, "someToken");
|
||||
const limiter2 = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(20);
|
||||
});
|
||||
|
||||
|
@ -50,7 +100,7 @@ describe("Rate Limiter Service", () => {
|
|||
const keyPrefix = "test-prefix";
|
||||
const points = 10;
|
||||
const limiter = new RateLimiterRedis({
|
||||
storeClient: redisClient,
|
||||
storeClient: redisRateLimitClient,
|
||||
keyPrefix,
|
||||
points,
|
||||
duration: 60,
|
||||
|
@ -62,26 +112,253 @@ describe("Rate Limiter Service", () => {
|
|||
});
|
||||
|
||||
it("should return the correct rate limiter for 'preview' mode", () => {
|
||||
const limiter = getRateLimiter("preview" as RateLimiterMode, "someToken", "free");
|
||||
const limiter = getRateLimiter(
|
||||
"preview" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(5);
|
||||
|
||||
const limiter2 = getRateLimiter("preview" as RateLimiterMode, "someToken");
|
||||
const limiter2 = getRateLimiter(
|
||||
"preview" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(5);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'account' mode", () => {
|
||||
const limiter = getRateLimiter("account" as RateLimiterMode, "someToken", "free");
|
||||
const limiter = getRateLimiter(
|
||||
"account" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(100);
|
||||
|
||||
const limiter2 = getRateLimiter("account" as RateLimiterMode, "someToken");
|
||||
const limiter2 = getRateLimiter(
|
||||
"account" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(100);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'crawlStatus' mode", () => {
|
||||
const limiter = getRateLimiter("crawlStatus" as RateLimiterMode, "someToken", "free");
|
||||
const limiter = getRateLimiter(
|
||||
"crawlStatus" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(150);
|
||||
|
||||
const limiter2 = getRateLimiter("crawlStatus" as RateLimiterMode, "someToken");
|
||||
const limiter2 = getRateLimiter(
|
||||
"crawlStatus" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(150);
|
||||
});
|
||||
|
||||
it("should consume points correctly for 'crawl' mode", async () => {
|
||||
const limiter = getRateLimiter(
|
||||
"crawl" as RateLimiterMode,
|
||||
"test-prefix:someTokenCRAWL",
|
||||
"free"
|
||||
);
|
||||
const consumePoints = 1;
|
||||
|
||||
const res = await limiter.consume(
|
||||
"test-prefix:someTokenCRAWL",
|
||||
consumePoints
|
||||
);
|
||||
expect(res.remainingPoints).toBe(1);
|
||||
});
|
||||
|
||||
it("should consume points correctly for 'scrape' mode (DEFAULT)", async () => {
|
||||
const limiter = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:someTokenX"
|
||||
);
|
||||
const consumePoints = 4;
|
||||
|
||||
const res = await limiter.consume("test-prefix:someTokenX", consumePoints);
|
||||
expect(res.remainingPoints).toBe(16);
|
||||
});
|
||||
|
||||
it("should consume points correctly for 'scrape' mode (HOBBY)", async () => {
|
||||
const limiter = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:someTokenXY",
|
||||
"hobby"
|
||||
);
|
||||
// expect hobby to have 100 points
|
||||
expect(limiter.points).toBe(10);
|
||||
|
||||
const consumePoints = 5;
|
||||
|
||||
const res = await limiter.consume("test-prefix:someTokenXY", consumePoints);
|
||||
expect(res.consumedPoints).toBe(5);
|
||||
expect(res.remainingPoints).toBe(5);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'crawl' mode", () => {
|
||||
const limiter = getRateLimiter(
|
||||
"crawl" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(2);
|
||||
|
||||
const limiter2 = getRateLimiter(
|
||||
"crawl" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"starter"
|
||||
);
|
||||
expect(limiter2.points).toBe(3);
|
||||
|
||||
const limiter3 = getRateLimiter(
|
||||
"crawl" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"standard"
|
||||
);
|
||||
expect(limiter3.points).toBe(5);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'scrape' mode", () => {
|
||||
const limiter = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(5);
|
||||
|
||||
const limiter2 = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"starter"
|
||||
);
|
||||
expect(limiter2.points).toBe(20);
|
||||
|
||||
const limiter3 = getRateLimiter(
|
||||
"scrape" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"standard"
|
||||
);
|
||||
expect(limiter3.points).toBe(50);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'search' mode", () => {
|
||||
const limiter = getRateLimiter(
|
||||
"search" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(5);
|
||||
|
||||
const limiter2 = getRateLimiter(
|
||||
"search" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"starter"
|
||||
);
|
||||
expect(limiter2.points).toBe(20);
|
||||
|
||||
const limiter3 = getRateLimiter(
|
||||
"search" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"standard"
|
||||
);
|
||||
expect(limiter3.points).toBe(40);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'preview' mode", () => {
|
||||
const limiter = getRateLimiter(
|
||||
"preview" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(5);
|
||||
|
||||
const limiter2 = getRateLimiter(
|
||||
"preview" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(5);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'account' mode", () => {
|
||||
const limiter = getRateLimiter(
|
||||
"account" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(100);
|
||||
|
||||
const limiter2 = getRateLimiter(
|
||||
"account" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(100);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'crawlStatus' mode", () => {
|
||||
const limiter = getRateLimiter(
|
||||
"crawlStatus" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(150);
|
||||
|
||||
const limiter2 = getRateLimiter(
|
||||
"crawlStatus" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(150);
|
||||
});
|
||||
|
||||
it("should return the correct rate limiter for 'testSuite' mode", () => {
|
||||
const limiter = getRateLimiter(
|
||||
"testSuite" as RateLimiterMode,
|
||||
"test-prefix:someToken",
|
||||
"free"
|
||||
);
|
||||
expect(limiter.points).toBe(10000);
|
||||
|
||||
const limiter2 = getRateLimiter(
|
||||
"testSuite" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
expect(limiter2.points).toBe(10000);
|
||||
});
|
||||
|
||||
it("should throw an error when consuming more points than available", async () => {
|
||||
const limiter = getRateLimiter(
|
||||
"crawl" as RateLimiterMode,
|
||||
"test-prefix:someToken"
|
||||
);
|
||||
const consumePoints = limiter.points + 1;
|
||||
|
||||
try {
|
||||
await limiter.consume("test-prefix:someToken", consumePoints);
|
||||
} catch (error) {
|
||||
// expect remaining points to be 0
|
||||
const res = await limiter.get("test-prefix:someToken");
|
||||
expect(res.remainingPoints).toBe(0);
|
||||
}
|
||||
});
|
||||
|
||||
it("should reset points after duration", async () => {
|
||||
const keyPrefix = "test-prefix";
|
||||
const points = 10;
|
||||
const duration = 1; // 1 second
|
||||
const limiter = new RateLimiterRedis({
|
||||
storeClient: redisRateLimitClient,
|
||||
keyPrefix,
|
||||
points,
|
||||
duration,
|
||||
});
|
||||
|
||||
const consumePoints = 5;
|
||||
await limiter.consume("test-prefix:someToken", consumePoints);
|
||||
await new Promise((resolve) => setTimeout(resolve, duration * 1000 + 100)); // Wait for duration + 100ms
|
||||
|
||||
const res = await limiter.consume("test-prefix:someToken", consumePoints);
|
||||
expect(res.remainingPoints).toBe(points - consumePoints);
|
||||
});
|
||||
});
|
||||
|
|
|
@ -1,6 +1,6 @@
import { RateLimiterRedis } from "rate-limiter-flexible";
import * as redis from "redis";
import { RateLimiterMode } from "../../src/types";
import Redis from "ioredis";

const RATE_LIMITS = {
  crawl: {

@ -9,7 +9,7 @@ const RATE_LIMITS = {
    starter: 3,
    standard: 5,
    standardOld: 40,
    scale: 20,
    scale: 50,
    hobby: 3,
    standardNew: 10,
    standardnew: 10,

@ -21,7 +21,7 @@ const RATE_LIMITS = {
    starter: 20,
    standard: 50,
    standardOld: 40,
    scale: 50,
    scale: 500,
    hobby: 10,
    standardNew: 50,
    standardnew: 50,

@ -33,7 +33,7 @@ const RATE_LIMITS = {
    starter: 20,
    standard: 40,
    standardOld: 40,
    scale: 50,
    scale: 500,
    hobby: 10,
    standardNew: 50,
    standardnew: 50,

@ -57,14 +57,13 @@ const RATE_LIMITS = {
  },
};

export const redisClient = redis.createClient({
  url: process.env.REDIS_URL,
  legacyMode: true,
});
export const redisRateLimitClient = new Redis(
  process.env.REDIS_RATE_LIMIT_URL
)

const createRateLimiter = (keyPrefix, points) =>
  new RateLimiterRedis({
    storeClient: redisClient,
    storeClient: redisRateLimitClient,
    keyPrefix,
    points,
    duration: 60, // Duration in seconds

@ -76,7 +75,7 @@ export const serverRateLimiter = createRateLimiter(
);

export const testSuiteRateLimiter = new RateLimiterRedis({
  storeClient: redisClient,
  storeClient: redisRateLimitClient,
  keyPrefix: "test-suite",
  points: 10000,
  duration: 60, // Duration in seconds
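The switch from node-redis in legacyMode to ioredis works because rate-limiter-flexible's RateLimiterRedis accepts an ioredis client directly as storeClient. A minimal sketch of creating and consuming a limiter against the new client; the key prefix and point values below are illustrative.

import Redis from "ioredis";
import { RateLimiterRedis } from "rate-limiter-flexible";

// Sketch only: a 20-requests-per-minute limiter backed by ioredis.
const client = new Redis(process.env.REDIS_RATE_LIMIT_URL);
const limiter = new RateLimiterRedis({
  storeClient: client,
  keyPrefix: "example",
  points: 20,   // allowed requests
  duration: 60, // per 60 seconds
});

async function handleRequest(token: string) {
  try {
    const res = await limiter.consume(token, 1); // throws when points are exhausted
    console.log(`remaining points: ${res.remainingPoints}`);
  } catch {
    console.log("rate limited: try again later");
  }
}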
@ -1,32 +1,31 @@
|
|||
import Redis from "ioredis";
|
||||
|
||||
// Initialize Redis client
|
||||
const redis = new Redis(process.env.REDIS_URL);
|
||||
import { redisRateLimitClient } from "./rate-limiter";
|
||||
import { Logger } from "../lib/logger";
|
||||
|
||||
// Listen to 'error' events to the Redis connection
|
||||
redis.on("error", (error) => {
|
||||
redisRateLimitClient.on("error", (error) => {
|
||||
try {
|
||||
if (error.message === "ECONNRESET") {
|
||||
console.log("Connection to Redis Session Store timed out.");
|
||||
Logger.error("Connection to Redis Session Rate Limit Store timed out.");
|
||||
} else if (error.message === "ECONNREFUSED") {
|
||||
console.log("Connection to Redis Session Store refused!");
|
||||
} else console.log(error);
|
||||
Logger.error("Connection to Redis Session Rate Limit Store refused!");
|
||||
} else Logger.error(error);
|
||||
} catch (error) {}
|
||||
});
|
||||
|
||||
// Listen to 'reconnecting' event to Redis
|
||||
redis.on("reconnecting", (err) => {
|
||||
redisRateLimitClient.on("reconnecting", (err) => {
|
||||
try {
|
||||
if (redis.status === "reconnecting")
|
||||
console.log("Reconnecting to Redis Session Store...");
|
||||
else console.log("Error reconnecting to Redis Session Store.");
|
||||
if (redisRateLimitClient.status === "reconnecting")
|
||||
Logger.info("Reconnecting to Redis Session Rate Limit Store...");
|
||||
else Logger.error("Error reconnecting to Redis Session Rate Limit Store.");
|
||||
} catch (error) {}
|
||||
});
|
||||
|
||||
// Listen to the 'connect' event to Redis
|
||||
redis.on("connect", (err) => {
|
||||
redisRateLimitClient.on("connect", (err) => {
|
||||
try {
|
||||
if (!err) console.log("Connected to Redis Session Store!");
|
||||
if (!err) Logger.info("Connected to Redis Session Rate Limit Store!");
|
||||
} catch (error) {}
|
||||
});
|
||||
|
||||
|
@ -38,9 +37,9 @@ redis.on("connect", (err) => {
|
|||
*/
|
||||
const setValue = async (key: string, value: string, expire?: number) => {
|
||||
if (expire) {
|
||||
await redis.set(key, value, "EX", expire);
|
||||
await redisRateLimitClient.set(key, value, "EX", expire);
|
||||
} else {
|
||||
await redis.set(key, value);
|
||||
await redisRateLimitClient.set(key, value);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -50,7 +49,7 @@ const setValue = async (key: string, value: string, expire?: number) => {
|
|||
* @returns {Promise<string|null>} The value, if found, otherwise null.
|
||||
*/
|
||||
const getValue = async (key: string): Promise<string | null> => {
|
||||
const value = await redis.get(key);
|
||||
const value = await redisRateLimitClient.get(key);
|
||||
return value;
|
||||
};
|
||||
|
||||
|
@ -59,7 +58,7 @@ const getValue = async (key: string): Promise<string | null> => {
|
|||
* @param {string} key The key to delete.
|
||||
*/
|
||||
const deleteKey = async (key: string) => {
|
||||
await redis.del(key);
|
||||
await redisRateLimitClient.del(key);
|
||||
};
|
||||
|
||||
export { setValue, getValue, deleteKey };
|
||||
|
|
29 apps/api/src/services/redlock.ts Normal file
@ -0,0 +1,29 @@
import Redlock from "redlock";
import Client from "ioredis";

export const redlock = new Redlock(
  // You should have one client for each independent redis node
  // or cluster.
  [new Client(process.env.REDIS_RATE_LIMIT_URL)],
  {
    // The expected clock drift; for more details see:
    // http://redis.io/topics/distlock
    driftFactor: 0.01, // multiplied by lock ttl to determine drift time

    // The max number of times Redlock will attempt to lock a resource
    // before erroring.
    retryCount: 5,

    // the time in ms between attempts
    retryDelay: 100, // time in ms

    // the max time in ms randomly added to retries
    // to improve performance under high contention
    // see https://www.awsarchitectureblog.com/2015/03/backoff.html
    retryJitter: 200, // time in ms

    // The minimum remaining time on a lock before an extension is automatically
    // attempted with the `using` API.
    automaticExtensionThreshold: 500, // time in ms
  }
);
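The credit-check code earlier in this diff acquires this lock around its cached credit-usage lookup so that only one process recomputes the cache at a time. A minimal sketch of that acquire/release pattern; the key name, TTL, and `refresh` callback are illustrative.

// Sketch only: serialize a cache refresh behind a 10-second distributed lock.
async function refreshWithLock(cacheKey: string, refresh: () => Promise<void>) {
  const lock = await redlock.acquire([`lock_${cacheKey}`], 10000); // 10s TTL
  try {
    await refresh(); // only one worker runs this at a time
  } finally {
    await lock.release(); // always release, even if refresh throws
  }
}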
@@ -1,4 +1,5 @@
 import { createClient, SupabaseClient } from "@supabase/supabase-js";
+import { Logger } from "../lib/logger";

 // SupabaseService class initializes the Supabase client conditionally based on environment variables.
 class SupabaseService {

@@ -10,13 +11,13 @@ class SupabaseService {
     // Only initialize the Supabase client if both URL and Service Token are provided.
     if (process.env.USE_DB_AUTHENTICATION === "false") {
       // Warn the user that Authentication is disabled by setting the client to null
-      console.warn(
-        "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"
+      Logger.warn(
+        "Authentication is disabled. Supabase client will not be initialized."
       );
       this.client = null;
     } else if (!supabaseUrl || !supabaseServiceToken) {
-      console.error(
-        "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"
+      Logger.error(
+        "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable"
       );
     } else {
       this.client = createClient(supabaseUrl, supabaseServiceToken);

@@ -38,9 +39,6 @@ export const supabase_service: SupabaseClient = new Proxy(
     const client = target.getClient();
     // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
     if (client === null) {
-      console.error(
-        "Attempted to access Supabase client when it's not configured."
-      );
       return () => {
         throw new Error("Supabase client is not configured.");
       };

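Aside from the logging swap, the interesting part of this file is the `Proxy` wrapper visible in the last hunk: the module can always export a typed `supabase_service`, but any property access on an unconfigured client resolves to a function that throws. A stripped-down sketch of that guard pattern (the types and names here are illustrative, not the project's actual ones):

```typescript
// Minimal sketch of the guard pattern: export something that type-checks as a
// client, but fails loudly at the first call site if it was never configured.
function guardedClient<T extends object>(real: T | null, name: string): T {
  return new Proxy({} as T, {
    get(_target, prop, receiver) {
      if (real === null) {
        // Returning a function means misuse surfaces when the method is invoked,
        // with a clearer message than "cannot read properties of null".
        return () => {
          throw new Error(`${name} client is not configured.`);
        };
      }
      return Reflect.get(real, prop, receiver);
    },
  });
}
```
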
@@ -1,8 +1,9 @@
+import { Logger } from "../../src/lib/logger";
 import { supabase_service } from "./supabase";

 export const callWebhook = async (teamId: string, jobId: string,data: any) => {
   try {
-    const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
+    const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace("{{JOB_ID}}", jobId);
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
     let webhookUrl = selfHostedUrl;

@@ -15,10 +16,7 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
       .eq("team_id", teamId)
       .limit(1);
     if (error) {
-      console.error(
-        `Error fetching webhook URL for team ID: ${teamId}`,
-        error.message
-      );
+      Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`);
       return null;
     }

@@ -53,9 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
       }),
     });
   } catch (error) {
-    console.error(
-      `Error sending webhook for team ID: ${teamId}`,
-      error.message
-    );
+    Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`);
   }
 };

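The new `?.replace("{{JOB_ID}}", jobId)` means a self-hosted webhook URL may now contain a `{{JOB_ID}}` placeholder that is filled in per job. A small sketch of the behavior (the URL value below is illustrative):

```typescript
// Stand-in for process.env.SELF_HOSTED_WEBHOOK_URL; the placeholder is optional.
const template: string | undefined = "https://hooks.example.com/firecrawl/{{JOB_ID}}";
const jobId = "abc123";

const webhookUrl = template?.replace("{{JOB_ID}}", jobId);
// -> "https://hooks.example.com/firecrawl/abc123"
// If the env var were unset, optional chaining leaves webhookUrl undefined and the
// surrounding code falls back to the webhook URL stored for the team in the database.
```
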
@@ -1,2 +1,4 @@
 export const errorNoResults =
   "No results found, please check the URL or contact us at help@mendable.ai to file a ticket.";
+
+export const clientSideError = "client-side exception has occurred"

@@ -48,6 +48,7 @@ export interface RunWebScraperResult {
 }

 export interface FirecrawlJob {
+  job_id?: string;
   success: boolean;
   message: string;
   num_docs: number;

@@ -61,6 +62,7 @@ export interface FirecrawlJob {
   origin: string;
   extractor_options?: ExtractorOptions,
   num_tokens?: number,
+  retry?: boolean,
 }

 export interface FirecrawlScrapeResponse {

25  apps/go-sdk/examples/.gitignore  vendored  Normal file

@@ -0,0 +1,25 @@
# If you prefer the allow list template instead of the deny list, see community template:
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
#
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, built with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

# Dependency directories (remove the comment below to include it)
# vendor/

# Go workspace file
go.work
go.work.sum

# env file
.env

21  apps/go-sdk/examples/LICENSE  Normal file

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Mendable

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

87  apps/go-sdk/examples/example.go  Normal file

@@ -0,0 +1,87 @@
package main

import (
    "encoding/json"
    "fmt"
    "log"

    "github.com/google/uuid"
    "github.com/mendableai/firecrawl-go"
)

func main() {
    app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
    if err != nil {
        log.Fatalf("Failed to create FirecrawlApp: %v", err)
    }

    // Scrape a website
    scrapeResult, err := app.ScrapeURL("firecrawl.dev", nil)
    if err != nil {
        log.Fatalf("Failed to scrape URL: %v", err)
    }
    fmt.Println(scrapeResult.Markdown)

    // Crawl a website
    idempotencyKey := uuid.New().String() // optional idempotency key
    crawlParams := map[string]any{
        "crawlerOptions": map[string]any{
            "excludes": []string{"blog/*"},
        },
    }
    crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
    if err != nil {
        log.Fatalf("Failed to crawl URL: %v", err)
    }
    jsonCrawlResult, err := json.MarshalIndent(crawlResult, "", "  ")
    if err != nil {
        log.Fatalf("Failed to marshal crawl result: %v", err)
    }
    fmt.Println(string(jsonCrawlResult))

    // LLM Extraction using JSON schema
    jsonSchema := map[string]any{
        "type": "object",
        "properties": map[string]any{
            "top": map[string]any{
                "type": "array",
                "items": map[string]any{
                    "type": "object",
                    "properties": map[string]any{
                        "title":       map[string]string{"type": "string"},
                        "points":      map[string]string{"type": "number"},
                        "by":          map[string]string{"type": "string"},
                        "commentsURL": map[string]string{"type": "string"},
                    },
                    "required": []string{"title", "points", "by", "commentsURL"},
                },
                "minItems":    5,
                "maxItems":    5,
                "description": "Top 5 stories on Hacker News",
            },
        },
        "required": []string{"top"},
    }

    llmExtractionParams := map[string]any{
        "extractorOptions": firecrawl.ExtractorOptions{
            ExtractionSchema: jsonSchema,
            Mode:             "llm-extraction",
        },
        "pageOptions": map[string]any{
            "onlyMainContent": true,
        },
    }

    llmExtractionResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
    if err != nil {
        log.Fatalf("Failed to perform LLM extraction: %v", err)
    }

    // Pretty print the LLM extraction result
    jsonResult, err := json.MarshalIndent(llmExtractionResult.LLMExtraction, "", "  ")
    if err != nil {
        log.Fatalf("Failed to marshal LLM extraction result: %v", err)
    }
    fmt.Println(string(jsonResult))
}

9  apps/go-sdk/examples/go.mod  Normal file

@@ -0,0 +1,9 @@
module github.com/mendableai/firecrawl-go-examples

go 1.22.5

replace github.com/mendableai/firecrawl => ../

require github.com/google/uuid v1.6.0

require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect

14  apps/go-sdk/examples/go.sum  Normal file

@@ -0,0 +1,14 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

2  apps/go-sdk/firecrawl/.env.example  Normal file

@@ -0,0 +1,2 @@
API_URL=http://localhost:3002
TEST_API_KEY=fc-YOUR-API-KEY

2  apps/go-sdk/firecrawl/.gitignore  vendored  Normal file

@@ -0,0 +1,2 @@
.env
vendor

21  apps/go-sdk/firecrawl/LICENSE  Normal file

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Sideguide Technologies Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

189  apps/go-sdk/firecrawl/README.md  Normal file

@@ -0,0 +1,189 @@
# Firecrawl Go SDK

The Firecrawl Go SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.

## Installation

To install the Firecrawl Go SDK, use `go get`:

```bash
go get github.com/mendableai/firecrawl
```

## Usage

1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` struct.

Here's an example of how to use the SDK with error handling:
```go
import (
    "fmt"
    "log"

    "github.com/mendableai/firecrawl/firecrawl"
)

func main() {
    // Initialize the FirecrawlApp with your API key
    app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY")
    if err != nil {
        log.Fatalf("Failed to initialize FirecrawlApp: %v", err)
    }

    // Scrape a single URL
    url := "https://mendable.ai"
    scrapedData, err := app.ScrapeURL(url, nil)
    if err != nil {
        log.Fatalf("Error occurred while scraping: %v", err)
    }
    fmt.Println(scrapedData)

    // Crawl a website
    crawlUrl := "https://mendable.ai"
    params := map[string]any{
        "pageOptions": map[string]any{
            "onlyMainContent": true,
        },
    }

    crawlResult, err := app.CrawlURL(crawlUrl, params)
    if err != nil {
        log.Fatalf("Error occurred while crawling: %v", err)
    }
    fmt.Println(crawlResult)
}
```

### Scraping a URL

To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL as a parameter and returns the scraped data.
```go
url := "https://mendable.ai"
scrapedData, err := app.ScrapeURL(url, nil)
if err != nil {
    log.Fatalf("Failed to scrape URL: %v", err)
}
fmt.Println(scrapedData)
```

### Extracting structured data from a URL

With LLM extraction, you can easily extract structured data from any URL. Here is how to use it:
```go
jsonSchema := map[string]any{
    "type": "object",
    "properties": map[string]any{
        "top": map[string]any{
            "type": "array",
            "items": map[string]any{
                "type": "object",
                "properties": map[string]any{
                    "title":       map[string]string{"type": "string"},
                    "points":      map[string]string{"type": "number"},
                    "by":          map[string]string{"type": "string"},
                    "commentsURL": map[string]string{"type": "string"},
                },
                "required": []string{"title", "points", "by", "commentsURL"},
            },
            "minItems":    5,
            "maxItems":    5,
            "description": "Top 5 stories on Hacker News",
        },
    },
    "required": []string{"top"},
}

llmExtractionParams := map[string]any{
    "extractorOptions": firecrawl.ExtractorOptions{
        ExtractionSchema: jsonSchema,
    },
}

scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
if err != nil {
    log.Fatalf("Failed to perform LLM extraction: %v", err)
}
fmt.Println(scrapeResult)
```

### Search for a query

To search the web, get the most relevant results, scrape each page, and return the markdown, use the `Search` method. The method takes the query as a parameter and returns the search results.
```go
query := "what is mendable?"
searchResult, err := app.Search(query)
if err != nil {
    log.Fatalf("Failed to search: %v", err)
}
fmt.Println(searchResult)
```

### Crawling a Website

To crawl a website, use the `CrawlURL` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
```go
crawlParams := map[string]any{
    "crawlerOptions": map[string]any{
        "excludes": []string{"blog/*"},
        "includes": []string{}, // leave empty for all pages
        "limit":    1000,
    },
    "pageOptions": map[string]any{
        "onlyMainContent": true,
    },
}
crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
if err != nil {
    log.Fatalf("Failed to crawl URL: %v", err)
}
fmt.Println(crawlResult)
```

### Checking Crawl Status

To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.

```go
status, err := app.CheckCrawlStatus(jobId)
if err != nil {
    log.Fatalf("Failed to check crawl status: %v", err)
}
fmt.Println(status)
```

### Canceling a Crawl Job

To cancel a crawl job, use the `CancelCrawlJob` method. It takes the job ID as a parameter and returns the cancellation status of the crawl job.

```go
canceled, err := app.CancelCrawlJob(jobId)
if err != nil {
    log.Fatalf("Failed to cancel crawl job: %v", err)
}
fmt.Println(canceled)
```

## Error Handling

The SDK surfaces errors returned by the Firecrawl API as Go `error` values. If an error occurs during a request, the method returns an error with a descriptive message.

## Contributing

Contributions to the Firecrawl Go SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.

## License

The Firecrawl Go SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:

- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.