mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
fix(go-sdk): submodules
This commit is contained in:
parent
96e91ab9ec
commit
1baba3ce0a
8
.gitmodules
vendored
8
.gitmodules
vendored
|
@ -1,6 +1,6 @@
|
|||
[submodule "apps/go-sdk/firecrawl"]
|
||||
path = apps/go-sdk/firecrawl
|
||||
[submodule "apps/go-sdk/firecrawl-go"]
|
||||
path = apps/go-sdk/firecrawl-go
|
||||
url = https://github.com/mendableai/firecrawl-go
|
||||
[submodule "apps/go-sdk/examples"]
|
||||
path = apps/go-sdk/examples
|
||||
[submodule "apps/go-sdk/firecrawl-go-examples"]
|
||||
path = apps/go-sdk/firecrawl-go-examples
|
||||
url = https://github.com/mendableai/firecrawl-go-examples
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
### Scrape Website
|
||||
POST http://localhost:3002/v0/scrape HTTP/1.1
|
||||
Authorization: Bearer fc
|
||||
Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"url":"firecrawl.dev"
|
||||
"url":"corterix.com"
|
||||
}
|
||||
|
||||
### Check Job Status
|
||||
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
|
||||
Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673
|
||||
|
||||
|
||||
### Check Job Status
|
||||
GET http://localhost:3002/v0/jobs/active HTTP/1.1
|
||||
|
|
25
apps/go-sdk/examples/.gitignore
vendored
25
apps/go-sdk/examples/.gitignore
vendored
|
@ -1,25 +0,0 @@
|
|||
# If you prefer the allow list template instead of the deny list, see community template:
|
||||
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
|
||||
#
|
||||
# Binaries for programs and plugins
|
||||
*.exe
|
||||
*.exe~
|
||||
*.dll
|
||||
*.so
|
||||
*.dylib
|
||||
|
||||
# Test binary, built with `go test -c`
|
||||
*.test
|
||||
|
||||
# Output of the go coverage tool, specifically when used with LiteIDE
|
||||
*.out
|
||||
|
||||
# Dependency directories (remove the comment below to include it)
|
||||
# vendor/
|
||||
|
||||
# Go workspace file
|
||||
go.work
|
||||
go.work.sum
|
||||
|
||||
# env file
|
||||
.env
|
|
@ -1,21 +0,0 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2024 Mendable
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -1,87 +0,0 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/mendableai/firecrawl-go"
|
||||
)
|
||||
|
||||
func main() {
|
||||
app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to create FirecrawlApp: %v", err)
|
||||
}
|
||||
|
||||
// Scrape a website
|
||||
scrapeResult, err := app.ScrapeURL("firecrawl.dev", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to scrape URL: %v", err)
|
||||
}
|
||||
fmt.Println(scrapeResult.Markdown)
|
||||
|
||||
// Crawl a website
|
||||
idempotencyKey := uuid.New().String() // optional idempotency key
|
||||
crawlParams := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
},
|
||||
}
|
||||
crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to crawl URL: %v", err)
|
||||
}
|
||||
jsonCrawlResult, err := json.MarshalIndent(crawlResult, "", " ")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to marshal crawl result: %v", err)
|
||||
}
|
||||
fmt.Println(string(jsonCrawlResult))
|
||||
|
||||
// LLM Extraction using JSON schema
|
||||
jsonSchema := map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"top": map[string]any{
|
||||
"type": "array",
|
||||
"items": map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"title": map[string]string{"type": "string"},
|
||||
"points": map[string]string{"type": "number"},
|
||||
"by": map[string]string{"type": "string"},
|
||||
"commentsURL": map[string]string{"type": "string"},
|
||||
},
|
||||
"required": []string{"title", "points", "by", "commentsURL"},
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News",
|
||||
},
|
||||
},
|
||||
"required": []string{"top"},
|
||||
}
|
||||
|
||||
llmExtractionParams := map[string]any{
|
||||
"extractorOptions": firecrawl.ExtractorOptions{
|
||||
ExtractionSchema: jsonSchema,
|
||||
Mode: "llm-extraction",
|
||||
},
|
||||
"pageOptions": map[string]any{
|
||||
"onlyMainContent": true,
|
||||
},
|
||||
}
|
||||
|
||||
llmExtractionResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to perform LLM extraction: %v", err)
|
||||
}
|
||||
|
||||
// Pretty print the LLM extraction result
|
||||
jsonResult, err := json.MarshalIndent(llmExtractionResult.LLMExtraction, "", " ")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to marshal LLM extraction result: %v", err)
|
||||
}
|
||||
fmt.Println(string(jsonResult))
|
||||
}
|
|
@ -1,9 +0,0 @@
|
|||
module github.com/mendableai/firecrawl-go-examples
|
||||
|
||||
go 1.22.5
|
||||
|
||||
replace github.com/mendableai/firecrawl => ../
|
||||
|
||||
require github.com/google/uuid v1.6.0
|
||||
|
||||
require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect
|
|
@ -1,14 +0,0 @@
|
|||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
|
||||
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
|
||||
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
|
||||
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
|
@ -1,2 +0,0 @@
|
|||
API_URL=http://localhost:3002
|
||||
TEST_API_KEY=fc-YOUR-API-KEY
|
2
apps/go-sdk/firecrawl/.gitignore
vendored
2
apps/go-sdk/firecrawl/.gitignore
vendored
|
@ -1,2 +0,0 @@
|
|||
.env
|
||||
vendor
|
|
@ -1,21 +0,0 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2024 Sideguide Technologies Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -1,189 +0,0 @@
|
|||
# Firecrawl Go SDK
|
||||
|
||||
The Firecrawl Go SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
|
||||
|
||||
## Installation
|
||||
|
||||
To install the Firecrawl Go SDK, run:
|
||||
|
||||
```bash
|
||||
go get github.com/mendableai/firecrawl-go
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
|
||||
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as the first argument to `NewFirecrawlApp`.
|
||||
|
||||
|
||||
Here's an example of how to use the SDK with error handling:
|
||||
|
||||
```go
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/mendableai/firecrawl-go"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Initialize the FirecrawlApp with your API key
|
||||
app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY", "https://api.firecrawl.dev")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to initialize FirecrawlApp: %v", err)
|
||||
}
|
||||
|
||||
// Scrape a single URL
|
||||
url := "https://mendable.ai"
|
||||
scrapedData, err := app.ScrapeURL(url, nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Error occurred while scraping: %v", err)
|
||||
}
|
||||
fmt.Println(scrapedData)
|
||||
|
||||
// Crawl a website
|
||||
crawlUrl := "https://mendable.ai"
|
||||
params := map[string]any{
|
||||
"pageOptions": map[string]any{
|
||||
"onlyMainContent": true,
|
||||
},
|
||||
}
|
||||
|
||||
crawlResult, err := app.CrawlURL(crawlUrl, params, true, 2, "")
|
||||
if err != nil {
|
||||
log.Fatalf("Error occurred while crawling: %v", err)
|
||||
}
|
||||
fmt.Println(crawlResult)
|
||||
}
|
||||
```
|
||||
|
||||
### Scraping a URL
|
||||
|
||||
To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL and an optional parameter map, and returns the scraped data as a `*FirecrawlDocument`.
|
||||
|
||||
```go
|
||||
url := "https://mendable.ai"
|
||||
scrapedData, err := app.ScrapeURL(url, nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to scrape URL: %v", err)
|
||||
}
|
||||
fmt.Println(scrapedData)
|
||||
```
|
||||
|
||||
### Extracting structured data from a URL
|
||||
|
||||
With LLM extraction, you can easily extract structured data from any URL. Here is how to use it:
|
||||
|
||||
```go
|
||||
jsonSchema := map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"top": map[string]any{
|
||||
"type": "array",
|
||||
"items": map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"title": map[string]string{"type": "string"},
|
||||
"points": map[string]string{"type": "number"},
|
||||
"by": map[string]string{"type": "string"},
|
||||
"commentsURL": map[string]string{"type": "string"},
|
||||
},
|
||||
"required": []string{"title", "points", "by", "commentsURL"},
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News",
|
||||
},
|
||||
},
|
||||
"required": []string{"top"},
|
||||
}
|
||||
|
||||
llmExtractionParams := map[string]any{
|
||||
"extractorOptions": firecrawl.ExtractorOptions{
|
||||
ExtractionSchema: jsonSchema,
|
||||
},
|
||||
}
|
||||
|
||||
scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to perform LLM extraction: %v", err)
|
||||
}
|
||||
fmt.Println(scrapeResult)
|
||||
```
|
||||
|
||||
### Search for a query
|
||||
|
||||
To search the web, get the most relevant results, scrape each page, and return the markdown, use the `Search` method. The method takes the query and an optional parameter map, and returns the search results.
|
||||
|
||||
|
||||
```go
|
||||
query := "what is mendable?"
|
||||
searchResult, err := app.Search(query, nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to search: %v", err)
|
||||
}
|
||||
fmt.Println(searchResult)
|
||||
```
|
||||
|
||||
### Crawling a Website
|
||||
|
||||
To crawl a website, use the `CrawlURL` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
|
||||
|
||||
```go
|
||||
crawlParams := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
"includes": []string{}, // leave empty for all pages
|
||||
"limit": 1000,
|
||||
},
|
||||
"pageOptions": map[string]any{
|
||||
"onlyMainContent": true,
|
||||
},
|
||||
}
|
||||
crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to crawl URL: %v", err)
|
||||
}
|
||||
fmt.Println(crawlResult)
|
||||
```
|
||||
|
||||
### Checking Crawl Status
|
||||
|
||||
To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
|
||||
|
||||
```go
|
||||
status, err := app.CheckCrawlStatus(jobId)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to check crawl status: %v", err)
|
||||
}
|
||||
fmt.Println(status)
|
||||
```
|
||||
|
||||
### Canceling a Crawl Job
|
||||
To cancel a crawl job, use the `CancelCrawlJob` method. It takes the job ID as a parameter and returns the cancellation status of the crawl job.
|
||||
|
||||
```go
|
||||
canceled, err := app.CancelCrawlJob(jobId)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to cancel crawl job: %v", err)
|
||||
}
|
||||
fmt.Println(canceled)
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The SDK surfaces errors returned by the Firecrawl API as Go `error` values. If a request fails, the method returns a non-nil error with a descriptive message.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions to the Firecrawl Go SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
|
||||
|
||||
## License
|
||||
|
||||
The Firecrawl Go SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:
|
||||
|
||||
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.
|
|
@ -1,584 +0,0 @@
|
|||
// Package firecrawl provides a client for interacting with the Firecrawl API.
|
||||
package firecrawl
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FirecrawlDocumentMetadata holds the metadata attached to a Firecrawl
// document. Every field is optional on the wire: zero values are omitted when
// the struct is encoded to JSON.
type FirecrawlDocumentMetadata struct {
	Title             string   `json:"title,omitempty"`
	Description       string   `json:"description,omitempty"`
	Language          string   `json:"language,omitempty"`
	Keywords          string   `json:"keywords,omitempty"`
	Robots            string   `json:"robots,omitempty"`
	OGTitle           string   `json:"ogTitle,omitempty"`
	OGDescription     string   `json:"ogDescription,omitempty"`
	OGURL             string   `json:"ogUrl,omitempty"`
	OGImage           string   `json:"ogImage,omitempty"`
	OGAudio           string   `json:"ogAudio,omitempty"`
	OGDeterminer      string   `json:"ogDeterminer,omitempty"`
	OGLocale          string   `json:"ogLocale,omitempty"`
	OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"`
	OGSiteName        string   `json:"ogSiteName,omitempty"`
	OGVideo           string   `json:"ogVideo,omitempty"`
	DCTermsCreated    string   `json:"dctermsCreated,omitempty"`
	DCDateCreated     string   `json:"dcDateCreated,omitempty"`
	DCDate            string   `json:"dcDate,omitempty"`
	DCTermsType       string   `json:"dctermsType,omitempty"`
	DCType            string   `json:"dcType,omitempty"`
	DCTermsAudience   string   `json:"dctermsAudience,omitempty"`
	DCTermsSubject    string   `json:"dctermsSubject,omitempty"`
	DCSubject         string   `json:"dcSubject,omitempty"`
	DCDescription     string   `json:"dcDescription,omitempty"`
	DCTermsKeywords   string   `json:"dctermsKeywords,omitempty"`
	ModifiedTime      string   `json:"modifiedTime,omitempty"`
	PublishedTime     string   `json:"publishedTime,omitempty"`
	ArticleTag        string   `json:"articleTag,omitempty"`
	ArticleSection    string   `json:"articleSection,omitempty"`
	SourceURL         string   `json:"sourceURL,omitempty"`
	PageStatusCode    int      `json:"pageStatusCode,omitempty"`
	PageError         string   `json:"pageError,omitempty"`
}
|
||||
|
||||
// FirecrawlDocument represents a document in Firecrawl
|
||||
type FirecrawlDocument struct {
|
||||
ID string `json:"id,omitempty"`
|
||||
URL string `json:"url,omitempty"`
|
||||
Content string `json:"content"`
|
||||
Markdown string `json:"markdown,omitempty"`
|
||||
HTML string `json:"html,omitempty"`
|
||||
LLMExtraction map[string]any `json:"llm_extraction,omitempty"`
|
||||
CreatedAt *time.Time `json:"createdAt,omitempty"`
|
||||
UpdatedAt *time.Time `json:"updatedAt,omitempty"`
|
||||
Type string `json:"type,omitempty"`
|
||||
Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"`
|
||||
ChildrenLinks []string `json:"childrenLinks,omitempty"`
|
||||
Provider string `json:"provider,omitempty"`
|
||||
Warning string `json:"warning,omitempty"`
|
||||
Index int `json:"index,omitempty"`
|
||||
}
|
||||
|
||||
// ExtractorOptions configures extraction for a scrape request. ScrapeURL
// fills in Mode with "llm-extraction" when it is left empty.
type ExtractorOptions struct {
	Mode             string `json:"mode,omitempty"`
	ExtractionPrompt string `json:"extractionPrompt,omitempty"`
	ExtractionSchema any    `json:"extractionSchema,omitempty"`
}
|
||||
|
||||
// ScrapeResponse represents the response for scraping operations
|
||||
type ScrapeResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Data *FirecrawlDocument `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
// SearchResponse represents the response for searching operations
|
||||
type SearchResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Data []*FirecrawlDocument `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
// CrawlResponse represents the response for crawling operations
|
||||
type CrawlResponse struct {
|
||||
Success bool `json:"success"`
|
||||
JobID string `json:"jobId,omitempty"`
|
||||
Data []*FirecrawlDocument `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
// JobStatusResponse represents the response for checking crawl job status
|
||||
type JobStatusResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Status string `json:"status"`
|
||||
Current int `json:"current,omitempty"`
|
||||
CurrentURL string `json:"current_url,omitempty"`
|
||||
CurrentStep string `json:"current_step,omitempty"`
|
||||
Total int `json:"total,omitempty"`
|
||||
JobID string `json:"jobId,omitempty"`
|
||||
Data []*FirecrawlDocument `json:"data,omitempty"`
|
||||
PartialData []*FirecrawlDocument `json:"partial_data,omitempty"`
|
||||
}
|
||||
|
||||
// CancelCrawlJobResponse is the JSON payload decoded from a crawl-job
// cancellation request. CancelCrawlJob returns its Status field.
type CancelCrawlJobResponse struct {
	Success bool   `json:"success"`
	Status  string `json:"status"`
}
|
||||
|
||||
type (
	// requestOptions holds the per-request retry settings consumed by
	// makeRequest.
	requestOptions struct {
		retries int
		backoff int
	}

	// requestOption is a functional option that mutates a requestOptions
	// value in place.
	requestOption func(*requestOptions)
)
|
||||
|
||||
// newRequestOptions creates a new requestOptions instance with the provided options.
|
||||
//
|
||||
// Parameters:
|
||||
// - opts: Optional request options.
|
||||
//
|
||||
// Returns:
|
||||
// - *requestOptions: A new instance of requestOptions with the provided options.
|
||||
func newRequestOptions(opts ...requestOption) *requestOptions {
|
||||
options := &requestOptions{retries: 1}
|
||||
for _, opt := range opts {
|
||||
opt(options)
|
||||
}
|
||||
return options
|
||||
}
|
||||
|
||||
// withRetries sets the number of retries for a request.
|
||||
//
|
||||
// Parameters:
|
||||
// - retries: The number of retries to be performed.
|
||||
//
|
||||
// Returns:
|
||||
// - requestOption: A functional option that sets the number of retries for a request.
|
||||
func withRetries(retries int) requestOption {
|
||||
return func(opts *requestOptions) {
|
||||
opts.retries = retries
|
||||
}
|
||||
}
|
||||
|
||||
// withBackoff sets the backoff interval for a request.
|
||||
//
|
||||
// Parameters:
|
||||
// - backoff: The backoff interval (in milliseconds) to be used for retries.
|
||||
//
|
||||
// Returns:
|
||||
// - requestOption: A functional option that sets the backoff interval for a request.
|
||||
func withBackoff(backoff int) requestOption {
|
||||
return func(opts *requestOptions) {
|
||||
opts.backoff = backoff
|
||||
}
|
||||
}
|
||||
|
||||
// FirecrawlApp is a client for the Firecrawl API. It carries the API key sent
// as a bearer token, the base URL that endpoint paths are appended to, and
// the HTTP client used to issue requests.
type FirecrawlApp struct {
	APIKey string
	APIURL string
	Client *http.Client
}
|
||||
|
||||
// NewFirecrawlApp creates a new instance of FirecrawlApp with the provided API key and API URL.
|
||||
// If the API key or API URL is not provided, it attempts to retrieve them from environment variables.
|
||||
// If the API key is still not found, it returns an error.
|
||||
//
|
||||
// Parameters:
|
||||
// - apiKey: The API key for authenticating with the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_KEY environment variable.
|
||||
// - apiURL: The base URL for the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_URL environment variable, defaulting to "https://api.firecrawl.dev".
|
||||
//
|
||||
// Returns:
|
||||
// - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key and API URL.
|
||||
// - error: An error if the API key is not provided or retrieved.
|
||||
func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) {
|
||||
if apiKey == "" {
|
||||
apiKey = os.Getenv("FIRECRAWL_API_KEY")
|
||||
if apiKey == "" {
|
||||
return nil, fmt.Errorf("no API key provided")
|
||||
}
|
||||
}
|
||||
|
||||
if apiURL == "" {
|
||||
apiURL = os.Getenv("FIRECRAWL_API_URL")
|
||||
if apiURL == "" {
|
||||
apiURL = "https://api.firecrawl.dev"
|
||||
}
|
||||
}
|
||||
|
||||
client := &http.Client{
|
||||
Timeout: 60 * time.Second,
|
||||
}
|
||||
|
||||
return &FirecrawlApp{
|
||||
APIKey: apiKey,
|
||||
APIURL: apiURL,
|
||||
Client: client,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ScrapeURL scrapes the content of the specified URL using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - url: The URL to be scraped.
|
||||
// - params: Optional parameters for the scrape request, including extractor options for LLM extraction.
|
||||
//
|
||||
// Returns:
|
||||
// - *FirecrawlDocument: The scraped document data.
|
||||
// - error: An error if the scrape request fails.
|
||||
func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*FirecrawlDocument, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
scrapeBody := map[string]any{"url": url}
|
||||
|
||||
if params != nil {
|
||||
if extractorOptions, ok := params["extractorOptions"].(ExtractorOptions); ok {
|
||||
if schema, ok := extractorOptions.ExtractionSchema.(interface{ schema() any }); ok {
|
||||
extractorOptions.ExtractionSchema = schema.schema()
|
||||
}
|
||||
if extractorOptions.Mode == "" {
|
||||
extractorOptions.Mode = "llm-extraction"
|
||||
}
|
||||
scrapeBody["extractorOptions"] = extractorOptions
|
||||
}
|
||||
|
||||
for key, value := range params {
|
||||
if key != "extractorOptions" {
|
||||
scrapeBody[key] = value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodPost,
|
||||
fmt.Sprintf("%s/v0/scrape", app.APIURL),
|
||||
scrapeBody,
|
||||
headers,
|
||||
"scrape URL",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var scrapeResponse ScrapeResponse
|
||||
err = json.Unmarshal(resp, &scrapeResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if scrapeResponse.Success {
|
||||
return scrapeResponse.Data, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("failed to scrape URL")
|
||||
}
|
||||
|
||||
// Search performs a search query using the Firecrawl API and returns the search results.
|
||||
//
|
||||
// Parameters:
|
||||
// - query: The search query string.
|
||||
// - params: Optional parameters for the search request.
|
||||
//
|
||||
// Returns:
|
||||
// - []*FirecrawlDocument: A slice of FirecrawlDocument containing the search results.
|
||||
// - error: An error if the search request fails.
|
||||
func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
searchBody := map[string]any{"query": query}
|
||||
for k, v := range params {
|
||||
searchBody[k] = v
|
||||
}
|
||||
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodPost,
|
||||
fmt.Sprintf("%s/v0/search", app.APIURL),
|
||||
searchBody,
|
||||
headers,
|
||||
"search",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var searchResponse SearchResponse
|
||||
err = json.Unmarshal(resp, &searchResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if searchResponse.Success {
|
||||
return searchResponse.Data, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("failed to search")
|
||||
}
|
||||
|
||||
// CrawlURL starts a crawl job for the specified URL using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - url: The URL to crawl.
|
||||
// - params: Optional parameters for the crawl request.
|
||||
// - waitUntilDone: If true, the method will wait until the crawl job is completed before returning.
|
||||
// - pollInterval: The interval (in seconds) at which to poll the job status if waitUntilDone is true.
|
||||
// - idempotencyKey: An optional idempotency key to ensure the request is idempotent.
|
||||
//
|
||||
// Returns:
|
||||
// - any: The job ID if waitUntilDone is false, or the crawl result if waitUntilDone is true.
|
||||
// - error: An error if the crawl request fails.
|
||||
func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDone bool, pollInterval int, idempotencyKey string) (any, error) {
|
||||
headers := app.prepareHeaders(idempotencyKey)
|
||||
crawlBody := map[string]any{"url": url}
|
||||
for k, v := range params {
|
||||
crawlBody[k] = v
|
||||
}
|
||||
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodPost,
|
||||
fmt.Sprintf("%s/v0/crawl", app.APIURL),
|
||||
crawlBody,
|
||||
headers,
|
||||
"start crawl job",
|
||||
withRetries(3),
|
||||
withBackoff(500),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var crawlResponse CrawlResponse
|
||||
err = json.Unmarshal(resp, &crawlResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if waitUntilDone {
|
||||
return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval)
|
||||
}
|
||||
|
||||
if crawlResponse.JobID == "" {
|
||||
return nil, fmt.Errorf("failed to get job ID")
|
||||
}
|
||||
|
||||
return crawlResponse.JobID, nil
|
||||
}
|
||||
|
||||
// CheckCrawlStatus checks the status of a crawl job using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - jobID: The ID of the crawl job to check.
|
||||
//
|
||||
// Returns:
|
||||
// - *JobStatusResponse: The status of the crawl job.
|
||||
// - error: An error if the crawl status check request fails.
|
||||
func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodGet,
|
||||
fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
|
||||
nil,
|
||||
headers,
|
||||
"check crawl status",
|
||||
withRetries(3),
|
||||
withBackoff(500),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var jobStatusResponse JobStatusResponse
|
||||
err = json.Unmarshal(resp, &jobStatusResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &jobStatusResponse, nil
|
||||
}
|
||||
|
||||
// CancelCrawlJob cancels a crawl job using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - jobID: The ID of the crawl job to cancel.
|
||||
//
|
||||
// Returns:
|
||||
// - string: The status of the crawl job after cancellation.
|
||||
// - error: An error if the crawl job cancellation request fails.
|
||||
func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodDelete,
|
||||
fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, jobID),
|
||||
nil,
|
||||
headers,
|
||||
"cancel crawl job",
|
||||
)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var cancelCrawlJobResponse CancelCrawlJobResponse
|
||||
err = json.Unmarshal(resp, &cancelCrawlJobResponse)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return cancelCrawlJobResponse.Status, nil
|
||||
}
|
||||
|
||||
// prepareHeaders prepares the headers for an HTTP request.
|
||||
//
|
||||
// Parameters:
|
||||
// - idempotencyKey: A string representing the idempotency key to be included in the headers.
|
||||
// If the idempotency key is an empty string, it will not be included in the headers.
|
||||
//
|
||||
// Returns:
|
||||
// - map[string]string: A map containing the headers for the HTTP request.
|
||||
func (app *FirecrawlApp) prepareHeaders(idempotencyKey string) map[string]string {
|
||||
headers := map[string]string{
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": fmt.Sprintf("Bearer %s", app.APIKey),
|
||||
}
|
||||
if idempotencyKey != "" {
|
||||
headers["x-idempotency-key"] = idempotencyKey
|
||||
}
|
||||
return headers
|
||||
}
|
||||
|
||||
// makeRequest makes a request to the specified URL with the provided method, data, headers, and options.
|
||||
//
|
||||
// Parameters:
|
||||
// - method: The HTTP method to use for the request (e.g., "GET", "POST", "DELETE").
|
||||
// - url: The URL to send the request to.
|
||||
// - data: The data to be sent in the request body.
|
||||
// - headers: The headers to be included in the request.
|
||||
// - action: A string describing the action being performed.
|
||||
// - opts: Optional request options.
|
||||
//
|
||||
// Returns:
|
||||
// - []byte: The response body from the request.
|
||||
// - error: An error if the request fails.
|
||||
func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, headers map[string]string, action string, opts ...requestOption) ([]byte, error) {
|
||||
var body []byte
|
||||
var err error
|
||||
if data != nil {
|
||||
body, err = json.Marshal(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
req, err := http.NewRequest(method, url, bytes.NewBuffer(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for key, value := range headers {
|
||||
req.Header.Set(key, value)
|
||||
}
|
||||
|
||||
var resp *http.Response
|
||||
options := newRequestOptions(opts...)
|
||||
for i := 0; i < options.retries; i++ {
|
||||
resp, err = app.Client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != 502 {
|
||||
break
|
||||
}
|
||||
|
||||
time.Sleep(time.Duration(math.Pow(2, float64(i))) * time.Duration(options.backoff) * time.Millisecond)
|
||||
}
|
||||
|
||||
respBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
statusCode := resp.StatusCode
|
||||
if statusCode != 200 {
|
||||
return nil, app.handleError(statusCode, respBody, action)
|
||||
}
|
||||
|
||||
return respBody, nil
|
||||
}
|
||||
|
||||
// monitorJobStatus monitors the status of a crawl job using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - jobID: The ID of the crawl job to monitor.
|
||||
// - headers: The headers to be included in the request.
|
||||
// - pollInterval: The interval (in seconds) at which to poll the job status.
|
||||
//
|
||||
// Returns:
|
||||
// - []*FirecrawlDocument: The crawl result if the job is completed.
|
||||
// - error: An error if the crawl status check request fails.
|
||||
func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]string, pollInterval int) ([]*FirecrawlDocument, error) {
|
||||
attempts := 0
|
||||
for {
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodGet,
|
||||
fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
|
||||
nil,
|
||||
headers,
|
||||
"check crawl status",
|
||||
withRetries(3),
|
||||
withBackoff(500),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var statusData JobStatusResponse
|
||||
err = json.Unmarshal(resp, &statusData)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
status := statusData.Status
|
||||
if status == "" {
|
||||
return nil, fmt.Errorf("invalid status in response")
|
||||
}
|
||||
|
||||
if status == "completed" {
|
||||
if statusData.Data != nil {
|
||||
return statusData.Data, nil
|
||||
}
|
||||
attempts++
|
||||
if attempts > 3 {
|
||||
return nil, fmt.Errorf("crawl job completed but no data was returned")
|
||||
}
|
||||
} else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" {
|
||||
pollInterval = max(pollInterval, 2)
|
||||
time.Sleep(time.Duration(pollInterval) * time.Second)
|
||||
} else {
|
||||
return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleError handles errors returned by the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - resp: The HTTP response object.
|
||||
// - body: The response body from the HTTP response.
|
||||
// - action: A string describing the action being performed.
|
||||
//
|
||||
// Returns:
|
||||
// - error: An error describing the failure reason.
|
||||
func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string) error {
|
||||
var errorData map[string]any
|
||||
err := json.Unmarshal(body, &errorData)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse error response: %v", err)
|
||||
}
|
||||
|
||||
errorMessage, _ := errorData["error"].(string)
|
||||
if errorMessage == "" {
|
||||
errorMessage = "No additional error details provided."
|
||||
}
|
||||
|
||||
var message string
|
||||
switch statusCode {
|
||||
case 402:
|
||||
message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage)
|
||||
case 408:
|
||||
message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage)
|
||||
case 409:
|
||||
message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage)
|
||||
case 500:
|
||||
message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage)
|
||||
default:
|
||||
message = fmt.Sprintf("Unexpected error during %s: Status code %d. %s", action, statusCode, errorMessage)
|
||||
}
|
||||
|
||||
return fmt.Errorf(message)
|
||||
}
|
|
@ -1,292 +0,0 @@
|
|||
package firecrawl
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/joho/godotenv"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
var API_URL string
|
||||
var TEST_API_KEY string
|
||||
|
||||
func init() {
|
||||
err := godotenv.Load("../.env")
|
||||
if err != nil {
|
||||
log.Fatalf("Error loading .env file: %v", err)
|
||||
}
|
||||
API_URL = os.Getenv("API_URL")
|
||||
TEST_API_KEY = os.Getenv("TEST_API_KEY")
|
||||
}
|
||||
|
||||
func TestNoAPIKey(t *testing.T) {
|
||||
_, err := NewFirecrawlApp("", API_URL)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no API key provided")
|
||||
}
|
||||
|
||||
func TestScrapeURLInvalidAPIKey(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("invalid_api_key", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.ScrapeURL("https://firecrawl.dev", nil)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token")
|
||||
}
|
||||
|
||||
func TestBlocklistedURL(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.ScrapeURL("https://facebook.com/fake-test", nil)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://roastmywebsite.ai", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "_Roast_")
|
||||
}
|
||||
|
||||
func TestScrapeURLE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://roastmywebsite.ai", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "_Roast_")
|
||||
assert.NotEqual(t, response.Markdown, "")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
assert.Equal(t, response.HTML, "")
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
params := map[string]any{
|
||||
"pageOptions": map[string]any{
|
||||
"includeHtml": true,
|
||||
},
|
||||
}
|
||||
response, err := app.ScrapeURL("https://roastmywebsite.ai", params)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "_Roast_")
|
||||
assert.Contains(t, response.Markdown, "_Roast_")
|
||||
assert.Contains(t, response.HTML, "<h1")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseForValidScrapeWithPDFFile(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001.pdf", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001", nil)
|
||||
time.Sleep(6 * time.Second) // wait for 6 seconds
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
}
|
||||
|
||||
func TestCrawlURLInvalidAPIKey(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("invalid_api_key", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token")
|
||||
}
|
||||
|
||||
func TestShouldReturnErrorForBlocklistedURL(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.CrawlURL("https://twitter.com/fake-test", nil, false, 2, "")
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
|
||||
}
|
||||
|
||||
func TestCrawlURLWaitForCompletionE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
params := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
},
|
||||
}
|
||||
response, err := app.CrawlURL("https://roastmywebsite.ai", params, true, 2, "")
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
data, ok := response.([]*FirecrawlDocument)
|
||||
assert.True(t, ok)
|
||||
assert.Greater(t, len(data), 0)
|
||||
assert.Contains(t, data[0].Content, "_Roast_")
|
||||
}
|
||||
|
||||
func TestCrawlURLWithIdempotencyKeyE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
uniqueIdempotencyKey := uuid.New().String()
|
||||
params := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
},
|
||||
}
|
||||
response, err := app.CrawlURL("https://roastmywebsite.ai", params, true, 2, uniqueIdempotencyKey)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
data, ok := response.([]*FirecrawlDocument)
|
||||
assert.True(t, ok)
|
||||
assert.Greater(t, len(data), 0)
|
||||
assert.Contains(t, data[0].Content, "_Roast_")
|
||||
|
||||
_, err = app.CrawlURL("https://firecrawl.dev", params, true, 2, uniqueIdempotencyKey)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used")
|
||||
}
|
||||
|
||||
func TestCheckCrawlStatusE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
params := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
},
|
||||
}
|
||||
response, err := app.CrawlURL("https://firecrawl.dev", params, false, 2, "")
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
jobID, ok := response.(string)
|
||||
assert.True(t, ok)
|
||||
assert.NotEqual(t, "", jobID)
|
||||
|
||||
time.Sleep(30 * time.Second) // wait for 30 seconds
|
||||
|
||||
statusResponse, err := app.CheckCrawlStatus(jobID)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, statusResponse)
|
||||
|
||||
assert.Equal(t, "completed", statusResponse.Status)
|
||||
assert.Greater(t, len(statusResponse.Data), 0)
|
||||
}
|
||||
|
||||
func TestSearchE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.Search("test query", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Greater(t, len(response), 2)
|
||||
assert.NotEqual(t, response[0].Content, "")
|
||||
}
|
||||
|
||||
func TestSearchInvalidAPIKey(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("invalid_api_key", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.Search("test query", nil)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during search: Status code 401. Unauthorized: Invalid token")
|
||||
}
|
||||
|
||||
func TestLLMExtraction(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
params := map[string]any{
|
||||
"extractorOptions": ExtractorOptions{
|
||||
Mode: "llm-extraction",
|
||||
ExtractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||
ExtractionSchema: map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"company_mission": map[string]string{"type": "string"},
|
||||
"supports_sso": map[string]string{"type": "boolean"},
|
||||
"is_open_source": map[string]string{"type": "boolean"},
|
||||
},
|
||||
"required": []string{"company_mission", "supports_sso", "is_open_source"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
response, err := app.ScrapeURL("https://mendable.ai", params)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.LLMExtraction, "company_mission")
|
||||
assert.IsType(t, true, response.LLMExtraction["supports_sso"])
|
||||
assert.IsType(t, true, response.LLMExtraction["is_open_source"])
|
||||
}
|
||||
|
||||
func TestCancelCrawlJobInvalidAPIKey(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("invalid_api_key", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.CancelCrawlJob("test query")
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during cancel crawl job: Status code 401. Unauthorized: Invalid token")
|
||||
}
|
||||
|
||||
func TestCancelNonExistingCrawlJob(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
jobID := uuid.New().String()
|
||||
_, err = app.CancelCrawlJob(jobID)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Job not found")
|
||||
}
|
||||
|
||||
func TestCancelCrawlJobE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
jobID, ok := response.(string)
|
||||
assert.True(t, ok)
|
||||
assert.NotEqual(t, "", jobID)
|
||||
|
||||
status, err := app.CancelCrawlJob(jobID)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "cancelled", status)
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
module github.com/mendableai/firecrawl-go
|
||||
|
||||
go 1.22.5
|
||||
|
||||
require (
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/joho/godotenv v1.5.1
|
||||
github.com/stretchr/testify v1.9.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
|
@ -1,14 +0,0 @@
|
|||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
|
||||
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
Loading…
Reference in New Issue
Block a user