Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 03:32:22 +08:00)

feat(sdk/rust): first batch of changes for 1.0.0

This commit is contained in:
parent 6aa468163e
commit 93a20442e3
.vscode/settings.json (vendored, new file, 5 lines)
@@ -0,0 +1,5 @@
{
    "rust-analyzer.linkedProjects": [
        "apps/rust-sdk/Cargo.toml"
    ]
}
apps/rust-sdk/Cargo.lock (generated, 229 lines changed)
@@ -26,6 +26,21 @@ dependencies = [
 "memchr",
]

[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"

[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
 "libc",
]

[[package]]
name = "arrayref"
version = "0.3.7"

@@ -151,6 +166,19 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

[[package]]
name = "chrono"
version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
dependencies = [
 "android-tzdata",
 "iana-time-zone",
 "num-traits",
 "serde",
 "windows-targets 0.52.6",
]

[[package]]
name = "clippy"
version = "0.0.302"

@@ -197,6 +225,51 @@ version = "0.8.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"

[[package]]
name = "darling"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
dependencies = [
 "darling_core",
 "darling_macro",
]

[[package]]
name = "darling_core"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
dependencies = [
 "fnv",
 "ident_case",
 "proc-macro2",
 "quote",
 "strsim",
 "syn",
]

[[package]]
name = "darling_macro"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
dependencies = [
 "darling_core",
 "quote",
 "syn",
]

[[package]]
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
 "powerfmt",
 "serde",
]

[[package]]
name = "diff"
version = "0.1.13"

@@ -215,10 +288,10 @@ dependencies = [
]

[[package]]
name = "dotenv"
version = "0.15.0"
name = "dotenvy"
version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f"
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"

[[package]]
name = "encoding_rs"

@@ -276,16 +349,17 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"

[[package]]
name = "firecrawl"
version = "0.1.0"
version = "1.0.0"
dependencies = [
 "assert_matches",
 "clippy",
 "dotenv",
 "dotenvy",
 "log 0.4.22",
 "reqwest",
 "rustfmt",
 "serde",
 "serde_json",
 "serde_with",
 "thiserror",
 "tokio",
 "uuid",

@@ -426,13 +500,19 @@ dependencies = [
 "futures-core",
 "futures-sink",
 "http",
 "indexmap",
 "indexmap 2.2.6",
 "slab",
 "tokio",
 "tokio-util",
 "tracing",
]

[[package]]
name = "hashbrown"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"

[[package]]
name = "hashbrown"
version = "0.14.5"

@@ -445,6 +525,12 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"

[[package]]
name = "hex"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"

[[package]]
name = "http"
version = "1.1.0"

@@ -558,6 +644,35 @@ dependencies = [
 "tracing",
]

[[package]]
name = "iana-time-zone"
version = "0.1.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220"
dependencies = [
 "android_system_properties",
 "core-foundation-sys",
 "iana-time-zone-haiku",
 "js-sys",
 "wasm-bindgen",
 "windows-core",
]

[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
 "cc",
]

[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"

[[package]]
name = "idna"
version = "0.5.0"

@@ -568,6 +683,17 @@ dependencies = [
 "unicode-normalization",
]

[[package]]
name = "indexmap"
version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
 "autocfg 1.3.0",
 "hashbrown 0.12.3",
 "serde",
]

[[package]]
name = "indexmap"
version = "2.2.6"

@@ -575,7 +701,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
dependencies = [
 "equivalent",
 "hashbrown",
 "hashbrown 0.14.5",
 "serde",
]

[[package]]

@@ -701,6 +828,12 @@ dependencies = [
 "tempfile",
]

[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"

[[package]]
name = "num-traits"
version = "0.2.19"

@@ -846,6 +979,12 @@ version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"

[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"

[[package]]
name = "proc-macro2"
version = "1.0.86"

@@ -1293,6 +1432,36 @@ dependencies = [
 "serde",
]

[[package]]
name = "serde_with"
version = "3.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cecfa94848272156ea67b2b1a53f20fc7bc638c4a46d2f8abde08f05f4b857"
dependencies = [
 "base64 0.22.1",
 "chrono",
 "hex",
 "indexmap 1.9.3",
 "indexmap 2.2.6",
 "serde",
 "serde_derive",
 "serde_json",
 "serde_with_macros",
 "time",
]

[[package]]
name = "serde_with_macros"
version = "3.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8fee4991ef4f274617a51ad4af30519438dacb2f56ac773b08a1922ff743350"
dependencies = [
 "darling",
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "signal-hook-registry"
version = "1.4.2"

@@ -1342,6 +1511,12 @@ dependencies = [
 "log 0.3.9",
]

[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"

[[package]]
name = "subtle"
version = "2.6.1"

@@ -1489,6 +1664,37 @@ dependencies = [
 "lazy_static",
]

[[package]]
name = "time"
version = "0.3.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
dependencies = [
 "deranged",
 "itoa",
 "num-conv",
 "powerfmt",
 "serde",
 "time-core",
 "time-macros",
]

[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"

[[package]]
name = "time-macros"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
dependencies = [
 "num-conv",
 "time-core",
]

[[package]]
name = "tinyvec"
version = "1.7.0"

@@ -1843,6 +2049,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
 "windows-targets 0.52.6",
]

[[package]]
name = "windows-sys"
version = "0.48.0"
apps/rust-sdk/Cargo.toml

@@ -1,13 +1,13 @@
[package]
name = "firecrawl"
author="Mendable.ai"
version = "0.1.0"
author= "Mendable.ai"
version = "1.0.0"
edition = "2021"
license = "GPL-2.0-or-later"
license = "GPL-3.0-or-later"
homepage = "https://www.firecrawl.dev/"
repository ="https://github.com/mendableai/firecrawl"
description = "Rust SDK for Firecrawl API."
authors = ["sanix-darker <sanixdk@gmail.com>"]
authors = ["Gergő Móricz <mogery@firecrawl.dev>", "sanix-darker <sanixdk@gmail.com>"]

[lib]
path = "src/lib.rs"

@@ -18,6 +18,7 @@ name = "firecrawl"
reqwest = { version = "^0.12", features = ["json", "blocking"] }
serde = { version = "^1.0", features = ["derive"] }
serde_json = "^1.0"
serde_with = "^3.9"
log = "^0.4"
thiserror = "^1.0"
uuid = { version = "^1.10", features = ["v4"] }

@@ -27,7 +28,7 @@ tokio = { version = "^1", features = ["full"] }
clippy = "^0.0.302"
rustfmt = "^0.10"
assert_matches = "^1.5"
dotenv = "^0.15"
dotenvy = "^0.15"
tokio = { version = "1", features = ["full"] }

[build-dependencies]
apps/rust-sdk/src/crawl.rs (new file, 297 lines)
@@ -0,0 +1,297 @@
use std::collections::HashMap;

use serde::{Deserialize, Serialize};

use crate::{document::Document, scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp, FirecrawlError, API_VERSION};

#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
pub enum CrawlScrapeFormats {
    /// Will result in a copy of the Markdown content of the page.
    #[serde(rename = "markdown")]
    Markdown,

    /// Will result in a copy of the filtered, content-only HTML of the page.
    #[serde(rename = "html")]
    HTML,

    /// Will result in a copy of the raw HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Will result in a Vec of URLs found on the page.
    #[serde(rename = "links")]
    Links,

    /// Will result in a URL to a screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}

impl From<CrawlScrapeFormats> for ScrapeFormats {
    fn from(value: CrawlScrapeFormats) -> Self {
        match value {
            CrawlScrapeFormats::Markdown => Self::Markdown,
            CrawlScrapeFormats::HTML => Self::HTML,
            CrawlScrapeFormats::RawHTML => Self::RawHTML,
            CrawlScrapeFormats::Links => Self::Links,
            CrawlScrapeFormats::Screenshot => Self::Screenshot,
            CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
        }
    }
}

#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct CrawlScrapeOptions {
    /// Formats to extract from the page. (default: `[ Markdown ]`)
    pub formats: Option<Vec<CrawlScrapeFormats>>,

    /// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`)
    pub only_main_content: Option<bool>,

    /// HTML tags to exclusively include.
    ///
    /// For example, if you pass `div`, you will only get content from `<div>`s and their children.
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude.
    ///
    /// For example, if you pass `img`, you will never get image URLs in your results.
    pub exclude_tags: Option<Vec<String>>,

    /// Additional HTTP headers to use when loading the page.
    pub headers: Option<HashMap<String, String>>,

    // Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`)
    pub wait_for: Option<u32>,

    // Timeout before returning an error, in milliseconds. (default: `60000`)
    pub timeout: Option<u32>,
}

impl From<CrawlScrapeOptions> for ScrapeOptions {
    fn from(value: CrawlScrapeOptions) -> Self {
        ScrapeOptions {
            formats: value.formats.map(|formats| formats.into_iter().map(|x| x.into()).collect()),
            only_main_content: value.only_main_content,
            include_tags: value.include_tags,
            exclude_tags: value.exclude_tags,
            headers: value.headers,
            wait_for: value.wait_for,
            timeout: value.timeout,
            ..Default::default()
        }
    }
}

#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Options to pass through to the scraper.
    pub scrape_options: Option<CrawlScrapeOptions>,

    /// URL RegEx patterns to (exclusively) include.
    ///
    /// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled.
    pub include_paths: Option<String>,

    /// URL RegEx patterns to exclude.
    ///
    /// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled.
    pub exclude_paths: Option<String>,

    /// Maximum URL depth to crawl, relative to the base URL. (default: `2`)
    pub max_depth: Option<u32>,

    /// Tells the crawler to ignore the sitemap when crawling. (default: `true`)
    pub ignore_sitemap: Option<bool>,

    /// Maximum number of pages to crawl. (default: `10`)
    pub limit: Option<u32>,

    /// Allows the crawler to navigate links that are backwards in the URL hierarchy. (default: `false`)
    pub allow_backward_links: Option<bool>,

    /// Allows the crawler to follow links to external URLs. (default: `false`)
    pub allow_external_links: Option<bool>,

    /// URL to send Webhook crawl events to.
    pub webhook: Option<String>,

    /// Idempotency key to send to the crawl endpoint.
    #[serde(skip)]
    pub idempotency_key: Option<String>,

    /// When using `FirecrawlApp::crawl_url`, this is how often the status of the job should be checked, in milliseconds. (default: `2000`)
    #[serde(skip)]
    pub poll_interval: Option<u64>,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct CrawlRequestBody {
    url: String,

    #[serde(flatten)]
    options: CrawlOptions,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct CrawlResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    /// No need to expose.
    success: bool,

    /// The resulting document.
    data: Document,
}

#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The crawl job is in progress.
    Scraping,

    /// The crawl job has been completed successfully.
    Completed,

    /// The crawl job has failed.
    Failed,

    /// The crawl job has been cancelled.
    Cancelled,
}

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct CrawlStatus {
    /// The status of the crawl.
    pub status: CrawlStatusTypes,

    /// Number of pages that will be scraped in total. This number may grow as the crawler discovers new pages.
    pub total: u32,

    /// Number of pages that have been successfully scraped.
    pub completed: u32,

    /// Amount of credits used by the crawl job.
    pub credits_used: u32,

    /// Expiry time of crawl data. After this date, the crawl data will be unavailable from the API.
    pub expires_at: String, // TODO: parse into date

    /// URL to call to get the next batch of documents.
    /// Unless you are sidestepping the SDK, you do not need to deal with this.
    pub next: Option<String>,

    /// List of documents returned by the crawl
    pub data: Vec<Document>,
}

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    success: bool,

    /// Crawl ID
    pub id: String,

    /// URL to get the status of the crawl job
    pub url: String,
}

impl FirecrawlApp {
    pub async fn crawl_url_async(
        &self,
        url: impl AsRef<str>,
        options: Option<CrawlOptions>,
    ) -> Result<CrawlAsyncResponse, FirecrawlError> {
        let body = CrawlRequestBody {
            url: url.as_ref().to_string(),
            options: options.unwrap_or_default(),
        };

        let headers = self.prepare_headers(body.options.idempotency_key.as_ref());

        let response = self
            .client
            .post(&format!("{}{}/crawl", self.api_url, API_VERSION))
            .headers(headers.clone())
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
    }

    pub async fn crawl_url(
        &self,
        url: impl AsRef<str>,
        options: Option<CrawlOptions>,
    ) -> Result<Vec<Document>, FirecrawlError> {
        let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);

        let res = self.crawl_url_async(url, options).await?;

        self.monitor_job_status(&res.id, poll_interval).await
    }

    pub async fn check_crawl_status(&self, id: &str) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(&format!(
                "{}{}/crawl/{}",
                self.api_url, API_VERSION, id
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        self.handle_response(response, "check crawl status").await
    }

    async fn monitor_job_status(
        &self,
        id: &str,
        poll_interval: u64,
    ) -> Result<Vec<Document>, FirecrawlError> {
        loop {
            let status_data = self.check_crawl_status(id).await?;
            match status_data.status {
                CrawlStatusTypes::Completed => {
                    return Ok(status_data.data);
                }
                CrawlStatusTypes::Scraping => {
                    // poll_interval is documented as milliseconds (default 2000).
                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                }
                CrawlStatusTypes::Failed => {
                    return Err(FirecrawlError::CrawlJobFailed(format!(
                        "Crawl job failed."
                    )));
                }
                CrawlStatusTypes::Cancelled => {
                    return Err(FirecrawlError::CrawlJobFailed(format!(
                        "Crawl job was cancelled."
                    )));
                }
            }
        }
    }
}
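The crawl surface above is easiest to read next to a usage sketch. The following is illustrative only and not part of the commit; it assumes a tokio runtime, a FIRECRAWL_API_KEY in the environment, and a placeholder target URL.

// Illustrative usage sketch; not part of this commit.
use firecrawl::FirecrawlApp;
use firecrawl::crawl::{CrawlOptions, CrawlScrapeFormats, CrawlScrapeOptions};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Picks up FIRECRAWL_API_KEY from the environment when no key is passed.
    let app = FirecrawlApp::new(None, None)?;

    let options = CrawlOptions {
        limit: Some(5),
        scrape_options: Some(CrawlScrapeOptions {
            formats: Some(vec![CrawlScrapeFormats::Markdown]),
            ..Default::default()
        }),
        ..Default::default()
    };

    // crawl_url starts the job and then polls check_crawl_status until it finishes.
    let documents = app.crawl_url("https://example.com", Some(options)).await?;
    for doc in documents {
        println!(
            "{}: {} chars of markdown",
            doc.metadata.source_url,
            doc.markdown.as_deref().unwrap_or("").len()
        );
    }
    Ok(())
}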
apps/rust-sdk/src/document.rs (new file, 86 lines)
@@ -0,0 +1,86 @@
use serde::{Deserialize, Serialize};
use serde_json::Value;

#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct DocumentMetadata {
    // firecrawl specific
    #[serde(rename = "sourceURL")]
    pub source_url: String,
    pub status_code: u16,
    pub error: Option<String>,

    // basic meta tags
    pub title: String,
    pub description: String,
    pub language: Option<String>,
    pub keywords: Option<String>,
    pub robots: Option<String>,

    // og: namespace
    pub og_title: Option<String>,
    pub og_description: Option<String>,
    pub og_url: Option<String>,
    pub og_image: Option<String>,
    pub og_audio: Option<String>,
    pub og_determiner: Option<String>,
    pub og_locale: Option<String>,
    pub og_locale_alternate: Option<String>,
    pub og_site_name: Option<String>,
    pub og_video: Option<String>,

    // article: namespace
    pub article_section: Option<String>,
    pub article_tag: Option<String>,
    pub published_time: Option<String>,
    pub modified_time: Option<String>,

    // dc./dcterms. namespace
    pub dcterms_keywords: Option<String>,
    pub dc_description: Option<String>,
    pub dc_subject: Option<String>,
    pub dcterms_subject: Option<String>,
    pub dcterms_audience: Option<String>,
    pub dc_type: Option<String>,
    pub dcterms_type: Option<String>,
    pub dc_date: Option<String>,
    pub dc_date_created: Option<String>,
    pub dcterms_created: Option<String>,
}

#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct Document {
    /// The Markdown content of the page, present if `ScrapeFormats::Markdown` is present in `ScrapeOptions.formats`. (default)
    pub markdown: Option<String>,

    /// The HTML of the page, present if `ScrapeFormats::HTML` is present in `ScrapeOptions.formats`.
    ///
    /// This contains HTML that has non-content tags removed. If you need the original HTML, use `ScrapeFormats::RawHTML`.
    pub html: Option<String>,

    /// The raw HTML of the page, present if `ScrapeFormats::RawHTML` is present in `ScrapeOptions.formats`.
    ///
    /// This contains the original, untouched HTML on the page. If you only need human-readable content, use `ScrapeFormats::HTML`.
    pub raw_html: Option<String>,

    /// The URL to the screenshot of the page, present if `ScrapeFormats::Screenshot` or `ScrapeFormats::ScreenshotFullPage` is present in `ScrapeOptions.formats`.
    pub screenshot: Option<String>,

    /// A list of the links on the page, present if `ScrapeFormats::Links` is present in `ScrapeOptions.formats`.
    pub links: Option<Vec<String>>,

    /// The extracted data from the page, present if `ScrapeFormats::Extract` is present in `ScrapeOptions.formats`.
    /// If `ScrapeOptions.extract.schema` is `Some`, this `Value` is guaranteed to match the provided schema.
    pub extract: Option<Value>,

    /// The metadata from the page.
    pub metadata: DocumentMetadata,

    /// Can be present if `ScrapeFormats::Extract` is present in `ScrapeOptions.formats`.
    /// The warning message will contain any errors encountered during the extraction.
    pub warning: Option<String>,
}
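Because every requested format maps to an Option field on Document, callers check what was actually populated. A brief, hypothetical consumption sketch (not part of the commit):

// Illustrative sketch; not part of this commit.
use firecrawl::document::Document;

fn summarize(doc: &Document) {
    // Only the formats requested in ScrapeOptions.formats come back as Some(..).
    if let Some(markdown) = &doc.markdown {
        println!("markdown: {} chars", markdown.len());
    }
    if let Some(links) = &doc.links {
        println!("found {} links", links.len());
    }
    // Metadata is always present; source_url and status_code are non-optional.
    println!("{} returned HTTP {}", doc.metadata.source_url, doc.metadata.status_code);
}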
apps/rust-sdk/src/error.rs (new file, 29 lines)
@@ -0,0 +1,29 @@
use serde::{Deserialize, Serialize};
use serde_json::Value;
use thiserror::Error;

#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct FirecrawlAPIError {
    /// Always false.
    success: bool,

    /// Error message
    pub error: String,

    /// Additional details of this error. Schema depends on the error itself.
    pub details: Option<Value>,
}

#[derive(Error, Debug)]
pub enum FirecrawlError {
    #[error("HTTP request failed: {0}")]
    HttpRequestFailed(String),
    #[error("API key not provided")]
    APIKeyNotProvided,
    #[error("Failed to parse response: {0}")]
    ResponseParseError(String),
    #[error("API error")]
    APIError(FirecrawlAPIError),
    #[error("Crawl job failed or stopped: {0}")]
    CrawlJobFailed(String),
}
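All fallible SDK calls return this single FirecrawlError type, so callers can match on its variants. A minimal sketch of that pattern (illustrative, not part of the commit):

// Illustrative sketch; not part of this commit.
use firecrawl::FirecrawlError;

fn report(err: &FirecrawlError) {
    match err {
        FirecrawlError::APIKeyNotProvided => eprintln!("set FIRECRAWL_API_KEY or pass a key"),
        FirecrawlError::HttpRequestFailed(msg) => eprintln!("network/HTTP failure: {msg}"),
        FirecrawlError::ResponseParseError(msg) => eprintln!("unexpected response shape: {msg}"),
        FirecrawlError::APIError(api) => eprintln!("API rejected the request: {}", api.error),
        FirecrawlError::CrawlJobFailed(msg) => eprintln!("crawl did not complete: {msg}"),
    }
}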
apps/rust-sdk/src/lib.rs

@@ -1,40 +1,14 @@
/*
 *
 * - Structs and Enums:
 *    FirecrawlError: Custom error enum for handling various errors.
 *    FirecrawlApp: Main struct for the application, holding API key, URL, and HTTP client.
 *
 * - Initialization:
 *
 *    FirecrawlApp::new initializes the struct, fetching the API key and URL from environment variables if not provided.
 *
 * - API Methods:
 *    scrape_url, search, crawl_url, check_crawl_status:
 *      Methods for interacting with the Firecrawl API, similar to the Python methods.
 *    monitor_job_status: Polls the API to monitor the status of a crawl job until completion.
 */

use std::env;
use std::thread;
use std::time::Duration;

use log::debug;
use reqwest::{Client, Response};
use serde::de::DeserializeOwned;
use serde_json::json;
use serde_json::Value;
use thiserror::Error;

#[derive(Error, Debug)]
pub enum FirecrawlError {
    #[error("HTTP request failed: {0}")]
    HttpRequestFailed(String),
    #[error("API key not provided")]
    ApiKeyNotProvided,
    #[error("Failed to parse response: {0}")]
    ResponseParseError(String),
    #[error("Crawl job failed or stopped: {0}")]
    CrawlJobFailed(String),
}
pub mod crawl;
pub mod document;
mod error;
pub mod scrape;

pub use error::FirecrawlError;

#[derive(Clone, Debug)]
pub struct FirecrawlApp {

@@ -42,26 +16,15 @@ pub struct FirecrawlApp {
    api_url: String,
    client: Client,
}
// the api version of firecrawl
const API_VERSION: &str = "/v0";

pub(crate) const API_VERSION: &str = "/v1";

impl FirecrawlApp {
    /// Initialize the FirecrawlApp instance.
    ///
    /// # Arguments:
    /// * `api_key` (Optional[str]): API key for authenticating with the Firecrawl API.
    /// * `api_url` (Optional[str]): Base URL for the Firecrawl API.
    pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
        let api_key = api_key
            .or_else(|| env::var("FIRECRAWL_API_KEY").ok())
            .ok_or(FirecrawlError::ApiKeyNotProvided)?;
        let api_url = api_url.unwrap_or_else(|| {
            env::var("FIRECRAWL_API_URL")
                .unwrap_or_else(|_| "https://api.firecrawl.dev".to_string())
        });

        debug!("Initialized FirecrawlApp with API key: {}", api_key);
        debug!("Initialized FirecrawlApp with API URL: {}", api_url);
            .ok_or(FirecrawlError::APIKeyNotProvided)?;
        let api_url = api_url
            .unwrap_or_else(|| "https://api.firecrawl.dev".to_string());

        Ok(FirecrawlApp {
            api_key,

@@ -70,237 +33,7 @@ impl FirecrawlApp {
        })
    }

    /// Scrape the specified URL using the Firecrawl API.
    ///
    /// # Arguments:
    /// * `url` (str): The URL to scrape.
    /// * `params` (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
    ///
    /// # Returns:
    /// * `Any`: The scraped data if the request is successful.
    ///
    /// # Raises:
    /// * `Exception`: If the scrape request fails.
    pub async fn scrape_url(
        &self,
        url: &str,
        params: Option<Value>,
    ) -> Result<Value, FirecrawlError> {
        let headers = self.prepare_headers(None);
        let mut scrape_params = json!({"url": url});

        if let Some(mut params) = params {
            if let Some(extractor_options) = params.get_mut("extractorOptions") {
                if let Some(extraction_schema) = extractor_options.get_mut("extractionSchema") {
                    if extraction_schema.is_object() && extraction_schema.get("schema").is_some() {
                        extractor_options["extractionSchema"] = extraction_schema["schema"].clone();
                    }
                    extractor_options["mode"] = extractor_options
                        .get("mode")
                        .cloned()
                        .unwrap_or_else(|| json!("llm-extraction"));
                }
                scrape_params["extractorOptions"] = extractor_options.clone();
            }
            for (key, value) in params.as_object().unwrap() {
                if key != "extractorOptions" {
                    scrape_params[key] = value.clone();
                }
            }
        }

        let response = self
            .client
            .post(&format!("{}{}/scrape", self.api_url, API_VERSION))
            .headers(headers)
            .json(&scrape_params)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        self.handle_response(response, "scrape URL").await
    }

    /// Perform a search using the Firecrawl API.
    ///
    /// # Arguments:
    /// * `query` (str): The search query.
    /// * `params` (Optional[Dict[str, Any]]): Additional parameters for the search request.
    ///
    /// # Returns:
    /// * `Any`: The search results if the request is successful.
    ///
    /// # Raises:
    /// * `Exception`: If the search request fails.
    pub async fn search(
        &self,
        query: &str,
        params: Option<Value>,
    ) -> Result<Value, FirecrawlError> {
        let headers = self.prepare_headers(None);
        let mut json_data = json!({"query": query});
        if let Some(params) = params {
            for (key, value) in params.as_object().unwrap() {
                json_data[key] = value.clone();
            }
        }

        let response = self
            .client
            .post(&format!("{}{}/search", self.api_url, API_VERSION))
            .headers(headers)
            .json(&json_data)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        self.handle_response(response, "search").await
    }

    /// Initiate a crawl job for the specified URL using the Firecrawl API.
    ///
    /// # Arguments:
    /// * `url` (str): The URL to crawl.
    /// * `params` (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
    /// * `wait_until_done` (bool): Whether to wait until the crawl job is completed.
    /// * `poll_interval` (int): Time in seconds between status checks when waiting for job completion.
    /// * `idempotency_key` (Optional[str]): A unique uuid key to ensure idempotency of requests.
    ///
    /// # Returns:
    /// * `Any`: The crawl job ID or the crawl results if waiting until completion.
    ///
    /// # `Raises`:
    /// * `Exception`: If the crawl job initiation or monitoring fails.
    pub async fn crawl_url(
        &self,
        url: &str,
        params: Option<Value>,
        wait_until_done: bool,
        poll_interval: u64,
        idempotency_key: Option<String>,
    ) -> Result<Value, FirecrawlError> {
        let headers = self.prepare_headers(idempotency_key);
        let mut json_data = json!({"url": url});
        if let Some(params) = params {
            for (key, value) in params.as_object().unwrap() {
                json_data[key] = value.clone();
            }
        }

        let response = self
            .client
            .post(&format!("{}{}/crawl", self.api_url, API_VERSION))
            .headers(headers.clone())
            .json(&json_data)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        let response_json = self.handle_response(response, "start crawl job").await?;
        let job_id = response_json["jobId"].as_str().unwrap().to_string();

        if wait_until_done {
            self.monitor_job_status(&job_id, headers, poll_interval)
                .await
        } else {
            Ok(json!({"jobId": job_id}))
        }
    }

    /// Check the status of a crawl job using the Firecrawl API.
    ///
    /// # Arguments:
    /// * `job_id` (str): The ID of the crawl job.
    ///
    /// # Returns:
    /// * `Any`: The status of the crawl job.
    ///
    /// # Raises:
    /// * `Exception`: If the status check request fails.
    pub async fn check_crawl_status(&self, job_id: &str) -> Result<Value, FirecrawlError> {
        let headers = self.prepare_headers(None);
        let response = self
            .client
            .get(&format!(
                "{}{}/crawl/status/{}",
                self.api_url, API_VERSION, job_id
            ))
            .headers(headers)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        self.handle_response(response, "check crawl status").await
    }

    /// Monitor the status of a crawl job until completion.
    ///
    /// # Arguments:
    /// * `job_id` (str): The ID of the crawl job.
    /// * `headers` (Dict[str, str]): The headers to include in the status check requests.
    /// * `poll_interval` (int): Seconds between status checks.
    ///
    /// # Returns:
    /// * `Any`: The crawl results if the job is completed successfully.
    ///
    /// # Raises:
    /// Exception: If the job fails or an error occurs during status checks.
    async fn monitor_job_status(
        &self,
        job_id: &str,
        headers: reqwest::header::HeaderMap,
        poll_interval: u64,
    ) -> Result<Value, FirecrawlError> {
        loop {
            let response = self
                .client
                .get(&format!(
                    "{}{}/crawl/status/{}",
                    self.api_url, API_VERSION, job_id
                ))
                .headers(headers.clone())
                .send()
                .await
                .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

            let status_data = self.handle_response(response, "check crawl status").await?;
            match status_data["status"].as_str() {
                Some("completed") => {
                    if status_data["data"].is_object() {
                        return Ok(status_data["data"].clone());
                    } else {
                        return Err(FirecrawlError::CrawlJobFailed(
                            "Crawl job completed but no data was returned".to_string(),
                        ));
                    }
                }
                Some("active") | Some("paused") | Some("pending") | Some("queued")
                | Some("waiting") => {
                    thread::sleep(Duration::from_secs(poll_interval));
                }
                Some(status) => {
                    return Err(FirecrawlError::CrawlJobFailed(format!(
                        "Crawl job failed or was stopped. Status: {}",
                        status
                    )));
                }
                None => {
                    return Err(FirecrawlError::CrawlJobFailed(
                        "Unexpected response: no status field".to_string(),
                    ));
                }
            }
        }
    }

    /// Prepare the headers for API requests.
    ///
    /// # Arguments:
    /// `idempotency_key` (Optional[str]): A unique key to ensure idempotency of requests.
    ///
    /// # Returns:
    /// Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
    fn prepare_headers(&self, idempotency_key: Option<String>) -> reqwest::header::HeaderMap {
    fn prepare_headers(&self, idempotency_key: Option<&String>) -> reqwest::header::HeaderMap {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert("Content-Type", "application/json".parse().unwrap());
        headers.insert(

@@ -313,30 +46,22 @@ impl FirecrawlApp {
        headers
    }

    /// Handle errors from API responses.
    ///
    /// # Arguments:
    /// * `response` (requests.Response): The response object from the API request.
    /// * `action` (str): Description of the action that was being performed.
    ///
    /// # Raises:
    /// Exception: An exception with a message containing the status code and error details from the response.
    async fn handle_response(
    async fn handle_response<'a, T: DeserializeOwned>(
        &self,
        response: Response,
        action: &str,
    ) -> Result<Value, FirecrawlError> {
        action: impl AsRef<str>,
    ) -> Result<T, FirecrawlError> {
        if response.status().is_success() {
            let response_json: Value = response
                .json()
                .await
                .map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
            if response_json["success"].as_bool().unwrap_or(false) {
                Ok(response_json["data"].clone())
                Ok(serde_json::from_value(response_json).map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?)
            } else {
                Err(FirecrawlError::HttpRequestFailed(format!(
                    "Failed to {}: {}",
                    action, response_json["error"]
                    action.as_ref(), response_json["error"]
                )))
            }
        } else {

@@ -348,23 +73,23 @@
            let message = match status_code {
                402 => format!(
                    "Payment Required: Failed to {}. {}",
                    action, error_message["error"]
                    action.as_ref(), error_message["error"]
                ),
                408 => format!(
                    "Request Timeout: Failed to {} as the request timed out. {}",
                    action, error_message["error"]
                    action.as_ref(), error_message["error"]
                ),
                409 => format!(
                    "Conflict: Failed to {} due to a conflict. {}",
                    action, error_message["error"]
                    action.as_ref(), error_message["error"]
                ),
                500 => format!(
                    "Internal Server Error: Failed to {}. {}",
                    action, error_message["error"]
                    action.as_ref(), error_message["error"]
                ),
                _ => format!(
                    "Unexpected error during {}: Status code {}. {}",
                    action, status_code, error_message["error"]
                    action.as_ref(), status_code, error_message["error"]
                ),
            };
            Err(FirecrawlError::HttpRequestFailed(message))
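The slimmed-down constructor still accepts either an explicit key or one taken from FIRECRAWL_API_KEY, while the API URL now defaults to https://api.firecrawl.dev unless passed explicitly. A short sketch of both paths (illustrative only, not part of the commit; the key and the self-hosted URL are placeholders):

// Illustrative sketch; not part of this commit.
use firecrawl::FirecrawlApp;

fn build_clients() -> Result<(), firecrawl::FirecrawlError> {
    // Explicit key, default https://api.firecrawl.dev endpoint.
    let _explicit = FirecrawlApp::new(Some("fc-YOUR-API-KEY".to_string()), None)?;

    // Key taken from FIRECRAWL_API_KEY; a hypothetical self-hosted endpoint passed explicitly.
    let _from_env = FirecrawlApp::new(None, Some("http://localhost:3002".to_string()))?;

    Ok(())
}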
apps/rust-sdk/src/scrape.rs (new file, 139 lines)
@@ -0,0 +1,139 @@
use std::collections::HashMap;

use serde::{Deserialize, Serialize};
use serde_json::Value;

use crate::{document::Document, FirecrawlApp, FirecrawlError, API_VERSION};

#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
pub enum ScrapeFormats {
    /// Will result in a copy of the Markdown content of the page.
    #[serde(rename = "markdown")]
    Markdown,

    /// Will result in a copy of the filtered, content-only HTML of the page.
    #[serde(rename = "html")]
    HTML,

    /// Will result in a copy of the raw HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Will result in a Vec of URLs found on the page.
    #[serde(rename = "links")]
    Links,

    /// Will result in a URL to a screenshot of the page.
    ///
    /// Can not be used in conjunction with `ScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
    ///
    /// Can not be used in conjunction with `ScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,

    /// Will result in the results of an LLM extraction.
    ///
    /// See `ScrapeOptions.extract` for more options.
    #[serde(rename = "extract")]
    Extract,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct ExtractOptions {
    /// Schema the output should adhere to, provided in JSON Schema format.
    pub schema: Option<Value>,

    pub system_prompt: Option<Value>,

    /// Extraction prompt to send to the LLM agent along with the page content.
    pub prompt: Option<Value>,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct ScrapeOptions {
    /// Formats to extract from the page. (default: `[ Markdown ]`)
    pub formats: Option<Vec<ScrapeFormats>>,

    /// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`)
    pub only_main_content: Option<bool>,

    /// HTML tags to exclusively include.
    ///
    /// For example, if you pass `div`, you will only get content from `<div>`s and their children.
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude.
    ///
    /// For example, if you pass `img`, you will never get image URLs in your results.
    pub exclude_tags: Option<Vec<String>>,

    /// Additional HTTP headers to use when loading the page.
    pub headers: Option<HashMap<String, String>>,

    // Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`)
    pub wait_for: Option<u32>,

    // Timeout before returning an error, in milliseconds. (default: `60000`)
    pub timeout: Option<u32>,

    /// Extraction options, to be used in conjunction with `ScrapeFormats::Extract`.
    pub extract: Option<ExtractOptions>,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct ScrapeRequestBody {
    url: String,

    #[serde(flatten)]
    options: ScrapeOptions,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct ScrapeResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    /// No need to expose.
    success: bool,

    /// The resulting document.
    data: Document,
}

impl FirecrawlApp {
    pub async fn scrape_url(
        &self,
        url: impl AsRef<str>,
        options: Option<ScrapeOptions>,
    ) -> Result<Document, FirecrawlError> {
        let body = ScrapeRequestBody {
            url: url.as_ref().to_string(),
            options: options.unwrap_or_default(),
        };

        let headers = self.prepare_headers(None);

        let response = self
            .client
            .post(&format!("{}{}/scrape", self.api_url, API_VERSION))
            .headers(headers)
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;

        let response = self.handle_response::<ScrapeResponse>(response, "scrape URL").await?;

        Ok(response.data)
    }
}
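A matching scrape-side sketch (illustrative only, not part of the commit; assumes a tokio runtime, an API key in the environment, and a placeholder URL):

// Illustrative usage sketch; not part of this commit.
use firecrawl::FirecrawlApp;
use firecrawl::scrape::{ScrapeFormats, ScrapeOptions};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let app = FirecrawlApp::new(None, None)?;

    let options = ScrapeOptions {
        // Ask for Markdown plus the link list; everything else stays off.
        formats: Some(vec![ScrapeFormats::Markdown, ScrapeFormats::Links]),
        only_main_content: Some(true),
        ..Default::default()
    };

    let doc = app.scrape_url("https://example.com", Some(options)).await?;
    println!("{}", doc.markdown.unwrap_or_default());
    Ok(())
}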
@@ -1,5 +1,5 @@
use assert_matches::assert_matches;
use dotenv::dotenv;
use dotenvy::dotenv;
use firecrawl::FirecrawlApp;
use serde_json::json;
use std::env;
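dotenvy is a maintained fork of dotenv with the same dotenv() entry point, so only the import changes here. A hypothetical test skeleton using it (not part of the commit; the URL is a placeholder):

// Hypothetical test skeleton; not part of this commit.
use dotenvy::dotenv;
use firecrawl::FirecrawlApp;

#[tokio::test]
async fn scrape_smoke_test() {
    dotenv().ok(); // loads FIRECRAWL_API_KEY from a local .env file, if one exists
    let app = FirecrawlApp::new(None, None).expect("API key must be configured");
    let document = app
        .scrape_url("https://example.com", None)
        .await
        .expect("scrape should succeed");
    assert!(document.metadata.status_code < 400);
}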