mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
feat(sdk/rust): first batch of changes for 1.0.0
This commit is contained in:
parent
6aa468163e
commit
93a20442e3
5
.vscode/settings.json
vendored
Normal file
5
.vscode/settings.json
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
{
|
||||
"rust-analyzer.linkedProjects": [
|
||||
"apps/rust-sdk/Cargo.toml"
|
||||
]
|
||||
}
|
229
apps/rust-sdk/Cargo.lock
generated
229
apps/rust-sdk/Cargo.lock
generated
|
@ -26,6 +26,21 @@ dependencies = [
|
|||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "android-tzdata"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arrayref"
|
||||
version = "0.3.7"
|
||||
|
@ -151,6 +166,19 @@ version = "1.0.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
|
||||
dependencies = [
|
||||
"android-tzdata",
|
||||
"iana-time-zone",
|
||||
"num-traits",
|
||||
"serde",
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clippy"
|
||||
version = "0.0.302"
|
||||
|
@ -197,6 +225,51 @@ version = "0.8.20"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"darling_macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_core"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"ident_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_macro"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.3.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
|
||||
dependencies = [
|
||||
"powerfmt",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diff"
|
||||
version = "0.1.13"
|
||||
|
@ -215,10 +288,10 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "dotenv"
|
||||
version = "0.15.0"
|
||||
name = "dotenvy"
|
||||
version = "0.15.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f"
|
||||
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
|
@ -276,16 +349,17 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
|
|||
|
||||
[[package]]
|
||||
name = "firecrawl"
|
||||
version = "0.1.0"
|
||||
version = "1.0.0"
|
||||
dependencies = [
|
||||
"assert_matches",
|
||||
"clippy",
|
||||
"dotenv",
|
||||
"dotenvy",
|
||||
"log 0.4.22",
|
||||
"reqwest",
|
||||
"rustfmt",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"uuid",
|
||||
|
@ -426,13 +500,19 @@ dependencies = [
|
|||
"futures-core",
|
||||
"futures-sink",
|
||||
"http",
|
||||
"indexmap",
|
||||
"indexmap 2.2.6",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.14.5"
|
||||
|
@ -445,6 +525,12 @@ version = "0.3.9"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
|
||||
|
||||
[[package]]
|
||||
name = "hex"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.1.0"
|
||||
|
@ -558,6 +644,35 @@ dependencies = [
|
|||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.61"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220"
|
||||
dependencies = [
|
||||
"android_system_properties",
|
||||
"core-foundation-sys",
|
||||
"iana-time-zone-haiku",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"windows-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone-haiku"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ident_case"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.5.0"
|
||||
|
@ -568,6 +683,17 @@ dependencies = [
|
|||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "1.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
|
||||
dependencies = [
|
||||
"autocfg 1.3.0",
|
||||
"hashbrown 0.12.3",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.2.6"
|
||||
|
@ -575,7 +701,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown",
|
||||
"hashbrown 0.14.5",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -701,6 +828,12 @@ dependencies = [
|
|||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-conv"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.19"
|
||||
|
@ -846,6 +979,12 @@ version = "0.3.30"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.86"
|
||||
|
@ -1293,6 +1432,36 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_with"
|
||||
version = "3.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69cecfa94848272156ea67b2b1a53f20fc7bc638c4a46d2f8abde08f05f4b857"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"chrono",
|
||||
"hex",
|
||||
"indexmap 1.9.3",
|
||||
"indexmap 2.2.6",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"serde_with_macros",
|
||||
"time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_with_macros"
|
||||
version = "3.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8fee4991ef4f274617a51ad4af30519438dacb2f56ac773b08a1922ff743350"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.4.2"
|
||||
|
@ -1342,6 +1511,12 @@ dependencies = [
|
|||
"log 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "subtle"
|
||||
version = "2.6.1"
|
||||
|
@ -1489,6 +1664,37 @@ dependencies = [
|
|||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"itoa",
|
||||
"num-conv",
|
||||
"powerfmt",
|
||||
"serde",
|
||||
"time-core",
|
||||
"time-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
||||
|
||||
[[package]]
|
||||
name = "time-macros"
|
||||
version = "0.2.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
|
||||
dependencies = [
|
||||
"num-conv",
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.7.0"
|
||||
|
@ -1843,6 +2049,15 @@ version = "0.4.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows-core"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
|
||||
dependencies = [
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.48.0"
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
[package]
|
||||
name = "firecrawl"
|
||||
author= "Mendable.ai"
|
||||
version = "0.1.0"
|
||||
version = "1.0.0"
|
||||
edition = "2021"
|
||||
license = "GPL-2.0-or-later"
|
||||
license = "GPL-3.0-or-later"
|
||||
homepage = "https://www.firecrawl.dev/"
|
||||
repository ="https://github.com/mendableai/firecrawl"
|
||||
description = "Rust SDK for Firecrawl API."
|
||||
authors = ["sanix-darker <sanixdk@gmail.com>"]
|
||||
authors = ["Gergő Móricz <mogery@firecrawl.dev>", "sanix-darker <sanixdk@gmail.com>"]
|
||||
|
||||
[lib]
|
||||
path = "src/lib.rs"
|
||||
|
@ -18,6 +18,7 @@ name = "firecrawl"
|
|||
reqwest = { version = "^0.12", features = ["json", "blocking"] }
|
||||
serde = { version = "^1.0", features = ["derive"] }
|
||||
serde_json = "^1.0"
|
||||
serde_with = "^3.9"
|
||||
log = "^0.4"
|
||||
thiserror = "^1.0"
|
||||
uuid = { version = "^1.10", features = ["v4"] }
|
||||
|
@ -27,7 +28,7 @@ tokio = { version = "^1", features = ["full"] }
|
|||
clippy = "^0.0.302"
|
||||
rustfmt = "^0.10"
|
||||
assert_matches = "^1.5"
|
||||
dotenv = "^0.15"
|
||||
dotenvy = "^0.15"
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
|
||||
[build-dependencies]
|
||||
|
|
297
apps/rust-sdk/src/crawl.rs
Normal file
297
apps/rust-sdk/src/crawl.rs
Normal file
|
@ -0,0 +1,297 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{document::Document, scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp, FirecrawlError, API_VERSION};
|
||||
|
||||
/// Output formats that can be requested for each page scraped during a crawl.
///
/// This is the crawl-scoped subset of `ScrapeFormats` (see the `From` impl
/// below); serde renames map each variant to the API's wire name.
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
pub enum CrawlScrapeFormats {
    /// Will result in a copy of the Markdown content of the page.
    #[serde(rename = "markdown")]
    Markdown,

    /// Will result in a copy of the filtered, content-only HTML of the page.
    #[serde(rename = "html")]
    HTML,

    /// Will result in a copy of the raw HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Will result in a Vec of URLs found on the page.
    #[serde(rename = "links")]
    Links,

    /// Will result in a URL to a screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
    ///
    /// Can not be used in conjunction with `CrawlScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,
}
|
||||
|
||||
impl From<CrawlScrapeFormats> for ScrapeFormats {
|
||||
fn from(value: CrawlScrapeFormats) -> Self {
|
||||
match value {
|
||||
CrawlScrapeFormats::Markdown => Self::Markdown,
|
||||
CrawlScrapeFormats::HTML => Self::HTML,
|
||||
CrawlScrapeFormats::RawHTML => Self::RawHTML,
|
||||
CrawlScrapeFormats::Links => Self::Links,
|
||||
CrawlScrapeFormats::Screenshot => Self::Screenshot,
|
||||
CrawlScrapeFormats::ScreenshotFullPage => Self::ScreenshotFullPage,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct CrawlScrapeOptions {
|
||||
/// Formats to extract from the page. (default: `[ Markdown ]`)
|
||||
pub formats: Option<Vec<CrawlScrapeFormats>>,
|
||||
|
||||
/// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`)
|
||||
pub only_main_content: Option<bool>,
|
||||
|
||||
/// HTML tags to exclusively include.
|
||||
///
|
||||
/// For example, if you pass `div`, you will only get content from `<div>`s and their children.
|
||||
pub include_tags: Option<Vec<String>>,
|
||||
|
||||
/// HTML tags to exclude.
|
||||
///
|
||||
/// For example, if you pass `img`, you will never get image URLs in your results.
|
||||
pub exclude_tags: Option<Vec<String>>,
|
||||
|
||||
/// Additional HTTP headers to use when loading the page.
|
||||
pub headers: Option<HashMap<String, String>>,
|
||||
|
||||
// Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`)
|
||||
pub wait_for: Option<u32>,
|
||||
|
||||
// Timeout before returning an error, in milliseconds. (default: `60000`)
|
||||
pub timeout: Option<u32>,
|
||||
}
|
||||
|
||||
impl From<CrawlScrapeOptions> for ScrapeOptions {
|
||||
fn from(value: CrawlScrapeOptions) -> Self {
|
||||
ScrapeOptions {
|
||||
formats: value.formats.map(|formats| formats.into_iter().map(|x| x.into()).collect()),
|
||||
only_main_content: value.only_main_content,
|
||||
include_tags: value.include_tags,
|
||||
exclude_tags: value.exclude_tags,
|
||||
headers: value.headers,
|
||||
wait_for: value.wait_for,
|
||||
timeout: value.timeout,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct CrawlOptions {
|
||||
/// Options to pass through to the scraper.
|
||||
pub scrape_options: Option<CrawlScrapeOptions>,
|
||||
|
||||
/// URL RegEx patterns to (exclusively) include.
|
||||
///
|
||||
/// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled.
|
||||
pub include_paths: Option<String>,
|
||||
|
||||
/// URL RegEx patterns to exclude.
|
||||
///
|
||||
/// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled.
|
||||
pub exclude_paths: Option<String>,
|
||||
|
||||
/// Maximum URL depth to crawl, relative to the base URL. (default: `2`)
|
||||
pub max_depth: Option<u32>,
|
||||
|
||||
/// Tells the crawler to ignore the sitemap when crawling. (default: `true`)
|
||||
pub ignore_sitemap: Option<bool>,
|
||||
|
||||
/// Maximum number of pages to crawl. (default: `10`)
|
||||
pub limit: Option<u32>,
|
||||
|
||||
/// Allows the crawler to navigate links that are backwards in the URL hierarchy. (default: `false`)
|
||||
pub allow_backward_links: Option<bool>,
|
||||
|
||||
/// Allows the crawler to follow links to external URLs. (default: `false`)
|
||||
pub allow_external_links: Option<bool>,
|
||||
|
||||
/// URL to send Webhook crawl events to.
|
||||
pub webhook: Option<String>,
|
||||
|
||||
/// Idempotency key to send to the crawl endpoint.
|
||||
#[serde(skip)]
|
||||
pub idempotency_key: Option<String>,
|
||||
|
||||
/// When using `FirecrawlApp::crawl_url`, this is how often the status of the job should be checked, in milliseconds. (default: `2000`)
|
||||
#[serde(skip)]
|
||||
pub poll_interval: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct CrawlRequestBody {
|
||||
url: String,
|
||||
|
||||
#[serde(flatten)]
|
||||
options: CrawlOptions,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct CrawlResponse {
|
||||
/// This will always be `true` due to `FirecrawlApp::handle_response`.
|
||||
/// No need to expose.
|
||||
success: bool,
|
||||
|
||||
/// The resulting document.
|
||||
data: Document,
|
||||
}
|
||||
|
||||
/// Terminal and in-progress states a crawl job can be in, as reported by the
/// crawl status endpoint (serialized in camelCase, e.g. `"scraping"`).
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[serde(rename_all = "camelCase")]
pub enum CrawlStatusTypes {
    /// The crawl job is in progress.
    Scraping,

    /// The crawl job has been completed successfully.
    Completed,

    /// The crawl job has failed.
    Failed,

    /// The crawl job has been cancelled.
    Cancelled,
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct CrawlStatus {
|
||||
/// The status of the crawl.
|
||||
pub status: CrawlStatusTypes,
|
||||
|
||||
/// Number of pages that will be scraped in total. This number may grow as the crawler discovers new pages.
|
||||
pub total: u32,
|
||||
|
||||
/// Number of pages that have been successfully scraped.
|
||||
pub completed: u32,
|
||||
|
||||
/// Amount of credits used by the crawl job.
|
||||
pub credits_used: u32,
|
||||
|
||||
/// Expiry time of crawl data. After this date, the crawl data will be unavailable from the API.
|
||||
pub expires_at: String, // TODO: parse into date
|
||||
|
||||
/// URL to call to get the next batch of documents.
|
||||
/// Unless you are sidestepping the SDK, you do not need to deal with this.
|
||||
pub next: Option<String>,
|
||||
|
||||
/// List of documents returned by the crawl
|
||||
pub data: Vec<Document>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct CrawlAsyncResponse {
|
||||
success: bool,
|
||||
|
||||
/// Crawl ID
|
||||
pub id: String,
|
||||
|
||||
/// URL to get the status of the crawl job
|
||||
pub url: String,
|
||||
}
|
||||
|
||||
impl FirecrawlApp {
|
||||
pub async fn crawl_url_async(
|
||||
&self,
|
||||
url: impl AsRef<str>,
|
||||
options: Option<CrawlOptions>,
|
||||
) -> Result<CrawlAsyncResponse, FirecrawlError> {
|
||||
let body = CrawlRequestBody {
|
||||
url: url.as_ref().to_string(),
|
||||
options: options.unwrap_or_default(),
|
||||
};
|
||||
|
||||
let headers = self.prepare_headers(body.options.idempotency_key.as_ref());
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&format!("{}{}/crawl", self.api_url, API_VERSION))
|
||||
.headers(headers.clone())
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
|
||||
}
|
||||
|
||||
pub async fn crawl_url(
|
||||
&self,
|
||||
url: impl AsRef<str>,
|
||||
options: Option<CrawlOptions>,
|
||||
) -> Result<Vec<Document>, FirecrawlError> {
|
||||
let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);
|
||||
|
||||
let res = self.crawl_url_async(url, options).await?;
|
||||
|
||||
self.monitor_job_status(&res.id, poll_interval).await
|
||||
}
|
||||
|
||||
pub async fn check_crawl_status(&self, id: &str) -> Result<CrawlStatus, FirecrawlError> {
|
||||
let response = self
|
||||
.client
|
||||
.get(&format!(
|
||||
"{}{}/crawl/{}",
|
||||
self.api_url, API_VERSION, id
|
||||
))
|
||||
.headers(self.prepare_headers(None))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
self.handle_response(response, "check crawl status").await
|
||||
}
|
||||
|
||||
async fn monitor_job_status(
|
||||
&self,
|
||||
id: &str,
|
||||
poll_interval: u64,
|
||||
) -> Result<Vec<Document>, FirecrawlError> {
|
||||
loop {
|
||||
let status_data = self.check_crawl_status(id).await?;
|
||||
match status_data.status {
|
||||
CrawlStatusTypes::Completed => {
|
||||
return Ok(status_data.data);
|
||||
}
|
||||
CrawlStatusTypes::Scraping => {
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(poll_interval)).await;
|
||||
}
|
||||
CrawlStatusTypes::Failed => {
|
||||
return Err(FirecrawlError::CrawlJobFailed(format!(
|
||||
"Crawl job failed."
|
||||
)));
|
||||
}
|
||||
CrawlStatusTypes::Cancelled => {
|
||||
return Err(FirecrawlError::CrawlJobFailed(format!(
|
||||
"Crawl job was cancelled."
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
86
apps/rust-sdk/src/document.rs
Normal file
86
apps/rust-sdk/src/document.rs
Normal file
|
@ -0,0 +1,86 @@
|
|||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct DocumentMetadata {
|
||||
// firecrawl specific
|
||||
#[serde(rename = "sourceURL")]
|
||||
pub source_url: String,
|
||||
pub status_code: u16,
|
||||
pub error: Option<String>,
|
||||
|
||||
// basic meta tags
|
||||
pub title: String,
|
||||
pub description: String,
|
||||
pub language: Option<String>,
|
||||
pub keywords: Option<String>,
|
||||
pub robots: Option<String>,
|
||||
|
||||
// og: namespace
|
||||
pub og_title: Option<String>,
|
||||
pub og_description: Option<String>,
|
||||
pub og_url: Option<String>,
|
||||
pub og_image: Option<String>,
|
||||
pub og_audio: Option<String>,
|
||||
pub og_determiner: Option<String>,
|
||||
pub og_locale: Option<String>,
|
||||
pub og_locale_alternate: Option<String>,
|
||||
pub og_site_name: Option<String>,
|
||||
pub og_video: Option<String>,
|
||||
|
||||
// article: namespace
|
||||
pub article_section: Option<String>,
|
||||
pub article_tag: Option<String>,
|
||||
pub published_time: Option<String>,
|
||||
pub modified_time: Option<String>,
|
||||
|
||||
// dc./dcterms. namespace
|
||||
pub dcterms_keywords: Option<String>,
|
||||
pub dc_description: Option<String>,
|
||||
pub dc_subject: Option<String>,
|
||||
pub dcterms_subject: Option<String>,
|
||||
pub dcterms_audience: Option<String>,
|
||||
pub dc_type: Option<String>,
|
||||
pub dcterms_type: Option<String>,
|
||||
pub dc_date: Option<String>,
|
||||
pub dc_date_created: Option<String>,
|
||||
pub dcterms_created: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct Document {
|
||||
/// A list of the links on the page, present if `ScrapeFormats::Markdown` is present in `ScrapeOptions.formats`. (default)
|
||||
pub markdown: Option<String>,
|
||||
|
||||
/// The HTML of the page, present if `ScrapeFormats::HTML` is present in `ScrapeOptions.formats`.
|
||||
///
|
||||
/// This contains HTML that has non-content tags removed. If you need the original HTML, use `ScrapeFormats::RawHTML`.
|
||||
pub html: Option<String>,
|
||||
|
||||
/// The raw HTML of the page, present if `ScrapeFormats::RawHTML` is present in `ScrapeOptions.formats`.
|
||||
///
|
||||
/// This contains the original, untouched HTML on the page. If you only need human-readable content, use `ScrapeFormats::HTML`.
|
||||
pub raw_html: Option<String>,
|
||||
|
||||
/// The URL to the screenshot of the page, present if `ScrapeFormats::Screenshot` or `ScrapeFormats::ScreenshotFullPage` is present in `ScrapeOptions.formats`.
|
||||
pub screenshot: Option<String>,
|
||||
|
||||
/// A list of the links on the page, present if `ScrapeFormats::Links` is present in `ScrapeOptions.formats`.
|
||||
pub links: Option<Vec<String>>,
|
||||
|
||||
/// The extracted data from the page, present if `ScrapeFormats::Extract` is present in `ScrapeOptions.formats`.
|
||||
/// If `ScrapeOptions.extract.schema` is `Some`, this `Value` is guaranteed to match the provided schema.
|
||||
pub extract: Option<Value>,
|
||||
|
||||
/// The metadata from the page.
|
||||
pub metadata: DocumentMetadata,
|
||||
|
||||
/// Can be present if `ScrapeFormats::Extract` is present in `ScrapeOptions.formats`.
|
||||
/// The warning message will contain any errors encountered during the extraction.
|
||||
pub warning: Option<String>,
|
||||
}
|
||||
|
29
apps/rust-sdk/src/error.rs
Normal file
29
apps/rust-sdk/src/error.rs
Normal file
|
@ -0,0 +1,29 @@
|
|||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Structured error payload returned by the Firecrawl API on failed requests.
/// Wrapped by `FirecrawlError::APIError`.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct FirecrawlAPIError {
    /// Always false.
    success: bool,

    /// Error message
    pub error: String,

    /// Additional details of this error. Schema depends on the error itself.
    pub details: Option<Value>,
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum FirecrawlError {
|
||||
#[error("HTTP request failed: {0}")]
|
||||
HttpRequestFailed(String),
|
||||
#[error("API key not provided")]
|
||||
APIKeyNotProvided,
|
||||
#[error("Failed to parse response: {0}")]
|
||||
ResponseParseError(String),
|
||||
#[error("API error")]
|
||||
APIError(FirecrawlAPIError),
|
||||
#[error("Crawl job failed or stopped: {0}")]
|
||||
CrawlJobFailed(String),
|
||||
}
|
|
@ -1,40 +1,14 @@
|
|||
/*
|
||||
*
|
||||
* - Structs and Enums:
|
||||
* FirecrawlError: Custom error enum for handling various errors.
|
||||
* FirecrawlApp: Main struct for the application, holding API key, URL, and HTTP client.
|
||||
*
|
||||
* - Initialization:
|
||||
*
|
||||
* FirecrawlApp::new initializes the struct, fetching the API key and URL from environment variables if not provided.
|
||||
*
|
||||
* - API Methods:
|
||||
* scrape_url, search, crawl_url, check_crawl_status:
|
||||
* Methods for interacting with the Firecrawl API, similar to the Python methods.
|
||||
* monitor_job_status: Polls the API to monitor the status of a crawl job until completion.
|
||||
*/
|
||||
|
||||
use std::env;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use log::debug;
|
||||
use reqwest::{Client, Response};
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde_json::json;
|
||||
use serde_json::Value;
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum FirecrawlError {
|
||||
#[error("HTTP request failed: {0}")]
|
||||
HttpRequestFailed(String),
|
||||
#[error("API key not provided")]
|
||||
ApiKeyNotProvided,
|
||||
#[error("Failed to parse response: {0}")]
|
||||
ResponseParseError(String),
|
||||
#[error("Crawl job failed or stopped: {0}")]
|
||||
CrawlJobFailed(String),
|
||||
}
|
||||
pub mod crawl;
|
||||
pub mod document;
|
||||
mod error;
|
||||
pub mod scrape;
|
||||
|
||||
pub use error::FirecrawlError;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct FirecrawlApp {
|
||||
|
@ -42,26 +16,15 @@ pub struct FirecrawlApp {
|
|||
api_url: String,
|
||||
client: Client,
|
||||
}
|
||||
// the api verstion of firecrawl
|
||||
const API_VERSION: &str = "/v0";
|
||||
|
||||
pub(crate) const API_VERSION: &str = "/v1";
|
||||
|
||||
impl FirecrawlApp {
|
||||
/// Initialize the FirecrawlApp instance.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `api_key` (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||
/// * `api_url` (Optional[str]): Base URL for the Firecrawl API.
|
||||
pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
|
||||
let api_key = api_key
|
||||
.or_else(|| env::var("FIRECRAWL_API_KEY").ok())
|
||||
.ok_or(FirecrawlError::ApiKeyNotProvided)?;
|
||||
let api_url = api_url.unwrap_or_else(|| {
|
||||
env::var("FIRECRAWL_API_URL")
|
||||
.unwrap_or_else(|_| "https://api.firecrawl.dev".to_string())
|
||||
});
|
||||
|
||||
debug!("Initialized FirecrawlApp with API key: {}", api_key);
|
||||
debug!("Initialized FirecrawlApp with API URL: {}", api_url);
|
||||
.ok_or(FirecrawlError::APIKeyNotProvided)?;
|
||||
let api_url = api_url
|
||||
.unwrap_or_else(|| "https://api.firecrawl.dev".to_string());
|
||||
|
||||
Ok(FirecrawlApp {
|
||||
api_key,
|
||||
|
@ -70,237 +33,7 @@ impl FirecrawlApp {
|
|||
})
|
||||
}
|
||||
|
||||
/// Scrape the specified URL using the Firecrawl API.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `url` (str): The URL to scrape.
|
||||
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The scraped data if the request is successful.
|
||||
///
|
||||
/// # Raises:
|
||||
/// * `Exception`: If the scrape request fails.
|
||||
pub async fn scrape_url(
|
||||
&self,
|
||||
url: &str,
|
||||
params: Option<Value>,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
let headers = self.prepare_headers(None);
|
||||
let mut scrape_params = json!({"url": url});
|
||||
|
||||
if let Some(mut params) = params {
|
||||
if let Some(extractor_options) = params.get_mut("extractorOptions") {
|
||||
if let Some(extraction_schema) = extractor_options.get_mut("extractionSchema") {
|
||||
if extraction_schema.is_object() && extraction_schema.get("schema").is_some() {
|
||||
extractor_options["extractionSchema"] = extraction_schema["schema"].clone();
|
||||
}
|
||||
extractor_options["mode"] = extractor_options
|
||||
.get("mode")
|
||||
.cloned()
|
||||
.unwrap_or_else(|| json!("llm-extraction"));
|
||||
}
|
||||
scrape_params["extractorOptions"] = extractor_options.clone();
|
||||
}
|
||||
for (key, value) in params.as_object().unwrap() {
|
||||
if key != "extractorOptions" {
|
||||
scrape_params[key] = value.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&format!("{}{}/scrape", self.api_url, API_VERSION))
|
||||
.headers(headers)
|
||||
.json(&scrape_params)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
self.handle_response(response, "scrape URL").await
|
||||
}
|
||||
|
||||
/// Perform a search using the Firecrawl API.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `query` (str): The search query.
|
||||
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the search request.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The search results if the request is successful.
|
||||
///
|
||||
/// # Raises:
|
||||
/// * `Exception`: If the search request fails.
|
||||
pub async fn search(
|
||||
&self,
|
||||
query: &str,
|
||||
params: Option<Value>,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
let headers = self.prepare_headers(None);
|
||||
let mut json_data = json!({"query": query});
|
||||
if let Some(params) = params {
|
||||
for (key, value) in params.as_object().unwrap() {
|
||||
json_data[key] = value.clone();
|
||||
}
|
||||
}
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&format!("{}{}/search", self.api_url, API_VERSION))
|
||||
.headers(headers)
|
||||
.json(&json_data)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
self.handle_response(response, "search").await
|
||||
}
|
||||
|
||||
/// Initiate a crawl job for the specified URL using the Firecrawl API.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `url` (str): The URL to crawl.
|
||||
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
|
||||
/// * `wait_until_done` (bool): Whether to wait until the crawl job is completed.
|
||||
/// * `poll_interval` (int): Time in seconds between status checks when waiting for job completion.
|
||||
/// * `idempotency_key` (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The crawl job ID or the crawl results if waiting until completion.
|
||||
///
|
||||
/// # `Raises`:
|
||||
/// * `Exception`: If the crawl job initiation or monitoring fails.
|
||||
pub async fn crawl_url(
|
||||
&self,
|
||||
url: &str,
|
||||
params: Option<Value>,
|
||||
wait_until_done: bool,
|
||||
poll_interval: u64,
|
||||
idempotency_key: Option<String>,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
let headers = self.prepare_headers(idempotency_key);
|
||||
let mut json_data = json!({"url": url});
|
||||
if let Some(params) = params {
|
||||
for (key, value) in params.as_object().unwrap() {
|
||||
json_data[key] = value.clone();
|
||||
}
|
||||
}
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&format!("{}{}/crawl", self.api_url, API_VERSION))
|
||||
.headers(headers.clone())
|
||||
.json(&json_data)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
let response_json = self.handle_response(response, "start crawl job").await?;
|
||||
let job_id = response_json["jobId"].as_str().unwrap().to_string();
|
||||
|
||||
if wait_until_done {
|
||||
self.monitor_job_status(&job_id, headers, poll_interval)
|
||||
.await
|
||||
} else {
|
||||
Ok(json!({"jobId": job_id}))
|
||||
}
|
||||
}
|
||||
|
||||
/// Check the status of a crawl job using the Firecrawl API.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `job_id` (str): The ID of the crawl job.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The status of the crawl job.
|
||||
///
|
||||
/// # Raises:
|
||||
/// * `Exception`: If the status check request fails.
|
||||
pub async fn check_crawl_status(&self, job_id: &str) -> Result<Value, FirecrawlError> {
|
||||
let headers = self.prepare_headers(None);
|
||||
let response = self
|
||||
.client
|
||||
.get(&format!(
|
||||
"{}{}/crawl/status/{}",
|
||||
self.api_url, API_VERSION, job_id
|
||||
))
|
||||
.headers(headers)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
self.handle_response(response, "check crawl status").await
|
||||
}
|
||||
|
||||
/// Monitor the status of a crawl job until completion.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `job_id` (str): The ID of the crawl job.
|
||||
/// * `headers` (Dict[str, str]): The headers to include in the status check requests.
|
||||
/// * `poll_interval` (int): Secounds between status checks.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The crawl results if the job is completed successfully.
|
||||
///
|
||||
/// # Raises:
|
||||
/// Exception: If the job fails or an error occurs during status checks.
|
||||
async fn monitor_job_status(
|
||||
&self,
|
||||
job_id: &str,
|
||||
headers: reqwest::header::HeaderMap,
|
||||
poll_interval: u64,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
loop {
|
||||
let response = self
|
||||
.client
|
||||
.get(&format!(
|
||||
"{}{}/crawl/status/{}",
|
||||
self.api_url, API_VERSION, job_id
|
||||
))
|
||||
.headers(headers.clone())
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
let status_data = self.handle_response(response, "check crawl status").await?;
|
||||
match status_data["status"].as_str() {
|
||||
Some("completed") => {
|
||||
if status_data["data"].is_object() {
|
||||
return Ok(status_data["data"].clone());
|
||||
} else {
|
||||
return Err(FirecrawlError::CrawlJobFailed(
|
||||
"Crawl job completed but no data was returned".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
Some("active") | Some("paused") | Some("pending") | Some("queued")
|
||||
| Some("waiting") => {
|
||||
thread::sleep(Duration::from_secs(poll_interval));
|
||||
}
|
||||
Some(status) => {
|
||||
return Err(FirecrawlError::CrawlJobFailed(format!(
|
||||
"Crawl job failed or was stopped. Status: {}",
|
||||
status
|
||||
)));
|
||||
}
|
||||
None => {
|
||||
return Err(FirecrawlError::CrawlJobFailed(
|
||||
"Unexpected response: no status field".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Prepare the headers for API requests.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// `idempotency_key` (Optional[str]): A unique key to ensure idempotency of requests.
|
||||
///
|
||||
/// # Returns:
|
||||
/// Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
|
||||
fn prepare_headers(&self, idempotency_key: Option<String>) -> reqwest::header::HeaderMap {
|
||||
fn prepare_headers(&self, idempotency_key: Option<&String>) -> reqwest::header::HeaderMap {
|
||||
let mut headers = reqwest::header::HeaderMap::new();
|
||||
headers.insert("Content-Type", "application/json".parse().unwrap());
|
||||
headers.insert(
|
||||
|
@ -313,30 +46,22 @@ impl FirecrawlApp {
|
|||
headers
|
||||
}
|
||||
|
||||
/// Handle errors from API responses.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `response` (requests.Response): The response object from the API request.
|
||||
/// * `action` (str): Description of the action that was being performed.
|
||||
///
|
||||
/// # Raises:
|
||||
/// Exception: An exception with a message containing the status code and error details from the response.
|
||||
async fn handle_response(
|
||||
async fn handle_response<'a, T: DeserializeOwned>(
|
||||
&self,
|
||||
response: Response,
|
||||
action: &str,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
action: impl AsRef<str>,
|
||||
) -> Result<T, FirecrawlError> {
|
||||
if response.status().is_success() {
|
||||
let response_json: Value = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
|
||||
if response_json["success"].as_bool().unwrap_or(false) {
|
||||
Ok(response_json["data"].clone())
|
||||
Ok(serde_json::from_value(response_json).map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?)
|
||||
} else {
|
||||
Err(FirecrawlError::HttpRequestFailed(format!(
|
||||
"Failed to {}: {}",
|
||||
action, response_json["error"]
|
||||
action.as_ref(), response_json["error"]
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
|
@ -348,23 +73,23 @@ impl FirecrawlApp {
|
|||
let message = match status_code {
|
||||
402 => format!(
|
||||
"Payment Required: Failed to {}. {}",
|
||||
action, error_message["error"]
|
||||
action.as_ref(), error_message["error"]
|
||||
),
|
||||
408 => format!(
|
||||
"Request Timeout: Failed to {} as the request timed out. {}",
|
||||
action, error_message["error"]
|
||||
action.as_ref(), error_message["error"]
|
||||
),
|
||||
409 => format!(
|
||||
"Conflict: Failed to {} due to a conflict. {}",
|
||||
action, error_message["error"]
|
||||
action.as_ref(), error_message["error"]
|
||||
),
|
||||
500 => format!(
|
||||
"Internal Server Error: Failed to {}. {}",
|
||||
action, error_message["error"]
|
||||
action.as_ref(), error_message["error"]
|
||||
),
|
||||
_ => format!(
|
||||
"Unexpected error during {}: Status code {}. {}",
|
||||
action, status_code, error_message["error"]
|
||||
action.as_ref(), status_code, error_message["error"]
|
||||
),
|
||||
};
|
||||
Err(FirecrawlError::HttpRequestFailed(message))
|
||||
|
|
139
apps/rust-sdk/src/scrape.rs
Normal file
139
apps/rust-sdk/src/scrape.rs
Normal file
|
@ -0,0 +1,139 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::{document::Document, FirecrawlApp, FirecrawlError, API_VERSION};
|
||||
|
||||
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
|
||||
pub enum ScrapeFormats {
|
||||
/// Will result in a copy of the Markdown content of the page.
|
||||
#[serde(rename = "markdown")]
|
||||
Markdown,
|
||||
|
||||
/// Will result in a copy of the filtered, content-only HTML of the page.
|
||||
#[serde(rename = "html")]
|
||||
HTML,
|
||||
|
||||
/// Will result in a copy of the raw HTML of the page.
|
||||
#[serde(rename = "rawHtml")]
|
||||
RawHTML,
|
||||
|
||||
/// Will result in a Vec of URLs found on the page.
|
||||
#[serde(rename = "links")]
|
||||
Links,
|
||||
|
||||
/// Will result in a URL to a screenshot of the page.
|
||||
///
|
||||
/// Can not be used in conjunction with `ScrapeFormats::ScreenshotFullPage`.
|
||||
#[serde(rename = "screenshot")]
|
||||
Screenshot,
|
||||
|
||||
/// Will result in a URL to a full-page screenshot of the page.
|
||||
///
|
||||
/// Can not be used in conjunction with `ScrapeFormats::Screenshot`.
|
||||
#[serde(rename = "screenshot@fullPage")]
|
||||
ScreenshotFullPage,
|
||||
|
||||
/// Will result in the results of an LLM extraction.
|
||||
///
|
||||
/// See `ScrapeOptions.extract` for more options.
|
||||
#[serde(rename = "extract")]
|
||||
Extract,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct ExtractOptions {
|
||||
/// Schema the output should adhere to, provided in JSON Schema format.
|
||||
pub schema: Option<Value>,
|
||||
|
||||
pub system_prompt: Option<Value>,
|
||||
|
||||
/// Extraction prompt to send to the LLM agent along with the page content.
|
||||
pub prompt: Option<Value>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct ScrapeOptions {
|
||||
/// Formats to extract from the page. (default: `[ Markdown ]`)
|
||||
pub formats: Option<Vec<ScrapeFormats>>,
|
||||
|
||||
/// Only extract the main content of the page, excluding navigation and other miscellaneous content. (default: `true`)
|
||||
pub only_main_content: Option<bool>,
|
||||
|
||||
/// HTML tags to exclusively include.
|
||||
///
|
||||
/// For example, if you pass `div`, you will only get content from `<div>`s and their children.
|
||||
pub include_tags: Option<Vec<String>>,
|
||||
|
||||
/// HTML tags to exclude.
|
||||
///
|
||||
/// For example, if you pass `img`, you will never get image URLs in your results.
|
||||
pub exclude_tags: Option<Vec<String>>,
|
||||
|
||||
/// Additional HTTP headers to use when loading the page.
|
||||
pub headers: Option<HashMap<String, String>>,
|
||||
|
||||
// Amount of time to wait after loading the page, and before grabbing the content, in milliseconds. (default: `0`)
|
||||
pub wait_for: Option<u32>,
|
||||
|
||||
// Timeout before returning an error, in milliseconds. (default: `60000`)
|
||||
pub timeout: Option<u32>,
|
||||
|
||||
/// Extraction options, to be used in conjunction with `ScrapeFormats::Extract`.
|
||||
pub extract: Option<ExtractOptions>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct ScrapeRequestBody {
|
||||
url: String,
|
||||
|
||||
#[serde(flatten)]
|
||||
options: ScrapeOptions,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Default)]
|
||||
#[serde_with::skip_serializing_none]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct ScrapeResponse {
|
||||
/// This will always be `true` due to `FirecrawlApp::handle_response`.
|
||||
/// No need to expose.
|
||||
success: bool,
|
||||
|
||||
/// The resulting document.
|
||||
data: Document,
|
||||
}
|
||||
|
||||
impl FirecrawlApp {
|
||||
pub async fn scrape_url(
|
||||
&self,
|
||||
url: impl AsRef<str>,
|
||||
options: Option<ScrapeOptions>,
|
||||
) -> Result<Document, FirecrawlError> {
|
||||
let body = ScrapeRequestBody {
|
||||
url: url.as_ref().to_string(),
|
||||
options: options.unwrap_or_default(),
|
||||
};
|
||||
|
||||
let headers = self.prepare_headers(None);
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&format!("{}{}/scrape", self.api_url, API_VERSION))
|
||||
.headers(headers)
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
let response = self.handle_response::<ScrapeResponse>(response, "scrape URL").await?;
|
||||
|
||||
Ok(response.data)
|
||||
}
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
use assert_matches::assert_matches;
|
||||
use dotenv::dotenv;
|
||||
use dotenvy::dotenv;
|
||||
use firecrawl::FirecrawlApp;
|
||||
use serde_json::json;
|
||||
use std::env;
|
||||
|
|
Loading…
Reference in New Issue
Block a user