From c7b3365ffdde3392bbe672107c7de1dfe85089bf Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 28 Aug 2024 15:15:29 -0300
Subject: [PATCH] fix(v1): update readme - v1.0.1

---
 apps/js-sdk/README.md                 | 81 +++++++++++----------------
 apps/js-sdk/firecrawl/package.json    |  2 +-
 apps/python-sdk/README.md             | 78 ++++++++++++++------------
 apps/python-sdk/firecrawl/__init__.py |  2 +-
 4 files changed, 76 insertions(+), 87 deletions(-)

diff --git a/apps/js-sdk/README.md b/apps/js-sdk/README.md
index 397e5fc8..0368677a 100644
--- a/apps/js-sdk/README.md
+++ b/apps/js-sdk/README.md
@@ -18,29 +18,30 @@ npm install @mendable/firecrawl-js
 Here's an example of how to use the SDK with error handling:
 
 ```js
-import FirecrawlApp from "@mendable/firecrawl-js";
+import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';
 
-// Initialize the FirecrawlApp with your API key
-const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY" });
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 
-// Scrape a single URL
-const url = "https://mendable.ai";
-const scrapedData = await app.scrapeUrl(url);
+// Scrape a website
+const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
+  formats: ['markdown', 'html'],
+});
+
+if (scrapeResponse) {
+  console.log(scrapeResponse)
+}
 
 // Crawl a website
-const crawlUrl = "https://mendable.ai";
-const params = {
-  crawlerOptions: {
-    excludes: ["blog/"],
-    includes: [], // leave empty for all pages
-    limit: 1000,
-  },
-  pageOptions: {
-    onlyMainContent: true,
-  },
-};
+const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
+  limit: 100,
+  scrapeOptions: {
+    formats: ['markdown', 'html'],
+  }
+} as CrawlParams, true, 30) as CrawlStatusResponse; // true = wait until done, 30 = poll interval (s)
 
-const crawlResult = await app.crawlUrl(crawlUrl, params);
+if (crawlResponse) {
+  console.log(crawlResponse)
+}
 ```
 
 ### Scraping a URL
@@ -57,28 +58,16 @@ const scrapedData = await app.scrapeUrl(url);
 
 To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
 
 ```js
-const crawlUrl = "https://example.com";
+const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
+  limit: 100,
+  scrapeOptions: {
+    formats: ['markdown', 'html'],
+  }
+} as CrawlParams, true, 30) as CrawlStatusResponse; // true = wait until done, 30 = poll interval (s)
 
-const params = {
-  crawlerOptions: {
-    excludes: ["blog/"],
-    includes: [], // leave empty for all pages
-    limit: 1000,
-  },
-  pageOptions: {
-    onlyMainContent: true,
-  },
-};
-
-const waitUntilDone = true;
-const pollInterval = 5;
-
-const crawlResult = await app.crawlUrl(
-  crawlUrl,
-  params,
-  waitUntilDone,
-  pollInterval
-);
+if (crawlResponse) {
+  console.log(crawlResponse)
+}
 ```
 
 ### Checking Crawl Status
@@ -86,7 +75,7 @@ const crawlResult = await app.crawlUrl(
 To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
```js
-const status = await app.checkCrawlStatus(jobId);
+const status = await app.checkCrawlStatus(id);
 ```
 
 ### Extracting structured data from a URL
@@ -123,17 +112,13 @@ const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
 console.log(scrapeResult.data["llm_extraction"]);
 ```
 
-### Search for a query
+### Map a Website
 
-With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.
+Use `mapUrl` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
 
 ```js
-const query = "what is mendable?";
-const searchResults = await app.search(query, {
-  pageOptions: {
-    fetchPageContent: true, // Fetch the page content for each search result
-  },
-});
+const mapResult = await app.mapUrl('https://example.com') as MapResponse;
+console.log(mapResult)
 ```
 
 ## Error Handling
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index 58b125e4..d8bbc13f 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/cjs/index.js",
   "types": "types/index.d.ts",
diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md
index 8505fec6..0cf36e9c 100644
--- a/apps/python-sdk/README.md
+++ b/apps/python-sdk/README.md
@@ -18,23 +18,28 @@ pip install firecrawl-py
 Here's an example of how to use the SDK:
 
 ```python
-from firecrawl import FirecrawlApp
+from firecrawl.firecrawl import FirecrawlApp
 
-# Initialize the FirecrawlApp with your API key
-app = FirecrawlApp(api_key='your_api_key')
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
 
-# Scrape a single URL
-url = 'https://mendable.ai'
-scraped_data = app.scrape_url(url)
+# Scrape a website:
+scrape_status = app.scrape_url(
+  'https://firecrawl.dev',
+  params={'formats': ['markdown', 'html']}
+)
+print(scrape_status)
 
-# Crawl a website
-crawl_url = 'https://mendable.ai'
-params = {
-  'pageOptions': {
-    'onlyMainContent': True
-  }
-}
-crawl_result = app.crawl_url(crawl_url, params=params)
+# Crawl a website:
+crawl_status = app.crawl_url(
+  'https://firecrawl.dev',
+  params={
+    'limit': 100,
+    'scrapeOptions': {'formats': ['markdown', 'html']}
+  },
+  wait_until_done=True,
+  poll_interval=30
+)
+print(crawl_status)
 ```
 
 ### Scraping a URL
@@ -72,15 +77,6 @@ data = app.scrape_url('https://news.ycombinator.com', {
 print(data["llm_extraction"])
 ```
 
-### Search for a query
-
-Used to search the web, get the most relevant results, scrap each page and return the markdown.
-
-```python
-query = 'what is mendable?'
-search_result = app.search(query)
-```
-
 ### Crawling a Website
 
 To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
@@ -88,18 +84,16 @@ To crawl a website, use the `crawl_url` method. It takes the starting URL and op
 The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result.
If set to `True`, the method will periodically check the status of the crawl job (every `poll_interval` seconds) until the job is completed. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
 
 ```python
-crawl_url = 'https://example.com'
-params = {
-  'crawlerOptions': {
-    'excludes': ['blog/*'],
-    'includes': [], # leave empty for all pages
-    'limit': 1000,
-  },
-  'pageOptions': {
-    'onlyMainContent': True
-  }
-}
-crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
+crawl_status = app.crawl_url(
+  'https://firecrawl.dev',
+  params={
+    'limit': 100,
+    'scrapeOptions': {'formats': ['markdown', 'html']}
+  },
+  wait_until_done=True,
+  poll_interval=30
+)
+print(crawl_status)
 ```
 
 If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
@@ -109,8 +103,18 @@ If `wait_until_done` is set to `True`, the `crawl_url` method will return the cr
 ### Checking Crawl Status
 
 To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
 
 ```python
-job_id = crawl_result['jobId']
-status = app.check_crawl_status(job_id)
+id = crawl_result['id']
+status = app.check_crawl_status(id)
+```
+
+### Map a Website
+
+Use `map_url` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
+
+```python
+# Map a website:
+map_result = app.map_url('https://example.com')
+print(map_result)
 ```
 
 ## Error Handling
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index 1beaa043..229f9ccd 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
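
The updated Python README describes a non-blocking mode: with `wait_until_done=False`, `crawl_url` returns immediately with the job ID, and the status is then polled manually via `check_crawl_status`. None of the new examples exercise that path, so here is a minimal sketch of it, assuming the v1 methods shown above; the `'status'` key and its `'completed'`/`'failed'` values are assumptions about the v1 API rather than something this diff documents.

```python
import time

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

# Start the crawl without blocking; per the README, this returns
# immediately with the job ID instead of waiting for the result.
crawl_job = app.crawl_url(
    'https://firecrawl.dev',
    params={'limit': 100, 'scrapeOptions': {'formats': ['markdown', 'html']}},
    wait_until_done=False
)

# Poll manually, mirroring the 30-second poll_interval used in the
# blocking examples. The 'completed'/'failed' status values are an
# assumption about the v1 API, not documented in this patch.
status = app.check_crawl_status(crawl_job['id'])
while status.get('status') not in ('completed', 'failed'):
    time.sleep(30)
    status = app.check_crawl_status(crawl_job['id'])
print(status)
```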