mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00

fix(v1): update readme - v1.0.1

This commit is contained in:
parent 9e87d05b77
commit c7b3365ffd
````diff
@@ -18,29 +18,30 @@ npm install @mendable/firecrawl-js
 Here's an example of how to use the SDK with error handling:
 
 ```js
-import FirecrawlApp from "@mendable/firecrawl-js";
+import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';
 
-// Initialize the FirecrawlApp with your API key
-const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY" });
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 
-// Scrape a single URL
-const url = "https://mendable.ai";
-const scrapedData = await app.scrapeUrl(url);
+// Scrape a website
+const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
+  formats: ['markdown', 'html'],
+});
+
+if (scrapeResponse) {
+  console.log(scrapeResponse)
+}
 
 // Crawl a website
-const crawlUrl = "https://mendable.ai";
-const params = {
-  crawlerOptions: {
-    excludes: ["blog/"],
-    includes: [], // leave empty for all pages
-    limit: 1000,
-  },
-  pageOptions: {
-    onlyMainContent: true,
-  },
-};
-
-const crawlResult = await app.crawlUrl(crawlUrl, params);
+const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
+  limit: 100,
+  scrapeOptions: {
+    formats: ['markdown', 'html'],
+  }
+} as CrawlParams, true, 30) as CrawlStatusResponse;
+
+if (crawlResponse) {
+  console.log(crawlResponse)
+}
 ```
 
 ### Scraping a URL
````
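The new example is headed "with error handling" but, like the old one, shows no failure path. As a point of reference, here is a minimal sketch of wrapping the v1 scrape call above in a `try`/`catch`, assuming the SDK surfaces failed requests as thrown errors (the exact error type is not shown in this diff):

```js
import FirecrawlApp from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

try {
  // Same scrape call as the updated README example above
  const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
    formats: ['markdown', 'html'],
  });
  console.log(scrapeResponse);
} catch (error) {
  // Assumption: request failures are thrown; adapt to the SDK's actual error class
  console.error("Scrape failed:", error);
}
```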
````diff
@@ -57,28 +58,16 @@ const scrapedData = await app.scrapeUrl(url);
 To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
 
 ```js
-const crawlUrl = "https://example.com";
-
-const params = {
-  crawlerOptions: {
-    excludes: ["blog/"],
-    includes: [], // leave empty for all pages
-    limit: 1000,
-  },
-  pageOptions: {
-    onlyMainContent: true,
-  },
-};
-
-const waitUntilDone = true;
-const pollInterval = 5;
-
-const crawlResult = await app.crawlUrl(
-  crawlUrl,
-  params,
-  waitUntilDone,
-  pollInterval
-);
+const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
+  limit: 100,
+  scrapeOptions: {
+    formats: ['markdown', 'html'],
+  }
+} as CrawlParams, true, 30) as CrawlStatusResponse;
+
+if (crawlResponse) {
+  console.log(crawlResponse)
+}
 ```
 
 ### Checking Crawl Status
````
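Since the updated example casts its options object with `as CrawlParams`, TypeScript users may prefer to declare the params up front. A small sketch using only the fields and the positional arguments (`true`, `30`) that appear in this diff; treat it as illustrative rather than canonical:

```js
import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// Declaring the options separately keeps the call site short.
const params: CrawlParams = {
  limit: 100,
  scrapeOptions: {
    formats: ['markdown', 'html'],
  },
};

const crawlResponse = await app.crawlUrl('https://firecrawl.dev', params, true, 30) as CrawlStatusResponse;
console.log(crawlResponse);
```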
````diff
@@ -86,7 +75,7 @@ const crawlResult = await app.crawlUrl(
 To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
 
 ```js
-const status = await app.checkCrawlStatus(jobId);
+const status = await app.checkCrawlStatus(id);
 ```
 
 ### Extracting structured data from a URL
````
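To make the relationship between `crawlUrl` and `checkCrawlStatus` concrete, here is a hedged sketch of a non-blocking crawl followed by manual polling. It assumes that passing `false` for the wait flag (named `waitUntilDone` in the example removed above) makes `crawlUrl` return an object carrying the job `id`, and that the status response exposes a `status` string; none of those field names are confirmed by this diff, so verify them against the SDK typings.

```js
// Start the crawl without waiting for it to finish (field names below are assumptions).
const started = await app.crawlUrl('https://firecrawl.dev', { limit: 100 }, false);

let status = await app.checkCrawlStatus(started.id);
while (status.status !== 'completed' && status.status !== 'failed') {
  await new Promise((resolve) => setTimeout(resolve, 5000)); // poll every 5 seconds
  status = await app.checkCrawlStatus(started.id);
}
console.log(status);
```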
````diff
@@ -123,17 +112,13 @@ const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
 console.log(scrapeResult.data["llm_extraction"]);
 ```
 
-### Search for a query
+### Map a Website
 
-With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.
+Use `map_url` to generate a list of URLs from a website. The `params` argument let you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
 
 ```js
-const query = "what is mendable?";
-const searchResults = await app.search(query, {
-  pageOptions: {
-    fetchPageContent: true, // Fetch the page content for each search result
-  },
-});
+const mapResult = await app.mapUrl('https://example.com') as MapResponse;
+console.log(mapResult)
 ```
 
 ## Error Handling
````
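The new prose mentions options to exclude subdomains or use the sitemap without naming them. A hedged sketch of passing such options to `mapUrl`; the parameter names below are hypothetical placeholders, not taken from this diff:

```js
// Hypothetical option names; consult the v1 SDK typings for the real map parameters.
const mapResult = await app.mapUrl('https://example.com', {
  includeSubdomains: false, // hypothetical
  ignoreSitemap: false,     // hypothetical
});
console.log(mapResult);
```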
````diff
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/cjs/index.js",
   "types": "types/index.d.ts",
````
````diff
@@ -18,23 +18,28 @@ pip install firecrawl-py
 Here's an example of how to use the SDK:
 
 ```python
-from firecrawl import FirecrawlApp
+from firecrawl.firecrawl import FirecrawlApp
 
-# Initialize the FirecrawlApp with your API key
-app = FirecrawlApp(api_key='your_api_key')
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
 
-# Scrape a single URL
-url = 'https://mendable.ai'
-scraped_data = app.scrape_url(url)
+# Scrape a website:
+scrape_status = app.scrape_url(
+  'https://firecrawl.dev',
+  params={'formats': ['markdown', 'html']}
+)
+print(scrape_status)
 
-# Crawl a website
-crawl_url = 'https://mendable.ai'
-params = {
-  'pageOptions': {
-    'onlyMainContent': True
-  }
-}
-crawl_result = app.crawl_url(crawl_url, params=params)
+# Crawl a website:
+crawl_status = app.crawl_url(
+  'https://firecrawl.dev',
+  params={
+    'limit': 100,
+    'scrapeOptions': {'formats': ['markdown', 'html']}
+  },
+  wait_until_done=True,
+  poll_interval=30
+)
+print(crawl_status)
 ```
 
 ### Scraping a URL
````
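Later in this README the SDK is described as raising an exception when a job fails or is stopped, so the calls in the updated example can be wrapped defensively. A minimal sketch; the SDK's specific exception class is not named in this diff, so the generic `Exception` is caught:

```python
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

try:
    # Same scrape call as the updated README example above
    scrape_status = app.scrape_url(
        'https://firecrawl.dev',
        params={'formats': ['markdown', 'html']}
    )
    print(scrape_status)
except Exception as exc:  # assumption: failures raise; narrow this to the SDK's real exception type
    print(f"Scrape failed: {exc}")
```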
````diff
@@ -72,15 +77,6 @@ data = app.scrape_url('https://news.ycombinator.com', {
 print(data["llm_extraction"])
 ```
 
-### Search for a query
-
-Used to search the web, get the most relevant results, scrap each page and return the markdown.
-
-```python
-query = 'what is mendable?'
-search_result = app.search(query)
-```
-
 ### Crawling a Website
 
 To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
````
````diff
@@ -88,18 +84,16 @@ To crawl a website, use the `crawl_url` method. It takes the starting URL and op
 The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
 
 ```python
-crawl_url = 'https://example.com'
-params = {
-  'crawlerOptions': {
-    'excludes': ['blog/*'],
-    'includes': [], # leave empty for all pages
-    'limit': 1000,
-  },
-  'pageOptions': {
-    'onlyMainContent': True
-  }
-}
-crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
+crawl_status = app.crawl_url(
+  'https://firecrawl.dev',
+  params={
+    'limit': 100,
+    'scrapeOptions': {'formats': ['markdown', 'html']}
+  },
+  wait_until_done=True,
+  poll_interval=30
+)
+print(crawl_status)
 ```
 
 If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
````
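The paragraph above also describes the non-blocking mode (`wait_until_done=False`), which returns immediately with the job ID. A hedged sketch of that flow, assuming the returned object exposes the ID under `'id'` (as the check-status example just below reads it) and that the status payload is a dict with a `'status'` field; both key names are assumptions to verify against the SDK:

```python
import time

# Start the crawl without blocking (key names 'id' and 'status' are assumptions).
crawl_job = app.crawl_url('https://firecrawl.dev', params={'limit': 100}, wait_until_done=False)

status = app.check_crawl_status(crawl_job['id'])
while status.get('status') not in ('completed', 'failed'):
    time.sleep(5)  # poll every 5 seconds
    status = app.check_crawl_status(crawl_job['id'])
print(status)
```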
````diff
@@ -109,8 +103,18 @@ If `wait_until_done` is set to `True`, the `crawl_url` method will return the cr
 To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
 
 ```python
-job_id = crawl_result['jobId']
-status = app.check_crawl_status(job_id)
+id = crawl_result['id']
+status = app.check_crawl_status(id)
 ```
 
+### Map a Website
+
+Use `map_url` to generate a list of URLs from a website. The `params` argument let you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
+
+```python
+# Map a website:
+map_result = app.map_url('https://example.com')
+print(map_result)
+```
+
 ## Error Handling
````
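As with the JS SDK, the mapping prose mentions subdomain and sitemap options but the example passes none. A hedged sketch of supplying `params` to `map_url`; the option names below are hypothetical placeholders, not taken from this diff:

```python
# Hypothetical option names; check the SDK for the actual map parameters.
map_result = app.map_url('https://example.com', params={
    'includeSubdomains': False,  # hypothetical
    'ignoreSitemap': False,      # hypothetical
})
print(map_result)
```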
````diff
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
````