mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00

Compare commits: 1a87fd954c ... b1302d6221 (4 commits)

Commits (SHA1):
b1302d6221
c95a4a26c9
3a342bfbf0
6f45ab6691
@@ -186,9 +186,10 @@ export const webhookSchema = z.preprocess(x => {
   headers: z.record(z.string(), z.string()).default({}),
 }).strict(strictMessage))
 
-export const batchScrapeRequestSchema = scrapeOptions.extend({
+export const batchScrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
   urls: url.array(),
   origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000),
   webhook: webhookSchema.optional(),
 }).strict(strictMessage).refine(
   (obj) => {
@@ -199,12 +200,7 @@ export const batchScrapeRequestSchema = scrapeOptions.extend({
   {
     message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
   }
-).transform((obj) => {
-  if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-    return { ...obj, timeout: 60000 };
-  }
-  return obj;
-});
+);
 
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
 
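Net effect: the batch scrape timeout is now a schema-level field with a 60000 ms default applied to every request, replacing a transform that only back-filled it for "extract" jobs. A minimal sketch of what that means for a raw API call, assuming a hypothetical local deployment (URL and key are placeholders):

import requests

# Placeholders: point these at your own Firecrawl deployment and key.
API_URL = "http://localhost:3002/v1/batch/scrape"
HEADERS = {"Authorization": "Bearer fc-YOUR-KEY", "Content-Type": "application/json"}

# No "timeout" in the payload: the schema default (60000 ms) now applies to
# every batch scrape, not only those whose formats include "extract".
payload = {"urls": ["https://example.com"], "formats": ["markdown"]}
resp = requests.post(API_URL, headers=HEADERS, json=payload)
print(resp.json())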
@@ -13,12 +13,12 @@ export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeR
     headers: {
       "Content-Type": "application/json",
     },
-    body: JSON.stringify({
+    body: {
       url: meta.url,
       wait_after_load: meta.options.waitFor,
       timeout,
       headers: meta.options.headers,
-    }),
+    },
     method: "POST",
     logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
     schema: z.object({
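This fixes a double-serialization bug: robustFetch evidently JSON-encodes the body it is given, so wrapping it in JSON.stringify sent the Playwright service a JSON string rather than a JSON object. The same mistake and fix, sketched in Python with requests (the service URL is a placeholder):

import json
import requests

payload = {"url": "https://example.com", "wait_after_load": 0, "timeout": 15000}

# Wrong: json= serializes its argument, so the server receives a quoted
# string ("{\"url\": ...}") instead of an object.
requests.post("http://localhost:3003/scrape", json=json.dumps(payload))

# Right: pass the dict and let the client serialize exactly once.
requests.post("http://localhost:3003/scrape", json=payload)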
@@ -36,13 +36,15 @@ class FirecrawlApp:
             raise ValueError('No API key provided')
         logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")
 
-    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None,
+                   timeout: Optional[float] = None) -> Any:
         """
         Scrape the specified URL using the Firecrawl API.
 
         Args:
             url (str): The URL to scrape.
             params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Any: The scraped data if the request is successful.
@@ -76,6 +78,7 @@ class FirecrawlApp:
                 f'{self.api_url}{endpoint}',
                 headers=headers,
                 json=scrape_params,
+                timeout=timeout
             )
             if response.status_code == 200:
                 response = response.json()
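scrape_url now threads a client-side timeout straight into requests.post. A hedged usage sketch (the API key is a placeholder; note the SDK timeout is in seconds and bounds the HTTP round-trip, unlike the millisecond timeout field the API accepts in the request body):

import requests
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

try:
    # Give up client-side if the HTTP request takes longer than 30 s.
    data = app.scrape_url("https://example.com", timeout=30)
except requests.exceptions.Timeout:
    print("scrape request timed out before the server responded")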
@@ -108,7 +111,9 @@ class FirecrawlApp:
     def crawl_url(self, url: str,
                   params: Optional[Dict[str, Any]] = None,
                   poll_interval: Optional[int] = 2,
-                  idempotency_key: Optional[str] = None) -> Any:
+                  idempotency_key: Optional[str] = None,
+                  timeout: Optional[float] = None
+                  ) -> Any:
         """
         Initiate a crawl job for the specified URL using the Firecrawl API.
 
@@ -117,6 +122,7 @@ class FirecrawlApp:
             params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
             poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Dict[str, Any]: A dictionary containing the crawl results. The structure includes:
@@ -136,7 +142,7 @@ class FirecrawlApp:
         json_data = {'url': url}
         if params:
             json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
         if response.status_code == 200:
             id = response.json().get('id')
             return self._monitor_job_status(id, headers, poll_interval)
@@ -145,7 +151,8 @@ class FirecrawlApp:
             self._handle_error(response, 'start crawl job')
 
 
-    def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+    def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None,
+                        timeout: Optional[float] = None) -> Dict[str, Any]:
         """
         Initiate a crawl job asynchronously.
 
@@ -153,6 +160,7 @@ class FirecrawlApp:
             url (str): The URL to crawl.
             params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes:
@@ -165,18 +173,19 @@ class FirecrawlApp:
         json_data = {'url': url}
         if params:
             json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
         if response.status_code == 200:
             return response.json()
         else:
             self._handle_error(response, 'start crawl job')
 
-    def check_crawl_status(self, id: str) -> Any:
+    def check_crawl_status(self, id: str, timeout: Optional[float] = None) -> Any:
         """
         Check the status of a crawl job using the Firecrawl API.
 
         Args:
             id (str): The ID of the crawl job.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Any: The status of the crawl job.
@@ -187,7 +196,7 @@ class FirecrawlApp:
         endpoint = f'/v1/crawl/{id}'
 
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}{endpoint}', headers)
+        response = self._get_request(f'{self.api_url}{endpoint}', headers, timeout=timeout)
         if response.status_code == 200:
             status_data = response.json()
             if status_data['status'] == 'completed':
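One subtlety: in the blocking crawl_url above, timeout reaches the initial POST, but the unchanged _monitor_job_status call means its polling GETs still run with no client-side timeout. The async variants give you full control; a hedged sketch (placeholder key, illustrative params):

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

# Start the crawl and poll it yourself, so the 15 s timeout bounds every
# HTTP round-trip (the POST and each status GET), not the job as a whole.
job = app.async_crawl_url("https://example.com", params={"limit": 10}, timeout=15)
status = app.check_crawl_status(job["id"], timeout=15)
print(status["status"])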
@@ -225,24 +234,27 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'check crawl status')
 
-    def cancel_crawl(self, id: str) -> Dict[str, Any]:
+    def cancel_crawl(self, id: str, timeout: Optional[float] = None) -> Dict[str, Any]:
         """
         Cancel an asynchronous crawl job using the Firecrawl API.
 
         Args:
             id (str): The ID of the crawl job to cancel.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Dict[str, Any]: The response from the cancel crawl request.
         """
         headers = self._prepare_headers()
-        response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
+        response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers, timeout=timeout)
         if response.status_code == 200:
             return response.json()
         else:
             self._handle_error(response, "cancel crawl job")
 
-    def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
+    def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None,
+                            idempotency_key: Optional[str] = None,
+                            timeout: Optional[float] = None) -> 'CrawlWatcher':
         """
         Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
 
@@ -250,23 +262,25 @@ class FirecrawlApp:
             url (str): The URL to crawl.
             params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             CrawlWatcher: An instance of CrawlWatcher to monitor the crawl job.
         """
-        crawl_response = self.async_crawl_url(url, params, idempotency_key)
+        crawl_response = self.async_crawl_url(url, params, idempotency_key, timeout=timeout)
         if crawl_response['success'] and 'id' in crawl_response:
             return CrawlWatcher(crawl_response['id'], self)
         else:
             raise Exception("Crawl job failed to start")
 
-    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None, timeout: Optional[float] = None) -> Any:
         """
         Perform a map search using the Firecrawl API.
 
         Args:
             url (str): The URL to perform the map search on.
             params (Optional[Dict[str, Any]]): Additional parameters for the map search.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             List[str]: A list of URLs discovered during the map search.
@@ -284,6 +298,7 @@ class FirecrawlApp:
             f'{self.api_url}{endpoint}',
             headers=headers,
             json=json_data,
+            timeout=timeout
         )
         if response.status_code == 200:
             response = response.json()
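map_url gets the same treatment, with the timeout landing on its requests.post call. Briefly (placeholder key):

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

# Discover URLs on the site; fail fast if the endpoint hangs past 10 s.
links = app.map_url("https://example.com", timeout=10)
print(links)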
@@ -299,7 +314,9 @@ class FirecrawlApp:
     def batch_scrape_urls(self, urls: list[str],
                           params: Optional[Dict[str, Any]] = None,
                           poll_interval: Optional[int] = 2,
-                          idempotency_key: Optional[str] = None) -> Any:
+                          idempotency_key: Optional[str] = None,
+                          timeout: Optional[float] = None
+                          ) -> Any:
         """
         Initiate a batch scrape job for the specified URLs using the Firecrawl API.
 
@@ -308,6 +325,7 @@ class FirecrawlApp:
             params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
             poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
@@ -327,7 +345,7 @@ class FirecrawlApp:
         json_data = {'urls': urls}
         if params:
             json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
         if response.status_code == 200:
             id = response.json().get('id')
             return self._monitor_job_status(id, headers, poll_interval)
@@ -336,7 +354,9 @@ class FirecrawlApp:
         self._handle_error(response, 'start batch scrape job')
 
 
-    def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+    def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None,
+                                idempotency_key: Optional[str] = None,
+                                timeout: Optional[float] = None) -> Dict[str, Any]:
         """
         Initiate a crawl job asynchronously.
 
@@ -344,6 +364,8 @@ class FirecrawlApp:
             urls (list[str]): The URLs to scrape.
             params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
+
 
         Returns:
             Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
@@ -356,13 +378,15 @@ class FirecrawlApp:
         json_data = {'urls': urls}
         if params:
             json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
         if response.status_code == 200:
             return response.json()
         else:
             self._handle_error(response, 'start batch scrape job')
 
-    def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
+    def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None,
+                                    idempotency_key: Optional[str] = None,
+                                    timeout: Optional[float] = None) -> 'CrawlWatcher':
         """
         Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
 
@@ -370,22 +394,24 @@ class FirecrawlApp:
             urls (list[str]): The URLs to scrape.
             params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
         """
-        crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
+        crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key, timeout=timeout)
         if crawl_response['success'] and 'id' in crawl_response:
             return CrawlWatcher(crawl_response['id'], self)
         else:
             raise Exception("Batch scrape job failed to start")
 
-    def check_batch_scrape_status(self, id: str) -> Any:
+    def check_batch_scrape_status(self, id: str, timeout: Optional[float] = None) -> Any:
         """
         Check the status of a batch scrape job using the Firecrawl API.
 
         Args:
             id (str): The ID of the batch scrape job.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Any: The status of the batch scrape job.
@@ -396,7 +422,7 @@ class FirecrawlApp:
         endpoint = f'/v1/batch/scrape/{id}'
 
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}{endpoint}', headers)
+        response = self._get_request(f'{self.api_url}{endpoint}', headers, timeout=timeout)
         if response.status_code == 200:
             status_data = response.json()
             if status_data['status'] == 'completed':
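The batch scrape path mirrors the crawl path, so the same per-request semantics apply. A hedged sketch (placeholder key):

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

urls = ["https://example.com", "https://example.org"]
job = app.async_batch_scrape_urls(urls, timeout=20)            # bounds the POST
status = app.check_batch_scrape_status(job["id"], timeout=20)  # bounds the GET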
@@ -460,7 +486,9 @@ class FirecrawlApp:
                        data: Dict[str, Any],
                        headers: Dict[str, str],
                        retries: int = 3,
-                       backoff_factor: float = 0.5) -> requests.Response:
+                       backoff_factor: float = 0.5,
+                       timeout: Optional[float] = None
+                       ) -> requests.Response:
         """
         Make a POST request with retries.
 
@@ -470,6 +498,7 @@ class FirecrawlApp:
             headers (Dict[str, str]): The headers to include in the POST request.
             retries (int): Number of retries for the request.
             backoff_factor (float): Backoff factor for retries.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             requests.Response: The response from the POST request.
@@ -478,7 +507,7 @@ class FirecrawlApp:
             requests.RequestException: If the request fails after the specified retries.
         """
         for attempt in range(retries):
-            response = requests.post(url, headers=headers, json=data)
+            response = requests.post(url, headers=headers, json=data, timeout=timeout)
             if response.status_code == 502:
                 time.sleep(backoff_factor * (2 ** attempt))
             else:
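Since timeout bounds each attempt and the 502 retry loop sleeps backoff_factor * 2**attempt between attempts, the worst case for these helpers is roughly retries * timeout plus the sleeps. A sketch of the arithmetic (the helper name is mine, not part of the SDK):

from typing import Optional

def worst_case_seconds(retries: int = 3, backoff_factor: float = 0.5,
                       timeout: Optional[float] = None) -> Optional[float]:
    """Rough upper bound for a _post_request-style retry loop hitting 502s."""
    if timeout is None:
        return None  # requests waits indefinitely without a timeout
    sleeps = sum(backoff_factor * (2 ** attempt) for attempt in range(retries))
    return retries * timeout + sleeps

print(worst_case_seconds(timeout=10.0))  # 3*10 + (0.5 + 1.0 + 2.0) = 33.5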
@@ -488,7 +517,8 @@ class FirecrawlApp:
     def _get_request(self, url: str,
                      headers: Dict[str, str],
                      retries: int = 3,
-                     backoff_factor: float = 0.5) -> requests.Response:
+                     backoff_factor: float = 0.5,
+                     timeout: Optional[float] = None) -> requests.Response:
         """
         Make a GET request with retries.
 
@@ -497,6 +527,7 @@ class FirecrawlApp:
             headers (Dict[str, str]): The headers to include in the GET request.
             retries (int): Number of retries for the request.
             backoff_factor (float): Backoff factor for retries.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             requests.Response: The response from the GET request.
@@ -505,7 +536,7 @@ class FirecrawlApp:
             requests.RequestException: If the request fails after the specified retries.
         """
         for attempt in range(retries):
-            response = requests.get(url, headers=headers)
+            response = requests.get(url, headers=headers, timeout=timeout)
             if response.status_code == 502:
                 time.sleep(backoff_factor * (2 ** attempt))
             else:
@@ -515,7 +546,8 @@ class FirecrawlApp:
     def _delete_request(self, url: str,
                         headers: Dict[str, str],
                         retries: int = 3,
-                        backoff_factor: float = 0.5) -> requests.Response:
+                        backoff_factor: float = 0.5,
+                        timeout: Optional[float] = None) -> requests.Response:
         """
         Make a DELETE request with retries.
 
@@ -524,6 +556,7 @@ class FirecrawlApp:
             headers (Dict[str, str]): The headers to include in the DELETE request.
             retries (int): Number of retries for the request.
             backoff_factor (float): Backoff factor for retries.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             requests.Response: The response from the DELETE request.
@@ -532,21 +565,24 @@ class FirecrawlApp:
             requests.RequestException: If the request fails after the specified retries.
         """
         for attempt in range(retries):
-            response = requests.delete(url, headers=headers)
+            response = requests.delete(url, headers=headers, timeout=timeout)
             if response.status_code == 502:
                 time.sleep(backoff_factor * (2 ** attempt))
             else:
                 return response
         return response
 
-    def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any:
+    def _monitor_job_status(self, id: str, headers: Dict[str, str],
+                            poll_interval: int,
+                            timeout: Optional[float] = None) -> Any:
         """
         Monitor the status of a crawl job until completion.
 
         Args:
             id (str): The ID of the crawl job.
             headers (Dict[str, str]): The headers to include in the status check requests.
-            poll_interval (int): Secounds between status checks.
+            poll_interval (int): Seconds between status checks.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
         Returns:
             Any: The crawl results if the job is completed successfully.
 
@@ -555,15 +591,14 @@ class FirecrawlApp:
         """
         while True:
             api_url = f'{self.api_url}/v1/crawl/{id}'
-
-            status_response = self._get_request(api_url, headers)
+            status_response = self._get_request(api_url, headers, timeout=timeout)
             if status_response.status_code == 200:
                 status_data = status_response.json()
                 if status_data['status'] == 'completed':
                     if 'data' in status_data:
                         data = status_data['data']
                         while 'next' in status_data:
-                            status_response = self._get_request(status_data['next'], headers)
+                            status_response = self._get_request(status_data['next'], headers, timeout=timeout)
                             status_data = status_response.json()
                             data.extend(status_data['data'])
                             status_data['data'] = data
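Even after this change, the timeout inside _monitor_job_status caps each status poll, not the overall wait: the while True loop will poll a never-completing job forever. If you need a wall-clock bound on the whole job, you still have to layer one on yourself; a hedged sketch of one way (the wrapper is mine, not part of the SDK):

import time

from firecrawl import FirecrawlApp

def crawl_with_deadline(app: FirecrawlApp, url: str, deadline_s: float = 300.0):
    """Poll a crawl manually and abandon it after a wall-clock deadline."""
    job = app.async_crawl_url(url, timeout=15)  # 15 s per HTTP call
    start = time.monotonic()
    while time.monotonic() - start < deadline_s:
        status = app.check_crawl_status(job["id"], timeout=15)
        if status.get("status") == "completed":
            return status
        time.sleep(2)
    raise TimeoutError(f"crawl {job['id']} exceeded {deadline_s}s")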