diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index c2693c3d..d4b1d6ee 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -36,13 +36,15 @@ class FirecrawlApp:
             raise ValueError('No API key provided')
         logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")
 
-    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None,
+                   timeout: Optional[float] = None) -> Any:
         """
         Scrape the specified URL using the Firecrawl API.
 
         Args:
             url (str): The URL to scrape.
             params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Any: The scraped data if the request is successful.
@@ -76,6 +78,7 @@ class FirecrawlApp:
             f'{self.api_url}{endpoint}',
             headers=headers,
             json=scrape_params,
+            timeout=timeout
         )
         if response.status_code == 200:
             response = response.json()
@@ -108,7 +111,9 @@ class FirecrawlApp:
 
     def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None,
                   poll_interval: Optional[int] = 2,
-                  idempotency_key: Optional[str] = None) -> Any:
+                  idempotency_key: Optional[str] = None,
+                  timeout: Optional[float] = None
+                  ) -> Any:
         """
         Initiate a crawl job for the specified URL using the Firecrawl API.
 
@@ -117,6 +122,7 @@ class FirecrawlApp:
             params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
             poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Dict[str, Any]: A dictionary containing the crawl results. The structure includes:
@@ -136,7 +142,7 @@ class FirecrawlApp:
         json_data = {'url': url}
         if params:
             json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
         if response.status_code == 200:
             id = response.json().get('id')
-            return self._monitor_job_status(id, headers, poll_interval)
+            return self._monitor_job_status(id, headers, poll_interval, timeout=timeout)
@@ -145,7 +151,8 @@ class FirecrawlApp:
             self._handle_error(response, 'start crawl job')
 
-    def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+    def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None,
+                        timeout: Optional[float] = None) -> Dict[str, Any]:
         """
         Initiate a crawl job asynchronously.
 
@@ -153,6 +160,7 @@ class FirecrawlApp:
             url (str): The URL to crawl.
             params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes:
@@ -165,18 +173,19 @@ class FirecrawlApp:
         json_data = {'url': url}
         if params:
             json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
         if response.status_code == 200:
             return response.json()
         else:
             self._handle_error(response, 'start crawl job')
 
-    def check_crawl_status(self, id: str) -> Any:
+    def check_crawl_status(self, id: str, timeout: Optional[float] = None) -> Any:
         """
         Check the status of a crawl job using the Firecrawl API.
 
         Args:
             id (str): The ID of the crawl job.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Any: The status of the crawl job.
@@ -187,7 +196,7 @@ class FirecrawlApp:
         endpoint = f'/v1/crawl/{id}'
 
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}{endpoint}', headers)
+        response = self._get_request(f'{self.api_url}{endpoint}', headers, timeout=timeout)
         if response.status_code == 200:
             status_data = response.json()
             if status_data['status'] == 'completed':
@@ -225,24 +234,27 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'check crawl status')
 
-    def cancel_crawl(self, id: str) -> Dict[str, Any]:
+    def cancel_crawl(self, id: str, timeout: Optional[float] = None) -> Dict[str, Any]:
         """
         Cancel an asynchronous crawl job using the Firecrawl API.
 
         Args:
             id (str): The ID of the crawl job to cancel.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Dict[str, Any]: The response from the cancel crawl request.
         """
         headers = self._prepare_headers()
-        response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
+        response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers, timeout=timeout)
         if response.status_code == 200:
             return response.json()
         else:
             self._handle_error(response, "cancel crawl job")
 
-    def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
+    def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None,
+                            idempotency_key: Optional[str] = None,
+                            timeout: Optional[float] = None) -> 'CrawlWatcher':
         """
         Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
 
@@ -250,23 +262,25 @@ class FirecrawlApp:
             url (str): The URL to crawl.
             params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             CrawlWatcher: An instance of CrawlWatcher to monitor the crawl job.
         """
-        crawl_response = self.async_crawl_url(url, params, idempotency_key)
+        crawl_response = self.async_crawl_url(url, params, idempotency_key, timeout=timeout)
         if crawl_response['success'] and 'id' in crawl_response:
             return CrawlWatcher(crawl_response['id'], self)
         else:
             raise Exception("Crawl job failed to start")
 
-    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None, timeout: Optional[float] = None) -> Any:
         """
         Perform a map search using the Firecrawl API.
 
         Args:
             url (str): The URL to perform the map search on.
             params (Optional[Dict[str, Any]]): Additional parameters for the map search.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             List[str]: A list of URLs discovered during the map search.
@@ -284,6 +298,7 @@ class FirecrawlApp:
             f'{self.api_url}{endpoint}',
             headers=headers,
             json=json_data,
+            timeout=timeout
         )
         if response.status_code == 200:
             response = response.json()
@@ -299,7 +314,9 @@ class FirecrawlApp:
 
     def batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None,
                           poll_interval: Optional[int] = 2,
-                          idempotency_key: Optional[str] = None) -> Any:
+                          idempotency_key: Optional[str] = None,
+                          timeout: Optional[float] = None
+                          ) -> Any:
         """
         Initiate a batch scrape job for the specified URLs using the Firecrawl API.
 
@@ -308,6 +325,7 @@ class FirecrawlApp:
             params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
             poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
@@ -327,7 +345,7 @@ class FirecrawlApp:
         json_data = {'urls': urls}
         if params:
             json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
         if response.status_code == 200:
             id = response.json().get('id')
-            return self._monitor_job_status(id, headers, poll_interval)
+            return self._monitor_job_status(id, headers, poll_interval, timeout=timeout)
@@ -336,7 +354,9 @@ class FirecrawlApp:
             self._handle_error(response, 'start batch scrape job')
 
-    def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+    def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None,
+                                idempotency_key: Optional[str] = None,
+                                timeout: Optional[float] = None) -> Dict[str, Any]:
         """
-        Initiate a crawl job asynchronously.
+        Initiate a batch scrape job asynchronously.
 
@@ -344,6 +364,8 @@ class FirecrawlApp:
             urls (list[str]): The URLs to scrape.
             params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
+
         Returns:
             Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
@@ -356,13 +378,15 @@ class FirecrawlApp:
         json_data = {'urls': urls}
         if params:
             json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
         if response.status_code == 200:
             return response.json()
         else:
             self._handle_error(response, 'start batch scrape job')
 
-    def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
+    def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None,
+                                    idempotency_key: Optional[str] = None,
+                                    timeout: Optional[float] = None) -> 'CrawlWatcher':
         """
         Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
 
@@ -370,22 +394,24 @@ class FirecrawlApp:
             urls (list[str]): The URLs to scrape.
             params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
             idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
         """
-        crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
+        crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key, timeout=timeout)
         if crawl_response['success'] and 'id' in crawl_response:
             return CrawlWatcher(crawl_response['id'], self)
         else:
             raise Exception("Batch scrape job failed to start")
 
-    def check_batch_scrape_status(self, id: str) -> Any:
+    def check_batch_scrape_status(self, id: str, timeout: Optional[float] = None) -> Any:
         """
         Check the status of a batch scrape job using the Firecrawl API.
 
         Args:
             id (str): The ID of the batch scrape job.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Any: The status of the batch scrape job.
@@ -396,7 +422,7 @@ class FirecrawlApp:
         endpoint = f'/v1/batch/scrape/{id}'
 
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}{endpoint}', headers)
+        response = self._get_request(f'{self.api_url}{endpoint}', headers, timeout=timeout)
        if response.status_code == 200:
             status_data = response.json()
             if status_data['status'] == 'completed':
@@ -460,7 +486,9 @@ class FirecrawlApp:
                       data: Dict[str, Any],
                       headers: Dict[str, str],
                       retries: int = 3,
-                      backoff_factor: float = 0.5) -> requests.Response:
+                      backoff_factor: float = 0.5,
+                      timeout: Optional[float] = None
+                      ) -> requests.Response:
         """
         Make a POST request with retries.
 
         Args:
@@ -470,6 +498,7 @@ class FirecrawlApp:
             headers (Dict[str, str]): The headers to include in the POST request.
             retries (int): Number of retries for the request.
             backoff_factor (float): Backoff factor for retries.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             requests.Response: The response from the POST request.
@@ -478,7 +507,7 @@ class FirecrawlApp:
             requests.RequestException: If the request fails after the specified retries.
         """
         for attempt in range(retries):
-            response = requests.post(url, headers=headers, json=data)
+            response = requests.post(url, headers=headers, json=data, timeout=timeout)
             if response.status_code == 502:
                 time.sleep(backoff_factor * (2 ** attempt))
             else:
@@ -488,7 +517,8 @@ class FirecrawlApp:
     def _get_request(self, url: str,
                      headers: Dict[str, str],
                      retries: int = 3,
-                     backoff_factor: float = 0.5) -> requests.Response:
+                     backoff_factor: float = 0.5,
+                     timeout: Optional[float] = None) -> requests.Response:
         """
         Make a GET request with retries.
 
         Args:
@@ -497,6 +527,7 @@ class FirecrawlApp:
             url (str): The URL to send the GET request to.
             headers (Dict[str, str]): The headers to include in the GET request.
             retries (int): Number of retries for the request.
             backoff_factor (float): Backoff factor for retries.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             requests.Response: The response from the GET request.
@@ -505,7 +536,7 @@ class FirecrawlApp:
             requests.RequestException: If the request fails after the specified retries.
         """
         for attempt in range(retries):
-            response = requests.get(url, headers=headers)
+            response = requests.get(url, headers=headers, timeout=timeout)
             if response.status_code == 502:
                 time.sleep(backoff_factor * (2 ** attempt))
             else:
@@ -515,7 +546,8 @@ class FirecrawlApp:
     def _delete_request(self, url: str,
                         headers: Dict[str, str],
                         retries: int = 3,
-                        backoff_factor: float = 0.5) -> requests.Response:
+                        backoff_factor: float = 0.5,
+                        timeout: Optional[float] = None) -> requests.Response:
         """
         Make a DELETE request with retries.
@@ -524,6 +556,7 @@ class FirecrawlApp:
             headers (Dict[str, str]): The headers to include in the DELETE request.
             retries (int): Number of retries for the request.
             backoff_factor (float): Backoff factor for retries.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             requests.Response: The response from the DELETE request.
@@ -532,21 +565,24 @@ class FirecrawlApp:
             requests.RequestException: If the request fails after the specified retries.
         """
         for attempt in range(retries):
-            response = requests.delete(url, headers=headers)
+            response = requests.delete(url, headers=headers, timeout=timeout)
             if response.status_code == 502:
                 time.sleep(backoff_factor * (2 ** attempt))
             else:
                 return response
         return response
 
-    def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any:
+    def _monitor_job_status(self, id: str, headers: Dict[str, str],
+                            poll_interval: int,
+                            timeout: Optional[float] = None) -> Any:
         """
         Monitor the status of a crawl job until completion.
 
         Args:
             id (str): The ID of the crawl job.
             headers (Dict[str, str]): The headers to include in the status check requests.
-            poll_interval (int): Secounds between status checks.
+            poll_interval (int): Seconds between status checks.
+            timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
 
         Returns:
             Any: The crawl results if the job is completed successfully.
@@ -555,15 +591,14 @@ class FirecrawlApp:
         """
         while True:
             api_url = f'{self.api_url}/v1/crawl/{id}'
-
-            status_response = self._get_request(api_url, headers)
+            status_response = self._get_request(api_url, headers, timeout=timeout)
             if status_response.status_code == 200:
                 status_data = status_response.json()
                 if status_data['status'] == 'completed':
                     if 'data' in status_data:
                         data = status_data['data']
                         while 'next' in status_data:
-                            status_response = self._get_request(status_data['next'], headers)
+                            status_response = self._get_request(status_data['next'], headers, timeout=timeout)
                             status_data = status_response.json()
                             data.extend(status_data['data'])
                             status_data['data'] = data
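For reviewers, a minimal usage sketch of the new parameter. The API key, URLs, and the `limit` crawl option are placeholders; the signatures match the hunks above:

```python
import requests

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

try:
    # timeout is forwarded to the underlying requests call (seconds)
    data = app.scrape_url("https://example.com", timeout=15.0)
except requests.exceptions.Timeout:
    data = None  # the API did not answer within 15 s

# The same keyword is threaded through the crawl and batch entry points:
job = app.async_crawl_url("https://example.com", params={"limit": 10}, timeout=15.0)
status = app.check_crawl_status(job["id"], timeout=15.0)
```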
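One behavior worth noting in review: the timeout applies per HTTP attempt, and because the loops in `_post_request`/`_get_request`/`_delete_request` only back off and retry on HTTP 502 responses, a `requests.exceptions.Timeout` propagates to the caller immediately rather than being retried. Callers who want timed-out requests retried as well need their own wrapper; a minimal sketch, using a hypothetical helper that is not part of the SDK:

```python
import time

import requests

def retry_on_timeout(call, attempts=3, base_delay=1.0):
    """Hypothetical helper: re-run `call` on timeout, with exponential backoff."""
    for attempt in range(attempts):
        try:
            return call()
        except requests.exceptions.Timeout:
            if attempt == attempts - 1:
                raise  # out of attempts; surface the timeout
            time.sleep(base_delay * (2 ** attempt))

# Example: allow three tries, each capped at 10 s per attempt.
# result = retry_on_timeout(lambda: app.scrape_url("https://example.com", timeout=10.0))
```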