Nick: fixed pip issues

2024-11-16 11:42:24 +08:00 · 2024-05-26 18:03:37 -07:00 · 2024-05-26 18:03:37 -07:00 · 7948c6cee2
commit 7948c6cee2
parent e5c8719554
9 changed files with 166 additions and 23 deletions
--- a/apps/python-sdk/build/lib/firecrawl/firecrawl.py
+++ b/apps/python-sdk/build/lib/firecrawl/firecrawl.py
@ -1,25 +1,57 @@
+"""
+FirecrawlApp Module
+
+This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs. The module uses requests for HTTP communication
+and handles retries for certain HTTP status codes.
+
+Classes:
+    - FirecrawlApp: Main class for interacting with the Firecrawl API.
+"""
+
 import os
-from typing import Any, Dict, Optional
-import requests
 import time
+from typing import Any, Dict, Optional
+
+import requests
+

 class FirecrawlApp:
-    def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
+    """
+    Client for the Firecrawl API; holds the API key and base URL used by every request.
+
+    Args:
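+        api_key (Optional[str]): API key for authenticating with the Firecrawl API. Falls back to the FIRECRAWL_API_KEY environment variable; a ValueError is raised if neither is set.
+        api_url (Optional[str]): Base URL for the Firecrawl API. Falls back to the FIRECRAWL_API_URL environment variable, then to 'https://api.firecrawl.dev'.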
+    """
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if self.api_key is None:
            raise ValueError('No API key provided')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
-    
-    
-
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Scrape the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
+
+        Returns:
+            Any: The scraped data if the request is successful.
+
+        Raises:
+            Exception: If the scrape request fails.
+        """
+
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
        }
        # Prepare the base scrape parameters with the URL
        scrape_params = {'url': url}
-        
+
        # If there are additional params, process them
        if params:
            # Initialize extractorOptions if present
@ -32,7 +64,7 @@ class FirecrawlApp:
                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
                # Update the scrape_params with the processed extractorOptions
                scrape_params['extractorOptions'] = extractor_options
-            
+
            # Include any other params directly at the top level of scrape_params
            for key, value in params.items():
                if key != 'extractorOptions':
@ -41,11 +73,11 @@ class FirecrawlApp:
        response = requests.post(
            f'{self.api_url}/v0/scrape',
            headers=headers,
-            json=scrape_params
+            json=scrape_params,
        )
        if response.status_code == 200:
            response = response.json()
-            if response['success']:
+            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
@ -54,8 +86,21 @@ class FirecrawlApp:
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
-        
+
    def search(self, query, params=None):
+        """
+        Perform a search using the Firecrawl API.
+
+        Args:
+            query (str): The search query.
+            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+
+        Returns:
+            Any: The search results if the request is successful.
+
+        Raises:
+            Exception: If the search request fails.
+        """
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
@ -70,19 +115,36 @@ class FirecrawlApp:
        )
        if response.status_code == 200:
            response = response.json()
-            if response['success'] == True:
+            
+            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to search. Error: {response["error"]}')
-            
+
        elif response.status_code in [402, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Failed to search. Status code: {response.status_code}')

-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
-        headers = self._prepare_headers()
+    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
+        """
+        Initiate a crawl job for the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to crawl.
+            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
+            wait_until_done (bool): Whether to wait until the crawl job is completed.
+            timeout (int): Timeout between status checks when waiting for job completion.
+            idempotency_key (Optional[str]): A unique key (e.g. a UUID) sent as the 'x-idempotency-key' header to ensure idempotency of requests.
+
+        Returns:
+            Any: The crawl job ID or the crawl results if waiting until completion.
+
+        Raises:
+            Exception: If the crawl job initiation or monitoring fails.
+        """
+        headers = self._prepare_headers(idempotency_key)
        json_data = {'url': url}
        if params:
            json_data.update(params)
@ -97,6 +159,18 @@ class FirecrawlApp:
            self._handle_error(response, 'start crawl job')

    def check_crawl_status(self, job_id):
+        """
+        Check the status of a crawl job using the Firecrawl API.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+
+        Returns:
+            Any: The status of the crawl job.
+
+        Raises:
+            Exception: If the status check request fails.
+        """
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
        if response.status_code == 200:
@ -104,13 +178,45 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'check crawl status')

-    def _prepare_headers(self):
+    def _prepare_headers(self, idempotency_key=None):
+        """
+        Prepare the headers for API requests.
+
+        Args:
+            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
+        """
+        if idempotency_key:
+            return {
+                'Content-Type': 'application/json',
+                'Authorization': f'Bearer {self.api_key}',
+                'x-idempotency-key': idempotency_key
+            }
+
        return {
            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
+            'Authorization': f'Bearer {self.api_key}',
        }

    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a POST request with retries.
+
+        Args:
+            url (str): The URL to send the POST request to.
+            data (Dict[str, Any]): The JSON data to include in the POST request.
+            headers (Dict[str, str]): The headers to include in the POST request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the POST request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
        for attempt in range(retries):
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 502:
@ -120,6 +226,21 @@ class FirecrawlApp:
        return response

    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a GET request with retries.
+
+        Args:
+            url (str): The URL to send the GET request to.
+            headers (Dict[str, str]): The headers to include in the GET request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the GET request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
        for attempt in range(retries):
            response = requests.get(url, headers=headers)
            if response.status_code == 502:
@ -129,7 +250,20 @@ class FirecrawlApp:
        return response

    def _monitor_job_status(self, job_id, headers, timeout):
-        import time
+        """
+        Monitor the status of a crawl job until completion.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+            headers (Dict[str, str]): The headers to include in the status check requests.
+            timeout (int): Seconds to sleep between status checks; values below 2 are raised to 2.
+
+        Returns:
+            Any: The crawl results if the job is completed successfully.
+
+        Raises:
+            Exception: If the job fails or an error occurs during status checks.
+        """
        while True:
            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
            if status_response.status_code == 200:
@ -140,8 +274,7 @@ class FirecrawlApp:
                    else:
                        raise Exception('Crawl job completed but no data was returned')
                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
+                    timeout=max(timeout,2)
                    time.sleep(timeout)  # Wait for the specified timeout before checking again
                else:
                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
@ -149,6 +282,16 @@ class FirecrawlApp:
                self._handle_error(status_response, 'check crawl status')

    def _handle_error(self, response, action):
+        """
+        Handle errors from API responses.
+
+        Args:
+            response (requests.Response): The response object from the API request.
+            action (str): Description of the action that was being performed.
+
+        Raises:
+            Exception: An exception with a message containing the status code and error details from the response.
+        """
        if response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
--- a/apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz
+++ b/apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz
--- a/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz
+++ b/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz
--- a/apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl
+++ b/apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl
--- a/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl
+++ b/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl
--- a/apps/python-sdk/firecrawl/pycache/init.cpython-311.pyc
+++ b/apps/python-sdk/firecrawl/pycache/init.cpython-311.pyc
--- a/apps/python-sdk/firecrawl/pycache/firecrawl.cpython-311.pyc
+++ b/apps/python-sdk/firecrawl/pycache/firecrawl.cpython-311.pyc
--- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
+++ b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl-py
-Version: 0.0.9
+Version: 0.0.10
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
--- a/apps/python-sdk/setup.py
+++ b/apps/python-sdk/setup.py
@ -2,7 +2,7 @@ from setuptools import setup, find_packages

 setup(
    name='firecrawl-py',
-    version='0.0.9',
+    version='0.0.10',
    url='https://github.com/mendableai/firecrawl',
    author='Mendable.ai',
    author_email='nick@mendable.ai',