Merge pull request #880 from mendableai/python-sdk/next-handler

[SDK] Added a next handler for the Python SDK (the JS SDK is already OK)
Nicolas 2024-11-11 14:48:30 -05:00 committed by GitHub
commit 766377621e
2 changed files with 61 additions and 19 deletions

@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.4.0"
+__version__ = "1.5.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
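
For reference, a quick way to confirm that an installed SDK includes this change is to check the version bumped above (assuming the package is imported as firecrawl, as in the diff; this snippet is not part of the commit):

import firecrawl

# 1.5.0 is the first version that ships the next handler, per the bump above.
print(firecrawl.__version__)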

@@ -189,17 +189,38 @@ class FirecrawlApp:
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}{endpoint}', headers)
         if response.status_code == 200:
-            data = response.json()
+            status_data = response.json()
+            if status_data['status'] == 'completed':
+                if 'data' in status_data:
+                    data = status_data['data']
+                    while 'next' in status_data:
+                        next_url = status_data.get('next')
+                        if not next_url:
+                            logger.warning("Expected 'next' URL is missing.")
+                            break
+                        try:
+                            status_response = self._get_request(next_url, headers)
+                            if status_response.status_code != 200:
+                                logger.error(f"Failed to fetch next page: {status_response.status_code}")
+                                break
+                            status_data = status_response.json()
+                            data.extend(status_data.get('data', []))
+                        except Exception as e:
+                            logger.error(f"Error during pagination request: {e}")
+                            break
+                    status_data.pop('next', None)
+                    status_data['data'] = data
+
             return {
                 'success': True,
-                'status': data.get('status'),
-                'total': data.get('total'),
-                'completed': data.get('completed'),
-                'creditsUsed': data.get('creditsUsed'),
-                'expiresAt': data.get('expiresAt'),
-                'next': data.get('next'),
-                'data': data.get('data'),
-                'error': data.get('error')
+                'status': status_data.get('status'),
+                'total': status_data.get('total'),
+                'completed': status_data.get('completed'),
+                'creditsUsed': status_data.get('creditsUsed'),
+                'expiresAt': status_data.get('expiresAt'),
+                'data': status_data.get('data'),
+                'error': status_data.get('error'),
+                'next': status_data.get('next', None)
             }
         else:
             self._handle_error(response, 'check crawl status')
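
The hunk above changes the crawl status check so that, once a job reports completed, the SDK keeps requesting the response's next URL and concatenates every page's data list before returning. Below is a minimal standalone sketch of that same next-link pattern, using requests directly; the helper name, the headers argument, and the timeout are illustrative assumptions, not part of this diff:

import logging

import requests

logger = logging.getLogger("firecrawl")

def collect_all_pages(first_page: dict, headers: dict) -> list:
    # Start from the first page's 'data', then follow 'next' links until none remain.
    data = list(first_page.get('data', []))
    status_data = first_page
    while 'next' in status_data:
        next_url = status_data.get('next')
        if not next_url:
            break
        response = requests.get(next_url, headers=headers, timeout=30)  # assumed plain GET with auth headers
        if response.status_code != 200:
            logger.error(f"Failed to fetch next page: {response.status_code}")
            break
        status_data = response.json()
        data.extend(status_data.get('data', []))
    return data

The same loop appears again in the hunk below for the batch scrape status check; only the error label differs.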
@@ -377,17 +398,38 @@ class FirecrawlApp:
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}{endpoint}', headers)
         if response.status_code == 200:
-            data = response.json()
+            status_data = response.json()
+            if status_data['status'] == 'completed':
+                if 'data' in status_data:
+                    data = status_data['data']
+                    while 'next' in status_data:
+                        next_url = status_data.get('next')
+                        if not next_url:
+                            logger.warning("Expected 'next' URL is missing.")
+                            break
+                        try:
+                            status_response = self._get_request(next_url, headers)
+                            if status_response.status_code != 200:
+                                logger.error(f"Failed to fetch next page: {status_response.status_code}")
+                                break
+                            status_data = status_response.json()
+                            data.extend(status_data.get('data', []))
+                        except Exception as e:
+                            logger.error(f"Error during pagination request: {e}")
+                            break
+                    status_data.pop('next', None)
+                    status_data['data'] = data
+
             return {
                 'success': True,
-                'status': data.get('status'),
-                'total': data.get('total'),
-                'completed': data.get('completed'),
-                'creditsUsed': data.get('creditsUsed'),
-                'expiresAt': data.get('expiresAt'),
-                'next': data.get('next'),
-                'data': data.get('data'),
-                'error': data.get('error')
+                'status': status_data.get('status'),
+                'total': status_data.get('total'),
+                'completed': status_data.get('completed'),
+                'creditsUsed': status_data.get('creditsUsed'),
+                'expiresAt': status_data.get('expiresAt'),
+                'data': status_data.get('data'),
+                'error': status_data.get('error'),
+                'next': status_data.get('next', None)
            }
         else:
             self._handle_error(response, 'check batch scrape status')
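
With both status checks paginating internally, callers get the full result set from a single call. A hedged usage sketch: the constructor arguments and the job id are placeholders, and check_crawl_status taking the crawl id is assumed from the SDK's public API rather than shown in this hunk:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder credentials
status = app.check_crawl_status("your-crawl-id")  # placeholder job id

if status['status'] == 'completed':
    # 'data' now holds every page merged by the SDK; 'next' comes back as None here.
    print(f"{len(status['data'])} documents, {status['creditsUsed']} credits used")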