+ assert len(response['data'][0]['html']) > 10
+ assert 'metadata' in response['data'][0]
+ assert 'title' in response['data'][0]['metadata']
+ assert 'description' in response['data'][0]['metadata']
+ assert 'language' in response['data'][0]['metadata']
+ assert 'sourceURL' in response['data'][0]['metadata']
+ assert 'statusCode' in response['data'][0]['metadata']
+ assert 'error' not in response['data'][0]['metadata']
def test_crawl_url_with_idempotency_key_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
uniqueIdempotencyKey = str(uuid4())
- response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+ response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, False, 2, uniqueIdempotencyKey)
assert response is not None
- assert len(response) > 0
- assert 'content' in response[0]
- assert "_Roast_" in response[0]['content']
+ assert 'id' in response
with pytest.raises(Exception) as excinfo:
- app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
- assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
+ app.crawl_url('https://firecrawl.dev', {'excludePaths': ['blog/*']}, True, 2, uniqueIdempotencyKey)
+ assert "Idempotency key already used" in str(excinfo.value)
def test_check_crawl_status_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
- response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
+ response = app.crawl_url('https://firecrawl.dev', {'scrapeOptions': {'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}}, False)
assert response is not None
- assert 'jobId' in response
+ assert 'id' in response
- time.sleep(30) # wait for 30 seconds
- status_response = app.check_crawl_status(response['jobId'])
+ max_checks = 15
+ checks = 0
+ status_response = app.check_crawl_status(response['id'])
+
+ while status_response['status'] == 'scraping' and checks < max_checks:
+ time.sleep(1) # wait for 1 second
+ assert 'partial_data' not in status_response
+ assert 'current' not in status_response
+ assert 'data' in status_response
+ assert 'totalCount' in status_response
+ assert 'creditsUsed' in status_response
+ assert 'expiresAt' in status_response
+ assert 'status' in status_response
+ assert 'next' in status_response
+ assert status_response['totalCount'] > 0
+ assert status_response['creditsUsed'] > 0
+ assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
+ assert status_response['status'] == 'scraping'
+ assert '/v1/crawl/' in status_response['next']
+ status_response = app.check_crawl_status(response['id'])
+ checks += 1
+
assert status_response is not None
+ assert 'totalCount' in status_response
+ assert status_response['totalCount'] > 0
+ assert 'creditsUsed' in status_response
+ assert status_response['creditsUsed'] > 0
+ assert 'expiresAt' in status_response
+ assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
assert 'status' in status_response
assert status_response['status'] == 'completed'
- assert 'data' in status_response
assert len(status_response['data']) > 0
+ assert 'markdown' in status_response['data'][0]
+ assert len(status_response['data'][0]['markdown']) > 10
+ assert 'content' not in status_response['data'][0]
+ assert 'html' in status_response['data'][0]
+ assert "<html" in status_response['data'][0]['html']
+ assert len(status_response['data'][0]['html']) > 10
+ assert 'metadata' in status_response['data'][0]
+ assert 'title' in status_response['data'][0]['metadata']
+ assert 'description' in status_response['data'][0]['metadata']
+ assert 'language' in status_response['data'][0]['metadata']
+ assert 'sourceURL' in status_response['data'][0]['metadata']
+ assert 'statusCode' in status_response['data'][0]['metadata']
+ assert 'error' not in status_response['data'][0]['metadata']
+
+def test_invalid_api_key_on_map():
+ invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
+ with pytest.raises(Exception) as excinfo:
+ invalid_app.map_url('https://roastmywebsite.ai')
+ assert "Unauthorized: Invalid token" in str(excinfo.value)
+
+def test_blocklisted_url_on_map():
+ app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
+ blocklisted_url = "https://facebook.com/fake-test"
+ with pytest.raises(Exception) as excinfo:
+ app.map_url(blocklisted_url)
+ assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+
+def test_successful_response_with_valid_preview_token_on_map():
+ app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)
+ response = app.map_url('https://roastmywebsite.ai')
+ assert response is not None
+ assert len(response) > 0
+
+def test_successful_response_for_valid_map():
+ app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
+ response = app.map_url('https://roastmywebsite.ai')
+ assert response is not None
+ assert len(response) > 0
+ assert any("https://" in link for link in response)
+ filtered_links = [link for link in response if "roastmywebsite.ai" in link]
+ assert len(filtered_links) > 0
def test_search_e2e():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -136,33 +324,29 @@ def test_search_e2e():
app.search("test query")
assert "Search is not supported in v1" in str(excinfo.value)
-def test_llm_extraction():
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
- response = app.scrape_url("https://mendable.ai", {
- 'extractorOptions': {
- 'mode': 'llm-extraction',
- 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
- 'extractionSchema': {
- 'type': 'object',
- 'properties': {
- 'company_mission': {'type': 'string'},
- 'supports_sso': {'type': 'boolean'},
- 'is_open_source': {'type': 'boolean'}
- },
- 'required': ['company_mission', 'supports_sso', 'is_open_source']
- }
- }
- })
- assert response is not None
- assert 'llm_extraction' in response
- llm_extraction = response['llm_extraction']
- assert 'company_mission' in llm_extraction
- assert isinstance(llm_extraction['supports_sso'], bool)
- assert isinstance(llm_extraction['is_open_source'], bool)
+# def test_llm_extraction():
+# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+# response = app.scrape_url("https://mendable.ai", {
+# 'extractorOptions': {
+# 'mode': 'llm-extraction',
+# 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+# 'extractionSchema': {
+# 'type': 'object',
+# 'properties': {
+# 'company_mission': {'type': 'string'},
+# 'supports_sso': {'type': 'boolean'},
+# 'is_open_source': {'type': 'boolean'}
+# },
+# 'required': ['company_mission', 'supports_sso', 'is_open_source']
+# }
+# }
+# })
+# assert response is not None
+# assert 'llm_extraction' in response
+# llm_extraction = response['llm_extraction']
+# assert 'company_mission' in llm_extraction
+# assert isinstance(llm_extraction['supports_sso'], bool)
+# assert isinstance(llm_extraction['is_open_source'], bool)
+
-def test_map_e2e():
- app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
- response = app.map_url('https://roastmywebsite.ai')
- assert response is not None
- assert isinstance(response, list)
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index f67afbdb..4f71cc78 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -244,8 +244,9 @@ class FirecrawlApp:
)
if response.status_code == 200:
response = response.json()
- if response['success'] and 'data' in response:
- return response['data']
+ print(response)
+ if response['success'] and 'links' in response:
+ return response['links']
else:
raise Exception(f'Failed to map URL. Error: {response["error"]}')
else:
@@ -387,18 +388,19 @@ class FirecrawlApp:
Raises:
Exception: An exception with a message containing the status code and error details from the response.
"""
- error_message = response.json().get('error', 'No additional error details provided.')
+ error_message = response.json().get('error', 'No error message provided.')
+ error_details = response.json().get('details', 'No additional error details provided.')
if response.status_code == 402:
- message = f"Payment Required: Failed to {action}. {error_message}"
+ message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
elif response.status_code == 408:
- message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+ message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
elif response.status_code == 409:
- message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+ message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
elif response.status_code == 500:
- message = f"Internal Server Error: Failed to {action}. {error_message}"
+ message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
else:
- message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+ message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
# Raise an HTTPError with the custom message and attach the response
raise requests.exceptions.HTTPError(message, response=response)