mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Add new example Web Scraping and Extraction with Firecrawl and Claude
This commit is contained in:
parent
ff08d7093e
commit
51d1a2e5f2
|
@ -0,0 +1,259 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Web Scraping and Extraction with Firecrawl and Claude\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook demonstrates how to use Firecrawl to scrape a web page and Claude to extract structured JSON data from the scraped content."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Step 1: Import Required Libraries"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import json\n",
|
||||||
|
"from firecrawl import FirecrawlApp\n",
|
||||||
|
"from anthropic import Anthropic\n",
|
||||||
|
"from dotenv import load_dotenv\n",
|
||||||
|
"\n",
|
||||||
|
"# Load environment variables\n",
|
||||||
|
"load_dotenv()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Step 2: Set Up API Keys and URL"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"URL to scrape: https://mendable.ai\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Retrieve API keys from environment variables\n",
|
||||||
|
"anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n",
|
||||||
|
"firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Set the URL to scrape\n",
|
||||||
|
"url = \"https://mendable.ai\" # Replace with the actual URL you want to scrape\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"URL to scrape: {url}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Step 3: Initialize Firecrawl and Anthropic Clients"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Firecrawl and Anthropic clients initialized.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Initialize FirecrawlApp and Anthropic client\n",
|
||||||
|
"firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)\n",
|
||||||
|
"anthropic_client = Anthropic(api_key=anthropic_api_key)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Firecrawl and Anthropic clients initialized.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Step 4: Scrape the URL using Firecrawl"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Page content scraped. Length: 16199 characters\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Scrape the URL with Firecrawl, passing the onlyMainContent page option\n",
|
||||||
|
"page_content = firecrawl_app.scrape_url(url, params={\"pageOptions\": {\"onlyMainContent\": True}})\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Page content scraped. Length: {len(page_content['content'])} characters\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Step 5: Prepare the Prompt for Claude"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Prompt prepared for Claude.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Build the f-string prompt for Claude; doubled braces ({{ }}) produce literal braces around the JSON schema\n",
|
||||||
|
"prompt = f\"\"\"Analyze the following webpage content and extract the following information:\n",
|
||||||
|
"1. The title of the page\n",
|
||||||
|
"2. Whether the company is part of Y Combinator (YC)\n",
|
||||||
|
"3. Whether the company/product is open source\n",
|
||||||
|
"\n",
|
||||||
|
"Return the information in JSON format with the following schema:\n",
|
||||||
|
"{{\n",
|
||||||
|
" \"main_header_title\": string,\n",
|
||||||
|
" \"is_yc_company\": boolean,\n",
|
||||||
|
" \"is_open_source\": boolean\n",
|
||||||
|
"}}\n",
|
||||||
|
"\n",
|
||||||
|
"Webpage content:\n",
|
||||||
|
"{page_content['content']}\n",
|
||||||
|
"\n",
|
||||||
|
"Return only the JSON, nothing else.\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Prompt prepared for Claude.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Step 6: Query Claude"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Claude response received.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Query Claude\n",
|
||||||
|
"response = anthropic_client.messages.create(\n",
|
||||||
|
" model=\"claude-3-opus-20240229\",\n",
|
||||||
|
" max_tokens=1000,\n",
|
||||||
|
" messages=[\n",
|
||||||
|
" {\"role\": \"user\", \"content\": prompt}\n",
|
||||||
|
" ]\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Claude response received.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Step 7: Parse and Display the Result"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{\n",
|
||||||
|
" \"title\": \"Just in time answers for Sales and Support\",\n",
|
||||||
|
" \"is_yc_company\": true,\n",
|
||||||
|
" \"is_open_source\": false\n",
|
||||||
|
"}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Parse Claude's reply as JSON and pretty-print it; json.loads raises if the reply is not valid JSON\n",
|
||||||
|
"result = json.loads(response.content[0].text)\n",
|
||||||
|
"print(json.dumps(result, indent=2))"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user