mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
Add new example Web Scraping and Extraction with Firecrawl and Claude
This commit is contained in:
parent
ff08d7093e
commit
51d1a2e5f2
|
@ -0,0 +1,259 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Web Scraping and Extraction with Firecrawl and Claude\n",
|
||||
"\n",
|
||||
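"This notebook demonstrates how to use Firecrawl to scrape a web page and Claude to extract structured data from the scraped content. It reads the `ANTHROPIC_API_KEY` and `FIRECRAWL_API_KEY` environment variables (for example from a `.env` file), so set both before running the cells."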
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 1: Import Required Libraries and Load Environment Variables"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"from firecrawl import FirecrawlApp\n",
|
||||
"from anthropic import Anthropic\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"\n",
|
||||
"# Load environment variables\n",
|
||||
"load_dotenv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 2: Set Up API Keys and URL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"URL to scrape: https://mendable.ai\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Retrieve API keys from environment variables\n",
|
||||
"anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n",
|
||||
"firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
|
||||
"\n",
|
||||
"# Set the URL to scrape\n",
|
||||
"url = \"https://mendable.ai\" # Replace with the actual URL you want to scrape\n",
|
||||
"\n",
|
||||
"print(f\"URL to scrape: {url}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 3: Initialize Firecrawl and Anthropic Clients"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Firecrawl and Anthropic clients initialized.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Initialize FirecrawlApp and Anthropic client\n",
|
||||
"firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)\n",
|
||||
"anthropic_client = Anthropic(api_key=anthropic_api_key)\n",
|
||||
"\n",
|
||||
"print(\"Firecrawl and Anthropic clients initialized.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 4: Scrape the URL using Firecrawl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Page content scraped. Length: 16199 characters\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Scrape the URL using Firecrawl\n",
|
||||
"page_content = firecrawl_app.scrape_url(url, params={\"pageOptions\": {\"onlyMainContent\": True}})\n",
|
||||
"\n",
|
||||
"print(f\"Page content scraped. Length: {len(page_content['content'])} characters\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 5: Prepare the Prompt for Claude"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Prompt prepared for Claude.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Prepare the prompt for Claude\n",
|
||||
"prompt = f\"\"\"Analyze the following webpage content and extract the following information:\n",
|
||||
"1. The title of the page\n",
|
||||
"2. Whether the company is part of Y Combinator (YC)\n",
|
||||
"3. Whether the company/product is open source\n",
|
||||
"\n",
|
||||
"Return the information in JSON format with the following schema:\n",
|
||||
"{{\n",
|
||||
" \"main_header_title\": string,\n",
|
||||
" \"is_yc_company\": boolean,\n",
|
||||
" \"is_open_source\": boolean\n",
|
||||
"}}\n",
|
||||
"\n",
|
||||
"Webpage content:\n",
|
||||
"{page_content['content']}\n",
|
||||
"\n",
|
||||
"Return only the JSON, nothing else.\"\"\"\n",
|
||||
"\n",
|
||||
"print(\"Prompt prepared for Claude.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 6: Query Claude"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Claude response received.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Query Claude\n",
|
||||
"response = anthropic_client.messages.create(\n",
|
||||
" model=\"claude-3-opus-20240229\",\n",
|
||||
" max_tokens=1000,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": prompt}\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Claude response received.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 7: Parse and Display the Result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{\n",
|
||||
" \"title\": \"Just in time answers for Sales and Support\",\n",
|
||||
" \"is_yc_company\": true,\n",
|
||||
" \"is_open_source\": false\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Parse and print the result\n",
|
||||
"result = json.loads(response.content[0].text)\n",
|
||||
"print(json.dumps(result, indent=2))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
Loading…
Reference in New Issue
Block a user