Add new example Web Scraping and Extraction with Firecrawl and Claude

This commit is contained in:
Eric Ciarla 2024-08-28 09:35:43 -04:00
parent ff08d7093e
commit 51d1a2e5f2

View File

@ -0,0 +1,259 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Web Scraping and Extraction with Firecrawl and Claude\n",
"\n",
"This notebook demonstrates how to use Firecrawl to scrape web content and Claude to extract structured data from it."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 1: Import Required Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import json\n",
"from firecrawl import FirecrawlApp\n",
"from anthropic import Anthropic\n",
"from dotenv import load_dotenv\n",
"\n",
"# Load environment variables\n",
"load_dotenv()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 2: Set Up API Keys and URL"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"URL to scrape: https://mendable.ai\n"
]
}
],
"source": [
"# Retrieve API keys from environment variables\n",
"anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n",
"firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
"\n",
"# Set the URL to scrape\n",
"url = \"https://mendable.ai\" # Replace with the actual URL you want to scrape\n",
"\n",
"print(f\"URL to scrape: {url}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 3: Initialize Firecrawl and Anthropic Clients"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Firecrawl and Anthropic clients initialized.\n"
]
}
],
"source": [
"# Initialize FirecrawlApp and Anthropic client\n",
"firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)\n",
"anthropic_client = Anthropic(api_key=anthropic_api_key)\n",
"\n",
"print(\"Firecrawl and Anthropic clients initialized.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 4: Scrape the URL using Firecrawl"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Page content scraped. Length: 16199 characters\n"
]
}
],
"source": [
"# Scrape the URL using Firecrawl\n",
"page_content = firecrawl_app.scrape_url(url, params={\"pageOptions\": {\"onlyMainContent\": True}})\n",
"\n",
"print(f\"Page content scraped. Length: {len(page_content['content'])} characters\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 5: Prepare the Prompt for Claude"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prompt prepared for Claude.\n"
]
}
],
"source": [
"# Prepare the prompt for Claude\n",
"prompt = f\"\"\"Analyze the following webpage content and extract the following information:\n",
"1. The title of the page\n",
"2. Whether the company is part of Y Combinator (YC)\n",
"3. Whether the company/product is open source\n",
"\n",
"Return the information in JSON format with the following schema:\n",
"{{\n",
" \"main_header_title\": string,\n",
" \"is_yc_company\": boolean,\n",
" \"is_open_source\": boolean\n",
"}}\n",
"\n",
"Webpage content:\n",
"{page_content['content']}\n",
"\n",
"Return only the JSON, nothing else.\"\"\"\n",
"\n",
"print(\"Prompt prepared for Claude.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 6: Query Claude"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Claude response received.\n"
]
}
],
"source": [
"# Query Claude\n",
"response = anthropic_client.messages.create(\n",
" model=\"claude-3-opus-20240229\",\n",
" max_tokens=1000,\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": prompt}\n",
" ]\n",
")\n",
"\n",
"print(\"Claude response received.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 7: Parse and Display the Result"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"title\": \"Just in time answers for Sales and Support\",\n",
" \"is_yc_company\": true,\n",
" \"is_open_source\": false\n",
"}\n"
]
}
],
"source": [
"# Parse and print the result\n",
"result = json.loads(response.content[0].text)\n",
"print(json.dumps(result, indent=2))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}