Gemini caching example
This commit is contained in:
parent 45f2765601
commit d80046d17c
@@ -0,0 +1,166 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import datetime\n",
    "import time\n",
    "import google.generativeai as genai\n",
    "from google.generativeai import caching\n",
    "from dotenv import load_dotenv\n",
    "from firecrawl import FirecrawlApp\n",
    "import json\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()\n",
    "\n",
    "# Retrieve API keys from environment variables\n",
    "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "\n",
    "# Configure the Google Generative AI module with the API key\n",
    "genai.configure(api_key=google_api_key)\n",
    "\n",
    "# Initialize the FirecrawlApp with your API key\n",
    "app = FirecrawlApp(api_key=firecrawl_api_key)\n"
   ]
  },
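  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (not part of the original example), the sketch below fails fast if either key did not resolve; it assumes the same environment variable names used above.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical sanity check: fail fast if an API key is missing\n",
    "missing = [name for name in (\"GOOGLE_API_KEY\", \"FIRECRAWL_API_KEY\") if not os.getenv(name)]\n",
    "if missing:\n",
    "    raise RuntimeError(f\"Missing environment variables: {', '.join(missing)}\")\n"
   ]
  },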
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No data returned from crawl.\n"
     ]
    }
   ],
   "source": [
    "# Crawl a website\n",
    "crawl_url = 'https://dify.ai/'\n",
    "params = {\n",
    "    'crawlOptions': {\n",
    "        'limit': 100\n",
    "    }\n",
    "}\n",
    "crawl_result = app.crawl_url(crawl_url, params=params)\n",
    "\n",
    "if crawl_result is not None:\n",
    "    # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
    "    cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
    "\n",
    "    # Save the modified results as a text file containing JSON data\n",
    "    with open('crawl_result.txt', 'w') as file:\n",
    "        file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
    "else:\n",
    "    print(\"No data returned from crawl.\")\n"
   ]
  },
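  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The recorded run above returned no data. Below is a minimal retry sketch around the same `crawl_url` call, assuming nothing beyond the SDK usage already shown; the retry count and delay are arbitrary choices, not Firecrawl recommendations.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical retry wrapper around the crawl call used above\n",
    "crawl_result = None\n",
    "for attempt in range(3):  # arbitrary retry budget\n",
    "    crawl_result = app.crawl_url(crawl_url, params=params)\n",
    "    if crawl_result:\n",
    "        break\n",
    "    print(f\"Attempt {attempt + 1} returned no data; retrying...\")\n",
    "    time.sleep(5)  # arbitrary backoff\n"
   ]
  },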
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the crawl results file using the Files API\n",
    "text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
    "\n",
    "# Wait for the file to finish processing\n",
    "while text_file.state.name == \"PROCESSING\":\n",
    "    print('Waiting for file to be processed.')\n",
    "    time.sleep(2)\n",
    "    text_file = genai.get_file(text_file.name)\n"
   ]
  },
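  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The loop above only waits for the `PROCESSING` state to clear. The sketch below also surfaces a failed upload, assuming the Files API reports a `FAILED` state as in Google's documentation; verify against your installed `google-generativeai` version.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical guard: stop before caching a file that failed processing\n",
    "if text_file.state.name == \"FAILED\":\n",
    "    raise ValueError(f\"File {text_file.name} failed processing\")\n",
    "print(f\"File {text_file.name} is {text_file.state.name}\")\n"
   ]
  },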
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a cache with a 15 minute TTL\n",
    "cache = caching.CachedContent.create(\n",
    "    model=\"models/gemini-1.5-pro-001\",\n",
    "    display_name=\"website crawl testing again\",  # used to identify the cache\n",
    "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
    "    contents=[text_file],\n",
    "    ttl=datetime.timedelta(minutes=15),\n",
    ")\n",
    "\n",
    "# Construct a GenerativeModel which uses the created cache.\n",
    "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n"
   ]
  },
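  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cached content is billed for as long as it lives, so it can be worth inspecting and cleaning up caches. A sketch using `CachedContent.list()` and `delete()` from the same `caching` module imported above; verify these names against your installed `google-generativeai` version.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List existing caches, then optionally delete the one created above\n",
    "for c in caching.CachedContent.list():\n",
    "    print(c.display_name, c.expire_time)\n",
    "\n",
    "# cache.delete()  # uncomment to remove the cache before its TTL expires\n"
   ]
  },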
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
      "\n",
      "Here's how Firecrawl helps:\n",
      "\n",
      "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
      "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
      "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
      "\n",
      "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
      "\n",
      "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query the model\n",
    "response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
    "response_dict = response.to_dict()\n",
    "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
    "print(response_text)\n"
   ]
  },
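  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To confirm the cache is actually being hit, the response's usage metadata reports how many prompt tokens were served from the cache (field names per the `google-generativeai` SDK; verify against your installed version).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect token accounting; a nonzero cached count indicates a cache hit\n",
    "usage = response.usage_metadata\n",
    "print(f\"Prompt tokens: {usage.prompt_token_count}\")\n",
    "print(f\"Cached tokens: {usage.cached_content_token_count}\")\n"
   ]
  }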
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}