Gemini caching example
This commit is contained in:
parent 45f2765601
commit d80046d17c
@@ -0,0 +1,166 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import datetime\n",
    "import time\n",
    "import google.generativeai as genai\n",
    "from google.generativeai import caching\n",
    "from dotenv import load_dotenv\n",
    "from firecrawl import FirecrawlApp\n",
    "import json\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()\n",
    "\n",
    "# Retrieve API keys from environment variables\n",
    "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "\n",
    "# Configure the Google Generative AI module with the API key\n",
    "genai.configure(api_key=google_api_key)\n",
    "\n",
    "# Initialize the FirecrawlApp with your API key\n",
    "app = FirecrawlApp(api_key=firecrawl_api_key)\n"
   ]
  },
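  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (not part of the original example), the sketch below fails fast if either key did not resolve; it assumes the same environment variable names used above.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical sanity check: fail fast if an API key is missing\n",
    "missing = [name for name in (\"GOOGLE_API_KEY\", \"FIRECRAWL_API_KEY\") if not os.getenv(name)]\n",
    "if missing:\n",
    "    raise RuntimeError(f\"Missing environment variables: {', '.join(missing)}\")\n"
   ]
  },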
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No data returned from crawl.\n"
     ]
    }
   ],
   "source": [
    "# Crawl a website\n",
    "crawl_url = 'https://dify.ai/'\n",
    "params = {\n",
    "    'crawlOptions': {\n",
    "        'limit': 100\n",
    "    }\n",
    "}\n",
    "crawl_result = app.crawl_url(crawl_url, params=params)\n",
    "\n",
    "if crawl_result is not None:\n",
    "    # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
    "    cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
    "\n",
    "    # Save the modified results as a text file containing JSON data\n",
    "    with open('crawl_result.txt', 'w') as file:\n",
    "        file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
    "else:\n",
    "    print(\"No data returned from crawl.\")\n"
   ]
  },
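  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The recorded run above returned no data. Below is a minimal retry sketch around the same `crawl_url` call, assuming nothing beyond the SDK usage already shown; the retry count and delay are arbitrary choices, not Firecrawl recommendations.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical retry wrapper around the crawl call used above\n",
    "crawl_result = None\n",
    "for attempt in range(3):  # arbitrary retry budget\n",
    "    crawl_result = app.crawl_url(crawl_url, params=params)\n",
    "    if crawl_result:\n",
    "        break\n",
    "    print(f\"Attempt {attempt + 1} returned no data; retrying...\")\n",
    "    time.sleep(5)  # arbitrary backoff\n"
   ]
  },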
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the crawl results file using the Files API\n",
    "text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
    "\n",
    "# Wait for the file to finish processing\n",
    "while text_file.state.name == \"PROCESSING\":\n",
    "    print('Waiting for file to be processed.')\n",
    "    time.sleep(2)\n",
    "    text_file = genai.get_file(text_file.name)\n"
   ]
  },
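  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The loop above only waits for the `PROCESSING` state to clear. The sketch below also surfaces a failed upload, assuming the Files API reports a `FAILED` state as in Google's documentation; verify against your installed `google-generativeai` version.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical guard: stop before caching a file that failed processing\n",
    "if text_file.state.name == \"FAILED\":\n",
    "    raise ValueError(f\"File {text_file.name} failed processing\")\n",
    "print(f\"File {text_file.name} is {text_file.state.name}\")\n"
   ]
  },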
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a cache with a 15 minute TTL\n",
    "cache = caching.CachedContent.create(\n",
    "    model=\"models/gemini-1.5-pro-001\",\n",
    "    display_name=\"website crawl testing again\",  # used to identify the cache\n",
    "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
    "    contents=[text_file],\n",
    "    ttl=datetime.timedelta(minutes=15),\n",
    ")\n",
    "\n",
    "# Construct a GenerativeModel which uses the created cache.\n",
    "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n"
   ]
  },
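  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cached content is billed for as long as it lives, so it can be worth inspecting and cleaning up caches. A sketch using `CachedContent.list()` and `delete()` from the same `caching` module imported above; verify these names against your installed `google-generativeai` version.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List existing caches, then optionally delete the one created above\n",
    "for c in caching.CachedContent.list():\n",
    "    print(c.display_name, c.expire_time)\n",
    "\n",
    "# cache.delete()  # uncomment to remove the cache before its TTL expires\n"
   ]
  },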
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
      "\n",
      "Here's how Firecrawl helps:\n",
      "\n",
      "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
      "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
      "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
      "\n",
      "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
      "\n",
      "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query the model\n",
    "response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
    "response_dict = response.to_dict()\n",
    "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
    "print(response_text)\n"
   ]
  },
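  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To confirm the cache is actually being hit, the response's usage metadata reports how many prompt tokens were served from the cache (field names per the `google-generativeai` SDK; verify against your installed version).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect token accounting; a nonzero cached count indicates a cache hit\n",
    "usage = response.usage_metadata\n",
    "print(f\"Prompt tokens: {usage.prompt_token_count}\")\n",
    "print(f\"Cached tokens: {usage.cached_content_token_count}\")\n"
   ]
  }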
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}