Update app.py

Author: Rishi Raj Jain, 2024-10-20 18:08:38 +05:30 (committed by GitHub)
Parent: 8a4ee4482d
Commit: d113199a29


@@ -1,13 +1,13 @@
 import csv
 import json
 import os
-import uuid
 
 from dotenv import load_dotenv
 from firecrawl import FirecrawlApp
 from openai import OpenAI
 from serpapi import GoogleSearch
-from tqdm import tqdm
+from swarm import Agent
+from swarm.repl import run_demo_loop
 
 load_dotenv()
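
The import block is where the rewrite shows up first: the linear-pipeline helpers (uuid for output filenames, tqdm for progress bars) give way to OpenAI's experimental Swarm library, whose Agent class and run_demo_loop REPL drive everything below. As orientation, here is a minimal, self-contained sketch of the Swarm handoff pattern this commit adopts; the two agents and their instructions are placeholders, not part of the commit.

# Minimal Swarm handoff sketch (placeholder agents, not from the commit).
# An Agent bundles a name, instructions, and plain Python functions the model
# may call; a function that returns another Agent acts as a handoff.
from swarm import Agent
from swarm.repl import run_demo_loop

def transfer_to_responder():
    """Hand the conversation to the responder agent."""
    return responder_agent

greeter_agent = Agent(
    name="Greeter",
    instructions="Greet the user, then hand off to the responder.",
    functions=[transfer_to_responder],
)

responder_agent = Agent(
    name="Responder",
    instructions="Answer the user's question concisely.",
)

run_demo_loop(greeter_agent, stream=True)  # interactive REPL, same entry point app.py now uses
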
@@ -17,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 def search_google(query, objective):
     """Search Google using SerpAPI."""
-    # print(f"Parameters: query={query}, objective={objective}")
+    print(f"Parameters: query={query}, objective={objective}")
     search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
     results = search.get_dict().get("organic_results", [])
     return {"objective": objective, "results": results}
 
 def scrape_url(url, objective):
     """Scrape a website using Firecrawl."""
-    # print(f"Parameters: url={url}, objective={objective}")
+    print(f"Parameters: url={url}, objective={objective}")
     scrape_status = app.scrape_url(
         url,
         params={'formats': ['markdown']}
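
This hunk only un-comments the parameter logging in search_google and scrape_url; both helpers keep relying on module-level clients defined outside the hunk (the hunk header shows the OpenAI client). A hedged sketch of that surrounding setup follows; the FIRECRAWL_API_KEY variable name is an assumption, since the Firecrawl initialization line is not visible in this diff.

# Assumed module-level setup around these helpers (not shown in the diff);
# the FIRECRAWL_API_KEY env var name is a guess.
import os
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI

load_dotenv()
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
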
@@ -33,29 +33,29 @@ def scrape_url(url, objective):
 def crawl_url(url, objective):
     """Crawl a website using Firecrawl."""
-    # print(f"Parameters: url={url}, objective={objective}")
+    print(f"Parameters: url={url}, objective={objective}")
     # If using a crawled url set, pass the ID in the function call below
     # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
     # scrape_status['results'] = scrape_status['data']
     scrape_status = app.crawl_url(
         url,
-        params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}}
+        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
     )
     return {"objective": objective, "results": scrape_status}
 
 def analyze_website_content(content, objective):
     """Analyze the scraped website content using OpenAI."""
-    # print(f"Parameters: content={content[:50]}..., objective={objective}")
+    print(f"Parameters: content={content[:50]}..., objective={objective}")
     analysis = generate_completion(
         "website data extractor",
         f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
         "Objective: " + objective + "\nContent: " + content
     )
-    return {"objective": objective, "results": analysis}
+    return {"objective": objective, "results": json.loads(analysis)}
 
 def generate_completion(role, task, content):
     """Generate a completion using OpenAI."""
-    # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=[
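
Besides the print statements and the crawl limit (5 to 10), the substantive change in this hunk is that analyze_website_content now returns json.loads(analysis) instead of the raw string, so it raises immediately if the model's reply is not valid JSON (for example, if it ignores the instruction and wraps the object in ``` fences). Not part of the commit, but one defensive parse that could be swapped in, as a sketch:

# Sketch only, not in the commit: a more forgiving parse for the model's reply.
import json

def parse_model_json(text):
    """Strip optional ```json fences before parsing; raises on invalid JSON."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:]
    return json.loads(cleaned)
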
@@ -76,31 +76,55 @@ def read_websites_from_csv(file_path):
 def write_results_to_json(results, file_path):
     """Write results to a JSON file."""
-    with open(file_path, mode='w') as file:
-        json.dump(results, file, indent=4)
+    with open(file_path, mode='w', encoding='utf-8') as file:
+        json.dump(json.loads(results), file, ensure_ascii=False)
 
-def process_websites(file_path):
-    """Process websites from a CSV file and write results to a new JSON file."""
-    results = []
-    websites = read_websites_from_csv(file_path)
-    for website in websites:
-        search_results = search_google(website, "Search website")
-        if search_results['results']:
-            top_result = search_results['results'][0]
-            url = top_result['link']
-            unique_filename = f'output_{uuid.uuid4()}.json'
-            crawl_results = crawl_url(url, "Crawl website")
-            if crawl_results['results']:
-                for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
-                    analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.")
-                    try:
-                        result = json.loads(analysis_results['results'])
-                        if result:
-                            results.append(result)
-                            write_results_to_json(results, unique_filename)
-                    except:
-                        continue
+def handoff_to_search_google():
+    """Hand off the search query to the search google agent."""
+    return google_search_agent
+
+def handoff_to_map_url():
+    """Hand off the url to the map url agent."""
+    return crawl_website_agent
+
+def handoff_to_analyst():
+    """Hand off the website content to the analyst agent."""
+    return analyst_agent
+
+def handoff_to_writer():
+    """Hand off the results to the writer agent."""
+    return writer_agent
+
+user_interface_agent = Agent(
+    name="User Interface Agent",
+    instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. Be concise.",
+    functions=[read_websites_from_csv, handoff_to_search_google],
+)
+
+google_search_agent = Agent(
+    name="Google Search Agent",
+    instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.",
+    functions=[search_google, handoff_to_map_url],
+)
+
+crawl_website_agent = Agent(
+    name="Crawl Website Agent",
+    instructions="You are a crawl url agent specialized in crawling the web pages. When you are done, you must hand off the results to the analyst agent.",
+    functions=[crawl_url, handoff_to_analyst],
+)
+
+analyst_agent = Agent(
+    name="Analyst Agent",
+    instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must hand off the results to the writer agent.",
+    functions=[analyze_website_content, handoff_to_writer],
+)
+
+writer_agent = Agent(
+    name="Writer Agent",
+    instructions="You are a writer agent that writes the final results to a JSON file.",
+    functions=[write_results_to_json],
+)
 
 if __name__ == "__main__":
-    # Process websites from the CSV file
-    process_websites('websites.csv')
+    # Run the demo loop with the user interface agent
+    run_demo_loop(user_interface_agent, stream=True)
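
With this commit the script becomes interactive: the fixed process_websites pipeline is gone, run_demo_loop starts a streaming REPL rooted at user_interface_agent, and the handoff functions route the conversation through search, crawl, analysis, and writing. For a non-interactive run, a hedged sketch of driving the same starting agent with Swarm's programmatic client; the prompt text is illustrative and assumes the agents are importable from app.py.

# Hypothetical one-shot driver (not in the commit); assumes app.py defines
# user_interface_agent at module level behind the if __name__ == "__main__" guard.
from swarm import Swarm

from app import user_interface_agent

swarm_client = Swarm()
response = swarm_client.run(
    agent=user_interface_agent,
    messages=[{"role": "user", "content": "Read websites.csv and extract contact details for each site."}],
)
print(response.messages[-1]["content"])  # final assistant message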