mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Update app.py
This commit is contained in:
parent
8a4ee4482d
commit
d113199a29
|
@ -1,13 +1,13 @@
|
||||||
import csv
|
import csv
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import uuid
|
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from firecrawl import FirecrawlApp
|
from firecrawl import FirecrawlApp
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from serpapi import GoogleSearch
|
from serpapi import GoogleSearch
|
||||||
from tqdm import tqdm
|
from swarm import Agent
|
||||||
|
from swarm.repl import run_demo_loop
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
@ -17,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
def search_google(query, objective):
|
def search_google(query, objective):
|
||||||
"""Search Google using SerpAPI."""
|
"""Search Google using SerpAPI."""
|
||||||
# print(f"Parameters: query={query}, objective={objective}")
|
print(f"Parameters: query={query}, objective={objective}")
|
||||||
search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
|
search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
|
||||||
results = search.get_dict().get("organic_results", [])
|
results = search.get_dict().get("organic_results", [])
|
||||||
return {"objective": objective, "results": results}
|
return {"objective": objective, "results": results}
|
||||||
|
|
||||||
def scrape_url(url, objective):
|
def scrape_url(url, objective):
|
||||||
"""Scrape a website using Firecrawl."""
|
"""Scrape a website using Firecrawl."""
|
||||||
# print(f"Parameters: url={url}, objective={objective}")
|
print(f"Parameters: url={url}, objective={objective}")
|
||||||
scrape_status = app.scrape_url(
|
scrape_status = app.scrape_url(
|
||||||
url,
|
url,
|
||||||
params={'formats': ['markdown']}
|
params={'formats': ['markdown']}
|
||||||
|
@ -33,29 +33,29 @@ def scrape_url(url, objective):
|
||||||
|
|
||||||
def crawl_url(url, objective):
|
def crawl_url(url, objective):
|
||||||
"""Crawl a website using Firecrawl."""
|
"""Crawl a website using Firecrawl."""
|
||||||
# print(f"Parameters: url={url}, objective={objective}")
|
print(f"Parameters: url={url}, objective={objective}")
|
||||||
# If using a crawled url set, pass the ID in the function call below
|
# If using a crawled url set, pass the ID in the function call below
|
||||||
# scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
|
# scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
|
||||||
# scrape_status['results'] = scrape_status['data']
|
# scrape_status['results'] = scrape_status['data']
|
||||||
scrape_status = app.crawl_url(
|
scrape_status = app.crawl_url(
|
||||||
url,
|
url,
|
||||||
params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}}
|
params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
|
||||||
)
|
)
|
||||||
return {"objective": objective, "results": scrape_status}
|
return {"objective": objective, "results": scrape_status}
|
||||||
|
|
||||||
def analyze_website_content(content, objective):
|
def analyze_website_content(content, objective):
|
||||||
"""Analyze the scraped website content using OpenAI."""
|
"""Analyze the scraped website content using OpenAI."""
|
||||||
# print(f"Parameters: content={content[:50]}..., objective={objective}")
|
print(f"Parameters: content={content[:50]}..., objective={objective}")
|
||||||
analysis = generate_completion(
|
analysis = generate_completion(
|
||||||
"website data extractor",
|
"website data extractor",
|
||||||
f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
|
f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
|
||||||
"Objective: " + objective + "\nContent: " + content
|
"Objective: " + objective + "\nContent: " + content
|
||||||
)
|
)
|
||||||
return {"objective": objective, "results": analysis}
|
return {"objective": objective, "results": json.loads(analysis)}
|
||||||
|
|
||||||
def generate_completion(role, task, content):
|
def generate_completion(role, task, content):
|
||||||
"""Generate a completion using OpenAI."""
|
"""Generate a completion using OpenAI."""
|
||||||
# print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
|
print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model="gpt-4o",
|
model="gpt-4o",
|
||||||
messages=[
|
messages=[
|
||||||
|
@ -76,31 +76,55 @@ def read_websites_from_csv(file_path):
|
||||||
|
|
||||||
def write_results_to_json(results, file_path):
|
def write_results_to_json(results, file_path):
|
||||||
"""Write results to a JSON file."""
|
"""Write results to a JSON file."""
|
||||||
with open(file_path, mode='w') as file:
|
with open(file_path, mode='w', encoding='utf-8') as file:
|
||||||
json.dump(results, file, indent=4)
|
json.dump(json.loads(results), file, ensure_ascii=False)
|
||||||
|
|
||||||
def process_websites(file_path):
|
def handoff_to_search_google():
|
||||||
"""Process websites from a CSV file and write results to a new JSON file."""
|
"""Hand off the search query to the search google agent."""
|
||||||
results = []
|
return google_search_agent
|
||||||
websites = read_websites_from_csv(file_path)
|
|
||||||
for website in websites:
|
def handoff_to_map_url():
|
||||||
search_results = search_google(website, "Search website")
|
"""Hand off the url to the map url agent."""
|
||||||
if search_results['results']:
|
return crawl_website_agent
|
||||||
top_result = search_results['results'][0]
|
|
||||||
url = top_result['link']
|
def handoff_to_analyst():
|
||||||
unique_filename = f'output_{uuid.uuid4()}.json'
|
"""Hand off the website content to the analyst agent."""
|
||||||
crawl_results = crawl_url(url, "Crawl website")
|
return analyst_agent
|
||||||
if crawl_results['results']:
|
|
||||||
for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
|
def handoff_to_writer():
|
||||||
analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.")
|
"""Hand off the results to the writer agent."""
|
||||||
try:
|
return writer_agent
|
||||||
result = json.loads(analysis_results['results'])
|
|
||||||
if result:
|
user_interface_agent = Agent(
|
||||||
results.append(result)
|
name="User Interface Agent",
|
||||||
write_results_to_json(results, unique_filename)
|
instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. Be concise.",
|
||||||
except:
|
functions=[read_websites_from_csv, handoff_to_search_google],
|
||||||
continue
|
)
|
||||||
|
|
||||||
|
google_search_agent = Agent(
|
||||||
|
name="Google Search Agent",
|
||||||
|
instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.",
|
||||||
|
functions=[search_google, handoff_to_map_url],
|
||||||
|
)
|
||||||
|
|
||||||
|
crawl_website_agent = Agent(
|
||||||
|
name="Crawl Website Agent",
|
||||||
|
instructions="You are a crawl url agent specialized in crawling the web pages. When you are done, you must hand off the results to the analyst agent.",
|
||||||
|
functions=[crawl_url, handoff_to_analyst],
|
||||||
|
)
|
||||||
|
|
||||||
|
analyst_agent = Agent(
|
||||||
|
name="Analyst Agent",
|
||||||
|
instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must hand off the results to the writer agent.",
|
||||||
|
functions=[analyze_website_content, handoff_to_writer],
|
||||||
|
)
|
||||||
|
|
||||||
|
writer_agent = Agent(
|
||||||
|
name="Writer Agent",
|
||||||
|
instructions="You are a writer agent that writes the final results to a JSON file.",
|
||||||
|
functions=[write_results_to_json],
|
||||||
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Process websites from the CSV file
|
# Run the demo loop with the user interface agent
|
||||||
process_websites('websites.csv')
|
run_demo_loop(user_interface_agent, stream=True)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user