This commit is contained in:
Eric Ciarla 2024-10-21 12:01:09 -04:00
parent cf98d69bbb
commit 22d375ad29
3 changed files with 32 additions and 716 deletions

View File

@ -15,43 +15,35 @@ load_dotenv()
# Firecrawl client used by the scrape/crawl helpers; its API key is read from
# the FIRECRAWL_API_KEY environment variable (os.getenv yields None if unset).
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
# OpenAI client; its API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def search_google(query, objective):
    """Search Google using SerpAPI."""
    # NOTE(review): docstring kept verbatim -- this function is registered as
    # an agent tool and the framework presumably shows the docstring to the
    # model; confirm before rewording it.
    #
    # Returns a dict with the caller's objective under "objective" and the
    # response's "organic_results" list (empty when the key is absent) under
    # "results".
    print(f"Parameters: query={query}, objective={objective}")
    request = {"q": query, "api_key": os.getenv("SERP_API_KEY")}
    response = GoogleSearch(request).get_dict()
    organic = response.get("organic_results", [])
    return {"objective": objective, "results": organic}
def scrape_url(url, objective):
"""Scrape a website using Firecrawl."""
def crawl_and_analyze_url(url, objective):
"""Crawl a website using Firecrawl and analyze the content."""
print(f"Parameters: url={url}, objective={objective}")
scrape_status = app.scrape_url(
# Crawl the website
crawl_status = app.crawl_url(
url,
params={'formats': ['markdown']}
params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}},
poll_interval=5
)
return {"objective": objective, "results": scrape_status}
def crawl_url(url, objective):
    """Crawl a website using Firecrawl."""
    # NOTE(review): docstring kept verbatim -- presumably surfaced to the model
    # as the tool description; confirm before rewording it.
    #
    # Passes a limit of 10 and markdown scrapeOptions to Firecrawl and returns
    # the raw crawl response under "results" next to the caller's objective.
    print(f"Parameters: url={url}, objective={objective}")
    # To reuse an already-crawled URL set instead of starting a new crawl,
    # fetch it by ID:
    #   crawl_result = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
    #   crawl_result['results'] = crawl_result['data']
    crawl_params = {'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
    crawl_result = app.crawl_url(url, params=crawl_params)
    return {"objective": objective, "results": crawl_result}
def analyze_website_content(content, objective):
"""Analyze the scraped website content using OpenAI."""
print(f"Parameters: content={content[:50]}..., objective={objective}")
crawl_status = crawl_status['data']
# Process each 'markdown' element individually
combined_results = []
for item in crawl_status:
if 'markdown' in item:
content = item['markdown']
# Analyze the content
analysis = generate_completion(
"website data extractor",
f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
"Objective: " + objective + "\nContent: " + content
)
return {"objective": objective, "results": json.loads(analysis)}
# Parse the JSON result
try:
result = json.loads(analysis)
combined_results.append(result)
except json.JSONDecodeError:
print(f"Could not parse JSON from analysis: {analysis}")
# Combine the results
return {"objective": objective, "results": combined_results}
def generate_completion(role, task, content):
"""Generate a completion using OpenAI."""
@ -65,64 +57,20 @@ def generate_completion(role, task, content):
)
return response.choices[0].message.content
def read_websites_from_csv(file_path):
    """Read websites from a CSV file."""
    # NOTE(review): docstring kept verbatim -- presumably surfaced to the model
    # as the tool description; confirm before rewording it.
    #
    # file_path: path to a CSV file whose header row has a "website" column.
    # Returns:   list of the "website" value of every data row, in file order
    #            (empty list when there are no data rows).
    # Raises:    KeyError when a data row exists but there is no "website"
    #            column; OSError when the file cannot be opened.
    #
    # newline='' hands newline handling to the csv module, which it needs to
    # parse quoted fields containing line breaks correctly. The explicit
    # UTF-8 encoding matches write_results_to_json instead of depending on
    # the platform's locale default.
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        return [row['website'] for row in csv.DictReader(file)]
def write_results_to_json(results, file_path):
    """Write results to a JSON file."""
    # NOTE(review): docstring kept verbatim -- presumably surfaced to the model
    # as the tool description; confirm before rewording it.
    #
    # results:   a JSON document as str/bytes/bytearray, or an already-decoded
    #            JSON-serializable object (list, dict, ...).
    # file_path: destination path; written as UTF-8 and overwritten if present.
    # Raises:    json.JSONDecodeError (a ValueError) when textual results are
    #            not valid JSON; TypeError when an object is not serializable.
    if isinstance(results, (str, bytes, bytearray)):
        payload = json.loads(results)
    else:
        payload = results
    # Serialize completely before opening the file: opening in 'w' mode
    # truncates it, so a parse or encode failure must happen first or an
    # existing results file would be wiped.
    text = json.dumps(payload, ensure_ascii=False)
    with open(file_path, mode='w', encoding='utf-8') as file:
        file.write(text)
def handoff_to_search_google():
    """Hand off the search query to the search google agent."""
    # Returns the module-level google_search_agent object.
    # NOTE(review): presumably the agent runner treats a returned Agent as a
    # handoff -- confirm against the framework in use.
    return google_search_agent
def handoff_to_map_url():
"""Hand off the url to the map url agent."""
def handoff_to_crawl_url():
    """Hand off the url to the crawl url agent."""
    # Returns the module-level crawl_website_agent object.
    # NOTE(review): presumably the agent runner treats a returned Agent as a
    # handoff -- confirm against the framework in use.
    return crawl_website_agent
def handoff_to_analyst():
    """Hand off the website content to the analyst agent."""
    # Returns the module-level analyst_agent object.
    # NOTE(review): presumably the agent runner treats a returned Agent as a
    # handoff -- confirm against the framework in use.
    return analyst_agent
def handoff_to_writer():
    """Hand off the results to the writer agent."""
    # Returns the module-level writer_agent object.
    # NOTE(review): presumably the agent runner treats a returned Agent as a
    # handoff -- confirm against the framework in use.
    return writer_agent
user_interface_agent = Agent(
name="User Interface Agent",
instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. Be concise.",
functions=[read_websites_from_csv, handoff_to_search_google],
)
google_search_agent = Agent(
name="Google Search Agent",
instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.",
functions=[search_google, handoff_to_map_url],
instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.",
functions=[handoff_to_crawl_url],
)
# Agent that runs the crawl tool and can then hand off to the analyst agent.
crawl_website_agent = Agent(
    name="Crawl Website Agent",
    instructions=(
        "You are a crawl url agent specialized in crawling the web pages. "
        "When you are done, you must hand off the results to the analyst agent."
    ),
    functions=[crawl_url, handoff_to_analyst],
)

# Agent that runs the content-analysis tool and can then hand off to the
# writer agent.
analyst_agent = Agent(
    name="Analyst Agent",
    instructions=(
        "You are an analyst agent that examines website content and returns a JSON object. "
        "When you are done, you must hand off the results to the writer agent."
    ),
    functions=[analyze_website_content, handoff_to_writer],
)
writer_agent = Agent(
name="Writer Agent",
instructions="You are a writer agent that writes the final results to a JSON file.",
functions=[write_results_to_json],
instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.",
functions=[crawl_and_analyze_url],
)
if __name__ == "__main__":

View File

@ -1,630 +0,0 @@
[
{
"contacts": [
{
"name": "Canan Dagdeviren",
"email": null,
"title": null,
"company": null
},
{
"name": "Media Lab Communications",
"email": "press@media.mit.edu",
"title": null,
"company": "MIT Media Lab"
}
]
},
{
"people": [
{
"name": "Xan Foote",
"title": "Group Contact",
"email": "fluidadmin@media.mit.edu"
}
],
"companies": [
{
"name": "MIT Media Lab",
"title": "Fluid Interfaces"
}
]
},
{
"emails": [],
"people": [
{
"name": "Personal Robots",
"title": "Group",
"company": "MIT Media Lab"
}
],
"companies": [
{
"name": "MIT Media Lab",
"title": "Personal Robots group"
}
]
},
{
"people": [
{
"name": "David Sweeney",
"title": "Author"
},
{
"name": "Rosalind W. Picard",
"title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology"
},
{
"name": "Pattie Maes",
"title": "Professor of Media Technology; Germeshausen Professor"
},
{
"name": "Hugh Herr",
"title": "Professor of Media Arts and Sciences"
},
{
"name": "Deblina Sarkar",
"title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor"
},
{
"name": "Canan Dagdeviren",
"title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences"
},
{
"name": "Dava Newman",
"title": "Director; Apollo Professor of Astronautics"
},
{
"name": "Cynthia Breazeal",
"title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning"
},
{
"name": "Susan Blumenthal, MD",
"title": "Visiting Professor; Director's Circle Member"
}
],
"emails": [],
"companies": []
},
{
"people": [
{
"name": "Dan Blondell",
"title": "I2"
}
],
"companies": [],
"emails": []
},
{
"people": [
{
"name": "Canan Dagdeviren",
"title": "Copyright Holder"
},
{
"name": "Jonathan Williams",
"title": "Copyright Holder"
},
{
"name": "Sara V. Fernandez",
"title": "Courtesy of"
},
{
"name": "Irmandy Wicaksono",
"title": "Courtesy of"
}
],
"companies": [
{
"name": "MIT Media Lab",
"title": "Interdisciplinary Research Institution"
}
],
"emails": []
},
{
"people": [
{
"name": "David Sweeney",
"title": null
},
{
"name": "Sarah Beckmann",
"title": null
},
{
"name": "Behnaz Farahi",
"title": "Assistant Professor, Transformative Design"
},
{
"name": "Paul Liang",
"title": "Assistant Professor, AI + Human Experience"
},
{
"name": "Rosalind W. Picard",
"title": null
},
{
"name": "Guillermo Herrera-Arcos",
"title": null
},
{
"name": "Christine Higgins",
"title": null
},
{
"name": "Patrick Chwalek",
"title": null
},
{
"name": "Sarra Shubart",
"title": null
},
{
"name": "Amanda Diehl",
"title": null
},
{
"name": "Chia Evers",
"title": null
},
{
"name": "Matthew Groh",
"title": null
},
{
"name": "Cl\u00e9mence Taillandier",
"title": null
},
{
"name": "Cody Paige",
"title": null
},
{
"name": "Minoo Rathnasabapathy",
"title": null
},
{
"name": "Alex Berke",
"title": null
}
],
"emails": [
"web-admin@media.mit.edu"
],
"companies": [
{
"name": "MIT Media Lab"
},
{
"name": "Samsung"
},
{
"name": "Castrol"
}
]
},
{
"people": [
{
"name": "Tod Machover",
"title": "Opera Composer"
}
],
"companies": [
{
"name": "Future Worlds",
"title": "Design and action for the future we want to live in"
},
{
"name": "NOAA",
"title": "The Challenge: To secure a sustainable future for all living things"
},
{
"name": "MIT Media Lab",
"title": "Research and development in interdisciplinary expertise"
}
]
},
{
"emails": [
"r-admin@media.mit.edu"
],
"people": [
{
"name": "Affective Computing group",
"title": "MIT Media Lab"
}
],
"companies": []
},
{
"people": [
{
"name": "David Sweeney",
"email": null,
"title": "Author at Samsung Newsroom"
},
{
"name": "Pattie Maes",
"email": null,
"title": "Professor of Media Technology; Germeshausen Professor"
},
{
"name": "Rosalind W. Picard",
"email": null,
"title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology"
}
],
"companies": [
{
"name": "Samsung",
"email": null,
"title": "Collaborator"
},
{
"name": "MIT Media Lab",
"email": null,
"title": "Collaborator"
}
]
},
{
"people": [
{
"name": "Canan Dagdeviren",
"title": null
},
{
"name": "Jonathan Williams",
"title": null
},
{
"name": "Sara V. Fernandez",
"title": null
},
{
"name": "Irmandy Wicaksono",
"title": null
}
],
"companies": [
{
"name": "MIT Media Lab",
"title": null
}
],
"emails": []
},
{
"people": [],
"emails": [],
"companies": [],
"titles": []
},
{
"emails": [],
"people": [
{
"name": "Andy Ryan",
"title": "Photographer",
"company": "MIT Media Lab"
}
],
"companies": [
{
"name": "MIT Media Lab",
"department": "Program in Media Arts and Sciences"
},
{
"name": "MIT",
"department": "Center for Bits and Atoms"
}
]
},
{
"people": [
{
"name": "Dan Allen",
"title": "Media Lab"
}
],
"companies": [
{
"name": "MIT Media Lab"
},
{
"name": "Castrol"
}
],
"emails": []
},
{
"people": [
{
"name": "Pat Pataranutaporn",
"title": "Former Graduate Student"
},
{
"name": "Pattie Maes",
"title": "Professor of Media Technology; Germeshausen Professor"
},
{
"name": "Kavin Winson",
"title": "Researcher at KASIKORN Labs"
},
{
"name": "Peggy Yin",
"title": "Harvard University Undergraduate"
},
{
"name": "Auttasak Lapapirojn",
"title": "KASIKORN Labs"
},
{
"name": "Pichayoot Ouppaphan",
"title": "KASIKORN Labs"
},
{
"name": "Monchai Lertsutthiwong",
"title": "Head of AI Research at KASIKORN Business-Technology Group"
},
{
"name": "Hal Hershfield",
"title": "Professor of Marketing, Behavioral Decision Making, and Psychology at the University of California at Los Angeles"
},
{
"name": "Jeremy Bailenson",
"title": "Thomas More Storke Professor of Communication at Stanford University"
},
{
"name": "Thanawit Prasongpongchai",
"title": "Designer at KBTG and Visiting Scientist at the Media Lab"
}
],
"companies": [
{
"name": "MIT",
"role": "AI and simulation research"
},
{
"name": "KASIKORN Labs",
"role": "Research and co-authorship"
},
{
"name": "KASIKORN Business-Technology Group",
"role": "AI research support"
}
]
},
{
"people": [
{
"name": "Andy Ryan",
"title": "Copyright"
}
],
"companies": [
{
"name": "MIT Media Lab",
"collaborator": "Castrol",
"project": "Space Research"
}
],
"emails": []
},
{
"people": [
{
"name": "Fadel Adib",
"title": "Associate Professor of Media Arts and Sciences"
},
{
"name": "Edward Boyden",
"title": "Professor of Media Arts and Sciences; Y. Eva Tan Professor in Neurotechnology"
},
{
"name": "Cynthia Breazeal",
"title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning"
},
{
"name": "Canan Dagdeviren",
"title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences"
},
{
"name": "Kevin Esvelt",
"title": "Associate Professor of Media Arts and Sciences; NEC Career Development Professor of Computer and Communications"
},
{
"name": "Behnaz Farahi",
"title": "Assistant Professor of Media Arts and Sciences; Asahi Broadcast Corp Career Development Assistant Professor"
},
{
"name": "Hugh Herr",
"title": "Professor of Media Arts and Sciences"
},
{
"name": "Hiroshi Ishii",
"title": "Jerome B. Wiesner Professor of Media Arts and Sciences; Associate Director, MIT Media Lab"
},
{
"name": "Joseph M. Jacobson",
"title": "Associate Professor of Media Arts and Sciences"
},
{
"name": "Kent Larson",
"title": "Professor of the Practice"
},
{
"name": "Paul Pu Liang",
"title": "Assistant Professor of Media Arts and Sciences; Assistant Professor of Electrical Engineering and Computer Science"
},
{
"name": "Zach Lieberman",
"title": "Adjunct Associate Professor of Media Arts and Sciences"
},
{
"name": "Andrew Lippman",
"title": "Senior Research Scientist"
},
{
"name": "Tod Machover",
"title": "Muriel R. Cooper Professor of Music and Media; Academic Head, Program in Media Arts and Sciences"
},
{
"name": "Pattie Maes",
"title": "Professor of Media Technology; Germeshausen Professor"
},
{
"name": "Dava Newman",
"title": "Director; Apollo Professor of Astronautics"
},
{
"name": "Joseph A. Paradiso",
"title": "Alexander W Dreyfoos (1954) Professor; Associate Academic Head, Program in Media Arts and Sciences"
},
{
"name": "Alex 'Sandy' Pentland",
"title": "Professor Post Tenure of Media Arts and Sciences"
},
{
"name": "Rosalind W. Picard",
"title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology"
},
{
"name": "Ramesh Raskar",
"title": "Associate Professor of Media Arts and Sciences"
},
{
"name": "Mitchel Resnick",
"title": "LEGO Papert Professor of Learning Research"
},
{
"name": "Deb Roy",
"title": "Professor of Media Arts and Sciences"
},
{
"name": "Deblina Sarkar",
"title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor"
},
{
"name": "Danielle Wood",
"title": "Associate Professor of Media Arts and Sciences; Associate Professor (Joint) of Aeronautics and Astronautics"
}
],
"emails": [],
"companies": []
},
{
"people": [
{
"name": "Canan Dagdeviren",
"title": "Individual",
"email": null
},
{
"name": "Jonathan Williams",
"title": "Individual",
"email": null
}
],
"companies": [],
"emails": []
},
{
"people": [
{
"name": "Dava Newman",
"title": "Media Lab Director"
},
{
"name": "Xin Liu",
"title": "Media Lab Alum"
}
],
"companies": [
{
"name": "MIT Media Lab"
},
{
"name": "Boston Museum of Science"
}
],
"emails": []
},
{
"people": [
{
"name": "Behnaz Farahi",
"title": "Assistant Professor, Transformative Design",
"affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS)"
},
{
"name": "Paul Liang",
"title": "Assistant Professor, AI + Human Experience",
"affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS) and MIT Schwarzman College of Computing"
},
{
"name": "Barmak Heshmat",
"title": "Co-founder",
"company": "Brelyon"
},
{
"name": "Mohammad Tariqul Islam",
"title": "MIT-Novo Nordisk Artificial Intelligence Postdoctoral Fellow"
},
{
"name": "Hao-Tung Yang",
"title": "Recipient of the T.S. Lin Fellowship Award"
},
{
"name": "Deblina Sarkar",
"title": "Recipient of NSF CAREER Award and ChadTough New Investigator Award"
},
{
"name": "Danielle Wood",
"title": "2024 Just Tech Fellow"
},
{
"name": "Baju Joy",
"title": "Whitaker Health Sciences Fellowship Award Recipient"
},
{
"name": "Max Addae",
"title": "2024 Guthman Musical Instrument Competition Winner"
},
{
"name": "Tod Machover",
"title": "Head of Opera of the Future",
"affiliation": "MIT"
},
{
"name": "Sharif Islam",
"title": "ESIP Community Fellow and Future Earth Coasts Fellow",
"affiliation": "Postdoctoral associate in the Space Enabled research group"
}
],
"companies": [
{
"name": "Samsung",
"collaboration": "MIT Media Lab"
},
{
"name": "Brelyon",
"co_founder": "Barmak Heshmat"
},
{
"name": "Castrol",
"collaboration": "AstroAnt Payload Program"
},
{
"name": "Augmental",
"product": "Mouth-based touchpad"
}
],
"email_addresses": []
}
]

View File

@ -1,2 +0,0 @@
website
https://www.media.mit.edu/
1 website
2 https://www.media.mit.edu/