diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py index 842e1345..70063071 100644 --- a/examples/sales_web_crawler/app.py +++ b/examples/sales_web_crawler/app.py @@ -15,43 +15,35 @@ load_dotenv() app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -def search_google(query, objective): - """Search Google using SerpAPI.""" - print(f"Parameters: query={query}, objective={objective}") - search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) - results = search.get_dict().get("organic_results", []) - return {"objective": objective, "results": results} - -def scrape_url(url, objective): - """Scrape a website using Firecrawl.""" +def crawl_and_analyze_url(url, objective): + """Crawl a website using Firecrawl and analyze the content.""" print(f"Parameters: url={url}, objective={objective}") - scrape_status = app.scrape_url( + # Crawl the website + crawl_status = app.crawl_url( url, - params={'formats': ['markdown']} + params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}, + poll_interval=5 ) - return {"objective": objective, "results": scrape_status} - -def crawl_url(url, objective): - """Crawl a website using Firecrawl.""" - print(f"Parameters: url={url}, objective={objective}") - # If using a crawled url set, pass the ID in the function call below - # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d") - # scrape_status['results'] = scrape_status['data'] - scrape_status = app.crawl_url( - url, - params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}} - ) - return {"objective": objective, "results": scrape_status} - -def analyze_website_content(content, objective): - """Analyze the scraped website content using OpenAI.""" - print(f"Parameters: content={content[:50]}..., objective={objective}") - analysis = generate_completion( - "website data extractor", - f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", - "Objective: " + objective + "\nContent: " + content - ) - return {"objective": objective, "results": json.loads(analysis)} + crawl_status = crawl_status['data'] + # Process each 'markdown' element individually + combined_results = [] + for item in crawl_status: + if 'markdown' in item: + content = item['markdown'] + # Analyze the content + analysis = generate_completion( + "website data extractor", + f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", + "Objective: " + objective + "\nContent: " + content + ) + # Parse the JSON result + try: + result = json.loads(analysis) + combined_results.append(result) + except json.JSONDecodeError: + print(f"Could not parse JSON from analysis: {analysis}") + # Combine the results + return {"objective": objective, "results": combined_results} def generate_completion(role, task, content): """Generate a completion using OpenAI.""" @@ -65,64 +57,20 @@ def generate_completion(role, task, content): ) return response.choices[0].message.content -def read_websites_from_csv(file_path): - """Read websites from a CSV file.""" - websites = [] - with open(file_path, mode='r') as file: - csv_reader = csv.DictReader(file) - for row in csv_reader: - websites.append(row['website']) - return websites - -def write_results_to_json(results, file_path): - """Write results to a JSON file.""" - with open(file_path, mode='w', encoding='utf-8') as file: - json.dump(json.loads(results), file, ensure_ascii=False) - -def handoff_to_search_google(): - """Hand off the search query to the search google agent.""" - return google_search_agent - -def handoff_to_map_url(): - """Hand off the url to the map url agent.""" +def handoff_to_crawl_url(): + """Hand off the url to the crawl url agent.""" return crawl_website_agent -def handoff_to_analyst(): - """Hand off the website content to the analyst agent.""" - return analyst_agent - -def handoff_to_writer(): - """Hand off the results to the writer agent.""" - return writer_agent - user_interface_agent = Agent( name="User Interface Agent", - instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. Be concise.", - functions=[read_websites_from_csv, handoff_to_search_google], -) - -google_search_agent = Agent( - name="Google Search Agent", - instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.", - functions=[search_google, handoff_to_map_url], + instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.", + functions=[handoff_to_crawl_url], ) crawl_website_agent = Agent( name="Crawl Website Agent", - instructions="You are a crawl url agent specialized in crawling the web pages. When you are done, you must hand off the results to the analyst agent.", - functions=[crawl_url, handoff_to_analyst], -) - -analyst_agent = Agent( - name="Analyst Agent", - instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must hand off the results to the writer agent.", - functions=[analyze_website_content, handoff_to_writer], -) - -writer_agent = Agent( - name="Writer Agent", - instructions="You are a writer agent that writes the final results to a JSON file.", - functions=[write_results_to_json], + instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.", + functions=[crawl_and_analyze_url], ) if __name__ == "__main__": diff --git a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json deleted file mode 100644 index 8f1f5bd8..00000000 --- a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json +++ /dev/null @@ -1,630 +0,0 @@ -[ - { - "contacts": [ - { - "name": "Canan Dagdeviren", - "email": null, - "title": null, - "company": null - }, - { - "name": "Media Lab Communications", - "email": "press@media.mit.edu", - "title": null, - "company": "MIT Media Lab" - } - ] - }, - { - "people": [ - { - "name": "Xan Foote", - "title": "Group Contact", - "email": "fluidadmin@media.mit.edu" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": "Fluid Interfaces" - } - ] - }, - { - "emails": [], - "people": [ - { - "name": "Personal Robots", - "title": "Group", - "company": "MIT Media Lab" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": "Personal Robots group" - } - ] - }, - { - "people": [ - { - "name": "David Sweeney", - "title": "Author" - }, - { - "name": "Rosalind W. Picard", - "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" - }, - { - "name": "Pattie Maes", - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Hugh Herr", - "title": "Professor of Media Arts and Sciences" - }, - { - "name": "Deblina Sarkar", - "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" - }, - { - "name": "Canan Dagdeviren", - "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" - }, - { - "name": "Dava Newman", - "title": "Director; Apollo Professor of Astronautics" - }, - { - "name": "Cynthia Breazeal", - "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" - }, - { - "name": "Susan Blumenthal, MD", - "title": "Visiting Professor; Director's Circle Member" - } - ], - "emails": [], - "companies": [] - }, - { - "people": [ - { - "name": "Dan Blondell", - "title": "I2" - } - ], - "companies": [], - "emails": [] - }, - { - "people": [ - { - "name": "Canan Dagdeviren", - "title": "Copyright Holder" - }, - { - "name": "Jonathan Williams", - "title": "Copyright Holder" - }, - { - "name": "Sara V. Fernandez", - "title": "Courtesy of" - }, - { - "name": "Irmandy Wicaksono", - "title": "Courtesy of" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": "Interdisciplinary Research Institution" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "David Sweeney", - "title": null - }, - { - "name": "Sarah Beckmann", - "title": null - }, - { - "name": "Behnaz Farahi", - "title": "Assistant Professor, Transformative Design" - }, - { - "name": "Paul Liang", - "title": "Assistant Professor, AI + Human Experience" - }, - { - "name": "Rosalind W. Picard", - "title": null - }, - { - "name": "Guillermo Herrera-Arcos", - "title": null - }, - { - "name": "Christine Higgins", - "title": null - }, - { - "name": "Patrick Chwalek", - "title": null - }, - { - "name": "Sarra Shubart", - "title": null - }, - { - "name": "Amanda Diehl", - "title": null - }, - { - "name": "Chia Evers", - "title": null - }, - { - "name": "Matthew Groh", - "title": null - }, - { - "name": "Cl\u00e9mence Taillandier", - "title": null - }, - { - "name": "Cody Paige", - "title": null - }, - { - "name": "Minoo Rathnasabapathy", - "title": null - }, - { - "name": "Alex Berke", - "title": null - } - ], - "emails": [ - "web-admin@media.mit.edu" - ], - "companies": [ - { - "name": "MIT Media Lab" - }, - { - "name": "Samsung" - }, - { - "name": "Castrol" - } - ] - }, - { - "people": [ - { - "name": "Tod Machover", - "title": "Opera Composer" - } - ], - "companies": [ - { - "name": "Future Worlds", - "title": "Design and action for the future we want to live in" - }, - { - "name": "NOAA", - "title": "The Challenge: To secure a sustainable future for all living things" - }, - { - "name": "MIT Media Lab", - "title": "Research and development in interdisciplinary expertise" - } - ] - }, - { - "emails": [ - "r-admin@media.mit.edu" - ], - "people": [ - { - "name": "Affective Computing group", - "title": "MIT Media Lab" - } - ], - "companies": [] - }, - { - "people": [ - { - "name": "David Sweeney", - "email": null, - "title": "Author at Samsung Newsroom" - }, - { - "name": "Pattie Maes", - "email": null, - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Rosalind W. Picard", - "email": null, - "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" - } - ], - "companies": [ - { - "name": "Samsung", - "email": null, - "title": "Collaborator" - }, - { - "name": "MIT Media Lab", - "email": null, - "title": "Collaborator" - } - ] - }, - { - "people": [ - { - "name": "Canan Dagdeviren", - "title": null - }, - { - "name": "Jonathan Williams", - "title": null - }, - { - "name": "Sara V. Fernandez", - "title": null - }, - { - "name": "Irmandy Wicaksono", - "title": null - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": null - } - ], - "emails": [] - }, - { - "people": [], - "emails": [], - "companies": [], - "titles": [] - }, - { - "emails": [], - "people": [ - { - "name": "Andy Ryan", - "title": "Photographer", - "company": "MIT Media Lab" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "department": "Program in Media Arts and Sciences" - }, - { - "name": "MIT", - "department": "Center for Bits and Atoms" - } - ] - }, - { - "people": [ - { - "name": "Dan Allen", - "title": "Media Lab" - } - ], - "companies": [ - { - "name": "MIT Media Lab" - }, - { - "name": "Castrol" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "Pat Pataranutaporn", - "title": "Former Graduate Student" - }, - { - "name": "Pattie Maes", - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Kavin Winson", - "title": "Researcher at KASIKORN Labs" - }, - { - "name": "Peggy Yin", - "title": "Harvard University Undergraduate" - }, - { - "name": "Auttasak Lapapirojn", - "title": "KASIKORN Labs" - }, - { - "name": "Pichayoot Ouppaphan", - "title": "KASIKORN Labs" - }, - { - "name": "Monchai Lertsutthiwong", - "title": "Head of AI Research at KASIKORN Business-Technology Group" - }, - { - "name": "Hal Hershfield", - "title": "Professor of Marketing, Behavioral Decision Making, and Psychology at the University of California at Los Angeles" - }, - { - "name": "Jeremy Bailenson", - "title": "Thomas More Storke Professor of Communication at Stanford University" - }, - { - "name": "Thanawit Prasongpongchai", - "title": "Designer at KBTG and Visiting Scientist at the Media Lab" - } - ], - "companies": [ - { - "name": "MIT", - "role": "AI and simulation research" - }, - { - "name": "KASIKORN Labs", - "role": "Research and co-authorship" - }, - { - "name": "KASIKORN Business-Technology Group", - "role": "AI research support" - } - ] - }, - { - "people": [ - { - "name": "Andy Ryan", - "title": "Copyright" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "collaborator": "Castrol", - "project": "Space Research" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "Fadel Adib", - "title": "Associate Professor of Media Arts and Sciences" - }, - { - "name": "Edward Boyden", - "title": "Professor of Media Arts and Sciences; Y. Eva Tan Professor in Neurotechnology" - }, - { - "name": "Cynthia Breazeal", - "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" - }, - { - "name": "Canan Dagdeviren", - "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" - }, - { - "name": "Kevin Esvelt", - "title": "Associate Professor of Media Arts and Sciences; NEC Career Development Professor of Computer and Communications" - }, - { - "name": "Behnaz Farahi", - "title": "Assistant Professor of Media Arts and Sciences; Asahi Broadcast Corp Career Development Assistant Professor" - }, - { - "name": "Hugh Herr", - "title": "Professor of Media Arts and Sciences" - }, - { - "name": "Hiroshi Ishii", - "title": "Jerome B. Wiesner Professor of Media Arts and Sciences; Associate Director, MIT Media Lab" - }, - { - "name": "Joseph M. Jacobson", - "title": "Associate Professor of Media Arts and Sciences" - }, - { - "name": "Kent Larson", - "title": "Professor of the Practice" - }, - { - "name": "Paul Pu Liang", - "title": "Assistant Professor of Media Arts and Sciences; Assistant Professor of Electrical Engineering and Computer Science" - }, - { - "name": "Zach Lieberman", - "title": "Adjunct Associate Professor of Media Arts and Sciences" - }, - { - "name": "Andrew Lippman", - "title": "Senior Research Scientist" - }, - { - "name": "Tod Machover", - "title": "Muriel R. Cooper Professor of Music and Media; Academic Head, Program in Media Arts and Sciences" - }, - { - "name": "Pattie Maes", - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Dava Newman", - "title": "Director; Apollo Professor of Astronautics" - }, - { - "name": "Joseph A. Paradiso", - "title": "Alexander W Dreyfoos (1954) Professor; Associate Academic Head, Program in Media Arts and Sciences" - }, - { - "name": "Alex 'Sandy' Pentland", - "title": "Professor Post Tenure of Media Arts and Sciences" - }, - { - "name": "Rosalind W. Picard", - "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" - }, - { - "name": "Ramesh Raskar", - "title": "Associate Professor of Media Arts and Sciences" - }, - { - "name": "Mitchel Resnick", - "title": "LEGO Papert Professor of Learning Research" - }, - { - "name": "Deb Roy", - "title": "Professor of Media Arts and Sciences" - }, - { - "name": "Deblina Sarkar", - "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" - }, - { - "name": "Danielle Wood", - "title": "Associate Professor of Media Arts and Sciences; Associate Professor (Joint) of Aeronautics and Astronautics" - } - ], - "emails": [], - "companies": [] - }, - { - "people": [ - { - "name": "Canan Dagdeviren", - "title": "Individual", - "email": null - }, - { - "name": "Jonathan Williams", - "title": "Individual", - "email": null - } - ], - "companies": [], - "emails": [] - }, - { - "people": [ - { - "name": "Dava Newman", - "title": "Media Lab Director" - }, - { - "name": "Xin Liu", - "title": "Media Lab Alum" - } - ], - "companies": [ - { - "name": "MIT Media Lab" - }, - { - "name": "Boston Museum of Science" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "Behnaz Farahi", - "title": "Assistant Professor, Transformative Design", - "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS)" - }, - { - "name": "Paul Liang", - "title": "Assistant Professor, AI + Human Experience", - "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS) and MIT Schwarzman College of Computing" - }, - { - "name": "Barmak Heshmat", - "title": "Co-founder", - "company": "Brelyon" - }, - { - "name": "Mohammad Tariqul Islam", - "title": "MIT-Novo Nordisk Artificial Intelligence Postdoctoral Fellow" - }, - { - "name": "Hao-Tung Yang", - "title": "Recipient of the T.S. Lin Fellowship Award" - }, - { - "name": "Deblina Sarkar", - "title": "Recipient of NSF CAREER Award and ChadTough New Investigator Award" - }, - { - "name": "Danielle Wood", - "title": "2024 Just Tech Fellow" - }, - { - "name": "Baju Joy", - "title": "Whitaker Health Sciences Fellowship Award Recipient" - }, - { - "name": "Max Addae", - "title": "2024 Guthman Musical Instrument Competition Winner" - }, - { - "name": "Tod Machover", - "title": "Head of Opera of the Future", - "affiliation": "MIT" - }, - { - "name": "Sharif Islam", - "title": "ESIP Community Fellow and Future Earth Coasts Fellow", - "affiliation": "Postdoctoral associate in the Space Enabled research group" - } - ], - "companies": [ - { - "name": "Samsung", - "collaboration": "MIT Media Lab" - }, - { - "name": "Brelyon", - "co_founder": "Barmak Heshmat" - }, - { - "name": "Castrol", - "collaboration": "AstroAnt Payload Program" - }, - { - "name": "Augmental", - "product": "Mouth-based touchpad" - } - ], - "email_addresses": [] - } -] diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv deleted file mode 100644 index f029ccfc..00000000 --- a/examples/sales_web_crawler/websites.csv +++ /dev/null @@ -1,2 +0,0 @@ -website -https://www.media.mit.edu/