From 10381b5d3cda7f9b635008b59cb90d61ca87c0ed Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 00:51:18 +0530 Subject: [PATCH 01/12] Create app.py --- examples/sales_web_crawler/app.py | 99 +++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 examples/sales_web_crawler/app.py diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py new file mode 100644 index 00000000..ae14fc62 --- /dev/null +++ b/examples/sales_web_crawler/app.py @@ -0,0 +1,99 @@ +import os +import csv +import json + +from dotenv import load_dotenv +from firecrawl import FirecrawlApp +from openai import OpenAI +from serpapi import GoogleSearch + +load_dotenv() + +# Initialize FirecrawlApp and OpenAI +app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def search_google(query, objective): + """Search Google using SerpAPI.""" + print(f"Parameters: query={query}, objective={objective}") + search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) + results = search.get_dict().get("organic_results", []) + return {"objective": objective, "results": results} + +def scrape_url(url, objective): + """Scrape a website using Firecrawl.""" + print(f"Parameters: url={url}, objective={objective}") + scrape_status = app.scrape_url( + url, + params={'formats': ['markdown']} + ) + return {"objective": objective, "results": scrape_status} + +def crawl_url(url, objective): + """Crawl a website using Firecrawl.""" + print(f"Parameters: url={url}, objective={objective}") + # If using a crawled url set, pass the ID in the function call below + # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d") + # scrape_status['results'] = scrape_status['data'] + scrape_status = app.crawl_url( + url, + params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}} + ) + return {"objective": objective, "results": scrape_status} + +def analyze_website_content(content, objective): + """Analyze the scraped website content using OpenAI.""" + print(f"Parameters: content={content[:50]}..., objective={objective}") + analysis = generate_completion( + "website data extractor", + f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", + "Objective: " + objective + "\nContent: " + content + ) + return {"objective": objective, "results": analysis} + +def generate_completion(role, task, content): + """Generate a completion using OpenAI.""" + print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") + response = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": f"You are a {role}. {task}"}, + {"role": "user", "content": content} + ] + ) + return response.choices[0].message.content + +def read_websites_from_csv(file_path): + """Read websites from a CSV file.""" + websites = [] + with open(file_path, mode='r') as file: + csv_reader = csv.DictReader(file) + for row in csv_reader: + websites.append(row['website']) + return websites + +def write_results_to_json(results, file_path): + """Write results to a JSON file.""" + with open(file_path, mode='w') as file: + json.dump(results, file, indent=4) + +def process_websites(file_path): + """Process websites from a CSV file and write results to a new JSON file.""" + results = [] + websites = read_websites_from_csv(file_path) + for website in websites: + search_results = search_google(website, "Search website") + if search_results['results']: + top_result = search_results['results'][0] + url = top_result['link'] + crawl_results = crawl_url(url, "Crawl website") + if crawl_results['results']: + for each_result in crawl_results['results']['data'][:2]: + analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.") + print(analysis_results['results']) + results.append(json.loads(analysis_results['results'])) + write_results_to_json(results, 'enriched_data.json') + +if __name__ == "__main__": + # Process websites from the CSV file + process_websites('websites.csv') From 11fd630e55128b40c56e3768308db056bed2e9a5 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 00:52:14 +0530 Subject: [PATCH 02/12] Create requirements.txt --- examples/sales_web_crawler/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 examples/sales_web_crawler/requirements.txt diff --git a/examples/sales_web_crawler/requirements.txt b/examples/sales_web_crawler/requirements.txt new file mode 100644 index 00000000..685c8e33 --- /dev/null +++ b/examples/sales_web_crawler/requirements.txt @@ -0,0 +1,3 @@ +firecrawl-py +openai +google-search-results From adfc493c9b5cf22e692f0c456c68e8c6f71b9d53 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 00:52:26 +0530 Subject: [PATCH 03/12] Create websites.csv --- examples/sales_web_crawler/websites.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 examples/sales_web_crawler/websites.csv diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv new file mode 100644 index 00000000..32bee52d --- /dev/null +++ b/examples/sales_web_crawler/websites.csv @@ -0,0 +1,2 @@ +website +https://www.launchfa.st From ba3ee8ead6c5b704d0305f5e4e49539646b7d9ea Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 00:52:47 +0530 Subject: [PATCH 04/12] Create .env.example --- examples/sales_web_crawler/.env.example | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 examples/sales_web_crawler/.env.example diff --git a/examples/sales_web_crawler/.env.example b/examples/sales_web_crawler/.env.example new file mode 100644 index 00000000..06ccc66d --- /dev/null +++ b/examples/sales_web_crawler/.env.example @@ -0,0 +1,3 @@ +OPENAI_API_KEY= +FIRECRAWL_API_KEY= +SERP_API_KEY= From f5af938ea29eae582aba97bafeef1292c29b14fe Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 02:27:17 +0530 Subject: [PATCH 05/12] Update requirements.txt --- examples/sales_web_crawler/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/sales_web_crawler/requirements.txt b/examples/sales_web_crawler/requirements.txt index 685c8e33..180c5d6c 100644 --- a/examples/sales_web_crawler/requirements.txt +++ b/examples/sales_web_crawler/requirements.txt @@ -1,3 +1,4 @@ firecrawl-py openai google-search-results +tqdm From 2022db7f0a3824abbab452bf957c2ec867b8a13a Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 02:27:25 +0530 Subject: [PATCH 06/12] Update websites.csv --- examples/sales_web_crawler/websites.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv index 32bee52d..eef3403e 100644 --- a/examples/sales_web_crawler/websites.csv +++ b/examples/sales_web_crawler/websites.csv @@ -1,2 +1,2 @@ website -https://www.launchfa.st +https://precog.iiit.ac.in/ From 7d8519218ae2ed674fd7aa6995fe94221ad0de73 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 02:27:39 +0530 Subject: [PATCH 07/12] Update app.py --- examples/sales_web_crawler/app.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py index ae14fc62..f76280e9 100644 --- a/examples/sales_web_crawler/app.py +++ b/examples/sales_web_crawler/app.py @@ -1,11 +1,13 @@ -import os import csv import json +import os +import uuid from dotenv import load_dotenv from firecrawl import FirecrawlApp from openai import OpenAI from serpapi import GoogleSearch +from tqdm import tqdm load_dotenv() @@ -15,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) def search_google(query, objective): """Search Google using SerpAPI.""" - print(f"Parameters: query={query}, objective={objective}") + # print(f"Parameters: query={query}, objective={objective}") search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) results = search.get_dict().get("organic_results", []) return {"objective": objective, "results": results} def scrape_url(url, objective): """Scrape a website using Firecrawl.""" - print(f"Parameters: url={url}, objective={objective}") + # print(f"Parameters: url={url}, objective={objective}") scrape_status = app.scrape_url( url, params={'formats': ['markdown']} @@ -31,19 +33,19 @@ def scrape_url(url, objective): def crawl_url(url, objective): """Crawl a website using Firecrawl.""" - print(f"Parameters: url={url}, objective={objective}") + # print(f"Parameters: url={url}, objective={objective}") # If using a crawled url set, pass the ID in the function call below # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d") # scrape_status['results'] = scrape_status['data'] scrape_status = app.crawl_url( url, - params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}} + params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}} ) return {"objective": objective, "results": scrape_status} def analyze_website_content(content, objective): """Analyze the scraped website content using OpenAI.""" - print(f"Parameters: content={content[:50]}..., objective={objective}") + # print(f"Parameters: content={content[:50]}..., objective={objective}") analysis = generate_completion( "website data extractor", f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", @@ -53,7 +55,7 @@ def analyze_website_content(content, objective): def generate_completion(role, task, content): """Generate a completion using OpenAI.""" - print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") + # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") response = client.chat.completions.create( model="gpt-4o", messages=[ @@ -86,13 +88,18 @@ def process_websites(file_path): if search_results['results']: top_result = search_results['results'][0] url = top_result['link'] + unique_filename = f'output_{uuid.uuid4()}.json' crawl_results = crawl_url(url, "Crawl website") if crawl_results['results']: - for each_result in crawl_results['results']['data'][:2]: - analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.") - print(analysis_results['results']) - results.append(json.loads(analysis_results['results'])) - write_results_to_json(results, 'enriched_data.json') + for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"): + analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.") + try: + result = json.loads(analysis_results['results']) + if result: + results.append(result) + write_results_to_json(results, unique_filename) + except: + continue if __name__ == "__main__": # Process websites from the CSV file From 42ec08c76ea8ae1d5e9228cc072a52af2ab301e1 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 03:53:41 +0530 Subject: [PATCH 08/12] Update websites.csv --- examples/sales_web_crawler/websites.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv index eef3403e..f029ccfc 100644 --- a/examples/sales_web_crawler/websites.csv +++ b/examples/sales_web_crawler/websites.csv @@ -1,2 +1,2 @@ website -https://precog.iiit.ac.in/ +https://www.media.mit.edu/ From 8a4ee4482d703bf5b7b45aeb2027a6482b2a211c Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 03:54:14 +0530 Subject: [PATCH 09/12] Create output_01f6efd5-1297-4745-94b5-5972c10f17d6.json --- ..._01f6efd5-1297-4745-94b5-5972c10f17d6.json | 630 ++++++++++++++++++ 1 file changed, 630 insertions(+) create mode 100644 examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json diff --git a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json new file mode 100644 index 00000000..8f1f5bd8 --- /dev/null +++ b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json @@ -0,0 +1,630 @@ +[ + { + "contacts": [ + { + "name": "Canan Dagdeviren", + "email": null, + "title": null, + "company": null + }, + { + "name": "Media Lab Communications", + "email": "press@media.mit.edu", + "title": null, + "company": "MIT Media Lab" + } + ] + }, + { + "people": [ + { + "name": "Xan Foote", + "title": "Group Contact", + "email": "fluidadmin@media.mit.edu" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "title": "Fluid Interfaces" + } + ] + }, + { + "emails": [], + "people": [ + { + "name": "Personal Robots", + "title": "Group", + "company": "MIT Media Lab" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "title": "Personal Robots group" + } + ] + }, + { + "people": [ + { + "name": "David Sweeney", + "title": "Author" + }, + { + "name": "Rosalind W. Picard", + "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" + }, + { + "name": "Pattie Maes", + "title": "Professor of Media Technology; Germeshausen Professor" + }, + { + "name": "Hugh Herr", + "title": "Professor of Media Arts and Sciences" + }, + { + "name": "Deblina Sarkar", + "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" + }, + { + "name": "Canan Dagdeviren", + "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" + }, + { + "name": "Dava Newman", + "title": "Director; Apollo Professor of Astronautics" + }, + { + "name": "Cynthia Breazeal", + "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" + }, + { + "name": "Susan Blumenthal, MD", + "title": "Visiting Professor; Director's Circle Member" + } + ], + "emails": [], + "companies": [] + }, + { + "people": [ + { + "name": "Dan Blondell", + "title": "I2" + } + ], + "companies": [], + "emails": [] + }, + { + "people": [ + { + "name": "Canan Dagdeviren", + "title": "Copyright Holder" + }, + { + "name": "Jonathan Williams", + "title": "Copyright Holder" + }, + { + "name": "Sara V. Fernandez", + "title": "Courtesy of" + }, + { + "name": "Irmandy Wicaksono", + "title": "Courtesy of" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "title": "Interdisciplinary Research Institution" + } + ], + "emails": [] + }, + { + "people": [ + { + "name": "David Sweeney", + "title": null + }, + { + "name": "Sarah Beckmann", + "title": null + }, + { + "name": "Behnaz Farahi", + "title": "Assistant Professor, Transformative Design" + }, + { + "name": "Paul Liang", + "title": "Assistant Professor, AI + Human Experience" + }, + { + "name": "Rosalind W. Picard", + "title": null + }, + { + "name": "Guillermo Herrera-Arcos", + "title": null + }, + { + "name": "Christine Higgins", + "title": null + }, + { + "name": "Patrick Chwalek", + "title": null + }, + { + "name": "Sarra Shubart", + "title": null + }, + { + "name": "Amanda Diehl", + "title": null + }, + { + "name": "Chia Evers", + "title": null + }, + { + "name": "Matthew Groh", + "title": null + }, + { + "name": "Cl\u00e9mence Taillandier", + "title": null + }, + { + "name": "Cody Paige", + "title": null + }, + { + "name": "Minoo Rathnasabapathy", + "title": null + }, + { + "name": "Alex Berke", + "title": null + } + ], + "emails": [ + "web-admin@media.mit.edu" + ], + "companies": [ + { + "name": "MIT Media Lab" + }, + { + "name": "Samsung" + }, + { + "name": "Castrol" + } + ] + }, + { + "people": [ + { + "name": "Tod Machover", + "title": "Opera Composer" + } + ], + "companies": [ + { + "name": "Future Worlds", + "title": "Design and action for the future we want to live in" + }, + { + "name": "NOAA", + "title": "The Challenge: To secure a sustainable future for all living things" + }, + { + "name": "MIT Media Lab", + "title": "Research and development in interdisciplinary expertise" + } + ] + }, + { + "emails": [ + "r-admin@media.mit.edu" + ], + "people": [ + { + "name": "Affective Computing group", + "title": "MIT Media Lab" + } + ], + "companies": [] + }, + { + "people": [ + { + "name": "David Sweeney", + "email": null, + "title": "Author at Samsung Newsroom" + }, + { + "name": "Pattie Maes", + "email": null, + "title": "Professor of Media Technology; Germeshausen Professor" + }, + { + "name": "Rosalind W. Picard", + "email": null, + "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" + } + ], + "companies": [ + { + "name": "Samsung", + "email": null, + "title": "Collaborator" + }, + { + "name": "MIT Media Lab", + "email": null, + "title": "Collaborator" + } + ] + }, + { + "people": [ + { + "name": "Canan Dagdeviren", + "title": null + }, + { + "name": "Jonathan Williams", + "title": null + }, + { + "name": "Sara V. Fernandez", + "title": null + }, + { + "name": "Irmandy Wicaksono", + "title": null + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "title": null + } + ], + "emails": [] + }, + { + "people": [], + "emails": [], + "companies": [], + "titles": [] + }, + { + "emails": [], + "people": [ + { + "name": "Andy Ryan", + "title": "Photographer", + "company": "MIT Media Lab" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "department": "Program in Media Arts and Sciences" + }, + { + "name": "MIT", + "department": "Center for Bits and Atoms" + } + ] + }, + { + "people": [ + { + "name": "Dan Allen", + "title": "Media Lab" + } + ], + "companies": [ + { + "name": "MIT Media Lab" + }, + { + "name": "Castrol" + } + ], + "emails": [] + }, + { + "people": [ + { + "name": "Pat Pataranutaporn", + "title": "Former Graduate Student" + }, + { + "name": "Pattie Maes", + "title": "Professor of Media Technology; Germeshausen Professor" + }, + { + "name": "Kavin Winson", + "title": "Researcher at KASIKORN Labs" + }, + { + "name": "Peggy Yin", + "title": "Harvard University Undergraduate" + }, + { + "name": "Auttasak Lapapirojn", + "title": "KASIKORN Labs" + }, + { + "name": "Pichayoot Ouppaphan", + "title": "KASIKORN Labs" + }, + { + "name": "Monchai Lertsutthiwong", + "title": "Head of AI Research at KASIKORN Business-Technology Group" + }, + { + "name": "Hal Hershfield", + "title": "Professor of Marketing, Behavioral Decision Making, and Psychology at the University of California at Los Angeles" + }, + { + "name": "Jeremy Bailenson", + "title": "Thomas More Storke Professor of Communication at Stanford University" + }, + { + "name": "Thanawit Prasongpongchai", + "title": "Designer at KBTG and Visiting Scientist at the Media Lab" + } + ], + "companies": [ + { + "name": "MIT", + "role": "AI and simulation research" + }, + { + "name": "KASIKORN Labs", + "role": "Research and co-authorship" + }, + { + "name": "KASIKORN Business-Technology Group", + "role": "AI research support" + } + ] + }, + { + "people": [ + { + "name": "Andy Ryan", + "title": "Copyright" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "collaborator": "Castrol", + "project": "Space Research" + } + ], + "emails": [] + }, + { + "people": [ + { + "name": "Fadel Adib", + "title": "Associate Professor of Media Arts and Sciences" + }, + { + "name": "Edward Boyden", + "title": "Professor of Media Arts and Sciences; Y. Eva Tan Professor in Neurotechnology" + }, + { + "name": "Cynthia Breazeal", + "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" + }, + { + "name": "Canan Dagdeviren", + "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" + }, + { + "name": "Kevin Esvelt", + "title": "Associate Professor of Media Arts and Sciences; NEC Career Development Professor of Computer and Communications" + }, + { + "name": "Behnaz Farahi", + "title": "Assistant Professor of Media Arts and Sciences; Asahi Broadcast Corp Career Development Assistant Professor" + }, + { + "name": "Hugh Herr", + "title": "Professor of Media Arts and Sciences" + }, + { + "name": "Hiroshi Ishii", + "title": "Jerome B. Wiesner Professor of Media Arts and Sciences; Associate Director, MIT Media Lab" + }, + { + "name": "Joseph M. Jacobson", + "title": "Associate Professor of Media Arts and Sciences" + }, + { + "name": "Kent Larson", + "title": "Professor of the Practice" + }, + { + "name": "Paul Pu Liang", + "title": "Assistant Professor of Media Arts and Sciences; Assistant Professor of Electrical Engineering and Computer Science" + }, + { + "name": "Zach Lieberman", + "title": "Adjunct Associate Professor of Media Arts and Sciences" + }, + { + "name": "Andrew Lippman", + "title": "Senior Research Scientist" + }, + { + "name": "Tod Machover", + "title": "Muriel R. Cooper Professor of Music and Media; Academic Head, Program in Media Arts and Sciences" + }, + { + "name": "Pattie Maes", + "title": "Professor of Media Technology; Germeshausen Professor" + }, + { + "name": "Dava Newman", + "title": "Director; Apollo Professor of Astronautics" + }, + { + "name": "Joseph A. Paradiso", + "title": "Alexander W Dreyfoos (1954) Professor; Associate Academic Head, Program in Media Arts and Sciences" + }, + { + "name": "Alex 'Sandy' Pentland", + "title": "Professor Post Tenure of Media Arts and Sciences" + }, + { + "name": "Rosalind W. Picard", + "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" + }, + { + "name": "Ramesh Raskar", + "title": "Associate Professor of Media Arts and Sciences" + }, + { + "name": "Mitchel Resnick", + "title": "LEGO Papert Professor of Learning Research" + }, + { + "name": "Deb Roy", + "title": "Professor of Media Arts and Sciences" + }, + { + "name": "Deblina Sarkar", + "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" + }, + { + "name": "Danielle Wood", + "title": "Associate Professor of Media Arts and Sciences; Associate Professor (Joint) of Aeronautics and Astronautics" + } + ], + "emails": [], + "companies": [] + }, + { + "people": [ + { + "name": "Canan Dagdeviren", + "title": "Individual", + "email": null + }, + { + "name": "Jonathan Williams", + "title": "Individual", + "email": null + } + ], + "companies": [], + "emails": [] + }, + { + "people": [ + { + "name": "Dava Newman", + "title": "Media Lab Director" + }, + { + "name": "Xin Liu", + "title": "Media Lab Alum" + } + ], + "companies": [ + { + "name": "MIT Media Lab" + }, + { + "name": "Boston Museum of Science" + } + ], + "emails": [] + }, + { + "people": [ + { + "name": "Behnaz Farahi", + "title": "Assistant Professor, Transformative Design", + "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS)" + }, + { + "name": "Paul Liang", + "title": "Assistant Professor, AI + Human Experience", + "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS) and MIT Schwarzman College of Computing" + }, + { + "name": "Barmak Heshmat", + "title": "Co-founder", + "company": "Brelyon" + }, + { + "name": "Mohammad Tariqul Islam", + "title": "MIT-Novo Nordisk Artificial Intelligence Postdoctoral Fellow" + }, + { + "name": "Hao-Tung Yang", + "title": "Recipient of the T.S. Lin Fellowship Award" + }, + { + "name": "Deblina Sarkar", + "title": "Recipient of NSF CAREER Award and ChadTough New Investigator Award" + }, + { + "name": "Danielle Wood", + "title": "2024 Just Tech Fellow" + }, + { + "name": "Baju Joy", + "title": "Whitaker Health Sciences Fellowship Award Recipient" + }, + { + "name": "Max Addae", + "title": "2024 Guthman Musical Instrument Competition Winner" + }, + { + "name": "Tod Machover", + "title": "Head of Opera of the Future", + "affiliation": "MIT" + }, + { + "name": "Sharif Islam", + "title": "ESIP Community Fellow and Future Earth Coasts Fellow", + "affiliation": "Postdoctoral associate in the Space Enabled research group" + } + ], + "companies": [ + { + "name": "Samsung", + "collaboration": "MIT Media Lab" + }, + { + "name": "Brelyon", + "co_founder": "Barmak Heshmat" + }, + { + "name": "Castrol", + "collaboration": "AstroAnt Payload Program" + }, + { + "name": "Augmental", + "product": "Mouth-based touchpad" + } + ], + "email_addresses": [] + } +] From d113199a297a98e0b13a4438838486bb2f21f736 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sun, 20 Oct 2024 18:08:38 +0530 Subject: [PATCH 10/12] Update app.py --- examples/sales_web_crawler/app.py | 92 +++++++++++++++++++------------ 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py index f76280e9..842e1345 100644 --- a/examples/sales_web_crawler/app.py +++ b/examples/sales_web_crawler/app.py @@ -1,13 +1,13 @@ import csv import json import os -import uuid from dotenv import load_dotenv from firecrawl import FirecrawlApp from openai import OpenAI from serpapi import GoogleSearch -from tqdm import tqdm +from swarm import Agent +from swarm.repl import run_demo_loop load_dotenv() @@ -17,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) def search_google(query, objective): """Search Google using SerpAPI.""" - # print(f"Parameters: query={query}, objective={objective}") + print(f"Parameters: query={query}, objective={objective}") search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) results = search.get_dict().get("organic_results", []) return {"objective": objective, "results": results} def scrape_url(url, objective): """Scrape a website using Firecrawl.""" - # print(f"Parameters: url={url}, objective={objective}") + print(f"Parameters: url={url}, objective={objective}") scrape_status = app.scrape_url( url, params={'formats': ['markdown']} @@ -33,29 +33,29 @@ def scrape_url(url, objective): def crawl_url(url, objective): """Crawl a website using Firecrawl.""" - # print(f"Parameters: url={url}, objective={objective}") + print(f"Parameters: url={url}, objective={objective}") # If using a crawled url set, pass the ID in the function call below # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d") # scrape_status['results'] = scrape_status['data'] scrape_status = app.crawl_url( url, - params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}} + params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}} ) return {"objective": objective, "results": scrape_status} def analyze_website_content(content, objective): """Analyze the scraped website content using OpenAI.""" - # print(f"Parameters: content={content[:50]}..., objective={objective}") + print(f"Parameters: content={content[:50]}..., objective={objective}") analysis = generate_completion( "website data extractor", f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", "Objective: " + objective + "\nContent: " + content ) - return {"objective": objective, "results": analysis} + return {"objective": objective, "results": json.loads(analysis)} def generate_completion(role, task, content): """Generate a completion using OpenAI.""" - # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") + print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") response = client.chat.completions.create( model="gpt-4o", messages=[ @@ -76,31 +76,55 @@ def read_websites_from_csv(file_path): def write_results_to_json(results, file_path): """Write results to a JSON file.""" - with open(file_path, mode='w') as file: - json.dump(results, file, indent=4) + with open(file_path, mode='w', encoding='utf-8') as file: + json.dump(json.loads(results), file, ensure_ascii=False) -def process_websites(file_path): - """Process websites from a CSV file and write results to a new JSON file.""" - results = [] - websites = read_websites_from_csv(file_path) - for website in websites: - search_results = search_google(website, "Search website") - if search_results['results']: - top_result = search_results['results'][0] - url = top_result['link'] - unique_filename = f'output_{uuid.uuid4()}.json' - crawl_results = crawl_url(url, "Crawl website") - if crawl_results['results']: - for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"): - analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.") - try: - result = json.loads(analysis_results['results']) - if result: - results.append(result) - write_results_to_json(results, unique_filename) - except: - continue +def handoff_to_search_google(): + """Hand off the search query to the search google agent.""" + return google_search_agent + +def handoff_to_map_url(): + """Hand off the url to the map url agent.""" + return crawl_website_agent + +def handoff_to_analyst(): + """Hand off the website content to the analyst agent.""" + return analyst_agent + +def handoff_to_writer(): + """Hand off the results to the writer agent.""" + return writer_agent + +user_interface_agent = Agent( + name="User Interface Agent", + instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. Be concise.", + functions=[read_websites_from_csv, handoff_to_search_google], +) + +google_search_agent = Agent( + name="Google Search Agent", + instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.", + functions=[search_google, handoff_to_map_url], +) + +crawl_website_agent = Agent( + name="Crawl Website Agent", + instructions="You are a crawl url agent specialized in crawling the web pages. When you are done, you must hand off the results to the analyst agent.", + functions=[crawl_url, handoff_to_analyst], +) + +analyst_agent = Agent( + name="Analyst Agent", + instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must hand off the results to the writer agent.", + functions=[analyze_website_content, handoff_to_writer], +) + +writer_agent = Agent( + name="Writer Agent", + instructions="You are a writer agent that writes the final results to a JSON file.", + functions=[write_results_to_json], +) if __name__ == "__main__": - # Process websites from the CSV file - process_websites('websites.csv') + # Run the demo loop with the user interface agent + run_demo_loop(user_interface_agent, stream=True) From cf98d69bbbf5e8d9afd546ef7dff74373bce7249 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sun, 20 Oct 2024 18:09:38 +0530 Subject: [PATCH 11/12] Update requirements.txt --- examples/sales_web_crawler/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sales_web_crawler/requirements.txt b/examples/sales_web_crawler/requirements.txt index 180c5d6c..d7be486c 100644 --- a/examples/sales_web_crawler/requirements.txt +++ b/examples/sales_web_crawler/requirements.txt @@ -1,4 +1,4 @@ firecrawl-py openai google-search-results -tqdm +git+https://github.com/openai/swarm.git From 22d375ad293296c3533c2195bd6be9a3fbb841ad Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 21 Oct 2024 12:01:09 -0400 Subject: [PATCH 12/12] Updates --- examples/sales_web_crawler/app.py | 116 +--- ..._01f6efd5-1297-4745-94b5-5972c10f17d6.json | 630 ------------------ examples/sales_web_crawler/websites.csv | 2 - 3 files changed, 32 insertions(+), 716 deletions(-) delete mode 100644 examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json delete mode 100644 examples/sales_web_crawler/websites.csv diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py index 842e1345..70063071 100644 --- a/examples/sales_web_crawler/app.py +++ b/examples/sales_web_crawler/app.py @@ -15,43 +15,35 @@ load_dotenv() app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -def search_google(query, objective): - """Search Google using SerpAPI.""" - print(f"Parameters: query={query}, objective={objective}") - search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) - results = search.get_dict().get("organic_results", []) - return {"objective": objective, "results": results} - -def scrape_url(url, objective): - """Scrape a website using Firecrawl.""" +def crawl_and_analyze_url(url, objective): + """Crawl a website using Firecrawl and analyze the content.""" print(f"Parameters: url={url}, objective={objective}") - scrape_status = app.scrape_url( + # Crawl the website + crawl_status = app.crawl_url( url, - params={'formats': ['markdown']} + params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}, + poll_interval=5 ) - return {"objective": objective, "results": scrape_status} - -def crawl_url(url, objective): - """Crawl a website using Firecrawl.""" - print(f"Parameters: url={url}, objective={objective}") - # If using a crawled url set, pass the ID in the function call below - # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d") - # scrape_status['results'] = scrape_status['data'] - scrape_status = app.crawl_url( - url, - params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}} - ) - return {"objective": objective, "results": scrape_status} - -def analyze_website_content(content, objective): - """Analyze the scraped website content using OpenAI.""" - print(f"Parameters: content={content[:50]}..., objective={objective}") - analysis = generate_completion( - "website data extractor", - f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", - "Objective: " + objective + "\nContent: " + content - ) - return {"objective": objective, "results": json.loads(analysis)} + crawl_status = crawl_status['data'] + # Process each 'markdown' element individually + combined_results = [] + for item in crawl_status: + if 'markdown' in item: + content = item['markdown'] + # Analyze the content + analysis = generate_completion( + "website data extractor", + f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", + "Objective: " + objective + "\nContent: " + content + ) + # Parse the JSON result + try: + result = json.loads(analysis) + combined_results.append(result) + except json.JSONDecodeError: + print(f"Could not parse JSON from analysis: {analysis}") + # Combine the results + return {"objective": objective, "results": combined_results} def generate_completion(role, task, content): """Generate a completion using OpenAI.""" @@ -65,64 +57,20 @@ def generate_completion(role, task, content): ) return response.choices[0].message.content -def read_websites_from_csv(file_path): - """Read websites from a CSV file.""" - websites = [] - with open(file_path, mode='r') as file: - csv_reader = csv.DictReader(file) - for row in csv_reader: - websites.append(row['website']) - return websites - -def write_results_to_json(results, file_path): - """Write results to a JSON file.""" - with open(file_path, mode='w', encoding='utf-8') as file: - json.dump(json.loads(results), file, ensure_ascii=False) - -def handoff_to_search_google(): - """Hand off the search query to the search google agent.""" - return google_search_agent - -def handoff_to_map_url(): - """Hand off the url to the map url agent.""" +def handoff_to_crawl_url(): + """Hand off the url to the crawl url agent.""" return crawl_website_agent -def handoff_to_analyst(): - """Hand off the website content to the analyst agent.""" - return analyst_agent - -def handoff_to_writer(): - """Hand off the results to the writer agent.""" - return writer_agent - user_interface_agent = Agent( name="User Interface Agent", - instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. Be concise.", - functions=[read_websites_from_csv, handoff_to_search_google], -) - -google_search_agent = Agent( - name="Google Search Agent", - instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.", - functions=[search_google, handoff_to_map_url], + instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.", + functions=[handoff_to_crawl_url], ) crawl_website_agent = Agent( name="Crawl Website Agent", - instructions="You are a crawl url agent specialized in crawling the web pages. When you are done, you must hand off the results to the analyst agent.", - functions=[crawl_url, handoff_to_analyst], -) - -analyst_agent = Agent( - name="Analyst Agent", - instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must hand off the results to the writer agent.", - functions=[analyze_website_content, handoff_to_writer], -) - -writer_agent = Agent( - name="Writer Agent", - instructions="You are a writer agent that writes the final results to a JSON file.", - functions=[write_results_to_json], + instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.", + functions=[crawl_and_analyze_url], ) if __name__ == "__main__": diff --git a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json deleted file mode 100644 index 8f1f5bd8..00000000 --- a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json +++ /dev/null @@ -1,630 +0,0 @@ -[ - { - "contacts": [ - { - "name": "Canan Dagdeviren", - "email": null, - "title": null, - "company": null - }, - { - "name": "Media Lab Communications", - "email": "press@media.mit.edu", - "title": null, - "company": "MIT Media Lab" - } - ] - }, - { - "people": [ - { - "name": "Xan Foote", - "title": "Group Contact", - "email": "fluidadmin@media.mit.edu" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": "Fluid Interfaces" - } - ] - }, - { - "emails": [], - "people": [ - { - "name": "Personal Robots", - "title": "Group", - "company": "MIT Media Lab" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": "Personal Robots group" - } - ] - }, - { - "people": [ - { - "name": "David Sweeney", - "title": "Author" - }, - { - "name": "Rosalind W. Picard", - "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" - }, - { - "name": "Pattie Maes", - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Hugh Herr", - "title": "Professor of Media Arts and Sciences" - }, - { - "name": "Deblina Sarkar", - "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" - }, - { - "name": "Canan Dagdeviren", - "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" - }, - { - "name": "Dava Newman", - "title": "Director; Apollo Professor of Astronautics" - }, - { - "name": "Cynthia Breazeal", - "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" - }, - { - "name": "Susan Blumenthal, MD", - "title": "Visiting Professor; Director's Circle Member" - } - ], - "emails": [], - "companies": [] - }, - { - "people": [ - { - "name": "Dan Blondell", - "title": "I2" - } - ], - "companies": [], - "emails": [] - }, - { - "people": [ - { - "name": "Canan Dagdeviren", - "title": "Copyright Holder" - }, - { - "name": "Jonathan Williams", - "title": "Copyright Holder" - }, - { - "name": "Sara V. Fernandez", - "title": "Courtesy of" - }, - { - "name": "Irmandy Wicaksono", - "title": "Courtesy of" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": "Interdisciplinary Research Institution" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "David Sweeney", - "title": null - }, - { - "name": "Sarah Beckmann", - "title": null - }, - { - "name": "Behnaz Farahi", - "title": "Assistant Professor, Transformative Design" - }, - { - "name": "Paul Liang", - "title": "Assistant Professor, AI + Human Experience" - }, - { - "name": "Rosalind W. Picard", - "title": null - }, - { - "name": "Guillermo Herrera-Arcos", - "title": null - }, - { - "name": "Christine Higgins", - "title": null - }, - { - "name": "Patrick Chwalek", - "title": null - }, - { - "name": "Sarra Shubart", - "title": null - }, - { - "name": "Amanda Diehl", - "title": null - }, - { - "name": "Chia Evers", - "title": null - }, - { - "name": "Matthew Groh", - "title": null - }, - { - "name": "Cl\u00e9mence Taillandier", - "title": null - }, - { - "name": "Cody Paige", - "title": null - }, - { - "name": "Minoo Rathnasabapathy", - "title": null - }, - { - "name": "Alex Berke", - "title": null - } - ], - "emails": [ - "web-admin@media.mit.edu" - ], - "companies": [ - { - "name": "MIT Media Lab" - }, - { - "name": "Samsung" - }, - { - "name": "Castrol" - } - ] - }, - { - "people": [ - { - "name": "Tod Machover", - "title": "Opera Composer" - } - ], - "companies": [ - { - "name": "Future Worlds", - "title": "Design and action for the future we want to live in" - }, - { - "name": "NOAA", - "title": "The Challenge: To secure a sustainable future for all living things" - }, - { - "name": "MIT Media Lab", - "title": "Research and development in interdisciplinary expertise" - } - ] - }, - { - "emails": [ - "r-admin@media.mit.edu" - ], - "people": [ - { - "name": "Affective Computing group", - "title": "MIT Media Lab" - } - ], - "companies": [] - }, - { - "people": [ - { - "name": "David Sweeney", - "email": null, - "title": "Author at Samsung Newsroom" - }, - { - "name": "Pattie Maes", - "email": null, - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Rosalind W. Picard", - "email": null, - "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" - } - ], - "companies": [ - { - "name": "Samsung", - "email": null, - "title": "Collaborator" - }, - { - "name": "MIT Media Lab", - "email": null, - "title": "Collaborator" - } - ] - }, - { - "people": [ - { - "name": "Canan Dagdeviren", - "title": null - }, - { - "name": "Jonathan Williams", - "title": null - }, - { - "name": "Sara V. Fernandez", - "title": null - }, - { - "name": "Irmandy Wicaksono", - "title": null - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": null - } - ], - "emails": [] - }, - { - "people": [], - "emails": [], - "companies": [], - "titles": [] - }, - { - "emails": [], - "people": [ - { - "name": "Andy Ryan", - "title": "Photographer", - "company": "MIT Media Lab" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "department": "Program in Media Arts and Sciences" - }, - { - "name": "MIT", - "department": "Center for Bits and Atoms" - } - ] - }, - { - "people": [ - { - "name": "Dan Allen", - "title": "Media Lab" - } - ], - "companies": [ - { - "name": "MIT Media Lab" - }, - { - "name": "Castrol" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "Pat Pataranutaporn", - "title": "Former Graduate Student" - }, - { - "name": "Pattie Maes", - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Kavin Winson", - "title": "Researcher at KASIKORN Labs" - }, - { - "name": "Peggy Yin", - "title": "Harvard University Undergraduate" - }, - { - "name": "Auttasak Lapapirojn", - "title": "KASIKORN Labs" - }, - { - "name": "Pichayoot Ouppaphan", - "title": "KASIKORN Labs" - }, - { - "name": "Monchai Lertsutthiwong", - "title": "Head of AI Research at KASIKORN Business-Technology Group" - }, - { - "name": "Hal Hershfield", - "title": "Professor of Marketing, Behavioral Decision Making, and Psychology at the University of California at Los Angeles" - }, - { - "name": "Jeremy Bailenson", - "title": "Thomas More Storke Professor of Communication at Stanford University" - }, - { - "name": "Thanawit Prasongpongchai", - "title": "Designer at KBTG and Visiting Scientist at the Media Lab" - } - ], - "companies": [ - { - "name": "MIT", - "role": "AI and simulation research" - }, - { - "name": "KASIKORN Labs", - "role": "Research and co-authorship" - }, - { - "name": "KASIKORN Business-Technology Group", - "role": "AI research support" - } - ] - }, - { - "people": [ - { - "name": "Andy Ryan", - "title": "Copyright" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "collaborator": "Castrol", - "project": "Space Research" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "Fadel Adib", - "title": "Associate Professor of Media Arts and Sciences" - }, - { - "name": "Edward Boyden", - "title": "Professor of Media Arts and Sciences; Y. Eva Tan Professor in Neurotechnology" - }, - { - "name": "Cynthia Breazeal", - "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" - }, - { - "name": "Canan Dagdeviren", - "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" - }, - { - "name": "Kevin Esvelt", - "title": "Associate Professor of Media Arts and Sciences; NEC Career Development Professor of Computer and Communications" - }, - { - "name": "Behnaz Farahi", - "title": "Assistant Professor of Media Arts and Sciences; Asahi Broadcast Corp Career Development Assistant Professor" - }, - { - "name": "Hugh Herr", - "title": "Professor of Media Arts and Sciences" - }, - { - "name": "Hiroshi Ishii", - "title": "Jerome B. Wiesner Professor of Media Arts and Sciences; Associate Director, MIT Media Lab" - }, - { - "name": "Joseph M. Jacobson", - "title": "Associate Professor of Media Arts and Sciences" - }, - { - "name": "Kent Larson", - "title": "Professor of the Practice" - }, - { - "name": "Paul Pu Liang", - "title": "Assistant Professor of Media Arts and Sciences; Assistant Professor of Electrical Engineering and Computer Science" - }, - { - "name": "Zach Lieberman", - "title": "Adjunct Associate Professor of Media Arts and Sciences" - }, - { - "name": "Andrew Lippman", - "title": "Senior Research Scientist" - }, - { - "name": "Tod Machover", - "title": "Muriel R. Cooper Professor of Music and Media; Academic Head, Program in Media Arts and Sciences" - }, - { - "name": "Pattie Maes", - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Dava Newman", - "title": "Director; Apollo Professor of Astronautics" - }, - { - "name": "Joseph A. Paradiso", - "title": "Alexander W Dreyfoos (1954) Professor; Associate Academic Head, Program in Media Arts and Sciences" - }, - { - "name": "Alex 'Sandy' Pentland", - "title": "Professor Post Tenure of Media Arts and Sciences" - }, - { - "name": "Rosalind W. Picard", - "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" - }, - { - "name": "Ramesh Raskar", - "title": "Associate Professor of Media Arts and Sciences" - }, - { - "name": "Mitchel Resnick", - "title": "LEGO Papert Professor of Learning Research" - }, - { - "name": "Deb Roy", - "title": "Professor of Media Arts and Sciences" - }, - { - "name": "Deblina Sarkar", - "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" - }, - { - "name": "Danielle Wood", - "title": "Associate Professor of Media Arts and Sciences; Associate Professor (Joint) of Aeronautics and Astronautics" - } - ], - "emails": [], - "companies": [] - }, - { - "people": [ - { - "name": "Canan Dagdeviren", - "title": "Individual", - "email": null - }, - { - "name": "Jonathan Williams", - "title": "Individual", - "email": null - } - ], - "companies": [], - "emails": [] - }, - { - "people": [ - { - "name": "Dava Newman", - "title": "Media Lab Director" - }, - { - "name": "Xin Liu", - "title": "Media Lab Alum" - } - ], - "companies": [ - { - "name": "MIT Media Lab" - }, - { - "name": "Boston Museum of Science" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "Behnaz Farahi", - "title": "Assistant Professor, Transformative Design", - "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS)" - }, - { - "name": "Paul Liang", - "title": "Assistant Professor, AI + Human Experience", - "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS) and MIT Schwarzman College of Computing" - }, - { - "name": "Barmak Heshmat", - "title": "Co-founder", - "company": "Brelyon" - }, - { - "name": "Mohammad Tariqul Islam", - "title": "MIT-Novo Nordisk Artificial Intelligence Postdoctoral Fellow" - }, - { - "name": "Hao-Tung Yang", - "title": "Recipient of the T.S. Lin Fellowship Award" - }, - { - "name": "Deblina Sarkar", - "title": "Recipient of NSF CAREER Award and ChadTough New Investigator Award" - }, - { - "name": "Danielle Wood", - "title": "2024 Just Tech Fellow" - }, - { - "name": "Baju Joy", - "title": "Whitaker Health Sciences Fellowship Award Recipient" - }, - { - "name": "Max Addae", - "title": "2024 Guthman Musical Instrument Competition Winner" - }, - { - "name": "Tod Machover", - "title": "Head of Opera of the Future", - "affiliation": "MIT" - }, - { - "name": "Sharif Islam", - "title": "ESIP Community Fellow and Future Earth Coasts Fellow", - "affiliation": "Postdoctoral associate in the Space Enabled research group" - } - ], - "companies": [ - { - "name": "Samsung", - "collaboration": "MIT Media Lab" - }, - { - "name": "Brelyon", - "co_founder": "Barmak Heshmat" - }, - { - "name": "Castrol", - "collaboration": "AstroAnt Payload Program" - }, - { - "name": "Augmental", - "product": "Mouth-based touchpad" - } - ], - "email_addresses": [] - } -] diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv deleted file mode 100644 index f029ccfc..00000000 --- a/examples/sales_web_crawler/websites.csv +++ /dev/null @@ -1,2 +0,0 @@ -website -https://www.media.mit.edu/