From 10381b5d3cda7f9b635008b59cb90d61ca87c0ed Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sat, 19 Oct 2024 00:51:18 +0530
Subject: [PATCH 01/12] Create app.py

---
 examples/sales_web_crawler/app.py | 99 +++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 examples/sales_web_crawler/app.py

diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py
new file mode 100644
index 00000000..ae14fc62
--- /dev/null
+++ b/examples/sales_web_crawler/app.py
@@ -0,0 +1,99 @@
+import os
+import csv
+import json
+
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+from openai import OpenAI
+from serpapi import GoogleSearch
+
+load_dotenv()
+
+# Initialize FirecrawlApp and OpenAI
+app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+def search_google(query, objective):
+    """Search Google using SerpAPI."""
+    print(f"Parameters: query={query}, objective={objective}")
+    search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
+    results = search.get_dict().get("organic_results", [])
+    return {"objective": objective, "results": results}
+
+def scrape_url(url, objective):
+    """Scrape a website using Firecrawl."""
+    print(f"Parameters: url={url}, objective={objective}")
+    scrape_status = app.scrape_url(
+        url,
+        params={'formats': ['markdown']}
+    )
+    return {"objective": objective, "results": scrape_status}
+
+def crawl_url(url, objective):
+    """Crawl a website using Firecrawl."""
+    print(f"Parameters: url={url}, objective={objective}")
+    # If using a crawled url set, pass the ID in the function call below
+    # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
+    # scrape_status['results'] = scrape_status['data']
+    scrape_status = app.crawl_url(
+        url,
+        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
+    )
+    return {"objective": objective, "results": scrape_status}
+
+def analyze_website_content(content, objective):
+    """Analyze the scraped website content using OpenAI."""
+    print(f"Parameters: content={content[:50]}..., objective={objective}")
+    analysis = generate_completion(
+        "website data extractor",
+        f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
+        "Objective: " + objective + "\nContent: " + content
+    )
+    return {"objective": objective, "results": analysis}
+
+def generate_completion(role, task, content):
+    """Generate a completion using OpenAI."""
+    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    response = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[
+            {"role": "system", "content": f"You are a {role}. {task}"},
+            {"role": "user", "content": content}
+        ]
+    )
+    return response.choices[0].message.content
+
+def read_websites_from_csv(file_path):
+    """Read websites from a CSV file."""
+    websites = []
+    with open(file_path, mode='r') as file:
+        csv_reader = csv.DictReader(file)
+        for row in csv_reader:
+            websites.append(row['website'])
+    return websites
+
+def write_results_to_json(results, file_path):
+    """Write results to a JSON file."""
+    with open(file_path, mode='w') as file:
+        json.dump(results, file, indent=4)
+
+def process_websites(file_path):
+    """Process websites from a CSV file and write results to a new JSON file."""
+    results = []
+    websites = read_websites_from_csv(file_path)
+    for website in websites:
+        search_results = search_google(website, "Search website")
+        if search_results['results']:
+            top_result = search_results['results'][0]
+            url = top_result['link']
+            crawl_results = crawl_url(url, "Crawl website")
+            if crawl_results['results']:
+                for each_result in crawl_results['results']['data'][:2]:
+                    analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.")
+                    print(analysis_results['results'])
+                    results.append(json.loads(analysis_results['results']))
+    write_results_to_json(results, 'enriched_data.json')
+
+if __name__ == "__main__":
+    # Process websites from the CSV file
+    process_websites('websites.csv')

From 11fd630e55128b40c56e3768308db056bed2e9a5 Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sat, 19 Oct 2024 00:52:14 +0530
Subject: [PATCH 02/12] Create requirements.txt

---
 examples/sales_web_crawler/requirements.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 examples/sales_web_crawler/requirements.txt

diff --git a/examples/sales_web_crawler/requirements.txt b/examples/sales_web_crawler/requirements.txt
new file mode 100644
index 00000000..685c8e33
--- /dev/null
+++ b/examples/sales_web_crawler/requirements.txt
@@ -0,0 +1,3 @@
+firecrawl-py
+openai
+google-search-results

From adfc493c9b5cf22e692f0c456c68e8c6f71b9d53 Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sat, 19 Oct 2024 00:52:26 +0530
Subject: [PATCH 03/12] Create websites.csv

---
 examples/sales_web_crawler/websites.csv | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 examples/sales_web_crawler/websites.csv

diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv
new file mode 100644
index 00000000..32bee52d
--- /dev/null
+++ b/examples/sales_web_crawler/websites.csv
@@ -0,0 +1,2 @@
+website
+https://www.launchfa.st

From ba3ee8ead6c5b704d0305f5e4e49539646b7d9ea Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sat, 19 Oct 2024 00:52:47 +0530
Subject: [PATCH 04/12] Create .env.example

---
 examples/sales_web_crawler/.env.example | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 examples/sales_web_crawler/.env.example

diff --git a/examples/sales_web_crawler/.env.example b/examples/sales_web_crawler/.env.example
new file mode 100644
index 00000000..06ccc66d
--- /dev/null
+++ b/examples/sales_web_crawler/.env.example
@@ -0,0 +1,3 @@
+OPENAI_API_KEY=
+FIRECRAWL_API_KEY=
+SERP_API_KEY=

From f5af938ea29eae582aba97bafeef1292c29b14fe Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sat, 19 Oct 2024 02:27:17 +0530
Subject: [PATCH 05/12] Update requirements.txt

---
 examples/sales_web_crawler/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/sales_web_crawler/requirements.txt b/examples/sales_web_crawler/requirements.txt
index 685c8e33..180c5d6c 100644
--- a/examples/sales_web_crawler/requirements.txt
+++ b/examples/sales_web_crawler/requirements.txt
@@ -1,3 +1,4 @@
 firecrawl-py
 openai
 google-search-results
+tqdm

From 2022db7f0a3824abbab452bf957c2ec867b8a13a Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sat, 19 Oct 2024 02:27:25 +0530
Subject: [PATCH 06/12] Update websites.csv

---
 examples/sales_web_crawler/websites.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv
index 32bee52d..eef3403e 100644
--- a/examples/sales_web_crawler/websites.csv
+++ b/examples/sales_web_crawler/websites.csv
@@ -1,2 +1,2 @@
 website
-https://www.launchfa.st
+https://precog.iiit.ac.in/

From 7d8519218ae2ed674fd7aa6995fe94221ad0de73 Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sat, 19 Oct 2024 02:27:39 +0530
Subject: [PATCH 07/12] Update app.py

---
 examples/sales_web_crawler/app.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py
index ae14fc62..f76280e9 100644
--- a/examples/sales_web_crawler/app.py
+++ b/examples/sales_web_crawler/app.py
@@ -1,11 +1,13 @@
-import os
 import csv
 import json
+import os
+import uuid
 
 from dotenv import load_dotenv
 from firecrawl import FirecrawlApp
 from openai import OpenAI
 from serpapi import GoogleSearch
+from tqdm import tqdm
 
 load_dotenv()
 
@@ -15,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 def search_google(query, objective):
     """Search Google using SerpAPI."""
-    print(f"Parameters: query={query}, objective={objective}")
+    # print(f"Parameters: query={query}, objective={objective}")
     search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
     results = search.get_dict().get("organic_results", [])
     return {"objective": objective, "results": results}
 
 def scrape_url(url, objective):
     """Scrape a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     scrape_status = app.scrape_url(
         url,
         params={'formats': ['markdown']}
@@ -31,19 +33,19 @@ def scrape_url(url, objective):
 
 def crawl_url(url, objective):
     """Crawl a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     # If using a crawled url set, pass the ID in the function call below
     # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
     # scrape_status['results'] = scrape_status['data']
     scrape_status = app.crawl_url(
         url,
-        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
+        params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}}
     )
     return {"objective": objective, "results": scrape_status}
 
 def analyze_website_content(content, objective):
     """Analyze the scraped website content using OpenAI."""
-    print(f"Parameters: content={content[:50]}..., objective={objective}")
+    # print(f"Parameters: content={content[:50]}..., objective={objective}")
     analysis = generate_completion(
         "website data extractor",
         f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
@@ -53,7 +55,7 @@ def analyze_website_content(content, objective):
 
 def generate_completion(role, task, content):
     """Generate a completion using OpenAI."""
-    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=[
@@ -86,13 +88,18 @@ def process_websites(file_path):
         if search_results['results']:
             top_result = search_results['results'][0]
             url = top_result['link']
+            unique_filename = f'output_{uuid.uuid4()}.json'
             crawl_results = crawl_url(url, "Crawl website")
             if crawl_results['results']:
-                for each_result in crawl_results['results']['data'][:2]:
-                    analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.")
-                    print(analysis_results['results'])
-                    results.append(json.loads(analysis_results['results']))
-    write_results_to_json(results, 'enriched_data.json')
+                for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
+                    analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.")
+                    try:
+                        result = json.loads(analysis_results['results'])
+                        if result:
+                            results.append(result)
+                            write_results_to_json(results, unique_filename)
+                    except:
+                        continue
 
 if __name__ == "__main__":
     # Process websites from the CSV file

From 42ec08c76ea8ae1d5e9228cc072a52af2ab301e1 Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sat, 19 Oct 2024 03:53:41 +0530
Subject: [PATCH 08/12] Update websites.csv

---
 examples/sales_web_crawler/websites.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv
index eef3403e..f029ccfc 100644
--- a/examples/sales_web_crawler/websites.csv
+++ b/examples/sales_web_crawler/websites.csv
@@ -1,2 +1,2 @@
 website
-https://precog.iiit.ac.in/
+https://www.media.mit.edu/

From 8a4ee4482d703bf5b7b45aeb2027a6482b2a211c Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sat, 19 Oct 2024 03:54:14 +0530
Subject: [PATCH 09/12] Create output_01f6efd5-1297-4745-94b5-5972c10f17d6.json

---
 ..._01f6efd5-1297-4745-94b5-5972c10f17d6.json | 630 ++++++++++++++++++
 1 file changed, 630 insertions(+)
 create mode 100644 examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json

diff --git a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json
new file mode 100644
index 00000000..8f1f5bd8
--- /dev/null
+++ b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json
@@ -0,0 +1,630 @@
+[
+    {
+        "contacts": [
+            {
+                "name": "Canan Dagdeviren",
+                "email": null,
+                "title": null,
+                "company": null
+            },
+            {
+                "name": "Media Lab Communications",
+                "email": "press@media.mit.edu",
+                "title": null,
+                "company": "MIT Media Lab"
+            }
+        ]
+    },
+    {
+        "people": [
+            {
+                "name": "Xan Foote",
+                "title": "Group Contact",
+                "email": "fluidadmin@media.mit.edu"
+            }
+        ],
+        "companies": [
+            {
+                "name": "MIT Media Lab",
+                "title": "Fluid Interfaces"
+            }
+        ]
+    },
+    {
+        "emails": [],
+        "people": [
+            {
+                "name": "Personal Robots",
+                "title": "Group",
+                "company": "MIT Media Lab"
+            }
+        ],
+        "companies": [
+            {
+                "name": "MIT Media Lab",
+                "title": "Personal Robots group"
+            }
+        ]
+    },
+    {
+        "people": [
+            {
+                "name": "David Sweeney",
+                "title": "Author"
+            },
+            {
+                "name": "Rosalind W. Picard",
+                "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology"
+            },
+            {
+                "name": "Pattie Maes",
+                "title": "Professor of Media Technology; Germeshausen Professor"
+            },
+            {
+                "name": "Hugh Herr",
+                "title": "Professor of Media Arts and Sciences"
+            },
+            {
+                "name": "Deblina Sarkar",
+                "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor"
+            },
+            {
+                "name": "Canan Dagdeviren",
+                "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences"
+            },
+            {
+                "name": "Dava Newman",
+                "title": "Director; Apollo Professor of Astronautics"
+            },
+            {
+                "name": "Cynthia Breazeal",
+                "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning"
+            },
+            {
+                "name": "Susan Blumenthal, MD",
+                "title": "Visiting Professor; Director's Circle Member"
+            }
+        ],
+        "emails": [],
+        "companies": []
+    },
+    {
+        "people": [
+            {
+                "name": "Dan Blondell",
+                "title": "I2"
+            }
+        ],
+        "companies": [],
+        "emails": []
+    },
+    {
+        "people": [
+            {
+                "name": "Canan Dagdeviren",
+                "title": "Copyright Holder"
+            },
+            {
+                "name": "Jonathan Williams",
+                "title": "Copyright Holder"
+            },
+            {
+                "name": "Sara V. Fernandez",
+                "title": "Courtesy of"
+            },
+            {
+                "name": "Irmandy Wicaksono",
+                "title": "Courtesy of"
+            }
+        ],
+        "companies": [
+            {
+                "name": "MIT Media Lab",
+                "title": "Interdisciplinary Research Institution"
+            }
+        ],
+        "emails": []
+    },
+    {
+        "people": [
+            {
+                "name": "David Sweeney",
+                "title": null
+            },
+            {
+                "name": "Sarah Beckmann",
+                "title": null
+            },
+            {
+                "name": "Behnaz Farahi",
+                "title": "Assistant Professor, Transformative Design"
+            },
+            {
+                "name": "Paul Liang",
+                "title": "Assistant Professor, AI + Human Experience"
+            },
+            {
+                "name": "Rosalind W. Picard",
+                "title": null
+            },
+            {
+                "name": "Guillermo Herrera-Arcos",
+                "title": null
+            },
+            {
+                "name": "Christine Higgins",
+                "title": null
+            },
+            {
+                "name": "Patrick Chwalek",
+                "title": null
+            },
+            {
+                "name": "Sarra Shubart",
+                "title": null
+            },
+            {
+                "name": "Amanda Diehl",
+                "title": null
+            },
+            {
+                "name": "Chia Evers",
+                "title": null
+            },
+            {
+                "name": "Matthew Groh",
+                "title": null
+            },
+            {
+                "name": "Cl\u00e9mence Taillandier",
+                "title": null
+            },
+            {
+                "name": "Cody Paige",
+                "title": null
+            },
+            {
+                "name": "Minoo Rathnasabapathy",
+                "title": null
+            },
+            {
+                "name": "Alex Berke",
+                "title": null
+            }
+        ],
+        "emails": [
+            "web-admin@media.mit.edu"
+        ],
+        "companies": [
+            {
+                "name": "MIT Media Lab"
+            },
+            {
+                "name": "Samsung"
+            },
+            {
+                "name": "Castrol"
+            }
+        ]
+    },
+    {
+        "people": [
+            {
+                "name": "Tod Machover",
+                "title": "Opera Composer"
+            }
+        ],
+        "companies": [
+            {
+                "name": "Future Worlds",
+                "title": "Design and action for the future we want to live in"
+            },
+            {
+                "name": "NOAA",
+                "title": "The Challenge: To secure a sustainable future for all living things"
+            },
+            {
+                "name": "MIT Media Lab",
+                "title": "Research and development in interdisciplinary expertise"
+            }
+        ]
+    },
+    {
+        "emails": [
+            "r-admin@media.mit.edu"
+        ],
+        "people": [
+            {
+                "name": "Affective Computing group",
+                "title": "MIT Media Lab"
+            }
+        ],
+        "companies": []
+    },
+    {
+        "people": [
+            {
+                "name": "David Sweeney",
+                "email": null,
+                "title": "Author at Samsung Newsroom"
+            },
+            {
+                "name": "Pattie Maes",
+                "email": null,
+                "title": "Professor of Media Technology; Germeshausen Professor"
+            },
+            {
+                "name": "Rosalind W. Picard",
+                "email": null,
+                "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology"
+            }
+        ],
+        "companies": [
+            {
+                "name": "Samsung",
+                "email": null,
+                "title": "Collaborator"
+            },
+            {
+                "name": "MIT Media Lab",
+                "email": null,
+                "title": "Collaborator"
+            }
+        ]
+    },
+    {
+        "people": [
+            {
+                "name": "Canan Dagdeviren",
+                "title": null
+            },
+            {
+                "name": "Jonathan Williams",
+                "title": null
+            },
+            {
+                "name": "Sara V. Fernandez",
+                "title": null
+            },
+            {
+                "name": "Irmandy Wicaksono",
+                "title": null
+            }
+        ],
+        "companies": [
+            {
+                "name": "MIT Media Lab",
+                "title": null
+            }
+        ],
+        "emails": []
+    },
+    {
+        "people": [],
+        "emails": [],
+        "companies": [],
+        "titles": []
+    },
+    {
+        "emails": [],
+        "people": [
+            {
+                "name": "Andy Ryan",
+                "title": "Photographer",
+                "company": "MIT Media Lab"
+            }
+        ],
+        "companies": [
+            {
+                "name": "MIT Media Lab",
+                "department": "Program in Media Arts and Sciences"
+            },
+            {
+                "name": "MIT",
+                "department": "Center for Bits and Atoms"
+            }
+        ]
+    },
+    {
+        "people": [
+            {
+                "name": "Dan Allen",
+                "title": "Media Lab"
+            }
+        ],
+        "companies": [
+            {
+                "name": "MIT Media Lab"
+            },
+            {
+                "name": "Castrol"
+            }
+        ],
+        "emails": []
+    },
+    {
+        "people": [
+            {
+                "name": "Pat Pataranutaporn",
+                "title": "Former Graduate Student"
+            },
+            {
+                "name": "Pattie Maes",
+                "title": "Professor of Media Technology; Germeshausen Professor"
+            },
+            {
+                "name": "Kavin Winson",
+                "title": "Researcher at KASIKORN Labs"
+            },
+            {
+                "name": "Peggy Yin",
+                "title": "Harvard University Undergraduate"
+            },
+            {
+                "name": "Auttasak Lapapirojn",
+                "title": "KASIKORN Labs"
+            },
+            {
+                "name": "Pichayoot Ouppaphan",
+                "title": "KASIKORN Labs"
+            },
+            {
+                "name": "Monchai Lertsutthiwong",
+                "title": "Head of AI Research at KASIKORN Business-Technology Group"
+            },
+            {
+                "name": "Hal Hershfield",
+                "title": "Professor of Marketing, Behavioral Decision Making, and Psychology at the University of California at Los Angeles"
+            },
+            {
+                "name": "Jeremy Bailenson",
+                "title": "Thomas More Storke Professor of Communication at Stanford University"
+            },
+            {
+                "name": "Thanawit Prasongpongchai",
+                "title": "Designer at KBTG and Visiting Scientist at the Media Lab"
+            }
+        ],
+        "companies": [
+            {
+                "name": "MIT",
+                "role": "AI and simulation research"
+            },
+            {
+                "name": "KASIKORN Labs",
+                "role": "Research and co-authorship"
+            },
+            {
+                "name": "KASIKORN Business-Technology Group",
+                "role": "AI research support"
+            }
+        ]
+    },
+    {
+        "people": [
+            {
+                "name": "Andy Ryan",
+                "title": "Copyright"
+            }
+        ],
+        "companies": [
+            {
+                "name": "MIT Media Lab",
+                "collaborator": "Castrol",
+                "project": "Space Research"
+            }
+        ],
+        "emails": []
+    },
+    {
+        "people": [
+            {
+                "name": "Fadel Adib",
+                "title": "Associate Professor of Media Arts and Sciences"
+            },
+            {
+                "name": "Edward Boyden",
+                "title": "Professor of Media Arts and Sciences; Y. Eva Tan Professor in Neurotechnology"
+            },
+            {
+                "name": "Cynthia Breazeal",
+                "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning"
+            },
+            {
+                "name": "Canan Dagdeviren",
+                "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences"
+            },
+            {
+                "name": "Kevin Esvelt",
+                "title": "Associate Professor of Media Arts and Sciences; NEC Career Development Professor of Computer and Communications"
+            },
+            {
+                "name": "Behnaz Farahi",
+                "title": "Assistant Professor of Media Arts and Sciences; Asahi Broadcast Corp Career Development Assistant Professor"
+            },
+            {
+                "name": "Hugh Herr",
+                "title": "Professor of Media Arts and Sciences"
+            },
+            {
+                "name": "Hiroshi Ishii",
+                "title": "Jerome B. Wiesner Professor of Media Arts and Sciences; Associate Director, MIT Media Lab"
+            },
+            {
+                "name": "Joseph M. Jacobson",
+                "title": "Associate Professor of Media Arts and Sciences"
+            },
+            {
+                "name": "Kent Larson",
+                "title": "Professor of the Practice"
+            },
+            {
+                "name": "Paul Pu Liang",
+                "title": "Assistant Professor of Media Arts and Sciences; Assistant Professor of Electrical Engineering and Computer Science"
+            },
+            {
+                "name": "Zach Lieberman",
+                "title": "Adjunct Associate Professor of Media Arts and Sciences"
+            },
+            {
+                "name": "Andrew Lippman",
+                "title": "Senior Research Scientist"
+            },
+            {
+                "name": "Tod Machover",
+                "title": "Muriel R. Cooper Professor of Music and Media; Academic Head, Program in Media Arts and Sciences"
+            },
+            {
+                "name": "Pattie Maes",
+                "title": "Professor of Media Technology; Germeshausen Professor"
+            },
+            {
+                "name": "Dava Newman",
+                "title": "Director; Apollo Professor of Astronautics"
+            },
+            {
+                "name": "Joseph A. Paradiso",
+                "title": "Alexander W Dreyfoos (1954) Professor; Associate Academic Head, Program in Media Arts and Sciences"
+            },
+            {
+                "name": "Alex 'Sandy' Pentland",
+                "title": "Professor Post Tenure of Media Arts and Sciences"
+            },
+            {
+                "name": "Rosalind W. Picard",
+                "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology"
+            },
+            {
+                "name": "Ramesh Raskar",
+                "title": "Associate Professor of Media Arts and Sciences"
+            },
+            {
+                "name": "Mitchel Resnick",
+                "title": "LEGO Papert Professor of Learning Research"
+            },
+            {
+                "name": "Deb Roy",
+                "title": "Professor of Media Arts and Sciences"
+            },
+            {
+                "name": "Deblina Sarkar",
+                "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor"
+            },
+            {
+                "name": "Danielle Wood",
+                "title": "Associate Professor of Media Arts and Sciences; Associate Professor (Joint) of Aeronautics and Astronautics"
+            }
+        ],
+        "emails": [],
+        "companies": []
+    },
+    {
+        "people": [
+            {
+                "name": "Canan Dagdeviren",
+                "title": "Individual",
+                "email": null
+            },
+            {
+                "name": "Jonathan Williams",
+                "title": "Individual",
+                "email": null
+            }
+        ],
+        "companies": [],
+        "emails": []
+    },
+    {
+        "people": [
+            {
+                "name": "Dava Newman",
+                "title": "Media Lab Director"
+            },
+            {
+                "name": "Xin Liu",
+                "title": "Media Lab Alum"
+            }
+        ],
+        "companies": [
+            {
+                "name": "MIT Media Lab"
+            },
+            {
+                "name": "Boston Museum of Science"
+            }
+        ],
+        "emails": []
+    },
+    {
+        "people": [
+            {
+                "name": "Behnaz Farahi",
+                "title": "Assistant Professor, Transformative Design",
+                "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS)"
+            },
+            {
+                "name": "Paul Liang",
+                "title": "Assistant Professor, AI + Human Experience",
+                "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS) and MIT Schwarzman College of Computing"
+            },
+            {
+                "name": "Barmak Heshmat",
+                "title": "Co-founder",
+                "company": "Brelyon"
+            },
+            {
+                "name": "Mohammad Tariqul Islam",
+                "title": "MIT-Novo Nordisk Artificial Intelligence Postdoctoral Fellow"
+            },
+            {
+                "name": "Hao-Tung Yang",
+                "title": "Recipient of the T.S. Lin Fellowship Award"
+            },
+            {
+                "name": "Deblina Sarkar",
+                "title": "Recipient of NSF CAREER Award and ChadTough New Investigator Award"
+            },
+            {
+                "name": "Danielle Wood",
+                "title": "2024 Just Tech Fellow"
+            },
+            {
+                "name": "Baju Joy",
+                "title": "Whitaker Health Sciences Fellowship Award Recipient"
+            },
+            {
+                "name": "Max Addae",
+                "title": "2024 Guthman Musical Instrument Competition Winner"
+            },
+            {
+                "name": "Tod Machover",
+                "title": "Head of Opera of the Future",
+                "affiliation": "MIT"
+            },
+            {
+                "name": "Sharif Islam",
+                "title": "ESIP Community Fellow and Future Earth Coasts Fellow",
+                "affiliation": "Postdoctoral associate in the Space Enabled research group"
+            }
+        ],
+        "companies": [
+            {
+                "name": "Samsung",
+                "collaboration": "MIT Media Lab"
+            },
+            {
+                "name": "Brelyon",
+                "co_founder": "Barmak Heshmat"
+            },
+            {
+                "name": "Castrol",
+                "collaboration": "AstroAnt Payload Program"
+            },
+            {
+                "name": "Augmental",
+                "product": "Mouth-based touchpad"
+            }
+        ],
+        "email_addresses": []
+    }
+]

From d113199a297a98e0b13a4438838486bb2f21f736 Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sun, 20 Oct 2024 18:08:38 +0530
Subject: [PATCH 10/12] Update app.py

---
 examples/sales_web_crawler/app.py | 92 +++++++++++++++++++------------
 1 file changed, 58 insertions(+), 34 deletions(-)

diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py
index f76280e9..842e1345 100644
--- a/examples/sales_web_crawler/app.py
+++ b/examples/sales_web_crawler/app.py
@@ -1,13 +1,13 @@
 import csv
 import json
 import os
-import uuid
 
 from dotenv import load_dotenv
 from firecrawl import FirecrawlApp
 from openai import OpenAI
 from serpapi import GoogleSearch
-from tqdm import tqdm
+from swarm import Agent
+from swarm.repl import run_demo_loop
 
 load_dotenv()
 
@@ -17,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 def search_google(query, objective):
     """Search Google using SerpAPI."""
-    # print(f"Parameters: query={query}, objective={objective}")
+    print(f"Parameters: query={query}, objective={objective}")
     search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
     results = search.get_dict().get("organic_results", [])
     return {"objective": objective, "results": results}
 
 def scrape_url(url, objective):
     """Scrape a website using Firecrawl."""
-    # print(f"Parameters: url={url}, objective={objective}")
+    print(f"Parameters: url={url}, objective={objective}")
     scrape_status = app.scrape_url(
         url,
         params={'formats': ['markdown']}
@@ -33,29 +33,29 @@ def scrape_url(url, objective):
 
 def crawl_url(url, objective):
     """Crawl a website using Firecrawl."""
-    # print(f"Parameters: url={url}, objective={objective}")
+    print(f"Parameters: url={url}, objective={objective}")
     # If using a crawled url set, pass the ID in the function call below
     # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
     # scrape_status['results'] = scrape_status['data']
     scrape_status = app.crawl_url(
         url,
-        params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}}
+        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
     )
     return {"objective": objective, "results": scrape_status}
 
 def analyze_website_content(content, objective):
     """Analyze the scraped website content using OpenAI."""
-    # print(f"Parameters: content={content[:50]}..., objective={objective}")
+    print(f"Parameters: content={content[:50]}..., objective={objective}")
     analysis = generate_completion(
         "website data extractor",
         f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
         "Objective: " + objective + "\nContent: " + content
     )
-    return {"objective": objective, "results": analysis}
+    return {"objective": objective, "results": json.loads(analysis)}
 
 def generate_completion(role, task, content):
     """Generate a completion using OpenAI."""
-    # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=[
@@ -76,31 +76,55 @@ def read_websites_from_csv(file_path):
 
 def write_results_to_json(results, file_path):
     """Write results to a JSON file."""
-    with open(file_path, mode='w') as file:
-        json.dump(results, file, indent=4)
+    with open(file_path, mode='w', encoding='utf-8') as file:
+        json.dump(json.loads(results), file, ensure_ascii=False)
 
-def process_websites(file_path):
-    """Process websites from a CSV file and write results to a new JSON file."""
-    results = []
-    websites = read_websites_from_csv(file_path)
-    for website in websites:
-        search_results = search_google(website, "Search website")
-        if search_results['results']:
-            top_result = search_results['results'][0]
-            url = top_result['link']
-            unique_filename = f'output_{uuid.uuid4()}.json'
-            crawl_results = crawl_url(url, "Crawl website")
-            if crawl_results['results']:
-                for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
-                    analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.")
-                    try:
-                        result = json.loads(analysis_results['results'])
-                        if result:
-                            results.append(result)
-                            write_results_to_json(results, unique_filename)
-                    except:
-                        continue
+def handoff_to_search_google():
+    """Hand off the search query to the search google agent."""
+    return google_search_agent
+
+def handoff_to_map_url():
+    """Hand off the url to the map url agent."""
+    return crawl_website_agent
+
+def handoff_to_analyst():
+    """Hand off the website content to the analyst agent."""
+    return analyst_agent
+
+def handoff_to_writer():
+    """Hand off the results to the writer agent."""
+    return writer_agent
+
+user_interface_agent = Agent(
+    name="User Interface Agent",
+    instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. Be concise.",
+    functions=[read_websites_from_csv, handoff_to_search_google],
+)
+
+google_search_agent = Agent(
+    name="Google Search Agent",
+    instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.",
+    functions=[search_google, handoff_to_map_url],
+)
+
+crawl_website_agent = Agent(
+    name="Crawl Website Agent",
+    instructions="You are a crawl url agent specialized in crawling the web pages. When you are done, you must hand off the results to the analyst agent.",
+    functions=[crawl_url, handoff_to_analyst],
+)
+
+analyst_agent = Agent(
+    name="Analyst Agent",
+    instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must hand off the results to the writer agent.",
+    functions=[analyze_website_content, handoff_to_writer],
+)
+
+writer_agent = Agent(
+    name="Writer Agent",
+    instructions="You are a writer agent that writes the final results to a JSON file.",
+    functions=[write_results_to_json],
+)
 
 if __name__ == "__main__":
-    # Process websites from the CSV file
-    process_websites('websites.csv')
+    # Run the demo loop with the user interface agent
+    run_demo_loop(user_interface_agent, stream=True)

From cf98d69bbbf5e8d9afd546ef7dff74373bce7249 Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain <jain71000@gmail.com>
Date: Sun, 20 Oct 2024 18:09:38 +0530
Subject: [PATCH 11/12] Update requirements.txt

---
 examples/sales_web_crawler/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/sales_web_crawler/requirements.txt b/examples/sales_web_crawler/requirements.txt
index 180c5d6c..d7be486c 100644
--- a/examples/sales_web_crawler/requirements.txt
+++ b/examples/sales_web_crawler/requirements.txt
@@ -1,4 +1,4 @@
 firecrawl-py
 openai
 google-search-results
-tqdm
+git+https://github.com/openai/swarm.git

From 22d375ad293296c3533c2195bd6be9a3fbb841ad Mon Sep 17 00:00:00 2001
From: Eric Ciarla <ericciarla@yahoo.com>
Date: Mon, 21 Oct 2024 12:01:09 -0400
Subject: [PATCH 12/12] Updates

---
 examples/sales_web_crawler/app.py             | 116 +---
 ..._01f6efd5-1297-4745-94b5-5972c10f17d6.json | 630 ------------------
 examples/sales_web_crawler/websites.csv       |   2 -
 3 files changed, 32 insertions(+), 716 deletions(-)
 delete mode 100644 examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json
 delete mode 100644 examples/sales_web_crawler/websites.csv

diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py
index 842e1345..70063071 100644
--- a/examples/sales_web_crawler/app.py
+++ b/examples/sales_web_crawler/app.py
@@ -15,43 +15,35 @@ load_dotenv()
 app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
-def search_google(query, objective):
-    """Search Google using SerpAPI."""
-    print(f"Parameters: query={query}, objective={objective}")
-    search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
-    results = search.get_dict().get("organic_results", [])
-    return {"objective": objective, "results": results}
-
-def scrape_url(url, objective):
-    """Scrape a website using Firecrawl."""
+def crawl_and_analyze_url(url, objective):
+    """Crawl a website using Firecrawl and analyze the content."""
     print(f"Parameters: url={url}, objective={objective}")
-    scrape_status = app.scrape_url(
+    # Crawl the website
+    crawl_status = app.crawl_url(
         url,
-        params={'formats': ['markdown']}
+        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}},
+        poll_interval=5
     )
-    return {"objective": objective, "results": scrape_status}
-
-def crawl_url(url, objective):
-    """Crawl a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
-    # If using a crawled url set, pass the ID in the function call below
-    # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
-    # scrape_status['results'] = scrape_status['data']
-    scrape_status = app.crawl_url(
-        url,
-        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
-    )
-    return {"objective": objective, "results": scrape_status}
-
-def analyze_website_content(content, objective):
-    """Analyze the scraped website content using OpenAI."""
-    print(f"Parameters: content={content[:50]}..., objective={objective}")
-    analysis = generate_completion(
-        "website data extractor",
-        f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
-        "Objective: " + objective + "\nContent: " + content
-    )
-    return {"objective": objective, "results": json.loads(analysis)}
+    crawl_status = crawl_status['data']
+    # Analyze each crawled item's markdown separately; items without markdown are skipped
+    combined_results = []
+    for item in crawl_status:
+        if 'markdown' in item:
+            content = item['markdown']
+            # Analyze the content
+            analysis = generate_completion(
+                "website data extractor",
+                f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
+                "Objective: " + objective + "\nContent: " + content
+            )
+            # Parse the JSON result
+            try:
+                result = json.loads(analysis)
+                combined_results.append(result)
+            except json.JSONDecodeError:
+                print(f"Could not parse JSON from analysis: {analysis}")
+    # Return the per-page analysis results together with the objective
+    return {"objective": objective, "results": combined_results}
 
 def generate_completion(role, task, content):
     """Generate a completion using OpenAI."""
@@ -65,64 +57,20 @@ def generate_completion(role, task, content):
     )
     return response.choices[0].message.content
 
-def read_websites_from_csv(file_path):
-    """Read websites from a CSV file."""
-    websites = []
-    with open(file_path, mode='r') as file:
-        csv_reader = csv.DictReader(file)
-        for row in csv_reader:
-            websites.append(row['website'])
-    return websites
-
-def write_results_to_json(results, file_path):
-    """Write results to a JSON file."""
-    with open(file_path, mode='w', encoding='utf-8') as file:
-        json.dump(json.loads(results), file, ensure_ascii=False)
-
-def handoff_to_search_google():
-    """Hand off the search query to the search google agent."""
-    return google_search_agent
-
-def handoff_to_map_url():
-    """Hand off the url to the map url agent."""
+def handoff_to_crawl_url():
+    """Hand off the url to the crawl url agent."""
     return crawl_website_agent
 
-def handoff_to_analyst():
-    """Hand off the website content to the analyst agent."""
-    return analyst_agent
-
-def handoff_to_writer():
-    """Hand off the results to the writer agent."""
-    return writer_agent
-
 user_interface_agent = Agent(
     name="User Interface Agent",
-    instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. Be concise.",
-    functions=[read_websites_from_csv, handoff_to_search_google],
-)
-
-google_search_agent = Agent(
-    name="Google Search Agent",
-    instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.",
-    functions=[search_google, handoff_to_map_url],
+    instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.",
+    functions=[handoff_to_crawl_url],
 )
 
 crawl_website_agent = Agent(
     name="Crawl Website Agent",
-    instructions="You are a crawl url agent specialized in crawling the web pages. When you are done, you must hand off the results to the analyst agent.",
-    functions=[crawl_url, handoff_to_analyst],
-)
-
-analyst_agent = Agent(
-    name="Analyst Agent",
-    instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must hand off the results to the writer agent.",
-    functions=[analyze_website_content, handoff_to_writer],
-)
-
-writer_agent = Agent(
-    name="Writer Agent",
-    instructions="You are a writer agent that writes the final results to a JSON file.",
-    functions=[write_results_to_json],
+    instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.",
+    functions=[crawl_and_analyze_url],
 )
 
 if __name__ == "__main__":
diff --git a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json
deleted file mode 100644
index 8f1f5bd8..00000000
--- a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json
+++ /dev/null
@@ -1,630 +0,0 @@
-[
-    {
-        "contacts": [
-            {
-                "name": "Canan Dagdeviren",
-                "email": null,
-                "title": null,
-                "company": null
-            },
-            {
-                "name": "Media Lab Communications",
-                "email": "press@media.mit.edu",
-                "title": null,
-                "company": "MIT Media Lab"
-            }
-        ]
-    },
-    {
-        "people": [
-            {
-                "name": "Xan Foote",
-                "title": "Group Contact",
-                "email": "fluidadmin@media.mit.edu"
-            }
-        ],
-        "companies": [
-            {
-                "name": "MIT Media Lab",
-                "title": "Fluid Interfaces"
-            }
-        ]
-    },
-    {
-        "emails": [],
-        "people": [
-            {
-                "name": "Personal Robots",
-                "title": "Group",
-                "company": "MIT Media Lab"
-            }
-        ],
-        "companies": [
-            {
-                "name": "MIT Media Lab",
-                "title": "Personal Robots group"
-            }
-        ]
-    },
-    {
-        "people": [
-            {
-                "name": "David Sweeney",
-                "title": "Author"
-            },
-            {
-                "name": "Rosalind W. Picard",
-                "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology"
-            },
-            {
-                "name": "Pattie Maes",
-                "title": "Professor of Media Technology; Germeshausen Professor"
-            },
-            {
-                "name": "Hugh Herr",
-                "title": "Professor of Media Arts and Sciences"
-            },
-            {
-                "name": "Deblina Sarkar",
-                "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor"
-            },
-            {
-                "name": "Canan Dagdeviren",
-                "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences"
-            },
-            {
-                "name": "Dava Newman",
-                "title": "Director; Apollo Professor of Astronautics"
-            },
-            {
-                "name": "Cynthia Breazeal",
-                "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning"
-            },
-            {
-                "name": "Susan Blumenthal, MD",
-                "title": "Visiting Professor; Director's Circle Member"
-            }
-        ],
-        "emails": [],
-        "companies": []
-    },
-    {
-        "people": [
-            {
-                "name": "Dan Blondell",
-                "title": "I2"
-            }
-        ],
-        "companies": [],
-        "emails": []
-    },
-    {
-        "people": [
-            {
-                "name": "Canan Dagdeviren",
-                "title": "Copyright Holder"
-            },
-            {
-                "name": "Jonathan Williams",
-                "title": "Copyright Holder"
-            },
-            {
-                "name": "Sara V. Fernandez",
-                "title": "Courtesy of"
-            },
-            {
-                "name": "Irmandy Wicaksono",
-                "title": "Courtesy of"
-            }
-        ],
-        "companies": [
-            {
-                "name": "MIT Media Lab",
-                "title": "Interdisciplinary Research Institution"
-            }
-        ],
-        "emails": []
-    },
-    {
-        "people": [
-            {
-                "name": "David Sweeney",
-                "title": null
-            },
-            {
-                "name": "Sarah Beckmann",
-                "title": null
-            },
-            {
-                "name": "Behnaz Farahi",
-                "title": "Assistant Professor, Transformative Design"
-            },
-            {
-                "name": "Paul Liang",
-                "title": "Assistant Professor, AI + Human Experience"
-            },
-            {
-                "name": "Rosalind W. Picard",
-                "title": null
-            },
-            {
-                "name": "Guillermo Herrera-Arcos",
-                "title": null
-            },
-            {
-                "name": "Christine Higgins",
-                "title": null
-            },
-            {
-                "name": "Patrick Chwalek",
-                "title": null
-            },
-            {
-                "name": "Sarra Shubart",
-                "title": null
-            },
-            {
-                "name": "Amanda Diehl",
-                "title": null
-            },
-            {
-                "name": "Chia Evers",
-                "title": null
-            },
-            {
-                "name": "Matthew Groh",
-                "title": null
-            },
-            {
-                "name": "Cl\u00e9mence Taillandier",
-                "title": null
-            },
-            {
-                "name": "Cody Paige",
-                "title": null
-            },
-            {
-                "name": "Minoo Rathnasabapathy",
-                "title": null
-            },
-            {
-                "name": "Alex Berke",
-                "title": null
-            }
-        ],
-        "emails": [
-            "web-admin@media.mit.edu"
-        ],
-        "companies": [
-            {
-                "name": "MIT Media Lab"
-            },
-            {
-                "name": "Samsung"
-            },
-            {
-                "name": "Castrol"
-            }
-        ]
-    },
-    {
-        "people": [
-            {
-                "name": "Tod Machover",
-                "title": "Opera Composer"
-            }
-        ],
-        "companies": [
-            {
-                "name": "Future Worlds",
-                "title": "Design and action for the future we want to live in"
-            },
-            {
-                "name": "NOAA",
-                "title": "The Challenge: To secure a sustainable future for all living things"
-            },
-            {
-                "name": "MIT Media Lab",
-                "title": "Research and development in interdisciplinary expertise"
-            }
-        ]
-    },
-    {
-        "emails": [
-            "r-admin@media.mit.edu"
-        ],
-        "people": [
-            {
-                "name": "Affective Computing group",
-                "title": "MIT Media Lab"
-            }
-        ],
-        "companies": []
-    },
-    {
-        "people": [
-            {
-                "name": "David Sweeney",
-                "email": null,
-                "title": "Author at Samsung Newsroom"
-            },
-            {
-                "name": "Pattie Maes",
-                "email": null,
-                "title": "Professor of Media Technology; Germeshausen Professor"
-            },
-            {
-                "name": "Rosalind W. Picard",
-                "email": null,
-                "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology"
-            }
-        ],
-        "companies": [
-            {
-                "name": "Samsung",
-                "email": null,
-                "title": "Collaborator"
-            },
-            {
-                "name": "MIT Media Lab",
-                "email": null,
-                "title": "Collaborator"
-            }
-        ]
-    },
-    {
-        "people": [
-            {
-                "name": "Canan Dagdeviren",
-                "title": null
-            },
-            {
-                "name": "Jonathan Williams",
-                "title": null
-            },
-            {
-                "name": "Sara V. Fernandez",
-                "title": null
-            },
-            {
-                "name": "Irmandy Wicaksono",
-                "title": null
-            }
-        ],
-        "companies": [
-            {
-                "name": "MIT Media Lab",
-                "title": null
-            }
-        ],
-        "emails": []
-    },
-    {
-        "people": [],
-        "emails": [],
-        "companies": [],
-        "titles": []
-    },
-    {
-        "emails": [],
-        "people": [
-            {
-                "name": "Andy Ryan",
-                "title": "Photographer",
-                "company": "MIT Media Lab"
-            }
-        ],
-        "companies": [
-            {
-                "name": "MIT Media Lab",
-                "department": "Program in Media Arts and Sciences"
-            },
-            {
-                "name": "MIT",
-                "department": "Center for Bits and Atoms"
-            }
-        ]
-    },
-    {
-        "people": [
-            {
-                "name": "Dan Allen",
-                "title": "Media Lab"
-            }
-        ],
-        "companies": [
-            {
-                "name": "MIT Media Lab"
-            },
-            {
-                "name": "Castrol"
-            }
-        ],
-        "emails": []
-    },
-    {
-        "people": [
-            {
-                "name": "Pat Pataranutaporn",
-                "title": "Former Graduate Student"
-            },
-            {
-                "name": "Pattie Maes",
-                "title": "Professor of Media Technology; Germeshausen Professor"
-            },
-            {
-                "name": "Kavin Winson",
-                "title": "Researcher at KASIKORN Labs"
-            },
-            {
-                "name": "Peggy Yin",
-                "title": "Harvard University Undergraduate"
-            },
-            {
-                "name": "Auttasak Lapapirojn",
-                "title": "KASIKORN Labs"
-            },
-            {
-                "name": "Pichayoot Ouppaphan",
-                "title": "KASIKORN Labs"
-            },
-            {
-                "name": "Monchai Lertsutthiwong",
-                "title": "Head of AI Research at KASIKORN Business-Technology Group"
-            },
-            {
-                "name": "Hal Hershfield",
-                "title": "Professor of Marketing, Behavioral Decision Making, and Psychology at the University of California at Los Angeles"
-            },
-            {
-                "name": "Jeremy Bailenson",
-                "title": "Thomas More Storke Professor of Communication at Stanford University"
-            },
-            {
-                "name": "Thanawit Prasongpongchai",
-                "title": "Designer at KBTG and Visiting Scientist at the Media Lab"
-            }
-        ],
-        "companies": [
-            {
-                "name": "MIT",
-                "role": "AI and simulation research"
-            },
-            {
-                "name": "KASIKORN Labs",
-                "role": "Research and co-authorship"
-            },
-            {
-                "name": "KASIKORN Business-Technology Group",
-                "role": "AI research support"
-            }
-        ]
-    },
-    {
-        "people": [
-            {
-                "name": "Andy Ryan",
-                "title": "Copyright"
-            }
-        ],
-        "companies": [
-            {
-                "name": "MIT Media Lab",
-                "collaborator": "Castrol",
-                "project": "Space Research"
-            }
-        ],
-        "emails": []
-    },
-    {
-        "people": [
-            {
-                "name": "Fadel Adib",
-                "title": "Associate Professor of Media Arts and Sciences"
-            },
-            {
-                "name": "Edward Boyden",
-                "title": "Professor of Media Arts and Sciences; Y. Eva Tan Professor in Neurotechnology"
-            },
-            {
-                "name": "Cynthia Breazeal",
-                "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning"
-            },
-            {
-                "name": "Canan Dagdeviren",
-                "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences"
-            },
-            {
-                "name": "Kevin Esvelt",
-                "title": "Associate Professor of Media Arts and Sciences; NEC Career Development Professor of Computer and Communications"
-            },
-            {
-                "name": "Behnaz Farahi",
-                "title": "Assistant Professor of Media Arts and Sciences; Asahi Broadcast Corp Career Development Assistant Professor"
-            },
-            {
-                "name": "Hugh Herr",
-                "title": "Professor of Media Arts and Sciences"
-            },
-            {
-                "name": "Hiroshi Ishii",
-                "title": "Jerome B. Wiesner Professor of Media Arts and Sciences; Associate Director, MIT Media Lab"
-            },
-            {
-                "name": "Joseph M. Jacobson",
-                "title": "Associate Professor of Media Arts and Sciences"
-            },
-            {
-                "name": "Kent Larson",
-                "title": "Professor of the Practice"
-            },
-            {
-                "name": "Paul Pu Liang",
-                "title": "Assistant Professor of Media Arts and Sciences; Assistant Professor of Electrical Engineering and Computer Science"
-            },
-            {
-                "name": "Zach Lieberman",
-                "title": "Adjunct Associate Professor of Media Arts and Sciences"
-            },
-            {
-                "name": "Andrew Lippman",
-                "title": "Senior Research Scientist"
-            },
-            {
-                "name": "Tod Machover",
-                "title": "Muriel R. Cooper Professor of Music and Media; Academic Head, Program in Media Arts and Sciences"
-            },
-            {
-                "name": "Pattie Maes",
-                "title": "Professor of Media Technology; Germeshausen Professor"
-            },
-            {
-                "name": "Dava Newman",
-                "title": "Director; Apollo Professor of Astronautics"
-            },
-            {
-                "name": "Joseph A. Paradiso",
-                "title": "Alexander W Dreyfoos (1954) Professor; Associate Academic Head, Program in Media Arts and Sciences"
-            },
-            {
-                "name": "Alex 'Sandy' Pentland",
-                "title": "Professor Post Tenure of Media Arts and Sciences"
-            },
-            {
-                "name": "Rosalind W. Picard",
-                "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology"
-            },
-            {
-                "name": "Ramesh Raskar",
-                "title": "Associate Professor of Media Arts and Sciences"
-            },
-            {
-                "name": "Mitchel Resnick",
-                "title": "LEGO Papert Professor of Learning Research"
-            },
-            {
-                "name": "Deb Roy",
-                "title": "Professor of Media Arts and Sciences"
-            },
-            {
-                "name": "Deblina Sarkar",
-                "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor"
-            },
-            {
-                "name": "Danielle Wood",
-                "title": "Associate Professor of Media Arts and Sciences; Associate Professor (Joint) of Aeronautics and Astronautics"
-            }
-        ],
-        "emails": [],
-        "companies": []
-    },
-    {
-        "people": [
-            {
-                "name": "Canan Dagdeviren",
-                "title": "Individual",
-                "email": null
-            },
-            {
-                "name": "Jonathan Williams",
-                "title": "Individual",
-                "email": null
-            }
-        ],
-        "companies": [],
-        "emails": []
-    },
-    {
-        "people": [
-            {
-                "name": "Dava Newman",
-                "title": "Media Lab Director"
-            },
-            {
-                "name": "Xin Liu",
-                "title": "Media Lab Alum"
-            }
-        ],
-        "companies": [
-            {
-                "name": "MIT Media Lab"
-            },
-            {
-                "name": "Boston Museum of Science"
-            }
-        ],
-        "emails": []
-    },
-    {
-        "people": [
-            {
-                "name": "Behnaz Farahi",
-                "title": "Assistant Professor, Transformative Design",
-                "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS)"
-            },
-            {
-                "name": "Paul Liang",
-                "title": "Assistant Professor, AI + Human Experience",
-                "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS) and MIT Schwarzman College of Computing"
-            },
-            {
-                "name": "Barmak Heshmat",
-                "title": "Co-founder",
-                "company": "Brelyon"
-            },
-            {
-                "name": "Mohammad Tariqul Islam",
-                "title": "MIT-Novo Nordisk Artificial Intelligence Postdoctoral Fellow"
-            },
-            {
-                "name": "Hao-Tung Yang",
-                "title": "Recipient of the T.S. Lin Fellowship Award"
-            },
-            {
-                "name": "Deblina Sarkar",
-                "title": "Recipient of NSF CAREER Award and ChadTough New Investigator Award"
-            },
-            {
-                "name": "Danielle Wood",
-                "title": "2024 Just Tech Fellow"
-            },
-            {
-                "name": "Baju Joy",
-                "title": "Whitaker Health Sciences Fellowship Award Recipient"
-            },
-            {
-                "name": "Max Addae",
-                "title": "2024 Guthman Musical Instrument Competition Winner"
-            },
-            {
-                "name": "Tod Machover",
-                "title": "Head of Opera of the Future",
-                "affiliation": "MIT"
-            },
-            {
-                "name": "Sharif Islam",
-                "title": "ESIP Community Fellow and Future Earth Coasts Fellow",
-                "affiliation": "Postdoctoral associate in the Space Enabled research group"
-            }
-        ],
-        "companies": [
-            {
-                "name": "Samsung",
-                "collaboration": "MIT Media Lab"
-            },
-            {
-                "name": "Brelyon",
-                "co_founder": "Barmak Heshmat"
-            },
-            {
-                "name": "Castrol",
-                "collaboration": "AstroAnt Payload Program"
-            },
-            {
-                "name": "Augmental",
-                "product": "Mouth-based touchpad"
-            }
-        ],
-        "email_addresses": []
-    }
-]
diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv
deleted file mode 100644
index f029ccfc..00000000
--- a/examples/sales_web_crawler/websites.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-website
-https://www.media.mit.edu/