Create o1_job_recommender.py

2024-11-16 11:42:24 +08:00 · 2024-09-26 14:46:48 -04:00 · 2024-09-26 14:46:48 -04:00 · 5c4d436f1e
commit 5c4d436f1e
parent 51bc2f25fe
1 changed files with 283 additions and 0 deletions
--- a/examples/o1_job_recommender/o1_job_recommender.py
+++ b/examples/o1_job_recommender/o1_job_recommender.py
@ -0,0 +1,283 @@
+# %%
+# %%
+import os
+import requests
+import json
+from dotenv import load_dotenv
+from openai import OpenAI
+
+# ANSI color codes
+class Colors:
+    """ANSI escape sequences used to colorize this script's terminal output.
+
+    Prefix text with one of the color attributes and finish it with
+    ``RESET``, e.g. ``f"{Colors.RED}failed{Colors.RESET}"``.
+    """
+
+    # Foreground colors, listed in ascending order of their numeric code (91-96)
+    RED = "\033[91m"
+    GREEN = "\033[92m"
+    YELLOW = "\033[93m"
+    BLUE = "\033[94m"
+    MAGENTA = "\033[95m"
+    CYAN = "\033[96m"
+
+    # Code 0: emitted after colored text throughout the script
+    RESET = "\033[0m"
+# Make the values from a .env file available through the process environment
+load_dotenv()
+
+# Credentials read from the environment: the Firecrawl key is sent as a bearer
+# token on every scrape request, the OpenAI key configures the chat client
+firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY")
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+# Careers page whose job listings are scraped
+jobs_page_url = "https://openai.com/careers/search"
+
+# Resume text (pasted from a PDF export) that is embedded verbatim in the
+# recommendation prompt. The literal must open with exactly three quotes: a
+# fourth one becomes a stray '"' at the start of the resume.
+resume_paste = """
+Eric Ciarla
+Co-Founder @ Firecrawl
+San Francisco, California, United States
+Summary
+Building…
+Experience
+Firecrawl
+Co-Founder
+April 2024 - Present (6 months)
+San Francisco, California, United States
+Firecrawl by Mendable. Building data extraction infrastructure for AI. Used by
+Amazon, Zapier, and Nvidia (YC S22)
+Mendable
+2 years 7 months
+Co-Founder @ Mendable.ai
+March 2022 - Present (2 years 7 months)
+San Francisco, California, United States
+- Built an AI powered search platform that that served millions of queries for
+hundreds of customers (YC S22)
+- We were one of the first LLM powered apps adopted by industry leaders like
+Coinbase, Snap, DoorDash, and MongoDB
+Co-Founder @ SideGuide
+March 2022 - Present (2 years 7 months)
+San Francisco, California, United States
+- Built and scaled an online course platform with a community of over 50,000
+developers
+- Selected for Y Combinator S22 batch, 2% acceptance rate
+Fracta
+Data Engineer
+2022 - 2022 (less than a year)
+Palo Alto, California, United States
+- Demoed tool during sales calls and provided technical support during the
+entire customer lifecycle
+Page 1 of 2
+- Mined, wrangled, & visualized geospatial and water utility data for predictive
+analytics & ML workflows (Python, QGIS)
+Ford Motor Company
+Data Scientist
+2021 - 2021 (less than a year)
+Dearborn, Michigan, United States
+- Extracted, cleaned, and joined data from multiple sources using SQL,
+Hadoop, and Alteryx
+- Used Bayesian Network Structure Learning (BNLearn, R) to uncover the
+relationships between survey free response verbatim topics (derived from
+natural language processing models) and numerical customer experience
+scores
+MDRemindME
+Co-Founder
+2018 - 2020 (2 years)
+Durham, New Hampshire, United States
+- Founded and led a healthtech startup aimed at improving patient adherence
+to treatment plans through an innovative engagement and retention tool
+- Piloted the product with healthcare providers and patients, gathering critical
+insights to refine functionality and enhance user experience
+- Secured funding through National Science Foundation I-CORPS Grant and
+UNH Entrepreneurship Center Seed Grant
+Education
+Y Combinator
+S22
+University of New Hampshire
+Economics and Philosophy
+"""
+
+# First, scrape the jobs page using Firecrawl.
+# Outputs of this step: `html_content` (the page as markdown; stays "" on any
+# failure) and, on success only, `prompt` (the link-extraction instructions).
+html_content = ""
+try:
+    response = requests.post(
+        "https://api.firecrawl.dev/v1/scrape",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {firecrawl_api_key}"
+        },
+        json={
+            "url": jobs_page_url,
+            "formats": ["markdown"]
+        },
+        # requests waits indefinitely by default; bound the wait so a stalled
+        # connection cannot hang the script.
+        timeout=120
+    )
+
+    if response.status_code == 200:
+        result = response.json()
+        if result.get('success'):
+            page_markdown = result['data']['markdown']
+            # Define the prompt for extracting apply links. The page text is
+            # capped at 100,000 characters to bound the prompt size.
+            prompt = f"""
+            Extract up to 30 job application links from the given markdown content.
+            Return the result as a JSON object with a single key 'apply_links' containing an array of strings (the links).
+            The output should be a valid JSON object, with no additional text.
+            Do not include any JSON markdown formatting or code block indicators.
+            Provide only the raw JSON object as the response.
+
+            Example of the expected format:
+            {{"apply_links": ["https://example.com/job1", "https://example.com/job2", ...]}}
+
+            Markdown content:
+            {page_markdown[:100000]}
+            """
+            # Publish the content only after the prompt was built, so a failure
+            # above leaves html_content empty.
+            html_content = page_markdown
+            print(f"{Colors.GREEN}Successfully scraped the jobs page{Colors.RESET}")
+        else:
+            print(f"{Colors.RED}Failed to scrape the jobs page: {result.get('message', 'Unknown error')}{Colors.RESET}")
+    else:
+        print(f"{Colors.RED}Error {response.status_code}: {response.text}{Colors.RESET}")
+# The JSON handler comes first: in recent requests releases the error raised by
+# response.json() is also a RequestException and would otherwise never get here.
+except json.JSONDecodeError as e:
+    print(f"{Colors.RED}Error decoding JSON response: {str(e)}{Colors.RESET}")
+except requests.RequestException as e:
+    print(f"{Colors.RED}An error occurred while scraping: {str(e)}{Colors.RESET}")
+except Exception as e:
+    print(f"{Colors.RED}An unexpected error occurred while scraping: {str(e)}{Colors.RESET}")
+
+# Extract apply links from the scraped markdown with the OpenAI chat model.
+# Result of this step: `apply_links`, a list of URL strings (empty on any failure).
+apply_links = []
+if html_content:
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ]
+        )
+
+        if completion.choices:
+            print(completion.choices[0].message.content)
+            # The message content may be None; treat that as an empty reply so
+            # it is reported as a JSON decoding problem below.
+            raw_reply = (completion.choices[0].message.content or "").strip()
+            if raw_reply.startswith("```"):
+                # The model may wrap the JSON in a Markdown code fence despite
+                # the prompt: drop the opening fence line and the closing fence.
+                raw_reply = raw_reply.partition("\n")[2].rsplit("```", 1)[0].strip()
+            result = json.loads(raw_reply)
+
+            candidate_links = result['apply_links']
+            if isinstance(candidate_links, list):
+                # Keep only non-empty strings so the scraping loop is never
+                # handed something that is not a URL.
+                apply_links = [item for item in candidate_links if isinstance(item, str) and item.strip()]
+                print(f"{Colors.GREEN}Successfully extracted {len(apply_links)} apply links{Colors.RESET}")
+            else:
+                print(f"{Colors.RED}Expected a list under 'apply_links', got {type(candidate_links).__name__}{Colors.RESET}")
+        else:
+            print(f"{Colors.RED}No apply links extracted{Colors.RESET}")
+    except json.JSONDecodeError as e:
+        print(f"{Colors.RED}Error decoding JSON from OpenAI response: {str(e)}{Colors.RESET}")
+    except KeyError as e:
+        print(f"{Colors.RED}Expected key not found in OpenAI response: {str(e)}{Colors.RESET}")
+    except Exception as e:
+        print(f"{Colors.RED}An unexpected error occurred during extraction: {str(e)}{Colors.RESET}")
+else:
+    print(f"{Colors.RED}No HTML content to process{Colors.RESET}")
+
+# Initialize a list to store the extracted data
+extracted_data = []
+
+
+# %%
+# List every extracted apply link under a cyan heading, one yellow line per link
+print(Colors.CYAN + "Apply links:" + Colors.RESET)
+for url in apply_links:
+    print(f"{Colors.YELLOW}{url}{Colors.RESET}")
+
+# %%
+# Process each apply link: ask Firecrawl for structured data about the posting
+# and append every successful result to `extracted_data`. A failure on one link
+# is reported and the loop moves on to the next link.
+
+# Request pieces that are identical for every link are built once, up front.
+scrape_headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {firecrawl_api_key}"
+}
+job_schema = {
+    "type": "object",
+    "properties": {
+        "job_title": {"type": "string"},
+        "sub_division_of_organization": {"type": "string"},
+        "key_skills": {"type": "array", "items": {"type": "string"}},
+        "compensation": {"type": "string"},
+        "location": {"type": "string"},
+        "apply_link": {"type": "string"}
+    },
+    "required": ["job_title", "sub_division_of_organization", "key_skills", "compensation", "location", "apply_link"]
+}
+
+for index, link in enumerate(apply_links):
+    try:
+        response = requests.post(
+            "https://api.firecrawl.dev/v1/scrape",
+            headers=scrape_headers,
+            json={
+                "url": link,
+                "formats": ["extract"],
+                "actions": [{
+                    "type": "click",
+                    "selector": "#job-overview"
+                }],
+                "extract": {
+                    "schema": job_schema
+                }
+            },
+            # Bound the wait so one stalled request cannot block the whole run.
+            timeout=120
+        )
+
+        if response.status_code == 200:
+            result = response.json()
+            if result.get('success'):
+                extracted_data.append(result['data']['extract'])
+                print(f"{Colors.GREEN}Data extracted for job {index}{Colors.RESET}")
+            else:
+                print(f"{Colors.RED}Failed to extract data for job {index} ({link}): {result.get('message', 'Unknown error')}{Colors.RESET}")
+        else:
+            print(f"{Colors.RED}Error {response.status_code} for job {index} ({link}): {response.text}{Colors.RESET}")
+    except Exception as e:
+        # Covers network errors, timeouts, undecodable JSON and missing keys.
+        print(f"{Colors.RED}An error occurred while processing job {index} ({link}): {str(e)}{Colors.RESET}")
+
+
+# %%
+# %%
+# Dump each extracted job record as indented JSON, followed by a magenta rule
+print(Colors.CYAN + "Extracted data:" + Colors.RESET)
+divider = Colors.MAGENTA + "-" * 50 + Colors.RESET
+for record in extracted_data:
+    pretty_record = json.dumps(record, indent=2)
+    print(pretty_record)
+    print(divider)
+
+
+# %%
+
+
+
+
+# Use o1-preview to choose which jobs should be applied to based on the resume.
+# Result of this step: `recommended_jobs`, the parsed JSON reply (an empty list
+# when there were no listings or the reply could not be used).
+prompt = f"""
+Please analyze the resume and job listings, and return a JSON list of the top 3 roles that best fit the candidate's experience and skills. Include only the job title, compensation, and apply link for each recommended role. The output should be a valid JSON array of objects in the following format, with no additional text:
+
+[
+  {{
+    "job_title": "Job Title",
+    "compensation": "Compensation (if available, otherwise empty string)",
+    "apply_link": "Application URL"
+  }},
+  ...
+]
+
+Based on the following resume:
+{resume_paste}
+
+And the following job listings:
+{json.dumps(extracted_data, indent=2)}
+"""
+
+recommended_jobs = []
+if extracted_data:
+    try:
+        completion = client.chat.completions.create(
+            model="o1-preview",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        }
+                    ]
+                }
+            ]
+        )
+
+        # The message content may be None; treat that as an empty reply so it
+        # is reported as a JSON decoding problem below.
+        raw_reply = (completion.choices[0].message.content or "").strip()
+        if raw_reply.startswith("```"):
+            # The model may wrap the JSON in a Markdown code fence despite the
+            # prompt: drop the opening fence line and the closing fence.
+            raw_reply = raw_reply.partition("\n")[2].rsplit("```", 1)[0].strip()
+        recommended_jobs = json.loads(raw_reply)
+    except json.JSONDecodeError as e:
+        print(f"{Colors.RED}Error decoding JSON from OpenAI response: {str(e)}{Colors.RESET}")
+    except Exception as e:
+        print(f"{Colors.RED}An unexpected error occurred while getting recommendations: {str(e)}{Colors.RESET}")
+else:
+    # With no listings the model has nothing to choose from; skip the call.
+    print(f"{Colors.RED}No job listings were extracted; skipping recommendations{Colors.RESET}")
+
+print(f"{Colors.CYAN}Recommended jobs:{Colors.RESET}")
+print(json.dumps(recommended_jobs, indent=2))
+
+