mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
95 lines
2.8 KiB
Python
95 lines
2.8 KiB
Python
|
import json
import os
import re
from urllib.parse import urlsplit

from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI
|
||
|
|
||
|
# Pull settings from a .env file (if any) into the process environment.
load_dotenv()

# Read both service credentials from the environment (None when unset).
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY")
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Build the shared Firecrawl and OpenAI clients used by main().
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(api_key=openai_api_key)
|
||
|
|
||
|
# Blog post used when the user just presses Enter at the prompt.
_DEFAULT_BLOG_URL = "https://www.firecrawl.dev/blog/how-to-use-openai-o1-reasoning-models-in-applications"

# Non-greedy match for inline markdown links of the form [text](target).
# Image embeds (![alt](src)) contain the same shape and are matched as well.
_MARKDOWN_LINK_RE = re.compile(r'\[.*?\]\(.*?\)')


def _site_origin(url):
    """Return the site root (``scheme://host[:port]``) that *url* belongs to.

    When *url* carries no ``scheme://`` prefix, only the part before the
    first ``/`` is returned so that path segments never leak into the root.
    """
    parts = urlsplit(url)
    if parts.scheme and parts.netloc:
        return f"{parts.scheme}://{parts.netloc}"
    # Scheme-less input such as "example.com/blog/post": keep the host only.
    return url.split("/", 1)[0]


def _count_links(markdown_content):
    """Return the number of ``[text](target)`` occurrences in the markdown."""
    return len(_MARKDOWN_LINK_RE.findall(markdown_content))


def _build_prompt(blog_content, site_links):
    """Compose the model instruction from the post text and the site's URLs."""
    # The template lines sit at column 0 on purpose: they are part of the
    # string sent to the model and must not pick up source indentation.
    return f"""
You are an AI assistant helping to improve a blog post.

Here is the original blog post content:

{blog_content}

Here is a list of other pages on the website:

{json.dumps(site_links, indent=2)}

Please revise the blog post to include internal links to some of these pages where appropriate. Make sure the internal links are relevant and enhance the content.

Only return the revised blog post in markdown format.
"""


def main():
    """Interactively add internal links to a blog post.

    Prompts for a blog URL (a default post is used when nothing is entered),
    scrapes it as markdown through the module-level ``app``, maps the site
    root to collect candidate link targets, asks the module-level ``client``
    to rewrite the post with internal links, and prints the first 500
    characters of the result plus the link counts before and after.

    Returns early, after printing a message, when the scrape yields no
    markdown, since there would be nothing to revise.
    """
    # Strip once so stray terminal whitespace never reaches the API calls
    # or the site-root parsing; an empty answer selects the default post.
    blog_url = input("Enter the blog URL: ").strip() or _DEFAULT_BLOG_URL

    # Scrape the blog content and keep its markdown rendering.
    print("Scraping the blog content...")
    blog_scrape_result = app.scrape_url(blog_url, params={'formats': ['markdown']})
    blog_content = blog_scrape_result.get('markdown', '')
    if not blog_content:
        # Avoid spending a map call and a completion on an empty post.
        print("No markdown content was returned for that URL; nothing to revise.")
        return

    # Map the whole site (not just the post) to get candidate link targets.
    print("Mapping the website to get all links...")
    site_map = app.map_url(_site_origin(blog_url))
    site_links = site_map.get('links', [])

    # Use OpenAI API to get the revised blog post.
    print("Generating the revised blog post with internal links...")
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": _build_prompt(blog_content, site_links),
            }
        ],
        # The original post is passed as the predicted output because the
        # revision is expected to repeat most of it verbatim.
        prediction={
            "type": "content",
            "content": blog_content,
        },
    )

    # NOTE(review): the SDK types message.content as optional; fall back to
    # an empty string so the link count and preview below cannot fail on None.
    revised_blog_post = completion.choices[0].message.content or ""

    # Count links in the original and revised blog post.
    original_links_count = _count_links(blog_content)
    revised_links_count = _count_links(revised_blog_post)

    # Output a portion of the revised blog post and link counts.
    print("\nRevised blog post (first 500 characters):")
    print(revised_blog_post[:500])
    print(f"\nNumber of links in the original blog post: {original_links_count}")
    print(f"Number of links in the revised blog post: {revised_links_count}")
|
||
|
|
||
|
# Run the interactive workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|