Merge branch 'main' into mog/webscraper-refactor

Gergő Móricz 2024-11-05 23:49:31 +01:00
commit e5385e62ee
16 changed files with 368 additions and 25 deletions

.gitignore vendored
View File

@@ -30,3 +30,6 @@ apps/js-sdk/firecrawl/dist
/examples/crm_lead_enrichment/crm_lead_enrichment_env
/.venv
/examples/claude_web_crawler/firecrawl_env
/examples/haiku_web_crawler/firecrawl_env
/examples/sonnet_web_crawler/firecrawl_env
/examples/internal_link_assitant/firecrawl_env

View File

@@ -6,7 +6,7 @@ If you're contributing, note that the process is similar to other open source re
## Running the project locally
First, start by installing dependencies
First, start by installing dependencies:
1. node.js [instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs)
2. pnpm [instructions](https://pnpm.io/installation)
@@ -55,12 +55,13 @@ POSTHOG_HOST= # set if you'd like to send posthog events like job logs
First, install the dependencies using pnpm.
```bash
pnpm install
# cd apps/api # to make sure you're in the right folder
pnpm install # make sure you have pnpm version 9+!
```
### Running the project
You're going to need to open 3 terminals.
You're going to need to open 3 terminals. Here is [a video guide accurate as of Oct 2024](https://youtu.be/LHqg5QNI4UY).
### Terminal 1 - setting up redis
@@ -76,6 +77,7 @@ Now, navigate to the apps/api/ directory and run:
```bash
pnpm run workers
# if you are going to use the [llm-extract feature](https://github.com/mendableai/firecrawl/pull/586/), you should also export OPENAI_API_KEY=sk-______
```
This will start the workers responsible for processing crawl jobs.

View File

@@ -338,6 +338,8 @@ function getPlanByPriceId(price_id: string | null): PlanType {
return "growthdouble";
case process.env.STRIPE_PRICE_ID_ETIER2C:
return "etier2c";
case process.env.STRIPE_PRICE_ID_ETIER1A_MONTHLY: //ocqh
return "etier1a";
default:
return "free";
}

View File

@@ -142,6 +142,7 @@ export const scrapeOptions = z.object({
languages: z.string().array().optional(),
}).optional(),
skipTlsVerification: z.boolean().default(false),
removeBase64Images: z.boolean().default(true),
}).strict(strictMessage)
@@ -494,6 +495,7 @@ export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptio
actions: pageOptions.actions,
location: pageOptions.geolocation,
skipTlsVerification: pageOptions.skipTlsVerification,
removeBase64Images: pageOptions.removeBase64Images,
extract: extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction") ? {
systemPrompt: extractorOptions.extractionPrompt,
prompt: extractorOptions.userPrompt,

View File

@@ -60,6 +60,7 @@ export type PageOptions = {
country?: string;
};
skipTlsVerification?: boolean;
removeBase64Images?: boolean;
mobile?: boolean;
};
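Taken together, the new `removeBase64Images` flag is threaded through both the v1 `scrapeOptions` schema (default `true`) and the legacy `PageOptions` type. A minimal sketch of opting out per request; the endpoint path, auth header, and response handling are illustrative assumptions, not part of this diff:

```typescript
// Illustrative only: the endpoint path and auth header are assumptions,
// not taken from this commit. The option name and its default (true)
// come from the scrapeOptions schema above.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://example.com",
    formats: ["markdown"],
    removeBase64Images: false, // keep inline base64 images in the markdown output
  }),
});
console.log(await res.json());
```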

View File

@@ -74,6 +74,10 @@ export async function getJobPriority({
bucketLimit = 1000;
planModifier = 0.05;
break;
case "etier1a":
bucketLimit = 1000;
planModifier = 0.05;
break;
default:
bucketLimit = 25;

View File

@@ -0,0 +1,7 @@
export const removeBase64Images = async (
markdown: string,
) => {
const regex = /(!\[.*?\])\(data:image\/.*?;base64,.*?\)/g;
markdown = markdown.replace(regex, '$1(<Base64-Image-Removed>)');
return markdown;
};
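A small usage sketch of the helper above (the import path and sample markdown are illustrative): the capture group keeps the `![alt]` text and only the base64 data URI is replaced with a placeholder.

```typescript
// Hypothetical import path, shown only to illustrate the regex above:
// ![alt](data:image/...;base64,...) becomes ![alt](<Base64-Image-Removed>),
// while images with ordinary URLs are left untouched.
import { removeBase64Images } from "./removeBase64Images";

const input = [
  "![logo](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...)",
  "![photo](https://example.com/photo.png)",
].join("\n");

const output = await removeBase64Images(input);
// ![logo](<Base64-Image-Removed>)
// ![photo](https://example.com/photo.png)
```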

View File

@@ -88,6 +88,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
priority: meta.internalOptions.priority,
geolocation: meta.options.geolocation,
mobile: meta.options.mobile,
removeBase64Images: meta.options.removeBase64Images,
// TODO: scrollXPaths, disableJsDom
};
@@ -137,6 +138,7 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<Eng
screenshot: meta.options.formats.includes("screenshot"),
fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
wait: meta.options.waitFor,
removeBase64Images: meta.options.removeBase64Images,
};
let response = await performFireEngineScrape(
@@ -174,6 +176,7 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<Engi
atsv: meta.internalOptions.atsv,
geolocation: meta.options.geolocation,
removeBase64Images: meta.options.removeBase64Images,
};
let response = await performFireEngineScrape(

View File

@@ -24,6 +24,8 @@ export type FireEngineScrapeRequestCommon = {
// team_id?: string; // unused
logRequest?: boolean; // default: true
instantReturn?: boolean; // default: false
removeBase64Images?: boolean;
}
export type FireEngineScrapeRequestChromeCDP = {

View File

@@ -1,5 +1,6 @@
// TODO: refactor
import { load } from "cheerio";
import { logger } from "../../../lib/logger";
export function extractLinks(html: string, baseUrl: string): string[] {
const $ = load(html);
@@ -8,20 +9,24 @@ export function extractLinks(html: string, baseUrl: string): string[] {
$('a').each((_, element) => {
const href = $(element).attr('href');
if (href) {
if (href.startsWith('http://') || href.startsWith('https://')) {
// Absolute URL, add as is
links.push(href);
} else if (href.startsWith('/')) {
// Relative URL starting with '/', append to origin
links.push(new URL(href, baseUrl).href);
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
// Relative URL not starting with '/', append to base URL
links.push(new URL(href, baseUrl).href);
} else if (href.startsWith('mailto:')) {
// mailto: links, add as is
links.push(href);
try {
if (href.startsWith('http://') || href.startsWith('https://')) {
// Absolute URL, add as is
links.push(href);
} else if (href.startsWith('/')) {
// Relative URL starting with '/', append to origin
links.push(new URL(href, baseUrl).href);
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
// Relative URL not starting with '/', append to base URL
links.push(new URL(href, baseUrl).href);
} else if (href.startsWith('mailto:')) {
// mailto: links, add as is
links.push(href);
}
// Fragment-only links (#) are ignored
} catch (error) {
logger.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, { error });
}
// Fragment-only links (#) are ignored
}
});
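A quick sketch of the hardened behaviour, assuming `extractLinks` is imported from this module (the import path and sample HTML are illustrative, not a test from this commit):

```typescript
import { extractLinks } from "./extractLinks"; // hypothetical path

const html = `
  <a href="https://other.example/docs">absolute, kept as is</a>
  <a href="/pricing">root-relative, resolved against the origin</a>
  <a href="blog/post-1">relative, resolved against the base URL</a>
  <a href="mailto:hello@example.com">mailto, kept as is</a>
  <a href="#top">fragment-only, ignored</a>
`;

const links = extractLinks(html, "https://example.com/start/");
// Roughly:
// [
//   "https://other.example/docs",
//   "https://example.com/pricing",
//   "https://example.com/start/blog/post-1",
//   "mailto:hello@example.com",
// ]
// With the try/catch added here, an href that makes `new URL()` throw is now
// logged and skipped instead of aborting link extraction for the whole page.
```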

View File

@@ -16,6 +16,8 @@ const RATE_LIMITS = {
growth: 50,
growthdouble: 50,
etier2c: 300,
etier1a: 1000,
etier2a: 300,
},
scrape: {
default: 20,
@@ -30,6 +32,8 @@ const RATE_LIMITS = {
growth: 1000,
growthdouble: 1000,
etier2c: 2500,
etier1a: 1000,
etier2a: 2500,
},
search: {
default: 20,
@@ -44,6 +48,8 @@ const RATE_LIMITS = {
growth: 500,
growthdouble: 500,
etier2c: 2500,
etier1a: 1000,
etier2a: 2500,
},
map:{
default: 20,
@@ -58,6 +64,8 @@ const RATE_LIMITS = {
growth: 500,
growthdouble: 500,
etier2c: 2500,
etier1a: 1000,
etier2a: 2500,
},
preview: {
free: 5,
@@ -123,6 +131,20 @@ export const scrapeStatusRateLimiter = new RateLimiterRedis({
duration: 60, // Duration in seconds
});
export const etier1aRateLimiter = new RateLimiterRedis({
storeClient: redisRateLimitClient,
keyPrefix: "etier1a",
points: 10000,
duration: 60, // Duration in seconds
});
export const etier2aRateLimiter = new RateLimiterRedis({
storeClient: redisRateLimitClient,
keyPrefix: "etier2a",
points: 2500,
duration: 60, // Duration in seconds
});
const testSuiteTokens = [
"a01ccae",
"6254cf9",
@@ -176,6 +198,14 @@ export function getRateLimiter(
if(teamId && teamId === process.env.DEV_B_TEAM_ID) {
return devBRateLimiter;
}
if(teamId && teamId === process.env.ETIER1A_TEAM_ID) {
return etier1aRateLimiter;
}
if(teamId && teamId === process.env.ETIER2A_TEAM_ID) {
return etier2aRateLimiter;
}
if(teamId && manual.includes(teamId)) {
return manualRateLimiter;
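For context, a minimal sketch of how one of these `rate-limiter-flexible` limiters is typically consumed; the request handling shown is illustrative and not the repo's actual middleware:

```typescript
// Sketch only: `teamId` and the 429 handling are illustrative.
// consume() spends one of the limiter's points (10000 per 60 s for
// etier1a above) and rejects with a RateLimiterRes once the key's
// budget is exhausted.
try {
  await etier1aRateLimiter.consume(teamId);
  // proceed with the request
} catch (rejected: any) {
  const retryAfterSec = Math.ceil((rejected?.msBeforeNext ?? 60_000) / 1000);
  // respond with HTTP 429 and Retry-After: retryAfterSec
}
```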

View File

@@ -160,6 +160,7 @@ export type PlanType =
| "growth"
| "growthdouble"
| "etier2c"
| "etier1a"
| "free"
| "";

View File

@@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/
import logging
import os
from .firecrawl import FirecrawlApp
from .firecrawl import FirecrawlApp # noqa
__version__ = "1.4.0"
@@ -19,24 +19,46 @@ __version__ = "1.4.0"
logger: logging.Logger = logging.getLogger("firecrawl")
def _basic_config() -> None:
"""Set up basic configuration for logging with a specific format and date format."""
def _configure_logger() -> None:
"""
Configure the firecrawl logger for console output.
The function attaches a handler for console output with a specific format and date
format to the firecrawl logger.
"""
try:
logging.basicConfig(
format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
# Create the formatter
formatter = logging.Formatter(
"[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
# Create the console handler and set the formatter
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
# Add the console handler to the firecrawl logger
logger.addHandler(console_handler)
except Exception as e:
logger.error("Failed to configure logging: %s", e)
def setup_logging() -> None:
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
env = os.environ.get(
"FIRECRAWL_LOGGING_LEVEL", "INFO"
).upper() # Default to 'INFO' level
_basic_config()
# Check if the firecrawl logger already has a handler
if logger.hasHandlers():
return # To prevent duplicate logging
# Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
# Attach a no-op handler to prevent warnings about no handlers
logger.addHandler(logging.NullHandler())
return
# Attach the console handler to the firecrawl logger
_configure_logger()
# Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
if env == "DEBUG":
logger.setLevel(logging.DEBUG)
elif env == "INFO":

View File

@@ -0,0 +1,165 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import anthropic
import agentops
# ANSI color codes
class Colors:
CYAN = '\033[96m'
YELLOW = '\033[93m'
GREEN = '\033[92m'
RED = '\033[91m'
MAGENTA = '\033[95m'
BLUE = '\033[94m'
RESET = '\033[0m'
# Load environment variables
load_dotenv()
# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
# Initialize the FirecrawlApp and Anthropic client
app = FirecrawlApp(api_key=firecrawl_api_key)
client = anthropic.Anthropic(api_key=anthropic_api_key)
# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app, client):
try:
print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")
map_prompt = f"""
The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
"""
print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
completion = client.messages.create(
model="claude-3-5-haiku-20241022",
max_tokens=1000,
temperature=0,
system="You are an expert web crawler. Respond with the best search parameter.",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": map_prompt
}
]
}
]
)
map_search_parameter = completion.content[0].text
print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")
print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
map_website = app.map_url(url, params={"search": map_search_parameter})
print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
return map_website['links']
except Exception as e:
print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
return None
# Scrape the top pages and check whether the objective is met; if so, return the relevant info as JSON, else return None
def find_objective_in_top_pages(map_website, objective, app, client):
try:
# Get top 2 links from the map result
top_links = map_website[:2]
print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")
# Scrape the pages in batch
batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})
print(f"{Colors.GREEN}Batch page scraping completed successfully.{Colors.RESET}")
for scrape_result in batch_scrape_result['data']:
# Check if objective is met
check_prompt = f"""
Given the following scraped content and objective, determine if the objective is met.
If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
If the objective is not met with confidence, respond with 'Objective not met'.
Objective: {objective}
Scraped content: {scrape_result['markdown']}
Remember:
1. Only return JSON if you are confident the objective is fully met.
2. Keep the JSON structure as simple and flat as possible.
3. Do not include any explanations or markdown formatting in your response.
"""
completion = client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=1000,
temperature=0,
system="You are an expert web crawler. Respond with the relevant information in JSON format.",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": check_prompt
}
]
}
]
)
result = completion.content[0].text
if result != "Objective not met":
print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
try:
return json.loads(result)
except json.JSONDecodeError:
print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
else:
print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")
print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
return None
except Exception as e:
print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
return None
# Main function to execute the process
def main():
# Get user input
url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
if not url.strip():
url = "https://www.firecrawl.dev/"
objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")
if not objective.strip():
objective = "find me the pricing plans"
print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
# Find the relevant page
map_website = find_relevant_page_via_map(objective, url, app, client)
print(map_website)
if map_website:
print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")
# Find objective in top pages
result = find_objective_in_top_pages(map_website, objective, app, client)
if result:
print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
else:
print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
else:
print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,94 @@
import os
import json
from firecrawl import FirecrawlApp
from dotenv import load_dotenv
from openai import OpenAI
# Load environment variables
load_dotenv()
# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
# Initialize the FirecrawlApp and set OpenAI API key
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(api_key=openai_api_key)
def main():
# Get user input
blog_url = input("Enter the blog URL: ")
if not blog_url.strip():
blog_url = "https://www.firecrawl.dev/blog/how-to-use-openai-o1-reasoning-models-in-applications"
# Scrape the blog content
print("Scraping the blog content...")
blog_scrape_result = app.scrape_url(blog_url, params={'formats': ['markdown']})
# Get the blog content in markdown format
blog_content = blog_scrape_result.get('markdown', '')
# Turn the blog URL into a top-level domain
top_level_domain = '/'.join(blog_url.split('/')[:3])
# Map the website to get all links
print("Mapping the website to get all links...")
site_map = app.map_url(top_level_domain)
# Get the list of URLs from the site map
site_links = site_map.get('links', [])
prompt = f"""
You are an AI assistant helping to improve a blog post.
Here is the original blog post content:
{blog_content}
Here is a list of other pages on the website:
{json.dumps(site_links, indent=2)}
Please revise the blog post to include internal links to some of these pages where appropriate. Make sure the internal links are relevant and enhance the content.
Only return the revised blog post in markdown format.
"""
import re
# Function to count links in a markdown content
def count_links(markdown_content):
return len(re.findall(r'\[.*?\]\(.*?\)', markdown_content))
# Use OpenAI API to get the revised blog post
print("Generating the revised blog post with internal links...")
completion = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "user",
"content": prompt
}
],
prediction={
"type": "content",
"content": blog_content
}
)
revised_blog_post = completion.choices[0].message.content
# Count links in the original and revised blog post
original_links_count = count_links(blog_content)
revised_links_count = count_links(revised_blog_post)
# Output a portion of the revised blog post and link counts
print("\nRevised blog post (first 500 characters):")
print(revised_blog_post[:500])
print(f"\nNumber of links in the original blog post: {original_links_count}")
print(f"Number of links in the revised blog post: {revised_links_count}")
if __name__ == "__main__":
main()