Merge branch 'main' into mog/bulk-scrape

This commit is contained in:
Nicolas 2024-10-23 14:36:26 -03:00
commit 66e505317e
22 changed files with 438 additions and 17 deletions

1
.gitignore vendored
View File

@ -28,3 +28,4 @@ apps/js-sdk/firecrawl/dist
/examples/o1_web_crawler/firecrawl_env
/examples/crm_lead_enrichment/crm_lead_enrichment_env
/.venv

View File

@ -36,7 +36,7 @@ Self-hosting Firecrawl is ideal for those who need full control over their scrap
Create a `.env` file in the root directory. You can copy over the template from `apps/api/.env.example`.
To start, we wont set up authentication, or any optional sub services (pdf parsing, JS blocking support, AI features)
To start, we won't set up authentication or any optional subservices (pdf parsing, JS blocking support, AI features)
`.env:`
```
@ -47,7 +47,7 @@ HOST=0.0.0.0
REDIS_URL=redis://redis:6379
REDIS_RATE_LIMIT_URL=redis://redis:6379
## To turn on DB authentication, you need to set up supabase.
## To turn on DB authentication, you need to set up Supabase.
USE_DB_AUTHENTICATION=false
# ===== Optional ENVS ======
@ -59,8 +59,8 @@ SUPABASE_SERVICE_TOKEN=
# Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
@ -176,4 +176,4 @@ By addressing these common issues, you can ensure a smoother setup and operation
## Install Firecrawl on a Kubernetes Cluster (Simple Version)
Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.

View File

@ -13,7 +13,7 @@ import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../lib/logger";
import { redlock } from "../services/redlock";
import { getValue } from "../services/redis";
import { deleteKey, getValue } from "../services/redis";
import { setValue } from "../services/redis";
import { validate } from "uuid";
import * as Sentry from "@sentry/node";
@ -128,6 +128,13 @@ export async function getACUC(
}
}
/**
 * Deletes the cached ACUC entry stored for a single API key.
 *
 * The cache key is the API key prefixed with `acuc_`; the deletion itself is
 * delegated to `deleteKey`.
 *
 * @param api_key - API key whose cache entry should be removed.
 */
export async function clearACUC(api_key: string): Promise<void> {
  await deleteKey("acuc_" + api_key);
}
export async function authenticateUser(
req,
res,

View File

@ -0,0 +1,22 @@
import { Request, Response } from "express";
import { supabase_service } from "../../../services/supabase";
import { clearACUC } from "../../auth";
import { Logger } from "../../../lib/logger";
/**
 * Admin handler that clears the cached ACUC entry of every API key that
 * belongs to a team.
 *
 * Reads `team_id` from the request body, looks up the team's rows in the
 * `api_keys` table and calls `clearACUC` for each key.
 *
 * Responses:
 * - `{ ok: true }` once every cache entry has been cleared,
 * - 400 when `team_id` is missing or not a non-empty string,
 * - 500 when the key lookup or a cache deletion fails.
 */
export async function acucCacheClearController(req: Request, res: Response) {
  try {
    const team_id: unknown = req.body?.team_id;
    if (typeof team_id !== "string" || team_id.length === 0) {
      // Reject early: querying with an absent team id cannot match real keys.
      res.status(400).json({ error: "team_id is required" });
      return;
    }

    const { data, error } = await supabase_service
      .from("api_keys")
      .select("*")
      .eq("team_id", team_id);

    if (error) {
      // Report the query failure itself rather than crashing on a null
      // `data` further down with an unrelated TypeError.
      throw new Error(
        `Failed to look up API keys for team ${team_id}: ${error.message}`,
      );
    }

    // `data` may be null when nothing was returned; treat that as "no keys".
    await Promise.all((data ?? []).map((x) => clearACUC(x.key)));

    res.json({ ok: true });
  } catch (error) {
    Logger.error(`Error clearing ACUC cache via API route: ${error}`);
    res.status(500).json({ error: "Internal server error" });
  }
}

View File

@ -78,7 +78,7 @@ export async function crawlController(
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
} catch (e) {
Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(

View File

@ -117,6 +117,7 @@ export const scrapeOptions = z.object({
}
).transform(val => val ? val.toUpperCase() : 'US')
}).optional(),
skipTlsVerification: z.boolean().default(false),
}).strict(strictMessage)
@ -443,6 +444,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
geolocation: x.geolocation,
skipTlsVerification: x.skipTlsVerification
};
}

View File

@ -6,7 +6,13 @@ export function numTokensFromString(message: string, model: string): number {
const encoder = encoding_for_model(model as TiktokenModel);
// Encode the message into tokens
const tokens = encoder.encode(message);
let tokens: Uint32Array;
try {
tokens = encoder.encode(message);
} catch (error) {
message = message.replace("<|endoftext|>", "");
tokens = encoder.encode(message);
}
// Free the encoder resources after use
encoder.free();

View File

@ -54,6 +54,7 @@ export type PageOptions = {
geolocation?: {
country?: string;
};
skipTlsVerification?: boolean;
};
export type ExtractorOptions = {

View File

@ -6,6 +6,8 @@ import {
cleanBefore24hCompleteJobsController,
queuesController,
} from "../controllers/v0/admin/queue";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { wrap } from "./v1";
export const adminRouter = express.Router();
@ -33,3 +35,8 @@ adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
autoscalerController
);
adminRouter.post(
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
wrap(acucCacheClearController),
);

View File

@ -37,7 +37,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
if (!success) {
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
if (!res.headersSent) {
return res.status(402).json({ success: false, error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing." });
return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
}
}
req.account = { remainingCredits };
@ -95,7 +95,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
next();
}
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
export function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
return (req, res, next) => {
controller(req, res)
.catch(err => next(err))

View File

@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { Logger } from "../../../src/lib/logger";
import https from "https";
export class WebCrawler {
private jobId: string;
private initialUrl: string;
@ -145,8 +145,14 @@ export class WebCrawler {
.slice(0, limit);
}
public async getRobotsTxt(): Promise<string> {
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
/**
 * Fetches `this.robotsTxtUrl` and returns the response body.
 *
 * @param skipTlsVerification - When true, the request is sent through an
 *   HTTPS agent created with `rejectUnauthorized: false`.
 * @returns The data of the HTTP response.
 */
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
  // Only attach a custom agent when certificate checks are being skipped.
  const tlsOverride = skipTlsVerification
    ? { httpsAgent: new https.Agent({ rejectUnauthorized: false }) }
    : {};
  const { data } = await axios.get(this.robotsTxtUrl, {
    timeout: axiosTimeout,
    ...tlsOverride,
  });
  return data;
}

View File

@ -594,6 +594,7 @@ export class WebScraperDataProvider {
atsv: options.pageOptions?.atsv ?? false,
actions: options.pageOptions?.actions ?? undefined,
geolocation: options.pageOptions?.geolocation ?? undefined,
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =

View File

@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
waitFor = 0,
screenshot = false,
fullPageScreenshot = false,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } },
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
fireEngineOptions = {},
headers,
options,
@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
waitFor?: number;
screenshot?: boolean;
fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } };
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
@ -119,6 +119,7 @@ export async function scrapWithFireEngine({
atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [],
geolocation: pageOptions?.geolocation,
skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
actions: actions,
},
{

View File

@ -157,6 +157,7 @@ export async function scrapSingleUrl(
atsv: pageOptions.atsv ?? false,
actions: pageOptions.actions ?? undefined,
geolocation: pageOptions.geolocation ?? undefined,
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
}
if (extractorOptions) {

View File

@ -81,8 +81,10 @@ class FirecrawlApp:
response = response.json()
if response['success'] and 'data' in response:
return response['data']
else:
elif "error" in response:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
else:
raise Exception(f'Failed to scrape URL. Error: {response}')
else:
self._handle_error(response, 'scrape URL')
@ -266,8 +268,10 @@ class FirecrawlApp:
response = response.json()
if response['success'] and 'links' in response:
return response
else:
elif 'error' in response:
raise Exception(f'Failed to map URL. Error: {response["error"]}')
else:
raise Exception(f'Failed to map URL. Error: {response}')
else:
self._handle_error(response, 'map')

View File

@ -0,0 +1,150 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import requests
# ANSI color codes
class Colors:
    """ANSI escape sequences used to colorize this script's console output."""

    # Bright foreground colors.
    CYAN = "\033[96m"
    YELLOW = "\033[93m"
    GREEN = "\033[92m"
    RED = "\033[91m"
    MAGENTA = "\033[95m"
    BLUE = "\033[94m"

    # Resets all attributes; appended after each colored message.
    RESET = "\033[0m"
# Load environment variables (python-dotenv; by default it reads a local .env file)
load_dotenv()
# Retrieve API keys from environment variables.
# os.getenv returns None when a variable is unset; nothing here validates that.
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
grok_api_key = os.getenv("GROK_API_KEY")
# Initialize the FirecrawlApp client shared by the functions below
app = FirecrawlApp(api_key=firecrawl_api_key)
# Function to make Grok API calls
def grok_completion(prompt):
    """Send one user prompt to the xAI chat completions endpoint.

    The request uses the module-level ``grok_api_key`` as bearer token, a
    fixed system message, the ``grok-beta`` model and temperature 0.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The ``content`` of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the API replies with a 4xx/5xx status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    url = "https://api.x.ai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {grok_api_key}"
    }
    data = {
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "model": "grok-beta",
        "stream": False,
        "temperature": 0
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.post(url, headers=headers, json=data, timeout=120)
    # Fail with the HTTP status instead of an opaque KeyError on 'choices'
    # when the API returns an error payload.
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']
# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app):
    """Ask the model for a search term, then map the website with it.

    Args:
        objective: What the user wants to find on the site.
        url: Website passed to ``app.map_url``.
        app: Client object exposing ``map_url(url, params=...)``.

    Returns:
        The ``links`` entry of the map response, or ``None`` if any step
        raised an exception (the error is printed, not re-raised).
    """
    try:
        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")

        map_prompt = f"""
The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
"""
        print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
        # Model replies can carry stray whitespace or surrounding quotes;
        # remove them so the search parameter is just the words themselves.
        map_search_parameter = grok_completion(map_prompt).strip().strip("\"'").strip()
        print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")

        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        print(f"{Colors.MAGENTA}{map_search_parameter}{Colors.RESET}")
        map_website = app.map_url(url, params={"search": map_search_parameter})
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
        print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
        print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}")
        return map_website["links"]
    except Exception as e:
        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None
# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None
def find_objective_in_top_pages(map_website, objective, app):
    """Scrape up to three links and ask the model whether the objective is met.

    Each page is handled independently: a scrape or model failure on one link
    is reported and the next link is tried.

    Args:
        map_website: List of links; anything that is not a list is treated
            as "no links".
        objective: What the user wants to find.
        app: Client object exposing ``scrape_url(url, params=...)``.

    Returns:
        The parsed JSON from the first page whose model reply parses as JSON,
        or ``None`` if no examined page produced one.
    """
    try:
        print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}")
        # Get top 3 links from the map result
        top_links = map_website[:3] if isinstance(map_website, list) else []
        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")

        for link in top_links:
            try:
                print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}")
                # Scrape the page
                scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
                print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}")

                # Check if objective is met
                check_prompt = f"""
Given the following scraped content and objective, determine if the objective is met.
If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
If the objective is not met with confidence, respond with 'Objective not met'.
Objective: {objective}
Scraped content: {scrape_result['markdown']}
Remember:
1. Only return JSON if you are confident the objective is fully met.
2. Keep the JSON structure as simple and flat as possible.
3. Do not include any explanations or markdown formatting in your response.
"""
                result = grok_completion(check_prompt)
                print(f"{Colors.MAGENTA}{result}{Colors.RESET}")
            except Exception as e:
                # One bad page must not prevent the remaining links from being tried.
                print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
                continue

            # Tolerate surrounding whitespace, quotes, a trailing period and
            # case differences in the model's "not met" reply.
            verdict = result.strip().strip("'\".").lower()
            if verdict != "objective not met":
                print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
                try:
                    # Drop markdown code fences the model may have added anyway.
                    result = result.replace("```json", "").replace("```", "")
                    return json.loads(result)
                except json.JSONDecodeError:
                    print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
            else:
                print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")

        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
        return None
    except Exception as e:
        print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
        return None
# Main function to execute the process
def main():
    """Prompt for a website and an objective, then run the map-and-scrape flow.

    Prints the extracted JSON on success, or a message describing which stage
    produced no result.
    """
    # Get user input
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")

    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")

    # Stage 1: find candidate pages; stop early when there are none.
    candidate_links = find_relevant_page_via_map(objective, url, app)
    if not candidate_links:
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
        return

    print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")

    # Stage 2: look for the objective in the top pages.
    extracted = find_objective_in_top_pages(candidate_links, objective, app)
    if not extracted:
        print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
        return

    print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
    print(f"{Colors.MAGENTA}{json.dumps(extracted, indent=2)}{Colors.RESET}")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,3 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=
SERP_API_KEY=

View File

@ -0,0 +1,120 @@
import os
from firecrawl import FirecrawlApp
from swarm import Agent
from swarm.repl import run_demo_loop
import dotenv
from serpapi import GoogleSearch
from openai import OpenAI
# Load environment variables before the clients below read them
dotenv.load_dotenv()
# Initialize FirecrawlApp and OpenAI.
# os.getenv returns None for unset variables; no validation happens here.
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def search_google(query, objective):
    """Search Google using SerpAPI."""
    # NOTE(review): docstring kept verbatim - it may be exposed to the model
    # as this tool's description; confirm before rewording.
    print(f"Parameters: query={query}, objective={objective}")
    serp_params = {"q": query, "api_key": os.getenv("SERP_API_KEY")}
    response = GoogleSearch(serp_params).get_dict()
    # Only the organic results are passed on; a missing key yields an empty list.
    organic_results = response.get("organic_results", [])
    return {"objective": objective, "results": organic_results}
def map_url_pages(url, objective):
    """Map a website's pages using Firecrawl."""
    # NOTE(review): docstring kept verbatim - it may be exposed to the model
    # as this tool's description; confirm before rewording.
    search_query = generate_completion(
        "website search query generator",
        f"Generate a 1-2 word search query for the website: {url} based on the objective",
        "Objective: " + objective
    )
    print(f"Parameters: url={url}, objective={objective}, search_query={search_query}")
    map_status = app.map_url(url, params={'search': search_query})

    # The SDK's map response reports success through a `success` field, so
    # testing only `status == 'success'` never matched and always produced an
    # empty result. The old check is still accepted for compatibility.
    succeeded = bool(map_status.get('success')) or map_status.get('status') == 'success'
    if not succeeded:
        return {"objective": objective, "results": []}

    links = map_status.get('links', [])
    # Only the first link is forwarded to the next agent.
    top_link = links[0] if links else None
    return {"objective": objective, "results": [top_link] if top_link else []}
def scrape_url(url, objective):
    """Scrape a website using Firecrawl."""
    # NOTE(review): docstring kept verbatim - it may be exposed to the model
    # as this tool's description; confirm before rewording.
    print(f"Parameters: url={url}, objective={objective}")
    scrape_params = {'formats': ['markdown']}
    page = app.scrape_url(url, params=scrape_params)
    # The scrape response is returned as-is alongside the objective.
    return {"objective": objective, "results": page}
def analyze_website_content(content, objective):
    """Analyze the scraped website content using OpenAI."""
    # NOTE(review): docstring kept verbatim - it may be exposed to the model
    # as this tool's description; confirm before rewording.
    print(f"Parameters: content={content[:50]}..., objective={objective}")
    role = "website data extractor"
    task = "Analyze the following website content and extract a JSON object based on the objective."
    # Objective and page content travel together in the user message.
    user_content = "Objective: " + objective + "\nContent: " + content
    analysis = generate_completion(role, task, user_content)
    return {"objective": objective, "results": analysis}
def generate_completion(role, task, content):
    """Run one chat completion against ``gpt-4o`` and return the reply text.

    Args:
        role: Persona inserted into the system message ("You are a {role}.").
        task: Instruction appended to the system message.
        content: Text sent as the user message.

    Returns:
        The message content of the first choice.
    """
    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
    system_message = {"role": "system", "content": f"You are a {role}. {task}"}
    user_message = {"role": "user", "content": content}
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def handoff_to_search_google():
    """Hand off the search query to the search google agent."""
    # The name is resolved at call time, so the agent can be defined further
    # down in this module.
    return google_search_agent
def handoff_to_map_url():
    """Hand off the url to the map url agent."""
    # The name is resolved at call time, so the agent can be defined further
    # down in this module.
    return map_url_agent
def handoff_to_website_scraper():
    """Hand off the url to the website scraper agent."""
    # The name is resolved at call time, so the agent can be defined further
    # down in this module.
    return website_scraper_agent
def handoff_to_analyst():
    """Hand off the website content to the analyst agent."""
    # The name is resolved at call time, so the agent can be defined further
    # down in this module.
    return analyst_agent
# Agent chain, as wired through each agent's `functions` list:
# user interface -> google search -> map url -> website scraper -> analyst.

# Entry agent: its only function hands off to the Google search agent.
user_interface_agent = Agent(
    name="User Interface Agent",
    instructions="You are a user interface agent that handles all interactions with the user. You need to always start with an web data extraction objective that the user wants to achieve by searching the web, mapping the web pages, and extracting the content from a specific page. Be concise.",
    functions=[handoff_to_search_google],
)
# Can run a search and then hand off to the map agent.
google_search_agent = Agent(
    name="Google Search Agent",
    instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the map agent.",
    functions=[search_google, handoff_to_map_url],
)
# Can map a site's pages and then hand off to the scraper agent.
map_url_agent = Agent(
    name="Map URL Agent",
    instructions="You are a map url agent specialized in mapping the web pages. When you are done, you must hand off the results to the website scraper agent.",
    functions=[map_url_pages, handoff_to_website_scraper],
)
# Can scrape a page and then hand off to the analyst agent.
website_scraper_agent = Agent(
    name="Website Scraper Agent",
    instructions="You are a website scraper agent specialized in scraping website content. When you are done, you must hand off the website content to the analyst agent to extract the data based on the objective.",
    functions=[scrape_url, handoff_to_analyst],
)
# Last agent in the chain: it has no handoff function.
analyst_agent = Agent(
    name="Analyst Agent",
    instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must return a JSON object.",
    functions=[analyze_website_content],
)
if __name__ == "__main__":
    # Run the demo loop with the user interface agent
    run_demo_loop(user_interface_agent, stream=True)

View File

@ -0,0 +1,4 @@
firecrawl-py
openai
google-search-results
git+https://github.com/openai/swarm.git

View File

@ -0,0 +1,3 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=
SERP_API_KEY=

View File

@ -0,0 +1,78 @@
import csv
import json
import os
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI
from serpapi import GoogleSearch
from swarm import Agent
from swarm.repl import run_demo_loop
# Load environment variables before the clients below read them
load_dotenv()
# Initialize FirecrawlApp and OpenAI.
# os.getenv returns None for unset variables; no validation happens here.
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def crawl_and_analyze_url(url, objective):
    """Crawl a website using Firecrawl and analyze the content."""
    # NOTE(review): docstring kept verbatim - it may be exposed to the model
    # as this tool's description; confirm before rewording.
    print(f"Parameters: url={url}, objective={objective}")
    # Crawl the website
    crawl_status = app.crawl_url(
        url,
        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}},
        poll_interval=5
    )
    crawl_status = crawl_status['data']
    # Process each 'markdown' element individually
    combined_results = []
    for item in crawl_status:
        # Skip pages without usable markdown: a None or empty value would
        # otherwise break the string concatenation below or waste a model call.
        content = item.get('markdown') if isinstance(item, dict) else None
        if not content:
            continue
        # Analyze the content
        analysis = generate_completion(
            "website data extractor",
            f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
            "Objective: " + objective + "\nContent: " + content
        )
        # Parse the JSON result
        try:
            # The model may add markdown code fences despite the instruction;
            # strip them so an otherwise valid answer is not discarded.
            cleaned = (analysis or "").replace("```json", "").replace("```", "")
            result = json.loads(cleaned)
            combined_results.append(result)
        except json.JSONDecodeError:
            print(f"Could not parse JSON from analysis: {analysis}")
    # Combine the results
    return {"objective": objective, "results": combined_results}
def generate_completion(role, task, content):
    """Run one chat completion against ``gpt-4o`` and return the reply text.

    Args:
        role: Persona inserted into the system message ("You are a {role}.").
        task: Instruction appended to the system message.
        content: Text sent as the user message.

    Returns:
        The message content of the first choice.
    """
    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
    system_message = {"role": "system", "content": f"You are a {role}. {task}"}
    user_message = {"role": "user", "content": content}
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def handoff_to_crawl_url():
    """Hand off the url to the crawl url agent."""
    # The name is resolved at call time, so the agent can be defined further
    # down in this module.
    return crawl_website_agent
# Entry agent: its only function hands off to the crawl agent.
user_interface_agent = Agent(
    name="User Interface Agent",
    instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.",
    functions=[handoff_to_crawl_url],
)
# Crawl agent: its only function crawls the site and analyzes each page.
crawl_website_agent = Agent(
    name="Crawl Website Agent",
    instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.",
    functions=[crawl_and_analyze_url],
)
if __name__ == "__main__":
    # Run the demo loop with the user interface agent
    run_demo_loop(user_interface_agent, stream=True)

View File

@ -0,0 +1,4 @@
firecrawl-py
openai
google-search-results
git+https://github.com/openai/swarm.git