Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)
Merge branch 'main' into mog/bulk-scrape
This commit is contained in: commit 66e505317e

.gitignore (vendored) | 1

@@ -28,3 +28,4 @@ apps/js-sdk/firecrawl/dist
 /examples/o1_web_crawler/firecrawl_env
 /examples/crm_lead_enrichment/crm_lead_enrichment_env
+/.venv
SELF_HOST.md | 10

@@ -36,7 +36,7 @@ Self-hosting Firecrawl is ideal for those who need full control over their scrap

 Create an `.env` in the root directory you can copy over the template in `apps/api/.env.example`

-To start, we wont set up authentication, or any optional sub services (pdf parsing, JS blocking support, AI features)
+To start, we won't set up authentication or any optional subservices (pdf parsing, JS blocking support, AI features)

 `.env:`
 ```
@@ -47,7 +47,7 @@ HOST=0.0.0.0
 REDIS_URL=redis://redis:6379
 REDIS_RATE_LIMIT_URL=redis://redis:6379

-## To turn on DB authentication, you need to set up supabase.
+## To turn on DB authentication, you need to set up Supabase.
 USE_DB_AUTHENTICATION=false

 # ===== Optional ENVS ======
@@ -59,8 +59,8 @@ SUPABASE_SERVICE_TOKEN=

 # Other Optionals
 TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
-SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
-OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
+SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
+OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
 BULL_AUTH_KEY= @
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
@@ -176,4 +176,4 @@ By addressing these common issues, you can ensure a smoother setup and operation

 ## Install Firecrawl on a Kubernetes Cluster (Simple Version)

-Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
+Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
@@ -13,7 +13,7 @@ import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
 import { sendNotification } from "../services/notification/email_notification";
 import { Logger } from "../lib/logger";
 import { redlock } from "../services/redlock";
-import { getValue } from "../services/redis";
+import { deleteKey, getValue } from "../services/redis";
 import { setValue } from "../services/redis";
 import { validate } from "uuid";
 import * as Sentry from "@sentry/node";
@@ -128,6 +128,13 @@ export async function getACUC(
   }
 }

+export async function clearACUC(
+  api_key: string,
+): Promise<void> {
+  const cacheKeyACUC = `acuc_${api_key}`;
+  await deleteKey(cacheKeyACUC);
+}
+
 export async function authenticateUser(
   req,
   res,
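A minimal usage sketch of the new clearACUC helper (not part of the diff; the wrapper name, key list, and import path are assumptions for illustration): deleting the cached `acuc_${api_key}` entry forces the next getACUC call for that key to re-read usage data instead of serving a stale cache.

// Hypothetical sketch: invalidate cached ACUC entries for a set of API keys.
import { clearACUC } from "./auth"; // import path assumed

async function invalidateTeamKeys(apiKeys: string[]): Promise<void> {
  // Each key caches its record under `acuc_${api_key}`; deleting it forces a refresh.
  await Promise.all(apiKeys.map((key) => clearACUC(key)));
}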
apps/api/src/controllers/v0/admin/acuc-cache-clear.ts | 22 (new file)

@@ -0,0 +1,22 @@
import { Request, Response } from "express";
import { supabase_service } from "../../../services/supabase";
import { clearACUC } from "../../auth";
import { Logger } from "../../../lib/logger";

export async function acucCacheClearController(req: Request, res: Response) {
  try {
    const team_id: string = req.body.team_id;

    const keys = await supabase_service
      .from("api_keys")
      .select("*")
      .eq("team_id", team_id);

    await Promise.all(keys.data.map((x) => clearACUC(x.key)));

    res.json({ ok: true });
  } catch (error) {
    Logger.error(`Error clearing ACUC cache via API route: ${error}`);
    res.status(500).json({ error: "Internal server error" });
  }
}
@@ -78,7 +78,7 @@ export async function crawlController(
   const crawler = crawlToCrawler(id, sc);

   try {
-    sc.robots = await crawler.getRobotsTxt();
+    sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
   } catch (e) {
     Logger.debug(
       `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
@@ -117,6 +117,7 @@ export const scrapeOptions = z.object({
     }
   ).transform(val => val ? val.toUpperCase() : 'US')
   }).optional(),
+  skipTlsVerification: z.boolean().default(false),
 }).strict(strictMessage)

@@ -443,6 +444,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
     parsePDF: x.parsePDF,
     actions: x.actions as Action[], // no strict null checking grrrr - mogery
     geolocation: x.geolocation,
+    skipTlsVerification: x.skipTlsVerification
   };
 }

@@ -6,7 +6,13 @@ export function numTokensFromString(message: string, model: string): number {
   const encoder = encoding_for_model(model as TiktokenModel);

   // Encode the message into tokens
-  const tokens = encoder.encode(message);
+  let tokens: Uint32Array;
+  try {
+    tokens = encoder.encode(message);
+  } catch (error) {
+    message = message.replace("<|endoftext|>", "");
+    tokens = encoder.encode(message);
+  }

   // Free the encoder resources after use
   encoder.free();
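For context, a hedged sketch of the complete helper after this change, assuming the tiktoken binding shown above (encoding_for_model, encode, free); the import path and the final return via tokens.length are assumptions. encode() can throw when the text contains the special <|endoftext|> marker, so the marker is stripped and encoding retried, which is exactly what the hunk above adds.

// Hedged sketch; import path assumed, adjust to the tiktoken package used by the project.
import { encoding_for_model, TiktokenModel } from "tiktoken";

export function numTokensFromString(message: string, model: string): number {
  const encoder = encoding_for_model(model as TiktokenModel);
  let tokens: Uint32Array;
  try {
    tokens = encoder.encode(message);
  } catch (error) {
    // encode() rejects special tokens such as "<|endoftext|>"; strip it and retry.
    tokens = encoder.encode(message.replace("<|endoftext|>", ""));
  }
  // Free the encoder resources after use
  encoder.free();
  return tokens.length; // assumed: token count is the encoded array length
}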
@@ -54,6 +54,7 @@ export type PageOptions = {
   geolocation?: {
     country?: string;
   };
+  skipTlsVerification?: boolean;
 };

 export type ExtractorOptions = {
@@ -6,6 +6,8 @@ import {
   cleanBefore24hCompleteJobsController,
   queuesController,
 } from "../controllers/v0/admin/queue";
+import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
+import { wrap } from "./v1";

 export const adminRouter = express.Router();

@@ -33,3 +35,8 @@ adminRouter.get(
   `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
   autoscalerController
 );
+
+adminRouter.post(
+  `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
+  wrap(acucCacheClearController),
+);
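A hedged sketch of how a client might call the new admin route once the API is running; the base URL, port, and environment variable names are assumptions for illustration. The route clears the cached ACUC entry for every API key belonging to the given team.

// Hedged client sketch (not part of the diff); base URL and env names are assumed.
const base = process.env.FIRECRAWL_API_URL ?? "http://localhost:3002";
const bullAuthKey = process.env.BULL_AUTH_KEY ?? "";

async function clearTeamAcucCache(teamId: string): Promise<{ ok: boolean }> {
  const res = await fetch(`${base}/admin/${bullAuthKey}/acuc-cache-clear`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ team_id: teamId }), // controller reads req.body.team_id
  });
  return res.json(); // controller responds with { ok: true } on success
}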
@@ -37,7 +37,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
     if (!success) {
       Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
       if (!res.headersSent) {
-        return res.status(402).json({ success: false, error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing." });
+        return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
       }
     }
     req.account = { remainingCredits };
@@ -95,7 +95,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
   next();
 }

-function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
+export function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
   return (req, res, next) => {
     controller(req, res)
       .catch(err => next(err))
@@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../../src/lib/timeout";
 import { Logger } from "../../../src/lib/logger";
-
+import https from "https";
 export class WebCrawler {
   private jobId: string;
   private initialUrl: string;
@@ -145,8 +145,14 @@ export class WebCrawler {
       .slice(0, limit);
   }

-  public async getRobotsTxt(): Promise<string> {
-    const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
+  public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
+    let extraArgs = {};
+    if(skipTlsVerification) {
+      extraArgs["httpsAgent"] = new https.Agent({
+        rejectUnauthorized: false
+      });
+    }
+    const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
     return response.data;
   }

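The same technique in isolation, as a minimal standalone sketch (function name and timeout value are assumptions): a per-request https.Agent with rejectUnauthorized: false lets axios fetch from hosts whose TLS certificates would otherwise fail verification. As in the hunk above, it should only be enabled when the caller explicitly opts in via a flag like skipTlsVerification.

// Minimal standalone sketch of the pattern used above; names and timeout are assumed.
import axios from "axios";
import https from "https";

async function fetchTextSkippingTlsChecks(url: string): Promise<string> {
  // Disables certificate verification for this request only; use with care.
  const httpsAgent = new https.Agent({ rejectUnauthorized: false });
  const response = await axios.get(url, { timeout: 10000, httpsAgent });
  return response.data;
}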
@@ -594,6 +594,7 @@ export class WebScraperDataProvider {
       atsv: options.pageOptions?.atsv ?? false,
       actions: options.pageOptions?.actions ?? undefined,
       geolocation: options.pageOptions?.geolocation ?? undefined,
+      skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
     };
     this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
     this.replaceAllPathsWithAbsolutePaths =
@@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
   waitFor = 0,
   screenshot = false,
   fullPageScreenshot = false,
-  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } },
+  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
   fireEngineOptions = {},
   headers,
   options,
@@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
   waitFor?: number;
   screenshot?: boolean;
   fullPageScreenshot?: boolean;
-  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } };
+  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
   fireEngineOptions?: FireEngineOptions;
   headers?: Record<string, string>;
   options?: any;
@@ -119,6 +119,7 @@ export async function scrapWithFireEngine({
         atsv: pageOptions?.atsv ?? false,
         scrollXPaths: pageOptions?.scrollXPaths ?? [],
         geolocation: pageOptions?.geolocation,
+        skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
         actions: actions,
       },
       {
@@ -157,6 +157,7 @@ export async function scrapSingleUrl(
     atsv: pageOptions.atsv ?? false,
     actions: pageOptions.actions ?? undefined,
     geolocation: pageOptions.geolocation ?? undefined,
+    skipTlsVerification: pageOptions.skipTlsVerification ?? false,
   }

   if (extractorOptions) {
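Taken together, these hunks thread the new skipTlsVerification page option from the v1 request schema down through the scrapers. A hedged sketch of what a request using it might look like; the endpoint path, headers, and response handling are assumptions based on the schema change above, and the flag defaults to false when omitted.

// Hedged request sketch; URL, auth header, and response shape are assumptions.
async function scrapeSiteWithUntrustedCert(apiUrl: string, apiKey: string) {
  const res = await fetch(`${apiUrl}/v1/scrape`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      url: "https://self-signed.example.com",
      formats: ["markdown"],
      skipTlsVerification: true, // new option; defaults to false in the zod schema
    }),
  });
  return res.json();
}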
@@ -81,8 +81,10 @@ class FirecrawlApp:
             response = response.json()
             if response['success'] and 'data' in response:
                 return response['data']
-            else:
+            elif "error" in response:
                 raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
+            else:
+                raise Exception(f'Failed to scrape URL. Error: {response}')
         else:
             self._handle_error(response, 'scrape URL')
@@ -266,8 +268,10 @@ class FirecrawlApp:
             response = response.json()
             if response['success'] and 'links' in response:
                 return response
-            else:
+            elif 'error' in response:
                 raise Exception(f'Failed to map URL. Error: {response["error"]}')
+            else:
+                raise Exception(f'Failed to map URL. Error: {response}')
         else:
             self._handle_error(response, 'map')
examples/grok_web_crawler/grok_web_crawler.py | 150 (new file)

@@ -0,0 +1,150 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import requests

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
grok_api_key = os.getenv("GROK_API_KEY")

# Initialize the FirecrawlApp
app = FirecrawlApp(api_key=firecrawl_api_key)

# Function to make Grok API calls
def grok_completion(prompt):
    url = "https://api.x.ai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {grok_api_key}"
    }
    data = {
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "model": "grok-beta",
        "stream": False,
        "temperature": 0
    }
    response = requests.post(url, headers=headers, json=data)
    return response.json()['choices'][0]['message']['content']

# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app):
    try:
        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")

        map_prompt = f"""
        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
        """

        print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
        map_search_parameter = grok_completion(map_prompt)
        print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")

        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        print(f"{Colors.MAGENTA}{map_search_parameter}{Colors.RESET}")
        map_website = app.map_url(url, params={"search": map_search_parameter})
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
        print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
        print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}")
        return map_website["links"]
    except Exception as e:
        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None

# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None
def find_objective_in_top_pages(map_website, objective, app):
    try:
        print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}")
        # Get top 3 links from the map result
        top_links = map_website[:3] if isinstance(map_website, list) else []
        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")

        for link in top_links:
            print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}")
            # Scrape the page
            scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
            print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}")

            # Check if objective is met
            check_prompt = f"""
            Given the following scraped content and objective, determine if the objective is met.
            If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
            If the objective is not met with confidence, respond with 'Objective not met'.

            Objective: {objective}
            Scraped content: {scrape_result['markdown']}

            Remember:
            1. Only return JSON if you are confident the objective is fully met.
            2. Keep the JSON structure as simple and flat as possible.
            3. Do not include any explanations or markdown formatting in your response.
            """

            result = grok_completion(check_prompt)
            print(f"{Colors.MAGENTA}{result}{Colors.RESET}")
            if result != "Objective not met":
                print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
                try:
                    result = result.replace("```json", "").replace("```", "")
                    return json.loads(result)
                except json.JSONDecodeError:
                    print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
            else:
                print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")

        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
        return None

    except Exception as e:
        print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
        return None

# Main function to execute the process
def main():
    # Get user input
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")

    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    # Find the relevant page
    map_website = find_relevant_page_via_map(objective, url, app)

    if map_website:
        print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")
        # Find objective in top pages
        result = find_objective_in_top_pages(map_website, objective, app)

        if result:
            print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
            print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
        else:
            print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
    else:
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")

if __name__ == "__main__":
    main()
@@ -0,0 +1,3 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=
SERP_API_KEY=

examples/openai_swarm_firecrawl_web_extractor/main.py | 120 (new file)

@@ -0,0 +1,120 @@
import os
from firecrawl import FirecrawlApp
from swarm import Agent
from swarm.repl import run_demo_loop
import dotenv
from serpapi import GoogleSearch
from openai import OpenAI

dotenv.load_dotenv()

# Initialize FirecrawlApp and OpenAI
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def search_google(query, objective):
    """Search Google using SerpAPI."""
    print(f"Parameters: query={query}, objective={objective}")
    search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
    results = search.get_dict().get("organic_results", [])
    return {"objective": objective, "results": results}

def map_url_pages(url, objective):
    """Map a website's pages using Firecrawl."""

    search_query = generate_completion(
        "website search query generator",
        f"Generate a 1-2 word search query for the website: {url} based on the objective",
        "Objective: " + objective
    )
    print(f"Parameters: url={url}, objective={objective}, search_query={search_query}")
    map_status = app.map_url(url, params={'search': search_query})
    if map_status.get('status') == 'success':
        links = map_status.get('links', [])
        top_link = links[0] if links else None
        return {"objective": objective, "results": [top_link] if top_link else []}
    else:
        return {"objective": objective, "results": []}

def scrape_url(url, objective):
    """Scrape a website using Firecrawl."""
    print(f"Parameters: url={url}, objective={objective}")
    scrape_status = app.scrape_url(
        url,
        params={'formats': ['markdown']}
    )
    return {"objective": objective, "results": scrape_status}

def analyze_website_content(content, objective):
    """Analyze the scraped website content using OpenAI."""
    print(f"Parameters: content={content[:50]}..., objective={objective}")
    analysis = generate_completion(
        "website data extractor",
        f"Analyze the following website content and extract a JSON object based on the objective.",
        "Objective: " + objective + "\nContent: " + content
    )
    return {"objective": objective, "results": analysis}

def generate_completion(role, task, content):
    """Generate a completion using OpenAI."""
    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"You are a {role}. {task}"},
            {"role": "user", "content": content}
        ]
    )
    return response.choices[0].message.content

def handoff_to_search_google():
    """Hand off the search query to the search google agent."""
    return google_search_agent

def handoff_to_map_url():
    """Hand off the url to the map url agent."""
    return map_url_agent

def handoff_to_website_scraper():
    """Hand off the url to the website scraper agent."""
    return website_scraper_agent

def handoff_to_analyst():
    """Hand off the website content to the analyst agent."""
    return analyst_agent


user_interface_agent = Agent(
    name="User Interface Agent",
    instructions="You are a user interface agent that handles all interactions with the user. You need to always start with an web data extraction objective that the user wants to achieve by searching the web, mapping the web pages, and extracting the content from a specific page. Be concise.",
    functions=[handoff_to_search_google],
)

google_search_agent = Agent(
    name="Google Search Agent",
    instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the map agent.",
    functions=[search_google, handoff_to_map_url],
)

map_url_agent = Agent(
    name="Map URL Agent",
    instructions="You are a map url agent specialized in mapping the web pages. When you are done, you must hand off the results to the website scraper agent.",
    functions=[map_url_pages, handoff_to_website_scraper],
)

website_scraper_agent = Agent(
    name="Website Scraper Agent",
    instructions="You are a website scraper agent specialized in scraping website content. When you are done, you must hand off the website content to the analyst agent to extract the data based on the objective.",
    functions=[scrape_url, handoff_to_analyst],
)

analyst_agent = Agent(
    name="Analyst Agent",
    instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must return a JSON object.",
    functions=[analyze_website_content],
)

if __name__ == "__main__":
    # Run the demo loop with the user interface agent
    run_demo_loop(user_interface_agent, stream=True)
@@ -0,0 +1,4 @@
firecrawl-py
openai
google-search-results
git+https://github.com/openai/swarm.git

examples/sales_web_crawler/.env.example | 3 (new file)

@@ -0,0 +1,3 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=
SERP_API_KEY=
examples/sales_web_crawler/app.py | 78 (new file)

@@ -0,0 +1,78 @@
import csv
import json
import os

from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI
from serpapi import GoogleSearch
from swarm import Agent
from swarm.repl import run_demo_loop

load_dotenv()

# Initialize FirecrawlApp and OpenAI
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def crawl_and_analyze_url(url, objective):
    """Crawl a website using Firecrawl and analyze the content."""
    print(f"Parameters: url={url}, objective={objective}")
    # Crawl the website
    crawl_status = app.crawl_url(
        url,
        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}},
        poll_interval=5
    )
    crawl_status = crawl_status['data']
    # Process each 'markdown' element individually
    combined_results = []
    for item in crawl_status:
        if 'markdown' in item:
            content = item['markdown']
            # Analyze the content
            analysis = generate_completion(
                "website data extractor",
                f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
                "Objective: " + objective + "\nContent: " + content
            )
            # Parse the JSON result
            try:
                result = json.loads(analysis)
                combined_results.append(result)
            except json.JSONDecodeError:
                print(f"Could not parse JSON from analysis: {analysis}")
    # Combine the results
    return {"objective": objective, "results": combined_results}

def generate_completion(role, task, content):
    """Generate a completion using OpenAI."""
    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"You are a {role}. {task}"},
            {"role": "user", "content": content}
        ]
    )
    return response.choices[0].message.content

def handoff_to_crawl_url():
    """Hand off the url to the crawl url agent."""
    return crawl_website_agent

user_interface_agent = Agent(
    name="User Interface Agent",
    instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.",
    functions=[handoff_to_crawl_url],
)

crawl_website_agent = Agent(
    name="Crawl Website Agent",
    instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.",
    functions=[crawl_and_analyze_url],
)

if __name__ == "__main__":
    # Run the demo loop with the user interface agent
    run_demo_loop(user_interface_agent, stream=True)
examples/sales_web_crawler/requirements.txt | 4 (new file)

@@ -0,0 +1,4 @@
firecrawl-py
openai
google-search-results
git+https://github.com/openai/swarm.git