mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00

Merge branch 'main' into nsc/pay-as-you-go-lw2
This commit is contained in commit 29b34270c8
.gitignore (vendored): 1 addition

@@ -29,3 +29,4 @@ apps/js-sdk/firecrawl/dist
 /examples/o1_web_crawler/firecrawl_env
 /examples/crm_lead_enrichment/crm_lead_enrichment_env
 /.venv
+/examples/claude_web_crawler/firecrawl_env
@@ -13,7 +13,7 @@ import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
 import { sendNotification } from "../services/notification/email_notification";
 import { Logger } from "../lib/logger";
 import { redlock } from "../services/redlock";
-import { getValue } from "../services/redis";
+import { deleteKey, getValue } from "../services/redis";
 import { setValue } from "../services/redis";
 import { validate } from "uuid";
 import * as Sentry from "@sentry/node";

@@ -134,6 +134,13 @@ export async function getACUC(
   }
 }
 
+export async function clearACUC(
+  api_key: string,
+): Promise<void> {
+  const cacheKeyACUC = `acuc_${api_key}`;
+  await deleteKey(cacheKeyACUC);
+}
+
 export async function authenticateUser(
   req,
   res,
apps/api/src/controllers/v0/admin/acuc-cache-clear.ts (new file, 22 lines)

@@ -0,0 +1,22 @@
+import { Request, Response } from "express";
+import { supabase_service } from "../../../services/supabase";
+import { clearACUC } from "../../auth";
+import { Logger } from "../../../lib/logger";
+
+export async function acucCacheClearController(req: Request, res: Response) {
+  try {
+    const team_id: string = req.body.team_id;
+
+    const keys = await supabase_service
+      .from("api_keys")
+      .select("*")
+      .eq("team_id", team_id);
+
+    await Promise.all(keys.data.map((x) => clearACUC(x.key)));
+
+    res.json({ ok: true });
+  } catch (error) {
+    Logger.error(`Error clearing ACUC cache via API route: ${error}`);
+    res.status(500).json({ error: "Internal server error" });
+  }
+}
apps/api/src/controllers/v1/batch-scrape.ts (new file, 99 lines)

@@ -0,0 +1,99 @@
+import { Response } from "express";
+import { v4 as uuidv4 } from "uuid";
+import {
+  BatchScrapeRequest,
+  batchScrapeRequestSchema,
+  CrawlResponse,
+  legacyScrapeOptions,
+  RequestWithAuth,
+} from "./types";
+import {
+  addCrawlJobs,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../lib/crawl-redis";
+import { logCrawl } from "../../services/logging/crawl_log";
+import { getScrapeQueue } from "../../services/queue-service";
+import { getJobPriority } from "../../lib/job-priority";
+
+export async function batchScrapeController(
+  req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
+  res: Response<CrawlResponse>
+) {
+  req.body = batchScrapeRequestSchema.parse(req.body);
+
+  const id = uuidv4();
+
+  await logCrawl(id, req.auth.team_id);
+
+  let { remainingCredits } = req.account;
+  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+  if (!useDbAuthentication) {
+    remainingCredits = Infinity;
+  }
+
+  const pageOptions = legacyScrapeOptions(req.body);
+
+  const sc: StoredCrawl = {
+    crawlerOptions: null,
+    pageOptions,
+    team_id: req.auth.team_id,
+    createdAt: Date.now(),
+    plan: req.auth.plan,
+  };
+
+  await saveCrawl(id, sc);
+
+  let jobPriority = 20;
+
+  // If there are more than 1000 URLs, compute the job priority;
+  // otherwise use the default priority of 20
+  if (req.body.urls.length > 1000) {
+    // set base to 21
+    jobPriority = await getJobPriority({ plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21 });
+  }
+
+  const jobs = req.body.urls.map((x) => {
+    const uuid = uuidv4();
+    return {
+      name: uuid,
+      data: {
+        url: x,
+        mode: "single_urls",
+        team_id: req.auth.team_id,
+        plan: req.auth.plan,
+        crawlerOptions: null,
+        pageOptions,
+        origin: "api",
+        crawl_id: id,
+        sitemapped: true,
+        v1: true,
+      },
+      opts: {
+        jobId: uuid,
+        priority: jobPriority, // apply the computed job priority
+      },
+    };
+  });
+
+  await lockURLs(
+    id,
+    jobs.map((x) => x.data.url)
+  );
+  await addCrawlJobs(
+    id,
+    jobs.map((x) => x.opts.jobId)
+  );
+  await getScrapeQueue().addBulk(jobs);
+
+  const protocol = process.env.ENV === "local" ? req.protocol : "https";
+
+  return res.status(200).json({
+    success: true,
+    id,
+    url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
+  });
+}
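For orientation, here is a minimal sketch of calling the endpoint this controller serves, outside the SDK. The base URL, port, and API key are placeholders, not values taken from this change:

```ts
// Hypothetical direct request to the new batch scrape endpoint.
// Base URL and bearer token are placeholders.
const res = await fetch("http://localhost:3002/v1/batch/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR-API-KEY",
  },
  body: JSON.stringify({
    urls: ["https://firecrawl.dev", "https://mendable.ai"],
    formats: ["markdown"],
  }),
});

// The controller responds with { success, id, url }, where `url` points at
// /v1/batch/scrape/:id for status polling.
const { success, id, url } = await res.json();
console.log(success, id, url);
```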
@@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) {
   return jobs;
 }
 
-export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
+export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>, isBatch = false) {
   const sc = await getCrawl(req.params.jobId);
   if (!sc) {
     return res.status(404).json({ success: false, error: "Job not found" });

@@ -113,7 +113,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
   const data = doneJobs.map(x => x.returnvalue);
 
   const protocol = process.env.ENV === "local" ? req.protocol : "https";
-  const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
+  const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`);
 
   nextURL.searchParams.set("skip", (start + data.length).toString());
@@ -78,7 +78,7 @@ export async function crawlController(
   const crawler = crawlToCrawler(id, sc);
 
   try {
-    sc.robots = await crawler.getRobotsTxt();
+    sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
   } catch (e) {
     Logger.debug(
       `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
@@ -117,6 +117,7 @@ export const scrapeOptions = z.object({
     }
   ).transform(val => val ? val.toUpperCase() : 'US')
   }).optional(),
+  skipTlsVerification: z.boolean().default(false),
 }).strict(strictMessage)
 

@@ -141,19 +142,29 @@ export const scrapeRequestSchema = scrapeOptions.extend({
   return obj;
 });
 
-// export type ScrapeRequest = {
-//   url: string;
-//   formats?: Format[];
-//   headers?: { [K: string]: string };
-//   includeTags?: string[];
-//   excludeTags?: string[];
-//   onlyMainContent?: boolean;
-//   timeout?: number;
-//   waitFor?: number;
-// }
-
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 
+export const batchScrapeRequestSchema = scrapeOptions.extend({
+  urls: url.array(),
+  origin: z.string().optional().default("api"),
+}).strict(strictMessage).refine(
+  (obj) => {
+    const hasExtractFormat = obj.formats?.includes("extract");
+    const hasExtractOptions = obj.extract !== undefined;
+    return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
+  },
+  {
+    message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+  }
+).transform((obj) => {
+  if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
+    return { ...obj, timeout: 60000 };
+  }
+  return obj;
+});
+
+export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
+
 const crawlerOptions = z.object({
   includePaths: z.string().array().default([]),
   excludePaths: z.string().array().default([]),

@@ -433,6 +444,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
     parsePDF: x.parsePDF,
     actions: x.actions as Action[], // no strict null checking grrrr - mogery
     geolocation: x.geolocation,
+    skipTlsVerification: x.skipTlsVerification
   };
 }
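To make the refine rule above concrete, here is a small validation sketch. It assumes the existing `extract` options accept a `prompt` field, as elsewhere in the v1 schema; the option values are illustrative only:

```ts
// Requesting the "extract" format together with extract options satisfies the
// refine rule, so parsing succeeds (and the accompanying transform defaults the
// timeout to 60000 when no timeout is present on the parsed object).
const ok = batchScrapeRequestSchema.parse({
  urls: ["https://firecrawl.dev"],
  formats: ["extract"],
  extract: { prompt: "List the page's main headings" },
});

// Requesting "extract" without extract options violates the rule and fails
// with the "When 'extract' format is specified..." message.
const bad = batchScrapeRequestSchema.safeParse({
  urls: ["https://firecrawl.dev"],
  formats: ["extract"],
});
console.log(bad.success); // false
```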
@@ -3,7 +3,7 @@ import { redisConnection } from "../services/queue-service";
 import { Logger } from "./logger";
 
 export type StoredCrawl = {
-  originUrl: string;
+  originUrl?: string;
   crawlerOptions: any;
   pageOptions: any;
   team_id: string;
@@ -54,6 +54,7 @@ export type PageOptions = {
   geolocation?: {
     country?: string;
   };
+  skipTlsVerification?: boolean;
 };
 
 export type ExtractorOptions = {
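The PageOptions flag above is what the `skipTlsVerification` field added to the v1 schema earlier ends up mapping into. As a hedged usage sketch, a caller could opt in per request like this (placeholder host and key; the target URL is just an example of a host with a self-signed certificate):

```ts
// Sketch: opting into skipTlsVerification on a v1 scrape request.
// Host and API key are placeholders.
await fetch("http://localhost:3002/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR-API-KEY",
  },
  body: JSON.stringify({
    url: "https://self-signed.internal.example",
    formats: ["markdown"],
    skipTlsVerification: true, // defaults to false per the schema change above
  }),
});
```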
@@ -112,7 +112,7 @@ export async function runWebScraper({
   }
 
   // remove docs with empty content
-  const filteredDocs = crawlerOptions.returnOnlyUrls
+  const filteredDocs = crawlerOptions?.returnOnlyUrls
     ? docs.map((doc) => {
         if (doc.metadata.sourceURL) {
           return { url: doc.metadata.sourceURL };
@@ -6,6 +6,8 @@ import {
   cleanBefore24hCompleteJobsController,
   queuesController,
 } from "../controllers/v0/admin/queue";
+import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
+import { wrap } from "./v1";
 
 export const adminRouter = express.Router();
 

@@ -33,3 +35,8 @@ adminRouter.get(
   `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
   autoscalerController
 );
+
+adminRouter.post(
+  `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
+  wrap(acucCacheClearController),
+);
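A hedged sketch of invoking the new admin route; the host, BULL_AUTH_KEY, and team_id values are placeholders, and it assumes JSON body parsing is enabled for admin routes:

```ts
// Sketch: clearing cached ACUC entries for every API key belonging to a team.
const adminKey = process.env.BULL_AUTH_KEY ?? "replace-me"; // placeholder

await fetch(`http://localhost:3002/admin/${adminKey}/acuc-cache-clear`, {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  // acucCacheClearController looks up the team's api_keys in Supabase and
  // calls clearACUC for each key.
  body: JSON.stringify({ team_id: "00000000-0000-0000-0000-000000000000" }),
});
```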
@@ -17,6 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel";
 import { Logger } from "../lib/logger";
 import { scrapeStatusController } from "../controllers/v1/scrape-status";
 import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
+import { batchScrapeController } from "../controllers/v1/batch-scrape";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";

@@ -29,7 +30,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
   return (req, res, next) => {
     (async () => {
       if (!minimum && req.body) {
-        minimum = (req.body as any)?.limit ?? 1;
+        minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
       }
       const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
       req.acuc = chunk;

@@ -94,7 +95,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
   next();
 }
 
-function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
+export function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
   return (req, res, next) => {
     controller(req, res)
       .catch(err => next(err))

@@ -122,6 +123,15 @@ v1Router.post(
   wrap(crawlController)
 );
 
+v1Router.post(
+  "/batch/scrape",
+  authMiddleware(RateLimiterMode.Crawl),
+  checkCreditsMiddleware(),
+  blocklistMiddleware,
+  idempotencyMiddleware,
+  wrap(batchScrapeController)
+);
+
 v1Router.post(
   "/map",
   authMiddleware(RateLimiterMode.Map),

@@ -136,6 +146,13 @@ v1Router.get(
   wrap(crawlStatusController)
 );
 
+v1Router.get(
+  "/batch/scrape/:jobId",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  // Yes, it uses the same controller as the normal crawl status controller
+  wrap((req: any, res): any => crawlStatusController(req, res, true))
+);
+
 v1Router.get(
   "/scrape/:jobId",
   wrap(scrapeStatusController)
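Because the batch status route reuses crawlStatusController with isBatch set to true, it paginates the same way crawl status does, via a `next` URL carrying a `skip` parameter. A hedged polling sketch against the new GET route (placeholder host and key):

```ts
// Sketch: polling GET /v1/batch/scrape/:jobId until the job completes,
// then following the skip-based pagination to collect every document.
async function waitForBatch(jobId: string): Promise<any[]> {
  const headers = { Authorization: "Bearer fc-YOUR-API-KEY" }; // placeholder
  const docs: any[] = [];
  let url: string | undefined = `http://localhost:3002/v1/batch/scrape/${jobId}`;

  while (url) {
    const status: any = await (await fetch(url, { headers })).json();
    if (status.status === "scraping") {
      await new Promise((r) => setTimeout(r, 2000)); // the SDK's default poll interval
      continue;
    }
    if (status.status !== "completed") {
      throw new Error(`batch scrape ended with status ${status.status}`);
    }
    docs.push(...(status.data ?? []));
    url = status.next; // undefined once the last page has been returned
  }
  return docs;
}
```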
@@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../../src/lib/timeout";
 import { Logger } from "../../../src/lib/logger";
-
+import https from "https";
 export class WebCrawler {
   private jobId: string;
   private initialUrl: string;

@@ -145,8 +145,14 @@ export class WebCrawler {
       .slice(0, limit);
   }
 
-  public async getRobotsTxt(): Promise<string> {
-    const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
+  public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
+    let extraArgs = {};
+    if (skipTlsVerification) {
+      extraArgs["httpsAgent"] = new https.Agent({
+        rejectUnauthorized: false
+      });
+    }
+    const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
     return response.data;
   }
 
@@ -594,6 +594,7 @@ export class WebScraperDataProvider {
       atsv: options.pageOptions?.atsv ?? false,
       actions: options.pageOptions?.actions ?? undefined,
       geolocation: options.pageOptions?.geolocation ?? undefined,
+      skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
     };
     this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
     this.replaceAllPathsWithAbsolutePaths =
@@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
   waitFor = 0,
   screenshot = false,
   fullPageScreenshot = false,
-  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } },
+  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
   fireEngineOptions = {},
   headers,
   options,

@@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
   waitFor?: number;
   screenshot?: boolean;
   fullPageScreenshot?: boolean;
-  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } };
+  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
   fireEngineOptions?: FireEngineOptions;
   headers?: Record<string, string>;
   options?: any;

@@ -119,6 +119,7 @@ export async function scrapWithFireEngine({
       atsv: pageOptions?.atsv ?? false,
       scrollXPaths: pageOptions?.scrollXPaths ?? [],
       geolocation: pageOptions?.geolocation,
+      skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
       actions: actions,
     },
     {
@@ -157,6 +157,7 @@ export async function scrapSingleUrl(
     atsv: pageOptions.atsv ?? false,
     actions: pageOptions.actions ?? undefined,
     geolocation: pageOptions.geolocation ?? undefined,
+    skipTlsVerification: pageOptions.skipTlsVerification ?? false,
   }
 
   if (extractorOptions) {
@@ -329,7 +329,8 @@ async function processJob(job: Job, token: string) {
         job.id as string,
         data,
         job.data.webhook,
-        job.data.v1
+        job.data.v1,
+        job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
       );
     }
     if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {

@@ -339,7 +340,7 @@ async function processJob(job: Job, token: string) {
         data,
         job.data.webhook,
         job.data.v1,
-        "crawl.page",
+        job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
         true
       );
     }

@@ -365,7 +366,7 @@ async function processJob(job: Job, token: string) {
 
       const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
 
-      if (!job.data.sitemapped) {
+      if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
         if (!sc.cancelled) {
           const crawler = crawlToCrawler(job.data.crawl_id, sc);
 

@@ -415,8 +416,6 @@ async function processJob(job: Job, token: string) {
       }
 
       if (await finishCrawl(job.data.crawl_id)) {
-
-
         if (!job.data.v1) {
           const jobIDs = await getCrawlJobs(job.data.crawl_id);
 

@@ -439,7 +438,7 @@ async function processJob(job: Job, token: string) {
             docs: [],
             time_taken: (Date.now() - sc.createdAt) / 1000,
             team_id: job.data.team_id,
-            mode: "crawl",
+            mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
             url: sc.originUrl,
             crawlerOptions: sc.crawlerOptions,
             pageOptions: sc.pageOptions,

@@ -469,7 +468,7 @@ async function processJob(job: Job, token: string) {
             data,
             job.data.webhook,
             job.data.v1,
-            "crawl.completed"
+            job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed"
           );
         }
       } else {

@@ -487,7 +486,7 @@ async function processJob(job: Job, token: string) {
             [],
             job.data.webhook,
             job.data.v1,
-            "crawl.completed"
+            job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed"
           );
         }
 

@@ -499,7 +498,7 @@ async function processJob(job: Job, token: string) {
           docs: [],
           time_taken: (Date.now() - sc.createdAt) / 1000,
           team_id: job.data.team_id,
-          mode: "crawl",
+          mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
           url: sc.originUrl,
           crawlerOptions: sc.crawlerOptions,
           pageOptions: sc.pageOptions,

@@ -556,7 +555,8 @@ async function processJob(job: Job, token: string) {
         job.data.crawl_id ?? (job.id as string),
         data,
         job.data.webhook,
-        job.data.v1
+        job.data.v1,
+        job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
       );
     }
     // if (job.data.v1) {

@@ -605,7 +605,7 @@ async function processJob(job: Job, token: string) {
       docs: [],
       time_taken: 0,
       team_id: job.data.team_id,
-      mode: "crawl",
+      mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
       url: sc ? sc.originUrl : job.data.url,
       crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
       pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
@@ -161,4 +161,4 @@ export type PlanType =
   | "";
 
 
-export type WebhookEventType = "crawl.page" | "crawl.started" | "crawl.completed" | "crawl.failed";
+export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";
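Webhook consumers can now receive batch_scrape.page and batch_scrape.completed alongside the crawl events. The sketch below is hypothetical: the payload field names (the `type` field in particular) are assumptions for illustration, not taken from this diff:

```ts
import express from "express";

// Hypothetical webhook receiver distinguishing batch scrape events from
// crawl events. The shape of req.body is assumed for illustration.
const server = express();
server.use(express.json());

server.post("/firecrawl-webhook", (req, res) => {
  const type = (req.body as { type?: string }).type;

  switch (type) {
    case "crawl.page":
    case "batch_scrape.page":
      // one page finished scraping
      break;
    case "crawl.completed":
    case "batch_scrape.completed":
      // the whole job finished
      break;
    case "crawl.started":
    case "crawl.failed":
    default:
      break;
  }
  res.sendStatus(200);
});

server.listen(8080);
```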
@@ -145,6 +145,46 @@ watch.addEventListener("done", state => {
 });
 ```
 
+### Batch scraping multiple URLs
+
+To batch scrape multiple URLs, use the `batchScrapeUrls` method. It takes the URLs to scrape and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
+
+```js
+const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
+  formats: ['markdown', 'html'],
+})
+```
+
+#### Asynchronous batch scrape
+
+To initiate an asynchronous batch scrape, use the `asyncBatchScrapeUrls` method. It takes the starting URLs and optional parameters as inputs. The `params` argument lets you define settings for the scrape, such as the output formats. On success, it returns an ID that you can use to check the status of the batch scrape.
+
+```js
+const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
+```
+
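Not part of the README addition itself, but worth noting alongside it: the ID returned by `asyncBatchScrapeUrls` can be checked with `checkBatchScrapeStatus`, which this change also adds to the SDK. A short illustrative sketch:

```js
// Illustrative only: check on the job started above. Passing true as the
// second argument pages through `next` links once the job is completed.
const status = await app.checkBatchScrapeStatus(asyncBatchScrapeResult.id, true);
console.log(status.status, status.completed, status.total);
```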
+#### Batch scrape with WebSockets
+
+To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
+
+```js
+// Batch scrape multiple URLs with WebSockets:
+const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
+
+watch.addEventListener("document", doc => {
+  console.log("DOC", doc.detail);
+});
+
+watch.addEventListener("error", err => {
+  console.error("ERR", err.detail.error);
+});
+
+watch.addEventListener("done", state => {
+  console.log("DONE", state.detail.status);
+});
+```
+
 ## Error Handling
 
 The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.
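Since the paragraph above refers to `try/catch` handling that the snippets do not show, here is a minimal illustrative example (not part of the original README text):

```js
try {
  const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev'], {
    formats: ['markdown'],
  });
  console.log(batchScrapeResponse);
} catch (error) {
  // FirecrawlError carries the API's error message and status code
  console.error(error.message);
}
```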
@@ -1,6 +1,6 @@
 {
-  "name": "firecrawl",
-  "version": "1.6.1",
+  "name": "@mendable/firecrawl-js",
+  "version": "1.7.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -154,6 +154,17 @@ export interface CrawlResponse {
   error?: string;
 }
 
+/**
+ * Response interface for batch scrape operations.
+ * Defines the structure of the response received after initiating a batch scrape.
+ */
+export interface BatchScrapeResponse {
+  id?: string;
+  url?: string;
+  success: true;
+  error?: string;
+}
+
 /**
  * Response interface for job status checks.
  * Provides detailed status of a crawl job including progress and results.

@@ -169,6 +180,21 @@ export interface CrawlStatusResponse {
   data: FirecrawlDocument<undefined>[];
 };
 
+/**
+ * Response interface for batch scrape job status checks.
+ * Provides detailed status of a batch scrape job including progress and results.
+ */
+export interface BatchScrapeStatusResponse {
+  success: true;
+  status: "scraping" | "completed" | "failed" | "cancelled";
+  completed: number;
+  total: number;
+  creditsUsed: number;
+  expiresAt: Date;
+  next?: string;
+  data: FirecrawlDocument<undefined>[];
+};
+
 /**
  * Parameters for mapping operations.
  * Defines options for mapping URLs during a crawl.
@@ -493,6 +519,144 @@ export default class FirecrawlApp {
     return { success: false, error: "Internal server error." };
   }
 
+  /**
+   * Initiates a batch scrape job for multiple URLs using the Firecrawl API.
+   * @param urls - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param pollInterval - Time in seconds between job status checks.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns The response from the batch scrape operation.
+   */
+  async batchScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    pollInterval: number = 2,
+    idempotencyKey?: string
+  ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, ...(params ?? {}) };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const id: string = response.data.id;
+        return this.monitorJobStatus(id, headers, pollInterval);
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  async asyncBatchScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    idempotencyKey?: string
+  ): Promise<BatchScrapeResponse | ErrorResponse> {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, ...(params ?? {}) };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/batch/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start batch scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
+   * @param urls - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns A CrawlWatcher instance to monitor the batch scrape job.
+   */
+  async batchScrapeUrlsAndWatch(
+    urls: string[],
+    params?: ScrapeParams,
+    idempotencyKey?: string,
+  ) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
+
+    if (crawl.success && crawl.id) {
+      const id = crawl.id;
+      return new CrawlWatcher(id, this);
+    }
+
+    throw new FirecrawlError("Batch scrape job failed to start", 400);
+  }
+
+  /**
+   * Checks the status of a batch scrape job using the Firecrawl API.
+   * @param id - The ID of the batch scrape operation.
+   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
+   * @returns The response containing the job status.
+   */
+  async checkBatchScrapeStatus(id?: string, getAllData = false): Promise<BatchScrapeStatusResponse | ErrorResponse> {
+    if (!id) {
+      throw new FirecrawlError("No batch scrape ID provided", 400);
+    }
+
+    const headers: AxiosRequestHeaders = this.prepareHeaders();
+    try {
+      const response: AxiosResponse = await this.getRequest(
+        `${this.apiUrl}/v1/batch/scrape/${id}`,
+        headers
+      );
+      if (response.status === 200) {
+        let allData = response.data.data;
+        if (getAllData && response.data.status === "completed") {
+          let statusData = response.data;
+          if ("data" in statusData) {
+            let data = statusData.data;
+            while ('next' in statusData) {
+              statusData = (await this.getRequest(statusData.next, headers)).data;
+              data = data.concat(statusData.data);
+            }
+            allData = data;
+          }
+        }
+        return ({
+          success: response.data.success,
+          status: response.data.status,
+          total: response.data.total,
+          completed: response.data.completed,
+          creditsUsed: response.data.creditsUsed,
+          expiresAt: new Date(response.data.expiresAt),
+          next: response.data.next,
+          data: allData,
+          error: response.data.error,
+        })
+      } else {
+        this.handleError(response, "check batch scrape status");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
   /**
    * Prepares the headers for an API request.
    * @param idempotencyKey - Optional key to ensure idempotency.
apps/js-sdk/package-lock.json (generated)

@@ -9,7 +9,7 @@
       "version": "1.0.0",
       "license": "ISC",
       "dependencies": {
-        "@mendable/firecrawl-js": "^1.0.3",
+        "@mendable/firecrawl-js": "^1.7.0-beta.2",
         "axios": "^1.6.8",
         "firecrawl": "^1.2.0",
         "ts-node": "^10.9.2",

@@ -423,31 +423,17 @@
       }
     },
     "node_modules/@mendable/firecrawl-js": {
-      "version": "1.2.2",
-      "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz",
-      "integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==",
+      "version": "1.7.0-beta.2",
+      "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.7.0-beta.2.tgz",
+      "integrity": "sha512-6L5r6BOuMPjLgSDq85xs2IpVgX9Tb/EdesKZvmtFucoaFZzIsgCQb0ZfSvwaRmqTkj53o+7eSgCcm+gsnR/yeQ==",
       "dependencies": {
         "axios": "^1.6.8",
         "dotenv": "^16.4.5",
         "isows": "^1.0.4",
         "typescript-event-target": "^1.1.1",
-        "uuid": "^9.0.1",
         "zod": "^3.23.8",
         "zod-to-json-schema": "^3.23.0"
       }
     },
-    "node_modules/@mendable/firecrawl-js/node_modules/uuid": {
-      "version": "9.0.1",
-      "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
-      "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
-      "funding": [
-        "https://github.com/sponsors/broofa",
-        "https://github.com/sponsors/ctavan"
-      ],
-      "bin": {
-        "uuid": "dist/bin/uuid"
-      }
-    },
     "node_modules/@tsconfig/node10": {
       "version": "1.0.11",
       "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz",
@@ -11,7 +11,7 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "@mendable/firecrawl-js": "^1.0.3",
+    "@mendable/firecrawl-js": "1.7.1",
     "axios": "^1.6.8",
     "firecrawl": "^1.2.0",
     "ts-node": "^10.9.2",
@@ -149,6 +149,69 @@ async def start_crawl_and_watch():
 await start_crawl_and_watch()
 ```
 
+### Scraping multiple URLs in batch
+
+To batch scrape multiple URLs, use the `batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
+
+```python
+idempotency_key = str(uuid.uuid4()) # optional idempotency key
+batch_scrape_result = app.batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key)
+print(batch_scrape_result)
+```
+
+### Asynchronous batch scrape
+
+To run a batch scrape asynchronously, use the `async_batch_scrape_urls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
+
+```python
+batch_scrape_result = app.async_batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
+print(batch_scrape_result)
+```
+
+### Checking batch scrape status
+
+To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_status` method. It takes the job ID as a parameter and returns the current status of the batch scrape job.
+
+```python
+id = batch_scrape_result['id']
+status = app.check_batch_scrape_status(id)
+```
+
+### Batch scrape with WebSockets
+
+To use batch scrape with WebSockets, use the `batch_scrape_urls_and_watch` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
+
+```python
+# inside an async function...
+nest_asyncio.apply()
+
+# Define event handlers
+def on_document(detail):
+    print("DOC", detail)
+
+def on_error(detail):
+    print("ERR", detail['error'])
+
+def on_done(detail):
+    print("DONE", detail['status'])
+
+# Function to start the batch scrape and watch process
+async def start_crawl_and_watch():
+    # Initiate the batch scrape job and get the watcher
+    watcher = app.batch_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
+
+    # Add event listeners
+    watcher.add_event_listener("document", on_document)
+    watcher.add_event_listener("error", on_error)
+    watcher.add_event_listener("done", on_done)
+
+    # Start the watcher
+    await watcher.connect()
+
+# Run the event loop
+await start_crawl_and_watch()
+```
+
 ## Error Handling
 
 The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
@@ -9,6 +9,23 @@ app = FirecrawlApp(api_key="fc-")
 scrape_result = app.scrape_url('firecrawl.dev')
 print(scrape_result['markdown'])
 
+
+# Test batch scrape
+urls = ['https://example.com', 'https://docs.firecrawl.dev']
+batch_scrape_params = {
+    'formats': ['markdown', 'html'],
+}
+
+# Synchronous batch scrape
+batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
+print("Synchronous Batch Scrape Result:")
+print(batch_result['data'][0]['markdown'])
+
+# Asynchronous batch scrape
+async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
+print("\nAsynchronous Batch Scrape Result:")
+print(async_batch_result)
+
 # Crawl a website:
 idempotency_key = str(uuid.uuid4()) # optional idempotency key
 crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp
 
-__version__ = "1.3.1"
+__version__ = "1.4.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -275,6 +275,123 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'map')
 
+    def batch_scrape_urls(self, urls: list[str],
+                  params: Optional[Dict[str, Any]] = None,
+                  poll_interval: Optional[int] = 2,
+                  idempotency_key: Optional[str] = None) -> Any:
+        """
+        Initiate a batch scrape job for the specified URLs using the Firecrawl API.
+
+        Args:
+            urls (list[str]): The URLs to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
+            poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
+            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
+                - 'success' (bool): Indicates if the batch scrape was successful.
+                - 'status' (str): The final status of the batch scrape job (e.g., 'completed').
+                - 'completed' (int): Number of scraped pages that completed.
+                - 'total' (int): Total number of scraped pages.
+                - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
+                - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
+                - 'data' (List[Dict]): List of all the scraped pages.
+
+        Raises:
+            Exception: If the batch scrape job initiation or monitoring fails.
+        """
+        endpoint = f'/v1/batch/scrape'
+        headers = self._prepare_headers(idempotency_key)
+        json_data = {'urls': urls}
+        if params:
+            json_data.update(params)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        if response.status_code == 200:
+            id = response.json().get('id')
+            return self._monitor_job_status(id, headers, poll_interval)
+        else:
+            self._handle_error(response, 'start batch scrape job')
+
+    def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Initiate a batch scrape job asynchronously.
+
+        Args:
+            urls (list[str]): The URLs to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
+            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
+                - 'success' (bool): Indicates if the batch scrape initiation was successful.
+                - 'id' (str): The unique identifier for the batch scrape job.
+                - 'url' (str): The URL to check the status of the batch scrape job.
+        """
+        endpoint = f'/v1/batch/scrape'
+        headers = self._prepare_headers(idempotency_key)
+        json_data = {'urls': urls}
+        if params:
+            json_data.update(params)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        if response.status_code == 200:
+            return response.json()
+        else:
+            self._handle_error(response, 'start batch scrape job')
+
+    def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
+        """
+        Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
+
+        Args:
+            urls (list[str]): The URLs to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
+            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+
+        Returns:
+            CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
+        """
+        crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
+        if crawl_response['success'] and 'id' in crawl_response:
+            return CrawlWatcher(crawl_response['id'], self)
+        else:
+            raise Exception("Batch scrape job failed to start")
+
+    def check_batch_scrape_status(self, id: str) -> Any:
+        """
+        Check the status of a batch scrape job using the Firecrawl API.
+
+        Args:
+            id (str): The ID of the batch scrape job.
+
+        Returns:
+            Any: The status of the batch scrape job.
+
+        Raises:
+            Exception: If the status check request fails.
+        """
+        endpoint = f'/v1/batch/scrape/{id}'
+
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}{endpoint}', headers)
+        if response.status_code == 200:
+            data = response.json()
+            return {
+                'success': True,
+                'status': data.get('status'),
+                'total': data.get('total'),
+                'completed': data.get('completed'),
+                'creditsUsed': data.get('creditsUsed'),
+                'expiresAt': data.get('expiresAt'),
+                'next': data.get('next'),
+                'data': data.get('data'),
+                'error': data.get('error')
+            }
+        else:
+            self._handle_error(response, 'check batch scrape status')
+
     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.
examples/claude_web_crawler/claude_web_crawler.py (new file, 164 lines)

@@ -0,0 +1,164 @@
+import os
+from firecrawl import FirecrawlApp
+import json
+from dotenv import load_dotenv
+import anthropic
+
+# ANSI color codes
+class Colors:
+    CYAN = '\033[96m'
+    YELLOW = '\033[93m'
+    GREEN = '\033[92m'
+    RED = '\033[91m'
+    MAGENTA = '\033[95m'
+    BLUE = '\033[94m'
+    RESET = '\033[0m'
+
+# Load environment variables
+load_dotenv()
+
+# Retrieve API keys from environment variables
+firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
+anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
+
+# Initialize the FirecrawlApp and Anthropic client
+app = FirecrawlApp(api_key=firecrawl_api_key)
+client = anthropic.Anthropic(api_key=anthropic_api_key)
+
+# Find the page that most likely contains the objective
+def find_relevant_page_via_map(objective, url, app, client):
+    try:
+        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
+        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")
+
+        map_prompt = f"""
+        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
+        """
+
+        print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
+        completion = client.messages.create(
+            model="claude-3-5-sonnet-20241022",
+            max_tokens=1000,
+            temperature=0,
+            system="You are an expert web crawler. Respond with the best search parameter.",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": map_prompt
+                        }
+                    ]
+                }
+            ]
+        )
+
+        map_search_parameter = completion.content[0].text
+        print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")
+
+        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
+        map_website = app.map_url(url, params={"search": map_search_parameter})
+        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
+        print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
+        return map_website['links']
+    except Exception as e:
+        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
+        return None
+
+# Scrape the top pages and check whether the objective is met; if so, return the result as JSON, else return None
+def find_objective_in_top_pages(map_website, objective, app, client):
+    try:
+        # Get top 2 links from the map result
+        top_links = map_website[:2]
+        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")
+
+        # Scrape the pages in batch
+        batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})
+        print(f"{Colors.GREEN}Batch page scraping completed successfully.{Colors.RESET}")
+
+        for scrape_result in batch_scrape_result['data']:
+
+            # Check if objective is met
+            check_prompt = f"""
+            Given the following scraped content and objective, determine if the objective is met.
+            If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
+            If the objective is not met with confidence, respond with 'Objective not met'.
+
+            Objective: {objective}
+            Scraped content: {scrape_result['markdown']}
+
+            Remember:
+            1. Only return JSON if you are confident the objective is fully met.
+            2. Keep the JSON structure as simple and flat as possible.
+            3. Do not include any explanations or markdown formatting in your response.
+            """
+
+            completion = client.messages.create(
+                model="claude-3-5-sonnet-20241022",
+                max_tokens=1000,
+                temperature=0,
+                system="You are an expert web crawler. Respond with the relevant information in JSON format.",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": check_prompt
+                            }
+                        ]
+                    }
+                ]
+            )
+
+            result = completion.content[0].text
+
+            if result != "Objective not met":
+                print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
+                try:
+                    return json.loads(result)
+                except json.JSONDecodeError:
+                    print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
+            else:
+                print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")
+
+        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
+        return None
+
+    except Exception as e:
+        print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
+        return None
+
+# Main function to execute the process
+def main():
+    # Get user input
+    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
+    if not url.strip():
+        url = "https://www.firecrawl.dev/"
+
+    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")
+    if not objective.strip():
+        objective = "find me the pricing plans"
+
+    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
+    # Find the relevant page
+    map_website = find_relevant_page_via_map(objective, url, app, client)
+    print(map_website)
+
+    if map_website:
+        print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")
+        # Find objective in top pages
+        result = find_objective_in_top_pages(map_website, objective, app, client)
+
+        if result:
+            print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
+            print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
+        else:
+            print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
+    else:
+        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
+
+if __name__ == "__main__":
+    main()