Merge pull request #789 from mendableai/mog/bulk-scrape

feat: Batch Scrape
Nicolas 2024-10-23 16:12:42 -03:00 committed by GitHub
commit 76ca7fdcb5
16 changed files with 553 additions and 42 deletions

View File

@ -0,0 +1,99 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
addCrawlJobs,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>
) {
req.body = batchScrapeRequestSchema.parse(req.body);
const id = uuidv4();
await logCrawl(id, req.auth.team_id);
let { remainingCredits } = req.account;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){
remainingCredits = Infinity;
}
const pageOptions = legacyScrapeOptions(req.body);
const sc: StoredCrawl = {
crawlerOptions: null,
pageOptions,
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
};
await saveCrawl(id, sc);
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if(req.body.urls.length > 1000){
// set base to 21
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
}
const jobs = req.body.urls.map((x) => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url: x,
mode: "single_urls",
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions: null,
pageOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
v1: true,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});
await lockURLs(
id,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
await getScrapeQueue().addBulk(jobs);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
});
}

View File

@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) {
return jobs;
}
-export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
+export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>, isBatch = false) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
@ -113,7 +113,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
const data = doneJobs.map(x => x.returnvalue);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
-const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
+const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`);
nextURL.searchParams.set("skip", (start + data.length).toString());

View File

@ -142,19 +142,29 @@ export const scrapeRequestSchema = scrapeOptions.extend({
return obj;
});
// export type ScrapeRequest = {
// url: string;
// formats?: Format[];
// headers?: { [K: string]: string };
// includeTags?: string[];
// excludeTags?: string[];
// onlyMainContent?: boolean;
// timeout?: number;
// waitFor?: number;
// }
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export const batchScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
}).strict(strictMessage).refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
},
{
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
}
).transform((obj) => {
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
return { ...obj, timeout: 60000 };
}
return obj;
});
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
const crawlerOptions = z.object({
includePaths: z.string().array().default([]),
excludePaths: z.string().array().default([]),

View File

@ -3,7 +3,7 @@ import { redisConnection } from "../services/queue-service";
import { Logger } from "./logger";
export type StoredCrawl = {
-originUrl: string;
+originUrl?: string;
crawlerOptions: any;
pageOptions: any;
team_id: string;

View File

@ -112,7 +112,7 @@ export async function runWebScraper({
}
// remove docs with empty content
-const filteredDocs = crawlerOptions.returnOnlyUrls
+const filteredDocs = crawlerOptions?.returnOnlyUrls
? docs.map((doc) => {
if (doc.metadata.sourceURL) {
return { url: doc.metadata.sourceURL };

View File

@ -17,6 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel";
import { Logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { batchScrapeController } from "../controllers/v1/batch-scrape";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@ -29,7 +30,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
return (req, res, next) => {
(async () => {
if (!minimum && req.body) {
-minimum = (req.body as any)?.limit ?? 1;
+minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
}
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
req.acuc = chunk;
@ -122,6 +123,15 @@ v1Router.post(
wrap(crawlController)
);
v1Router.post(
"/batch/scrape",
authMiddleware(RateLimiterMode.Crawl),
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(batchScrapeController)
);
v1Router.post(
"/map",
authMiddleware(RateLimiterMode.Map),
@ -136,6 +146,13 @@ v1Router.get(
wrap(crawlStatusController)
);
v1Router.get(
"/batch/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
// Yes, it uses the same controller as the normal crawl status controller
wrap((req:any, res):any => crawlStatusController(req, res, true))
);
v1Router.get(
"/scrape/:jobId",
wrap(scrapeStatusController)

View File

@ -365,7 +365,7 @@ async function processJob(job: Job, token: string) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
-if (!job.data.sitemapped) {
+if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
@ -414,9 +414,7 @@ async function processJob(job: Job, token: string) {
}
}
-if (await finishCrawl(job.data.crawl_id)) {
+if (await finishCrawl(job.data.crawl_id) && job.data.crawlerOptions !== null) {
if (!job.data.v1) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);

View File

@ -145,6 +145,46 @@ watch.addEventListener("done", state => {
});
```
### Batch scraping multiple URLs
To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
```js
const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
formats: ['markdown', 'html'],
})
```
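Like the other methods in this SDK, `batchScrapeUrls` throws a `FirecrawlError` if the request fails, so the call can be wrapped in a `try/catch` block. A minimal sketch:

```js
try {
  // Batch scrape two URLs and log the combined result
  const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
    formats: ['markdown', 'html'],
  });
  console.log(batchScrapeResponse);
} catch (error) {
  // FirecrawlError carries the status code and error message from the API
  console.error(error.message);
}
```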
#### Asynchronous batch scrape
To initiate an asynchronous batch scrape, use the `asyncBatchScrapeUrls` method. It takes the URLs and optional parameters as inputs. The `params` argument lets you define additional settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which you can use to check the status of the batch scrape.
```js
const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
```
```
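The returned ID can then be passed to the SDK's `checkBatchScrapeStatus` method to check on the job. A minimal sketch:

```js
// Check the job later using the ID returned by asyncBatchScrapeUrls
const batchScrapeStatus = await app.checkBatchScrapeStatus(asyncBatchScrapeResult.id);
console.log(batchScrapeStatus.status, `${batchScrapeStatus.completed}/${batchScrapeStatus.total} pages scraped`);
```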
#### Batch scrape with WebSockets
To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
```js
// Batch scrape multiple URLs with WebSockets:
const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
watch.addEventListener("document", doc => {
console.log("DOC", doc.detail);
});
watch.addEventListener("error", err => {
console.error("ERR", err.detail.error);
});
watch.addEventListener("done", state => {
console.log("DONE", state.detail.status);
});
```
## Error Handling
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.

View File

@ -1,6 +1,6 @@
{
-"name": "firecrawl",
+"name": "@mendable/firecrawl-js",
-"version": "1.6.1",
+"version": "1.7.1",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -154,6 +154,17 @@ export interface CrawlResponse {
error?: string;
}
/**
* Response interface for batch scrape operations.
* Defines the structure of the response received after initiating a batch scrape.
*/
export interface BatchScrapeResponse {
id?: string;
url?: string;
success: true;
error?: string;
}
/**
* Response interface for job status checks.
* Provides detailed status of a crawl job including progress and results.
@ -169,6 +180,21 @@ export interface CrawlStatusResponse {
data: FirecrawlDocument<undefined>[];
};
/**
* Response interface for batch scrape job status checks.
* Provides detailed status of a batch scrape job including progress and results.
*/
export interface BatchScrapeStatusResponse {
success: true;
status: "scraping" | "completed" | "failed" | "cancelled";
completed: number;
total: number;
creditsUsed: number;
expiresAt: Date;
next?: string;
data: FirecrawlDocument<undefined>[];
};
/**
* Parameters for mapping operations.
* Defines options for mapping URLs during a crawl.
@ -493,6 +519,144 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Initiates a batch scrape job for multiple URLs using the Firecrawl API.
* @param urls - The URLs to scrape.
* @param params - Additional parameters for the scrape request.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns The response from the batch scrape operation.
*/
async batchScrapeUrls(
urls: string[],
params?: ScrapeParams,
pollInterval: number = 2,
idempotencyKey?: string
): Promise<BatchScrapeStatusResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,
jsonData,
headers
);
if (response.status === 200) {
const id: string = response.data.id;
return this.monitorJobStatus(id, headers, pollInterval);
} else {
this.handleError(response, "start batch scrape job");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error." };
}
async asyncBatchScrapeUrls(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string
): Promise<BatchScrapeResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,
jsonData,
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, "start batch scrape job");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error." };
}
/**
* Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
* @param urls - The URLs to scrape.
* @param params - Additional parameters for the scrape request.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns A CrawlWatcher instance to monitor the batch scrape job.
*/
async batchScrapeUrlsAndWatch(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string,
) {
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
if (crawl.success && crawl.id) {
const id = crawl.id;
return new CrawlWatcher(id, this);
}
throw new FirecrawlError("Batch scrape job failed to start", 400);
}
/**
* Checks the status of a batch scrape job using the Firecrawl API.
* @param id - The ID of the batch scrape operation.
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
* @returns The response containing the job status.
*/
async checkBatchScrapeStatus(id?: string, getAllData = false): Promise<BatchScrapeStatusResponse | ErrorResponse> {
if (!id) {
throw new FirecrawlError("No batch scrape ID provided", 400);
}
const headers: AxiosRequestHeaders = this.prepareHeaders();
try {
const response: AxiosResponse = await this.getRequest(
`${this.apiUrl}/v1/batch/scrape/${id}`,
headers
);
if (response.status === 200) {
let allData = response.data.data;
if (getAllData && response.data.status === "completed") {
let statusData = response.data
if ("data" in statusData) {
let data = statusData.data;
while ('next' in statusData) {
statusData = (await this.getRequest(statusData.next, headers)).data;
data = data.concat(statusData.data);
}
allData = data;
}
}
return ({
success: response.data.success,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: allData,
error: response.data.error,
})
} else {
this.handleError(response, "check batch scrape status");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.

View File

@ -9,7 +9,7 @@
"version": "1.0.0", "version": "1.0.0",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"@mendable/firecrawl-js": "^1.0.3", "@mendable/firecrawl-js": "^1.7.0-beta.2",
"axios": "^1.6.8", "axios": "^1.6.8",
"firecrawl": "^1.2.0", "firecrawl": "^1.2.0",
"ts-node": "^10.9.2", "ts-node": "^10.9.2",
@ -423,31 +423,17 @@
}
},
"node_modules/@mendable/firecrawl-js": {
-"version": "1.2.2",
+"version": "1.7.0-beta.2",
-"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz",
+"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.7.0-beta.2.tgz",
-"integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==",
+"integrity": "sha512-6L5r6BOuMPjLgSDq85xs2IpVgX9Tb/EdesKZvmtFucoaFZzIsgCQb0ZfSvwaRmqTkj53o+7eSgCcm+gsnR/yeQ==",
"dependencies": {
"axios": "^1.6.8",
-"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
-"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
}
},
"node_modules/@mendable/firecrawl-js/node_modules/uuid": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
"funding": [
"https://github.com/sponsors/broofa",
"https://github.com/sponsors/ctavan"
],
"bin": {
"uuid": "dist/bin/uuid"
}
},
"node_modules/@tsconfig/node10": { "node_modules/@tsconfig/node10": {
"version": "1.0.11", "version": "1.0.11",
"resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz", "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz",

View File

@ -11,7 +11,7 @@
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"@mendable/firecrawl-js": "^1.0.3", "@mendable/firecrawl-js": "1.7.0-beta.2",
"axios": "^1.6.8", "axios": "^1.6.8",
"firecrawl": "^1.2.0", "firecrawl": "^1.2.0",
"ts-node": "^10.9.2", "ts-node": "^10.9.2",

View File

@ -149,6 +149,69 @@ async def start_crawl_and_watch():
await start_crawl_and_watch()
```
### Scraping multiple URLs in batch
To batch scrape multiple URLs, use the `batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
idempotency_key = str(uuid.uuid4()) # optional idempotency key
batch_scrape_result = app.batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key)
print(batch_scrape_result)
```
### Asynchronous batch scrape
To run a batch scrape asynchronously, use the `async_batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
batch_scrape_result = app.async_batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
print(batch_scrape_result)
```
### Checking batch scrape status
To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_status` method. It takes the job ID as a parameter and returns the current status of the batch scrape job.
```python
id = batch_scrape_result['id']
status = app.check_batch_scrape_status(id)
```
### Batch scrape with WebSockets
To use batch scrape with WebSockets, use the `batch_scrape_urls_and_watch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
# inside an async function...
nest_asyncio.apply()

# Define event handlers
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

# Function to start the crawl and watch process
async def start_crawl_and_watch():
    # Initiate the crawl job and get the watcher
    watcher = app.batch_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})

    # Add event listeners
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)

    # Start the watcher
    await watcher.connect()

# Run the event loop
await start_crawl_and_watch()
```
## Error Handling
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.

View File

@ -9,6 +9,23 @@ app = FirecrawlApp(api_key="fc-")
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])
# Test batch scrape
urls = ['https://example.com', 'https://docs.firecrawl.dev']
batch_scrape_params = {
'formats': ['markdown', 'html'],
}
# Synchronous batch scrape
batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
print("Synchronous Batch Scrape Result:")
print(batch_result['data'][0]['markdown'])
# Asynchronous batch scrape
async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
print("\nAsynchronous Batch Scrape Result:")
print(async_batch_result)
# Crawl a website:
idempotency_key = str(uuid.uuid4()) # optional idempotency key
crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp
-__version__ = "1.3.1"
+__version__ = "1.4.0"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -275,6 +275,123 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'map')
    def batch_scrape_urls(self, urls: list[str],
                          params: Optional[Dict[str, Any]] = None,
                          poll_interval: Optional[int] = 2,
                          idempotency_key: Optional[str] = None) -> Any:
        """
        Initiate a batch scrape job for the specified URLs using the Firecrawl API.

        Args:
            urls (list[str]): The URLs to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
            poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

        Returns:
            Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
                - 'success' (bool): Indicates if the batch scrape was successful.
                - 'status' (str): The final status of the batch scrape job (e.g., 'completed').
                - 'completed' (int): Number of scraped pages that completed.
                - 'total' (int): Total number of scraped pages.
                - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
                - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
                - 'data' (List[Dict]): List of all the scraped pages.

        Raises:
            Exception: If the batch scrape job initiation or monitoring fails.
        """
        endpoint = f'/v1/batch/scrape'
        headers = self._prepare_headers(idempotency_key)
        json_data = {'urls': urls}
        if params:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
            id = response.json().get('id')
            return self._monitor_job_status(id, headers, poll_interval)
        else:
            self._handle_error(response, 'start batch scrape job')
    def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
        """
        Initiate a batch scrape job asynchronously.

        Args:
            urls (list[str]): The URLs to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

        Returns:
            Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
                - 'success' (bool): Indicates if the batch scrape initiation was successful.
                - 'id' (str): The unique identifier for the batch scrape job.
                - 'url' (str): The URL to check the status of the batch scrape job.
        """
        endpoint = f'/v1/batch/scrape'
        headers = self._prepare_headers(idempotency_key)
        json_data = {'urls': urls}
        if params:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
            return response.json()
        else:
            self._handle_error(response, 'start batch scrape job')
    def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
        """
        Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.

        Args:
            urls (list[str]): The URLs to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

        Returns:
            CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
        """
        crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
        if crawl_response['success'] and 'id' in crawl_response:
            return CrawlWatcher(crawl_response['id'], self)
        else:
            raise Exception("Batch scrape job failed to start")
    def check_batch_scrape_status(self, id: str) -> Any:
        """
        Check the status of a batch scrape job using the Firecrawl API.

        Args:
            id (str): The ID of the batch scrape job.

        Returns:
            Any: The status of the batch scrape job.

        Raises:
            Exception: If the status check request fails.
        """
        endpoint = f'/v1/batch/scrape/{id}'
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
            data = response.json()
            return {
                'success': True,
                'status': data.get('status'),
                'total': data.get('total'),
                'completed': data.get('completed'),
                'creditsUsed': data.get('creditsUsed'),
                'expiresAt': data.get('expiresAt'),
                'next': data.get('next'),
                'data': data.get('data'),
                'error': data.get('error')
            }
        else:
            self._handle_error(response, 'check batch scrape status')
    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
        Prepare the headers for API requests.