mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Merge pull request #789 from mendableai/mog/bulk-scrape
feat: Batch Scrape
This commit is contained in:
commit
76ca7fdcb5
99
apps/api/src/controllers/v1/batch-scrape.ts
Normal file
99
apps/api/src/controllers/v1/batch-scrape.ts
Normal file
|
@ -0,0 +1,99 @@
|
|||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import {
|
||||
BatchScrapeRequest,
|
||||
batchScrapeRequestSchema,
|
||||
CrawlResponse,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import {
|
||||
addCrawlJobs,
|
||||
lockURLs,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
} from "../../lib/crawl-redis";
|
||||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
/**
 * Handles a batch scrape request: validates the body, records a crawl entry
 * for the batch, enqueues one scrape job per URL and answers with the batch
 * id plus the URL where its status can be polled.
 *
 * @param req - Authenticated request whose body is validated (and replaced)
 *   by `batchScrapeRequestSchema.parse`; `parse` throws on invalid input.
 * @param res - Response used to send `{ success, id, url }` with status 200.
 * @returns The result of `res.status(200).json(...)`.
 */
export async function batchScrapeController(
  req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
  res: Response<CrawlResponse>
) {
  req.body = batchScrapeRequestSchema.parse(req.body);

  // One id identifies the whole batch; each URL gets its own job id below.
  const id = uuidv4();

  await logCrawl(id, req.auth.team_id);

  const pageOptions = legacyScrapeOptions(req.body);

  // A batch scrape is stored like a crawl, but without crawler options.
  const sc: StoredCrawl = {
    crawlerOptions: null,
    pageOptions,
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan,
  };

  await saveCrawl(id, sc);

  let jobPriority = 20;

  // If it is over 1000, we need to get the job priority,
  // otherwise we can use the default priority of 20
  if (req.body.urls.length > 1000) {
    // set base to 21
    jobPriority = await getJobPriority({
      plan: req.auth.plan,
      team_id: req.auth.team_id,
      basePriority: 21,
    });
  }

  const jobs = req.body.urls.map((x) => {
    const uuid = uuidv4();
    return {
      name: uuid,
      data: {
        url: x,
        mode: "single_urls",
        team_id: req.auth.team_id,
        plan: req.auth.plan,
        crawlerOptions: null,
        pageOptions,
        origin: "api",
        crawl_id: id,
        sitemapped: true,
        v1: true,
      },
      opts: {
        jobId: uuid,
        // Use the priority computed above; it was previously hard-coded to
        // 20, which silently discarded the getJobPriority result.
        priority: jobPriority,
      },
    };
  });

  // Register the URLs and job ids under the batch before enqueueing the jobs.
  await lockURLs(
    id,
    jobs.map((x) => x.data.url)
  );
  await addCrawlJobs(
    id,
    jobs.map((x) => x.opts.jobId)
  );
  await getScrapeQueue().addBulk(jobs);

  // Only a local environment reports the request's own protocol.
  const protocol = process.env.ENV === "local" ? req.protocol : "https";

  return res.status(200).json({
    success: true,
    id,
    url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
  });
}
|
||||
|
||||
|
|
@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) {
|
|||
return jobs;
|
||||
}
|
||||
|
||||
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
|
||||
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>, isBatch = false) {
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ success: false, error: "Job not found" });
|
||||
|
@ -113,7 +113,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||
const data = doneJobs.map(x => x.returnvalue);
|
||||
|
||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
|
||||
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`);
|
||||
|
||||
nextURL.searchParams.set("skip", (start + data.length).toString());
|
||||
|
||||
|
|
|
@ -142,19 +142,29 @@ export const scrapeRequestSchema = scrapeOptions.extend({
|
|||
return obj;
|
||||
});
|
||||
|
||||
// export type ScrapeRequest = {
|
||||
// url: string;
|
||||
// formats?: Format[];
|
||||
// headers?: { [K: string]: string };
|
||||
// includeTags?: string[];
|
||||
// excludeTags?: string[];
|
||||
// onlyMainContent?: boolean;
|
||||
// timeout?: number;
|
||||
// waitFor?: number;
|
||||
// }
|
||||
|
||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||
|
||||
/**
 * Request-body schema for batch scraping: the shared scrape options plus the
 * list of `urls` and an optional `origin` (defaults to "api").
 *
 * The "extract" format and the `extract` options must be supplied together or
 * not at all. When extraction is requested and no timeout is set, the timeout
 * defaults to 60000.
 */
export const batchScrapeRequestSchema = scrapeOptions
  .extend({
    urls: url.array(),
    origin: z.string().optional().default("api"),
  })
  .strict(strictMessage)
  .refine(
    // Valid only when both are present or both are absent.
    (obj) =>
      Boolean(obj.formats?.includes("extract")) === (obj.extract !== undefined),
    {
      message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
    }
  )
  .transform((obj) => {
    const wantsExtract = obj.formats?.includes("extract") || obj.extract;
    return wantsExtract && !obj.timeout ? { ...obj, timeout: 60000 } : obj;
  });
|
||||
|
||||
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
|
||||
|
||||
const crawlerOptions = z.object({
|
||||
includePaths: z.string().array().default([]),
|
||||
excludePaths: z.string().array().default([]),
|
||||
|
|
|
@ -3,7 +3,7 @@ import { redisConnection } from "../services/queue-service";
|
|||
import { Logger } from "./logger";
|
||||
|
||||
export type StoredCrawl = {
|
||||
originUrl: string;
|
||||
originUrl?: string;
|
||||
crawlerOptions: any;
|
||||
pageOptions: any;
|
||||
team_id: string;
|
||||
|
|
|
@ -112,7 +112,7 @@ export async function runWebScraper({
|
|||
}
|
||||
|
||||
// remove docs with empty content
|
||||
const filteredDocs = crawlerOptions.returnOnlyUrls
|
||||
const filteredDocs = crawlerOptions?.returnOnlyUrls
|
||||
? docs.map((doc) => {
|
||||
if (doc.metadata.sourceURL) {
|
||||
return { url: doc.metadata.sourceURL };
|
||||
|
|
|
@ -17,6 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel";
|
|||
import { Logger } from "../lib/logger";
|
||||
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
||||
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
|
||||
import { batchScrapeController } from "../controllers/v1/batch-scrape";
|
||||
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
|
||||
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
|
||||
// import { searchController } from "../../src/controllers/v1/search";
|
||||
|
@ -29,7 +30,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
|
|||
return (req, res, next) => {
|
||||
(async () => {
|
||||
if (!minimum && req.body) {
|
||||
minimum = (req.body as any)?.limit ?? 1;
|
||||
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
|
||||
}
|
||||
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
|
||||
req.acuc = chunk;
|
||||
|
@ -122,6 +123,15 @@ v1Router.post(
|
|||
wrap(crawlController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/batch/scrape",
|
||||
authMiddleware(RateLimiterMode.Crawl),
|
||||
checkCreditsMiddleware(),
|
||||
blocklistMiddleware,
|
||||
idempotencyMiddleware,
|
||||
wrap(batchScrapeController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/map",
|
||||
authMiddleware(RateLimiterMode.Map),
|
||||
|
@ -136,6 +146,13 @@ v1Router.get(
|
|||
wrap(crawlStatusController)
|
||||
);
|
||||
|
||||
v1Router.get(
|
||||
"/batch/scrape/:jobId",
|
||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||
// Yes, it uses the same controller as the normal crawl status controller
|
||||
wrap((req:any, res):any => crawlStatusController(req, res, true))
|
||||
);
|
||||
|
||||
v1Router.get(
|
||||
"/scrape/:jobId",
|
||||
wrap(scrapeStatusController)
|
||||
|
|
|
@ -365,7 +365,7 @@ async function processJob(job: Job, token: string) {
|
|||
|
||||
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
|
||||
|
||||
if (!job.data.sitemapped) {
|
||||
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
|
||||
if (!sc.cancelled) {
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||
|
||||
|
@ -414,9 +414,7 @@ async function processJob(job: Job, token: string) {
|
|||
}
|
||||
}
|
||||
|
||||
if (await finishCrawl(job.data.crawl_id)) {
|
||||
|
||||
|
||||
if (await finishCrawl(job.data.crawl_id) && job.data.crawlerOptions !== null) {
|
||||
if (!job.data.v1) {
|
||||
const jobIDs = await getCrawlJobs(job.data.crawl_id);
|
||||
|
||||
|
|
|
@ -145,6 +145,46 @@ watch.addEventListener("done", state => {
|
|||
});
|
||||
```
|
||||
|
||||
### Batch scraping multiple URLs
|
||||
|
||||
To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the URLs to scrape and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
|
||||
|
||||
```js
|
||||
const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
|
||||
formats: ['markdown', 'html'],
|
||||
})
|
||||
```
|
||||
|
||||
|
||||
#### Asynchronous batch scrape
|
||||
|
||||
To initiate an asynchronous batch scrape, use the `asyncBatchScrapeUrls` method. It takes the URLs to scrape and optional parameters as arguments. The `params` argument lets you define settings for the scrape, such as the output formats. On successful initiation, the method returns an ID, which you need to check the status of the batch scrape later.
|
||||
|
||||
```js
|
||||
const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
|
||||
```
|
||||
|
||||
#### Batch scrape with WebSockets
|
||||
|
||||
To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the URLs to scrape and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
|
||||
|
||||
```js
|
||||
// Batch scrape multiple URLs with WebSockets:
|
||||
const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
|
||||
|
||||
watch.addEventListener("document", doc => {
|
||||
console.log("DOC", doc.detail);
|
||||
});
|
||||
|
||||
watch.addEventListener("error", err => {
|
||||
console.error("ERR", err.detail.error);
|
||||
});
|
||||
|
||||
watch.addEventListener("done", state => {
|
||||
console.log("DONE", state.detail.status);
|
||||
});
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "firecrawl",
|
||||
"version": "1.6.1",
|
||||
"name": "@mendable/firecrawl-js",
|
||||
"version": "1.7.1",
|
||||
"description": "JavaScript SDK for Firecrawl API",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
|
|
|
@ -154,6 +154,17 @@ export interface CrawlResponse {
|
|||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for batch scrape operations.
|
||||
* Defines the structure of the response received after initiating a batch scrape.
|
||||
*/
|
||||
export interface BatchScrapeResponse {
|
||||
id?: string;
|
||||
url?: string;
|
||||
success: true;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for job status checks.
|
||||
* Provides detailed status of a crawl job including progress and results.
|
||||
|
@ -169,6 +180,21 @@ export interface CrawlStatusResponse {
|
|||
data: FirecrawlDocument<undefined>[];
|
||||
};
|
||||
|
||||
/**
|
||||
* Response interface for batch scrape job status checks.
|
||||
* Provides detailed status of a batch scrape job including progress and results.
|
||||
*/
|
||||
export interface BatchScrapeStatusResponse {
|
||||
success: true;
|
||||
status: "scraping" | "completed" | "failed" | "cancelled";
|
||||
completed: number;
|
||||
total: number;
|
||||
creditsUsed: number;
|
||||
expiresAt: Date;
|
||||
next?: string;
|
||||
data: FirecrawlDocument<undefined>[];
|
||||
};
|
||||
|
||||
/**
|
||||
* Parameters for mapping operations.
|
||||
* Defines options for mapping URLs during a crawl.
|
||||
|
@ -493,6 +519,144 @@ export default class FirecrawlApp {
|
|||
return { success: false, error: "Internal server error." };
|
||||
}
|
||||
|
||||
/**
 * Starts a batch scrape job for several URLs and waits for its result.
 * @param urls - The URLs to scrape.
 * @param params - Additional parameters for the scrape request.
 * @param pollInterval - Time in seconds between job status checks.
 * @param idempotencyKey - Optional idempotency key for the request.
 * @returns The monitored status of the batch scrape job, or an error response.
 */
async batchScrapeUrls(
  urls: string[],
  params?: ScrapeParams,
  pollInterval: number = 2,
  idempotencyKey?: string
): Promise<BatchScrapeStatusResponse | ErrorResponse> {
  const requestHeaders = this.prepareHeaders(idempotencyKey);
  const payload: any = { urls, ...(params ?? {}) };

  try {
    const response: AxiosResponse = await this.postRequest(
      `${this.apiUrl}/v1/batch/scrape`,
      payload,
      requestHeaders
    );

    if (response.status !== 200) {
      this.handleError(response, "start batch scrape job");
    } else {
      // The job was accepted: poll it until monitoring reports a result.
      const id: string = response.data.id;
      return this.monitorJobStatus(id, requestHeaders, pollInterval);
    }
  } catch (error: any) {
    // Without an API error payload, fall back to the raw error message.
    if (!error.response?.data?.error) {
      throw new FirecrawlError(error.message, 500);
    }
    const body = error.response.data;
    const details = body.details ? ` - ${JSON.stringify(body.details)}` : '';
    throw new FirecrawlError(
      `Request failed with status code ${error.response.status}. Error: ${body.error} ${details}`,
      error.response.status
    );
  }

  return { success: false, error: "Internal server error." };
}
|
||||
|
||||
/**
 * Starts a batch scrape job without waiting for it to finish.
 * @param urls - The URLs to scrape.
 * @param params - Additional parameters for the scrape request.
 * @param idempotencyKey - Optional idempotency key for the request.
 * @returns The API's response body for the started job, or an error response.
 */
async asyncBatchScrapeUrls(
  urls: string[],
  params?: ScrapeParams,
  idempotencyKey?: string
): Promise<BatchScrapeResponse | ErrorResponse> {
  const requestHeaders = this.prepareHeaders(idempotencyKey);
  const payload: any = { urls, ...(params ?? {}) };

  try {
    const response: AxiosResponse = await this.postRequest(
      `${this.apiUrl}/v1/batch/scrape`,
      payload,
      requestHeaders
    );

    if (response.status !== 200) {
      this.handleError(response, "start batch scrape job");
    } else {
      return response.data;
    }
  } catch (error: any) {
    // Without an API error payload, fall back to the raw error message.
    if (!error.response?.data?.error) {
      throw new FirecrawlError(error.message, 500);
    }
    const body = error.response.data;
    const details = body.details ? ` - ${JSON.stringify(body.details)}` : '';
    throw new FirecrawlError(
      `Request failed with status code ${error.response.status}. Error: ${body.error} ${details}`,
      error.response.status
    );
  }

  return { success: false, error: "Internal server error." };
}
|
||||
|
||||
/**
 * Starts a batch scrape job and returns a CrawlWatcher that monitors it via WebSocket.
 * @param urls - The URLs to scrape.
 * @param params - Additional parameters for the scrape request.
 * @param idempotencyKey - Optional idempotency key for the request.
 * @returns A CrawlWatcher instance bound to the started batch scrape job.
 * @throws FirecrawlError (400) when the job did not start or returned no id.
 */
async batchScrapeUrlsAndWatch(
  urls: string[],
  params?: ScrapeParams,
  idempotencyKey?: string,
) {
  const started = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);

  // A usable job needs both a successful response and an id to watch.
  if (!started.success || !started.id) {
    throw new FirecrawlError("Batch scrape job failed to start", 400);
  }

  return new CrawlWatcher(started.id, this);
}
|
||||
|
||||
/**
 * Checks the status of a batch scrape job using the Firecrawl API.
 * @param id - The ID of the batch scrape operation.
 * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
 * @returns The response containing the job status.
 * @throws FirecrawlError (400) when no id is given; FirecrawlError (500)
 *   wrapping any error raised while requesting or reading the status.
 */
async checkBatchScrapeStatus(id?: string, getAllData = false): Promise<BatchScrapeStatusResponse | ErrorResponse> {
  if (!id) {
    throw new FirecrawlError("No batch scrape ID provided", 400);
  }

  const headers: AxiosRequestHeaders = this.prepareHeaders();
  try {
    const response: AxiosResponse = await this.getRequest(
      `${this.apiUrl}/v1/batch/scrape/${id}`,
      headers
    );
    if (response.status === 200) {
      let allData = response.data.data;
      // Pages are only followed once the job reports itself as completed.
      if (getAllData && response.data.status === "completed") {
        let statusData = response.data;
        if ("data" in statusData) {
          let data = statusData.data;
          // Follow `next` only while it holds a usable URL. A key-presence
          // check would also accept `next: null` and request a null URL.
          while (statusData.next) {
            statusData = (await this.getRequest(statusData.next, headers)).data;
            // A page without `data` contributes nothing instead of `undefined`.
            data = data.concat(statusData.data ?? []);
          }
          allData = data;
        }
      }
      return {
        success: response.data.success,
        status: response.data.status,
        total: response.data.total,
        completed: response.data.completed,
        creditsUsed: response.data.creditsUsed,
        expiresAt: new Date(response.data.expiresAt),
        next: response.data.next,
        data: allData,
        error: response.data.error,
      };
    } else {
      this.handleError(response, "check batch scrape status");
    }
  } catch (error: any) {
    throw new FirecrawlError(error.message, 500);
  }
  return { success: false, error: "Internal server error." };
}
|
||||
|
||||
/**
|
||||
* Prepares the headers for an API request.
|
||||
* @param idempotencyKey - Optional key to ensure idempotency.
|
||||
|
|
22
apps/js-sdk/package-lock.json
generated
22
apps/js-sdk/package-lock.json
generated
|
@ -9,7 +9,7 @@
|
|||
"version": "1.0.0",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^1.0.3",
|
||||
"@mendable/firecrawl-js": "^1.7.0-beta.2",
|
||||
"axios": "^1.6.8",
|
||||
"firecrawl": "^1.2.0",
|
||||
"ts-node": "^10.9.2",
|
||||
|
@ -423,31 +423,17 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@mendable/firecrawl-js": {
|
||||
"version": "1.2.2",
|
||||
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz",
|
||||
"integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==",
|
||||
"version": "1.7.0-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.7.0-beta.2.tgz",
|
||||
"integrity": "sha512-6L5r6BOuMPjLgSDq85xs2IpVgX9Tb/EdesKZvmtFucoaFZzIsgCQb0ZfSvwaRmqTkj53o+7eSgCcm+gsnR/yeQ==",
|
||||
"dependencies": {
|
||||
"axios": "^1.6.8",
|
||||
"dotenv": "^16.4.5",
|
||||
"isows": "^1.0.4",
|
||||
"typescript-event-target": "^1.1.1",
|
||||
"uuid": "^9.0.1",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@mendable/firecrawl-js/node_modules/uuid": {
|
||||
"version": "9.0.1",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
|
||||
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
|
||||
"funding": [
|
||||
"https://github.com/sponsors/broofa",
|
||||
"https://github.com/sponsors/ctavan"
|
||||
],
|
||||
"bin": {
|
||||
"uuid": "dist/bin/uuid"
|
||||
}
|
||||
},
|
||||
"node_modules/@tsconfig/node10": {
|
||||
"version": "1.0.11",
|
||||
"resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz",
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^1.0.3",
|
||||
"@mendable/firecrawl-js": "1.7.0-beta.2",
|
||||
"axios": "^1.6.8",
|
||||
"firecrawl": "^1.2.0",
|
||||
"ts-node": "^10.9.2",
|
||||
|
|
|
@ -149,6 +149,69 @@ async def start_crawl_and_watch():
|
|||
await start_crawl_and_watch()
|
||||
```
|
||||
|
||||
### Scraping multiple URLs in batch
|
||||
|
||||
To batch scrape multiple URLs, use the `batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper such as the output formats.
|
||||
|
||||
```python
|
||||
idempotency_key = str(uuid.uuid4()) # optional idempotency key
|
||||
batch_scrape_result = app.batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key)
|
||||
print(batch_scrape_result)
|
||||
```
|
||||
|
||||
### Asynchronous batch scrape
|
||||
|
||||
To run a batch scrape asynchronously, use the `async_batch_scrape_urls` method. It takes the URLs to scrape and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
|
||||
|
||||
```python
|
||||
batch_scrape_result = app.async_batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
|
||||
print(batch_scrape_result)
|
||||
```
|
||||
|
||||
### Checking batch scrape status
|
||||
|
||||
To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_status` method. It takes the job ID as a parameter and returns the current status of the batch scrape job.
|
||||
|
||||
```python
|
||||
id = batch_scrape_result['id']
|
||||
status = app.check_batch_scrape_status(id)
|
||||
```
|
||||
|
||||
### Batch scrape with WebSockets
|
||||
|
||||
To use batch scrape with WebSockets, use the `batch_scrape_urls_and_watch` method. It takes the URLs to scrape and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
|
||||
|
||||
```python
|
||||
# inside an async function...
|
||||
nest_asyncio.apply()
|
||||
|
||||
# Define event handlers
|
||||
def on_document(detail):
|
||||
print("DOC", detail)
|
||||
|
||||
def on_error(detail):
|
||||
print("ERR", detail['error'])
|
||||
|
||||
def on_done(detail):
|
||||
print("DONE", detail['status'])
|
||||
|
||||
# Function to start the crawl and watch process
|
||||
async def start_crawl_and_watch():
|
||||
# Initiate the batch scrape job and get the watcher
|
||||
watcher = app.batch_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
|
||||
|
||||
# Add event listeners
|
||||
watcher.add_event_listener("document", on_document)
|
||||
watcher.add_event_listener("error", on_error)
|
||||
watcher.add_event_listener("done", on_done)
|
||||
|
||||
# Start the watcher
|
||||
await watcher.connect()
|
||||
|
||||
# Run the event loop
|
||||
await start_crawl_and_watch()
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
|
||||
|
|
|
@ -9,6 +9,23 @@ app = FirecrawlApp(api_key="fc-")
|
|||
scrape_result = app.scrape_url('firecrawl.dev')
|
||||
print(scrape_result['markdown'])
|
||||
|
||||
|
||||
# Test batch scrape
|
||||
urls = ['https://example.com', 'https://docs.firecrawl.dev']
|
||||
batch_scrape_params = {
|
||||
'formats': ['markdown', 'html'],
|
||||
}
|
||||
|
||||
# Synchronous batch scrape
|
||||
batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
|
||||
print("Synchronous Batch Scrape Result:")
|
||||
print(batch_result['data'][0]['markdown'])
|
||||
|
||||
# Asynchronous batch scrape
|
||||
async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
|
||||
print("\nAsynchronous Batch Scrape Result:")
|
||||
print(async_batch_result)
|
||||
|
||||
# Crawl a website:
|
||||
idempotency_key = str(uuid.uuid4()) # optional idempotency key
|
||||
crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
|
||||
|
|
|
@ -13,7 +13,7 @@ import os
|
|||
|
||||
from .firecrawl import FirecrawlApp
|
||||
|
||||
__version__ = "1.3.1"
|
||||
__version__ = "1.4.0"
|
||||
|
||||
# Define the logger for the Firecrawl project
|
||||
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||
|
|
|
@ -275,6 +275,123 @@ class FirecrawlApp:
|
|||
else:
|
||||
self._handle_error(response, 'map')
|
||||
|
||||
def batch_scrape_urls(self, urls: list[str],
                      params: Optional[Dict[str, Any]] = None,
                      poll_interval: Optional[int] = 2,
                      idempotency_key: Optional[str] = None) -> Any:
    """
    Start a batch scrape job for the given URLs and wait for its result.

    Args:
        urls (list[str]): The URLs to scrape.
        params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
        poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
        idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

    Returns:
        Any: Whatever `_monitor_job_status` returns for the started job. Per the
        original documentation this is a dictionary with the keys 'success',
        'status', 'completed', 'total', 'creditsUsed', 'expiresAt' and 'data'.

    Raises:
        Exception: If the batch scrape job initiation or monitoring fails.
    """
    request_headers = self._prepare_headers(idempotency_key)

    # Caller-supplied params are merged on top of the URL list.
    payload = {'urls': urls}
    payload.update(params or {})

    response = self._post_request(f'{self.api_url}/v1/batch/scrape', payload, request_headers)
    if response.status_code != 200:
        self._handle_error(response, 'start batch scrape job')
        return None

    job_id = response.json().get('id')
    return self._monitor_job_status(job_id, request_headers, poll_interval)
|
||||
|
||||
|
||||
def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Start a batch scrape job without waiting for it to finish.

    Args:
        urls (list[str]): The URLs to scrape.
        params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
        idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

    Returns:
        Dict[str, Any]: The JSON body of the initiation response. Per the
        original documentation it includes:
            - 'success' (bool): Indicates if the batch scrape initiation was successful.
            - 'id' (str): The unique identifier for the batch scrape job.
            - 'url' (str): The URL to check the status of the batch scrape job.
    """
    request_headers = self._prepare_headers(idempotency_key)

    # Caller-supplied params are merged on top of the URL list.
    payload = {'urls': urls}
    payload.update(params or {})

    response = self._post_request(f'{self.api_url}/v1/batch/scrape', payload, request_headers)
    if response.status_code != 200:
        self._handle_error(response, 'start batch scrape job')
        return None

    return response.json()
|
||||
|
||||
def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
    """
    Start a batch scrape job and return a CrawlWatcher to monitor it via WebSocket.

    Args:
        urls (list[str]): The URLs to scrape.
        params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
        idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

    Returns:
        CrawlWatcher: An instance of CrawlWatcher bound to the started batch scrape job.

    Raises:
        Exception: If the initiation response is not successful or carries no 'id'.
    """
    started = self.async_batch_scrape_urls(urls, params, idempotency_key)

    # A usable job needs both a truthy 'success' and an 'id' to watch.
    if not (started['success'] and 'id' in started):
        raise Exception("Batch scrape job failed to start")

    return CrawlWatcher(started['id'], self)
|
||||
|
||||
def check_batch_scrape_status(self, id: str) -> Any:
    """
    Fetch the current status of a batch scrape job from the Firecrawl API.

    Args:
        id (str): The ID of the batch scrape job.

    Returns:
        Any: A dictionary with 'success' set to True plus the 'status', 'total',
        'completed', 'creditsUsed', 'expiresAt', 'next', 'data' and 'error'
        fields copied from the API response (None for fields it omits).

    Raises:
        Exception: If the status check request fails.
    """
    request_headers = self._prepare_headers()
    response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}', request_headers)

    if response.status_code != 200:
        self._handle_error(response, 'check batch scrape status')
        return None

    payload = response.json()
    result = {'success': True}
    # Copy the known fields in a fixed order; absent ones become None.
    for field in ('status', 'total', 'completed', 'creditsUsed',
                  'expiresAt', 'next', 'data', 'error'):
        result[field] = payload.get(field)
    return result
|
||||
|
||||
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
||||
"""
|
||||
Prepare the headers for API requests.
|
||||
|
|
Loading…
Reference in New Issue
Block a user