From 03b37998fdce3f46c3d52d559f59ea432b0ff68d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 17 Oct 2024 19:40:18 +0200 Subject: [PATCH 1/8] feat: bulk scrape --- apps/api/src/controllers/v1/bulk-scrape.ts | 99 ++++++++++++++++++++++ apps/api/src/controllers/v1/types.ts | 32 ++++--- apps/api/src/lib/crawl-redis.ts | 2 +- apps/api/src/main/runWebScraper.ts | 2 +- apps/api/src/routes/v1.ts | 16 ++++ apps/api/src/services/queue-worker.ts | 6 +- 6 files changed, 140 insertions(+), 17 deletions(-) create mode 100644 apps/api/src/controllers/v1/bulk-scrape.ts diff --git a/apps/api/src/controllers/v1/bulk-scrape.ts b/apps/api/src/controllers/v1/bulk-scrape.ts new file mode 100644 index 00000000..3e1afbd0 --- /dev/null +++ b/apps/api/src/controllers/v1/bulk-scrape.ts @@ -0,0 +1,99 @@ +import { Response } from "express"; +import { v4 as uuidv4 } from "uuid"; +import { + BulkScrapeRequest, + bulkScrapeRequestSchema, + CrawlResponse, + legacyScrapeOptions, + RequestWithAuth, +} from "./types"; +import { + addCrawlJobs, + lockURLs, + saveCrawl, + StoredCrawl, +} from "../../lib/crawl-redis"; +import { logCrawl } from "../../services/logging/crawl_log"; +import { getScrapeQueue } from "../../services/queue-service"; +import { getJobPriority } from "../../lib/job-priority"; + +export async function bulkScrapeController( + req: RequestWithAuth<{}, CrawlResponse, BulkScrapeRequest>, + res: Response +) { + req.body = bulkScrapeRequestSchema.parse(req.body); + + const id = uuidv4(); + + await logCrawl(id, req.auth.team_id); + + let { remainingCredits } = req.account; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if(!useDbAuthentication){ + remainingCredits = Infinity; + } + + const pageOptions = legacyScrapeOptions(req.body); + + const sc: StoredCrawl = { + crawlerOptions: null, + pageOptions, + team_id: req.auth.team_id, + createdAt: Date.now(), + plan: req.auth.plan, + }; + + await saveCrawl(id, sc); + + let jobPriority = 20; + + // If it is over 1000, we need to get the job priority, + // otherwise we can use the default priority of 20 + if(req.body.urls.length > 1000){ + // set base to 21 + jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21}) + } + + const jobs = req.body.urls.map((x) => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url: x, + mode: "single_urls", + team_id: req.auth.team_id, + plan: req.auth.plan, + crawlerOptions: null, + pageOptions, + origin: "api", + crawl_id: id, + sitemapped: true, + v1: true, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + await lockURLs( + id, + jobs.map((x) => x.data.url) + ); + await addCrawlJobs( + id, + jobs.map((x) => x.opts.jobId) + ); + await getScrapeQueue().addBulk(jobs); + + const protocol = process.env.ENV === "local" ? 
req.protocol : "https";
+
+  return res.status(200).json({
+    success: true,
+    id,
+    url: `${protocol}://${req.get("host")}/v1/bulk/scrape/${id}`,
+  });
+}
+
+
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 033de6e0..56c944ec 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -141,19 +141,29 @@ export const scrapeRequestSchema = scrapeOptions.extend({
   return obj;
 });
 
-// export type ScrapeRequest = {
-//   url: string;
-//   formats?: Format[];
-//   headers?: { [K: string]: string };
-//   includeTags?: string[];
-//   excludeTags?: string[];
-//   onlyMainContent?: boolean;
-//   timeout?: number;
-//   waitFor?: number;
-// }
-
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 
+export const bulkScrapeRequestSchema = scrapeOptions.extend({
+  urls: url.array(),
+  origin: z.string().optional().default("api"),
+}).strict(strictMessage).refine(
+  (obj) => {
+    const hasExtractFormat = obj.formats?.includes("extract");
+    const hasExtractOptions = obj.extract !== undefined;
+    return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
+  },
+  {
+    message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+  }
+).transform((obj) => {
+  if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
+    return { ...obj, timeout: 60000 };
+  }
+  return obj;
+});
+
+export type BulkScrapeRequest = z.infer<typeof bulkScrapeRequestSchema>;
+
 const crawlerOptions = z.object({
   includePaths: z.string().array().default([]),
   excludePaths: z.string().array().default([]),
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index f0ece43f..379bc179 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -3,7 +3,7 @@ import { redisConnection } from "../services/queue-service";
 import { Logger } from "./logger";
 
 export type StoredCrawl = {
-  originUrl: string;
+  originUrl?: string;
   crawlerOptions: any;
   pageOptions: any;
   team_id: string;
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 6e642c65..8eb679e7 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -112,7 +112,7 @@ export async function runWebScraper({
   }
 
   // remove docs with empty content
-  const filteredDocs = crawlerOptions.returnOnlyUrls
+  const filteredDocs = crawlerOptions?.returnOnlyUrls
     ? 
docs.map((doc) => { if (doc.metadata.sourceURL) { return { url: doc.metadata.sourceURL }; diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index b0ceceb4..2bd3d3ea 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -17,6 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel"; import { Logger } from "../lib/logger"; import { scrapeStatusController } from "../controllers/v1/scrape-status"; import { concurrencyCheckController } from "../controllers/v1/concurrency-check"; +import { bulkScrapeController } from "../controllers/v1/bulk-scrape"; // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; // import { searchController } from "../../src/controllers/v1/search"; @@ -122,6 +123,15 @@ v1Router.post( wrap(crawlController) ); +v1Router.post( + "/bulk/scrape", + authMiddleware(RateLimiterMode.Crawl), + checkCreditsMiddleware(), + blocklistMiddleware, + idempotencyMiddleware, + wrap(bulkScrapeController) +); + v1Router.post( "/map", authMiddleware(RateLimiterMode.Map), @@ -136,6 +146,12 @@ v1Router.get( wrap(crawlStatusController) ); +v1Router.get( + "/bulk/scrape/:jobId", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(crawlStatusController) +); + v1Router.get( "/scrape/:jobId", wrap(scrapeStatusController) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index bff51f74..1ea4775a 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -365,7 +365,7 @@ async function processJob(job: Job, token: string) { const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; - if (!job.data.sitemapped) { + if (!job.data.sitemapped && job.data.crawlerOptions !== null) { if (!sc.cancelled) { const crawler = crawlToCrawler(job.data.crawl_id, sc); @@ -414,9 +414,7 @@ async function processJob(job: Job, token: string) { } } - if (await finishCrawl(job.data.crawl_id)) { - - + if (await finishCrawl(job.data.crawl_id) && job.data.crawlerOptions !== null) { if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id); From 3cd328cf93618b26073f94bb64b2f661528cefca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 22 Oct 2024 18:58:48 +0200 Subject: [PATCH 2/8] feat(bulk/scrape): add node and python SDK integration + docs --- apps/js-sdk/firecrawl/README.md | 40 +++++++ apps/js-sdk/firecrawl/src/index.ts | 138 +++++++++++++++++++++++++ apps/python-sdk/README.md | 63 +++++++++++ apps/python-sdk/firecrawl/firecrawl.py | 117 +++++++++++++++++++++ 4 files changed, 358 insertions(+) diff --git a/apps/js-sdk/firecrawl/README.md b/apps/js-sdk/firecrawl/README.md index 0f3a6824..1655d2ee 100644 --- a/apps/js-sdk/firecrawl/README.md +++ b/apps/js-sdk/firecrawl/README.md @@ -145,6 +145,46 @@ watch.addEventListener("done", state => { }); ``` +### Bulk scraping multiple URLs + +To bulk scrape multiple URLs with error handling, use the `bulkScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats. + +```js +const bulkScrapeResponse = await app.bulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { + formats: ['markdown', 'html'], +}) +``` + + +#### Asynchronous bulk scrape + +To initiate an asynchronous bulk scrape, utilize the `asyncBulkScrapeUrls` method. 
This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the bulk scrape. + +```js +const asyncBulkScrapeResult = await app.asyncBulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); +``` + +#### Bulk scrape with WebSockets + +To use bulk scrape with WebSockets, use the `bulkScrapeUrlsAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the bulk scrape job, such as the output formats. + +```js +// Bulk scrape multiple URLs with WebSockets: +const watch = await app.bulkScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); + +watch.addEventListener("document", doc => { + console.log("DOC", doc.detail); +}); + +watch.addEventListener("error", err => { + console.error("ERR", err.detail.error); +}); + +watch.addEventListener("done", state => { + console.log("DONE", state.detail.status); +}); +``` + ## Error Handling The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks. diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index a839d5d0..30797c34 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -493,6 +493,144 @@ export default class FirecrawlApp { return { success: false, error: "Internal server error." }; } + /** + * Initiates a bulk scrape job for multiple URLs using the Firecrawl API. + * @param url - The URLs to scrape. + * @param params - Additional parameters for the scrape request. + * @param pollInterval - Time in seconds for job status checks. + * @param idempotencyKey - Optional idempotency key for the request. + * @returns The response from the crawl operation. + */ + async bulkScrapeUrls( + urls: string[], + params?: ScrapeParams, + pollInterval: number = 2, + idempotencyKey?: string + ): Promise { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData: any = { urls, ...(params ?? {}) }; + try { + const response: AxiosResponse = await this.postRequest( + this.apiUrl + `/v1/bulk/scrape`, + jsonData, + headers + ); + if (response.status === 200) { + const id: string = response.data.id; + return this.monitorJobStatus(id, headers, pollInterval); + } else { + this.handleError(response, "start bulk scrape job"); + } + } catch (error: any) { + if (error.response?.data?.error) { + throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status); + } else { + throw new FirecrawlError(error.message, 500); + } + } + return { success: false, error: "Internal server error." }; + } + + async asyncBulkScrapeUrls( + urls: string[], + params?: ScrapeParams, + idempotencyKey?: string + ): Promise { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData: any = { urls, ...(params ?? 
{}) }; + try { + const response: AxiosResponse = await this.postRequest( + this.apiUrl + `/v1/bulk/scrape`, + jsonData, + headers + ); + if (response.status === 200) { + return response.data; + } else { + this.handleError(response, "start bulk scrape job"); + } + } catch (error: any) { + if (error.response?.data?.error) { + throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status); + } else { + throw new FirecrawlError(error.message, 500); + } + } + return { success: false, error: "Internal server error." }; + } + + /** + * Initiates a bulk scrape job and returns a CrawlWatcher to monitor the job via WebSocket. + * @param urls - The URL to scrape. + * @param params - Additional parameters for the scrape request. + * @param idempotencyKey - Optional idempotency key for the request. + * @returns A CrawlWatcher instance to monitor the crawl job. + */ + async bulkScrapeUrlsAndWatch( + urls: string[], + params?: ScrapeParams, + idempotencyKey?: string, + ) { + const crawl = await this.asyncBulkScrapeUrls(urls, params, idempotencyKey); + + if (crawl.success && crawl.id) { + const id = crawl.id; + return new CrawlWatcher(id, this); + } + + throw new FirecrawlError("Bulk scrape job failed to start", 400); + } + + /** + * Checks the status of a bulk scrape job using the Firecrawl API. + * @param id - The ID of the bulk scrape operation. + * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) + * @returns The response containing the job status. + */ + async checkBulkScrapeStatus(id?: string, getAllData = false): Promise { + if (!id) { + throw new FirecrawlError("No bulk scrape ID provided", 400); + } + + const headers: AxiosRequestHeaders = this.prepareHeaders(); + try { + const response: AxiosResponse = await this.getRequest( + `${this.apiUrl}/v1/bulk/scrape/${id}`, + headers + ); + if (response.status === 200) { + let allData = response.data.data; + if (getAllData && response.data.status === "completed") { + let statusData = response.data + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusData = (await this.getRequest(statusData.next, headers)).data; + data = data.concat(statusData.data); + } + allData = data; + } + } + return ({ + success: response.data.success, + status: response.data.status, + total: response.data.total, + completed: response.data.completed, + creditsUsed: response.data.creditsUsed, + expiresAt: new Date(response.data.expiresAt), + next: response.data.next, + data: allData, + error: response.data.error, + }) + } else { + this.handleError(response, "check bulk scrape status"); + } + } catch (error: any) { + throw new FirecrawlError(error.message, 500); + } + return { success: false, error: "Internal server error." }; + } + /** * Prepares the headers for an API request. * @param idempotencyKey - Optional key to ensure idempotency. diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md index a437e0c6..6a7d4a0a 100644 --- a/apps/python-sdk/README.md +++ b/apps/python-sdk/README.md @@ -149,6 +149,69 @@ async def start_crawl_and_watch(): await start_crawl_and_watch() ``` +### Scraping multiple URLs in bulk + +To bulk scrape multiple URLs, use the `bulk_scrape_urls` method. It takes the URLs and optional parameters as arguments. 
The `params` argument allows you to specify additional options for the scraper such as the output formats. + +```python +idempotency_key = str(uuid.uuid4()) # optional idempotency key +bulk_scrape_result = app.bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key) +print(bulk_scrape_result) +``` + +### Asynchronous bulk scrape + +To run a bulk scrape asynchronously, use the `async_bulk_scrape_urls` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. + +```python +bulk_scrape_result = app.async_bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) +print(bulk_scrape_result) +``` + +### Checking bulk scrape status + +To check the status of an asynchronous bulk scrape job, use the `check_bulk_scrape_job` method. It takes the job ID as a parameter and returns the current status of the bulk scrape job. + +```python +id = bulk_scrape_result['id'] +status = app.check_bulk_scrape_job(id) +``` + +### Bulk scrape with WebSockets + +To use bulk scrape with WebSockets, use the `bulk_scrape_urls_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. + +```python +# inside an async function... +nest_asyncio.apply() + +# Define event handlers +def on_document(detail): + print("DOC", detail) + +def on_error(detail): + print("ERR", detail['error']) + +def on_done(detail): + print("DONE", detail['status']) + +# Function to start the crawl and watch process +async def start_crawl_and_watch(): + # Initiate the crawl job and get the watcher + watcher = app.bulk_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) + + # Add event listeners + watcher.add_event_listener("document", on_document) + watcher.add_event_listener("error", on_error) + watcher.add_event_listener("done", on_done) + + # Start the watcher + await watcher.connect() + +# Run the event loop +await start_crawl_and_watch() +``` + ## Error Handling The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 70f677ef..4b596619 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -271,6 +271,123 @@ class FirecrawlApp: else: self._handle_error(response, 'map') + def bulk_scrape_urls(self, urls: list[str], + params: Optional[Dict[str, Any]] = None, + poll_interval: Optional[int] = 2, + idempotency_key: Optional[str] = None) -> Any: + """ + Initiate a bulk scrape job for the specified URLs using the Firecrawl API. + + Args: + urls (list[str]): The URLs to scrape. + params (Optional[Dict[str, Any]]): Additional parameters for the scraper. + poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds. + idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + + Returns: + Dict[str, Any]: A dictionary containing the scrape results. The structure includes: + - 'success' (bool): Indicates if the bulk scrape was successful. + - 'status' (str): The final status of the bulk scrape job (e.g., 'completed'). 
+ - 'completed' (int): Number of scraped pages that completed. + - 'total' (int): Total number of scraped pages. + - 'creditsUsed' (int): Estimated number of API credits used for this bulk scrape. + - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the bulk scrape data expires. + - 'data' (List[Dict]): List of all the scraped pages. + + Raises: + Exception: If the bulk scrape job initiation or monitoring fails. + """ + endpoint = f'/v1/bulk/scrape' + headers = self._prepare_headers(idempotency_key) + json_data = {'urls': urls} + if params: + json_data.update(params) + response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + if response.status_code == 200: + id = response.json().get('id') + return self._monitor_job_status(id, headers, poll_interval) + + else: + self._handle_error(response, 'start bulk scrape job') + + + def async_bulk_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + """ + Initiate a crawl job asynchronously. + + Args: + urls (list[str]): The URLs to scrape. + params (Optional[Dict[str, Any]]): Additional parameters for the scraper. + idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + + Returns: + Dict[str, Any]: A dictionary containing the bulk scrape initiation response. The structure includes: + - 'success' (bool): Indicates if the bulk scrape initiation was successful. + - 'id' (str): The unique identifier for the bulk scrape job. + - 'url' (str): The URL to check the status of the bulk scrape job. + """ + endpoint = f'/v1/bulk/scrape' + headers = self._prepare_headers(idempotency_key) + json_data = {'urls': urls} + if params: + json_data.update(params) + response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + if response.status_code == 200: + return response.json() + else: + self._handle_error(response, 'start bulk scrape job') + + def bulk_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + """ + Initiate a bulk scrape job and return a CrawlWatcher to monitor the job via WebSocket. + + Args: + urls (list[str]): The URLs to scrape. + params (Optional[Dict[str, Any]]): Additional parameters for the scraper. + idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + + Returns: + CrawlWatcher: An instance of CrawlWatcher to monitor the bulk scrape job. + """ + crawl_response = self.async_bulk_scrape_urls(urls, params, idempotency_key) + if crawl_response['success'] and 'id' in crawl_response: + return CrawlWatcher(crawl_response['id'], self) + else: + raise Exception("Bulk scrape job failed to start") + + def check_bulk_scrape_status(self, id: str) -> Any: + """ + Check the status of a bulk scrape job using the Firecrawl API. + + Args: + id (str): The ID of the bulk scrape job. + + Returns: + Any: The status of the bulk scrape job. + + Raises: + Exception: If the status check request fails. 
+ """ + endpoint = f'/v1/bulk/scrape/{id}' + + headers = self._prepare_headers() + response = self._get_request(f'{self.api_url}{endpoint}', headers) + if response.status_code == 200: + data = response.json() + return { + 'success': True, + 'status': data.get('status'), + 'total': data.get('total'), + 'completed': data.get('completed'), + 'creditsUsed': data.get('creditsUsed'), + 'expiresAt': data.get('expiresAt'), + 'next': data.get('next'), + 'data': data.get('data'), + 'error': data.get('error') + } + else: + self._handle_error(response, 'check bulk scrape status') + def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. From 70c4e7c334af199f68ed21559e8d1f3cc0e6179a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 23 Oct 2024 19:42:02 +0200 Subject: [PATCH 3/8] feat(bulk/scrape): check credits via url list length --- apps/api/src/routes/v1.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 4bebcbb7..2984faaf 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -30,7 +30,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R return (req, res, next) => { (async () => { if (!minimum && req.body) { - minimum = (req.body as any)?.limit ?? 1; + minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1; } const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum); req.acuc = chunk; From d8abd157164c4b9b4935961c0d2b07857119d8f1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 15:37:24 -0300 Subject: [PATCH 4/8] Nick: from bulk to batch --- .../v1/{bulk-scrape.ts => batch-scrape.ts} | 12 ++--- apps/api/src/controllers/v1/crawl-status.ts | 4 +- apps/api/src/controllers/v1/types.ts | 4 +- apps/api/src/routes/v1.ts | 11 ++-- apps/js-sdk/firecrawl/README.md | 18 +++---- apps/js-sdk/firecrawl/src/index.ts | 34 ++++++------ apps/python-sdk/README.md | 30 +++++------ apps/python-sdk/firecrawl/firecrawl.py | 54 +++++++++---------- 8 files changed, 84 insertions(+), 83 deletions(-) rename apps/api/src/controllers/v1/{bulk-scrape.ts => batch-scrape.ts} (87%) diff --git a/apps/api/src/controllers/v1/bulk-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts similarity index 87% rename from apps/api/src/controllers/v1/bulk-scrape.ts rename to apps/api/src/controllers/v1/batch-scrape.ts index 3e1afbd0..7c68341b 100644 --- a/apps/api/src/controllers/v1/bulk-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -1,8 +1,8 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; import { - BulkScrapeRequest, - bulkScrapeRequestSchema, + BatchScrapeRequest, + batchScrapeRequestSchema, CrawlResponse, legacyScrapeOptions, RequestWithAuth, @@ -17,11 +17,11 @@ import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; import { getJobPriority } from "../../lib/job-priority"; -export async function bulkScrapeController( - req: RequestWithAuth<{}, CrawlResponse, BulkScrapeRequest>, +export async function batchScrapeController( + req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>, res: Response ) { - req.body = bulkScrapeRequestSchema.parse(req.body); + req.body = batchScrapeRequestSchema.parse(req.body); const id = uuidv4(); @@ -92,7 +92,7 @@ export async function bulkScrapeController( return res.status(200).json({ 
success: true, id, - url: `${protocol}://${req.get("host")}/v1/bulk/scrape/${id}`, + url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`, }); } diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index b753b17b..a8d78293 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) { return jobs; } -export async function crawlStatusController(req: RequestWithAuth, res: Response) { +export async function crawlStatusController(req: RequestWithAuth, res: Response, isBatch = false) { const sc = await getCrawl(req.params.jobId); if (!sc) { return res.status(404).json({ success: false, error: "Job not found" }); @@ -113,7 +113,7 @@ export async function crawlStatusController(req: RequestWithAuth x.returnvalue); const protocol = process.env.ENV === "local" ? req.protocol : "https"; - const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`); + const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`); nextURL.searchParams.set("skip", (start + data.length).toString()); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 4938b074..9705b855 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -144,7 +144,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({ export type ScrapeRequest = z.infer; -export const bulkScrapeRequestSchema = scrapeOptions.extend({ +export const batchScrapeRequestSchema = scrapeOptions.extend({ urls: url.array(), origin: z.string().optional().default("api"), }).strict(strictMessage).refine( @@ -163,7 +163,7 @@ export const bulkScrapeRequestSchema = scrapeOptions.extend({ return obj; }); -export type BulkScrapeRequest = z.infer; +export type BatchScrapeRequest = z.infer; const crawlerOptions = z.object({ includePaths: z.string().array().default([]), diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 2984faaf..4e4b6052 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -17,7 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel"; import { Logger } from "../lib/logger"; import { scrapeStatusController } from "../controllers/v1/scrape-status"; import { concurrencyCheckController } from "../controllers/v1/concurrency-check"; -import { bulkScrapeController } from "../controllers/v1/bulk-scrape"; +import { batchScrapeController } from "../controllers/v1/batch-scrape"; // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; // import { searchController } from "../../src/controllers/v1/search"; @@ -124,12 +124,12 @@ v1Router.post( ); v1Router.post( - "/bulk/scrape", + "/batch/scrape", authMiddleware(RateLimiterMode.Crawl), checkCreditsMiddleware(), blocklistMiddleware, idempotencyMiddleware, - wrap(bulkScrapeController) + wrap(batchScrapeController) ); v1Router.post( @@ -147,9 +147,10 @@ v1Router.get( ); v1Router.get( - "/bulk/scrape/:jobId", + "/batch/scrape/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), - wrap(crawlStatusController) + // Yes, it uses the same controller as the normal crawl status controller + wrap((req:any, res):any => crawlStatusController(req, res, true)) ); v1Router.get( diff --git a/apps/js-sdk/firecrawl/README.md 
b/apps/js-sdk/firecrawl/README.md index 1655d2ee..a90907ba 100644 --- a/apps/js-sdk/firecrawl/README.md +++ b/apps/js-sdk/firecrawl/README.md @@ -145,32 +145,32 @@ watch.addEventListener("done", state => { }); ``` -### Bulk scraping multiple URLs +### Batch scraping multiple URLs -To bulk scrape multiple URLs with error handling, use the `bulkScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats. +To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats. ```js -const bulkScrapeResponse = await app.bulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { +const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'], }) ``` -#### Asynchronous bulk scrape +#### Asynchronous batch scrape -To initiate an asynchronous bulk scrape, utilize the `asyncBulkScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the bulk scrape. +To initiate an asynchronous batch scrape, utilize the `asyncBulkScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the batch scrape. ```js const asyncBulkScrapeResult = await app.asyncBulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); ``` -#### Bulk scrape with WebSockets +#### Batch scrape with WebSockets -To use bulk scrape with WebSockets, use the `bulkScrapeUrlsAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the bulk scrape job, such as the output formats. +To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats. ```js -// Bulk scrape multiple URLs with WebSockets: -const watch = await app.bulkScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); +// Batch scrape multiple URLs with WebSockets: +const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); watch.addEventListener("document", doc => { console.log("DOC", doc.detail); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 30797c34..e9985683 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -494,14 +494,14 @@ export default class FirecrawlApp { } /** - * Initiates a bulk scrape job for multiple URLs using the Firecrawl API. + * Initiates a batch scrape job for multiple URLs using the Firecrawl API. * @param url - The URLs to scrape. 
* @param params - Additional parameters for the scrape request. * @param pollInterval - Time in seconds for job status checks. * @param idempotencyKey - Optional idempotency key for the request. * @returns The response from the crawl operation. */ - async bulkScrapeUrls( + async batchScrapeUrls( urls: string[], params?: ScrapeParams, pollInterval: number = 2, @@ -511,7 +511,7 @@ export default class FirecrawlApp { let jsonData: any = { urls, ...(params ?? {}) }; try { const response: AxiosResponse = await this.postRequest( - this.apiUrl + `/v1/bulk/scrape`, + this.apiUrl + `/v1/batch/scrape`, jsonData, headers ); @@ -519,7 +519,7 @@ export default class FirecrawlApp { const id: string = response.data.id; return this.monitorJobStatus(id, headers, pollInterval); } else { - this.handleError(response, "start bulk scrape job"); + this.handleError(response, "start batch scrape job"); } } catch (error: any) { if (error.response?.data?.error) { @@ -531,7 +531,7 @@ export default class FirecrawlApp { return { success: false, error: "Internal server error." }; } - async asyncBulkScrapeUrls( + async asyncBatchScrapeUrls( urls: string[], params?: ScrapeParams, idempotencyKey?: string @@ -540,14 +540,14 @@ export default class FirecrawlApp { let jsonData: any = { urls, ...(params ?? {}) }; try { const response: AxiosResponse = await this.postRequest( - this.apiUrl + `/v1/bulk/scrape`, + this.apiUrl + `/v1/batch/scrape`, jsonData, headers ); if (response.status === 200) { return response.data; } else { - this.handleError(response, "start bulk scrape job"); + this.handleError(response, "start batch scrape job"); } } catch (error: any) { if (error.response?.data?.error) { @@ -560,42 +560,42 @@ export default class FirecrawlApp { } /** - * Initiates a bulk scrape job and returns a CrawlWatcher to monitor the job via WebSocket. + * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket. * @param urls - The URL to scrape. * @param params - Additional parameters for the scrape request. * @param idempotencyKey - Optional idempotency key for the request. * @returns A CrawlWatcher instance to monitor the crawl job. */ - async bulkScrapeUrlsAndWatch( + async batchScrapeUrlsAndWatch( urls: string[], params?: ScrapeParams, idempotencyKey?: string, ) { - const crawl = await this.asyncBulkScrapeUrls(urls, params, idempotencyKey); + const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey); if (crawl.success && crawl.id) { const id = crawl.id; return new CrawlWatcher(id, this); } - throw new FirecrawlError("Bulk scrape job failed to start", 400); + throw new FirecrawlError("Batch scrape job failed to start", 400); } /** - * Checks the status of a bulk scrape job using the Firecrawl API. - * @param id - The ID of the bulk scrape operation. + * Checks the status of a batch scrape job using the Firecrawl API. + * @param id - The ID of the batch scrape operation. * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) * @returns The response containing the job status. 
*/ - async checkBulkScrapeStatus(id?: string, getAllData = false): Promise { + async checkBatchScrapeStatus(id?: string, getAllData = false): Promise { if (!id) { - throw new FirecrawlError("No bulk scrape ID provided", 400); + throw new FirecrawlError("No batch scrape ID provided", 400); } const headers: AxiosRequestHeaders = this.prepareHeaders(); try { const response: AxiosResponse = await this.getRequest( - `${this.apiUrl}/v1/bulk/scrape/${id}`, + `${this.apiUrl}/v1/batch/scrape/${id}`, headers ); if (response.status === 200) { @@ -623,7 +623,7 @@ export default class FirecrawlApp { error: response.data.error, }) } else { - this.handleError(response, "check bulk scrape status"); + this.handleError(response, "check batch scrape status"); } } catch (error: any) { throw new FirecrawlError(error.message, 500); diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md index 6a7d4a0a..412c3e05 100644 --- a/apps/python-sdk/README.md +++ b/apps/python-sdk/README.md @@ -149,37 +149,37 @@ async def start_crawl_and_watch(): await start_crawl_and_watch() ``` -### Scraping multiple URLs in bulk +### Scraping multiple URLs in batch -To bulk scrape multiple URLs, use the `bulk_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper such as the output formats. +To batch scrape multiple URLs, use the `batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper such as the output formats. ```python idempotency_key = str(uuid.uuid4()) # optional idempotency key -bulk_scrape_result = app.bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key) -print(bulk_scrape_result) +batch_scrape_result = app.batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key) +print(batch_scrape_result) ``` -### Asynchronous bulk scrape +### Asynchronous batch scrape -To run a bulk scrape asynchronously, use the `async_bulk_scrape_urls` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. +To run a batch scrape asynchronously, use the `async_batch_scrape_urls` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. ```python -bulk_scrape_result = app.async_bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) -print(bulk_scrape_result) +batch_scrape_result = app.async_batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) +print(batch_scrape_result) ``` -### Checking bulk scrape status +### Checking batch scrape status -To check the status of an asynchronous bulk scrape job, use the `check_bulk_scrape_job` method. It takes the job ID as a parameter and returns the current status of the bulk scrape job. +To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_job` method. It takes the job ID as a parameter and returns the current status of the batch scrape job. 
```python -id = bulk_scrape_result['id'] -status = app.check_bulk_scrape_job(id) +id = batch_scrape_result['id'] +status = app.check_batch_scrape_job(id) ``` -### Bulk scrape with WebSockets +### Batch scrape with WebSockets -To use bulk scrape with WebSockets, use the `bulk_scrape_urls_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. +To use batch scrape with WebSockets, use the `batch_scrape_urls_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. ```python # inside an async function... @@ -198,7 +198,7 @@ def on_done(detail): # Function to start the crawl and watch process async def start_crawl_and_watch(): # Initiate the crawl job and get the watcher - watcher = app.bulk_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) + watcher = app.batch_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) # Add event listeners watcher.add_event_listener("document", on_document) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 3b8e39e0..1986ddd2 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -275,12 +275,12 @@ class FirecrawlApp: else: self._handle_error(response, 'map') - def bulk_scrape_urls(self, urls: list[str], + def batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, poll_interval: Optional[int] = 2, idempotency_key: Optional[str] = None) -> Any: """ - Initiate a bulk scrape job for the specified URLs using the Firecrawl API. + Initiate a batch scrape job for the specified URLs using the Firecrawl API. Args: urls (list[str]): The URLs to scrape. @@ -290,18 +290,18 @@ class FirecrawlApp: Returns: Dict[str, Any]: A dictionary containing the scrape results. The structure includes: - - 'success' (bool): Indicates if the bulk scrape was successful. - - 'status' (str): The final status of the bulk scrape job (e.g., 'completed'). + - 'success' (bool): Indicates if the batch scrape was successful. + - 'status' (str): The final status of the batch scrape job (e.g., 'completed'). - 'completed' (int): Number of scraped pages that completed. - 'total' (int): Total number of scraped pages. - - 'creditsUsed' (int): Estimated number of API credits used for this bulk scrape. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the bulk scrape data expires. + - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape. + - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires. - 'data' (List[Dict]): List of all the scraped pages. Raises: - Exception: If the bulk scrape job initiation or monitoring fails. + Exception: If the batch scrape job initiation or monitoring fails. 
""" - endpoint = f'/v1/bulk/scrape' + endpoint = f'/v1/batch/scrape' headers = self._prepare_headers(idempotency_key) json_data = {'urls': urls} if params: @@ -312,10 +312,10 @@ class FirecrawlApp: return self._monitor_job_status(id, headers, poll_interval) else: - self._handle_error(response, 'start bulk scrape job') + self._handle_error(response, 'start batch scrape job') - def async_bulk_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: """ Initiate a crawl job asynchronously. @@ -325,12 +325,12 @@ class FirecrawlApp: idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. Returns: - Dict[str, Any]: A dictionary containing the bulk scrape initiation response. The structure includes: - - 'success' (bool): Indicates if the bulk scrape initiation was successful. - - 'id' (str): The unique identifier for the bulk scrape job. - - 'url' (str): The URL to check the status of the bulk scrape job. + Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes: + - 'success' (bool): Indicates if the batch scrape initiation was successful. + - 'id' (str): The unique identifier for the batch scrape job. + - 'url' (str): The URL to check the status of the batch scrape job. """ - endpoint = f'/v1/bulk/scrape' + endpoint = f'/v1/batch/scrape' headers = self._prepare_headers(idempotency_key) json_data = {'urls': urls} if params: @@ -339,11 +339,11 @@ class FirecrawlApp: if response.status_code == 200: return response.json() else: - self._handle_error(response, 'start bulk scrape job') + self._handle_error(response, 'start batch scrape job') - def bulk_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': """ - Initiate a bulk scrape job and return a CrawlWatcher to monitor the job via WebSocket. + Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket. Args: urls (list[str]): The URLs to scrape. @@ -351,28 +351,28 @@ class FirecrawlApp: idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. Returns: - CrawlWatcher: An instance of CrawlWatcher to monitor the bulk scrape job. + CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job. """ - crawl_response = self.async_bulk_scrape_urls(urls, params, idempotency_key) + crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key) if crawl_response['success'] and 'id' in crawl_response: return CrawlWatcher(crawl_response['id'], self) else: - raise Exception("Bulk scrape job failed to start") + raise Exception("Batch scrape job failed to start") - def check_bulk_scrape_status(self, id: str) -> Any: + def check_batch_scrape_status(self, id: str) -> Any: """ - Check the status of a bulk scrape job using the Firecrawl API. + Check the status of a batch scrape job using the Firecrawl API. Args: - id (str): The ID of the bulk scrape job. + id (str): The ID of the batch scrape job. Returns: - Any: The status of the bulk scrape job. + Any: The status of the batch scrape job. Raises: Exception: If the status check request fails. 
""" - endpoint = f'/v1/bulk/scrape/{id}' + endpoint = f'/v1/batch/scrape/{id}' headers = self._prepare_headers() response = self._get_request(f'{self.api_url}{endpoint}', headers) @@ -390,7 +390,7 @@ class FirecrawlApp: 'error': data.get('error') } else: - self._handle_error(response, 'check bulk scrape status') + self._handle_error(response, 'check batch scrape status') def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ From 60b6e6b1d4bdfcec635e3a2c55f5386a13a4b05d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 15:59:40 -0300 Subject: [PATCH 5/8] Nick: fixes --- apps/js-sdk/firecrawl/package.json | 4 ++-- apps/js-sdk/firecrawl/src/index.ts | 32 +++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e50205d5..0dfb4d69 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { - "name": "firecrawl", - "version": "1.6.1", + "name": "@mendable/firecrawl-js", + "version": "1.7.0-beta.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index e9985683..3e9c7bdf 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -154,6 +154,17 @@ export interface CrawlResponse { error?: string; } +/** + * Response interface for crawling operations. + * Defines the structure of the response received after initiating a crawl. + */ +export interface BatchScrapeResponse { + id?: string; + url?: string; + success: true; + error?: string; +} + /** * Response interface for job status checks. * Provides detailed status of a crawl job including progress and results. @@ -169,6 +180,21 @@ export interface CrawlStatusResponse { data: FirecrawlDocument[]; }; +/** + * Response interface for job status checks. + * Provides detailed status of a crawl job including progress and results. + */ +export interface BatchScrapeStatusResponse { + success: true; + status: "scraping" | "completed" | "failed" | "cancelled"; + completed: number; + total: number; + creditsUsed: number; + expiresAt: Date; + next?: string; + data: FirecrawlDocument[]; +}; + /** * Parameters for mapping operations. * Defines options for mapping URLs during a crawl. @@ -506,7 +532,7 @@ export default class FirecrawlApp { params?: ScrapeParams, pollInterval: number = 2, idempotencyKey?: string - ): Promise { + ): Promise { const headers = this.prepareHeaders(idempotencyKey); let jsonData: any = { urls, ...(params ?? {}) }; try { @@ -535,7 +561,7 @@ export default class FirecrawlApp { urls: string[], params?: ScrapeParams, idempotencyKey?: string - ): Promise { + ): Promise { const headers = this.prepareHeaders(idempotencyKey); let jsonData: any = { urls, ...(params ?? {}) }; try { @@ -587,7 +613,7 @@ export default class FirecrawlApp { * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) * @returns The response containing the job status. 
*/ - async checkBatchScrapeStatus(id?: string, getAllData = false): Promise { + async checkBatchScrapeStatus(id?: string, getAllData = false): Promise { if (!id) { throw new FirecrawlError("No batch scrape ID provided", 400); } From c7f217098075a974c6a274cad04e8f8111cd8379 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 16:04:46 -0300 Subject: [PATCH 6/8] Update example.py --- apps/python-sdk/example.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index 02c06288..e7c80b30 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -9,6 +9,23 @@ app = FirecrawlApp(api_key="fc-") scrape_result = app.scrape_url('firecrawl.dev') print(scrape_result['markdown']) + +# Test batch scrape +urls = ['https://example.com', 'https://docs.firecrawl.dev'] +batch_scrape_params = { + 'formats': ['markdown', 'html'], +} + +# Synchronous batch scrape +batch_result = app.batch_scrape_urls(urls, batch_scrape_params) +print("Synchronous Batch Scrape Result:") +print(batch_result['data'][0]['markdown']) + +# Asynchronous batch scrape +async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params) +print("\nAsynchronous Batch Scrape Result:") +print(async_batch_result) + # Crawl a website: idempotency_key = str(uuid.uuid4()) # optional idempotency key crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key) From f0054da934a6965450d8cb902afd32dd1f1e3715 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 16:06:08 -0300 Subject: [PATCH 7/8] Nick: lgtm --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/package-lock.json | 22 ++++------------------ apps/js-sdk/package.json | 2 +- apps/python-sdk/firecrawl/__init__.py | 2 +- 4 files changed, 7 insertions(+), 21 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 0dfb4d69..16f1b595 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.7.0-beta.2", + "version": "1.7.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index 975b14e8..3bba385f 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -9,7 +9,7 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^1.0.3", + "@mendable/firecrawl-js": "^1.7.0-beta.2", "axios": "^1.6.8", "firecrawl": "^1.2.0", "ts-node": "^10.9.2", @@ -423,31 +423,17 @@ } }, "node_modules/@mendable/firecrawl-js": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz", - "integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==", + "version": "1.7.0-beta.2", + "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.7.0-beta.2.tgz", + "integrity": "sha512-6L5r6BOuMPjLgSDq85xs2IpVgX9Tb/EdesKZvmtFucoaFZzIsgCQb0ZfSvwaRmqTkj53o+7eSgCcm+gsnR/yeQ==", "dependencies": { "axios": "^1.6.8", - "dotenv": "^16.4.5", "isows": "^1.0.4", "typescript-event-target": "^1.1.1", - "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" } }, - "node_modules/@mendable/firecrawl-js/node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - "integrity": 
"sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "bin": { - "uuid": "dist/bin/uuid" - } - }, "node_modules/@tsconfig/node10": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz", diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index ac3ef038..6324707f 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -11,7 +11,7 @@ "author": "", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^1.0.3", + "@mendable/firecrawl-js": "1.7.0-beta.2", "axios": "^1.6.8", "firecrawl": "^1.2.0", "ts-node": "^10.9.2", diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 5700a3e6..82c73348 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "1.3.1" +__version__ = "1.4.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From b11035814a6c3ef9809a9e1c28544872e7b4c3c9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 16:10:21 -0300 Subject: [PATCH 8/8] Nick: --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 16f1b595..a7fb2d83 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.7.0", + "version": "1.7.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 3e9c7bdf..491df1e4 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -155,7 +155,7 @@ export interface CrawlResponse { } /** - * Response interface for crawling operations. + * Response interface for batch scrape operations. * Defines the structure of the response received after initiating a crawl. */ export interface BatchScrapeResponse { @@ -181,8 +181,8 @@ export interface CrawlStatusResponse { }; /** - * Response interface for job status checks. - * Provides detailed status of a crawl job including progress and results. + * Response interface for batch scrape job status checks. + * Provides detailed status of a batch scrape job including progress and results. */ export interface BatchScrapeStatusResponse { success: true;