From 503e83e83edcf578ab7acec11275ad5695bc3c41 Mon Sep 17 00:00:00 2001 From: Sebastjan Prachovskij Date: Tue, 3 Sep 2024 18:26:11 +0300 Subject: [PATCH 001/102] Add SearchApi to search Add support for engines, improve status code error Remove changes in package, add engine to env params Improve description in env example Remove unnecessary empty line Improve text --- apps/api/.env.example | 11 ++++-- apps/api/.env.local | 2 +- apps/api/src/search/index.ts | 12 ++++++- apps/api/src/search/searchapi.ts | 60 ++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 apps/api/src/search/searchapi.ts diff --git a/apps/api/.env.example b/apps/api/.env.example index f3c1dc1b..6ba49daa 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -1,5 +1,5 @@ # ===== Required ENVS ====== -NUM_WORKERS_PER_QUEUE=8 +NUM_WORKERS_PER_QUEUE=8 PORT=3002 HOST=0.0.0.0 REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379 @@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true # ===== Optional ENVS ====== +# SearchApi key. Head to https://searchapi.com/ to get your API key +SEARCHAPI_API_KEY= +# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines +SEARCHAPI_ENGINE= + # Supabase Setup (used to support DB authentication, advanced logging, etc.) -SUPABASE_ANON_TOKEN= -SUPABASE_URL= +SUPABASE_ANON_TOKEN= +SUPABASE_URL= SUPABASE_SERVICE_TOKEN= # Other Optionals diff --git a/apps/api/.env.local b/apps/api/.env.local index 17f85935..9fa41498 100644 --- a/apps/api/.env.local +++ b/apps/api/.env.local @@ -12,4 +12,4 @@ ANTHROPIC_API_KEY= BULL_AUTH_KEY= LOGTAIL_KEY= PLAYWRIGHT_MICROSERVICE_URL= - +SEARCHAPI_API_KEY= diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index f4c5b6d0..3bcb85d2 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger"; import { SearchResult } from "../../src/lib/entities"; import { googleSearch } from "./googlesearch"; import { fireEngineMap } from "./fireEngine"; +import { searchapi_search } from "./searchapi"; import { serper_search } from "./serper"; export async function search({ @@ -30,7 +31,16 @@ export async function search({ timeout?: number; }): Promise { try { - + if (process.env.SEARCHAPI_API_KEY) { + return await searchapi_search(query, { + num_results, + tbs, + filter, + lang, + country, + location + }); + } if (process.env.SERPER_API_KEY) { return await serper_search(query, { num_results, diff --git a/apps/api/src/search/searchapi.ts b/apps/api/src/search/searchapi.ts new file mode 100644 index 00000000..24778a77 --- /dev/null +++ b/apps/api/src/search/searchapi.ts @@ -0,0 +1,60 @@ +import axios from "axios"; +import dotenv from "dotenv"; +import { SearchResult } from "../../src/lib/entities"; + +dotenv.config(); + +interface SearchOptions { + tbs?: string; + filter?: string; + lang?: string; + country?: string; + location?: string; + num_results: number; + page?: number; +} + +export async function searchapi_search(q: string, options: SearchOptions): Promise { + const params = { + q: q, + hl: options.lang, + gl: options.country, + location: options.location, + num: options.num_results, + page: options.page ?? 
1, + engine: process.env.SEARCHAPI_ENGINE || "google", + }; + + const url = `https://www.searchapi.io/api/v1/search`; + + try { + const response = await axios.get(url, { + headers: { + "Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`, + "Content-Type": "application/json", + "X-SearchApi-Source": "Firecrawl", + }, + params: params, + }); + + + if (response.status === 401) { + throw new Error("Unauthorized. Please check your API key."); + } + + const data = response.data; + + if (data && Array.isArray(data.organic_results)) { + return data.organic_results.map((a: any) => ({ + url: a.link, + title: a.title, + description: a.snippet, + })); + } else { + return []; + } + } catch (error) { + console.error(`There was an error searching for content: ${error.message}`); + return []; + } +} From 1c02187054ad1847dcee77728255018ec743f6d5 Mon Sep 17 00:00:00 2001 From: Stijn Smits <167638923+s-smits@users.noreply.github.com> Date: Sun, 6 Oct 2024 13:25:23 +0200 Subject: [PATCH 002/102] Update website_qa_with_gemini_caching.ipynb --- .../website_qa_with_gemini_caching.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb index 0876affa..9a2244a1 100644 --- a/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb +++ b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb @@ -98,7 +98,7 @@ "source": [ "# Create a cache with a 5 minute TTL\n", "cache = caching.CachedContent.create(\n", - " model=\"models/gemini-1.5-pro-001\",\n", + " model=\"models/gemini-1.5-pro-002\",\n", " display_name=\"website crawl testing again\", # used to identify the cache\n", " system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n", " contents=[text_file],\n", From 460f5581fef1d800e2a96fbf10f30b242921ac60 Mon Sep 17 00:00:00 2001 From: Stijn Smits <167638923+s-smits@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:17:47 +0200 Subject: [PATCH 003/102] Add files via upload --- ...website_qa_with_gemini_flash_caching.ipynb | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb diff --git a/examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb new file mode 100644 index 00000000..19d72c9d --- /dev/null +++ b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "import datetime\n", + "import time\n", + "import google.generativeai as genai\n", + "from google.generativeai import caching\n", + "from dotenv import load_dotenv\n", + "from firecrawl import FirecrawlApp\n", + "import json\n", + "\n", + "# Load environment variables\n", + "load_dotenv()\n", + "\n", + "# Retrieve API keys from environment variables\n", + "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "# Configure the Google Generative AI module with the API key\n", + "genai.configure(api_key=google_api_key)\n", + "\n", + "# Initialize the FirecrawlApp with your API key\n", + "app = FirecrawlApp(api_key=firecrawl_api_key)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No data returned from crawl.\n" + ] + } + ], + "source": [ + "# Crawl a website\n", + "crawl_url = 'https://dify.ai/'\n", + "params = {\n", + " \n", + " 'crawlOptions': {\n", + " 'limit': 100\n", + " }\n", + "}\n", + "crawl_result = app.crawl_url(crawl_url, params=params)\n", + "\n", + "if crawl_result is not None:\n", + " # Convert crawl results to JSON format, excluding 'content' field from each entry\n", + " cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n", + "\n", + " # Save the modified results as a text file containing JSON data\n", + " with open('crawl_result.txt', 'w') as file:\n", + " file.write(json.dumps(cleaned_crawl_result, indent=4))\n", + "else:\n", + " print(\"No data returned from crawl.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Upload the video using the Files API\n", + "text_file = genai.upload_file(path=\"crawl_result.txt\")\n", + "\n", + "# Wait for the file to finish processing\n", + "while text_file.state.name == \"PROCESSING\":\n", + " print('Waiting for file to be processed.')\n", + " time.sleep(2)\n", + " text_file = genai.get_file(text_file.name)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a cache with a 5 minute TTL\n", + "cache = caching.CachedContent.create(\n", + " model=\"models/gemini-1.5-flash-002\",\n", + " display_name=\"website crawl testing again\", # used to identify the cache\n", + " system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n", + " contents=[text_file],\n", + " ttl=datetime.timedelta(minutes=15),\n", + ")\n", + "# Construct a GenerativeModel which uses the created cache.\n", + "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. 
\n", + "\n", + "Here's how Firecrawl helps:\n", + "\n", + "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n", + "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n", + "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n", + "\n", + "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n", + "\n", + "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n", + "\n" + ] + } + ], + "source": [ + "# Query the model\n", + "response = model.generate_content([\"What powers website scraping with Dify?\"])\n", + "response_dict = response.to_dict()\n", + "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n", + "print(response_text)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0934dd88d3d30366239dd2712f4720f34f4ac430 Mon Sep 17 00:00:00 2001 From: busaud Date: Thu, 10 Oct 2024 09:35:12 +0300 Subject: [PATCH 004/102] Update README.md I believe wait_until_done was removed as of v1? --- apps/python-sdk/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md index dcf44b25..a437e0c6 100644 --- a/apps/python-sdk/README.md +++ b/apps/python-sdk/README.md @@ -36,7 +36,6 @@ crawl_status = app.crawl_url( 'limit': 100, 'scrapeOptions': {'formats': ['markdown', 'html']} }, - wait_until_done=True, poll_interval=30 ) print(crawl_status) From 8cbd94ed2de46a56d215c7c434f78883e406e35f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 11 Oct 2024 09:45:51 -0300 Subject: [PATCH 005/102] fix/filters failed and unknown jobs now --- apps/api/src/controllers/v1/crawl-status.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 63331c9c..084685c7 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -64,7 +64,7 @@ export async function crawlStatusController(req: RequestWithAuth !jobStatuses.some(status => status[0] === id && status[1] === "failed")); // filter the job statues - jobStatuses = jobStatuses.filter(x => x[1] !== "failed"); + jobStatuses = jobStatuses.filter(x => x[1] !== "failed" && x[1] !== "unknown"); const status: Exclude["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping"; const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId); const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? 
-1); From e57a8e9d4539ed62bacf0e041cf4c4f955131b76 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 11 Oct 2024 13:52:18 -0300 Subject: [PATCH 006/102] better explain how includePaths and excludePaths work --- apps/api/v1-openapi.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/v1-openapi.json b/apps/api/v1-openapi.json index 1ff0fb9b..1253a27b 100644 --- a/apps/api/v1-openapi.json +++ b/apps/api/v1-openapi.json @@ -341,14 +341,14 @@ "items": { "type": "string" }, - "description": "URL patterns to exclude" + "description": "Specifies URL patterns to exclude from the crawl by comparing website paths against the provided regex patterns. For example, if you set \"excludePaths\": [\"blog/*\"] for the base URL firecrawl.dev, any results matching that pattern will be excluded, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap." }, "includePaths": { "type": "array", "items": { "type": "string" }, - "description": "URL patterns to include" + "description": "Specifies URL patterns to include in the crawl by comparing website paths against the provided regex patterns. Only the paths that match the specified patterns will be included in the response. For example, if you set \"includePaths\": [\"blog/*\"] for the base URL firecrawl.dev, only results matching that pattern will be included, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap." }, "maxDepth": { "type": "integer", From e916ea7e1a974cccb2cd3cfeeb72ad7128e1d59c Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 11 Oct 2024 13:55:15 -0300 Subject: [PATCH 007/102] updated openapi.json --- apps/api/v1-openapi.json | 142 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 139 insertions(+), 3 deletions(-) diff --git a/apps/api/v1-openapi.json b/apps/api/v1-openapi.json index 1253a27b..6cd2b3da 100644 --- a/apps/api/v1-openapi.json +++ b/apps/api/v1-openapi.json @@ -6,7 +6,7 @@ "description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.", "contact": { "name": "Firecrawl Support", - "url": "https://firecrawl.dev", + "url": "https://firecrawl.dev/support", "email": "support@firecrawl.dev" } }, @@ -97,6 +97,127 @@ "description": "The prompt to use for the extraction without a schema (Optional)" } } + }, + "actions": { + "type": "array", + "description": "Actions to perform on the page before grabbing the content", + "items": { + "oneOf": [ + { + "type": "object", + "title": "Wait", + "properties": { + "type": { + "type": "string", + "enum": ["wait"], + "description": "Wait for a specified amount of milliseconds" + }, + "milliseconds": { + "type": "integer", + "minimum": 1, + "description": "Number of milliseconds to wait" + } + }, + "required": ["type", "milliseconds"] + }, + { + "type": "object", + "title": "Screenshot", + "properties": { + "type": { + "type": "string", + "enum": ["screenshot"], + "description": "Take a screenshot" + }, + "fullPage": { + "type": "boolean", + "description": "Should the screenshot be full-page or viewport sized?", + "default": false + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Click", + "properties": { + "type": { + "type": "string", + "enum": ["click"], + "description": "Click on an element" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#load-more-button" + } + }, + "required": 
["type", "selector"] + }, + { + "type": "object", + "title": "Write text", + "properties": { + "type": { + "type": "string", + "enum": ["write"], + "description": "Write text into an input field" + }, + "text": { + "type": "string", + "description": "Text to type", + "example": "Hello, world!" + }, + "selector": { + "type": "string", + "description": "Query selector for the input field", + "example": "#search-input" + } + }, + "required": ["type", "text", "selector"] + }, + { + "type": "object", + "title": "Press a key", + "description": "Press a key on the page. See https://asawicki.info/nosense/doc/devices/keyboard/key_codes.html for key codes.", + "properties": { + "type": { + "type": "string", + "enum": ["press"], + "description": "Press a key on the page" + }, + "key": { + "type": "string", + "description": "Key to press", + "example": "Enter" + } + }, + "required": ["type", "key"] + }, + { + "type": "object", + "title": "Scroll", + "properties": { + "type": { + "type": "string", + "enum": ["scroll"], + "description": "Scroll the page" + }, + "direction": { + "type": "string", + "enum": ["up", "down"], + "description": "Direction to scroll" + }, + "amount": { + "type": "integer", + "description": "Amount to scroll in pixels", + "minimum": 1 + } + }, + "required": ["type", "direction"] + } + ] + } } }, "required": ["url"] @@ -362,7 +483,7 @@ }, "limit": { "type": "integer", - "description": "Maximum number of pages to crawl", + "description": "Maximum number of pages to crawl. Default limit is 10000.", "default": 10 }, "allowBackwardLinks": { @@ -513,7 +634,7 @@ }, "search": { "type": "string", - "description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 100 search results. However, if map finds more results, there is no limit applied." + "description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 1000 search results. However, if map finds more results, there is no limit applied." }, "ignoreSitemap": { "type": "boolean", @@ -642,6 +763,21 @@ }, "description": "List of links on the page if `links` is in `formats`" }, + "actions": { + "type": "object", + "nullable": true, + "description": "Results of the actions specified in the `actions` parameter. 
Only present if the `actions` parameter was provided in the request", + "properties": { + "screenshots": { + "type": "array", + "description": "Screenshot URLs, in the same order as the screenshot actions provided.", + "items": { + "type": "string", + "format": "url" + } + } + } + }, "metadata": { "type": "object", "properties": { From 257a95113231381f7334793fa806f171701890f6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 11 Oct 2024 14:21:04 -0300 Subject: [PATCH 008/102] Update auth.ts --- apps/api/src/controllers/auth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 38407f0c..8aa669cd 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -77,7 +77,7 @@ export async function getACUC(api_key: string, cacheOnly = false): Promise Date: Fri, 11 Oct 2024 15:29:25 -0300 Subject: [PATCH 009/102] Update auth.ts --- apps/api/src/controllers/auth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 8aa669cd..0367358f 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -77,7 +77,7 @@ export async function getACUC(api_key: string, cacheOnly = false): Promise Date: Fri, 11 Oct 2024 15:40:29 -0300 Subject: [PATCH 010/102] fix/added unkwown status to job filter --- apps/api/src/controllers/v0/crawl-status.ts | 2 +- apps/api/src/controllers/v1/crawl-status-ws.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index 4c50b375..66522bcf 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -60,7 +60,7 @@ export async function crawlStatusController(req: Request, res: Response) { })); // Filter out failed jobs - jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed"); + jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed" && x.status !== "unknown"); // Sort jobs by timestamp jobsWithStatuses.sort((a, b) => a.job.timestamp - b.job.timestamp); diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index b67e559b..0d6d5803 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -101,7 +101,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth !jobStatuses.some(status => status[0] === id && status[1] === "failed")); // filter the job statues - jobStatuses = jobStatuses.filter(x => x[1] !== "failed"); + jobStatuses = jobStatuses.filter(x => x[1] !== "failed" && x[1] !== "unknown"); const status: Exclude["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping"; const doneJobs = await getJobs(doneJobIDs); const data = doneJobs.map(x => x.returnvalue); From cee124dc46aa4602216df994b59821c7314bbc13 Mon Sep 17 00:00:00 2001 From: RuhiJain <138304656+Ruhi14@users.noreply.github.com> Date: Sat, 12 Oct 2024 16:28:29 +0530 Subject: [PATCH 011/102] Update README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 7d7d1739..dd8a740a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@

+ + contributors + + ## License Disclaimer This project is primarily licensed under the GNU Affero General Public License v3.0 (AGPL-3.0), as specified in the LICENSE file in the root directory of this repository. However, certain components of this project are licensed under the MIT License. Refer to the LICENSE files in these specific directories for details. @@ -552,3 +559,10 @@ Please note: - When using or contributing to this project, ensure you comply with the appropriate license terms for the specific component you are working with. For more details on the licensing of specific components, please refer to the LICENSE files in the respective directories or contact the project maintainers. + + +

+ + ↑ Back to Top ↑ + +

From 3315648a0f82a1dd926a1ec6609fde78efc01bbc Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Oct 2024 16:00:14 -0300 Subject: [PATCH 012/102] Nick: open ai swarm and firecrawl --- examples/openai_swarm_firecrawl/README.md | 37 +++++++ examples/openai_swarm_firecrawl/main.py | 103 ++++++++++++++++++ .../openai_swarm_firecrawl/requirements.txt | 2 + 3 files changed, 142 insertions(+) create mode 100644 examples/openai_swarm_firecrawl/README.md create mode 100644 examples/openai_swarm_firecrawl/main.py create mode 100644 examples/openai_swarm_firecrawl/requirements.txt diff --git a/examples/openai_swarm_firecrawl/README.md b/examples/openai_swarm_firecrawl/README.md new file mode 100644 index 00000000..b256ae7d --- /dev/null +++ b/examples/openai_swarm_firecrawl/README.md @@ -0,0 +1,37 @@ +# Swarm Firecrawl Marketing Agent + +A multi-agent system using [OpenAI Swarm](https://github.com/openai/swarm) for AI-powered marketing strategies using [Firecrawl](https://firecrawl.dev) for web scraping. + +## Agents + +1. User Interface: Manages user interactions +2. Website Scraper: Extracts clean LLM-ready content via Firecrawl API +3. Analyst: Provides marketing insights +4. Campaign Idea: Generates marketing campaign concepts +5. Copywriter: Creates compelling marketing copy + +## Requirements + +- [Firecrawl](https://firecrawl.dev) API key +- [OpenAI](https://platform.openai.com/api-keys) API key + +## Setup + +1. Install the required packages: + ``` + pip install -r requirements.txt + ``` + +2. Set up your environment variables in a `.env` file: + ``` + OPENAI_API_KEY=your_openai_api_key + FIRECRAWL_API_KEY=your_firecrawl_api_key + ``` + +## Usage + +Run the main script to start the interactive demo: + +``` +python main.py +``` \ No newline at end of file diff --git a/examples/openai_swarm_firecrawl/main.py b/examples/openai_swarm_firecrawl/main.py new file mode 100644 index 00000000..cef53fdf --- /dev/null +++ b/examples/openai_swarm_firecrawl/main.py @@ -0,0 +1,103 @@ +import os +from firecrawl import FirecrawlApp +from swarm import Agent +from swarm.repl import run_demo_loop +import dotenv +from openai import OpenAI + +dotenv.load_dotenv() + +# Initialize FirecrawlApp and OpenAI +app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def scrape_website(url): + """Scrape a website using Firecrawl.""" + scrape_status = app.scrape_url( + url, + params={'formats': ['markdown']} + ) + return scrape_status + +def analyze_website_content(content): + """Analyze the scraped website content using OpenAI.""" + response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "You are a marketing analyst. Analyze the following website content and provide key insights for marketing strategy."}, + {"role": "user", "content": content} + ] + ) + return {"analysis": response.choices[0].message.content} + +def generate_copy(brief): + """Generate marketing copy based on a brief using OpenAI.""" + response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "You are a copywriter. 
Create compelling marketing copy based on the following brief."}, + {"role": "user", "content": brief} + ] + ) + return {"copy": response.choices[0].message.content} + +def create_campaign_idea(target_audience, goals): + """Create a campaign idea based on target audience and goals using OpenAI.""" + response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "You are a marketing strategist. Create an innovative campaign idea based on the target audience and goals provided."}, + {"role": "user", "content": f"Target Audience: {target_audience}\nGoals: {goals}"} + ] + ) + return {"campaign_idea": response.choices[0].message.content} + +def handoff_to_copywriter(): + """Hand off the campaign idea to the copywriter agent.""" + return copywriter_agent + +def handoff_to_analyst(): + """Hand off the website content to the analyst agent.""" + return analyst_agent + +def handoff_to_campaign_idea(): + """Hand off the target audience and goals to the campaign idea agent.""" + return campaign_idea_agent + +def handoff_to_website_scraper(): + """Hand off the url to the website scraper agent.""" + return website_scraper_agent + +user_interface_agent = Agent( + name="User Interface Agent", + instructions="You are a user interface agent that handles all interactions with the user. You need to always start with a URL that the user wants to create a marketing strategy for. Ask clarification questions if needed. Be concise.", + functions=[handoff_to_website_scraper], +) + +website_scraper_agent = Agent( + name="Website Scraper Agent", + instructions="You are a website scraper agent specialized in scraping website content.", + functions=[scrape_website, handoff_to_analyst], +) + +analyst_agent = Agent( + name="Analyst Agent", + instructions="You are an analyst agent that examines website content and provides insights for marketing strategies. Be concise.", + functions=[analyze_website_content, handoff_to_campaign_idea], +) + +campaign_idea_agent = Agent( + name="Campaign Idea Agent", + instructions="You are a campaign idea agent that creates innovative marketing campaign ideas based on website content and target audience. Be concise.", + functions=[create_campaign_idea, handoff_to_copywriter], +) + +copywriter_agent = Agent( + name="Copywriter Agent", + instructions="You are a copywriter agent specialized in creating compelling marketing copy based on website content and campaign ideas. 
Be concise.", + functions=[generate_copy], +) + +if __name__ == "__main__": + # Run the demo loop with the user interface agent + run_demo_loop(user_interface_agent, stream=True) \ No newline at end of file diff --git a/examples/openai_swarm_firecrawl/requirements.txt b/examples/openai_swarm_firecrawl/requirements.txt new file mode 100644 index 00000000..516c1dd2 --- /dev/null +++ b/examples/openai_swarm_firecrawl/requirements.txt @@ -0,0 +1,2 @@ +firecrawl-py +openai \ No newline at end of file From c2d79e184251e8b79381b6b18764f33f619afa75 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Oct 2024 16:00:37 -0300 Subject: [PATCH 013/102] Create .env.example --- examples/openai_swarm_firecrawl/.env.example | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 examples/openai_swarm_firecrawl/.env.example diff --git a/examples/openai_swarm_firecrawl/.env.example b/examples/openai_swarm_firecrawl/.env.example new file mode 100644 index 00000000..c0631aee --- /dev/null +++ b/examples/openai_swarm_firecrawl/.env.example @@ -0,0 +1,2 @@ +OPENAI_API_KEY= +FIRECRAWL_API_KEY= \ No newline at end of file From 2eff7c29bbc50fdf4246174eea3d894f90887eb6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Oct 2024 16:12:12 -0300 Subject: [PATCH 014/102] Nick: refactor openai swarm example --- examples/openai_swarm_firecrawl/main.py | 41 ++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/examples/openai_swarm_firecrawl/main.py b/examples/openai_swarm_firecrawl/main.py index cef53fdf..63a1b898 100644 --- a/examples/openai_swarm_firecrawl/main.py +++ b/examples/openai_swarm_firecrawl/main.py @@ -19,38 +19,43 @@ def scrape_website(url): ) return scrape_status -def analyze_website_content(content): - """Analyze the scraped website content using OpenAI.""" +def generate_completion(role, task, content): + """Generate a completion using OpenAI.""" response = client.chat.completions.create( model="gpt-4o-mini", messages=[ - {"role": "system", "content": "You are a marketing analyst. Analyze the following website content and provide key insights for marketing strategy."}, + {"role": "system", "content": f"You are a {role}. {task}"}, {"role": "user", "content": content} ] ) - return {"analysis": response.choices[0].message.content} + return response.choices[0].message.content + +def analyze_website_content(content): + """Analyze the scraped website content using OpenAI.""" + analysis = generate_completion( + "marketing analyst", + "Analyze the following website content and provide key insights for marketing strategy.", + content + ) + return {"analysis": analysis} def generate_copy(brief): """Generate marketing copy based on a brief using OpenAI.""" - response = client.chat.completions.create( - model="gpt-4o-mini", - messages=[ - {"role": "system", "content": "You are a copywriter. Create compelling marketing copy based on the following brief."}, - {"role": "user", "content": brief} - ] + copy = generate_completion( + "copywriter", + "Create compelling marketing copy based on the following brief.", + brief ) - return {"copy": response.choices[0].message.content} + return {"copy": copy} def create_campaign_idea(target_audience, goals): """Create a campaign idea based on target audience and goals using OpenAI.""" - response = client.chat.completions.create( - model="gpt-4o-mini", - messages=[ - {"role": "system", "content": "You are a marketing strategist. 
Create an innovative campaign idea based on the target audience and goals provided."}, - {"role": "user", "content": f"Target Audience: {target_audience}\nGoals: {goals}"} - ] + campaign_idea = generate_completion( + "marketing strategist", + "Create an innovative campaign idea based on the target audience and goals provided.", + f"Target Audience: {target_audience}\nGoals: {goals}" ) - return {"campaign_idea": response.choices[0].message.content} + return {"campaign_idea": campaign_idea} def handoff_to_copywriter(): """Hand off the campaign idea to the copywriter agent.""" From 961b1010cf5921c3dc5b8d3767dee17ccdf840db Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Oct 2024 17:48:37 -0300 Subject: [PATCH 015/102] Nick: rm the cache for map for 24hrs --- apps/api/src/controllers/v1/map.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index cc5f6aa3..5ed3dd51 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -63,7 +63,7 @@ export async function mapController( const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); const cacheKey = `fireEngineMap:${mapUrl}`; - const cachedResult = await redis.get(cacheKey); + const cachedResult = null; let allResults: any[]; let pagePromises: Promise[]; From 35b15f1ee6d048092a2a1c1315bf9844a4164153 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Oct 2024 17:59:50 -0300 Subject: [PATCH 016/102] Update fireEngine.ts --- apps/api/src/search/fireEngine.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts index d5e15656..b1f30ee5 100644 --- a/apps/api/src/search/fireEngine.ts +++ b/apps/api/src/search/fireEngine.ts @@ -6,6 +6,7 @@ import { Logger } from "../lib/logger"; dotenv.config(); + export async function fireEngineMap( q: string, options: { From af06b42cb2a4297edcd6ba973b2b13b67bdcb7d3 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Oct 2024 18:18:38 -0300 Subject: [PATCH 017/102] Update fireEngine.ts --- apps/api/src/search/fireEngine.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts index b1f30ee5..1186c8a2 100644 --- a/apps/api/src/search/fireEngine.ts +++ b/apps/api/src/search/fireEngine.ts @@ -42,11 +42,12 @@ export async function fireEngineMap( url: `${process.env.FIRE_ENGINE_BETA_URL}/search`, headers: { "Content-Type": "application/json", + "X-Disable-Cache": "true" }, data: data, }; const response = await axios(config); - if (response && response) { + if (response && response.data) { return response.data; } else { return []; From d3856371c9750c5d4aae4f47ef7a35029db1edae Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Oct 2024 19:36:49 -0300 Subject: [PATCH 018/102] Update index.ts --- apps/api/src/index.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 1cdda34e..e2b2d31d 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -20,6 +20,7 @@ import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; import { ZodError } from "zod"; import { v4 as uuidv4 } from "uuid"; +import dns from 'node:dns'; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -28,13 
+29,14 @@ const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; Logger.info(`Number of CPUs: ${numCPUs} available`); -const cacheable = new CacheableLookup({ - // this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme - lookup:false -}); +// Disable CacheableLookup for VPC fire-engine-api +const cacheable = new CacheableLookup() + +// Install cacheable lookup for all other requests cacheable.install(http.globalAgent); -cacheable.install(https.globalAgent) +cacheable.install(https.globalAgent); + const ws = expressWs(express()); const app = ws.app; From 03287821c25a15d47c079e1cc6f164bc7ac36805 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 12 Oct 2024 19:49:37 -0300 Subject: [PATCH 019/102] Update index.ts --- apps/api/src/index.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index e2b2d31d..5ccbb9cc 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -29,7 +29,6 @@ const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; Logger.info(`Number of CPUs: ${numCPUs} available`); -// Disable CacheableLookup for VPC fire-engine-api const cacheable = new CacheableLookup() From ec238a8349987b5de1fa5668456f4c28af19273b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 13 Oct 2024 14:01:25 -0300 Subject: [PATCH 020/102] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index f153b034..70f677ef 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -117,7 +117,14 @@ class FirecrawlApp: idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. Returns: - Any: The crawl job ID or the crawl results if waiting until completion. + Dict[str, Any]: A dictionary containing the crawl results. The structure includes: + - 'success' (bool): Indicates if the crawl was successful. + - 'status' (str): The final status of the crawl job (e.g., 'completed'). + - 'completed' (int): Number of scraped pages that completed. + - 'total' (int): Total number of scraped pages. + - 'creditsUsed' (int): Estimated number of API credits used for this crawl. + - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires. + - 'data' (List[Dict]): List of all the scraped pages. Raises: Exception: If the crawl job initiation or monitoring fails. @@ -146,7 +153,10 @@ class FirecrawlApp: idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. Returns: - Dict[str, Any]: The response from the crawl initiation request. + Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes: + - 'success' (bool): Indicates if the crawl initiation was successful. + - 'id' (str): The unique identifier for the crawl job. + - 'url' (str): The URL to check the status of the crawl job. """ endpoint = f'/v1/crawl' headers = self._prepare_headers(idempotency_key) @@ -236,7 +246,7 @@ class FirecrawlApp: params (Optional[Dict[str, Any]]): Additional parameters for the map search. Returns: - Any: The result of the map search, typically a dictionary containing mapping data. + List[str]: A list of URLs discovered during the map search. 
""" endpoint = f'/v1/map' headers = self._prepare_headers() From 666082a7ddfa52242b4dd27f2c5f37f1f98a3302 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 13 Oct 2024 14:03:19 -0300 Subject: [PATCH 021/102] Nick: bump python patch to 1.3.1 --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 3cea54ce..5700a3e6 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "1.3.0" +__version__ = "1.3.1" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 78b6127d885d9c251259f6b27c32129b159410ce Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 13 Oct 2024 22:27:38 -0300 Subject: [PATCH 022/102] Nick: retries for acuc --- apps/api/src/controllers/auth.ts | 62 ++++++++++++++----- .../src/services/billing/credit_billing.ts | 6 +- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 0367358f..5546bc17 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -37,12 +37,17 @@ function normalizedApiIsUuid(potentialUuid: string): boolean { return validate(potentialUuid); } -export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)) { +export async function setCachedACUC( + api_key: string, + acuc: + | AuthCreditUsageChunk + | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk) +) { const cacheKeyACUC = `acuc_${api_key}`; const redLockKey = `lock_${cacheKeyACUC}`; try { - await redlock.using([redLockKey], 10000, {}, async signal => { + await redlock.using([redLockKey], 10000, {}, async (signal) => { if (typeof acuc === "function") { acuc = acuc(JSON.parse(await getValue(cacheKeyACUC))); @@ -68,7 +73,10 @@ export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk } } -export async function getACUC(api_key: string, cacheOnly = false): Promise { +export async function getACUC( + api_key: string, + cacheOnly = false +): Promise { const cacheKeyACUC = `acuc_${api_key}`; const cachedACUC = await getValue(cacheKeyACUC); @@ -76,18 +84,38 @@ export async function getACUC(api_key: string, cacheOnly = false): Promise setTimeout(resolve, 200)); } - const chunk: AuthCreditUsageChunk | null = data.length === 0 - ? null - : data[0].team_id === null - ? null - : data[0]; + const chunk: AuthCreditUsageChunk | null = + data.length === 0 ? null : data[0].team_id === null ? null : data[0]; // NOTE: Should we cache null chunks? - mogery if (chunk !== null) { @@ -132,7 +160,11 @@ export async function supaAuthenticateUser( plan?: PlanType; chunk?: AuthCreditUsageChunk; }> { - const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] ? `Bearer ${req.headers["sec-websocket-protocol"]}` : null); + const authHeader = + req.headers.authorization ?? + (req.headers["sec-websocket-protocol"] + ? 
`Bearer ${req.headers["sec-websocket-protocol"]}` + : null); if (!authHeader) { return { success: false, error: "Unauthorized", status: 401 }; } @@ -162,7 +194,7 @@ export async function supaAuthenticateUser( rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); } else { rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); - } + } teamId = "preview"; } else { normalizedApi = parseApi(token); diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 3346e291..39e9f15e 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -55,11 +55,13 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: const creditsWillBeUsed = chunk.adjusted_credits_used + credits; + // In case chunk.price_credits is undefined, set it to a large number to avoid division by zero + const totalPriceCredits = chunk.price_credits ?? 100000000; // Removal of + credits - const creditUsagePercentage = creditsWillBeUsed / chunk.price_credits; + const creditUsagePercentage = creditsWillBeUsed / totalPriceCredits; // Compare the adjusted total credits used with the credits allowed by the plan - if (creditsWillBeUsed > chunk.price_credits) { + if (creditsWillBeUsed > totalPriceCredits) { sendNotification( team_id, NotificationType.LIMIT_REACHED, From 821c62c5750d66e5df895603bb3df874393e2380 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 13 Oct 2024 22:30:11 -0300 Subject: [PATCH 023/102] Update credit_billing.ts --- apps/api/src/services/billing/credit_billing.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 39e9f15e..694d0e5c 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -55,7 +55,7 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: const creditsWillBeUsed = chunk.adjusted_credits_used + credits; - // In case chunk.price_credits is undefined, set it to a large number to avoid division by zero + // In case chunk.price_credits is undefined, set it to a large number to avoid mistakes const totalPriceCredits = chunk.price_credits ?? 100000000; // Removal of + credits const creditUsagePercentage = creditsWillBeUsed / totalPriceCredits; From a6888ce17b98e4a47fbfc9796911446a6513959c Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:32:09 -0300 Subject: [PATCH 024/102] Revert "Merge pull request #773 from mendableai/nsc/retries-acuc-price-credits-fallback" This reverts commit ba9ad1ef7f02722ce88c1c29fa285befd3c3ec51, reversing changes made to 666082a7ddfa52242b4dd27f2c5f37f1f98a3302. 
--- apps/api/src/controllers/auth.ts | 62 +++++-------------- .../src/services/billing/credit_billing.ts | 6 +- 2 files changed, 17 insertions(+), 51 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 5546bc17..0367358f 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -37,17 +37,12 @@ function normalizedApiIsUuid(potentialUuid: string): boolean { return validate(potentialUuid); } -export async function setCachedACUC( - api_key: string, - acuc: - | AuthCreditUsageChunk - | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk) -) { +export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)) { const cacheKeyACUC = `acuc_${api_key}`; const redLockKey = `lock_${cacheKeyACUC}`; try { - await redlock.using([redLockKey], 10000, {}, async (signal) => { + await redlock.using([redLockKey], 10000, {}, async signal => { if (typeof acuc === "function") { acuc = acuc(JSON.parse(await getValue(cacheKeyACUC))); @@ -73,10 +68,7 @@ export async function setCachedACUC( } } -export async function getACUC( - api_key: string, - cacheOnly = false -): Promise { +export async function getACUC(api_key: string, cacheOnly = false): Promise { const cacheKeyACUC = `acuc_${api_key}`; const cachedACUC = await getValue(cacheKeyACUC); @@ -84,38 +76,18 @@ export async function getACUC( if (cachedACUC !== null) { return JSON.parse(cachedACUC); } else if (!cacheOnly) { - let data; - let error; - let retries = 0; - const maxRetries = 5; - - while (retries < maxRetries) { - ({ data, error } = await supabase_service.rpc( - "auth_credit_usage_chunk_test_3", - { input_key: api_key } - )); - - if (!error) { - break; - } - - Logger.warn( - `Failed to retrieve authentication and credit usage data after ${retries}, trying again...` - ); - retries++; - if (retries === maxRetries) { - throw new Error( - "Failed to retrieve authentication and credit usage data after 3 attempts: " + - JSON.stringify(error) - ); - } - - // Wait for a short time before retrying - await new Promise((resolve) => setTimeout(resolve, 200)); + const { data, error } = + await supabase_service.rpc("auth_credit_usage_chunk_test_3", { input_key: api_key }); + + if (error) { + throw new Error("Failed to retrieve authentication and credit usage data: " + JSON.stringify(error)); } - const chunk: AuthCreditUsageChunk | null = - data.length === 0 ? null : data[0].team_id === null ? null : data[0]; + const chunk: AuthCreditUsageChunk | null = data.length === 0 + ? null + : data[0].team_id === null + ? null + : data[0]; // NOTE: Should we cache null chunks? - mogery if (chunk !== null) { @@ -160,11 +132,7 @@ export async function supaAuthenticateUser( plan?: PlanType; chunk?: AuthCreditUsageChunk; }> { - const authHeader = - req.headers.authorization ?? - (req.headers["sec-websocket-protocol"] - ? `Bearer ${req.headers["sec-websocket-protocol"]}` - : null); + const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] ? 
`Bearer ${req.headers["sec-websocket-protocol"]}` : null); if (!authHeader) { return { success: false, error: "Unauthorized", status: 401 }; } @@ -194,7 +162,7 @@ export async function supaAuthenticateUser( rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); } else { rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); - } + } teamId = "preview"; } else { normalizedApi = parseApi(token); diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 694d0e5c..3346e291 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -55,13 +55,11 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: const creditsWillBeUsed = chunk.adjusted_credits_used + credits; - // In case chunk.price_credits is undefined, set it to a large number to avoid mistakes - const totalPriceCredits = chunk.price_credits ?? 100000000; // Removal of + credits - const creditUsagePercentage = creditsWillBeUsed / totalPriceCredits; + const creditUsagePercentage = creditsWillBeUsed / chunk.price_credits; // Compare the adjusted total credits used with the credits allowed by the plan - if (creditsWillBeUsed > totalPriceCredits) { + if (creditsWillBeUsed > chunk.price_credits) { sendNotification( team_id, NotificationType.LIMIT_REACHED, From 2bf7b433e2c752a7f0031c4b7525d8c4f3490d77 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 14 Oct 2024 12:18:26 -0300 Subject: [PATCH 025/102] fixed file blocking process --- apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index b27db99a..4d75df33 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -1,5 +1,5 @@ import axios, { AxiosResponse } from "axios"; -import fs from "fs"; +import fs from "fs/promises"; import { createReadStream, createWriteStream } from "node:fs"; import FormData from "form-data"; import dotenv from "dotenv"; @@ -15,7 +15,7 @@ export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promis try { const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url); const content = await processPdfToText(tempFilePath, parsePDF); - fs.unlinkSync(tempFilePath); // Clean up the temporary file + await fs.unlink(tempFilePath); // Clean up the temporary file return { content, pageStatusCode, pageError }; } catch (error) { Logger.error(`Failed to fetch and process PDF: ${error.message}`); @@ -120,7 +120,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro } } else { try { - content = fs.readFileSync(filePath, "utf-8"); + content = await fs.readFile(filePath, "utf-8"); } catch (error) { Logger.error(`Failed to read PDF file: ${error}`); content = ""; @@ -131,7 +131,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro async function processPdf(file: string) { try { - const fileContent = fs.readFileSync(file); + const fileContent = await fs.readFile(file); const data = await pdf(fileContent); return data.text; } catch (error) { From c3a9630e330e4f09d77cb2b095b36cdb62043d42 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 14 Oct 2024 12:24:34 -0300 Subject: [PATCH 026/102] Reapply "Merge pull request #773 
from mendableai/nsc/retries-acuc-price-credits-fallback" This reverts commit a6888ce17b98e4a47fbfc9796911446a6513959c. --- apps/api/src/controllers/auth.ts | 62 ++++++++++++++----- .../src/services/billing/credit_billing.ts | 6 +- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 0367358f..5546bc17 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -37,12 +37,17 @@ function normalizedApiIsUuid(potentialUuid: string): boolean { return validate(potentialUuid); } -export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)) { +export async function setCachedACUC( + api_key: string, + acuc: + | AuthCreditUsageChunk + | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk) +) { const cacheKeyACUC = `acuc_${api_key}`; const redLockKey = `lock_${cacheKeyACUC}`; try { - await redlock.using([redLockKey], 10000, {}, async signal => { + await redlock.using([redLockKey], 10000, {}, async (signal) => { if (typeof acuc === "function") { acuc = acuc(JSON.parse(await getValue(cacheKeyACUC))); @@ -68,7 +73,10 @@ export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk } } -export async function getACUC(api_key: string, cacheOnly = false): Promise { +export async function getACUC( + api_key: string, + cacheOnly = false +): Promise { const cacheKeyACUC = `acuc_${api_key}`; const cachedACUC = await getValue(cacheKeyACUC); @@ -76,18 +84,38 @@ export async function getACUC(api_key: string, cacheOnly = false): Promise setTimeout(resolve, 200)); } - const chunk: AuthCreditUsageChunk | null = data.length === 0 - ? null - : data[0].team_id === null - ? null - : data[0]; + const chunk: AuthCreditUsageChunk | null = + data.length === 0 ? null : data[0].team_id === null ? null : data[0]; // NOTE: Should we cache null chunks? - mogery if (chunk !== null) { @@ -132,7 +160,11 @@ export async function supaAuthenticateUser( plan?: PlanType; chunk?: AuthCreditUsageChunk; }> { - const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] ? `Bearer ${req.headers["sec-websocket-protocol"]}` : null); + const authHeader = + req.headers.authorization ?? + (req.headers["sec-websocket-protocol"] + ? `Bearer ${req.headers["sec-websocket-protocol"]}` + : null); if (!authHeader) { return { success: false, error: "Unauthorized", status: 401 }; } @@ -162,7 +194,7 @@ export async function supaAuthenticateUser( rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); } else { rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); - } + } teamId = "preview"; } else { normalizedApi = parseApi(token); diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 3346e291..694d0e5c 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -55,11 +55,13 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: const creditsWillBeUsed = chunk.adjusted_credits_used + credits; + // In case chunk.price_credits is undefined, set it to a large number to avoid mistakes + const totalPriceCredits = chunk.price_credits ?? 
100000000; // Removal of + credits - const creditUsagePercentage = creditsWillBeUsed / chunk.price_credits; + const creditUsagePercentage = creditsWillBeUsed / totalPriceCredits; // Compare the adjusted total credits used with the credits allowed by the plan - if (creditsWillBeUsed > chunk.price_credits) { + if (creditsWillBeUsed > totalPriceCredits) { sendNotification( team_id, NotificationType.LIMIT_REACHED, From 180801225b50b12662991e2a6e4f8f16c728dbdd Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 14 Oct 2024 15:44:45 -0300 Subject: [PATCH 027/102] fix/check files on crawl --- apps/api/src/scraper/WebScraper/crawler.ts | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index d5dadaf8..009a5933 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -136,6 +136,10 @@ export class WebCrawler { return false; } + if (this.isFile(link)) { + return false; + } + return true; }) .slice(0, limit); @@ -478,7 +482,14 @@ export class WebCrawler { ".webp", ".inc" ]; - return fileExtensions.some((ext) => url.toLowerCase().endsWith(ext)); + + try { + const urlWithoutQuery = url.split('?')[0].toLowerCase(); + return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext)); + } catch (error) { + Logger.error(`Error processing URL in isFile: ${error}`); + return false; + } } private isSocialMediaOrEmail(url: string): boolean { From 3afaab13d9f74c12aa82f6928480d1fd3f7eb169 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 14 Oct 2024 18:14:00 -0300 Subject: [PATCH 028/102] feat/improv-crawl-status-filters --- apps/api/src/controllers/v1/crawl-status.ts | 24 +++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 084685c7..b753b17b 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -60,12 +60,24 @@ export async function crawlStatusController(req: RequestWithAuth [x, await getScrapeQueue().getJobState(x)] as const)); const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id)); - jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed - // filter out failed jobs - jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed")); - // filter the job statues - jobStatuses = jobStatuses.filter(x => x[1] !== "failed" && x[1] !== "unknown"); - const status: Exclude["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping"; + + const throttledJobsSet = new Set(throttledJobs); + + const validJobStatuses = []; + const validJobIDs = []; + + for (const [id, status] of jobStatuses) { + if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") { + validJobStatuses.push([id, status]); + validJobIDs.push(id); + } + } + + const status: Exclude["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? 
"completed" : "scraping"; + + // Use validJobIDs instead of jobIDs for further processing + jobIDs = validJobIDs; + const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId); const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1); From 4afcd16e02ccd7f36c9d73dbff80102329adf596 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:12:27 -0300 Subject: [PATCH 029/102] performance improv for ws --- .../api/src/controllers/v1/crawl-status-ws.ts | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index 0d6d5803..3738e3a2 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -97,12 +97,23 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth [x, await getScrapeQueue().getJobState(x)] as const)); const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id)); - jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed - // filter out failed jobs - jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed")); - // filter the job statues - jobStatuses = jobStatuses.filter(x => x[1] !== "failed" && x[1] !== "unknown"); - const status: Exclude["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping"; + + const throttledJobsSet = new Set(throttledJobs); + + const validJobStatuses = []; + const validJobIDs = []; + + for (const [id, status] of jobStatuses) { + if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") { + validJobStatuses.push([id, status]); + validJobIDs.push(id); + } + } + + const status: Exclude["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping"; + + jobIDs = validJobIDs; // Use validJobIDs instead of jobIDs for further processing + const doneJobs = await getJobs(doneJobIDs); const data = doneJobs.map(x => x.returnvalue); From 7b1df226983a0130b472f01783527d42f336c4d0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 15 Oct 2024 13:31:29 -0300 Subject: [PATCH 030/102] Delete check-queues.yml --- .github/workflows/check-queues.yml | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 .github/workflows/check-queues.yml diff --git a/.github/workflows/check-queues.yml b/.github/workflows/check-queues.yml deleted file mode 100644 index 5cb5d9ca..00000000 --- a/.github/workflows/check-queues.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: Check Queues -on: - schedule: - - cron: '*/5 * * * *' - -env: - BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} - -jobs: - clean-jobs: - runs-on: ubuntu-latest - steps: - - name: Send GET request to check queues - run: | - response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/check-queues) - if [ "$response" -ne 200 ]; then - echo "Failed to check queues. Response: $response" - exit 1 - fi - echo "Successfully checked queues. 
Response: $response" From 54a54b9f33c7e95ed9d0d6419002556800c0c047 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 15 Oct 2024 17:28:28 -0300 Subject: [PATCH 031/102] Nick: admin init --- .../src/services/billing/credit_billing.ts | 6 +++-- .../notification/email_notification.ts | 24 +++++++++++++++---- .../notification/notification_string.ts | 17 +++++++++++++ 3 files changed, 41 insertions(+), 6 deletions(-) create mode 100644 apps/api/src/services/notification/notification_string.ts diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 694d0e5c..69b0617a 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -66,7 +66,8 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: team_id, NotificationType.LIMIT_REACHED, chunk.sub_current_period_start, - chunk.sub_current_period_end + chunk.sub_current_period_end, + chunk ); return { success: false, message: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk }; } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { @@ -75,7 +76,8 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: team_id, NotificationType.APPROACHING_LIMIT, chunk.sub_current_period_start, - chunk.sub_current_period_end + chunk.sub_current_period_end, + chunk ); } diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index a63d78ff..cf02892e 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -3,6 +3,9 @@ import { withAuth } from "../../lib/withAuth"; import { Resend } from "resend"; import { NotificationType } from "../../types"; import { Logger } from "../../../src/lib/logger"; +import { sendSlackWebhook } from "../alerts/slack"; +import { getNotificationString } from "./notification_string"; +import { AuthCreditUsageChunk } from "../../controllers/v1/types"; const emailTemplates: Record< NotificationType, @@ -27,19 +30,21 @@ export async function sendNotification( team_id: string, notificationType: NotificationType, startDateString: string, - endDateString: string + endDateString: string, + chunk: AuthCreditUsageChunk ) { return withAuth(sendNotificationInternal)( team_id, notificationType, startDateString, - endDateString + endDateString, + chunk ); } async function sendEmailNotification( email: string, - notificationType: NotificationType + notificationType: NotificationType, ) { const resend = new Resend(process.env.RESEND_API_KEY); @@ -66,7 +71,8 @@ export async function sendNotificationInternal( team_id: string, notificationType: NotificationType, startDateString: string, - endDateString: string + endDateString: string, + chunk: AuthCreditUsageChunk ): Promise<{ success: boolean }> { if (team_id === "preview") { return { success: true }; @@ -135,6 +141,16 @@ export async function sendNotificationInternal( }, ]); + if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) { + sendSlackWebhook( + `${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. 
Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`, + false, + process.env.SLACK_ADMIN_WEBHOOK_URL + ).catch((error) => { + Logger.debug(`Error sending slack notification: ${error}`); + }); + } + if (insertError) { Logger.debug(`Error inserting notification record: ${insertError}`); return { success: false }; diff --git a/apps/api/src/services/notification/notification_string.ts b/apps/api/src/services/notification/notification_string.ts new file mode 100644 index 00000000..8369a0ca --- /dev/null +++ b/apps/api/src/services/notification/notification_string.ts @@ -0,0 +1,17 @@ +import { NotificationType } from "../../types"; + +// depending on the notification type, return the appropriate string +export function getNotificationString( + notificationType: NotificationType +): string { + switch (notificationType) { + case NotificationType.APPROACHING_LIMIT: + return "Approaching the limit (80%)"; + case NotificationType.LIMIT_REACHED: + return "Limit reached (100%)"; + case NotificationType.RATE_LIMIT_REACHED: + return "Rate limit reached"; + default: + return "Unknown notification type"; + } +} From b4f6a0f919f0ecff915b4ad4b670c39fce17ee84 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 15 Oct 2024 21:12:33 -0300 Subject: [PATCH 032/102] Nick: geolocation --- apps/api/src/controllers/v1/types.ts | 10 + apps/api/src/lib/entities.ts | 3 + apps/api/src/lib/validate-country.ts | 2261 +++++++++++++++++ apps/api/src/scraper/WebScraper/index.ts | 1 + .../scraper/WebScraper/scrapers/fireEngine.ts | 5 +- apps/api/src/scraper/WebScraper/single_url.ts | 1 + 6 files changed, 2279 insertions(+), 2 deletions(-) create mode 100644 apps/api/src/lib/validate-country.ts diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 998f2dfa..e8520ccc 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -4,6 +4,7 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { Action, ExtractorOptions, PageOptions } from "../../lib/entities"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { PlanType } from "../../types"; +import { countries } from "../../lib/validate-country"; export type Format = | "markdown" @@ -108,6 +109,14 @@ export const scrapeOptions = z.object({ extract: extractOptions.optional(), parsePDF: z.boolean().default(true), actions: actionsSchema.optional(), + geolocation: z.object({ + country: z.string().optional().refine( + (val) => !val || Object.keys(countries).includes(val.toUpperCase()), + { + message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", + } + ).transform(val => val ? 
val.toUpperCase() : 'US') + }).optional(), }).strict(strictMessage) @@ -421,6 +430,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { fullPageScreenshot: x.formats.includes("screenshot@fullPage"), parsePDF: x.parsePDF, actions: x.actions as Action[], // no strict null checking grrrr - mogery + geolocation: x.geolocation, }; } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 05ded7ef..ca6142ec 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -51,6 +51,9 @@ export type PageOptions = { disableJsDom?: boolean; // beta atsv?: boolean; // anti-bot solver, beta actions?: Action[]; // beta + geolocation?: { + country?: string; + }; }; export type ExtractorOptions = { diff --git a/apps/api/src/lib/validate-country.ts b/apps/api/src/lib/validate-country.ts new file mode 100644 index 00000000..bff1c25c --- /dev/null +++ b/apps/api/src/lib/validate-country.ts @@ -0,0 +1,2261 @@ +export const countries = { + AD: { + name: "Andorra", + native: "Andorra", + phone: [376], + continent: "EU", + capital: "Andorra la Vella", + currency: ["EUR"], + languages: ["ca"], + }, + AE: { + name: "United Arab Emirates", + native: "دولة الإمارات العربية المتحدة", + phone: [971], + continent: "AS", + capital: "Abu Dhabi", + currency: ["AED"], + languages: ["ar"], + }, + AF: { + name: "Afghanistan", + native: "افغانستان", + phone: [93], + continent: "AS", + capital: "Kabul", + currency: ["AFN"], + languages: ["ps", "uz", "tk"], + }, + AG: { + name: "Antigua and Barbuda", + native: "Antigua and Barbuda", + phone: [1268], + continent: "NA", + capital: "Saint John's", + currency: ["XCD"], + languages: ["en"], + }, + AI: { + name: "Anguilla", + native: "Anguilla", + phone: [1264], + continent: "NA", + capital: "The Valley", + currency: ["XCD"], + languages: ["en"], + }, + AL: { + name: "Albania", + native: "Shqipëria", + phone: [355], + continent: "EU", + capital: "Tirana", + currency: ["ALL"], + languages: ["sq"], + }, + AM: { + name: "Armenia", + native: "Հայաստան", + phone: [374], + continent: "AS", + capital: "Yerevan", + currency: ["AMD"], + languages: ["hy", "ru"], + }, + AO: { + name: "Angola", + native: "Angola", + phone: [244], + continent: "AF", + capital: "Luanda", + currency: ["AOA"], + languages: ["pt"], + }, + AQ: { + name: "Antarctica", + native: "Antarctica", + phone: [672], + continent: "AN", + capital: "", + currency: [], + languages: [], + }, + AR: { + name: "Argentina", + native: "Argentina", + phone: [54], + continent: "SA", + capital: "Buenos Aires", + currency: ["ARS"], + languages: ["es", "gn"], + }, + AS: { + name: "American Samoa", + native: "American Samoa", + phone: [1684], + continent: "OC", + capital: "Pago Pago", + currency: ["USD"], + languages: ["en", "sm"], + }, + AT: { + name: "Austria", + native: "Österreich", + phone: [43], + continent: "EU", + capital: "Vienna", + currency: ["EUR"], + languages: ["de"], + }, + AU: { + name: "Australia", + native: "Australia", + phone: [61], + continent: "OC", + capital: "Canberra", + currency: ["AUD"], + languages: ["en"], + }, + AW: { + name: "Aruba", + native: "Aruba", + phone: [297], + continent: "NA", + capital: "Oranjestad", + currency: ["AWG"], + languages: ["nl", "pa"], + }, + AX: { + name: "Aland", + native: "Åland", + phone: [358], + continent: "EU", + capital: "Mariehamn", + currency: ["EUR"], + languages: ["sv"], + partOf: "FI", + }, + AZ: { + name: "Azerbaijan", + native: "Azərbaycan", + phone: [994], + continent: "AS", + continents: ["AS", "EU"], + capital: 
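
With the scrapeOptions schema and PageOptions change above, a v1 scrape request can ask for a specific country for the underlying fetch: the value must be an ISO 3166-1 alpha-2 code from the list in validate-country.ts, lowercase input is uppercased, and an omitted country falls back to "US". A minimal request sketch, assuming the hosted endpoint and a placeholder API key (both would differ for a self-hosted deployment):

// Minimal sketch of a /v1/scrape call using the geolocation option.
// The endpoint and API key are placeholders, not working credentials.
async function scrapeWithGeolocation() {
  const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer fc-YOUR-API-KEY",
    },
    body: JSON.stringify({
      url: "https://example.com",
      formats: ["markdown"],
      geolocation: { country: "de" }, // validated against validate-country.ts and uppercased to "DE"
    }),
  });
  return res.json();
}
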
"Baku", + currency: ["AZN"], + languages: ["az"], + }, + BA: { + name: "Bosnia and Herzegovina", + native: "Bosna i Hercegovina", + phone: [387], + continent: "EU", + capital: "Sarajevo", + currency: ["BAM"], + languages: ["bs", "hr", "sr"], + }, + BB: { + name: "Barbados", + native: "Barbados", + phone: [1246], + continent: "NA", + capital: "Bridgetown", + currency: ["BBD"], + languages: ["en"], + }, + BD: { + name: "Bangladesh", + native: "Bangladesh", + phone: [880], + continent: "AS", + capital: "Dhaka", + currency: ["BDT"], + languages: ["bn"], + }, + BE: { + name: "Belgium", + native: "België", + phone: [32], + continent: "EU", + capital: "Brussels", + currency: ["EUR"], + languages: ["nl", "fr", "de"], + }, + BF: { + name: "Burkina Faso", + native: "Burkina Faso", + phone: [226], + continent: "AF", + capital: "Ouagadougou", + currency: ["XOF"], + languages: ["fr", "ff"], + }, + BG: { + name: "Bulgaria", + native: "България", + phone: [359], + continent: "EU", + capital: "Sofia", + currency: ["BGN"], + languages: ["bg"], + }, + BH: { + name: "Bahrain", + native: "‏البحرين", + phone: [973], + continent: "AS", + capital: "Manama", + currency: ["BHD"], + languages: ["ar"], + }, + BI: { + name: "Burundi", + native: "Burundi", + phone: [257], + continent: "AF", + capital: "Bujumbura", + currency: ["BIF"], + languages: ["fr", "rn"], + }, + BJ: { + name: "Benin", + native: "Bénin", + phone: [229], + continent: "AF", + capital: "Porto-Novo", + currency: ["XOF"], + languages: ["fr"], + }, + BL: { + name: "Saint Barthelemy", + native: "Saint-Barthélemy", + phone: [590], + continent: "NA", + capital: "Gustavia", + currency: ["EUR"], + languages: ["fr"], + }, + BM: { + name: "Bermuda", + native: "Bermuda", + phone: [1441], + continent: "NA", + capital: "Hamilton", + currency: ["BMD"], + languages: ["en"], + }, + BN: { + name: "Brunei", + native: "Negara Brunei Darussalam", + phone: [673], + continent: "AS", + capital: "Bandar Seri Begawan", + currency: ["BND"], + languages: ["ms"], + }, + BO: { + name: "Bolivia", + native: "Bolivia", + phone: [591], + continent: "SA", + capital: "Sucre", + currency: ["BOB", "BOV"], + languages: ["es", "ay", "qu"], + }, + BQ: { + name: "Bonaire", + native: "Bonaire", + phone: [5997], + continent: "NA", + capital: "Kralendijk", + currency: ["USD"], + languages: ["nl"], + }, + BR: { + name: "Brazil", + native: "Brasil", + phone: [55], + continent: "SA", + capital: "Brasília", + currency: ["BRL"], + languages: ["pt"], + }, + BS: { + name: "Bahamas", + native: "Bahamas", + phone: [1242], + continent: "NA", + capital: "Nassau", + currency: ["BSD"], + languages: ["en"], + }, + BT: { + name: "Bhutan", + native: "ʼbrug-yul", + phone: [975], + continent: "AS", + capital: "Thimphu", + currency: ["BTN", "INR"], + languages: ["dz"], + }, + BV: { + name: "Bouvet Island", + native: "Bouvetøya", + phone: [47], + continent: "AN", + capital: "", + currency: ["NOK"], + languages: ["no", "nb", "nn"], + }, + BW: { + name: "Botswana", + native: "Botswana", + phone: [267], + continent: "AF", + capital: "Gaborone", + currency: ["BWP"], + languages: ["en", "tn"], + }, + BY: { + name: "Belarus", + native: "Белару́сь", + phone: [375], + continent: "EU", + capital: "Minsk", + currency: ["BYN"], + languages: ["be", "ru"], + }, + BZ: { + name: "Belize", + native: "Belize", + phone: [501], + continent: "NA", + capital: "Belmopan", + currency: ["BZD"], + languages: ["en", "es"], + }, + CA: { + name: "Canada", + native: "Canada", + phone: [1], + continent: "NA", + capital: "Ottawa", + currency: 
["CAD"], + languages: ["en", "fr"], + }, + CC: { + name: "Cocos (Keeling) Islands", + native: "Cocos (Keeling) Islands", + phone: [61], + continent: "AS", + capital: "West Island", + currency: ["AUD"], + languages: ["en"], + }, + CD: { + name: "Democratic Republic of the Congo", + native: "République démocratique du Congo", + phone: [243], + continent: "AF", + capital: "Kinshasa", + currency: ["CDF"], + languages: ["fr", "ln", "kg", "sw", "lu"], + }, + CF: { + name: "Central African Republic", + native: "Ködörösêse tî Bêafrîka", + phone: [236], + continent: "AF", + capital: "Bangui", + currency: ["XAF"], + languages: ["fr", "sg"], + }, + CG: { + name: "Republic of the Congo", + native: "République du Congo", + phone: [242], + continent: "AF", + capital: "Brazzaville", + currency: ["XAF"], + languages: ["fr", "ln"], + }, + CH: { + name: "Switzerland", + native: "Schweiz", + phone: [41], + continent: "EU", + capital: "Bern", + currency: ["CHE", "CHF", "CHW"], + languages: ["de", "fr", "it"], + }, + CI: { + name: "Ivory Coast", + native: "Côte d'Ivoire", + phone: [225], + continent: "AF", + capital: "Yamoussoukro", + currency: ["XOF"], + languages: ["fr"], + }, + CK: { + name: "Cook Islands", + native: "Cook Islands", + phone: [682], + continent: "OC", + capital: "Avarua", + currency: ["NZD"], + languages: ["en"], + }, + CL: { + name: "Chile", + native: "Chile", + phone: [56], + continent: "SA", + capital: "Santiago", + currency: ["CLF", "CLP"], + languages: ["es"], + }, + CM: { + name: "Cameroon", + native: "Cameroon", + phone: [237], + continent: "AF", + capital: "Yaoundé", + currency: ["XAF"], + languages: ["en", "fr"], + }, + CN: { + name: "China", + native: "中国", + phone: [86], + continent: "AS", + capital: "Beijing", + currency: ["CNY"], + languages: ["zh"], + }, + CO: { + name: "Colombia", + native: "Colombia", + phone: [57], + continent: "SA", + capital: "Bogotá", + currency: ["COP"], + languages: ["es"], + }, + CR: { + name: "Costa Rica", + native: "Costa Rica", + phone: [506], + continent: "NA", + capital: "San José", + currency: ["CRC"], + languages: ["es"], + }, + CU: { + name: "Cuba", + native: "Cuba", + phone: [53], + continent: "NA", + capital: "Havana", + currency: ["CUC", "CUP"], + languages: ["es"], + }, + CV: { + name: "Cape Verde", + native: "Cabo Verde", + phone: [238], + continent: "AF", + capital: "Praia", + currency: ["CVE"], + languages: ["pt"], + }, + CW: { + name: "Curacao", + native: "Curaçao", + phone: [5999], + continent: "NA", + capital: "Willemstad", + currency: ["ANG"], + languages: ["nl", "pa", "en"], + }, + CX: { + name: "Christmas Island", + native: "Christmas Island", + phone: [61], + continent: "AS", + capital: "Flying Fish Cove", + currency: ["AUD"], + languages: ["en"], + }, + CY: { + name: "Cyprus", + native: "Κύπρος", + phone: [357], + continent: "EU", + capital: "Nicosia", + currency: ["EUR"], + languages: ["el", "tr", "hy"], + }, + CZ: { + name: "Czech Republic", + native: "Česká republika", + phone: [420], + continent: "EU", + capital: "Prague", + currency: ["CZK"], + languages: ["cs"], + }, + DE: { + name: "Germany", + native: "Deutschland", + phone: [49], + continent: "EU", + capital: "Berlin", + currency: ["EUR"], + languages: ["de"], + }, + DJ: { + name: "Djibouti", + native: "Djibouti", + phone: [253], + continent: "AF", + capital: "Djibouti", + currency: ["DJF"], + languages: ["fr", "ar"], + }, + DK: { + name: "Denmark", + native: "Danmark", + phone: [45], + continent: "EU", + continents: ["EU", "NA"], + capital: "Copenhagen", + currency: 
["DKK"], + languages: ["da"], + }, + DM: { + name: "Dominica", + native: "Dominica", + phone: [1767], + continent: "NA", + capital: "Roseau", + currency: ["XCD"], + languages: ["en"], + }, + DO: { + name: "Dominican Republic", + native: "República Dominicana", + phone: [1809, 1829, 1849], + continent: "NA", + capital: "Santo Domingo", + currency: ["DOP"], + languages: ["es"], + }, + DZ: { + name: "Algeria", + native: "الجزائر", + phone: [213], + continent: "AF", + capital: "Algiers", + currency: ["DZD"], + languages: ["ar"], + }, + EC: { + name: "Ecuador", + native: "Ecuador", + phone: [593], + continent: "SA", + capital: "Quito", + currency: ["USD"], + languages: ["es"], + }, + EE: { + name: "Estonia", + native: "Eesti", + phone: [372], + continent: "EU", + capital: "Tallinn", + currency: ["EUR"], + languages: ["et"], + }, + EG: { + name: "Egypt", + native: "مصر‎", + phone: [20], + continent: "AF", + continents: ["AF", "AS"], + capital: "Cairo", + currency: ["EGP"], + languages: ["ar"], + }, + EH: { + name: "Western Sahara", + native: "الصحراء الغربية", + phone: [212], + continent: "AF", + capital: "El Aaiún", + currency: ["MAD", "DZD", "MRU"], + languages: ["es"], + }, + ER: { + name: "Eritrea", + native: "ኤርትራ", + phone: [291], + continent: "AF", + capital: "Asmara", + currency: ["ERN"], + languages: ["ti", "ar", "en"], + }, + ES: { + name: "Spain", + native: "España", + phone: [34], + continent: "EU", + capital: "Madrid", + currency: ["EUR"], + languages: ["es", "eu", "ca", "gl", "oc"], + }, + ET: { + name: "Ethiopia", + native: "ኢትዮጵያ", + phone: [251], + continent: "AF", + capital: "Addis Ababa", + currency: ["ETB"], + languages: ["am"], + }, + FI: { + name: "Finland", + native: "Suomi", + phone: [358], + continent: "EU", + capital: "Helsinki", + currency: ["EUR"], + languages: ["fi", "sv"], + }, + FJ: { + name: "Fiji", + native: "Fiji", + phone: [679], + continent: "OC", + capital: "Suva", + currency: ["FJD"], + languages: ["en", "fj", "hi", "ur"], + }, + FK: { + name: "Falkland Islands", + native: "Falkland Islands", + phone: [500], + continent: "SA", + capital: "Stanley", + currency: ["FKP"], + languages: ["en"], + }, + FM: { + name: "Micronesia", + native: "Micronesia", + phone: [691], + continent: "OC", + capital: "Palikir", + currency: ["USD"], + languages: ["en"], + }, + FO: { + name: "Faroe Islands", + native: "Føroyar", + phone: [298], + continent: "EU", + capital: "Tórshavn", + currency: ["DKK"], + languages: ["fo"], + }, + FR: { + name: "France", + native: "France", + phone: [33], + continent: "EU", + capital: "Paris", + currency: ["EUR"], + languages: ["fr"], + }, + GA: { + name: "Gabon", + native: "Gabon", + phone: [241], + continent: "AF", + capital: "Libreville", + currency: ["XAF"], + languages: ["fr"], + }, + GB: { + name: "United Kingdom", + native: "United Kingdom", + phone: [44], + continent: "EU", + capital: "London", + currency: ["GBP"], + languages: ["en"], + }, + GD: { + name: "Grenada", + native: "Grenada", + phone: [1473], + continent: "NA", + capital: "St. George's", + currency: ["XCD"], + languages: ["en"], + }, + GE: { + name: "Georgia", + native: "საქართველო", + phone: [995], + continent: "AS", + continents: ["AS", "EU"], + capital: "Tbilisi", + currency: ["GEL"], + languages: ["ka"], + }, + GF: { + name: "French Guiana", + native: "Guyane française", + phone: [594], + continent: "SA", + capital: "Cayenne", + currency: ["EUR"], + languages: ["fr"], + }, + GG: { + name: "Guernsey", + native: "Guernsey", + phone: [44], + continent: "EU", + capital: "St. 
Peter Port", + currency: ["GBP"], + languages: ["en", "fr"], + }, + GH: { + name: "Ghana", + native: "Ghana", + phone: [233], + continent: "AF", + capital: "Accra", + currency: ["GHS"], + languages: ["en"], + }, + GI: { + name: "Gibraltar", + native: "Gibraltar", + phone: [350], + continent: "EU", + capital: "Gibraltar", + currency: ["GIP"], + languages: ["en"], + }, + GL: { + name: "Greenland", + native: "Kalaallit Nunaat", + phone: [299], + continent: "NA", + capital: "Nuuk", + currency: ["DKK"], + languages: ["kl"], + }, + GM: { + name: "Gambia", + native: "Gambia", + phone: [220], + continent: "AF", + capital: "Banjul", + currency: ["GMD"], + languages: ["en"], + }, + GN: { + name: "Guinea", + native: "Guinée", + phone: [224], + continent: "AF", + capital: "Conakry", + currency: ["GNF"], + languages: ["fr", "ff"], + }, + GP: { + name: "Guadeloupe", + native: "Guadeloupe", + phone: [590], + continent: "NA", + capital: "Basse-Terre", + currency: ["EUR"], + languages: ["fr"], + }, + GQ: { + name: "Equatorial Guinea", + native: "Guinea Ecuatorial", + phone: [240], + continent: "AF", + capital: "Malabo", + currency: ["XAF"], + languages: ["es", "fr"], + }, + GR: { + name: "Greece", + native: "Ελλάδα", + phone: [30], + continent: "EU", + capital: "Athens", + currency: ["EUR"], + languages: ["el"], + }, + GS: { + name: "South Georgia and the South Sandwich Islands", + native: "South Georgia", + phone: [500], + continent: "AN", + capital: "King Edward Point", + currency: ["GBP"], + languages: ["en"], + }, + GT: { + name: "Guatemala", + native: "Guatemala", + phone: [502], + continent: "NA", + capital: "Guatemala City", + currency: ["GTQ"], + languages: ["es"], + }, + GU: { + name: "Guam", + native: "Guam", + phone: [1671], + continent: "OC", + capital: "Hagåtña", + currency: ["USD"], + languages: ["en", "ch", "es"], + }, + GW: { + name: "Guinea-Bissau", + native: "Guiné-Bissau", + phone: [245], + continent: "AF", + capital: "Bissau", + currency: ["XOF"], + languages: ["pt"], + }, + GY: { + name: "Guyana", + native: "Guyana", + phone: [592], + continent: "SA", + capital: "Georgetown", + currency: ["GYD"], + languages: ["en"], + }, + HK: { + name: "Hong Kong", + native: "香港", + phone: [852], + continent: "AS", + capital: "City of Victoria", + currency: ["HKD"], + languages: ["zh", "en"], + }, + HM: { + name: "Heard Island and McDonald Islands", + native: "Heard Island and McDonald Islands", + phone: [61], + continent: "AN", + capital: "", + currency: ["AUD"], + languages: ["en"], + }, + HN: { + name: "Honduras", + native: "Honduras", + phone: [504], + continent: "NA", + capital: "Tegucigalpa", + currency: ["HNL"], + languages: ["es"], + }, + HR: { + name: "Croatia", + native: "Hrvatska", + phone: [385], + continent: "EU", + capital: "Zagreb", + currency: ["EUR"], + languages: ["hr"], + }, + HT: { + name: "Haiti", + native: "Haïti", + phone: [509], + continent: "NA", + capital: "Port-au-Prince", + currency: ["HTG", "USD"], + languages: ["fr", "ht"], + }, + HU: { + name: "Hungary", + native: "Magyarország", + phone: [36], + continent: "EU", + capital: "Budapest", + currency: ["HUF"], + languages: ["hu"], + }, + ID: { + name: "Indonesia", + native: "Indonesia", + phone: [62], + continent: "AS", + capital: "Jakarta", + currency: ["IDR"], + languages: ["id"], + }, + IE: { + name: "Ireland", + native: "Éire", + phone: [353], + continent: "EU", + capital: "Dublin", + currency: ["EUR"], + languages: ["ga", "en"], + }, + IL: { + name: "Israel", + native: "יִשְׂרָאֵל", + phone: [972], + continent: "AS", + 
capital: "Jerusalem", + currency: ["ILS"], + languages: ["he", "ar"], + }, + IM: { + name: "Isle of Man", + native: "Isle of Man", + phone: [44], + continent: "EU", + capital: "Douglas", + currency: ["GBP"], + languages: ["en", "gv"], + }, + IN: { + name: "India", + native: "भारत", + phone: [91], + continent: "AS", + capital: "New Delhi", + currency: ["INR"], + languages: ["hi", "en"], + }, + IO: { + name: "British Indian Ocean Territory", + native: "British Indian Ocean Territory", + phone: [246], + continent: "AS", + capital: "Diego Garcia", + currency: ["USD"], + languages: ["en"], + }, + IQ: { + name: "Iraq", + native: "العراق", + phone: [964], + continent: "AS", + capital: "Baghdad", + currency: ["IQD"], + languages: ["ar", "ku"], + }, + IR: { + name: "Iran", + native: "ایران", + phone: [98], + continent: "AS", + capital: "Tehran", + currency: ["IRR"], + languages: ["fa"], + }, + IS: { + name: "Iceland", + native: "Ísland", + phone: [354], + continent: "EU", + capital: "Reykjavik", + currency: ["ISK"], + languages: ["is"], + }, + IT: { + name: "Italy", + native: "Italia", + phone: [39], + continent: "EU", + capital: "Rome", + currency: ["EUR"], + languages: ["it"], + }, + JE: { + name: "Jersey", + native: "Jersey", + phone: [44], + continent: "EU", + capital: "Saint Helier", + currency: ["GBP"], + languages: ["en", "fr"], + }, + JM: { + name: "Jamaica", + native: "Jamaica", + phone: [1876], + continent: "NA", + capital: "Kingston", + currency: ["JMD"], + languages: ["en"], + }, + JO: { + name: "Jordan", + native: "الأردن", + phone: [962], + continent: "AS", + capital: "Amman", + currency: ["JOD"], + languages: ["ar"], + }, + JP: { + name: "Japan", + native: "日本", + phone: [81], + continent: "AS", + capital: "Tokyo", + currency: ["JPY"], + languages: ["ja"], + }, + KE: { + name: "Kenya", + native: "Kenya", + phone: [254], + continent: "AF", + capital: "Nairobi", + currency: ["KES"], + languages: ["en", "sw"], + }, + KG: { + name: "Kyrgyzstan", + native: "Кыргызстан", + phone: [996], + continent: "AS", + capital: "Bishkek", + currency: ["KGS"], + languages: ["ky", "ru"], + }, + KH: { + name: "Cambodia", + native: "Kâmpŭchéa", + phone: [855], + continent: "AS", + capital: "Phnom Penh", + currency: ["KHR"], + languages: ["km"], + }, + KI: { + name: "Kiribati", + native: "Kiribati", + phone: [686], + continent: "OC", + capital: "South Tarawa", + currency: ["AUD"], + languages: ["en"], + }, + KM: { + name: "Comoros", + native: "Komori", + phone: [269], + continent: "AF", + capital: "Moroni", + currency: ["KMF"], + languages: ["ar", "fr"], + }, + KN: { + name: "Saint Kitts and Nevis", + native: "Saint Kitts and Nevis", + phone: [1869], + continent: "NA", + capital: "Basseterre", + currency: ["XCD"], + languages: ["en"], + }, + KP: { + name: "North Korea", + native: "북한", + phone: [850], + continent: "AS", + capital: "Pyongyang", + currency: ["KPW"], + languages: ["ko"], + }, + KR: { + name: "South Korea", + native: "대한민국", + phone: [82], + continent: "AS", + capital: "Seoul", + currency: ["KRW"], + languages: ["ko"], + }, + KW: { + name: "Kuwait", + native: "الكويت", + phone: [965], + continent: "AS", + capital: "Kuwait City", + currency: ["KWD"], + languages: ["ar"], + }, + KY: { + name: "Cayman Islands", + native: "Cayman Islands", + phone: [1345], + continent: "NA", + capital: "George Town", + currency: ["KYD"], + languages: ["en"], + }, + KZ: { + name: "Kazakhstan", + native: "Қазақстан", + phone: [7], + continent: "AS", + continents: ["AS", "EU"], + capital: "Astana", + currency: 
["KZT"], + languages: ["kk", "ru"], + }, + LA: { + name: "Laos", + native: "ສປປລາວ", + phone: [856], + continent: "AS", + capital: "Vientiane", + currency: ["LAK"], + languages: ["lo"], + }, + LB: { + name: "Lebanon", + native: "لبنان", + phone: [961], + continent: "AS", + capital: "Beirut", + currency: ["LBP"], + languages: ["ar", "fr"], + }, + LC: { + name: "Saint Lucia", + native: "Saint Lucia", + phone: [1758], + continent: "NA", + capital: "Castries", + currency: ["XCD"], + languages: ["en"], + }, + LI: { + name: "Liechtenstein", + native: "Liechtenstein", + phone: [423], + continent: "EU", + capital: "Vaduz", + currency: ["CHF"], + languages: ["de"], + }, + LK: { + name: "Sri Lanka", + native: "śrī laṃkāva", + phone: [94], + continent: "AS", + capital: "Colombo", + currency: ["LKR"], + languages: ["si", "ta"], + }, + LR: { + name: "Liberia", + native: "Liberia", + phone: [231], + continent: "AF", + capital: "Monrovia", + currency: ["LRD"], + languages: ["en"], + }, + LS: { + name: "Lesotho", + native: "Lesotho", + phone: [266], + continent: "AF", + capital: "Maseru", + currency: ["LSL", "ZAR"], + languages: ["en", "st"], + }, + LT: { + name: "Lithuania", + native: "Lietuva", + phone: [370], + continent: "EU", + capital: "Vilnius", + currency: ["EUR"], + languages: ["lt"], + }, + LU: { + name: "Luxembourg", + native: "Luxembourg", + phone: [352], + continent: "EU", + capital: "Luxembourg", + currency: ["EUR"], + languages: ["fr", "de", "lb"], + }, + LV: { + name: "Latvia", + native: "Latvija", + phone: [371], + continent: "EU", + capital: "Riga", + currency: ["EUR"], + languages: ["lv"], + }, + LY: { + name: "Libya", + native: "‏ليبيا", + phone: [218], + continent: "AF", + capital: "Tripoli", + currency: ["LYD"], + languages: ["ar"], + }, + MA: { + name: "Morocco", + native: "المغرب", + phone: [212], + continent: "AF", + capital: "Rabat", + currency: ["MAD"], + languages: ["ar"], + }, + MC: { + name: "Monaco", + native: "Monaco", + phone: [377], + continent: "EU", + capital: "Monaco", + currency: ["EUR"], + languages: ["fr"], + }, + MD: { + name: "Moldova", + native: "Moldova", + phone: [373], + continent: "EU", + capital: "Chișinău", + currency: ["MDL"], + languages: ["ro"], + }, + ME: { + name: "Montenegro", + native: "Црна Гора", + phone: [382], + continent: "EU", + capital: "Podgorica", + currency: ["EUR"], + languages: ["sr", "bs", "sq", "hr"], + }, + MF: { + name: "Saint Martin", + native: "Saint-Martin", + phone: [590], + continent: "NA", + capital: "Marigot", + currency: ["EUR"], + languages: ["en", "fr", "nl"], + }, + MG: { + name: "Madagascar", + native: "Madagasikara", + phone: [261], + continent: "AF", + capital: "Antananarivo", + currency: ["MGA"], + languages: ["fr", "mg"], + }, + MH: { + name: "Marshall Islands", + native: "M̧ajeļ", + phone: [692], + continent: "OC", + capital: "Majuro", + currency: ["USD"], + languages: ["en", "mh"], + }, + MK: { + name: "North Macedonia", + native: "Северна Македонија", + phone: [389], + continent: "EU", + capital: "Skopje", + currency: ["MKD"], + languages: ["mk"], + }, + ML: { + name: "Mali", + native: "Mali", + phone: [223], + continent: "AF", + capital: "Bamako", + currency: ["XOF"], + languages: ["fr"], + }, + MM: { + name: "Myanmar (Burma)", + native: "မြန်မာ", + phone: [95], + continent: "AS", + capital: "Naypyidaw", + currency: ["MMK"], + languages: ["my"], + }, + MN: { + name: "Mongolia", + native: "Монгол улс", + phone: [976], + continent: "AS", + capital: "Ulan Bator", + currency: ["MNT"], + languages: ["mn"], + }, + MO: { 
+ name: "Macao", + native: "澳門", + phone: [853], + continent: "AS", + capital: "", + currency: ["MOP"], + languages: ["zh", "pt"], + }, + MP: { + name: "Northern Mariana Islands", + native: "Northern Mariana Islands", + phone: [1670], + continent: "OC", + capital: "Saipan", + currency: ["USD"], + languages: ["en", "ch"], + }, + MQ: { + name: "Martinique", + native: "Martinique", + phone: [596], + continent: "NA", + capital: "Fort-de-France", + currency: ["EUR"], + languages: ["fr"], + }, + MR: { + name: "Mauritania", + native: "موريتانيا", + phone: [222], + continent: "AF", + capital: "Nouakchott", + currency: ["MRU"], + languages: ["ar"], + }, + MS: { + name: "Montserrat", + native: "Montserrat", + phone: [1664], + continent: "NA", + capital: "Plymouth", + currency: ["XCD"], + languages: ["en"], + }, + MT: { + name: "Malta", + native: "Malta", + phone: [356], + continent: "EU", + capital: "Valletta", + currency: ["EUR"], + languages: ["mt", "en"], + }, + MU: { + name: "Mauritius", + native: "Maurice", + phone: [230], + continent: "AF", + capital: "Port Louis", + currency: ["MUR"], + languages: ["en"], + }, + MV: { + name: "Maldives", + native: "Maldives", + phone: [960], + continent: "AS", + capital: "Malé", + currency: ["MVR"], + languages: ["dv"], + }, + MW: { + name: "Malawi", + native: "Malawi", + phone: [265], + continent: "AF", + capital: "Lilongwe", + currency: ["MWK"], + languages: ["en", "ny"], + }, + MX: { + name: "Mexico", + native: "México", + phone: [52], + continent: "NA", + capital: "Mexico City", + currency: ["MXN"], + languages: ["es"], + }, + MY: { + name: "Malaysia", + native: "Malaysia", + phone: [60], + continent: "AS", + capital: "Kuala Lumpur", + currency: ["MYR"], + languages: ["ms"], + }, + MZ: { + name: "Mozambique", + native: "Moçambique", + phone: [258], + continent: "AF", + capital: "Maputo", + currency: ["MZN"], + languages: ["pt"], + }, + NA: { + name: "Namibia", + native: "Namibia", + phone: [264], + continent: "AF", + capital: "Windhoek", + currency: ["NAD", "ZAR"], + languages: ["en", "af"], + }, + NC: { + name: "New Caledonia", + native: "Nouvelle-Calédonie", + phone: [687], + continent: "OC", + capital: "Nouméa", + currency: ["XPF"], + languages: ["fr"], + }, + NE: { + name: "Niger", + native: "Niger", + phone: [227], + continent: "AF", + capital: "Niamey", + currency: ["XOF"], + languages: ["fr"], + }, + NF: { + name: "Norfolk Island", + native: "Norfolk Island", + phone: [672], + continent: "OC", + capital: "Kingston", + currency: ["AUD"], + languages: ["en"], + }, + NG: { + name: "Nigeria", + native: "Nigeria", + phone: [234], + continent: "AF", + capital: "Abuja", + currency: ["NGN"], + languages: ["en"], + }, + NI: { + name: "Nicaragua", + native: "Nicaragua", + phone: [505], + continent: "NA", + capital: "Managua", + currency: ["NIO"], + languages: ["es"], + }, + NL: { + name: "Netherlands", + native: "Nederland", + phone: [31], + continent: "EU", + capital: "Amsterdam", + currency: ["EUR"], + languages: ["nl"], + }, + NO: { + name: "Norway", + native: "Norge", + phone: [47], + continent: "EU", + capital: "Oslo", + currency: ["NOK"], + languages: ["no", "nb", "nn"], + }, + NP: { + name: "Nepal", + native: "नेपाल", + phone: [977], + continent: "AS", + capital: "Kathmandu", + currency: ["NPR"], + languages: ["ne"], + }, + NR: { + name: "Nauru", + native: "Nauru", + phone: [674], + continent: "OC", + capital: "Yaren", + currency: ["AUD"], + languages: ["en", "na"], + }, + NU: { + name: "Niue", + native: "Niuē", + phone: [683], + continent: "OC", + 
capital: "Alofi", + currency: ["NZD"], + languages: ["en"], + }, + NZ: { + name: "New Zealand", + native: "New Zealand", + phone: [64], + continent: "OC", + capital: "Wellington", + currency: ["NZD"], + languages: ["en", "mi"], + }, + OM: { + name: "Oman", + native: "عمان", + phone: [968], + continent: "AS", + capital: "Muscat", + currency: ["OMR"], + languages: ["ar"], + }, + PA: { + name: "Panama", + native: "Panamá", + phone: [507], + continent: "NA", + capital: "Panama City", + currency: ["PAB", "USD"], + languages: ["es"], + }, + PE: { + name: "Peru", + native: "Perú", + phone: [51], + continent: "SA", + capital: "Lima", + currency: ["PEN"], + languages: ["es"], + }, + PF: { + name: "French Polynesia", + native: "Polynésie française", + phone: [689], + continent: "OC", + capital: "Papeetē", + currency: ["XPF"], + languages: ["fr"], + }, + PG: { + name: "Papua New Guinea", + native: "Papua Niugini", + phone: [675], + continent: "OC", + capital: "Port Moresby", + currency: ["PGK"], + languages: ["en"], + }, + PH: { + name: "Philippines", + native: "Pilipinas", + phone: [63], + continent: "AS", + capital: "Manila", + currency: ["PHP"], + languages: ["en"], + }, + PK: { + name: "Pakistan", + native: "Pakistan", + phone: [92], + continent: "AS", + capital: "Islamabad", + currency: ["PKR"], + languages: ["en", "ur"], + }, + PL: { + name: "Poland", + native: "Polska", + phone: [48], + continent: "EU", + capital: "Warsaw", + currency: ["PLN"], + languages: ["pl"], + }, + PM: { + name: "Saint Pierre and Miquelon", + native: "Saint-Pierre-et-Miquelon", + phone: [508], + continent: "NA", + capital: "Saint-Pierre", + currency: ["EUR"], + languages: ["fr"], + }, + PN: { + name: "Pitcairn Islands", + native: "Pitcairn Islands", + phone: [64], + continent: "OC", + capital: "Adamstown", + currency: ["NZD"], + languages: ["en"], + }, + PR: { + name: "Puerto Rico", + native: "Puerto Rico", + phone: [1787, 1939], + continent: "NA", + capital: "San Juan", + currency: ["USD"], + languages: ["es", "en"], + }, + PS: { + name: "Palestine", + native: "فلسطين", + phone: [970], + continent: "AS", + capital: "Ramallah", + currency: ["ILS"], + languages: ["ar"], + }, + PT: { + name: "Portugal", + native: "Portugal", + phone: [351], + continent: "EU", + capital: "Lisbon", + currency: ["EUR"], + languages: ["pt"], + }, + PW: { + name: "Palau", + native: "Palau", + phone: [680], + continent: "OC", + capital: "Ngerulmud", + currency: ["USD"], + languages: ["en"], + }, + PY: { + name: "Paraguay", + native: "Paraguay", + phone: [595], + continent: "SA", + capital: "Asunción", + currency: ["PYG"], + languages: ["es", "gn"], + }, + QA: { + name: "Qatar", + native: "قطر", + phone: [974], + continent: "AS", + capital: "Doha", + currency: ["QAR"], + languages: ["ar"], + }, + RE: { + name: "Reunion", + native: "La Réunion", + phone: [262], + continent: "AF", + capital: "Saint-Denis", + currency: ["EUR"], + languages: ["fr"], + }, + RO: { + name: "Romania", + native: "România", + phone: [40], + continent: "EU", + capital: "Bucharest", + currency: ["RON"], + languages: ["ro"], + }, + RS: { + name: "Serbia", + native: "Србија", + phone: [381], + continent: "EU", + capital: "Belgrade", + currency: ["RSD"], + languages: ["sr"], + }, + RU: { + name: "Russia", + native: "Россия", + phone: [7], + continent: "AS", + continents: ["AS", "EU"], + capital: "Moscow", + currency: ["RUB"], + languages: ["ru"], + }, + RW: { + name: "Rwanda", + native: "Rwanda", + phone: [250], + continent: "AF", + capital: "Kigali", + currency: ["RWF"], + 
languages: ["rw", "en", "fr"], + }, + SA: { + name: "Saudi Arabia", + native: "العربية السعودية", + phone: [966], + continent: "AS", + capital: "Riyadh", + currency: ["SAR"], + languages: ["ar"], + }, + SB: { + name: "Solomon Islands", + native: "Solomon Islands", + phone: [677], + continent: "OC", + capital: "Honiara", + currency: ["SBD"], + languages: ["en"], + }, + SC: { + name: "Seychelles", + native: "Seychelles", + phone: [248], + continent: "AF", + capital: "Victoria", + currency: ["SCR"], + languages: ["fr", "en"], + }, + SD: { + name: "Sudan", + native: "السودان", + phone: [249], + continent: "AF", + capital: "Khartoum", + currency: ["SDG"], + languages: ["ar", "en"], + }, + SE: { + name: "Sweden", + native: "Sverige", + phone: [46], + continent: "EU", + capital: "Stockholm", + currency: ["SEK"], + languages: ["sv"], + }, + SG: { + name: "Singapore", + native: "Singapore", + phone: [65], + continent: "AS", + capital: "Singapore", + currency: ["SGD"], + languages: ["en", "ms", "ta", "zh"], + }, + SH: { + name: "Saint Helena", + native: "Saint Helena", + phone: [290], + continent: "AF", + capital: "Jamestown", + currency: ["SHP"], + languages: ["en"], + }, + SI: { + name: "Slovenia", + native: "Slovenija", + phone: [386], + continent: "EU", + capital: "Ljubljana", + currency: ["EUR"], + languages: ["sl"], + }, + SJ: { + name: "Svalbard and Jan Mayen", + native: "Svalbard og Jan Mayen", + phone: [4779], + continent: "EU", + capital: "Longyearbyen", + currency: ["NOK"], + languages: ["no"], + }, + SK: { + name: "Slovakia", + native: "Slovensko", + phone: [421], + continent: "EU", + capital: "Bratislava", + currency: ["EUR"], + languages: ["sk"], + }, + SL: { + name: "Sierra Leone", + native: "Sierra Leone", + phone: [232], + continent: "AF", + capital: "Freetown", + currency: ["SLL"], + languages: ["en"], + }, + SM: { + name: "San Marino", + native: "San Marino", + phone: [378], + continent: "EU", + capital: "City of San Marino", + currency: ["EUR"], + languages: ["it"], + }, + SN: { + name: "Senegal", + native: "Sénégal", + phone: [221], + continent: "AF", + capital: "Dakar", + currency: ["XOF"], + languages: ["fr"], + }, + SO: { + name: "Somalia", + native: "Soomaaliya", + phone: [252], + continent: "AF", + capital: "Mogadishu", + currency: ["SOS"], + languages: ["so", "ar"], + }, + SR: { + name: "Suriname", + native: "Suriname", + phone: [597], + continent: "SA", + capital: "Paramaribo", + currency: ["SRD"], + languages: ["nl"], + }, + SS: { + name: "South Sudan", + native: "South Sudan", + phone: [211], + continent: "AF", + capital: "Juba", + currency: ["SSP"], + languages: ["en"], + }, + ST: { + name: "Sao Tome and Principe", + native: "São Tomé e Príncipe", + phone: [239], + continent: "AF", + capital: "São Tomé", + currency: ["STN"], + languages: ["pt"], + }, + SV: { + name: "El Salvador", + native: "El Salvador", + phone: [503], + continent: "NA", + capital: "San Salvador", + currency: ["SVC", "USD"], + languages: ["es"], + }, + SX: { + name: "Sint Maarten", + native: "Sint Maarten", + phone: [1721], + continent: "NA", + capital: "Philipsburg", + currency: ["ANG"], + languages: ["nl", "en"], + }, + SY: { + name: "Syria", + native: "سوريا", + phone: [963], + continent: "AS", + capital: "Damascus", + currency: ["SYP"], + languages: ["ar"], + }, + SZ: { + name: "Eswatini", + native: "Eswatini", + phone: [268], + continent: "AF", + capital: "Lobamba", + currency: ["SZL"], + languages: ["en", "ss"], + }, + TC: { + name: "Turks and Caicos Islands", + native: "Turks and Caicos 
Islands", + phone: [1649], + continent: "NA", + capital: "Cockburn Town", + currency: ["USD"], + languages: ["en"], + }, + TD: { + name: "Chad", + native: "Tchad", + phone: [235], + continent: "AF", + capital: "N'Djamena", + currency: ["XAF"], + languages: ["fr", "ar"], + }, + TF: { + name: "French Southern Territories", + native: "Territoire des Terres australes et antarctiques fr", + phone: [262], + continent: "AN", + capital: "Port-aux-Français", + currency: ["EUR"], + languages: ["fr"], + }, + TG: { + name: "Togo", + native: "Togo", + phone: [228], + continent: "AF", + capital: "Lomé", + currency: ["XOF"], + languages: ["fr"], + }, + TH: { + name: "Thailand", + native: "ประเทศไทย", + phone: [66], + continent: "AS", + capital: "Bangkok", + currency: ["THB"], + languages: ["th"], + }, + TJ: { + name: "Tajikistan", + native: "Тоҷикистон", + phone: [992], + continent: "AS", + capital: "Dushanbe", + currency: ["TJS"], + languages: ["tg", "ru"], + }, + TK: { + name: "Tokelau", + native: "Tokelau", + phone: [690], + continent: "OC", + capital: "Fakaofo", + currency: ["NZD"], + languages: ["en"], + }, + TL: { + name: "East Timor", + native: "Timor-Leste", + phone: [670], + continent: "OC", + capital: "Dili", + currency: ["USD"], + languages: ["pt"], + }, + TM: { + name: "Turkmenistan", + native: "Türkmenistan", + phone: [993], + continent: "AS", + capital: "Ashgabat", + currency: ["TMT"], + languages: ["tk", "ru"], + }, + TN: { + name: "Tunisia", + native: "تونس", + phone: [216], + continent: "AF", + capital: "Tunis", + currency: ["TND"], + languages: ["ar"], + }, + TO: { + name: "Tonga", + native: "Tonga", + phone: [676], + continent: "OC", + capital: "Nuku'alofa", + currency: ["TOP"], + languages: ["en", "to"], + }, + TR: { + name: "Turkey", + native: "Türkiye", + phone: [90], + continent: "AS", + continents: ["AS", "EU"], + capital: "Ankara", + currency: ["TRY"], + languages: ["tr"], + }, + TT: { + name: "Trinidad and Tobago", + native: "Trinidad and Tobago", + phone: [1868], + continent: "NA", + capital: "Port of Spain", + currency: ["TTD"], + languages: ["en"], + }, + TV: { + name: "Tuvalu", + native: "Tuvalu", + phone: [688], + continent: "OC", + capital: "Funafuti", + currency: ["AUD"], + languages: ["en"], + }, + TW: { + name: "Taiwan", + native: "臺灣", + phone: [886], + continent: "AS", + capital: "Taipei", + currency: ["TWD"], + languages: ["zh"], + }, + TZ: { + name: "Tanzania", + native: "Tanzania", + phone: [255], + continent: "AF", + capital: "Dodoma", + currency: ["TZS"], + languages: ["sw", "en"], + }, + UA: { + name: "Ukraine", + native: "Україна", + phone: [380], + continent: "EU", + capital: "Kyiv", + currency: ["UAH"], + languages: ["uk"], + }, + UG: { + name: "Uganda", + native: "Uganda", + phone: [256], + continent: "AF", + capital: "Kampala", + currency: ["UGX"], + languages: ["en", "sw"], + }, + UM: { + name: "U.S. 
Minor Outlying Islands", + native: "United States Minor Outlying Islands", + phone: [1], + continent: "OC", + capital: "", + currency: ["USD"], + languages: ["en"], + }, + US: { + name: "United States", + native: "United States", + phone: [1], + continent: "NA", + capital: "Washington D.C.", + currency: ["USD", "USN", "USS"], + languages: ["en"], + }, + UY: { + name: "Uruguay", + native: "Uruguay", + phone: [598], + continent: "SA", + capital: "Montevideo", + currency: ["UYI", "UYU"], + languages: ["es"], + }, + UZ: { + name: "Uzbekistan", + native: "O'zbekiston", + phone: [998], + continent: "AS", + capital: "Tashkent", + currency: ["UZS"], + languages: ["uz", "ru"], + }, + VA: { + name: "Vatican City", + native: "Vaticano", + phone: [379], + continent: "EU", + capital: "Vatican City", + currency: ["EUR"], + languages: ["it", "la"], + }, + VC: { + name: "Saint Vincent and the Grenadines", + native: "Saint Vincent and the Grenadines", + phone: [1784], + continent: "NA", + capital: "Kingstown", + currency: ["XCD"], + languages: ["en"], + }, + VE: { + name: "Venezuela", + native: "Venezuela", + phone: [58], + continent: "SA", + capital: "Caracas", + currency: ["VES"], + languages: ["es"], + }, + VG: { + name: "British Virgin Islands", + native: "British Virgin Islands", + phone: [1284], + continent: "NA", + capital: "Road Town", + currency: ["USD"], + languages: ["en"], + }, + VI: { + name: "U.S. Virgin Islands", + native: "United States Virgin Islands", + phone: [1340], + continent: "NA", + capital: "Charlotte Amalie", + currency: ["USD"], + languages: ["en"], + }, + VN: { + name: "Vietnam", + native: "Việt Nam", + phone: [84], + continent: "AS", + capital: "Hanoi", + currency: ["VND"], + languages: ["vi"], + }, + VU: { + name: "Vanuatu", + native: "Vanuatu", + phone: [678], + continent: "OC", + capital: "Port Vila", + currency: ["VUV"], + languages: ["bi", "en", "fr"], + }, + WF: { + name: "Wallis and Futuna", + native: "Wallis et Futuna", + phone: [681], + continent: "OC", + capital: "Mata-Utu", + currency: ["XPF"], + languages: ["fr"], + }, + WS: { + name: "Samoa", + native: "Samoa", + phone: [685], + continent: "OC", + capital: "Apia", + currency: ["WST"], + languages: ["sm", "en"], + }, + XK: { + name: "Kosovo", + native: "Republika e Kosovës", + phone: [377, 381, 383, 386], + continent: "EU", + capital: "Pristina", + currency: ["EUR"], + languages: ["sq", "sr"], + userAssigned: true, + }, + YE: { + name: "Yemen", + native: "اليَمَن", + phone: [967], + continent: "AS", + capital: "Sana'a", + currency: ["YER"], + languages: ["ar"], + }, + YT: { + name: "Mayotte", + native: "Mayotte", + phone: [262], + continent: "AF", + capital: "Mamoudzou", + currency: ["EUR"], + languages: ["fr"], + }, + ZA: { + name: "South Africa", + native: "South Africa", + phone: [27], + continent: "AF", + capital: "Pretoria", + currency: ["ZAR"], + languages: ["af", "en", "nr", "st", "ss", "tn", "ts", "ve", "xh", "zu"], + }, + ZM: { + name: "Zambia", + native: "Zambia", + phone: [260], + continent: "AF", + capital: "Lusaka", + currency: ["ZMW"], + languages: ["en"], + }, + ZW: { + name: "Zimbabwe", + native: "Zimbabwe", + phone: [263], + continent: "AF", + capital: "Harare", + currency: ["USD", "ZAR", "BWP", "GBP", "AUD", "CNY", "INR", "JPY"], + languages: ["en", "sn", "nd"], + }, +}; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index c564c471..5285a9f4 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ 
-593,6 +593,7 @@ export class WebScraperDataProvider { disableJsDom: options.pageOptions?.disableJsDom ?? false, atsv: options.pageOptions?.atsv ?? false, actions: options.pageOptions?.actions ?? undefined, + geolocation: options.pageOptions?.geolocation ?? undefined, }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index f715c427..29f62762 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -28,7 +28,7 @@ export async function scrapWithFireEngine({ waitFor = 0, screenshot = false, fullPageScreenshot = false, - pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false }, + pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "us" } }, fireEngineOptions = {}, headers, options, @@ -40,7 +40,7 @@ export async function scrapWithFireEngine({ waitFor?: number; screenshot?: boolean; fullPageScreenshot?: boolean; - pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean }; + pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } }; fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; @@ -118,6 +118,7 @@ export async function scrapWithFireEngine({ ...fireEngineOptionsParam, atsv: pageOptions?.atsv ?? false, scrollXPaths: pageOptions?.scrollXPaths ?? [], + geolocation: pageOptions?.geolocation, actions: actions, }, { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 13ca7dd2..b4141dc1 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -156,6 +156,7 @@ export async function scrapSingleUrl( disableJsDom: pageOptions.disableJsDom ?? false, atsv: pageOptions.atsv ?? false, actions: pageOptions.actions ?? undefined, + geolocation: pageOptions.geolocation ?? 
undefined, } if (extractorOptions) { From 795e5a92287dd2969ca58de040d7ce0e1d46c308 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 15 Oct 2024 21:36:13 -0300 Subject: [PATCH 033/102] Update metadata.ts --- .../src/scraper/WebScraper/utils/metadata.ts | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index fac53b38..0c2af118 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -70,11 +70,12 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { let pageStatusCode: number | null = null; let pageError: string | null = null; + const customMetadata: Record = {}; + try { title = soup("title").text() || null; description = soup('meta[name="description"]').attr("content") || null; - // Assuming the language is part of the URL as per the regex pattern language = soup('html').attr('lang') || null; keywords = soup('meta[name="keywords"]').attr("content") || null; @@ -104,6 +105,22 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null; dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; + // Extract all meta tags for custom metadata + soup("meta").each((i, elem) => { + const name = soup(elem).attr("name") || soup(elem).attr("property"); + const content = soup(elem).attr("content"); + + if (name && content) { + if (customMetadata[name] === undefined) { + customMetadata[name] = content; + } else if (Array.isArray(customMetadata[name])) { + (customMetadata[name] as string[]).push(content); + } else { + customMetadata[name] = [customMetadata[name] as string, content]; + } + } + }); + } catch (error) { Logger.error(`Error extracting metadata: ${error}`); } @@ -141,5 +158,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { ...(sourceURL ? { sourceURL } : {}), ...(pageStatusCode ? { pageStatusCode } : {}), ...(pageError ? 
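
The new meta-tag loop above folds every name/property and content pair into customMetadata, keeping a single occurrence as a string and collecting repeats into an array before spreading it into the returned metadata. A self-contained sketch of that accumulation rule, using the arXiv citation tags that the tests added further down expect:

// Sketch of the accumulation rule applied per <meta> tag: the first occurrence
// stays a string, later occurrences of the same name become an array.
const customMetadata: Record<string, string | string[]> = {};

function addMeta(name: string, content: string) {
  const existing = customMetadata[name];
  if (existing === undefined) {
    customMetadata[name] = content;
  } else if (Array.isArray(existing)) {
    existing.push(content);
  } else {
    customMetadata[name] = [existing, content];
  }
}

addMeta("citation_title", "Strong Model Collapse");
addMeta("citation_author", "Dohmatob, Elvis");
addMeta("citation_author", "Feng, Yunzhen");
// customMetadata is now:
// { citation_title: "Strong Model Collapse",
//   citation_author: ["Dohmatob, Elvis", "Feng, Yunzhen"] }
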
{ pageError } : {}), + ...customMetadata, }; } From 027158fa4484d6d75a84ac01f71566852b4933da Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 15 Oct 2024 21:47:27 -0300 Subject: [PATCH 034/102] Nick: --- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 29f62762..d1dafd5f 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -28,7 +28,7 @@ export async function scrapWithFireEngine({ waitFor = 0, screenshot = false, fullPageScreenshot = false, - pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "us" } }, + pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } }, fireEngineOptions = {}, headers, options, From e5a5ca2446e70319ec65ee34d062d6569c8b16e6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 01:06:10 -0300 Subject: [PATCH 035/102] Update credit_billing.ts --- apps/api/src/services/billing/credit_billing.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 69b0617a..ab610392 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -58,17 +58,20 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: // In case chunk.price_credits is undefined, set it to a large number to avoid mistakes const totalPriceCredits = chunk.price_credits ?? 100000000; // Removal of + credits - const creditUsagePercentage = creditsWillBeUsed / totalPriceCredits; + const creditUsagePercentage = chunk.adjusted_credits_used / totalPriceCredits; // Compare the adjusted total credits used with the credits allowed by the plan if (creditsWillBeUsed > totalPriceCredits) { - sendNotification( - team_id, + // Only notify if their actual credits (not what they will use) used is greater than the total price credits + if(chunk.adjusted_credits_used > totalPriceCredits) { + sendNotification( + team_id, NotificationType.LIMIT_REACHED, chunk.sub_current_period_start, chunk.sub_current_period_end, chunk ); + } return { success: false, message: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk }; } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { // Send email notification for approaching credit limit From cf8fe93281c34a8511ba18a40e9b1e93e65bbba4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 01:09:57 -0300 Subject: [PATCH 036/102] Update credit_billing.ts --- apps/api/src/services/billing/credit_billing.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index ab610392..fc73ca7c 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -72,7 +72,7 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: chunk ); } - return { success: false, message: "Insufficient credits. 
For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk }; + return { success: false, message: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk }; } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { // Send email notification for approaching credit limit sendNotification( From 2c1a98f01976c7b9cebd9f8081d56636967b3984 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 13:37:40 -0300 Subject: [PATCH 037/102] Update excludeTags.ts --- apps/api/src/scraper/WebScraper/utils/excludeTags.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts index 400ef84f..b813c813 100644 --- a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts +++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts @@ -6,7 +6,6 @@ export const excludeNonMainTags = [ ".header", ".top", ".navbar", - "#header", ".footer", ".bottom", "#footer", @@ -39,8 +38,6 @@ export const excludeNonMainTags = [ "#search", ".share", "#share", - ".widget", - "#widget", ".cookie", "#cookie" ]; From ff906f7750cf45643c65b89239f234dfb40bc892 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 13:40:34 -0300 Subject: [PATCH 038/102] Update excludeTags.ts --- apps/api/src/scraper/WebScraper/utils/excludeTags.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts index b813c813..71e1637b 100644 --- a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts +++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts @@ -3,7 +3,6 @@ export const excludeNonMainTags = [ "footer", "nav", "aside", - ".header", ".top", ".navbar", ".footer", From 417c7697c385c8e227a632dd2e13b74d331a847b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 23:26:46 -0300 Subject: [PATCH 039/102] Update metadata.ts --- apps/api/src/scraper/WebScraper/utils/metadata.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index 0c2af118..b009c20c 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -34,6 +34,7 @@ interface Metadata { sourceURL?: string; pageStatusCode?: number; pageError?: string; + [key: string]: string | string[] | number | undefined; } export function extractMetadata(soup: CheerioAPI, url: string): Metadata { From c0384ea381e08fedee49a9216561f8b312568d51 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 23:32:44 -0300 Subject: [PATCH 040/102] Nick: added tests --- .../__tests__/e2e_v1_withAuth/index.test.ts | 43 +++++++++++++++++++ apps/api/src/controllers/v1/types.ts | 2 + 2 files changed, 45 insertions(+) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index eef65125..a4163472 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -121,6 +121,49 @@ describe("E2E Tests for v1 API Routes", () => { }, 30000 ); // 30 seconds timeout + + it.concurrent( + "should return a successful response with a valid API key", + async () => { + const scrapeRequest: ScrapeRequest = { + url: 
"https://arxiv.org/abs/2410.04840", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.markdown).toContain("Strong Model Collapse"); + expect(response.body.data.metadata.error).toBeUndefined(); + expect(response.body.data.metadata.description).toContain("Abstract page for arXiv paper 2410.04840: Strong Model Collapse"); + expect(response.body.data.metadata.citation_title).toBe("Strong Model Collapse"); + expect(response.body.data.metadata.citation_author).toEqual([ + "Dohmatob, Elvis", + "Feng, Yunzhen", + "Subramonian, Arjun", + "Kempe, Julia" + ]); + expect(response.body.data.metadata.citation_date).toBe("2024/10/07"); + expect(response.body.data.metadata.citation_online_date).toBe("2024/10/08"); + expect(response.body.data.metadata.citation_pdf_url).toBe("http://arxiv.org/pdf/2410.04840"); + expect(response.body.data.metadata.citation_arxiv_id).toBe("2410.04840"); + expect(response.body.data.metadata.citation_abstract).toContain("Within the scaling laws paradigm"); + expect(response.body.data.metadata.sourceURL).toBe("https://arxiv.org/abs/2410.04840"); + expect(response.body.data.metadata.statusCode).toBe(200); + }, + 30000 + ); it.concurrent( "should return a successful response with a valid API key and includeHtml set to true", async () => { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 998f2dfa..0975bb01 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -250,6 +250,8 @@ export type Document = { sourceURL?: string; statusCode?: number; error?: string; + [key: string]: string | string[] | number | undefined; + }; }; From 8974230db47abd3dc041896fe032a4a91951eef5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 23:35:03 -0300 Subject: [PATCH 041/102] Nick: formatting + error handling --- .../src/scraper/WebScraper/utils/metadata.ts | 74 ++++++++++++------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index b009c20c..aecae481 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -76,52 +76,72 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { try { title = soup("title").text() || null; description = soup('meta[name="description"]').attr("content") || null; - - language = soup('html').attr('lang') || null; + + language = soup("html").attr("lang") || null; keywords = soup('meta[name="keywords"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null; ogTitle = soup('meta[property="og:title"]').attr("content") || null; - ogDescription = soup('meta[property="og:description"]').attr("content") || null; + ogDescription = + soup('meta[property="og:description"]').attr("content") || null; ogUrl = soup('meta[property="og:url"]').attr("content") || null; ogImage = 
soup('meta[property="og:image"]').attr("content") || null; ogAudio = soup('meta[property="og:audio"]').attr("content") || null; - ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null; + ogDeterminer = + soup('meta[property="og:determiner"]').attr("content") || null; ogLocale = soup('meta[property="og:locale"]').attr("content") || null; - ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null; + ogLocaleAlternate = + soup('meta[property="og:locale:alternate"]') + .map((i, el) => soup(el).attr("content")) + .get() || null; ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null; ogVideo = soup('meta[property="og:video"]').attr("content") || null; - articleSection = soup('meta[name="article:section"]').attr("content") || null; + articleSection = + soup('meta[name="article:section"]').attr("content") || null; articleTag = soup('meta[name="article:tag"]').attr("content") || null; - publishedTime = soup('meta[property="article:published_time"]').attr("content") || null; - modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null; - dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null; + publishedTime = + soup('meta[property="article:published_time"]').attr("content") || null; + modifiedTime = + soup('meta[property="article:modified_time"]').attr("content") || null; + dctermsKeywords = + soup('meta[name="dcterms.keywords"]').attr("content") || null; dcDescription = soup('meta[name="dc.description"]').attr("content") || null; dcSubject = soup('meta[name="dc.subject"]').attr("content") || null; - dctermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || null; - dctermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || null; + dctermsSubject = + soup('meta[name="dcterms.subject"]').attr("content") || null; + dctermsAudience = + soup('meta[name="dcterms.audience"]').attr("content") || null; dcType = soup('meta[name="dc.type"]').attr("content") || null; dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null; dcDate = soup('meta[name="dc.date"]').attr("content") || null; - dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null; - dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; + dcDateCreated = + soup('meta[name="dc.date.created"]').attr("content") || null; + dctermsCreated = + soup('meta[name="dcterms.created"]').attr("content") || null; - // Extract all meta tags for custom metadata - soup("meta").each((i, elem) => { - const name = soup(elem).attr("name") || soup(elem).attr("property"); - const content = soup(elem).attr("content"); + try { + // Extract all meta tags for custom metadata + soup("meta").each((i, elem) => { + try { + const name = soup(elem).attr("name") || soup(elem).attr("property"); + const content = soup(elem).attr("content"); - if (name && content) { - if (customMetadata[name] === undefined) { - customMetadata[name] = content; - } else if (Array.isArray(customMetadata[name])) { - (customMetadata[name] as string[]).push(content); - } else { - customMetadata[name] = [customMetadata[name] as string, content]; + if (name && content) { + if (customMetadata[name] === undefined) { + customMetadata[name] = content; + } else if (Array.isArray(customMetadata[name])) { + (customMetadata[name] as string[]).push(content); + } else { + customMetadata[name] = [customMetadata[name] as string, content]; + } + } + } catch (error) { + 
Logger.error(`Error extracting custom metadata (in): ${error}`); } - } - }); - + }); + } catch (error) { + Logger.error(`Error extracting custom metadata: ${error}`); + } } catch (error) { Logger.error(`Error extracting metadata: ${error}`); } From 2ac50a16f536b834087759a3477b8d8221eb9b0e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 23:37:07 -0300 Subject: [PATCH 042/102] Update metadata.ts --- apps/api/src/scraper/WebScraper/utils/metadata.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index aecae481..531dc17c 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -74,6 +74,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { const customMetadata: Record = {}; try { + // TODO: remove this as it is redundant with the below implementation title = soup("title").text() || null; description = soup('meta[name="description"]').attr("content") || null; From 06b8d24a4cc3f0864a1c970e51021c140cf4c36e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 23:50:21 -0300 Subject: [PATCH 043/102] Update scrape.ts --- apps/api/src/controllers/v1/scrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 6da48999..d0d4c5fc 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -139,7 +139,7 @@ export async function scrapeController( crawlerOptions: {}, pageOptions: pageOptions, origin: origin, - extractor_options: { mode: "markdown" }, + extractor_options: extractorOptions, num_tokens: numTokens, }); From 03b37998fdce3f46c3d52d559f59ea432b0ff68d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 17 Oct 2024 19:40:18 +0200 Subject: [PATCH 044/102] feat: bulk scrape --- apps/api/src/controllers/v1/bulk-scrape.ts | 99 ++++++++++++++++++++++ apps/api/src/controllers/v1/types.ts | 32 ++++--- apps/api/src/lib/crawl-redis.ts | 2 +- apps/api/src/main/runWebScraper.ts | 2 +- apps/api/src/routes/v1.ts | 16 ++++ apps/api/src/services/queue-worker.ts | 6 +- 6 files changed, 140 insertions(+), 17 deletions(-) create mode 100644 apps/api/src/controllers/v1/bulk-scrape.ts diff --git a/apps/api/src/controllers/v1/bulk-scrape.ts b/apps/api/src/controllers/v1/bulk-scrape.ts new file mode 100644 index 00000000..3e1afbd0 --- /dev/null +++ b/apps/api/src/controllers/v1/bulk-scrape.ts @@ -0,0 +1,99 @@ +import { Response } from "express"; +import { v4 as uuidv4 } from "uuid"; +import { + BulkScrapeRequest, + bulkScrapeRequestSchema, + CrawlResponse, + legacyScrapeOptions, + RequestWithAuth, +} from "./types"; +import { + addCrawlJobs, + lockURLs, + saveCrawl, + StoredCrawl, +} from "../../lib/crawl-redis"; +import { logCrawl } from "../../services/logging/crawl_log"; +import { getScrapeQueue } from "../../services/queue-service"; +import { getJobPriority } from "../../lib/job-priority"; + +export async function bulkScrapeController( + req: RequestWithAuth<{}, CrawlResponse, BulkScrapeRequest>, + res: Response +) { + req.body = bulkScrapeRequestSchema.parse(req.body); + + const id = uuidv4(); + + await logCrawl(id, req.auth.team_id); + + let { remainingCredits } = req.account; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if(!useDbAuthentication){ + remainingCredits = Infinity; + } + + const pageOptions = 
legacyScrapeOptions(req.body); + + const sc: StoredCrawl = { + crawlerOptions: null, + pageOptions, + team_id: req.auth.team_id, + createdAt: Date.now(), + plan: req.auth.plan, + }; + + await saveCrawl(id, sc); + + let jobPriority = 20; + + // If it is over 1000, we need to get the job priority, + // otherwise we can use the default priority of 20 + if(req.body.urls.length > 1000){ + // set base to 21 + jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21}) + } + + const jobs = req.body.urls.map((x) => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url: x, + mode: "single_urls", + team_id: req.auth.team_id, + plan: req.auth.plan, + crawlerOptions: null, + pageOptions, + origin: "api", + crawl_id: id, + sitemapped: true, + v1: true, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + await lockURLs( + id, + jobs.map((x) => x.data.url) + ); + await addCrawlJobs( + id, + jobs.map((x) => x.opts.jobId) + ); + await getScrapeQueue().addBulk(jobs); + + const protocol = process.env.ENV === "local" ? req.protocol : "https"; + + return res.status(200).json({ + success: true, + id, + url: `${protocol}://${req.get("host")}/v1/bulk/scrape/${id}`, + }); +} + + diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 033de6e0..56c944ec 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -141,19 +141,29 @@ export const scrapeRequestSchema = scrapeOptions.extend({ return obj; }); -// export type ScrapeRequest = { -// url: string; -// formats?: Format[]; -// headers?: { [K: string]: string }; -// includeTags?: string[]; -// excludeTags?: string[]; -// onlyMainContent?: boolean; -// timeout?: number; -// waitFor?: number; -// } - export type ScrapeRequest = z.infer; +export const bulkScrapeRequestSchema = scrapeOptions.extend({ + urls: url.array(), + origin: z.string().optional().default("api"), +}).strict(strictMessage).refine( + (obj) => { + const hasExtractFormat = obj.formats?.includes("extract"); + const hasExtractOptions = obj.extract !== undefined; + return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions); + }, + { + message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa", + } +).transform((obj) => { + if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) { + return { ...obj, timeout: 60000 }; + } + return obj; +}); + +export type BulkScrapeRequest = z.infer; + const crawlerOptions = z.object({ includePaths: z.string().array().default([]), excludePaths: z.string().array().default([]), diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index f0ece43f..379bc179 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -3,7 +3,7 @@ import { redisConnection } from "../services/queue-service"; import { Logger } from "./logger"; export type StoredCrawl = { - originUrl: string; + originUrl?: string; crawlerOptions: any; pageOptions: any; team_id: string; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 6e642c65..8eb679e7 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -112,7 +112,7 @@ export async function runWebScraper({ } // remove docs with empty content - const filteredDocs = crawlerOptions.returnOnlyUrls + const filteredDocs = crawlerOptions?.returnOnlyUrls ? 
docs.map((doc) => { if (doc.metadata.sourceURL) { return { url: doc.metadata.sourceURL }; diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index b0ceceb4..2bd3d3ea 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -17,6 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel"; import { Logger } from "../lib/logger"; import { scrapeStatusController } from "../controllers/v1/scrape-status"; import { concurrencyCheckController } from "../controllers/v1/concurrency-check"; +import { bulkScrapeController } from "../controllers/v1/bulk-scrape"; // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; // import { searchController } from "../../src/controllers/v1/search"; @@ -122,6 +123,15 @@ v1Router.post( wrap(crawlController) ); +v1Router.post( + "/bulk/scrape", + authMiddleware(RateLimiterMode.Crawl), + checkCreditsMiddleware(), + blocklistMiddleware, + idempotencyMiddleware, + wrap(bulkScrapeController) +); + v1Router.post( "/map", authMiddleware(RateLimiterMode.Map), @@ -136,6 +146,12 @@ v1Router.get( wrap(crawlStatusController) ); +v1Router.get( + "/bulk/scrape/:jobId", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(crawlStatusController) +); + v1Router.get( "/scrape/:jobId", wrap(scrapeStatusController) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index bff51f74..1ea4775a 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -365,7 +365,7 @@ async function processJob(job: Job, token: string) { const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; - if (!job.data.sitemapped) { + if (!job.data.sitemapped && job.data.crawlerOptions !== null) { if (!sc.cancelled) { const crawler = crawlToCrawler(job.data.crawl_id, sc); @@ -414,9 +414,7 @@ async function processJob(job: Job, token: string) { } } - if (await finishCrawl(job.data.crawl_id)) { - - + if (await finishCrawl(job.data.crawl_id) && job.data.crawlerOptions !== null) { if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id); From 5f69358ce82321b27d9eb97570f715223a796e59 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 17 Oct 2024 15:47:28 -0400 Subject: [PATCH 045/102] Swarm Extractor Example --- .gitignore | 1 + .../.env.example | 3 + .../README.md | 37 ++++++ .../main.py | 120 ++++++++++++++++++ .../requirements.txt | 3 + 5 files changed, 164 insertions(+) create mode 100644 examples/openai_swarm_firecrawl_web_extractor/.env.example create mode 100644 examples/openai_swarm_firecrawl_web_extractor/README.md create mode 100644 examples/openai_swarm_firecrawl_web_extractor/main.py create mode 100644 examples/openai_swarm_firecrawl_web_extractor/requirements.txt diff --git a/.gitignore b/.gitignore index 240d6937..dcfd499a 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ apps/js-sdk/firecrawl/dist /examples/o1_web_crawler/firecrawl_env /examples/crm_lead_enrichment/crm_lead_enrichment_env +/.venv diff --git a/examples/openai_swarm_firecrawl_web_extractor/.env.example b/examples/openai_swarm_firecrawl_web_extractor/.env.example new file mode 100644 index 00000000..9385c44f --- /dev/null +++ b/examples/openai_swarm_firecrawl_web_extractor/.env.example @@ -0,0 +1,3 @@ +OPENAI_API_KEY= +FIRECRAWL_API_KEY= +SERP_API_KEY= \ No newline at end of file diff --git a/examples/openai_swarm_firecrawl_web_extractor/README.md 
b/examples/openai_swarm_firecrawl_web_extractor/README.md new file mode 100644 index 00000000..b256ae7d --- /dev/null +++ b/examples/openai_swarm_firecrawl_web_extractor/README.md @@ -0,0 +1,37 @@ +# Swarm Firecrawl Marketing Agent + +A multi-agent system using [OpenAI Swarm](https://github.com/openai/swarm) for AI-powered marketing strategies using [Firecrawl](https://firecrawl.dev) for web scraping. + +## Agents + +1. User Interface: Manages user interactions +2. Website Scraper: Extracts clean LLM-ready content via Firecrawl API +3. Analyst: Provides marketing insights +4. Campaign Idea: Generates marketing campaign concepts +5. Copywriter: Creates compelling marketing copy + +## Requirements + +- [Firecrawl](https://firecrawl.dev) API key +- [OpenAI](https://platform.openai.com/api-keys) API key + +## Setup + +1. Install the required packages: + ``` + pip install -r requirements.txt + ``` + +2. Set up your environment variables in a `.env` file: + ``` + OPENAI_API_KEY=your_openai_api_key + FIRECRAWL_API_KEY=your_firecrawl_api_key + ``` + +## Usage + +Run the main script to start the interactive demo: + +``` +python main.py +``` \ No newline at end of file diff --git a/examples/openai_swarm_firecrawl_web_extractor/main.py b/examples/openai_swarm_firecrawl_web_extractor/main.py new file mode 100644 index 00000000..b96bf5ac --- /dev/null +++ b/examples/openai_swarm_firecrawl_web_extractor/main.py @@ -0,0 +1,120 @@ +import os +from firecrawl import FirecrawlApp +from swarm import Agent +from swarm.repl import run_demo_loop +import dotenv +from serpapi import GoogleSearch +from openai import OpenAI + +dotenv.load_dotenv() + +# Initialize FirecrawlApp and OpenAI +app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def search_google(query, objective): + """Search Google using SerpAPI.""" + print(f"Parameters: query={query}, objective={objective}") + search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) + results = search.get_dict().get("organic_results", []) + return {"objective": objective, "results": results} + +def map_url_pages(url, objective): + """Map a website's pages using Firecrawl.""" + + search_query = generate_completion( + "website search query generator", + f"Generate a 1-2 word search query for the website: {url} based on the objective", + "Objective: " + objective + ) + print(f"Parameters: url={url}, objective={objective}, search_query={search_query}") + map_status = app.map_url(url, params={'search': search_query}) + if map_status.get('status') == 'success': + links = map_status.get('links', []) + top_link = links[0] if links else None + return {"objective": objective, "results": [top_link] if top_link else []} + else: + return {"objective": objective, "results": []} + +def scrape_url(url, objective): + """Scrape a website using Firecrawl.""" + print(f"Parameters: url={url}, objective={objective}") + scrape_status = app.scrape_url( + url, + params={'formats': ['markdown']} + ) + return {"objective": objective, "results": scrape_status} + +def analyze_website_content(content, objective): + """Analyze the scraped website content using OpenAI.""" + print(f"Parameters: content={content[:50]}..., objective={objective}") + analysis = generate_completion( + "website data extractor", + f"Analyze the following website content and extract a JSON object based on the objective.", + "Objective: " + objective + "\nContent: " + content + ) + return {"objective": objective, "results": analysis} + +def 
generate_completion(role, task, content): + """Generate a completion using OpenAI.""" + print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") + response = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": f"You are a {role}. {task}"}, + {"role": "user", "content": content} + ] + ) + return response.choices[0].message.content + +def handoff_to_search_google(): + """Hand off the search query to the search google agent.""" + return google_search_agent + +def handoff_to_map_url(): + """Hand off the url to the map url agent.""" + return map_url_agent + +def handoff_to_website_scraper(): + """Hand off the url to the website scraper agent.""" + return website_scraper_agent + +def handoff_to_analyst(): + """Hand off the website content to the analyst agent.""" + return analyst_agent + + + +user_interface_agent = Agent( + name="User Interface Agent", + instructions="You are a user interface agent that handles all interactions with the user. You need to always start with an web data extraction objective that the user wants to achieve by searching the web, mapping the web pages, and extracting the content from a specific page. Be concise.", + functions=[handoff_to_search_google], +) + +google_search_agent = Agent( + name="Google Search Agent", + instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the map agent.", + functions=[search_google, handoff_to_map_url], +) + +map_url_agent = Agent( + name="Map URL Agent", + instructions="You are a map url agent specialized in mapping the web pages. When you are done, you must hand off the results to the website scraper agent.", + functions=[map_url_pages, handoff_to_website_scraper], +) + +website_scraper_agent = Agent( + name="Website Scraper Agent", + instructions="You are a website scraper agent specialized in scraping website content. When you are done, you must hand off the website content to the analyst agent to extract the data based on the objective.", + functions=[scrape_url, handoff_to_analyst], +) + +analyst_agent = Agent( + name="Analyst Agent", + instructions="You are an analyst agent that examines website content and returns a JSON object. 
When you are done, you must return a JSON object.", + functions=[analyze_website_content], +) + +if __name__ == "__main__": + # Run the demo loop with the user interface agent + run_demo_loop(user_interface_agent, stream=True) \ No newline at end of file diff --git a/examples/openai_swarm_firecrawl_web_extractor/requirements.txt b/examples/openai_swarm_firecrawl_web_extractor/requirements.txt new file mode 100644 index 00000000..60bc9b84 --- /dev/null +++ b/examples/openai_swarm_firecrawl_web_extractor/requirements.txt @@ -0,0 +1,3 @@ +firecrawl-py +openai +serpapi \ No newline at end of file From dff71a81791bd2892b8703794935ad33712a104a Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 17 Oct 2024 15:48:36 -0400 Subject: [PATCH 046/102] Delete README.md --- .../README.md | 37 ------------------- 1 file changed, 37 deletions(-) delete mode 100644 examples/openai_swarm_firecrawl_web_extractor/README.md diff --git a/examples/openai_swarm_firecrawl_web_extractor/README.md b/examples/openai_swarm_firecrawl_web_extractor/README.md deleted file mode 100644 index b256ae7d..00000000 --- a/examples/openai_swarm_firecrawl_web_extractor/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Swarm Firecrawl Marketing Agent - -A multi-agent system using [OpenAI Swarm](https://github.com/openai/swarm) for AI-powered marketing strategies using [Firecrawl](https://firecrawl.dev) for web scraping. - -## Agents - -1. User Interface: Manages user interactions -2. Website Scraper: Extracts clean LLM-ready content via Firecrawl API -3. Analyst: Provides marketing insights -4. Campaign Idea: Generates marketing campaign concepts -5. Copywriter: Creates compelling marketing copy - -## Requirements - -- [Firecrawl](https://firecrawl.dev) API key -- [OpenAI](https://platform.openai.com/api-keys) API key - -## Setup - -1. Install the required packages: - ``` - pip install -r requirements.txt - ``` - -2. 
Set up your environment variables in a `.env` file: - ``` - OPENAI_API_KEY=your_openai_api_key - FIRECRAWL_API_KEY=your_firecrawl_api_key - ``` - -## Usage - -Run the main script to start the interactive demo: - -``` -python main.py -``` \ No newline at end of file From a110fdeb857019284b578f72ca569d01edac07fe Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Fri, 18 Oct 2024 02:10:39 +0530 Subject: [PATCH 047/102] Update requirements.txt --- .../openai_swarm_firecrawl_web_extractor/requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/openai_swarm_firecrawl_web_extractor/requirements.txt b/examples/openai_swarm_firecrawl_web_extractor/requirements.txt index 60bc9b84..201da900 100644 --- a/examples/openai_swarm_firecrawl_web_extractor/requirements.txt +++ b/examples/openai_swarm_firecrawl_web_extractor/requirements.txt @@ -1,3 +1,5 @@ firecrawl-py openai -serpapi \ No newline at end of file +serpapi +google-search-results +git+https://github.com/openai/swarm.git From 7d8df7d53bd450247a7e8f5dd097e31986bd0654 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Fri, 18 Oct 2024 02:11:15 +0530 Subject: [PATCH 048/102] Update requirements.txt --- examples/openai_swarm_firecrawl_web_extractor/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/openai_swarm_firecrawl_web_extractor/requirements.txt b/examples/openai_swarm_firecrawl_web_extractor/requirements.txt index 201da900..d7be486c 100644 --- a/examples/openai_swarm_firecrawl_web_extractor/requirements.txt +++ b/examples/openai_swarm_firecrawl_web_extractor/requirements.txt @@ -1,5 +1,4 @@ firecrawl-py openai -serpapi google-search-results git+https://github.com/openai/swarm.git From 79e65f31ef1d7a4172870471d81501ee2e8aef22 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 17 Oct 2024 17:57:44 -0300 Subject: [PATCH 049/102] Update v1.ts --- apps/api/src/routes/v1.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index b0ceceb4..246f9b05 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -36,7 +36,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R if (!success) { Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); if (!res.headersSent) { - return res.status(402).json({ success: false, error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing." }); + return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." 
}); } } req.account = { remainingCredits }; From aed11e72a60e920659f2bd33131bd71cb179046f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:50:58 -0300 Subject: [PATCH 050/102] fix encoding if error --- apps/api/src/lib/LLM-extraction/helpers.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/api/src/lib/LLM-extraction/helpers.ts b/apps/api/src/lib/LLM-extraction/helpers.ts index f47a6b3c..2143a32d 100644 --- a/apps/api/src/lib/LLM-extraction/helpers.ts +++ b/apps/api/src/lib/LLM-extraction/helpers.ts @@ -6,7 +6,13 @@ export function numTokensFromString(message: string, model: string): number { const encoder = encoding_for_model(model as TiktokenModel); // Encode the message into tokens - const tokens = encoder.encode(message); + let tokens: Uint32Array; + try { + tokens = encoder.encode(message); + } catch (error) { + message = message.replace("<|endoftext|>", ""); + tokens = encoder.encode(message); + } // Free the encoder resources after use encoder.free(); From 18f69c90b173709382fa7c15e03119f17324bc3d Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:18:57 -0300 Subject: [PATCH 051/102] fix/missing error in response --- apps/python-sdk/firecrawl/firecrawl.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 70f677ef..41b5949a 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -81,8 +81,10 @@ class FirecrawlApp: response = response.json() if response['success'] and 'data' in response: return response['data'] - else: + elif "error" in response: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') + else: + raise Exception(f'Failed to scrape URL. Error: {response}') else: self._handle_error(response, 'scrape URL') @@ -266,8 +268,10 @@ class FirecrawlApp: response = response.json() if response['success'] and 'links' in response: return response - else: + elif 'error' in response: raise Exception(f'Failed to map URL. Error: {response["error"]}') + else: + raise Exception(f'Failed to map URL. 
Error: {response}') else: self._handle_error(response, 'map') From 10381b5d3cda7f9b635008b59cb90d61ca87c0ed Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 00:51:18 +0530 Subject: [PATCH 052/102] Create app.py --- examples/sales_web_crawler/app.py | 99 +++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 examples/sales_web_crawler/app.py diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py new file mode 100644 index 00000000..ae14fc62 --- /dev/null +++ b/examples/sales_web_crawler/app.py @@ -0,0 +1,99 @@ +import os +import csv +import json + +from dotenv import load_dotenv +from firecrawl import FirecrawlApp +from openai import OpenAI +from serpapi import GoogleSearch + +load_dotenv() + +# Initialize FirecrawlApp and OpenAI +app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def search_google(query, objective): + """Search Google using SerpAPI.""" + print(f"Parameters: query={query}, objective={objective}") + search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) + results = search.get_dict().get("organic_results", []) + return {"objective": objective, "results": results} + +def scrape_url(url, objective): + """Scrape a website using Firecrawl.""" + print(f"Parameters: url={url}, objective={objective}") + scrape_status = app.scrape_url( + url, + params={'formats': ['markdown']} + ) + return {"objective": objective, "results": scrape_status} + +def crawl_url(url, objective): + """Crawl a website using Firecrawl.""" + print(f"Parameters: url={url}, objective={objective}") + # If using a crawled url set, pass the ID in the function call below + # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d") + # scrape_status['results'] = scrape_status['data'] + scrape_status = app.crawl_url( + url, + params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}} + ) + return {"objective": objective, "results": scrape_status} + +def analyze_website_content(content, objective): + """Analyze the scraped website content using OpenAI.""" + print(f"Parameters: content={content[:50]}..., objective={objective}") + analysis = generate_completion( + "website data extractor", + f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", + "Objective: " + objective + "\nContent: " + content + ) + return {"objective": objective, "results": analysis} + +def generate_completion(role, task, content): + """Generate a completion using OpenAI.""" + print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") + response = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": f"You are a {role}. 
{task}"}, + {"role": "user", "content": content} + ] + ) + return response.choices[0].message.content + +def read_websites_from_csv(file_path): + """Read websites from a CSV file.""" + websites = [] + with open(file_path, mode='r') as file: + csv_reader = csv.DictReader(file) + for row in csv_reader: + websites.append(row['website']) + return websites + +def write_results_to_json(results, file_path): + """Write results to a JSON file.""" + with open(file_path, mode='w') as file: + json.dump(results, file, indent=4) + +def process_websites(file_path): + """Process websites from a CSV file and write results to a new JSON file.""" + results = [] + websites = read_websites_from_csv(file_path) + for website in websites: + search_results = search_google(website, "Search website") + if search_results['results']: + top_result = search_results['results'][0] + url = top_result['link'] + crawl_results = crawl_url(url, "Crawl website") + if crawl_results['results']: + for each_result in crawl_results['results']['data'][:2]: + analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.") + print(analysis_results['results']) + results.append(json.loads(analysis_results['results'])) + write_results_to_json(results, 'enriched_data.json') + +if __name__ == "__main__": + # Process websites from the CSV file + process_websites('websites.csv') From 11fd630e55128b40c56e3768308db056bed2e9a5 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 00:52:14 +0530 Subject: [PATCH 053/102] Create requirements.txt --- examples/sales_web_crawler/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 examples/sales_web_crawler/requirements.txt diff --git a/examples/sales_web_crawler/requirements.txt b/examples/sales_web_crawler/requirements.txt new file mode 100644 index 00000000..685c8e33 --- /dev/null +++ b/examples/sales_web_crawler/requirements.txt @@ -0,0 +1,3 @@ +firecrawl-py +openai +google-search-results From adfc493c9b5cf22e692f0c456c68e8c6f71b9d53 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 00:52:26 +0530 Subject: [PATCH 054/102] Create websites.csv --- examples/sales_web_crawler/websites.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 examples/sales_web_crawler/websites.csv diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv new file mode 100644 index 00000000..32bee52d --- /dev/null +++ b/examples/sales_web_crawler/websites.csv @@ -0,0 +1,2 @@ +website +https://www.launchfa.st From ba3ee8ead6c5b704d0305f5e4e49539646b7d9ea Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 00:52:47 +0530 Subject: [PATCH 055/102] Create .env.example --- examples/sales_web_crawler/.env.example | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 examples/sales_web_crawler/.env.example diff --git a/examples/sales_web_crawler/.env.example b/examples/sales_web_crawler/.env.example new file mode 100644 index 00000000..06ccc66d --- /dev/null +++ b/examples/sales_web_crawler/.env.example @@ -0,0 +1,3 @@ +OPENAI_API_KEY= +FIRECRAWL_API_KEY= +SERP_API_KEY= From f5af938ea29eae582aba97bafeef1292c29b14fe Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 02:27:17 +0530 Subject: [PATCH 056/102] Update requirements.txt --- examples/sales_web_crawler/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/sales_web_crawler/requirements.txt b/examples/sales_web_crawler/requirements.txt index 
685c8e33..180c5d6c 100644 --- a/examples/sales_web_crawler/requirements.txt +++ b/examples/sales_web_crawler/requirements.txt @@ -1,3 +1,4 @@ firecrawl-py openai google-search-results +tqdm From 2022db7f0a3824abbab452bf957c2ec867b8a13a Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 02:27:25 +0530 Subject: [PATCH 057/102] Update websites.csv --- examples/sales_web_crawler/websites.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv index 32bee52d..eef3403e 100644 --- a/examples/sales_web_crawler/websites.csv +++ b/examples/sales_web_crawler/websites.csv @@ -1,2 +1,2 @@ website -https://www.launchfa.st +https://precog.iiit.ac.in/ From 7d8519218ae2ed674fd7aa6995fe94221ad0de73 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 02:27:39 +0530 Subject: [PATCH 058/102] Update app.py --- examples/sales_web_crawler/app.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py index ae14fc62..f76280e9 100644 --- a/examples/sales_web_crawler/app.py +++ b/examples/sales_web_crawler/app.py @@ -1,11 +1,13 @@ -import os import csv import json +import os +import uuid from dotenv import load_dotenv from firecrawl import FirecrawlApp from openai import OpenAI from serpapi import GoogleSearch +from tqdm import tqdm load_dotenv() @@ -15,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) def search_google(query, objective): """Search Google using SerpAPI.""" - print(f"Parameters: query={query}, objective={objective}") + # print(f"Parameters: query={query}, objective={objective}") search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) results = search.get_dict().get("organic_results", []) return {"objective": objective, "results": results} def scrape_url(url, objective): """Scrape a website using Firecrawl.""" - print(f"Parameters: url={url}, objective={objective}") + # print(f"Parameters: url={url}, objective={objective}") scrape_status = app.scrape_url( url, params={'formats': ['markdown']} @@ -31,19 +33,19 @@ def scrape_url(url, objective): def crawl_url(url, objective): """Crawl a website using Firecrawl.""" - print(f"Parameters: url={url}, objective={objective}") + # print(f"Parameters: url={url}, objective={objective}") # If using a crawled url set, pass the ID in the function call below # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d") # scrape_status['results'] = scrape_status['data'] scrape_status = app.crawl_url( url, - params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}} + params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}} ) return {"objective": objective, "results": scrape_status} def analyze_website_content(content, objective): """Analyze the scraped website content using OpenAI.""" - print(f"Parameters: content={content[:50]}..., objective={objective}") + # print(f"Parameters: content={content[:50]}..., objective={objective}") analysis = generate_completion( "website data extractor", f"Analyze the following website content and extract a JSON object based on the objective. 
Do not write the ```json and ``` to denote a JSON when returning a response", @@ -53,7 +55,7 @@ def analyze_website_content(content, objective): def generate_completion(role, task, content): """Generate a completion using OpenAI.""" - print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") + # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") response = client.chat.completions.create( model="gpt-4o", messages=[ @@ -86,13 +88,18 @@ def process_websites(file_path): if search_results['results']: top_result = search_results['results'][0] url = top_result['link'] + unique_filename = f'output_{uuid.uuid4()}.json' crawl_results = crawl_url(url, "Crawl website") if crawl_results['results']: - for each_result in crawl_results['results']['data'][:2]: - analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.") - print(analysis_results['results']) - results.append(json.loads(analysis_results['results'])) - write_results_to_json(results, 'enriched_data.json') + for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"): + analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.") + try: + result = json.loads(analysis_results['results']) + if result: + results.append(result) + write_results_to_json(results, unique_filename) + except: + continue if __name__ == "__main__": # Process websites from the CSV file From 42ec08c76ea8ae1d5e9228cc072a52af2ab301e1 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 03:53:41 +0530 Subject: [PATCH 059/102] Update websites.csv --- examples/sales_web_crawler/websites.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv index eef3403e..f029ccfc 100644 --- a/examples/sales_web_crawler/websites.csv +++ b/examples/sales_web_crawler/websites.csv @@ -1,2 +1,2 @@ website -https://precog.iiit.ac.in/ +https://www.media.mit.edu/ From 8a4ee4482d703bf5b7b45aeb2027a6482b2a211c Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sat, 19 Oct 2024 03:54:14 +0530 Subject: [PATCH 060/102] Create output_01f6efd5-1297-4745-94b5-5972c10f17d6.json --- ..._01f6efd5-1297-4745-94b5-5972c10f17d6.json | 630 ++++++++++++++++++ 1 file changed, 630 insertions(+) create mode 100644 examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json diff --git a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json new file mode 100644 index 00000000..8f1f5bd8 --- /dev/null +++ b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json @@ -0,0 +1,630 @@ +[ + { + "contacts": [ + { + "name": "Canan Dagdeviren", + "email": null, + "title": null, + "company": null + }, + { + "name": "Media Lab Communications", + "email": "press@media.mit.edu", + "title": null, + "company": "MIT Media Lab" + } + ] + }, + { + "people": [ + { + "name": "Xan Foote", + "title": "Group Contact", + "email": "fluidadmin@media.mit.edu" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "title": "Fluid Interfaces" + } + ] + }, + { + "emails": [], + "people": [ + { + "name": "Personal Robots", + "title": "Group", + "company": "MIT Media Lab" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "title": "Personal Robots group" + } + ] + }, + { + 
"people": [ + { + "name": "David Sweeney", + "title": "Author" + }, + { + "name": "Rosalind W. Picard", + "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" + }, + { + "name": "Pattie Maes", + "title": "Professor of Media Technology; Germeshausen Professor" + }, + { + "name": "Hugh Herr", + "title": "Professor of Media Arts and Sciences" + }, + { + "name": "Deblina Sarkar", + "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" + }, + { + "name": "Canan Dagdeviren", + "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" + }, + { + "name": "Dava Newman", + "title": "Director; Apollo Professor of Astronautics" + }, + { + "name": "Cynthia Breazeal", + "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" + }, + { + "name": "Susan Blumenthal, MD", + "title": "Visiting Professor; Director's Circle Member" + } + ], + "emails": [], + "companies": [] + }, + { + "people": [ + { + "name": "Dan Blondell", + "title": "I2" + } + ], + "companies": [], + "emails": [] + }, + { + "people": [ + { + "name": "Canan Dagdeviren", + "title": "Copyright Holder" + }, + { + "name": "Jonathan Williams", + "title": "Copyright Holder" + }, + { + "name": "Sara V. Fernandez", + "title": "Courtesy of" + }, + { + "name": "Irmandy Wicaksono", + "title": "Courtesy of" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "title": "Interdisciplinary Research Institution" + } + ], + "emails": [] + }, + { + "people": [ + { + "name": "David Sweeney", + "title": null + }, + { + "name": "Sarah Beckmann", + "title": null + }, + { + "name": "Behnaz Farahi", + "title": "Assistant Professor, Transformative Design" + }, + { + "name": "Paul Liang", + "title": "Assistant Professor, AI + Human Experience" + }, + { + "name": "Rosalind W. Picard", + "title": null + }, + { + "name": "Guillermo Herrera-Arcos", + "title": null + }, + { + "name": "Christine Higgins", + "title": null + }, + { + "name": "Patrick Chwalek", + "title": null + }, + { + "name": "Sarra Shubart", + "title": null + }, + { + "name": "Amanda Diehl", + "title": null + }, + { + "name": "Chia Evers", + "title": null + }, + { + "name": "Matthew Groh", + "title": null + }, + { + "name": "Cl\u00e9mence Taillandier", + "title": null + }, + { + "name": "Cody Paige", + "title": null + }, + { + "name": "Minoo Rathnasabapathy", + "title": null + }, + { + "name": "Alex Berke", + "title": null + } + ], + "emails": [ + "web-admin@media.mit.edu" + ], + "companies": [ + { + "name": "MIT Media Lab" + }, + { + "name": "Samsung" + }, + { + "name": "Castrol" + } + ] + }, + { + "people": [ + { + "name": "Tod Machover", + "title": "Opera Composer" + } + ], + "companies": [ + { + "name": "Future Worlds", + "title": "Design and action for the future we want to live in" + }, + { + "name": "NOAA", + "title": "The Challenge: To secure a sustainable future for all living things" + }, + { + "name": "MIT Media Lab", + "title": "Research and development in interdisciplinary expertise" + } + ] + }, + { + "emails": [ + "r-admin@media.mit.edu" + ], + "people": [ + { + "name": "Affective Computing group", + "title": "MIT Media Lab" + } + ], + "companies": [] + }, + { + "people": [ + { + "name": "David Sweeney", + "email": null, + "title": "Author at Samsung Newsroom" + }, + { + "name": "Pattie Maes", + "email": null, + "title": "Professor of Media Technology; Germeshausen Professor" + }, + { + "name": "Rosalind W. 
Picard", + "email": null, + "title": "Professor of Media Arts and Sciences; Grover M. Hermann Professor in Health Sciences and Technology" + } + ], + "companies": [ + { + "name": "Samsung", + "email": null, + "title": "Collaborator" + }, + { + "name": "MIT Media Lab", + "email": null, + "title": "Collaborator" + } + ] + }, + { + "people": [ + { + "name": "Canan Dagdeviren", + "title": null + }, + { + "name": "Jonathan Williams", + "title": null + }, + { + "name": "Sara V. Fernandez", + "title": null + }, + { + "name": "Irmandy Wicaksono", + "title": null + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "title": null + } + ], + "emails": [] + }, + { + "people": [], + "emails": [], + "companies": [], + "titles": [] + }, + { + "emails": [], + "people": [ + { + "name": "Andy Ryan", + "title": "Photographer", + "company": "MIT Media Lab" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "department": "Program in Media Arts and Sciences" + }, + { + "name": "MIT", + "department": "Center for Bits and Atoms" + } + ] + }, + { + "people": [ + { + "name": "Dan Allen", + "title": "Media Lab" + } + ], + "companies": [ + { + "name": "MIT Media Lab" + }, + { + "name": "Castrol" + } + ], + "emails": [] + }, + { + "people": [ + { + "name": "Pat Pataranutaporn", + "title": "Former Graduate Student" + }, + { + "name": "Pattie Maes", + "title": "Professor of Media Technology; Germeshausen Professor" + }, + { + "name": "Kavin Winson", + "title": "Researcher at KASIKORN Labs" + }, + { + "name": "Peggy Yin", + "title": "Harvard University Undergraduate" + }, + { + "name": "Auttasak Lapapirojn", + "title": "KASIKORN Labs" + }, + { + "name": "Pichayoot Ouppaphan", + "title": "KASIKORN Labs" + }, + { + "name": "Monchai Lertsutthiwong", + "title": "Head of AI Research at KASIKORN Business-Technology Group" + }, + { + "name": "Hal Hershfield", + "title": "Professor of Marketing, Behavioral Decision Making, and Psychology at the University of California at Los Angeles" + }, + { + "name": "Jeremy Bailenson", + "title": "Thomas More Storke Professor of Communication at Stanford University" + }, + { + "name": "Thanawit Prasongpongchai", + "title": "Designer at KBTG and Visiting Scientist at the Media Lab" + } + ], + "companies": [ + { + "name": "MIT", + "role": "AI and simulation research" + }, + { + "name": "KASIKORN Labs", + "role": "Research and co-authorship" + }, + { + "name": "KASIKORN Business-Technology Group", + "role": "AI research support" + } + ] + }, + { + "people": [ + { + "name": "Andy Ryan", + "title": "Copyright" + } + ], + "companies": [ + { + "name": "MIT Media Lab", + "collaborator": "Castrol", + "project": "Space Research" + } + ], + "emails": [] + }, + { + "people": [ + { + "name": "Fadel Adib", + "title": "Associate Professor of Media Arts and Sciences" + }, + { + "name": "Edward Boyden", + "title": "Professor of Media Arts and Sciences; Y. 
Eva Tan Professor in Neurotechnology" + }, + { + "name": "Cynthia Breazeal", + "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" + }, + { + "name": "Canan Dagdeviren", + "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" + }, + { + "name": "Kevin Esvelt", + "title": "Associate Professor of Media Arts and Sciences; NEC Career Development Professor of Computer and Communications" + }, + { + "name": "Behnaz Farahi", + "title": "Assistant Professor of Media Arts and Sciences; Asahi Broadcast Corp Career Development Assistant Professor" + }, + { + "name": "Hugh Herr", + "title": "Professor of Media Arts and Sciences" + }, + { + "name": "Hiroshi Ishii", + "title": "Jerome B. Wiesner Professor of Media Arts and Sciences; Associate Director, MIT Media Lab" + }, + { + "name": "Joseph M. Jacobson", + "title": "Associate Professor of Media Arts and Sciences" + }, + { + "name": "Kent Larson", + "title": "Professor of the Practice" + }, + { + "name": "Paul Pu Liang", + "title": "Assistant Professor of Media Arts and Sciences; Assistant Professor of Electrical Engineering and Computer Science" + }, + { + "name": "Zach Lieberman", + "title": "Adjunct Associate Professor of Media Arts and Sciences" + }, + { + "name": "Andrew Lippman", + "title": "Senior Research Scientist" + }, + { + "name": "Tod Machover", + "title": "Muriel R. Cooper Professor of Music and Media; Academic Head, Program in Media Arts and Sciences" + }, + { + "name": "Pattie Maes", + "title": "Professor of Media Technology; Germeshausen Professor" + }, + { + "name": "Dava Newman", + "title": "Director; Apollo Professor of Astronautics" + }, + { + "name": "Joseph A. Paradiso", + "title": "Alexander W Dreyfoos (1954) Professor; Associate Academic Head, Program in Media Arts and Sciences" + }, + { + "name": "Alex 'Sandy' Pentland", + "title": "Professor Post Tenure of Media Arts and Sciences" + }, + { + "name": "Rosalind W. Picard", + "title": "Professor of Media Arts and Sciences; Grover M. 
Hermann Professor in Health Sciences and Technology" + }, + { + "name": "Ramesh Raskar", + "title": "Associate Professor of Media Arts and Sciences" + }, + { + "name": "Mitchel Resnick", + "title": "LEGO Papert Professor of Learning Research" + }, + { + "name": "Deb Roy", + "title": "Professor of Media Arts and Sciences" + }, + { + "name": "Deblina Sarkar", + "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" + }, + { + "name": "Danielle Wood", + "title": "Associate Professor of Media Arts and Sciences; Associate Professor (Joint) of Aeronautics and Astronautics" + } + ], + "emails": [], + "companies": [] + }, + { + "people": [ + { + "name": "Canan Dagdeviren", + "title": "Individual", + "email": null + }, + { + "name": "Jonathan Williams", + "title": "Individual", + "email": null + } + ], + "companies": [], + "emails": [] + }, + { + "people": [ + { + "name": "Dava Newman", + "title": "Media Lab Director" + }, + { + "name": "Xin Liu", + "title": "Media Lab Alum" + } + ], + "companies": [ + { + "name": "MIT Media Lab" + }, + { + "name": "Boston Museum of Science" + } + ], + "emails": [] + }, + { + "people": [ + { + "name": "Behnaz Farahi", + "title": "Assistant Professor, Transformative Design", + "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS)" + }, + { + "name": "Paul Liang", + "title": "Assistant Professor, AI + Human Experience", + "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS) and MIT Schwarzman College of Computing" + }, + { + "name": "Barmak Heshmat", + "title": "Co-founder", + "company": "Brelyon" + }, + { + "name": "Mohammad Tariqul Islam", + "title": "MIT-Novo Nordisk Artificial Intelligence Postdoctoral Fellow" + }, + { + "name": "Hao-Tung Yang", + "title": "Recipient of the T.S. 
Lin Fellowship Award" + }, + { + "name": "Deblina Sarkar", + "title": "Recipient of NSF CAREER Award and ChadTough New Investigator Award" + }, + { + "name": "Danielle Wood", + "title": "2024 Just Tech Fellow" + }, + { + "name": "Baju Joy", + "title": "Whitaker Health Sciences Fellowship Award Recipient" + }, + { + "name": "Max Addae", + "title": "2024 Guthman Musical Instrument Competition Winner" + }, + { + "name": "Tod Machover", + "title": "Head of Opera of the Future", + "affiliation": "MIT" + }, + { + "name": "Sharif Islam", + "title": "ESIP Community Fellow and Future Earth Coasts Fellow", + "affiliation": "Postdoctoral associate in the Space Enabled research group" + } + ], + "companies": [ + { + "name": "Samsung", + "collaboration": "MIT Media Lab" + }, + { + "name": "Brelyon", + "co_founder": "Barmak Heshmat" + }, + { + "name": "Castrol", + "collaboration": "AstroAnt Payload Program" + }, + { + "name": "Augmental", + "product": "Mouth-based touchpad" + } + ], + "email_addresses": [] + } +] From 7acd8d2edb6abc45a63fe1060377d2acb398ec36 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 19 Oct 2024 13:27:47 -0300 Subject: [PATCH 061/102] Nick: improved map ranking algorithm --- apps/api/src/controllers/v1/map.ts | 13 ++++++++++++- apps/api/src/controllers/v1/types.ts | 8 ++++++++ apps/api/src/lib/map-cosine.ts | 15 ++++++++------- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 5ed3dd51..112c06b4 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -2,6 +2,7 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; import { legacyCrawlerOptions, + LinkInfo, mapRequestSchema, RequestWithAuth, } from "./types"; @@ -109,6 +110,10 @@ export async function mapController( mapResults = mapResults.slice(0, minumumCutoff); } + + + let linkInfos: LinkInfo[] = []; + if (mapResults.length > 0) { if (req.body.search) { // Ensure all map results are first, maintaining their order @@ -117,6 +122,12 @@ export async function mapController( ...mapResults.slice(1).map((x) => x.url), ...links, ]; + + linkInfos = [ + mapResults[0], + ...mapResults.slice(1), + ...links.map((x) => ({ url: x })), + ] } else { mapResults.map((x) => { links.push(x.url); @@ -128,7 +139,7 @@ export async function mapController( if (req.body.search) { const searchQuery = req.body.search.toLowerCase(); - links = performCosineSimilarity(links, searchQuery); + links = performCosineSimilarity(linkInfos, searchQuery); } links = links diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 033de6e0..91618e1a 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -478,3 +478,11 @@ export function legacyDocumentConverter(doc: any): Document { }, }; } + + + +export interface LinkInfo { + url: string; + title?: string; + description?: string; +} \ No newline at end of file diff --git a/apps/api/src/lib/map-cosine.ts b/apps/api/src/lib/map-cosine.ts index db2491a9..8804aa58 100644 --- a/apps/api/src/lib/map-cosine.ts +++ b/apps/api/src/lib/map-cosine.ts @@ -1,6 +1,7 @@ import { Logger } from "./logger"; +import { LinkInfo } from "../controllers/v1/types"; -export function performCosineSimilarity(links: string[], searchQuery: string) { +export function performCosineSimilarity(links: LinkInfo[], searchQuery: string) { try { // Function to calculate cosine similarity const cosineSimilarity = (vec1: number[], vec2: 
number[]): number => { @@ -27,20 +28,20 @@ export function performCosineSimilarity(links: string[], searchQuery: string) { // Calculate similarity scores const similarityScores = links.map((link) => { - const linkVector = textToVector(link); + const linkText = `${link.url} ${link.title || ''} ${link.description || ''}`.trim(); + const linkVector = textToVector(linkText); const searchVector = textToVector(searchQuery); return cosineSimilarity(linkVector, searchVector); }); - // Sort links based on similarity scores and print scores - const a = links + // Sort links based on similarity scores + const sortedLinks = links .map((link, index) => ({ link, score: similarityScores[index] })) .sort((a, b) => b.score - a.score); - links = a.map((item) => item.link); - return links; + return sortedLinks.map((item) => item.link.url); } catch (error) { Logger.error(`Error performing cosine similarity: ${error}`); - return links; + return links.map(link => link.url); } } From 2b0c52ff671427ecbbcfa8473a96ec0a98dc1920 Mon Sep 17 00:00:00 2001 From: Mayur Kawale <122032765+Mefisto04@users.noreply.github.com> Date: Sun, 20 Oct 2024 12:33:45 +0530 Subject: [PATCH 062/102] Update SELF_HOST.md --- SELF_HOST.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index dc8a1cf9..b86ddf3e 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -36,7 +36,7 @@ Self-hosting Firecrawl is ideal for those who need full control over their scrap Create an `.env` in the root directory you can copy over the template in `apps/api/.env.example` -To start, we wont set up authentication, or any optional sub services (pdf parsing, JS blocking support, AI features) +To start, we won't set up authentication or any optional subservices (pdf parsing, JS blocking support, AI features) `.env:` ``` @@ -47,7 +47,7 @@ HOST=0.0.0.0 REDIS_URL=redis://redis:6379 REDIS_RATE_LIMIT_URL=redis://redis:6379 -## To turn on DB authentication, you need to set up supabase. +## To turn on DB authentication, you need to set up Supabase. USE_DB_AUTHENTICATION=false # ===== Optional ENVS ====== @@ -59,8 +59,8 @@ SUPABASE_SERVICE_TOKEN= # Other Optionals TEST_API_KEY= # use if you've set up authentication and want to test with a real API key -SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking -OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) +SCRAPING_BEE_API_KEY= # use if you'd like to use ScrapingBee to handle JS blocking +OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation) BULL_AUTH_KEY= @ LOGTAIL_KEY= # Use if you're configuring basic logging with logtail PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback @@ -176,4 +176,4 @@ By addressing these common issues, you can ensure a smoother setup and operation ## Install Firecrawl on a Kubernetes Cluster (Simple Version) -Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster. \ No newline at end of file +Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster. 
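
A quick way to sanity-check a self-hosted instance once the `.env` above is filled in and the stack is running (e.g. via the repository's docker compose setup) is to point the Python SDK used elsewhere in this repo at the local API port. This is only a sketch: the `api_url` argument, the placeholder API key, and the example page are assumptions about your deployment, not something this guide prescribes.

```python
# Minimal sanity check against a self-hosted instance; PORT=3002 comes from the
# .env above. The api_url argument and placeholder key are assumptions -- with
# USE_DB_AUTHENTICATION=false no real key is needed.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="self-hosted-test", api_url="http://localhost:3002")

# Scrape a single page and print the returned result
result = app.scrape_url("https://example.com", params={'formats': ['markdown']})
print(result)
```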
From d113199a297a98e0b13a4438838486bb2f21f736 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sun, 20 Oct 2024 18:08:38 +0530 Subject: [PATCH 063/102] Update app.py --- examples/sales_web_crawler/app.py | 92 +++++++++++++++++++------------ 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py index f76280e9..842e1345 100644 --- a/examples/sales_web_crawler/app.py +++ b/examples/sales_web_crawler/app.py @@ -1,13 +1,13 @@ import csv import json import os -import uuid from dotenv import load_dotenv from firecrawl import FirecrawlApp from openai import OpenAI from serpapi import GoogleSearch -from tqdm import tqdm +from swarm import Agent +from swarm.repl import run_demo_loop load_dotenv() @@ -17,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) def search_google(query, objective): """Search Google using SerpAPI.""" - # print(f"Parameters: query={query}, objective={objective}") + print(f"Parameters: query={query}, objective={objective}") search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) results = search.get_dict().get("organic_results", []) return {"objective": objective, "results": results} def scrape_url(url, objective): """Scrape a website using Firecrawl.""" - # print(f"Parameters: url={url}, objective={objective}") + print(f"Parameters: url={url}, objective={objective}") scrape_status = app.scrape_url( url, params={'formats': ['markdown']} @@ -33,29 +33,29 @@ def scrape_url(url, objective): def crawl_url(url, objective): """Crawl a website using Firecrawl.""" - # print(f"Parameters: url={url}, objective={objective}") + print(f"Parameters: url={url}, objective={objective}") # If using a crawled url set, pass the ID in the function call below # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d") # scrape_status['results'] = scrape_status['data'] scrape_status = app.crawl_url( url, - params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}} + params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}} ) return {"objective": objective, "results": scrape_status} def analyze_website_content(content, objective): """Analyze the scraped website content using OpenAI.""" - # print(f"Parameters: content={content[:50]}..., objective={objective}") + print(f"Parameters: content={content[:50]}..., objective={objective}") analysis = generate_completion( "website data extractor", f"Analyze the following website content and extract a JSON object based on the objective. 
Do not write the ```json and ``` to denote a JSON when returning a response", "Objective: " + objective + "\nContent: " + content ) - return {"objective": objective, "results": analysis} + return {"objective": objective, "results": json.loads(analysis)} def generate_completion(role, task, content): """Generate a completion using OpenAI.""" - # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") + print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...") response = client.chat.completions.create( model="gpt-4o", messages=[ @@ -76,31 +76,55 @@ def read_websites_from_csv(file_path): def write_results_to_json(results, file_path): """Write results to a JSON file.""" - with open(file_path, mode='w') as file: - json.dump(results, file, indent=4) + with open(file_path, mode='w', encoding='utf-8') as file: + json.dump(json.loads(results), file, ensure_ascii=False) -def process_websites(file_path): - """Process websites from a CSV file and write results to a new JSON file.""" - results = [] - websites = read_websites_from_csv(file_path) - for website in websites: - search_results = search_google(website, "Search website") - if search_results['results']: - top_result = search_results['results'][0] - url = top_result['link'] - unique_filename = f'output_{uuid.uuid4()}.json' - crawl_results = crawl_url(url, "Crawl website") - if crawl_results['results']: - for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"): - analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.") - try: - result = json.loads(analysis_results['results']) - if result: - results.append(result) - write_results_to_json(results, unique_filename) - except: - continue +def handoff_to_search_google(): + """Hand off the search query to the search google agent.""" + return google_search_agent + +def handoff_to_map_url(): + """Hand off the url to the map url agent.""" + return crawl_website_agent + +def handoff_to_analyst(): + """Hand off the website content to the analyst agent.""" + return analyst_agent + +def handoff_to_writer(): + """Hand off the results to the writer agent.""" + return writer_agent + +user_interface_agent = Agent( + name="User Interface Agent", + instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. Be concise.", + functions=[read_websites_from_csv, handoff_to_search_google], +) + +google_search_agent = Agent( + name="Google Search Agent", + instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.", + functions=[search_google, handoff_to_map_url], +) + +crawl_website_agent = Agent( + name="Crawl Website Agent", + instructions="You are a crawl url agent specialized in crawling the web pages. When you are done, you must hand off the results to the analyst agent.", + functions=[crawl_url, handoff_to_analyst], +) + +analyst_agent = Agent( + name="Analyst Agent", + instructions="You are an analyst agent that examines website content and returns a JSON object. 
When you are done, you must hand off the results to the writer agent.", + functions=[analyze_website_content, handoff_to_writer], +) + +writer_agent = Agent( + name="Writer Agent", + instructions="You are a writer agent that writes the final results to a JSON file.", + functions=[write_results_to_json], +) if __name__ == "__main__": - # Process websites from the CSV file - process_websites('websites.csv') + # Run the demo loop with the user interface agent + run_demo_loop(user_interface_agent, stream=True) From cf98d69bbbf5e8d9afd546ef7dff74373bce7249 Mon Sep 17 00:00:00 2001 From: Rishi Raj Jain Date: Sun, 20 Oct 2024 18:09:38 +0530 Subject: [PATCH 064/102] Update requirements.txt --- examples/sales_web_crawler/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sales_web_crawler/requirements.txt b/examples/sales_web_crawler/requirements.txt index 180c5d6c..d7be486c 100644 --- a/examples/sales_web_crawler/requirements.txt +++ b/examples/sales_web_crawler/requirements.txt @@ -1,4 +1,4 @@ firecrawl-py openai google-search-results -tqdm +git+https://github.com/openai/swarm.git From e1d8e1584e79d44702cf7c487232fa80d233eb00 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 21 Oct 2024 12:23:27 -0300 Subject: [PATCH 065/102] Update SELF_HOST.md --- SELF_HOST.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index b86ddf3e..78228485 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -59,7 +59,7 @@ SUPABASE_SERVICE_TOKEN= # Other Optionals TEST_API_KEY= # use if you've set up authentication and want to test with a real API key -SCRAPING_BEE_API_KEY= # use if you'd like to use ScrapingBee to handle JS blocking +SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation) BULL_AUTH_KEY= @ LOGTAIL_KEY= # Use if you're configuring basic logging with logtail From 22d375ad293296c3533c2195bd6be9a3fbb841ad Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 21 Oct 2024 12:01:09 -0400 Subject: [PATCH 066/102] Updates --- examples/sales_web_crawler/app.py | 116 +--- ..._01f6efd5-1297-4745-94b5-5972c10f17d6.json | 630 ------------------ examples/sales_web_crawler/websites.csv | 2 - 3 files changed, 32 insertions(+), 716 deletions(-) delete mode 100644 examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json delete mode 100644 examples/sales_web_crawler/websites.csv diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py index 842e1345..70063071 100644 --- a/examples/sales_web_crawler/app.py +++ b/examples/sales_web_crawler/app.py @@ -15,43 +15,35 @@ load_dotenv() app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -def search_google(query, objective): - """Search Google using SerpAPI.""" - print(f"Parameters: query={query}, objective={objective}") - search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) - results = search.get_dict().get("organic_results", []) - return {"objective": objective, "results": results} - -def scrape_url(url, objective): - """Scrape a website using Firecrawl.""" +def crawl_and_analyze_url(url, objective): + """Crawl a website using Firecrawl and analyze the content.""" print(f"Parameters: url={url}, objective={objective}") - scrape_status = app.scrape_url( + # Crawl the website + crawl_status = app.crawl_url( url, - params={'formats': ['markdown']} + params={'limit': 10, 'scrapeOptions': 
{'formats': ['markdown']}}, + poll_interval=5 ) - return {"objective": objective, "results": scrape_status} - -def crawl_url(url, objective): - """Crawl a website using Firecrawl.""" - print(f"Parameters: url={url}, objective={objective}") - # If using a crawled url set, pass the ID in the function call below - # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d") - # scrape_status['results'] = scrape_status['data'] - scrape_status = app.crawl_url( - url, - params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}} - ) - return {"objective": objective, "results": scrape_status} - -def analyze_website_content(content, objective): - """Analyze the scraped website content using OpenAI.""" - print(f"Parameters: content={content[:50]}..., objective={objective}") - analysis = generate_completion( - "website data extractor", - f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", - "Objective: " + objective + "\nContent: " + content - ) - return {"objective": objective, "results": json.loads(analysis)} + crawl_status = crawl_status['data'] + # Process each 'markdown' element individually + combined_results = [] + for item in crawl_status: + if 'markdown' in item: + content = item['markdown'] + # Analyze the content + analysis = generate_completion( + "website data extractor", + f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response", + "Objective: " + objective + "\nContent: " + content + ) + # Parse the JSON result + try: + result = json.loads(analysis) + combined_results.append(result) + except json.JSONDecodeError: + print(f"Could not parse JSON from analysis: {analysis}") + # Combine the results + return {"objective": objective, "results": combined_results} def generate_completion(role, task, content): """Generate a completion using OpenAI.""" @@ -65,64 +57,20 @@ def generate_completion(role, task, content): ) return response.choices[0].message.content -def read_websites_from_csv(file_path): - """Read websites from a CSV file.""" - websites = [] - with open(file_path, mode='r') as file: - csv_reader = csv.DictReader(file) - for row in csv_reader: - websites.append(row['website']) - return websites - -def write_results_to_json(results, file_path): - """Write results to a JSON file.""" - with open(file_path, mode='w', encoding='utf-8') as file: - json.dump(json.loads(results), file, ensure_ascii=False) - -def handoff_to_search_google(): - """Hand off the search query to the search google agent.""" - return google_search_agent - -def handoff_to_map_url(): - """Hand off the url to the map url agent.""" +def handoff_to_crawl_url(): + """Hand off the url to the crawl url agent.""" return crawl_website_agent -def handoff_to_analyst(): - """Hand off the website content to the analyst agent.""" - return analyst_agent - -def handoff_to_writer(): - """Hand off the results to the writer agent.""" - return writer_agent - user_interface_agent = Agent( name="User Interface Agent", - instructions="You are a user interface agent that handles all interactions with the user. You need to always start with reading a CSV, then perform web data extraction objective that the user wants to achieve by searching the web, crawling the website URL, and extracting the content from a specific page. 
Be concise.", - functions=[read_websites_from_csv, handoff_to_search_google], -) - -google_search_agent = Agent( - name="Google Search Agent", - instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the crawl agent.", - functions=[search_google, handoff_to_map_url], + instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.", + functions=[handoff_to_crawl_url], ) crawl_website_agent = Agent( name="Crawl Website Agent", - instructions="You are a crawl url agent specialized in crawling the web pages. When you are done, you must hand off the results to the analyst agent.", - functions=[crawl_url, handoff_to_analyst], -) - -analyst_agent = Agent( - name="Analyst Agent", - instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must hand off the results to the writer agent.", - functions=[analyze_website_content, handoff_to_writer], -) - -writer_agent = Agent( - name="Writer Agent", - instructions="You are a writer agent that writes the final results to a JSON file.", - functions=[write_results_to_json], + instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.", + functions=[crawl_and_analyze_url], ) if __name__ == "__main__": diff --git a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json b/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json deleted file mode 100644 index 8f1f5bd8..00000000 --- a/examples/sales_web_crawler/output_01f6efd5-1297-4745-94b5-5972c10f17d6.json +++ /dev/null @@ -1,630 +0,0 @@ -[ - { - "contacts": [ - { - "name": "Canan Dagdeviren", - "email": null, - "title": null, - "company": null - }, - { - "name": "Media Lab Communications", - "email": "press@media.mit.edu", - "title": null, - "company": "MIT Media Lab" - } - ] - }, - { - "people": [ - { - "name": "Xan Foote", - "title": "Group Contact", - "email": "fluidadmin@media.mit.edu" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": "Fluid Interfaces" - } - ] - }, - { - "emails": [], - "people": [ - { - "name": "Personal Robots", - "title": "Group", - "company": "MIT Media Lab" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": "Personal Robots group" - } - ] - }, - { - "people": [ - { - "name": "David Sweeney", - "title": "Author" - }, - { - "name": "Rosalind W. Picard", - "title": "Professor of Media Arts and Sciences; Grover M. 
Hermann Professor in Health Sciences and Technology" - }, - { - "name": "Pattie Maes", - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Hugh Herr", - "title": "Professor of Media Arts and Sciences" - }, - { - "name": "Deblina Sarkar", - "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" - }, - { - "name": "Canan Dagdeviren", - "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" - }, - { - "name": "Dava Newman", - "title": "Director; Apollo Professor of Astronautics" - }, - { - "name": "Cynthia Breazeal", - "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" - }, - { - "name": "Susan Blumenthal, MD", - "title": "Visiting Professor; Director's Circle Member" - } - ], - "emails": [], - "companies": [] - }, - { - "people": [ - { - "name": "Dan Blondell", - "title": "I2" - } - ], - "companies": [], - "emails": [] - }, - { - "people": [ - { - "name": "Canan Dagdeviren", - "title": "Copyright Holder" - }, - { - "name": "Jonathan Williams", - "title": "Copyright Holder" - }, - { - "name": "Sara V. Fernandez", - "title": "Courtesy of" - }, - { - "name": "Irmandy Wicaksono", - "title": "Courtesy of" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": "Interdisciplinary Research Institution" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "David Sweeney", - "title": null - }, - { - "name": "Sarah Beckmann", - "title": null - }, - { - "name": "Behnaz Farahi", - "title": "Assistant Professor, Transformative Design" - }, - { - "name": "Paul Liang", - "title": "Assistant Professor, AI + Human Experience" - }, - { - "name": "Rosalind W. Picard", - "title": null - }, - { - "name": "Guillermo Herrera-Arcos", - "title": null - }, - { - "name": "Christine Higgins", - "title": null - }, - { - "name": "Patrick Chwalek", - "title": null - }, - { - "name": "Sarra Shubart", - "title": null - }, - { - "name": "Amanda Diehl", - "title": null - }, - { - "name": "Chia Evers", - "title": null - }, - { - "name": "Matthew Groh", - "title": null - }, - { - "name": "Cl\u00e9mence Taillandier", - "title": null - }, - { - "name": "Cody Paige", - "title": null - }, - { - "name": "Minoo Rathnasabapathy", - "title": null - }, - { - "name": "Alex Berke", - "title": null - } - ], - "emails": [ - "web-admin@media.mit.edu" - ], - "companies": [ - { - "name": "MIT Media Lab" - }, - { - "name": "Samsung" - }, - { - "name": "Castrol" - } - ] - }, - { - "people": [ - { - "name": "Tod Machover", - "title": "Opera Composer" - } - ], - "companies": [ - { - "name": "Future Worlds", - "title": "Design and action for the future we want to live in" - }, - { - "name": "NOAA", - "title": "The Challenge: To secure a sustainable future for all living things" - }, - { - "name": "MIT Media Lab", - "title": "Research and development in interdisciplinary expertise" - } - ] - }, - { - "emails": [ - "r-admin@media.mit.edu" - ], - "people": [ - { - "name": "Affective Computing group", - "title": "MIT Media Lab" - } - ], - "companies": [] - }, - { - "people": [ - { - "name": "David Sweeney", - "email": null, - "title": "Author at Samsung Newsroom" - }, - { - "name": "Pattie Maes", - "email": null, - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Rosalind W. Picard", - "email": null, - "title": "Professor of Media Arts and Sciences; Grover M. 
Hermann Professor in Health Sciences and Technology" - } - ], - "companies": [ - { - "name": "Samsung", - "email": null, - "title": "Collaborator" - }, - { - "name": "MIT Media Lab", - "email": null, - "title": "Collaborator" - } - ] - }, - { - "people": [ - { - "name": "Canan Dagdeviren", - "title": null - }, - { - "name": "Jonathan Williams", - "title": null - }, - { - "name": "Sara V. Fernandez", - "title": null - }, - { - "name": "Irmandy Wicaksono", - "title": null - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "title": null - } - ], - "emails": [] - }, - { - "people": [], - "emails": [], - "companies": [], - "titles": [] - }, - { - "emails": [], - "people": [ - { - "name": "Andy Ryan", - "title": "Photographer", - "company": "MIT Media Lab" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "department": "Program in Media Arts and Sciences" - }, - { - "name": "MIT", - "department": "Center for Bits and Atoms" - } - ] - }, - { - "people": [ - { - "name": "Dan Allen", - "title": "Media Lab" - } - ], - "companies": [ - { - "name": "MIT Media Lab" - }, - { - "name": "Castrol" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "Pat Pataranutaporn", - "title": "Former Graduate Student" - }, - { - "name": "Pattie Maes", - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Kavin Winson", - "title": "Researcher at KASIKORN Labs" - }, - { - "name": "Peggy Yin", - "title": "Harvard University Undergraduate" - }, - { - "name": "Auttasak Lapapirojn", - "title": "KASIKORN Labs" - }, - { - "name": "Pichayoot Ouppaphan", - "title": "KASIKORN Labs" - }, - { - "name": "Monchai Lertsutthiwong", - "title": "Head of AI Research at KASIKORN Business-Technology Group" - }, - { - "name": "Hal Hershfield", - "title": "Professor of Marketing, Behavioral Decision Making, and Psychology at the University of California at Los Angeles" - }, - { - "name": "Jeremy Bailenson", - "title": "Thomas More Storke Professor of Communication at Stanford University" - }, - { - "name": "Thanawit Prasongpongchai", - "title": "Designer at KBTG and Visiting Scientist at the Media Lab" - } - ], - "companies": [ - { - "name": "MIT", - "role": "AI and simulation research" - }, - { - "name": "KASIKORN Labs", - "role": "Research and co-authorship" - }, - { - "name": "KASIKORN Business-Technology Group", - "role": "AI research support" - } - ] - }, - { - "people": [ - { - "name": "Andy Ryan", - "title": "Copyright" - } - ], - "companies": [ - { - "name": "MIT Media Lab", - "collaborator": "Castrol", - "project": "Space Research" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "Fadel Adib", - "title": "Associate Professor of Media Arts and Sciences" - }, - { - "name": "Edward Boyden", - "title": "Professor of Media Arts and Sciences; Y. 
Eva Tan Professor in Neurotechnology" - }, - { - "name": "Cynthia Breazeal", - "title": "Professor of Media Arts and Sciences; MIT Dean for Digital Learning" - }, - { - "name": "Canan Dagdeviren", - "title": "Associate Professor of Media Arts and Sciences; LG Career Development Professor of Media Arts and Sciences" - }, - { - "name": "Kevin Esvelt", - "title": "Associate Professor of Media Arts and Sciences; NEC Career Development Professor of Computer and Communications" - }, - { - "name": "Behnaz Farahi", - "title": "Assistant Professor of Media Arts and Sciences; Asahi Broadcast Corp Career Development Assistant Professor" - }, - { - "name": "Hugh Herr", - "title": "Professor of Media Arts and Sciences" - }, - { - "name": "Hiroshi Ishii", - "title": "Jerome B. Wiesner Professor of Media Arts and Sciences; Associate Director, MIT Media Lab" - }, - { - "name": "Joseph M. Jacobson", - "title": "Associate Professor of Media Arts and Sciences" - }, - { - "name": "Kent Larson", - "title": "Professor of the Practice" - }, - { - "name": "Paul Pu Liang", - "title": "Assistant Professor of Media Arts and Sciences; Assistant Professor of Electrical Engineering and Computer Science" - }, - { - "name": "Zach Lieberman", - "title": "Adjunct Associate Professor of Media Arts and Sciences" - }, - { - "name": "Andrew Lippman", - "title": "Senior Research Scientist" - }, - { - "name": "Tod Machover", - "title": "Muriel R. Cooper Professor of Music and Media; Academic Head, Program in Media Arts and Sciences" - }, - { - "name": "Pattie Maes", - "title": "Professor of Media Technology; Germeshausen Professor" - }, - { - "name": "Dava Newman", - "title": "Director; Apollo Professor of Astronautics" - }, - { - "name": "Joseph A. Paradiso", - "title": "Alexander W Dreyfoos (1954) Professor; Associate Academic Head, Program in Media Arts and Sciences" - }, - { - "name": "Alex 'Sandy' Pentland", - "title": "Professor Post Tenure of Media Arts and Sciences" - }, - { - "name": "Rosalind W. Picard", - "title": "Professor of Media Arts and Sciences; Grover M. 
Hermann Professor in Health Sciences and Technology" - }, - { - "name": "Ramesh Raskar", - "title": "Associate Professor of Media Arts and Sciences" - }, - { - "name": "Mitchel Resnick", - "title": "LEGO Papert Professor of Learning Research" - }, - { - "name": "Deb Roy", - "title": "Professor of Media Arts and Sciences" - }, - { - "name": "Deblina Sarkar", - "title": "Assistant Professor of Media Arts and Sciences; AT&T Career Development Professor" - }, - { - "name": "Danielle Wood", - "title": "Associate Professor of Media Arts and Sciences; Associate Professor (Joint) of Aeronautics and Astronautics" - } - ], - "emails": [], - "companies": [] - }, - { - "people": [ - { - "name": "Canan Dagdeviren", - "title": "Individual", - "email": null - }, - { - "name": "Jonathan Williams", - "title": "Individual", - "email": null - } - ], - "companies": [], - "emails": [] - }, - { - "people": [ - { - "name": "Dava Newman", - "title": "Media Lab Director" - }, - { - "name": "Xin Liu", - "title": "Media Lab Alum" - } - ], - "companies": [ - { - "name": "MIT Media Lab" - }, - { - "name": "Boston Museum of Science" - } - ], - "emails": [] - }, - { - "people": [ - { - "name": "Behnaz Farahi", - "title": "Assistant Professor, Transformative Design", - "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS)" - }, - { - "name": "Paul Liang", - "title": "Assistant Professor, AI + Human Experience", - "affiliation": "MIT Media Lab's Program in Media Arts and Sciences (MAS) and MIT Schwarzman College of Computing" - }, - { - "name": "Barmak Heshmat", - "title": "Co-founder", - "company": "Brelyon" - }, - { - "name": "Mohammad Tariqul Islam", - "title": "MIT-Novo Nordisk Artificial Intelligence Postdoctoral Fellow" - }, - { - "name": "Hao-Tung Yang", - "title": "Recipient of the T.S. Lin Fellowship Award" - }, - { - "name": "Deblina Sarkar", - "title": "Recipient of NSF CAREER Award and ChadTough New Investigator Award" - }, - { - "name": "Danielle Wood", - "title": "2024 Just Tech Fellow" - }, - { - "name": "Baju Joy", - "title": "Whitaker Health Sciences Fellowship Award Recipient" - }, - { - "name": "Max Addae", - "title": "2024 Guthman Musical Instrument Competition Winner" - }, - { - "name": "Tod Machover", - "title": "Head of Opera of the Future", - "affiliation": "MIT" - }, - { - "name": "Sharif Islam", - "title": "ESIP Community Fellow and Future Earth Coasts Fellow", - "affiliation": "Postdoctoral associate in the Space Enabled research group" - } - ], - "companies": [ - { - "name": "Samsung", - "collaboration": "MIT Media Lab" - }, - { - "name": "Brelyon", - "co_founder": "Barmak Heshmat" - }, - { - "name": "Castrol", - "collaboration": "AstroAnt Payload Program" - }, - { - "name": "Augmental", - "product": "Mouth-based touchpad" - } - ], - "email_addresses": [] - } -] diff --git a/examples/sales_web_crawler/websites.csv b/examples/sales_web_crawler/websites.csv deleted file mode 100644 index f029ccfc..00000000 --- a/examples/sales_web_crawler/websites.csv +++ /dev/null @@ -1,2 +0,0 @@ -website -https://www.media.mit.edu/ From d2344aa14bc80b93b8f14ebc211c1884d98b104c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 21 Oct 2024 16:11:32 -0300 Subject: [PATCH 067/102] Revert "Nick: improved map ranking algorithm" This reverts commit 7acd8d2edb6abc45a63fe1060377d2acb398ec36. 
--- apps/api/src/controllers/v1/map.ts | 13 +------------ apps/api/src/controllers/v1/types.ts | 8 -------- apps/api/src/lib/map-cosine.ts | 15 +++++++-------- 3 files changed, 8 insertions(+), 28 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 112c06b4..5ed3dd51 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -2,7 +2,6 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; import { legacyCrawlerOptions, - LinkInfo, mapRequestSchema, RequestWithAuth, } from "./types"; @@ -110,10 +109,6 @@ export async function mapController( mapResults = mapResults.slice(0, minumumCutoff); } - - - let linkInfos: LinkInfo[] = []; - if (mapResults.length > 0) { if (req.body.search) { // Ensure all map results are first, maintaining their order @@ -122,12 +117,6 @@ export async function mapController( ...mapResults.slice(1).map((x) => x.url), ...links, ]; - - linkInfos = [ - mapResults[0], - ...mapResults.slice(1), - ...links.map((x) => ({ url: x })), - ] } else { mapResults.map((x) => { links.push(x.url); @@ -139,7 +128,7 @@ export async function mapController( if (req.body.search) { const searchQuery = req.body.search.toLowerCase(); - links = performCosineSimilarity(linkInfos, searchQuery); + links = performCosineSimilarity(links, searchQuery); } links = links diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 91618e1a..033de6e0 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -478,11 +478,3 @@ export function legacyDocumentConverter(doc: any): Document { }, }; } - - - -export interface LinkInfo { - url: string; - title?: string; - description?: string; -} \ No newline at end of file diff --git a/apps/api/src/lib/map-cosine.ts b/apps/api/src/lib/map-cosine.ts index 8804aa58..db2491a9 100644 --- a/apps/api/src/lib/map-cosine.ts +++ b/apps/api/src/lib/map-cosine.ts @@ -1,7 +1,6 @@ import { Logger } from "./logger"; -import { LinkInfo } from "../controllers/v1/types"; -export function performCosineSimilarity(links: LinkInfo[], searchQuery: string) { +export function performCosineSimilarity(links: string[], searchQuery: string) { try { // Function to calculate cosine similarity const cosineSimilarity = (vec1: number[], vec2: number[]): number => { @@ -28,20 +27,20 @@ export function performCosineSimilarity(links: LinkInfo[], searchQuery: string) // Calculate similarity scores const similarityScores = links.map((link) => { - const linkText = `${link.url} ${link.title || ''} ${link.description || ''}`.trim(); - const linkVector = textToVector(linkText); + const linkVector = textToVector(link); const searchVector = textToVector(searchQuery); return cosineSimilarity(linkVector, searchVector); }); - // Sort links based on similarity scores - const sortedLinks = links + // Sort links based on similarity scores and print scores + const a = links .map((link, index) => ({ link, score: similarityScores[index] })) .sort((a, b) => b.score - a.score); - return sortedLinks.map((item) => item.link.url); + links = a.map((item) => item.link); + return links; } catch (error) { Logger.error(`Error performing cosine similarity: ${error}`); - return links.map(link => link.url); + return links; } } From 76c0073829e92448a6148c69e80a4a8ef76c74bf Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 21 Oct 2024 16:27:15 -0300 Subject: [PATCH 068/102] Nick: grok 2 example --- examples/grok_web_crawler/grok_web_crawler.py | 150 
++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 examples/grok_web_crawler/grok_web_crawler.py diff --git a/examples/grok_web_crawler/grok_web_crawler.py b/examples/grok_web_crawler/grok_web_crawler.py new file mode 100644 index 00000000..8d598e86 --- /dev/null +++ b/examples/grok_web_crawler/grok_web_crawler.py @@ -0,0 +1,150 @@ +import os +from firecrawl import FirecrawlApp +import json +from dotenv import load_dotenv +import requests + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +grok_api_key = os.getenv("GROK_API_KEY") + +# Initialize the FirecrawlApp +app = FirecrawlApp(api_key=firecrawl_api_key) + +# Function to make Grok API calls +def grok_completion(prompt): + url = "https://api.x.ai/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {grok_api_key}" + } + data = { + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": prompt + } + ], + "model": "grok-beta", + "stream": False, + "temperature": 0 + } + response = requests.post(url, headers=headers, json=data) + return response.json()['choices'][0]['message']['content'] + +# Find the page that most likely contains the objective +def find_relevant_page_via_map(objective, url, app): + try: + print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}") + print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}") + + map_prompt = f""" + The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. 
+ """ + + print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}") + map_search_parameter = grok_completion(map_prompt) + print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}") + + print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}") + print(f"{Colors.MAGENTA}{map_search_parameter}{Colors.RESET}") + map_website = app.map_url(url, params={"search": map_search_parameter}) + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}") + print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}") + return map_website["links"] + except Exception as e: + print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}") + return None + +# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None +def find_objective_in_top_pages(map_website, objective, app): + try: + print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}") + # Get top 3 links from the map result + top_links = map_website[:3] if isinstance(map_website, list) else [] + print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}") + + for link in top_links: + print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}") + # Scrape the page + scrape_result = app.scrape_url(link, params={'formats': ['markdown']}) + print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}") + + + # Check if objective is met + check_prompt = f""" + Given the following scraped content and objective, determine if the objective is met. + If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible. + If the objective is not met with confidence, respond with 'Objective not met'. + + Objective: {objective} + Scraped content: {scrape_result['markdown']} + + Remember: + 1. Only return JSON if you are confident the objective is fully met. + 2. Keep the JSON structure as simple and flat as possible. + 3. Do not include any explanations or markdown formatting in your response. + """ + + result = grok_completion(check_prompt) + print(f"{Colors.MAGENTA}{result}{Colors.RESET}") + if result != "Objective not met": + print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}") + try: + result = result.replace("```json", "").replace("```", "") + return json.loads(result) + except json.JSONDecodeError: + print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}") + else: + print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}") + + print(f"{Colors.RED}All available pages analyzed. 
Objective not fulfilled in examined content.{Colors.RESET}") + return None + + except Exception as e: + print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}") + return None + +# Main function to execute the process +def main(): + # Get user input + url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") + + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") + # Find the relevant page + map_website = find_relevant_page_via_map(objective, url, app) + + if map_website: + print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}") + # Find objective in top pages + result = find_objective_in_top_pages(map_website, objective, app) + + if result: + print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}") + print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}") + else: + print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}") + else: + print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}") + +if __name__ == "__main__": + main() From 3cd328cf93618b26073f94bb64b2f661528cefca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 22 Oct 2024 18:58:48 +0200 Subject: [PATCH 069/102] feat(bulk/scrape): add node and python SDK integration + docs --- apps/js-sdk/firecrawl/README.md | 40 +++++++ apps/js-sdk/firecrawl/src/index.ts | 138 +++++++++++++++++++++++++ apps/python-sdk/README.md | 63 +++++++++++ apps/python-sdk/firecrawl/firecrawl.py | 117 +++++++++++++++++++++ 4 files changed, 358 insertions(+) diff --git a/apps/js-sdk/firecrawl/README.md b/apps/js-sdk/firecrawl/README.md index 0f3a6824..1655d2ee 100644 --- a/apps/js-sdk/firecrawl/README.md +++ b/apps/js-sdk/firecrawl/README.md @@ -145,6 +145,46 @@ watch.addEventListener("done", state => { }); ``` +### Bulk scraping multiple URLs + +To bulk scrape multiple URLs with error handling, use the `bulkScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats. + +```js +const bulkScrapeResponse = await app.bulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { + formats: ['markdown', 'html'], +}) +``` + + +#### Asynchronous bulk scrape + +To initiate an asynchronous bulk scrape, utilize the `asyncBulkScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the bulk scrape. + +```js +const asyncBulkScrapeResult = await app.asyncBulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); +``` + +#### Bulk scrape with WebSockets + +To use bulk scrape with WebSockets, use the `bulkScrapeUrlsAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the bulk scrape job, such as the output formats. 
+
+```js
+// Bulk scrape multiple URLs with WebSockets:
+const watch = await app.bulkScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
+
+watch.addEventListener("document", doc => {
+  console.log("DOC", doc.detail);
+});
+
+watch.addEventListener("error", err => {
+  console.error("ERR", err.detail.error);
+});
+
+watch.addEventListener("done", state => {
+  console.log("DONE", state.detail.status);
+});
+```
+
 ## Error Handling
 
 The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index a839d5d0..30797c34 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -493,6 +493,144 @@ export default class FirecrawlApp {
     return { success: false, error: "Internal server error." };
   }
 
+  /**
+   * Initiates a bulk scrape job for multiple URLs using the Firecrawl API.
+   * @param urls - The URLs to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @param pollInterval - Time in seconds for job status checks.
+   * @param idempotencyKey - Optional idempotency key for the request.
+   * @returns The response from the bulk scrape operation.
+   */
+  async bulkScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    pollInterval: number = 2,
+    idempotencyKey?: string
+  ): Promise {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, ...(params ?? {}) };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/bulk/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        const id: string = response.data.id;
+        return this.monitorJobStatus(id, headers, pollInterval);
+      } else {
+        this.handleError(response, "start bulk scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  async asyncBulkScrapeUrls(
+    urls: string[],
+    params?: ScrapeParams,
+    idempotencyKey?: string
+  ): Promise {
+    const headers = this.prepareHeaders(idempotencyKey);
+    let jsonData: any = { urls, ...(params ?? {}) };
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/bulk/scrape`,
+        jsonData,
+        headers
+      );
+      if (response.status === 200) {
+        return response.data;
+      } else {
+        this.handleError(response, "start bulk scrape job");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
+  /**
+   * Initiates a bulk scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
+   * @param urls - The URLs to scrape.
+ * @param params - Additional parameters for the scrape request. + * @param idempotencyKey - Optional idempotency key for the request. + * @returns A CrawlWatcher instance to monitor the crawl job. + */ + async bulkScrapeUrlsAndWatch( + urls: string[], + params?: ScrapeParams, + idempotencyKey?: string, + ) { + const crawl = await this.asyncBulkScrapeUrls(urls, params, idempotencyKey); + + if (crawl.success && crawl.id) { + const id = crawl.id; + return new CrawlWatcher(id, this); + } + + throw new FirecrawlError("Bulk scrape job failed to start", 400); + } + + /** + * Checks the status of a bulk scrape job using the Firecrawl API. + * @param id - The ID of the bulk scrape operation. + * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) + * @returns The response containing the job status. + */ + async checkBulkScrapeStatus(id?: string, getAllData = false): Promise { + if (!id) { + throw new FirecrawlError("No bulk scrape ID provided", 400); + } + + const headers: AxiosRequestHeaders = this.prepareHeaders(); + try { + const response: AxiosResponse = await this.getRequest( + `${this.apiUrl}/v1/bulk/scrape/${id}`, + headers + ); + if (response.status === 200) { + let allData = response.data.data; + if (getAllData && response.data.status === "completed") { + let statusData = response.data + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusData = (await this.getRequest(statusData.next, headers)).data; + data = data.concat(statusData.data); + } + allData = data; + } + } + return ({ + success: response.data.success, + status: response.data.status, + total: response.data.total, + completed: response.data.completed, + creditsUsed: response.data.creditsUsed, + expiresAt: new Date(response.data.expiresAt), + next: response.data.next, + data: allData, + error: response.data.error, + }) + } else { + this.handleError(response, "check bulk scrape status"); + } + } catch (error: any) { + throw new FirecrawlError(error.message, 500); + } + return { success: false, error: "Internal server error." }; + } + /** * Prepares the headers for an API request. * @param idempotencyKey - Optional key to ensure idempotency. diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md index a437e0c6..6a7d4a0a 100644 --- a/apps/python-sdk/README.md +++ b/apps/python-sdk/README.md @@ -149,6 +149,69 @@ async def start_crawl_and_watch(): await start_crawl_and_watch() ``` +### Scraping multiple URLs in bulk + +To bulk scrape multiple URLs, use the `bulk_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper such as the output formats. + +```python +idempotency_key = str(uuid.uuid4()) # optional idempotency key +bulk_scrape_result = app.bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key) +print(bulk_scrape_result) +``` + +### Asynchronous bulk scrape + +To run a bulk scrape asynchronously, use the `async_bulk_scrape_urls` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. 
+
+```python
+bulk_scrape_result = app.async_bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
+print(bulk_scrape_result)
+```
+
+### Checking bulk scrape status
+
+To check the status of an asynchronous bulk scrape job, use the `check_bulk_scrape_status` method. It takes the job ID as a parameter and returns the current status of the bulk scrape job.
+
+```python
+id = bulk_scrape_result['id']
+status = app.check_bulk_scrape_status(id)
+```
+
+### Bulk scrape with WebSockets
+
+To use bulk scrape with WebSockets, use the `bulk_scrape_urls_and_watch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
+
+```python
+# inside an async function...
+nest_asyncio.apply()
+
+# Define event handlers
+def on_document(detail):
+    print("DOC", detail)
+
+def on_error(detail):
+    print("ERR", detail['error'])
+
+def on_done(detail):
+    print("DONE", detail['status'])
+
+# Function to start the crawl and watch process
+async def start_crawl_and_watch():
+    # Initiate the bulk scrape job and get the watcher
+    watcher = app.bulk_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
+
+    # Add event listeners
+    watcher.add_event_listener("document", on_document)
+    watcher.add_event_listener("error", on_error)
+    watcher.add_event_listener("done", on_done)
+
+    # Start the watcher
+    await watcher.connect()
+
+# Run the event loop
+await start_crawl_and_watch()
+```
+
 ## Error Handling
 
 The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 70f677ef..4b596619 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -271,6 +271,123 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'map')
 
+    def bulk_scrape_urls(self, urls: list[str],
+                  params: Optional[Dict[str, Any]] = None,
+                  poll_interval: Optional[int] = 2,
+                  idempotency_key: Optional[str] = None) -> Any:
+        """
+        Initiate a bulk scrape job for the specified URLs using the Firecrawl API.
+
+        Args:
+            urls (list[str]): The URLs to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
+            poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
+            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
+                - 'success' (bool): Indicates if the bulk scrape was successful.
+                - 'status' (str): The final status of the bulk scrape job (e.g., 'completed').
+                - 'completed' (int): Number of scraped pages that completed.
+                - 'total' (int): Total number of scraped pages.
+                - 'creditsUsed' (int): Estimated number of API credits used for this bulk scrape.
+                - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the bulk scrape data expires.
+                - 'data' (List[Dict]): List of all the scraped pages.
+
+        Raises:
+            Exception: If the bulk scrape job initiation or monitoring fails.
+ """ + endpoint = f'/v1/bulk/scrape' + headers = self._prepare_headers(idempotency_key) + json_data = {'urls': urls} + if params: + json_data.update(params) + response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + if response.status_code == 200: + id = response.json().get('id') + return self._monitor_job_status(id, headers, poll_interval) + + else: + self._handle_error(response, 'start bulk scrape job') + + + def async_bulk_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + """ + Initiate a crawl job asynchronously. + + Args: + urls (list[str]): The URLs to scrape. + params (Optional[Dict[str, Any]]): Additional parameters for the scraper. + idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + + Returns: + Dict[str, Any]: A dictionary containing the bulk scrape initiation response. The structure includes: + - 'success' (bool): Indicates if the bulk scrape initiation was successful. + - 'id' (str): The unique identifier for the bulk scrape job. + - 'url' (str): The URL to check the status of the bulk scrape job. + """ + endpoint = f'/v1/bulk/scrape' + headers = self._prepare_headers(idempotency_key) + json_data = {'urls': urls} + if params: + json_data.update(params) + response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + if response.status_code == 200: + return response.json() + else: + self._handle_error(response, 'start bulk scrape job') + + def bulk_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + """ + Initiate a bulk scrape job and return a CrawlWatcher to monitor the job via WebSocket. + + Args: + urls (list[str]): The URLs to scrape. + params (Optional[Dict[str, Any]]): Additional parameters for the scraper. + idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + + Returns: + CrawlWatcher: An instance of CrawlWatcher to monitor the bulk scrape job. + """ + crawl_response = self.async_bulk_scrape_urls(urls, params, idempotency_key) + if crawl_response['success'] and 'id' in crawl_response: + return CrawlWatcher(crawl_response['id'], self) + else: + raise Exception("Bulk scrape job failed to start") + + def check_bulk_scrape_status(self, id: str) -> Any: + """ + Check the status of a bulk scrape job using the Firecrawl API. + + Args: + id (str): The ID of the bulk scrape job. + + Returns: + Any: The status of the bulk scrape job. + + Raises: + Exception: If the status check request fails. + """ + endpoint = f'/v1/bulk/scrape/{id}' + + headers = self._prepare_headers() + response = self._get_request(f'{self.api_url}{endpoint}', headers) + if response.status_code == 200: + data = response.json() + return { + 'success': True, + 'status': data.get('status'), + 'total': data.get('total'), + 'completed': data.get('completed'), + 'creditsUsed': data.get('creditsUsed'), + 'expiresAt': data.get('expiresAt'), + 'next': data.get('next'), + 'data': data.get('data'), + 'error': data.get('error') + } + else: + self._handle_error(response, 'check bulk scrape status') + def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. 
From 6ed3104eb6bd20fdf45e9f1fd64432000a280c4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 22 Oct 2024 20:28:10 +0200 Subject: [PATCH 070/102] feat: clear ACUC cache endpoint based on team ID --- apps/api/src/controllers/auth.ts | 9 ++++++++- .../src/controllers/v0/admin/acuc-cache-clear.ts | 15 +++++++++++++++ apps/api/src/routes/admin.ts | 7 +++++++ apps/api/src/routes/v1.ts | 2 +- 4 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 apps/api/src/controllers/v0/admin/acuc-cache-clear.ts diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 5546bc17..552dd9b4 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -13,7 +13,7 @@ import { setTraceAttributes } from "@hyperdx/node-opentelemetry"; import { sendNotification } from "../services/notification/email_notification"; import { Logger } from "../lib/logger"; import { redlock } from "../services/redlock"; -import { getValue } from "../services/redis"; +import { deleteKey, getValue } from "../services/redis"; import { setValue } from "../services/redis"; import { validate } from "uuid"; import * as Sentry from "@sentry/node"; @@ -128,6 +128,13 @@ export async function getACUC( } } +export async function clearACUC( + api_key: string, +): Promise { + const cacheKeyACUC = `acuc_${api_key}`; + await deleteKey(cacheKeyACUC); +} + export async function authenticateUser( req, res, diff --git a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts new file mode 100644 index 00000000..3ef1f7fb --- /dev/null +++ b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts @@ -0,0 +1,15 @@ +import { Request, Response } from "express"; +import { supabase_service } from "../../../services/supabase"; +import { clearACUC } from "../../auth"; + +export async function acucCacheClearController(req: Request, res: Response) { + const team_id: string = req.body.team_id; + + const keys = await supabase_service.from("api_keys") + .select("*") + .eq("team_id", team_id); + + await Promise.all(keys.data.map(x => clearACUC(x.key))); + + res.json({ ok: true }); +} diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index 38611eac..88159060 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -6,6 +6,8 @@ import { cleanBefore24hCompleteJobsController, queuesController, } from "../controllers/v0/admin/queue"; +import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear"; +import { wrap } from "./v1"; export const adminRouter = express.Router(); @@ -33,3 +35,8 @@ adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`, autoscalerController ); + +adminRouter.post( + `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`, + wrap(acucCacheClearController), +); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 246f9b05..880ae885 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -94,7 +94,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { next(); } -function wrap(controller: (req: Request, res: Response) => Promise): (req: Request, res: Response, next: NextFunction) => any { +export function wrap(controller: (req: Request, res: Response) => Promise): (req: Request, res: Response, next: NextFunction) => any { return (req, res, next) => { controller(req, res) .catch(err => next(err)) From bd55464b52e235ff492f0e87d2608b3ed4a15ef7 Mon Sep 17 00:00:00 2001 
From: Thomas Kosmas Date: Tue, 22 Oct 2024 22:28:02 +0300 Subject: [PATCH 071/102] skipTlsVerification --- apps/api/src/controllers/v1/types.ts | 2 ++ apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/index.ts | 1 + apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 5 +++-- apps/api/src/scraper/WebScraper/single_url.ts | 1 + 5 files changed, 8 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 033de6e0..2bf0dc34 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -117,6 +117,7 @@ export const scrapeOptions = z.object({ } ).transform(val => val ? val.toUpperCase() : 'US') }).optional(), + skipTlsVerification: z.boolean().default(false), }).strict(strictMessage) @@ -433,6 +434,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { parsePDF: x.parsePDF, actions: x.actions as Action[], // no strict null checking grrrr - mogery geolocation: x.geolocation, + skipTlsVerification: x.skipTlsVerification }; } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index ca6142ec..8aa1d004 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -54,6 +54,7 @@ export type PageOptions = { geolocation?: { country?: string; }; + skipTlsVerification?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 5285a9f4..1817a07b 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -594,6 +594,7 @@ export class WebScraperDataProvider { atsv: options.pageOptions?.atsv ?? false, actions: options.pageOptions?.actions ?? undefined, geolocation: options.pageOptions?.geolocation ?? undefined, + skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false, }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index d1dafd5f..3bbd74eb 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -28,7 +28,7 @@ export async function scrapWithFireEngine({ waitFor = 0, screenshot = false, fullPageScreenshot = false, - pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } }, + pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false }, fireEngineOptions = {}, headers, options, @@ -40,7 +40,7 @@ export async function scrapWithFireEngine({ waitFor?: number; screenshot?: boolean; fullPageScreenshot?: boolean; - pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } }; + pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean }; fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; @@ -119,6 +119,7 @@ export async function scrapWithFireEngine({ atsv: pageOptions?.atsv ?? false, scrollXPaths: pageOptions?.scrollXPaths ?? [], geolocation: pageOptions?.geolocation, + skipTlsVerification: pageOptions?.skipTlsVerification ?? 
false, actions: actions, }, { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index b4141dc1..cd76793c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -157,6 +157,7 @@ export async function scrapSingleUrl( atsv: pageOptions.atsv ?? false, actions: pageOptions.actions ?? undefined, geolocation: pageOptions.geolocation ?? undefined, + skipTlsVerification: pageOptions.skipTlsVerification ?? false, } if (extractorOptions) { From acde353e5652bc64318ff989fe7002c7d798a763 Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Wed, 23 Oct 2024 01:07:03 +0300 Subject: [PATCH 072/102] skipTlsVerification on robots.txt scraping --- apps/api/src/controllers/v1/crawl.ts | 2 +- apps/api/src/scraper/WebScraper/crawler.ts | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index c8e449f0..0000b6fe 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -78,7 +78,7 @@ export async function crawlController( const crawler = crawlToCrawler(id, sc); try { - sc.robots = await crawler.getRobotsTxt(); + sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification); } catch (e) { Logger.debug( `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify( diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 009a5933..72a49fd8 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -9,7 +9,7 @@ import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; import { axiosTimeout } from "../../../src/lib/timeout"; import { Logger } from "../../../src/lib/logger"; - +import https from "https"; export class WebCrawler { private jobId: string; private initialUrl: string; @@ -145,8 +145,14 @@ export class WebCrawler { .slice(0, limit); } - public async getRobotsTxt(): Promise { - const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout }); + public async getRobotsTxt(skipTlsVerification = false): Promise { + let extraArgs = {}; + if(skipTlsVerification) { + extraArgs["httpsAgent"] = new https.Agent({ + rejectUnauthorized: false + }); + } + const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs }); return response.data; } From bbfdda8867614812b3dea399647117317357b783 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 22 Oct 2024 19:47:23 -0300 Subject: [PATCH 073/102] Nick: init --- apps/api/src/controllers/auth.ts | 20 ++- apps/api/src/services/billing/auto_charge.ts | 148 ++++++++++++++++ .../src/services/billing/credit_billing.ts | 134 +++++++++++--- .../api/src/services/billing/issue_credits.ts | 20 +++ apps/api/src/services/billing/stripe.ts | 51 ++++++ .../notification/email_notification.ts | 166 ++++++++++-------- .../notification/notification_string.ts | 4 + apps/api/src/types.ts | 2 + 8 files changed, 435 insertions(+), 110 deletions(-) create mode 100644 apps/api/src/services/billing/auto_charge.ts create mode 100644 apps/api/src/services/billing/issue_credits.ts create mode 100644 apps/api/src/services/billing/stripe.ts diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 5546bc17..bf41b96a 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -75,15 +75,19 @@ export 
async function setCachedACUC( export async function getACUC( api_key: string, - cacheOnly = false + cacheOnly = false, + useCache = true ): Promise { const cacheKeyACUC = `acuc_${api_key}`; - const cachedACUC = await getValue(cacheKeyACUC); + if (useCache) { + const cachedACUC = await getValue(cacheKeyACUC); + if (cachedACUC !== null) { + return JSON.parse(cachedACUC); + } + } - if (cachedACUC !== null) { - return JSON.parse(cachedACUC); - } else if (!cacheOnly) { + if (!cacheOnly) { let data; let error; let retries = 0; @@ -91,7 +95,7 @@ export async function getACUC( while (retries < maxRetries) { ({ data, error } = await supabase_service.rpc( - "auth_credit_usage_chunk_test_3", + "auth_credit_usage_chunk_test_17_credit_pack", { input_key: api_key } )); @@ -118,9 +122,11 @@ export async function getACUC( data.length === 0 ? null : data[0].team_id === null ? null : data[0]; // NOTE: Should we cache null chunks? - mogery - if (chunk !== null) { + if (chunk !== null && useCache) { setCachedACUC(api_key, chunk); } + // Log the chunk for now + console.log(chunk); return chunk; } else { diff --git a/apps/api/src/services/billing/auto_charge.ts b/apps/api/src/services/billing/auto_charge.ts new file mode 100644 index 00000000..6ab6914c --- /dev/null +++ b/apps/api/src/services/billing/auto_charge.ts @@ -0,0 +1,148 @@ +// Import necessary dependencies and types +import { AuthCreditUsageChunk } from "../../controllers/v1/types"; +import { getACUC, setCachedACUC } from "../../controllers/auth"; +import { redlock } from "../redlock"; +import { supabase_service } from "../supabase"; +import { createPaymentIntent } from "./stripe"; +import { issueCredits } from "./issue_credits"; +import { sendNotification } from "../notification/email_notification"; +import { NotificationType } from "../../types"; +import { deleteKey } from "../redis"; +import { sendSlackWebhook } from "../alerts/slack"; +import { Logger } from "../../lib/logger"; + +// Define the number of credits to be added during auto-recharge +const AUTO_RECHARGE_CREDITS = 1000; + +/** + * Attempt to automatically charge a user's account when their credit balance falls below a threshold + * @param chunk The user's current usage data + * @param autoRechargeThreshold The credit threshold that triggers auto-recharge + */ +export async function autoCharge( + chunk: AuthCreditUsageChunk, + autoRechargeThreshold: number +): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> { + const resource = `auto-recharge:${chunk.team_id}`; + try { + // Use a distributed lock to prevent concurrent auto-charge attempts + return await redlock.using([resource], 5000, async (signal) : Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> => { + // Recheck the condition inside the lock to prevent race conditions + const updatedChunk = await getACUC(chunk.api_key, false, false); + if ( + updatedChunk && + updatedChunk.remaining_credits < autoRechargeThreshold + ) { + if (chunk.sub_user_id) { + // Fetch the customer's Stripe information + const { data: customer, error: customersError } = + await supabase_service + .from("customers") + .select("id, stripe_customer_id") + .eq("id", chunk.sub_user_id) + .single(); + + if (customersError) { + Logger.error(`Error fetching customer data: ${customersError}`); + return { + success: false, + message: "Error fetching customer data", + remainingCredits: chunk.remaining_credits, + chunk, + }; + } + + if (customer && 
customer.stripe_customer_id) { + let issueCreditsSuccess = false; + // Attempt to create a payment intent + const paymentStatus = await createPaymentIntent( + chunk.team_id, + customer.stripe_customer_id + ); + + // If payment is successful or requires further action, issue credits + if ( + paymentStatus.return_status === "succeeded" || + paymentStatus.return_status === "requires_action" + ) { + issueCreditsSuccess = await issueCredits( + chunk.team_id, + AUTO_RECHARGE_CREDITS + ); + } + + // Record the auto-recharge transaction + await supabase_service.from("auto_recharge_transactions").insert({ + team_id: chunk.team_id, + initial_payment_status: paymentStatus.return_status, + credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0, + stripe_charge_id: paymentStatus.charge_id, + }); + + // Send a notification if credits were successfully issued + if (issueCreditsSuccess) { + await sendNotification( + chunk.team_id, + NotificationType.AUTO_RECHARGE_SUCCESS, + chunk.sub_current_period_start, + chunk.sub_current_period_end, + chunk, + true + ); + } + + // Reset ACUC cache to reflect the new credit balance + const cacheKeyACUC = `acuc_${chunk.api_key}`; + await deleteKey(cacheKeyACUC); + if (process.env.SLACK_ADMIN_WEBHOOK_URL ) { + sendSlackWebhook( + `Auto-recharge successful: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}. User was notified via email.`, + false, + process.env.SLACK_ADMIN_WEBHOOK_URL + ).catch((error) => { + Logger.debug(`Error sending slack notification: ${error}`); + }); + } + return { + success: true, + message: "Auto-recharge successful", + remainingCredits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS, + chunk: {...chunk, remaining_credits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS}, + }; + } else { + Logger.error("No Stripe customer ID found for user"); + return { + success: false, + message: "No Stripe customer ID found for user", + remainingCredits: chunk.remaining_credits, + chunk, + }; + } + } else { + Logger.error("No sub_user_id found in chunk"); + return { + success: false, + message: "No sub_user_id found in chunk", + remainingCredits: chunk.remaining_credits, + chunk, + }; + } + } + return { + success: false, + message: "No need to auto-recharge", + remainingCredits: chunk.remaining_credits, + chunk, + }; + + }); + } catch (error) { + Logger.error(`Failed to acquire lock for auto-recharge: ${error}`); + return { + success: false, + message: "Failed to acquire lock for auto-recharge", + remainingCredits: chunk.remaining_credits, + chunk, + }; + } +} diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index fc73ca7c..3c43f5a0 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -6,24 +6,40 @@ import { Logger } from "../../lib/logger"; import * as Sentry from "@sentry/node"; import { AuthCreditUsageChunk } from "../../controllers/v1/types"; import { getACUC, setCachedACUC } from "../../controllers/auth"; +import { issueCredits } from "./issue_credits"; +import { redlock } from "../redlock"; +import { autoCharge } from "./auto_charge"; +import { getValue, setValue } from "../redis"; const FREE_CREDITS = 500; /** * If you do not know the subscription_id in the current context, pass subscription_id as undefined. 
*/ -export async function billTeam(team_id: string, subscription_id: string | null | undefined, credits: number) { +export async function billTeam( + team_id: string, + subscription_id: string | null | undefined, + credits: number +) { return withAuth(supaBillTeam)(team_id, subscription_id, credits); } -export async function supaBillTeam(team_id: string, subscription_id: string, credits: number) { +export async function supaBillTeam( + team_id: string, + subscription_id: string, + credits: number +) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } Logger.info(`Billing team ${team_id} for ${credits} credits`); - const { data, error } = - await supabase_service.rpc("bill_team", { _team_id: team_id, sub_id: subscription_id ?? null, fetch_subscription: subscription_id === undefined, credits }); - + const { data, error } = await supabase_service.rpc("bill_team", { + _team_id: team_id, + sub_id: subscription_id ?? null, + fetch_subscription: subscription_id === undefined, + credits, + }); + if (error) { Sentry.captureException(error); Logger.error("Failed to bill team: " + JSON.stringify(error)); @@ -31,48 +47,109 @@ export async function supaBillTeam(team_id: string, subscription_id: string, cre } (async () => { - for (const apiKey of (data ?? []).map(x => x.api_key)) { - await setCachedACUC(apiKey, acuc => (acuc ? { - ...acuc, - credits_used: acuc.credits_used + credits, - adjusted_credits_used: acuc.adjusted_credits_used + credits, - remaining_credits: acuc.remaining_credits - credits, - } : null)); + for (const apiKey of (data ?? []).map((x) => x.api_key)) { + await setCachedACUC(apiKey, (acuc) => + acuc + ? { + ...acuc, + credits_used: acuc.credits_used + credits, + adjusted_credits_used: acuc.adjusted_credits_used + credits, + remaining_credits: acuc.remaining_credits - credits, + } + : null + ); } })(); } -export async function checkTeamCredits(chunk: AuthCreditUsageChunk, team_id: string, credits: number) { - return withAuth(supaCheckTeamCredits)(chunk, team_id, credits); +export async function checkTeamCredits( + chunk: AuthCreditUsageChunk, + team_id: string, + credits: number +): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> { + const result = await withAuth(supaCheckTeamCredits)(chunk, team_id, credits); + return { + success: result.success, + message: result.message, + remainingCredits: result.remainingCredits, + chunk: chunk // Ensure chunk is always returned + }; } // if team has enough credits for the operation, return true, else return false -export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: string, credits: number) { +export async function supaCheckTeamCredits( + chunk: AuthCreditUsageChunk, + team_id: string, + credits: number +) { // WARNING: chunk will be null if team_id is preview -- do not perform operations on it under ANY circumstances - mogery if (team_id === "preview") { - return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity }; + return { + success: true, + message: "Preview team, no credits used", + remainingCredits: Infinity, + }; } const creditsWillBeUsed = chunk.adjusted_credits_used + credits; // In case chunk.price_credits is undefined, set it to a large number to avoid mistakes - const totalPriceCredits = chunk.price_credits ?? 100000000; + const totalPriceCredits = chunk.total_credits_sum ?? 
100000000; // Removal of + credits const creditUsagePercentage = chunk.adjusted_credits_used / totalPriceCredits; + let isAutoRechargeEnabled = false, autoRechargeThreshold = 1000; + const cacheKey = `team_auto_recharge_${team_id}`; + let cachedData = await getValue(cacheKey); + if (cachedData) { + const parsedData = JSON.parse(cachedData); + isAutoRechargeEnabled = parsedData.auto_recharge; + autoRechargeThreshold = parsedData.auto_recharge_threshold; + } else { + const { data, error } = await supabase_service + .from("teams") + .select("auto_recharge, auto_recharge_threshold") + .eq("id", team_id) + .single(); + + if (data) { + isAutoRechargeEnabled = data.auto_recharge; + autoRechargeThreshold = data.auto_recharge_threshold; + await setValue(cacheKey, JSON.stringify(data), 300); // Cache for 5 minutes (300 seconds) + } + } + + if (isAutoRechargeEnabled && chunk.remaining_credits < autoRechargeThreshold) { + const autoChargeResult = await autoCharge(chunk, autoRechargeThreshold); + if (autoChargeResult.success) { + return { + success: true, + message: autoChargeResult.message, + remainingCredits: autoChargeResult.remainingCredits, + chunk: autoChargeResult.chunk, + }; + } + } + // Compare the adjusted total credits used with the credits allowed by the plan if (creditsWillBeUsed > totalPriceCredits) { // Only notify if their actual credits (not what they will use) used is greater than the total price credits - if(chunk.adjusted_credits_used > totalPriceCredits) { + if (chunk.adjusted_credits_used > totalPriceCredits) { sendNotification( team_id, - NotificationType.LIMIT_REACHED, - chunk.sub_current_period_start, - chunk.sub_current_period_end, - chunk - ); - } - return { success: false, message: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk }; + NotificationType.LIMIT_REACHED, + chunk.sub_current_period_start, + chunk.sub_current_period_end, + chunk + ); + } + return { + success: false, + message: + "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", + remainingCredits: chunk.remaining_credits, + chunk, + }; } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { // Send email notification for approaching credit limit sendNotification( @@ -84,7 +161,12 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: ); } - return { success: true, message: "Sufficient credits available", remainingCredits: chunk.remaining_credits, chunk }; + return { + success: true, + message: "Sufficient credits available", + remainingCredits: chunk.remaining_credits, + chunk, + }; } // Count the total credits used by a team within the current billing period and return the remaining credits. 
diff --git a/apps/api/src/services/billing/issue_credits.ts b/apps/api/src/services/billing/issue_credits.ts new file mode 100644 index 00000000..6b34b2ed --- /dev/null +++ b/apps/api/src/services/billing/issue_credits.ts @@ -0,0 +1,20 @@ +import { Logger } from "../../lib/logger"; +import { supabase_service } from "../supabase"; + +export async function issueCredits(team_id: string, credits: number) { + // Add an entry to supabase coupons + const { data, error } = await supabase_service.from("coupons").insert({ + team_id: team_id, + credits: credits, + status: "active", + // indicates that this coupon was issued from auto recharge + from_auto_recharge: true, + }); + + if (error) { + Logger.error(`Error adding coupon: ${error}`); + return false; + } + + return true; +} diff --git a/apps/api/src/services/billing/stripe.ts b/apps/api/src/services/billing/stripe.ts new file mode 100644 index 00000000..f1400804 --- /dev/null +++ b/apps/api/src/services/billing/stripe.ts @@ -0,0 +1,51 @@ +import { Logger } from "../../lib/logger"; +import Stripe from "stripe"; + +const stripe = new Stripe(process.env.STRIPE_SECRET_KEY ?? ""); + +async function getCustomerDefaultPaymentMethod(customerId: string) { + const paymentMethods = await stripe.customers.listPaymentMethods(customerId, { + limit: 3, + }); + return paymentMethods.data[0]?.id; +} + +type ReturnStatus = "succeeded" | "requires_action" | "failed"; +export async function createPaymentIntent( + team_id: string, + customer_id: string +): Promise<{ return_status: ReturnStatus; charge_id: string }> { + try { + const paymentIntent = await stripe.paymentIntents.create({ + amount: 1100, + currency: "usd", + customer: customer_id, + description: "Firecrawl: Auto re-charge of 1000 credits", + payment_method_types: ["card"], + payment_method: await getCustomerDefaultPaymentMethod(customer_id), + off_session: true, + confirm: true, + }); + + if (paymentIntent.status === "succeeded") { + Logger.info(`Payment succeeded for team: ${team_id}`); + return { return_status: "succeeded", charge_id: paymentIntent.id }; + } else if ( + paymentIntent.status === "requires_action" || + paymentIntent.status === "processing" || + paymentIntent.status === "requires_capture" + ) { + Logger.warn(`Payment requires further action for team: ${team_id}`); + return { return_status: "requires_action", charge_id: paymentIntent.id }; + } else { + Logger.error(`Payment failed for team: ${team_id}`); + return { return_status: "failed", charge_id: paymentIntent.id }; + } + } catch (error) { + Logger.error( + `Failed to create or confirm PaymentIntent for team: ${team_id}` + ); + console.error(error); + return { return_status: "failed", charge_id: "" }; + } +} diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index cf02892e..1eceb06b 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -24,6 +24,14 @@ const emailTemplates: Record< subject: "Rate Limit Reached - Firecrawl", html: "Hey there,

You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our pricing page for more info.<br/><br/>If you have any questions, feel free to reach out to us at hello@firecrawl.com<br/><br/><br/>Thanks,<br/>Firecrawl Team<br/><br/>
Ps. this email is only sent once every 7 days if you reach a rate limit.", }, + [NotificationType.AUTO_RECHARGE_SUCCESS]: { + subject: "Auto recharge successful - Firecrawl", + html: "Hey there,

Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold.<br/><br/><br/>Thanks,<br/>Firecrawl Team
", + }, + [NotificationType.AUTO_RECHARGE_FAILED]: { + subject: "Auto recharge failed - Firecrawl", + html: "Hey there,

Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at hello@firecrawl.com<br/><br/><br/>Thanks,<br/>Firecrawl Team
", + }, }; export async function sendNotification( @@ -31,18 +39,20 @@ export async function sendNotification( notificationType: NotificationType, startDateString: string, endDateString: string, - chunk: AuthCreditUsageChunk + chunk: AuthCreditUsageChunk, + bypassRecentChecks: boolean = false ) { return withAuth(sendNotificationInternal)( team_id, notificationType, startDateString, endDateString, - chunk + chunk, + bypassRecentChecks ); } -async function sendEmailNotification( +export async function sendEmailNotification( email: string, notificationType: NotificationType, ) { @@ -72,90 +82,92 @@ export async function sendNotificationInternal( notificationType: NotificationType, startDateString: string, endDateString: string, - chunk: AuthCreditUsageChunk + chunk: AuthCreditUsageChunk, + bypassRecentChecks: boolean = false ): Promise<{ success: boolean }> { if (team_id === "preview") { return { success: true }; } - const fifteenDaysAgo = new Date(); - fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15); + if (!bypassRecentChecks) { + const fifteenDaysAgo = new Date(); + fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15); - const { data, error } = await supabase_service - .from("user_notifications") - .select("*") - .eq("team_id", team_id) - .eq("notification_type", notificationType) - .gte("sent_date", fifteenDaysAgo.toISOString()); - - if (error) { - Logger.debug(`Error fetching notifications: ${error}`); - return { success: false }; - } - - if (data.length !== 0) { - // Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} in the last 15 days`); - return { success: false }; - } - - const { data: recentData, error: recentError } = await supabase_service - .from("user_notifications") - .select("*") - .eq("team_id", team_id) - .eq("notification_type", notificationType) - .gte("sent_date", startDateString) - .lte("sent_date", endDateString); - - if (recentError) { - Logger.debug(`Error fetching recent notifications: ${recentError}`); - return { success: false }; - } - - if (recentData.length !== 0) { - // Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} within the specified date range`); - return { success: false }; - } else { - console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`); - // get the emails from the user with the team_id - const { data: emails, error: emailsError } = await supabase_service - .from("users") - .select("email") - .eq("team_id", team_id); - - if (emailsError) { - Logger.debug(`Error fetching emails: ${emailsError}`); - return { success: false }; - } - - for (const email of emails) { - await sendEmailNotification(email.email, notificationType); - } - - const { error: insertError } = await supabase_service + const { data, error } = await supabase_service .from("user_notifications") - .insert([ - { - team_id: team_id, - notification_type: notificationType, - sent_date: new Date().toISOString(), - }, - ]); + .select("*") + .eq("team_id", team_id) + .eq("notification_type", notificationType) + .gte("sent_date", fifteenDaysAgo.toISOString()); - if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) { - sendSlackWebhook( - `${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. 
Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`, - false, - process.env.SLACK_ADMIN_WEBHOOK_URL - ).catch((error) => { - Logger.debug(`Error sending slack notification: ${error}`); - }); - } - - if (insertError) { - Logger.debug(`Error inserting notification record: ${insertError}`); + if (error) { + Logger.debug(`Error fetching notifications: ${error}`); return { success: false }; } - return { success: true }; + if (data.length !== 0) { + return { success: false }; + } + + const { data: recentData, error: recentError } = await supabase_service + .from("user_notifications") + .select("*") + .eq("team_id", team_id) + .eq("notification_type", notificationType) + .gte("sent_date", startDateString) + .lte("sent_date", endDateString); + + if (recentError) { + Logger.debug(`Error fetching recent notifications: ${recentError}`); + return { success: false }; + } + + if (recentData.length !== 0) { + return { success: false }; + } + } + + console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`); + // get the emails from the user with the team_id + const { data: emails, error: emailsError } = await supabase_service + .from("users") + .select("email") + .eq("team_id", team_id); + + if (emailsError) { + Logger.debug(`Error fetching emails: ${emailsError}`); + return { success: false }; + } + + for (const email of emails) { + await sendEmailNotification(email.email, notificationType); + } + + const { error: insertError } = await supabase_service + .from("user_notifications") + .insert([ + { + team_id: team_id, + notification_type: notificationType, + sent_date: new Date().toISOString(), + }, + ]); + + if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) { + sendSlackWebhook( + `${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. 
Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`, + false, + process.env.SLACK_ADMIN_WEBHOOK_URL + ).catch((error) => { + Logger.debug(`Error sending slack notification: ${error}`); + }); + } + + if (insertError) { + Logger.debug(`Error inserting notification record: ${insertError}`); + return { success: false }; + } + + return { success: true }; } diff --git a/apps/api/src/services/notification/notification_string.ts b/apps/api/src/services/notification/notification_string.ts index 8369a0ca..72bc60c4 100644 --- a/apps/api/src/services/notification/notification_string.ts +++ b/apps/api/src/services/notification/notification_string.ts @@ -11,6 +11,10 @@ export function getNotificationString( return "Limit reached (100%)"; case NotificationType.RATE_LIMIT_REACHED: return "Rate limit reached"; + case NotificationType.AUTO_RECHARGE_SUCCESS: + return "Auto-recharge successful"; + case NotificationType.AUTO_RECHARGE_FAILED: + return "Auto-recharge failed"; default: return "Unknown notification type"; } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index a03176da..b43aa02c 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -130,6 +130,8 @@ export enum NotificationType { APPROACHING_LIMIT = "approachingLimit", LIMIT_REACHED = "limitReached", RATE_LIMIT_REACHED = "rateLimitReached", + AUTO_RECHARGE_SUCCESS = "autoRechargeSuccess", + AUTO_RECHARGE_FAILED = "autoRechargeFailed", } export type ScrapeLog = { From d375bca167550e762bd8b426d978ee72098fffb6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 22 Oct 2024 20:42:59 -0300 Subject: [PATCH 074/102] Update acuc-cache-clear.ts --- .../controllers/v0/admin/acuc-cache-clear.ts | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts index 3ef1f7fb..876ca98a 100644 --- a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts +++ b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts @@ -1,15 +1,22 @@ import { Request, Response } from "express"; import { supabase_service } from "../../../services/supabase"; import { clearACUC } from "../../auth"; +import { Logger } from "../../../lib/logger"; export async function acucCacheClearController(req: Request, res: Response) { + try { const team_id: string = req.body.team_id; - const keys = await supabase_service.from("api_keys") - .select("*") - .eq("team_id", team_id); - - await Promise.all(keys.data.map(x => clearACUC(x.key))); + const keys = await supabase_service + .from("api_keys") + .select("*") + .eq("team_id", team_id); + + await Promise.all(keys.data.map((x) => clearACUC(x.key))); res.json({ ok: true }); + } catch (error) { + Logger.error(`Error clearing ACUC cache via API route: ${error}`); + res.status(500).json({ error: "Internal server error" }); + } } From 70c4e7c334af199f68ed21559e8d1f3cc0e6179a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 23 Oct 2024 19:42:02 +0200 Subject: [PATCH 075/102] feat(bulk/scrape): check credits via url list length --- apps/api/src/routes/v1.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 4bebcbb7..2984faaf 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -30,7 +30,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R return (req, res, next) => { (async () => { if (!minimum 
&& req.body) { - minimum = (req.body as any)?.limit ?? 1; + minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1; } const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum); req.acuc = chunk; From d8abd157164c4b9b4935961c0d2b07857119d8f1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 15:37:24 -0300 Subject: [PATCH 076/102] Nick: from bulk to batch --- .../v1/{bulk-scrape.ts => batch-scrape.ts} | 12 ++--- apps/api/src/controllers/v1/crawl-status.ts | 4 +- apps/api/src/controllers/v1/types.ts | 4 +- apps/api/src/routes/v1.ts | 11 ++-- apps/js-sdk/firecrawl/README.md | 18 +++---- apps/js-sdk/firecrawl/src/index.ts | 34 ++++++------ apps/python-sdk/README.md | 30 +++++------ apps/python-sdk/firecrawl/firecrawl.py | 54 +++++++++---------- 8 files changed, 84 insertions(+), 83 deletions(-) rename apps/api/src/controllers/v1/{bulk-scrape.ts => batch-scrape.ts} (87%) diff --git a/apps/api/src/controllers/v1/bulk-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts similarity index 87% rename from apps/api/src/controllers/v1/bulk-scrape.ts rename to apps/api/src/controllers/v1/batch-scrape.ts index 3e1afbd0..7c68341b 100644 --- a/apps/api/src/controllers/v1/bulk-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -1,8 +1,8 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; import { - BulkScrapeRequest, - bulkScrapeRequestSchema, + BatchScrapeRequest, + batchScrapeRequestSchema, CrawlResponse, legacyScrapeOptions, RequestWithAuth, @@ -17,11 +17,11 @@ import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; import { getJobPriority } from "../../lib/job-priority"; -export async function bulkScrapeController( - req: RequestWithAuth<{}, CrawlResponse, BulkScrapeRequest>, +export async function batchScrapeController( + req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>, res: Response ) { - req.body = bulkScrapeRequestSchema.parse(req.body); + req.body = batchScrapeRequestSchema.parse(req.body); const id = uuidv4(); @@ -92,7 +92,7 @@ export async function bulkScrapeController( return res.status(200).json({ success: true, id, - url: `${protocol}://${req.get("host")}/v1/bulk/scrape/${id}`, + url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`, }); } diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index b753b17b..a8d78293 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) { return jobs; } -export async function crawlStatusController(req: RequestWithAuth, res: Response) { +export async function crawlStatusController(req: RequestWithAuth, res: Response, isBatch = false) { const sc = await getCrawl(req.params.jobId); if (!sc) { return res.status(404).json({ success: false, error: "Job not found" }); @@ -113,7 +113,7 @@ export async function crawlStatusController(req: RequestWithAuth x.returnvalue); const protocol = process.env.ENV === "local" ? req.protocol : "https"; - const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`); + const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? 
"batch/scrape" : "crawl"}/${req.params.jobId}`); nextURL.searchParams.set("skip", (start + data.length).toString()); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 4938b074..9705b855 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -144,7 +144,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({ export type ScrapeRequest = z.infer; -export const bulkScrapeRequestSchema = scrapeOptions.extend({ +export const batchScrapeRequestSchema = scrapeOptions.extend({ urls: url.array(), origin: z.string().optional().default("api"), }).strict(strictMessage).refine( @@ -163,7 +163,7 @@ export const bulkScrapeRequestSchema = scrapeOptions.extend({ return obj; }); -export type BulkScrapeRequest = z.infer; +export type BatchScrapeRequest = z.infer; const crawlerOptions = z.object({ includePaths: z.string().array().default([]), diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 2984faaf..4e4b6052 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -17,7 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel"; import { Logger } from "../lib/logger"; import { scrapeStatusController } from "../controllers/v1/scrape-status"; import { concurrencyCheckController } from "../controllers/v1/concurrency-check"; -import { bulkScrapeController } from "../controllers/v1/bulk-scrape"; +import { batchScrapeController } from "../controllers/v1/batch-scrape"; // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; // import { searchController } from "../../src/controllers/v1/search"; @@ -124,12 +124,12 @@ v1Router.post( ); v1Router.post( - "/bulk/scrape", + "/batch/scrape", authMiddleware(RateLimiterMode.Crawl), checkCreditsMiddleware(), blocklistMiddleware, idempotencyMiddleware, - wrap(bulkScrapeController) + wrap(batchScrapeController) ); v1Router.post( @@ -147,9 +147,10 @@ v1Router.get( ); v1Router.get( - "/bulk/scrape/:jobId", + "/batch/scrape/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), - wrap(crawlStatusController) + // Yes, it uses the same controller as the normal crawl status controller + wrap((req:any, res):any => crawlStatusController(req, res, true)) ); v1Router.get( diff --git a/apps/js-sdk/firecrawl/README.md b/apps/js-sdk/firecrawl/README.md index 1655d2ee..a90907ba 100644 --- a/apps/js-sdk/firecrawl/README.md +++ b/apps/js-sdk/firecrawl/README.md @@ -145,32 +145,32 @@ watch.addEventListener("done", state => { }); ``` -### Bulk scraping multiple URLs +### Batch scraping multiple URLs -To bulk scrape multiple URLs with error handling, use the `bulkScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats. +To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats. 
```js -const bulkScrapeResponse = await app.bulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { +const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'], }) ``` -#### Asynchronous bulk scrape +#### Asynchronous batch scrape -To initiate an asynchronous bulk scrape, utilize the `asyncBulkScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the bulk scrape. +To initiate an asynchronous batch scrape, utilize the `asyncBulkScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the batch scrape. ```js const asyncBulkScrapeResult = await app.asyncBulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); ``` -#### Bulk scrape with WebSockets +#### Batch scrape with WebSockets -To use bulk scrape with WebSockets, use the `bulkScrapeUrlsAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the bulk scrape job, such as the output formats. +To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats. ```js -// Bulk scrape multiple URLs with WebSockets: -const watch = await app.bulkScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); +// Batch scrape multiple URLs with WebSockets: +const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); watch.addEventListener("document", doc => { console.log("DOC", doc.detail); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 30797c34..e9985683 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -494,14 +494,14 @@ export default class FirecrawlApp { } /** - * Initiates a bulk scrape job for multiple URLs using the Firecrawl API. + * Initiates a batch scrape job for multiple URLs using the Firecrawl API. * @param url - The URLs to scrape. * @param params - Additional parameters for the scrape request. * @param pollInterval - Time in seconds for job status checks. * @param idempotencyKey - Optional idempotency key for the request. * @returns The response from the crawl operation. */ - async bulkScrapeUrls( + async batchScrapeUrls( urls: string[], params?: ScrapeParams, pollInterval: number = 2, @@ -511,7 +511,7 @@ export default class FirecrawlApp { let jsonData: any = { urls, ...(params ?? 
{}) }; try { const response: AxiosResponse = await this.postRequest( - this.apiUrl + `/v1/bulk/scrape`, + this.apiUrl + `/v1/batch/scrape`, jsonData, headers ); @@ -519,7 +519,7 @@ export default class FirecrawlApp { const id: string = response.data.id; return this.monitorJobStatus(id, headers, pollInterval); } else { - this.handleError(response, "start bulk scrape job"); + this.handleError(response, "start batch scrape job"); } } catch (error: any) { if (error.response?.data?.error) { @@ -531,7 +531,7 @@ export default class FirecrawlApp { return { success: false, error: "Internal server error." }; } - async asyncBulkScrapeUrls( + async asyncBatchScrapeUrls( urls: string[], params?: ScrapeParams, idempotencyKey?: string @@ -540,14 +540,14 @@ export default class FirecrawlApp { let jsonData: any = { urls, ...(params ?? {}) }; try { const response: AxiosResponse = await this.postRequest( - this.apiUrl + `/v1/bulk/scrape`, + this.apiUrl + `/v1/batch/scrape`, jsonData, headers ); if (response.status === 200) { return response.data; } else { - this.handleError(response, "start bulk scrape job"); + this.handleError(response, "start batch scrape job"); } } catch (error: any) { if (error.response?.data?.error) { @@ -560,42 +560,42 @@ export default class FirecrawlApp { } /** - * Initiates a bulk scrape job and returns a CrawlWatcher to monitor the job via WebSocket. + * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket. * @param urls - The URL to scrape. * @param params - Additional parameters for the scrape request. * @param idempotencyKey - Optional idempotency key for the request. * @returns A CrawlWatcher instance to monitor the crawl job. */ - async bulkScrapeUrlsAndWatch( + async batchScrapeUrlsAndWatch( urls: string[], params?: ScrapeParams, idempotencyKey?: string, ) { - const crawl = await this.asyncBulkScrapeUrls(urls, params, idempotencyKey); + const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey); if (crawl.success && crawl.id) { const id = crawl.id; return new CrawlWatcher(id, this); } - throw new FirecrawlError("Bulk scrape job failed to start", 400); + throw new FirecrawlError("Batch scrape job failed to start", 400); } /** - * Checks the status of a bulk scrape job using the Firecrawl API. - * @param id - The ID of the bulk scrape operation. + * Checks the status of a batch scrape job using the Firecrawl API. + * @param id - The ID of the batch scrape operation. * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) * @returns The response containing the job status. 
*/ - async checkBulkScrapeStatus(id?: string, getAllData = false): Promise { + async checkBatchScrapeStatus(id?: string, getAllData = false): Promise { if (!id) { - throw new FirecrawlError("No bulk scrape ID provided", 400); + throw new FirecrawlError("No batch scrape ID provided", 400); } const headers: AxiosRequestHeaders = this.prepareHeaders(); try { const response: AxiosResponse = await this.getRequest( - `${this.apiUrl}/v1/bulk/scrape/${id}`, + `${this.apiUrl}/v1/batch/scrape/${id}`, headers ); if (response.status === 200) { @@ -623,7 +623,7 @@ export default class FirecrawlApp { error: response.data.error, }) } else { - this.handleError(response, "check bulk scrape status"); + this.handleError(response, "check batch scrape status"); } } catch (error: any) { throw new FirecrawlError(error.message, 500); diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md index 6a7d4a0a..412c3e05 100644 --- a/apps/python-sdk/README.md +++ b/apps/python-sdk/README.md @@ -149,37 +149,37 @@ async def start_crawl_and_watch(): await start_crawl_and_watch() ``` -### Scraping multiple URLs in bulk +### Scraping multiple URLs in batch -To bulk scrape multiple URLs, use the `bulk_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper such as the output formats. +To batch scrape multiple URLs, use the `batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper such as the output formats. ```python idempotency_key = str(uuid.uuid4()) # optional idempotency key -bulk_scrape_result = app.bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key) -print(bulk_scrape_result) +batch_scrape_result = app.batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key) +print(batch_scrape_result) ``` -### Asynchronous bulk scrape +### Asynchronous batch scrape -To run a bulk scrape asynchronously, use the `async_bulk_scrape_urls` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. +To run a batch scrape asynchronously, use the `async_batch_scrape_urls` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. ```python -bulk_scrape_result = app.async_bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) -print(bulk_scrape_result) +batch_scrape_result = app.async_batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) +print(batch_scrape_result) ``` -### Checking bulk scrape status +### Checking batch scrape status -To check the status of an asynchronous bulk scrape job, use the `check_bulk_scrape_job` method. It takes the job ID as a parameter and returns the current status of the bulk scrape job. +To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_job` method. It takes the job ID as a parameter and returns the current status of the batch scrape job. 
```python -id = bulk_scrape_result['id'] -status = app.check_bulk_scrape_job(id) +id = batch_scrape_result['id'] +status = app.check_batch_scrape_job(id) ``` -### Bulk scrape with WebSockets +### Batch scrape with WebSockets -To use bulk scrape with WebSockets, use the `bulk_scrape_urls_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. +To use batch scrape with WebSockets, use the `batch_scrape_urls_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats. ```python # inside an async function... @@ -198,7 +198,7 @@ def on_done(detail): # Function to start the crawl and watch process async def start_crawl_and_watch(): # Initiate the crawl job and get the watcher - watcher = app.bulk_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) + watcher = app.batch_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}) # Add event listeners watcher.add_event_listener("document", on_document) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 3b8e39e0..1986ddd2 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -275,12 +275,12 @@ class FirecrawlApp: else: self._handle_error(response, 'map') - def bulk_scrape_urls(self, urls: list[str], + def batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, poll_interval: Optional[int] = 2, idempotency_key: Optional[str] = None) -> Any: """ - Initiate a bulk scrape job for the specified URLs using the Firecrawl API. + Initiate a batch scrape job for the specified URLs using the Firecrawl API. Args: urls (list[str]): The URLs to scrape. @@ -290,18 +290,18 @@ class FirecrawlApp: Returns: Dict[str, Any]: A dictionary containing the scrape results. The structure includes: - - 'success' (bool): Indicates if the bulk scrape was successful. - - 'status' (str): The final status of the bulk scrape job (e.g., 'completed'). + - 'success' (bool): Indicates if the batch scrape was successful. + - 'status' (str): The final status of the batch scrape job (e.g., 'completed'). - 'completed' (int): Number of scraped pages that completed. - 'total' (int): Total number of scraped pages. - - 'creditsUsed' (int): Estimated number of API credits used for this bulk scrape. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the bulk scrape data expires. + - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape. + - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires. - 'data' (List[Dict]): List of all the scraped pages. Raises: - Exception: If the bulk scrape job initiation or monitoring fails. + Exception: If the batch scrape job initiation or monitoring fails. 
""" - endpoint = f'/v1/bulk/scrape' + endpoint = f'/v1/batch/scrape' headers = self._prepare_headers(idempotency_key) json_data = {'urls': urls} if params: @@ -312,10 +312,10 @@ class FirecrawlApp: return self._monitor_job_status(id, headers, poll_interval) else: - self._handle_error(response, 'start bulk scrape job') + self._handle_error(response, 'start batch scrape job') - def async_bulk_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: """ Initiate a crawl job asynchronously. @@ -325,12 +325,12 @@ class FirecrawlApp: idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. Returns: - Dict[str, Any]: A dictionary containing the bulk scrape initiation response. The structure includes: - - 'success' (bool): Indicates if the bulk scrape initiation was successful. - - 'id' (str): The unique identifier for the bulk scrape job. - - 'url' (str): The URL to check the status of the bulk scrape job. + Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes: + - 'success' (bool): Indicates if the batch scrape initiation was successful. + - 'id' (str): The unique identifier for the batch scrape job. + - 'url' (str): The URL to check the status of the batch scrape job. """ - endpoint = f'/v1/bulk/scrape' + endpoint = f'/v1/batch/scrape' headers = self._prepare_headers(idempotency_key) json_data = {'urls': urls} if params: @@ -339,11 +339,11 @@ class FirecrawlApp: if response.status_code == 200: return response.json() else: - self._handle_error(response, 'start bulk scrape job') + self._handle_error(response, 'start batch scrape job') - def bulk_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': """ - Initiate a bulk scrape job and return a CrawlWatcher to monitor the job via WebSocket. + Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket. Args: urls (list[str]): The URLs to scrape. @@ -351,28 +351,28 @@ class FirecrawlApp: idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. Returns: - CrawlWatcher: An instance of CrawlWatcher to monitor the bulk scrape job. + CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job. """ - crawl_response = self.async_bulk_scrape_urls(urls, params, idempotency_key) + crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key) if crawl_response['success'] and 'id' in crawl_response: return CrawlWatcher(crawl_response['id'], self) else: - raise Exception("Bulk scrape job failed to start") + raise Exception("Batch scrape job failed to start") - def check_bulk_scrape_status(self, id: str) -> Any: + def check_batch_scrape_status(self, id: str) -> Any: """ - Check the status of a bulk scrape job using the Firecrawl API. + Check the status of a batch scrape job using the Firecrawl API. Args: - id (str): The ID of the bulk scrape job. + id (str): The ID of the batch scrape job. Returns: - Any: The status of the bulk scrape job. + Any: The status of the batch scrape job. Raises: Exception: If the status check request fails. 
""" - endpoint = f'/v1/bulk/scrape/{id}' + endpoint = f'/v1/batch/scrape/{id}' headers = self._prepare_headers() response = self._get_request(f'{self.api_url}{endpoint}', headers) @@ -390,7 +390,7 @@ class FirecrawlApp: 'error': data.get('error') } else: - self._handle_error(response, 'check bulk scrape status') + self._handle_error(response, 'check batch scrape status') def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ From 60b6e6b1d4bdfcec635e3a2c55f5386a13a4b05d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 15:59:40 -0300 Subject: [PATCH 077/102] Nick: fixes --- apps/js-sdk/firecrawl/package.json | 4 ++-- apps/js-sdk/firecrawl/src/index.ts | 32 +++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e50205d5..0dfb4d69 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { - "name": "firecrawl", - "version": "1.6.1", + "name": "@mendable/firecrawl-js", + "version": "1.7.0-beta.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index e9985683..3e9c7bdf 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -154,6 +154,17 @@ export interface CrawlResponse { error?: string; } +/** + * Response interface for crawling operations. + * Defines the structure of the response received after initiating a crawl. + */ +export interface BatchScrapeResponse { + id?: string; + url?: string; + success: true; + error?: string; +} + /** * Response interface for job status checks. * Provides detailed status of a crawl job including progress and results. @@ -169,6 +180,21 @@ export interface CrawlStatusResponse { data: FirecrawlDocument[]; }; +/** + * Response interface for job status checks. + * Provides detailed status of a crawl job including progress and results. + */ +export interface BatchScrapeStatusResponse { + success: true; + status: "scraping" | "completed" | "failed" | "cancelled"; + completed: number; + total: number; + creditsUsed: number; + expiresAt: Date; + next?: string; + data: FirecrawlDocument[]; +}; + /** * Parameters for mapping operations. * Defines options for mapping URLs during a crawl. @@ -506,7 +532,7 @@ export default class FirecrawlApp { params?: ScrapeParams, pollInterval: number = 2, idempotencyKey?: string - ): Promise { + ): Promise { const headers = this.prepareHeaders(idempotencyKey); let jsonData: any = { urls, ...(params ?? {}) }; try { @@ -535,7 +561,7 @@ export default class FirecrawlApp { urls: string[], params?: ScrapeParams, idempotencyKey?: string - ): Promise { + ): Promise { const headers = this.prepareHeaders(idempotencyKey); let jsonData: any = { urls, ...(params ?? {}) }; try { @@ -587,7 +613,7 @@ export default class FirecrawlApp { * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) * @returns The response containing the job status. 
*/ - async checkBatchScrapeStatus(id?: string, getAllData = false): Promise { + async checkBatchScrapeStatus(id?: string, getAllData = false): Promise { if (!id) { throw new FirecrawlError("No batch scrape ID provided", 400); } From c7f217098075a974c6a274cad04e8f8111cd8379 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 16:04:46 -0300 Subject: [PATCH 078/102] Update example.py --- apps/python-sdk/example.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index 02c06288..e7c80b30 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -9,6 +9,23 @@ app = FirecrawlApp(api_key="fc-") scrape_result = app.scrape_url('firecrawl.dev') print(scrape_result['markdown']) + +# Test batch scrape +urls = ['https://example.com', 'https://docs.firecrawl.dev'] +batch_scrape_params = { + 'formats': ['markdown', 'html'], +} + +# Synchronous batch scrape +batch_result = app.batch_scrape_urls(urls, batch_scrape_params) +print("Synchronous Batch Scrape Result:") +print(batch_result['data'][0]['markdown']) + +# Asynchronous batch scrape +async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params) +print("\nAsynchronous Batch Scrape Result:") +print(async_batch_result) + # Crawl a website: idempotency_key = str(uuid.uuid4()) # optional idempotency key crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key) From f0054da934a6965450d8cb902afd32dd1f1e3715 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 16:06:08 -0300 Subject: [PATCH 079/102] Nick: lgtm --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/package-lock.json | 22 ++++------------------ apps/js-sdk/package.json | 2 +- apps/python-sdk/firecrawl/__init__.py | 2 +- 4 files changed, 7 insertions(+), 21 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 0dfb4d69..16f1b595 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.7.0-beta.2", + "version": "1.7.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index 975b14e8..3bba385f 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -9,7 +9,7 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^1.0.3", + "@mendable/firecrawl-js": "^1.7.0-beta.2", "axios": "^1.6.8", "firecrawl": "^1.2.0", "ts-node": "^10.9.2", @@ -423,31 +423,17 @@ } }, "node_modules/@mendable/firecrawl-js": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz", - "integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==", + "version": "1.7.0-beta.2", + "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.7.0-beta.2.tgz", + "integrity": "sha512-6L5r6BOuMPjLgSDq85xs2IpVgX9Tb/EdesKZvmtFucoaFZzIsgCQb0ZfSvwaRmqTkj53o+7eSgCcm+gsnR/yeQ==", "dependencies": { "axios": "^1.6.8", - "dotenv": "^16.4.5", "isows": "^1.0.4", "typescript-event-target": "^1.1.1", - "uuid": "^9.0.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" } }, - "node_modules/@mendable/firecrawl-js/node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - 
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "bin": { - "uuid": "dist/bin/uuid" - } - }, "node_modules/@tsconfig/node10": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz", diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index ac3ef038..6324707f 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -11,7 +11,7 @@ "author": "", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "^1.0.3", + "@mendable/firecrawl-js": "1.7.0-beta.2", "axios": "^1.6.8", "firecrawl": "^1.2.0", "ts-node": "^10.9.2", diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 5700a3e6..82c73348 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "1.3.1" +__version__ = "1.4.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From b11035814a6c3ef9809a9e1c28544872e7b4c3c9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 16:10:21 -0300 Subject: [PATCH 080/102] Nick: --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 16f1b595..a7fb2d83 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.7.0", + "version": "1.7.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 3e9c7bdf..491df1e4 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -155,7 +155,7 @@ export interface CrawlResponse { } /** - * Response interface for crawling operations. + * Response interface for batch scrape operations. * Defines the structure of the response received after initiating a crawl. */ export interface BatchScrapeResponse { @@ -181,8 +181,8 @@ export interface CrawlStatusResponse { }; /** - * Response interface for job status checks. - * Provides detailed status of a crawl job including progress and results. + * Response interface for batch scrape job status checks. + * Provides detailed status of a batch scrape job including progress and results. */ export interface BatchScrapeStatusResponse { success: true; From 19cac2220f20632af1b6d37c697772cdb4bba46a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 23 Oct 2024 16:31:01 -0300 Subject: [PATCH 081/102] Nick: --- apps/api/src/services/queue-worker.ts | 2 +- apps/js-sdk/package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 1ea4775a..1369326d 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -353,7 +353,7 @@ async function processJob(job: Job, token: string) { docs: docs, time_taken: timeTakenInSeconds, team_id: job.data.team_id, - mode: job.data.mode, + mode: job.data.crawlerOptions === null ? 
"batch_scrape" : job.data.mode, url: job.data.url, crawlerOptions: job.data.crawlerOptions, pageOptions: job.data.pageOptions, diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 6324707f..1992de80 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -11,7 +11,7 @@ "author": "", "license": "ISC", "dependencies": { - "@mendable/firecrawl-js": "1.7.0-beta.2", + "@mendable/firecrawl-js": "1.7.1", "axios": "^1.6.8", "firecrawl": "^1.2.0", "ts-node": "^10.9.2", From 1da6360b77fb7a2d18937846c9bf27b29510eb6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 23 Oct 2024 21:55:21 +0200 Subject: [PATCH 082/102] feat(batch/scrape): restructure logs, add webhooks --- apps/api/src/services/queue-worker.ts | 22 ++++++++++++---------- apps/api/src/types.ts | 2 +- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 1369326d..f2c25042 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -329,7 +329,8 @@ async function processJob(job: Job, token: string) { job.id as string, data, job.data.webhook, - job.data.v1 + job.data.v1, + job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page", ); } if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) { @@ -339,7 +340,7 @@ async function processJob(job: Job, token: string) { data, job.data.webhook, job.data.v1, - "crawl.page", + job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page", true ); } @@ -353,7 +354,7 @@ async function processJob(job: Job, token: string) { docs: docs, time_taken: timeTakenInSeconds, team_id: job.data.team_id, - mode: job.data.crawlerOptions === null ? "batch_scrape" : job.data.mode, + mode: job.data.mode, url: job.data.url, crawlerOptions: job.data.crawlerOptions, pageOptions: job.data.pageOptions, @@ -414,7 +415,7 @@ async function processJob(job: Job, token: string) { } } - if (await finishCrawl(job.data.crawl_id) && job.data.crawlerOptions !== null) { + if (await finishCrawl(job.data.crawl_id)) { if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id); @@ -437,7 +438,7 @@ async function processJob(job: Job, token: string) { docs: [], time_taken: (Date.now() - sc.createdAt) / 1000, team_id: job.data.team_id, - mode: "crawl", + mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", url: sc.originUrl, crawlerOptions: sc.crawlerOptions, pageOptions: sc.pageOptions, @@ -467,7 +468,7 @@ async function processJob(job: Job, token: string) { data, job.data.webhook, job.data.v1, - "crawl.completed" + job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed" ); } } else { @@ -485,7 +486,7 @@ async function processJob(job: Job, token: string) { [], job.data.webhook, job.data.v1, - "crawl.completed" + job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed" ); } @@ -497,7 +498,7 @@ async function processJob(job: Job, token: string) { docs: [], time_taken: (Date.now() - sc.createdAt) / 1000, team_id: job.data.team_id, - mode: "crawl", + mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", url: sc.originUrl, crawlerOptions: sc.crawlerOptions, pageOptions: sc.pageOptions, @@ -554,7 +555,8 @@ async function processJob(job: Job, token: string) { job.data.crawl_id ?? (job.id as string), data, job.data.webhook, - job.data.v1 + job.data.v1, + job.data.crawlerOptions !== null ? 
"crawl.page" : "batch_scrape.page", ); } // if (job.data.v1) { @@ -603,7 +605,7 @@ async function processJob(job: Job, token: string) { docs: [], time_taken: 0, team_id: job.data.team_id, - mode: "crawl", + mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", url: sc ? sc.originUrl : job.data.url, crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions, pageOptions: sc ? sc.pageOptions : job.data.pageOptions, diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index a03176da..701d4add 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -159,4 +159,4 @@ export type PlanType = | ""; -export type WebhookEventType = "crawl.page" | "crawl.started" | "crawl.completed" | "crawl.failed"; \ No newline at end of file +export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed"; \ No newline at end of file From 9a4ccd08017fddbec081c6bf52806e181ef25e59 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 24 Oct 2024 14:40:53 -0400 Subject: [PATCH 083/102] Claude Web Crawler with Batch Scrape --- .gitignore | 1 + .../claude_web_crawler/claude_web_crawler.py | 164 ++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 examples/claude_web_crawler/claude_web_crawler.py diff --git a/.gitignore b/.gitignore index dcfd499a..4d35cb4a 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ apps/js-sdk/firecrawl/dist /examples/o1_web_crawler/firecrawl_env /examples/crm_lead_enrichment/crm_lead_enrichment_env /.venv +/examples/claude_web_crawler/firecrawl_env diff --git a/examples/claude_web_crawler/claude_web_crawler.py b/examples/claude_web_crawler/claude_web_crawler.py new file mode 100644 index 00000000..55168f30 --- /dev/null +++ b/examples/claude_web_crawler/claude_web_crawler.py @@ -0,0 +1,164 @@ +import os +from firecrawl import FirecrawlApp +import json +from dotenv import load_dotenv +import anthropic + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") + +# Initialize the FirecrawlApp and OpenAI client +app = FirecrawlApp(api_key=firecrawl_api_key) +client = anthropic.Anthropic(api_key=anthropic_api_key) + +# Find the page that most likely contains the objective +def find_relevant_page_via_map(objective, url, app, client): + try: + print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}") + print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}") + + map_prompt = f""" + The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. + """ + + print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}") + completion = client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=1000, + temperature=0, + system="You are an expert web crawler. 
Respond with the best search parameter.", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": map_prompt + } + ] + } + ] + ) + + map_search_parameter = completion.content[0].text + print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}") + + print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}") + map_website = app.map_url(url, params={"search": map_search_parameter}) + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}") + return map_website['links'] + except Exception as e: + print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}") + return None + +# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None +def find_objective_in_top_pages(map_website, objective, app, client): + try: + # Get top 2 links from the map result + top_links = map_website[:2] + print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}") + + # Scrape the pages in batch + batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']}) + print(f"{Colors.GREEN}Batch page scraping completed successfully.{Colors.RESET}") + + + for scrape_result in batch_scrape_result['data']: + + # Check if objective is met + check_prompt = f""" + Given the following scraped content and objective, determine if the objective is met. + If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible. + If the objective is not met with confidence, respond with 'Objective not met'. + + Objective: {objective} + Scraped content: {scrape_result['markdown']} + + Remember: + 1. Only return JSON if you are confident the objective is fully met. + 2. Keep the JSON structure as simple and flat as possible. + 3. Do not include any explanations or markdown formatting in your response. + """ + + completion = client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=1000, + temperature=0, + system="You are an expert web crawler. Respond with the relevant information in JSON format.", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": check_prompt + } + ] + } + ] + ) + + result = completion.content[0].text + + if result != "Objective not met": + print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}") + try: + return json.loads(result) + except json.JSONDecodeError: + print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}") + else: + print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}") + + print(f"{Colors.RED}All available pages analyzed. 
Objective not fulfilled in examined content.{Colors.RESET}") + return None + + except Exception as e: + print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}") + return None + +# Main function to execute the process +def main(): + # Get user input + url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}") + if not url.strip(): + url = "https://www.firecrawl.dev/" + + objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") + if not objective.strip(): + objective = "find me the pricing plans" + + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") + # Find the relevant page + map_website = find_relevant_page_via_map(objective, url, app, client) + print(map_website) + + if map_website: + print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}") + # Find objective in top pages + result = find_objective_in_top_pages(map_website, objective, app, client) + + if result: + print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}") + print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}") + else: + print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}") + else: + print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}") + +if __name__ == "__main__": + main() From d965f2ce7d9a3519da2e659ccc9ab78956b35030 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 24 Oct 2024 23:13:30 -0300 Subject: [PATCH 084/102] Nick: fixes --- apps/api/src/controllers/auth.ts | 6 +++--- apps/api/src/controllers/v1/types.ts | 2 ++ apps/api/src/services/notification/email_notification.ts | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index c9705f0f..93327e66 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -95,7 +95,7 @@ export async function getACUC( while (retries < maxRetries) { ({ data, error } = await supabase_service.rpc( - "auth_credit_usage_chunk_test_17_credit_pack", + "auth_credit_usage_chunk_test_21_credit_pack", { input_key: api_key } )); @@ -125,8 +125,8 @@ export async function getACUC( if (chunk !== null && useCache) { setCachedACUC(api_key, chunk); } - // Log the chunk for now - console.log(chunk); + + // console.log(chunk); return chunk; } else { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 9705b855..22ac6294 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -362,6 +362,8 @@ export type AuthCreditUsageChunk = { coupons: any[]; adjusted_credits_used: number; // credits this period minus coupons used remaining_credits: number; + sub_user_id: string | null; + total_credits_sum: number; }; export interface RequestWithMaybeACUC< diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index 1eceb06b..a94d34c4 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -109,6 +109,8 @@ export async function sendNotificationInternal( return { success: false }; } + // TODO: observation: Free credits people are not receiving notifications + const { data: recentData, error: recentError } = await supabase_service .from("user_notifications") .select("*") @@ 
-118,7 +120,7 @@ export async function sendNotificationInternal( .lte("sent_date", endDateString); if (recentError) { - Logger.debug(`Error fetching recent notifications: ${recentError}`); + Logger.debug(`Error fetching recent notifications: ${recentError.message}`); return { success: false }; } From 73e6db45debdefbe606c40675cd28620de7e9500 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 24 Oct 2024 23:14:41 -0300 Subject: [PATCH 085/102] Update email_notification.ts --- apps/api/src/services/notification/email_notification.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index a94d34c4..001f164a 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -26,7 +26,7 @@ const emailTemplates: Record< }, [NotificationType.AUTO_RECHARGE_SUCCESS]: { subject: "Auto recharge successful - Firecrawl", - html: "Hey there,

Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold.


Thanks,
Firecrawl Team
", + html: "Hey there,

Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold. Consider upgrading your plan at firecrawl.dev/pricing to avoid hitting the limit.


Thanks,
Firecrawl Team
", }, [NotificationType.AUTO_RECHARGE_FAILED]: { subject: "Auto recharge failed - Firecrawl", From dbcf2d7ff642f5c322ab34304dbaa9580f3307a3 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 24 Oct 2024 23:44:08 -0300 Subject: [PATCH 086/102] Nick: fix loggin for batch scrape --- apps/api/src/services/queue-worker.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index f2c25042..f15aca4e 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -499,7 +499,7 @@ async function processJob(job: Job, token: string) { time_taken: (Date.now() - sc.createdAt) / 1000, team_id: job.data.team_id, mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", - url: sc.originUrl, + url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"), crawlerOptions: sc.crawlerOptions, pageOptions: sc.pageOptions, origin: job.data.origin, From 95c4652fd4002ccee9a4f0bce3e39192ce0ec966 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 25 Oct 2024 16:05:23 -0300 Subject: [PATCH 087/102] Nick: 10min cooldown on auto charge --- apps/api/src/services/billing/auto_charge.ts | 21 +++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/billing/auto_charge.ts b/apps/api/src/services/billing/auto_charge.ts index 6ab6914c..61e2bdb8 100644 --- a/apps/api/src/services/billing/auto_charge.ts +++ b/apps/api/src/services/billing/auto_charge.ts @@ -7,12 +7,13 @@ import { createPaymentIntent } from "./stripe"; import { issueCredits } from "./issue_credits"; import { sendNotification } from "../notification/email_notification"; import { NotificationType } from "../../types"; -import { deleteKey } from "../redis"; +import { deleteKey, getValue, setValue } from "../redis"; import { sendSlackWebhook } from "../alerts/slack"; import { Logger } from "../../lib/logger"; // Define the number of credits to be added during auto-recharge const AUTO_RECHARGE_CREDITS = 1000; +const AUTO_RECHARGE_COOLDOWN = 600; // 10 minutes in seconds /** * Attempt to automatically charge a user's account when their credit balance falls below a threshold @@ -24,7 +25,22 @@ export async function autoCharge( autoRechargeThreshold: number ): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> { const resource = `auto-recharge:${chunk.team_id}`; + const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`; + try { + // Check if the team is in the cooldown period + // Another check to prevent race conditions, double charging - cool down of 10 minutes + const cooldownValue = await getValue(cooldownKey); + if (cooldownValue) { + Logger.info(`Auto-recharge for team ${chunk.team_id} is in cooldown period`); + return { + success: false, + message: "Auto-recharge is in cooldown period", + remainingCredits: chunk.remaining_credits, + chunk, + }; + } + // Use a distributed lock to prevent concurrent auto-charge attempts return await redlock.using([resource], 5000, async (signal) : Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> => { // Recheck the condition inside the lock to prevent race conditions @@ -89,6 +105,9 @@ export async function autoCharge( chunk, true ); + + // Set cooldown period + await setValue(cooldownKey, 'true', AUTO_RECHARGE_COOLDOWN); } // Reset ACUC cache to reflect the new credit balance From 97b8d6c333400e98792eafd83154ea96a7b916f5 
Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 25 Oct 2024 16:05:39 -0300 Subject: [PATCH 088/102] Update auto_charge.ts --- apps/api/src/services/billing/auto_charge.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/billing/auto_charge.ts b/apps/api/src/services/billing/auto_charge.ts index 61e2bdb8..9669972a 100644 --- a/apps/api/src/services/billing/auto_charge.ts +++ b/apps/api/src/services/billing/auto_charge.ts @@ -13,7 +13,7 @@ import { Logger } from "../../lib/logger"; // Define the number of credits to be added during auto-recharge const AUTO_RECHARGE_CREDITS = 1000; -const AUTO_RECHARGE_COOLDOWN = 600; // 10 minutes in seconds +const AUTO_RECHARGE_COOLDOWN = 300; // 5 minutes in seconds /** * Attempt to automatically charge a user's account when their credit balance falls below a threshold @@ -29,7 +29,7 @@ export async function autoCharge( try { // Check if the team is in the cooldown period - // Another check to prevent race conditions, double charging - cool down of 10 minutes + // Another check to prevent race conditions, double charging - cool down of 5 minutes const cooldownValue = await getValue(cooldownKey); if (cooldownValue) { Logger.info(`Auto-recharge for team ${chunk.team_id} is in cooldown period`); From 801f0f773e82abf731f20ab3ad968f461b542eb2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 26 Oct 2024 03:59:15 -0300 Subject: [PATCH 089/102] Nick: fix auto charge failing when payment is through Link --- apps/api/src/services/billing/auto_charge.ts | 13 +++++++++++-- apps/api/src/services/billing/stripe.ts | 11 ++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/apps/api/src/services/billing/auto_charge.ts b/apps/api/src/services/billing/auto_charge.ts index 9669972a..5f318321 100644 --- a/apps/api/src/services/billing/auto_charge.ts +++ b/apps/api/src/services/billing/auto_charge.ts @@ -113,15 +113,24 @@ export async function autoCharge( // Reset ACUC cache to reflect the new credit balance const cacheKeyACUC = `acuc_${chunk.api_key}`; await deleteKey(cacheKeyACUC); - if (process.env.SLACK_ADMIN_WEBHOOK_URL ) { + + if (process.env.SLACK_ADMIN_WEBHOOK_URL) { + const webhookCooldownKey = `webhook_cooldown_${chunk.team_id}`; + const isInCooldown = await getValue(webhookCooldownKey); + + if (!isInCooldown) { sendSlackWebhook( - `Auto-recharge successful: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}. User was notified via email.`, + `Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`, false, process.env.SLACK_ADMIN_WEBHOOK_URL ).catch((error) => { Logger.debug(`Error sending slack notification: ${error}`); }); + + // Set cooldown for 1 hour + await setValue(webhookCooldownKey, 'true', 60 * 60); } + } return { success: true, message: "Auto-recharge successful", diff --git a/apps/api/src/services/billing/stripe.ts b/apps/api/src/services/billing/stripe.ts index f1400804..e459d3e3 100644 --- a/apps/api/src/services/billing/stripe.ts +++ b/apps/api/src/services/billing/stripe.ts @@ -7,7 +7,7 @@ async function getCustomerDefaultPaymentMethod(customerId: string) { const paymentMethods = await stripe.customers.listPaymentMethods(customerId, { limit: 3, }); - return paymentMethods.data[0]?.id; + return paymentMethods.data[0] ?? 
null; } type ReturnStatus = "succeeded" | "requires_action" | "failed"; @@ -16,13 +16,18 @@ export async function createPaymentIntent( customer_id: string ): Promise<{ return_status: ReturnStatus; charge_id: string }> { try { + const defaultPaymentMethod = await getCustomerDefaultPaymentMethod(customer_id); + if (!defaultPaymentMethod) { + Logger.error(`No default payment method found for customer: ${customer_id}`); + return { return_status: "failed", charge_id: "" }; + } const paymentIntent = await stripe.paymentIntents.create({ amount: 1100, currency: "usd", customer: customer_id, description: "Firecrawl: Auto re-charge of 1000 credits", - payment_method_types: ["card"], - payment_method: await getCustomerDefaultPaymentMethod(customer_id), + payment_method_types: [defaultPaymentMethod?.type ?? "card"], + payment_method: defaultPaymentMethod?.id, off_session: true, confirm: true, }); From 9593ab80e11b08fba076235a4e90bdd6cb0b9487 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 26 Oct 2024 16:03:07 -0300 Subject: [PATCH 090/102] Update README.md --- apps/js-sdk/firecrawl/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/README.md b/apps/js-sdk/firecrawl/README.md index a90907ba..e404a317 100644 --- a/apps/js-sdk/firecrawl/README.md +++ b/apps/js-sdk/firecrawl/README.md @@ -147,7 +147,7 @@ watch.addEventListener("done", state => { ### Batch scraping multiple URLs -To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats. +To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats. ```js const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { @@ -158,10 +158,10 @@ const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', #### Asynchronous batch scrape -To initiate an asynchronous batch scrape, utilize the `asyncBulkScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the batch scrape. +To initiate an asynchronous batch scrape, utilize the `asyncBatchScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the batch scrape. 
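
For orientation, a short hedged sketch of what one might do with the ID returned by the initiation call shown just below: it assumes the `checkBatchScrapeStatus` method added earlier in this patch series and a simple fixed-interval poll.

```js
// assumes `app` and `asyncBatchScrapeResult` come from the asyncBatchScrapeUrls example shown below
const jobId = asyncBatchScrapeResult.id;

let status = await app.checkBatchScrapeStatus(jobId);
while (status.success && status.status === 'scraping') {
  await new Promise((resolve) => setTimeout(resolve, 2000)); // simple fixed-interval poll
  status = await app.checkBatchScrapeStatus(jobId);
}
console.log(status); // final state: 'completed', 'failed' or 'cancelled'
```
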
```js -const asyncBulkScrapeResult = await app.asyncBulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); +const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] }); ``` #### Batch scrape with WebSockets From 8a4f4cb9d98884bc70f4cf188a2c4dc87f656462 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 26 Oct 2024 16:10:51 -0300 Subject: [PATCH 091/102] Update README.md --- apps/python-sdk/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/python-sdk/README.md b/apps/python-sdk/README.md index 412c3e05..abae05de 100644 --- a/apps/python-sdk/README.md +++ b/apps/python-sdk/README.md @@ -170,11 +170,11 @@ print(batch_scrape_result) ### Checking batch scrape status -To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_job` method. It takes the job ID as a parameter and returns the current status of the batch scrape job. +To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_status` method. It takes the job ID as a parameter and returns the current status of the batch scrape job. ```python id = batch_scrape_result['id'] -status = app.check_batch_scrape_job(id) +status = app.check_batch_scrape_status(id) ``` ### Batch scrape with WebSockets From 68b2e1b20966733494c09db0951d8b5d27d6c298 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 27 Oct 2024 23:14:25 -0300 Subject: [PATCH 092/102] Update log_job.ts --- apps/api/src/services/logging/log_job.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 4d8ee014..c2aedd13 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -70,7 +70,9 @@ export async function logJob(job: FirecrawlJob) { retry: job.retry, }, }; - posthog.capture(phLog); + if(job.mode !== "single_urls") { + posthog.capture(phLog); + } } if (error) { Logger.error(`Error logging job: ${error.message}`); From 877d5e4383bde79f5aab4b2bbaa804c2497a0bdd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 27 Oct 2024 23:17:20 -0300 Subject: [PATCH 093/102] Update types.ts --- apps/api/src/controllers/v1/types.ts | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 22ac6294..8c60c0fb 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -109,6 +109,16 @@ export const scrapeOptions = z.object({ extract: extractOptions.optional(), parsePDF: z.boolean().default(true), actions: actionsSchema.optional(), + // New + location: z.object({ + country: z.string().optional().refine( + (val) => !val || Object.keys(countries).includes(val.toUpperCase()), + { + message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", + } + ).transform(val => val ? val.toUpperCase() : 'US') + }).optional(), + // Deprecated geolocation: z.object({ country: z.string().optional().refine( (val) => !val || Object.keys(countries).includes(val.toUpperCase()), @@ -445,7 +455,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { fullPageScreenshot: x.formats.includes("screenshot@fullPage"), parsePDF: x.parsePDF, actions: x.actions as Action[], // no strict null checking grrrr - mogery - geolocation: x.geolocation, + geolocation: x.location ?? 
x.geolocation, skipTlsVerification: x.skipTlsVerification }; } From b48eed5716d7bad5f19702bce839b2998fe8aaf6 Mon Sep 17 00:00:00 2001 From: Twilight <46562212+twlite@users.noreply.github.com> Date: Mon, 28 Oct 2024 09:40:35 +0545 Subject: [PATCH 094/102] chore(README.md): use `satisfies` instead of `as` for ts example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dd8a740a..e7c48fa5 100644 --- a/README.md +++ b/README.md @@ -483,7 +483,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html'], } -} as CrawlParams, true, 30) as CrawlStatusResponse; +} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse; if (crawlResponse) { console.log(crawlResponse) From e3e8375c7de0c64df66a56f0b3f1d9ddc4fd2c9c Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 28 Oct 2024 11:13:33 -0400 Subject: [PATCH 095/102] Add AgentOps Monitoring --- examples/claude_web_crawler/claude_web_crawler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/claude_web_crawler/claude_web_crawler.py b/examples/claude_web_crawler/claude_web_crawler.py index 55168f30..6ca29f14 100644 --- a/examples/claude_web_crawler/claude_web_crawler.py +++ b/examples/claude_web_crawler/claude_web_crawler.py @@ -3,6 +3,7 @@ from firecrawl import FirecrawlApp import json from dotenv import load_dotenv import anthropic +import agentops # ANSI color codes class Colors: @@ -161,4 +162,5 @@ def main(): print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}") if __name__ == "__main__": + agentops.init(os.getenv("AGENTOPS_API_KEY")) main() From 007e3edfc5c785da858f11a94f14ea7a5bd28ff0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 12:40:04 -0300 Subject: [PATCH 096/102] Update README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index dd8a740a..d8e5bdcb 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge - **Media parsing**: pdfs, docx, images. - **Reliability first**: designed to get the data you need - no matter how hard it is. - **Actions**: click, scroll, input, wait and more before extracting data +- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev) @@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \ }' ``` +### Batch Scraping Multiple URLs (New) + +You can now batch scrape multiple URLs at the same time. It is very similar to how the /crawl endpoint works. It submits a batch scrape job and returns a job ID to check the status of the batch scrape. 
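
The cURL request below submits the job. As a hedged sketch, the returned ID could then be polled against the same path (the `/v1/batch/scrape/{id}` route the SDKs in this series call via GET); the job ID here is a placeholder.

```bash
curl -X GET https://api.firecrawl.dev/v1/batch/scrape/YOUR_JOB_ID \
  -H 'Authorization: Bearer YOUR_API_KEY'
```
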
+ +```bash +curl -X POST https://api.firecrawl.dev/v1/batch/scrape \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer YOUR_API_KEY' \ + -d '{ + "urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"], + "formats" : ["markdown", "html"] + }' +``` ### Search (v0) (Beta) From fa8875d64d4246f0d0b7eecbfcffbbccce473033 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 15:09:50 -0300 Subject: [PATCH 097/102] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index cd76793c..c7185b79 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -209,14 +209,15 @@ export async function scrapSingleUrl( if (action.type === "click" || action.type === "write" || action.type === "press") { const result: Action[] = []; // Don't add a wait if the previous action is a wait - if (index === 0 || array[index - 1].type !== "wait") { - result.push({ type: "wait", milliseconds: 1200 } as Action); - } + // if (index === 0 || array[index - 1].type !== "wait") { + // result.push({ type: "wait", milliseconds: 1200 } as Action); + // } + // Fire-engine now handles wait times automatically, leaving the code here for now result.push(action); // Don't add a wait if the next action is a wait - if (index === array.length - 1 || array[index + 1].type !== "wait") { - result.push({ type: "wait", milliseconds: 1200 } as Action); - } + // if (index === array.length - 1 || array[index + 1].type !== "wait") { + // result.push({ type: "wait", milliseconds: 1200 } as Action); + // } return result; } return [action as Action]; From 726430c2e641666626b400ca62f32062a31978f4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 16:51:49 -0300 Subject: [PATCH 098/102] Nick: llm extract in batch scrape --- apps/api/src/controllers/v1/batch-scrape.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 7c68341b..cde4bd76 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -4,6 +4,7 @@ import { BatchScrapeRequest, batchScrapeRequestSchema, CrawlResponse, + legacyExtractorOptions, legacyScrapeOptions, RequestWithAuth, } from "./types"; @@ -34,6 +35,8 @@ export async function batchScrapeController( } const pageOptions = legacyScrapeOptions(req.body); + const extractorOptions = req.body.extract ? 
legacyExtractorOptions(req.body.extract) : undefined; + const sc: StoredCrawl = { crawlerOptions: null, @@ -65,6 +68,7 @@ export async function batchScrapeController( plan: req.auth.plan, crawlerOptions: null, pageOptions, + extractorOptions, origin: "api", crawl_id: id, sitemapped: true, From 0bad436061191fd304dd66ea201eccc97f0e7474 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 17:04:42 -0300 Subject: [PATCH 099/102] Nick: fixed the batch scrape + llm extract billing --- apps/api/src/main/runWebScraper.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 8eb679e7..8bd0c12c 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -121,8 +121,13 @@ export async function runWebScraper({ : docs; if(is_scrape === false) { - billTeam(team_id, undefined, filteredDocs.length).catch(error => { - Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`); + let creditsToBeBilled = 1; // Assuming 1 credit per document + if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) { + creditsToBeBilled = 5; + } + + billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`); // Optionally, you could notify an admin or add to a retry queue here }); } From 3d1bb82aa27865bfda143a952c4b33b0f8b7b18c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 20:16:11 -0300 Subject: [PATCH 100/102] Nick: languages support --- apps/api/src/controllers/v1/types.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 8c60c0fb..633bbdf1 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -116,8 +116,10 @@ export const scrapeOptions = z.object({ { message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", } - ).transform(val => val ? val.toUpperCase() : 'US') + ).transform(val => val ? val.toUpperCase() : 'US'), + languages: z.string().array().optional(), }).optional(), + // Deprecated geolocation: z.object({ country: z.string().optional().refine( @@ -125,7 +127,8 @@ export const scrapeOptions = z.object({ { message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", } - ).transform(val => val ? val.toUpperCase() : 'US') + ).transform(val => val ? 
val.toUpperCase() : 'US'), + languages: z.string().array().optional(), }).optional(), skipTlsVerification: z.boolean().default(false), }).strict(strictMessage) From b6ce49e5bbc06a1e5027e794fcb3f7b4163f1704 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 20:24:16 -0300 Subject: [PATCH 101/102] Update index.ts --- apps/js-sdk/firecrawl/src/index.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 491df1e4..bbe934fe 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -82,6 +82,10 @@ export interface CrawlScrapeOptions { onlyMainContent?: boolean; waitFor?: number; timeout?: number; + location?: { + country?: string; + languages?: string[]; + }; } export type Action = { From 6d38c65466ca66731b367141480db585dd87dce9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 20:25:28 -0300 Subject: [PATCH 102/102] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index a7fb2d83..b8738e5e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.7.1", + "version": "1.7.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts",
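
As a closing illustration of the `location` option added in the preceding patches, here is a hedged usage sketch with the JS SDK. The parameter names come from the `CrawlScrapeOptions` diff above; the country and language values are illustrative only.

```js
import FirecrawlApp from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: 'fc-YOUR_API_KEY' });

// `location.country` is an ISO 3166-1 alpha-2 code; `languages` is an optional list
const scrapeResult = await app.scrapeUrl('https://firecrawl.dev', {
  formats: ['markdown'],
  location: { country: 'DE', languages: ['de'] },
});
console.log(scrapeResult);
```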