add zod, create middleware, update openapi declaration, add crawl logic

Gergő Móricz 2024-08-15 23:30:33 +02:00
parent 4165de1773
commit 8b7569f8f3
10 changed files with 1604 additions and 629 deletions

apps/api/openapi-v0.json (new file)

@@ -0,0 +1,924 @@
{
"openapi": "3.0.0",
"info": {
"title": "Firecrawl API",
"version": "0.0.0",
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
"contact": {
"name": "Firecrawl Support",
"url": "https://firecrawl.dev/support",
"email": "support@firecrawl.dev"
}
},
"servers": [
{
"url": "https://api.firecrawl.dev/v0"
}
],
"paths": {
"/scrape": {
"post": {
"summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeAndExtractFromUrl",
"tags": ["Scraping"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The URL to scrape"
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
},
"extractorOptions": {
"type": "object",
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
"default": {},
"properties": {
"mode": {
"type": "string",
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
}
},
"required": ["url"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ScrapeResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl": {
"post": {
"summary": "Crawl multiple URLs based on options",
"operationId": "crawlUrls",
"tags": ["Crawling"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The base URL to start crawling from"
},
"crawlerOptions": {
"type": "object",
"properties": {
"includes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to include"
},
"excludes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to exclude"
},
"generateImgAltText": {
"type": "boolean",
"description": "Generate alt text for images using LLMs (must have a paid plan)",
"default": false
},
"returnOnlyUrls": {
"type": "boolean",
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false
},
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
"default": "default"
},
"ignoreSitemap": {
"type": "boolean",
"description": "Ignore the website sitemap when crawling",
"default": false
},
"limit": {
"type": "integer",
"description": "Maximum number of pages to crawl",
"default": 10000
},
"allowBackwardCrawling": {
"type": "boolean",
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
"default": false
},
"allowExternalContentLinks": {
"type": "boolean",
"description": "Allows the crawler to follow links to external websites.",
"default": false
}
}
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
}
},
"required": ["url"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CrawlResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/search": {
"post": {
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
"operationId": "searchGoogle",
"tags": ["Search"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"query": {
"type": "string",
"format": "uri",
"description": "The query to search for"
},
"pageOptions": {
"type": "object",
"properties": {
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"fetchPageContent": {
"type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
}
}
},
"searchOptions": {
"type": "object",
"properties": {
"limit": {
"type": "integer",
"description": "Maximum number of results. Max is 20 during beta."
}
}
}
},
"required": ["query"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SearchResponse"
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl/status/{jobId}": {
"get": {
"tags": ["Crawl"],
"summary": "Get the status of a crawl job",
"operationId": "getCrawlStatus",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Status of the job (completed, active, failed, paused)"
},
"current": {
"type": "integer",
"description": "Current page number"
},
"total": {
"type": "integer",
"description": "Total number of pages"
},
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
}
}
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
}
}
}
}
}
},
"402": {
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
},
"429": {
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
},
"500": {
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
}
}
}
}
},
"components": {
"securitySchemes": {
"bearerAuth": {
"type": "http",
"scheme": "bearer"
}
},
"schemas": {
"ScrapeResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"data": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
},
"llm_extraction": {
"type": "object",
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
"nullable": true
},
"warning": {
"type": "string",
"nullable": true,
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
}
}
}
}
},
"CrawlStatusResponseObj": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
},
"index": {
"type": "integer",
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
}
}
},
"SearchResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"url": {
"type": "string"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
}
}
}
}
}
}
}
},
"CrawlResponse": {
"type": "object",
"properties": {
"jobId": {
"type": "string"
}
}
}
}
},
"security": [
{
"bearerAuth": []
}
]
}
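
For reference, a minimal sketch of calling the /v0/scrape endpoint declared above. The request body and response shape follow the schema; the API key variable and target URL are placeholder assumptions.

// Sketch only: POST /v0/scrape per the spec above (Node 18+, global fetch).
// FIRECRAWL_API_KEY and the target URL are placeholders.
async function scrapeV0(url: string): Promise<unknown> {
  const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${process.env.FIRECRAWL_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ url, pageOptions: { onlyMainContent: true } }),
  });
  if (!res.ok) throw new Error(`Scrape failed: HTTP ${res.status}`);
  // ScrapeResponse: { success, data: { markdown, content, html?, metadata, ... } }
  return res.json();
}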


@@ -18,8 +18,8 @@
"paths": {
"/scrape": {
"post": {
"summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeAndExtractFromUrl",
"summary": "Scrape a single URL",
"operationId": "scrape",
"tags": ["Scraping"],
"security": [
{
@@ -38,94 +38,47 @@
"format": "uri",
"description": "The URL to scrape"
},
"pageOptions": {
"type": "object",
"properties": {
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeHtml": {
"type": "boolean",
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
"default": false
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
}
"formats": {
"type": "array",
"items": {
"type": "string",
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
},
"description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
"default": ["markdown"]
},
"extractorOptions": {
"headers": {
"type": "object",
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
"default": {},
"properties": {
"mode": {
"type": "string",
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"excludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": true
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
}
},
"required": ["url"]
@@ -741,24 +694,42 @@
"success": {
"type": "boolean"
},
"warning": {
"type": "string",
"nullable": true,
"description": "Warning message to let you know of any issues."
},
"data": {
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
"type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
"description": "HTML version of the content on page if the `html` format was specified"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
},
"metadata": {
"type": "object",
@@ -780,27 +751,16 @@
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"statusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"error": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
},
"llm_extraction": {
"type": "object",
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
"nullable": true
},
"warning": {
"type": "string",
"nullable": true,
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
}
}
}
@@ -810,24 +770,33 @@
"type": "object",
"properties": {
"markdown": {
"type": "string"
},
"content": {
"type": "string"
"type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if `includeHtml` is true"
"description": "HTML version of the content on page if the `html` format was specified"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeRawHtml` is true"
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"index": {
"type": "integer",
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
},
"metadata": {
"type": "object",
@@ -849,11 +818,11 @@
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"statusCode": {
"type": "integer",
"description": "The status code of the page"
},
"pageError": {
"error": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
@@ -871,34 +840,63 @@
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"url": {
"type": "string"
"markdown": {
"type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if the `html` format was specified"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"markdown": {
"type": "string"
},
"content": {
"type": "string"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
}
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"statusCode": {
"type": "integer",
"description": "The status code of the page"
},
"error": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
}
}
}
@@ -909,8 +907,15 @@
"CrawlResponse": {
"type": "object",
"properties": {
"jobId": {
"success": {
"type": "boolean"
},
"id": {
"type": "string"
},
"url": {
"type": "string",
"format": "uri"
}
}
}
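
Taken together, these v1 schema changes replace the v0 includeHtml/includeRawHtml/screenshot booleans with a single formats array and flip the onlyMainContent default to true. A sketch of a request body under the new shape, with a placeholder URL:

// Assumed-valid v1 /scrape body under the updated schema (sketch):
const v1ScrapeBody = {
  url: "https://example.com",             // placeholder
  formats: ["markdown", "html", "links"], // replaces includeHtml / includeRawHtml
  onlyMainContent: true,                  // v1 default is now true (was false in v0)
  timeout: 30000,
};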


@@ -106,7 +106,7 @@
"uuid": "^10.0.0",
"wordpos": "^2.1.0",
"xml2js": "^0.6.2",
"zod": "^3.23.4",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.1"
},
"nodemonConfig": {


@@ -189,7 +189,7 @@ importers:
specifier: ^0.6.2
version: 0.6.2
zod:
specifier: ^3.23.4
specifier: ^3.23.8
version: 3.23.8
zod-to-json-schema:
specifier: ^3.23.1


@@ -1,86 +1,68 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { Logger } from "../../../src/lib/logger";
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import { RequestWithAuth } from "./types";
export async function crawlStatusController(req: Request, res: Response) {
// TODO: validate req.params.jobId
export async function crawlStatusController(req: RequestWithAuth, res: Response) {
// const job = await getWebScraperQueue().getJob(req.params.jobId);
// if (!job) {
// return res.status(404).json({ error: "Job not found" });
// }
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.CrawlStatus
);
if (!success) {
return res.status(status).json({ error });
}
// const { current, current_url, total, current_step, partialDocs } = await job.progress();
// const job = await getWebScraperQueue().getJob(req.params.jobId);
// if (!job) {
// return res.status(404).json({ error: "Job not found" });
// }
// let data = job.returnvalue;
// if (process.env.USE_DB_AUTHENTICATION === "true") {
// const supabaseData = await supabaseGetJobById(req.params.jobId);
// const { current, current_url, total, current_step, partialDocs } = await job.progress();
// if (supabaseData) {
// data = supabaseData.docs;
// }
// }
// let data = job.returnvalue;
// if (process.env.USE_DB_AUTHENTICATION === "true") {
// const supabaseData = await supabaseGetJobById(req.params.jobId);
// const jobStatus = await job.getState();
// if (supabaseData) {
// data = supabaseData.docs;
// }
// }
// const jobStatus = await job.getState();
// mock:
const id = uuidv4();
const result = {
totalCount: 100,
creditsUsed: 2,
expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
status: "scraping", // scraping, completed, failed
next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
data: [{
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
},
{
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
}]
}
res.status(200).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ error: error.message });
// mock:
const id = uuidv4();
const result = {
totalCount: 100,
creditsUsed: 2,
expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
status: "scraping", // scraping, completed, failed
next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
data: [{
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
},
{
markdown: "test",
content: "test",
html: "test",
rawHtml: "test",
linksOnPage: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
}]
}
res.status(200).json(result);
}
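
The mock above fixes the response shape the v1 crawl status endpoint is being built toward. As a sketch, the same shape expressed as a TypeScript type; the field names come from the mock, not from a published spec:

// Derived from the mock response above (subject to change; the endpoint is stubbed).
type CrawlStatusResponse = {
  totalCount: number;
  creditsUsed: number;
  expiresAt: number;                            // epoch ms; 24h out in the mock
  status: "scraping" | "completed" | "failed";  // values listed in the mock's comment
  next: string;                                 // polling URL: /v1/crawl/:id
  data: Array<{
    markdown: string;
    content: string;
    html: string;
    rawHtml: string;
    linksOnPage: string[];
    screenshot: string;
    metadata: {
      title: string;
      description: string;
      language: string;
      sourceURL: string;
      statusCode: number;
      error: string;
    };
  }>;
};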


@@ -1,140 +1,87 @@
import { Request, Response } from "express";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import { CrawlRequest, CrawlResponse } from "./types";
import { CrawlRequest, crawlRequestSchema, CrawlResponse, legacyCrawlerOptions, legacyScrapeOptions, RequestWithAuth } from "./types";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
import { Logger } from "../../lib/logger";
export async function crawlController(req: Request<{}, {}, CrawlRequest>, res: Response<CrawlResponse>) {
// expected req.body
export async function crawlController(req: RequestWithAuth<CrawlResponse, CrawlRequest>, res: Response<CrawlResponse>) {
req.body = crawlRequestSchema.parse(req.body);
const id = uuidv4();
// req.body = {
// url: string
// crawlerOptions: {
// includePaths: string[]
// excludePaths: string[]
// maxDepth: number
// limit: number
// allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME???
// allowExternalLinks: boolean
// ignoreSitemap: number
// }
// scrapeOptions: Exclude<Scrape, "url">
// }
await logCrawl(id, req.auth.team_id);
const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions),
pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions,
pageOptions,
team_id: req.auth.team_id,
createdAt: Date.now(),
};
const crawler = crawlToCrawler(id, sc);
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ success: false, error });
}
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ success: false, error: "Idempotency key already used" });
}
try {
createIdempotencyKey(req);
} catch (error) {
Logger.error(error);
return res.status(500).json({ success: false, error: error.message });
}
}
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(team_id, 1);
if (!creditsCheckSuccess) {
return res.status(402).json({ success: false, error: "Insufficient credits" });
}
let url = req.body.url;
if (!url) {
return res.status(400).json({ success: false, error: "Url is required" });
}
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
success: false,
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
try {
url = checkAndUpdateURL(url).url;
} catch (error) {
return res.status(400).json({ success: false, error: 'Invalid Url' });
}
// TODO: add job to queue
const id = uuidv4();
return res.status(200).json({
success: true,
id,
url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`,
});
// const mode = req.body.mode ?? "crawl";
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.progress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const job = await addWebScraperJob({
// url: url,
// mode: mode ?? "crawl", // fix for single urls not working
// crawlerOptions: crawlerOptions,
// team_id: team_id,
// pageOptions: pageOptions,
// origin: req.body.origin ?? defaultOrigin,
// });
// await logCrawl(job.id.toString(), team_id);
// res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ success: false, error: error.message });
sc.robots = await crawler.getRobotsTxt();
} catch (e) {
Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
}
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
if (sitemap !== null) {
const jobs = sitemap.map(x => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
team_id: req.auth.team_id,
crawlerOptions,
pageOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
},
opts: {
jobId: uuid,
priority: 20,
}
};
})
await lockURLs(id, jobs.map(x => x.data.url));
await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
await getScrapeQueue().addBulk(jobs);
} else {
await lockURL(id, sc, req.body.url);
const job = await addScrapeJob({
url: req.body.url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: req.auth.team_id,
pageOptions: pageOptions,
origin: "api",
crawl_id: id,
}, {
priority: 15,
});
await addCrawlJob(id, job.id);
}
return res.status(200).json({
success: true,
id,
url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`,
});
}
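
The new controller fetches robots.txt best-effort, persists the crawl to Redis, then either bulk-enqueues every sitemap URL (priority 20) or enqueues the seed URL alone (priority 15). From a client's point of view, the flow it implies looks roughly like this; the host, key, and options are placeholders:

// Sketch of the client flow implied by crawlController (not an official SDK).
async function startCrawl(apiKey: string) {
  const res = await fetch("https://api.firecrawl.dev/v1/crawl", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url: "https://example.com",               // placeholder
      crawlerOptions: { limit: 100, maxDepth: 2 },
      scrapeOptions: { formats: ["markdown"] }, // timeout is omitted for crawls
    }),
  });
  const { success, id, url } = await res.json(); // CrawlResponse
  return { success, id, pollUrl: url };          // GET pollUrl until status leaves "scraping"
}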


@ -1,12 +1,12 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import { MapRequest, MapResponse } from "./types";
import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
import { checkTeamCredits } from "../../services/billing/credit_billing";
export async function mapController(req: Request<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
export async function mapController(req: RequestWithAuth<MapResponse, MapRequest>, res: Response<MapResponse>) {
req.body = mapRequestSchema.parse(req.body);
console.log(req.body);
// expected req.body
// req.body = {
@@ -14,106 +14,53 @@ export async function mapController(req: Request<{}, MapResponse, MapRequest>, r
// crawlerOptions:
// }
try {
const { success, team_id, error, status } = await authenticateUser(
req,
res,
RateLimiterMode.Crawl
);
if (!success) {
return res.status(status).json({ success: false, error });
}
// if (req.headers["x-idempotency-key"]) {
// const isIdempotencyValid = await validateIdempotencyKey(req);
// if (!isIdempotencyValid) {
// return res.status(409).json({ error: "Idempotency key already used" });
// }
// try {
// createIdempotencyKey(req);
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
return res.status(200).json({ success: true, links: [ "test1", "test2" ] });
// const { success: creditsCheckSuccess, message: creditsCheckMessage } =
// await checkTeamCredits(team_id, 1);
// if (!creditsCheckSuccess) {
// return res.status(402).json({ error: "Insufficient credits" });
// }
// const mode = req.body.mode ?? "crawl";
let url = req.body.url;
if (!url) {
return res.status(400).json({ success: false, error: "Url is required" });
}
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
if (isUrlBlocked(url)) {
return res
.status(403)
.json({
success: false,
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
try {
url = checkAndUpdateURL(url).url;
} catch (error) {
return res.status(400).json({ success: false, error: 'Invalid Url' });
}
// const docs = await a.getDocuments(false, (progress) => {
// job.progress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
return res.status(200).json({ success: true, links: [ "test1", "test2" ] });
// const job = await addWebScraperJob({
// url: url,
// mode: mode ?? "crawl", // fix for single urls not working
// crawlerOptions: crawlerOptions,
// team_id: team_id,
// pageOptions: pageOptions,
// origin: req.body.origin ?? defaultOrigin,
// });
// const mode = req.body.mode ?? "crawl";
// await logCrawl(job.id.toString(), team_id);
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.progress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
// const job = await addWebScraperJob({
// url: url,
// mode: mode ?? "crawl", // fix for single urls not working
// crawlerOptions: crawlerOptions,
// team_id: team_id,
// pageOptions: pageOptions,
// origin: req.body.origin ?? defaultOrigin,
// });
// await logCrawl(job.id.toString(), team_id);
// res.json({ jobId: job.id });
} catch (error) {
Logger.error(error);
return res.status(500).json({ success: false, error: error.message });
}
// res.json({ jobId: job.id });
}
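
mapController is still a stub: it validates the body with mapRequestSchema (flat crawler options plus a url, per the types file later in this commit) and returns hard-coded links. A sketch of a body that would pass validation:

// Valid under mapRequestSchema = crawlerOptions.extend({ url }) (sketch):
const mapBody = {
  url: "https://example.com", // placeholder; a missing scheme gets "http://" prefixed
  ignoreSitemap: false,
  limit: 5000,
};
// Current stubbed response: { success: true, links: ["test1", "test2"] }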


@@ -1,26 +1,11 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../types";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { Logger } from '../../lib/logger';
import { checkAndUpdateURL } from '../../lib/validateUrl';
import { ScrapeRequest, ScrapeResponse } from "./types";
import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
export async function scrapeController(req: Request<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
let url = req.body.url;
if (!url) {
return { success: false, error: "Url is required", returnCode: 400 };
}
if (isUrlBlocked(url)) {
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
}
try {
url = checkAndUpdateURL(url).url;
} catch (error) {
return { success: false, error: "Invalid URL", returnCode: 400 };
}
export async function scrapeController(req: RequestWithAuth<ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
req.body = scrapeRequestSchema.parse(req.body);
console.log(req.body);
// TODO: check req.body
// mockup req.body
@@ -37,137 +22,124 @@ export async function scrapeController(req: Request<{}, ScrapeResponse, ScrapeRe
// waitFor: number
// }
try {
let earlyReturn = false;
// make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
RateLimiterMode.Scrape
);
if (!success) {
return res.status(status).json({ success: false, error });
}
let earlyReturn = false;
// make sure to authenticate user first, Bearer <token>
// check credits
// check credits
const result: ScrapeResponse = {
success: true,
warning: "test",
data: {
markdown: "test",
html: "test",
rawHtml: "test",
links: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
const result: ScrapeResponse = {
success: true,
warning: "test",
data: {
markdown: "test",
html: "test",
rawHtml: "test",
links: ["test1", "test2"],
screenshot: "test",
metadata: {
title: "test",
description: "test",
language: "test",
sourceURL: "test",
statusCode: 200,
error: "test"
}
}
return res.status(200).json(result);
// const crawlerOptions = req.body.crawlerOptions ?? {};
// const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
// const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
// const origin = req.body.origin ?? defaultOrigin;
// let timeout = req.body.timeout ?? defaultTimeout;
// if (extractorOptions.mode.includes("llm-extraction")) {
// pageOptions.onlyMainContent = true;
// timeout = req.body.timeout ?? 90000;
// }
// const checkCredits = async () => {
// try {
// const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
// if (!creditsCheckSuccess) {
// earlyReturn = true;
// return res.status(402).json({ error: "Insufficient credits" });
// }
// } catch (error) {
// Logger.error(error);
// earlyReturn = true;
// return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
// }
// };
// await checkCredits();
// const jobId = uuidv4();
// const startTime = new Date().getTime();
// const result = await scrapeHelper(
// jobId,
// req,
// team_id,
// crawlerOptions,
// pageOptions,
// extractorOptions,
// timeout,
// plan
// );
// const endTime = new Date().getTime();
// const timeTakenInSeconds = (endTime - startTime) / 1000;
// const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
// if (result.success) {
// let creditsToBeBilled = 1; // Assuming 1 credit per document
// const creditsPerLLMExtract = 50;
// if (extractorOptions.mode.includes("llm-extraction")) {
// // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
// creditsToBeBilled += creditsPerLLMExtract;
// }
// let startTimeBilling = new Date().getTime();
// if (earlyReturn) {
// // Don't bill if we're early returning
// return;
// }
// const billingResult = await billTeam(
// team_id,
// creditsToBeBilled
// );
// if (!billingResult.success) {
// return res.status(402).json({
// success: false,
// error: "Failed to bill team. Insufficient credits or subscription not found.",
// });
// }
// }
// logJob({
// job_id: jobId,
// success: result.success,
// message: result.error,
// num_docs: 1,
// docs: [result.data],
// time_taken: timeTakenInSeconds,
// team_id: team_id,
// mode: "scrape",
// url: req.body.url,
// crawlerOptions: crawlerOptions,
// pageOptions: pageOptions,
// origin: origin,
// extractor_options: extractorOptions,
// num_tokens: numTokens,
// });
// return res.status(result.returnCode).json(result);
} catch (error) {
Logger.error(error);
return res.status(500).json({ success: false, error: error.message });
}
return res.status(200).json(result);
// const crawlerOptions = req.body.crawlerOptions ?? {};
// const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
// const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
// const origin = req.body.origin ?? defaultOrigin;
// let timeout = req.body.timeout ?? defaultTimeout;
// if (extractorOptions.mode.includes("llm-extraction")) {
// pageOptions.onlyMainContent = true;
// timeout = req.body.timeout ?? 90000;
// }
// const checkCredits = async () => {
// try {
// const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
// if (!creditsCheckSuccess) {
// earlyReturn = true;
// return res.status(402).json({ error: "Insufficient credits" });
// }
// } catch (error) {
// Logger.error(error);
// earlyReturn = true;
// return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
// }
// };
// await checkCredits();
// const jobId = uuidv4();
// const startTime = new Date().getTime();
// const result = await scrapeHelper(
// jobId,
// req,
// team_id,
// crawlerOptions,
// pageOptions,
// extractorOptions,
// timeout,
// plan
// );
// const endTime = new Date().getTime();
// const timeTakenInSeconds = (endTime - startTime) / 1000;
// const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
// if (result.success) {
// let creditsToBeBilled = 1; // Assuming 1 credit per document
// const creditsPerLLMExtract = 50;
// if (extractorOptions.mode.includes("llm-extraction")) {
// // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
// creditsToBeBilled += creditsPerLLMExtract;
// }
// let startTimeBilling = new Date().getTime();
// if (earlyReturn) {
// // Don't bill if we're early returning
// return;
// }
// const billingResult = await billTeam(
// team_id,
// creditsToBeBilled
// );
// if (!billingResult.success) {
// return res.status(402).json({
// success: false,
// error: "Failed to bill team. Insufficient credits or subscription not found.",
// });
// }
// }
// logJob({
// job_id: jobId,
// success: result.success,
// message: result.error,
// num_docs: 1,
// docs: [result.data],
// time_taken: timeTakenInSeconds,
// team_id: team_id,
// mode: "scrape",
// url: req.body.url,
// crawlerOptions: crawlerOptions,
// pageOptions: pageOptions,
// origin: origin,
// extractor_options: extractorOptions,
// num_tokens: numTokens,
// });
// return res.status(result.returnCode).json(result);
}


@@ -1,36 +1,96 @@
import { Request } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities";
export type Format = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "screenshot@fullPage";
export type ScrapeRequest = {
url: string;
formats?: Format[];
headers?: { [K: string]: string };
includeTags?: string[];
excludeTags?: string[];
onlyMainContent?: boolean;
timeout?: number;
waitFor?: number;
}
const url = z.preprocess(x => {
if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
if (x.startsWith("://")) {
return "http" + x;
} else {
return "http://" + x;
}
} else {
return x;
}
}, z.string().url().regex(/^https?:\/\//, "URL uses unsupported protocol").refine(x => !isUrlBlocked(x), "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."));
export type CrawlerOptions = {
includePaths?: string[];
excludePaths?: string[];
maxDepth?: number;
limit?: number;
allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
allowExternalLinks?: boolean;
ignoreSitemap?: boolean;
};
export const scrapeOptions = z.object({
formats: z.enum(["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"])
.array()
.optional()
.default(["markdown"]),
headers: z.record(z.string(), z.string()).optional(),
includeTags: z.string().array().optional(),
excludeTags: z.string().array().optional(),
onlyMainContent: z.boolean().default(true),
timeout: z.number().int().positive().finite().safe().default(30000), // default?
waitFor: z.number().int().nonnegative().finite().safe().default(0),
});
export type CrawlRequest = {
url: string;
crawlerOptions?: CrawlerOptions;
scrapeOptions?: Exclude<ScrapeRequest, "url">;
};
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
export type MapRequest = {
url: string;
crawlerOptions?: CrawlerOptions;
};
export const scrapeRequestSchema = scrapeOptions.extend({ url });
// export type ScrapeRequest = {
// url: string;
// formats?: Format[];
// headers?: { [K: string]: string };
// includeTags?: string[];
// excludeTags?: string[];
// onlyMainContent?: boolean;
// timeout?: number;
// waitFor?: number;
// }
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
const crawlerOptions = z.object({
includePaths: z.string().array().default([]),
excludePaths: z.string().array().default([]),
maxDepth: z.number().default(10), // default?
limit: z.number().default(10000), // default?
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false),
ignoreSitemap: z.boolean().default(true),
});
// export type CrawlerOptions = {
// includePaths?: string[];
// excludePaths?: string[];
// maxDepth?: number;
// limit?: number;
// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
// allowExternalLinks?: boolean;
// ignoreSitemap?: boolean;
// };
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
export const crawlRequestSchema = z.object({
url,
crawlerOptions: crawlerOptions.default({}),
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
});
// export type CrawlRequest = {
// url: string;
// crawlerOptions?: CrawlerOptions;
// scrapeOptions?: Exclude<ScrapeRequest, "url">;
// };
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions.extend({ url });
// export type MapRequest = {
// url: string;
// crawlerOptions?: CrawlerOptions;
// };
export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = {
markdown?: string,
@@ -77,6 +137,7 @@ export type Document = {
export type ErrorResponse = {
success: false;
error: string;
details?: any;
};
export type ScrapeResponse = ErrorResponse | {
@@ -95,3 +156,40 @@
success: true;
links: string[];
}
type AuthObject = {
team_id: string;
plan: string;
}
export interface RequestWithMaybeAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
auth?: AuthObject;
}
export interface RequestWithAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
auth: AuthObject;
}
export function legacyCrawlerOptions(x: CrawlerOptions) {
return {
includes: x.includePaths,
excludes: x.excludePaths,
maxCrawledLinks: x.limit,
maxCrawledDepth: x.maxDepth,
limit: x.limit,
generateImgAltText: false,
allowBackwardCrawling: x.allowBackwardLinks,
allowExternalContentLinks: x.allowExternalLinks,
};
}
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
return {
includeHtml: x.formats.includes("html"),
includeRawHtml: x.formats.includes("rawHtml"),
onlyIncludeTags: x.includeTags,
removeTags: x.excludeTags,
onlyMainContent: x.onlyMainContent,
waitFor: x.waitFor,
};
}
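
To make the schema behavior concrete, a sketch of what parsing does under the definitions above; the outputs are read off the declared defaults and the url preprocessor, not taken from a test run:

scrapeRequestSchema.parse({ url: "example.com" });
// -> { url: "http://example.com", formats: ["markdown"],
//      onlyMainContent: true, timeout: 30000, waitFor: 0 }

crawlRequestSchema.parse({ url: "https://example.com" });
// -> url plus crawlerOptions/scrapeOptions filled with their defaults
//    (scrapeOptions has no timeout key here; it is omitted for crawls).

// A blocked social-media URL or a non-http(s) scheme throws a ZodError,
// which the v1 router's error handler (next file) turns into a 400.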


@@ -1,9 +1,18 @@
import express from "express";
import express, { NextFunction, Request, Response } from "express";
import { crawlController } from "../../src/controllers/v1/crawl";
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { scrapeController } from "../../src/controllers/v1/scrape";
import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { mapController } from "../../src/controllers/v1/map";
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
import { RateLimiterMode } from "../types";
import { authenticateUser } from "../controllers/v1/auth";
import { Logger } from "../lib/logger";
import { createIdempotencyKey } from "../services/idempotency/create";
import { validateIdempotencyKey } from "../services/idempotency/validate";
import { ZodError } from "zod";
import { checkTeamCredits } from "../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@@ -12,13 +21,96 @@ import { mapController } from "../../src/controllers/v1/map";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";
function checkCreditsMiddleware(minimum: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
return (req, res, next) => {
(async () => {
if (!(await checkTeamCredits(req.auth.team_id, minimum)).success) {
return res.status(402).json({ success: false, error: "Insufficient credits" });
}
next();
})()
.catch(err => next(err));
};
}
function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
return (req, res, next) => {
(async () => {
const { success, team_id, error, status, plan } = await authenticateUser(
req,
res,
rateLimiterMode,
);
if (!success) {
return res.status(status).json({ success: false, error });
}
req.auth = { team_id, plan };
next();
})()
.catch(err => next(err));
}
}
function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
(async () => {
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ success: false, error: "Idempotency key already used" });
}
// try {
createIdempotencyKey(req);
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ success: false, error: error.message });
// }
}
next();
})()
.catch(err => next(err));
}
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
return (req, res, next) => {
controller(req, res)
.catch(err => next(err))
}
}
export const v1Router = express.Router();
v1Router.post("/v1/scrape", scrapeController);
v1Router.post("/v1/crawl", crawlController);
v1Router.get("/v1/crawl/:jobId", crawlStatusController);
v1Router.post(
"/v1/scrape",
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
wrap(scrapeController)
);
v1Router.post(
"/v1/crawl",
authMiddleware(RateLimiterMode.Crawl),
idempotencyMiddleware,
checkCreditsMiddleware(1),
wrap(crawlController)
);
v1Router.post(
"/v1/map",
authMiddleware(RateLimiterMode.Crawl),
checkCreditsMiddleware(1),
wrap(mapController)
);
v1Router.get(
"/v1/crawl/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlStatusController)
);
// v1Router.post("/v1/crawlWebsitePreview", crawlPreviewController);
// v1Router.delete("/v1/crawl/cancel/:jobId", crawlCancelController);
// v1Router.delete("/v1/crawl/:jobId", crawlCancelController);
// v1Router.get("/v1/checkJobStatus/:jobId", crawlJobStatusPreviewController);
// // Auth route for key based authentication
@@ -31,4 +123,12 @@ v1Router.get("/v1/crawl/:jobId", crawlStatusController);
// v1Router.get("/v1/health/liveness", livenessController);
// v1Router.get("/v1/health/readiness", readinessController);
v1Router.post("/v1/map", mapController);
v1Router.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
if (err instanceof ZodError) {
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
} else {
const id = uuidv4();
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + JSON.stringify(err));
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + "" });
}
});
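
One note on the design: wrap() forwards async rejections into Express's error chain, so controllers can throw freely, including the ZodError raised by schema.parse, and the handler above maps ZodError to a 400 with details and everything else to a 500 with an exception ID. A sketch of a controller relying on that, with a hypothetical route name:

// Hypothetical controller for illustration; it leans on wrap() and the
// error handler above instead of its own try/catch.
import { Request, Response } from "express";

async function exampleController(req: Request, res: Response) {
  // A schema.parse() throw here becomes a 400 (ZodError) or 500 (anything else).
  res.status(200).json({ success: true });
}

// v1Router.post("/v1/example", authMiddleware(RateLimiterMode.Scrape), wrap(exampleController));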