Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)

Commit message: add zod, create middleware, update openapi declaration, add crawl logic
Parent: 4165de1773
Commit: 8b7569f8f3

apps/api/openapi-v0.json (new file, 924 lines added)

@@ -0,0 +1,924 @@
{
|
||||
"openapi": "3.0.0",
|
||||
"info": {
|
||||
"title": "Firecrawl API",
|
||||
"version": "0.0.0",
|
||||
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
|
||||
"contact": {
|
||||
"name": "Firecrawl Support",
|
||||
"url": "https://firecrawl.dev/support",
|
||||
"email": "support@firecrawl.dev"
|
||||
}
|
||||
},
|
||||
"servers": [
|
||||
{
|
||||
"url": "https://api.firecrawl.dev/v0"
|
||||
}
|
||||
],
|
||||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"extractorOptions": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ScrapeResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl": {
|
||||
"post": {
|
||||
"summary": "Crawl multiple URLs based on options",
|
||||
"operationId": "crawlUrls",
|
||||
"tags": ["Crawling"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The base URL to start crawling from"
|
||||
},
|
||||
"crawlerOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"includes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to include"
|
||||
},
|
||||
"excludes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to exclude"
|
||||
},
|
||||
"generateImgAltText": {
|
||||
"type": "boolean",
|
||||
"description": "Generate alt text for images using LLMs (must have a paid plan)",
|
||||
"default": false
|
||||
},
|
||||
"returnOnlyUrls": {
|
||||
"type": "boolean",
|
||||
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
|
||||
"default": false
|
||||
},
|
||||
"maxDepth": {
|
||||
"type": "integer",
|
||||
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["default", "fast"],
|
||||
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
|
||||
"default": "default"
|
||||
},
|
||||
"ignoreSitemap": {
|
||||
"type": "boolean",
|
||||
"description": "Ignore the website sitemap when crawling",
|
||||
"default": false
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of pages to crawl",
|
||||
"default": 10000
|
||||
},
|
||||
"allowBackwardCrawling": {
|
||||
"type": "boolean",
|
||||
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
|
||||
"default": false
|
||||
},
|
||||
"allowExternalContentLinks": {
|
||||
"type": "boolean",
|
||||
"description": "Allows the crawler to follow links to external websites.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/CrawlResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search": {
|
||||
"post": {
|
||||
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
|
||||
"operationId": "searchGoogle",
|
||||
"tags": ["Search"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The query to search for"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"fetchPageContent": {
|
||||
"type": "boolean",
|
||||
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
|
||||
"default": true
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"searchOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of results. Max is 20 during beta."
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/SearchResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/status/{jobId}": {
|
||||
"get": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Get the status of a crawl job",
|
||||
"operationId": "getCrawlStatus",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Status of the job (completed, active, failed, paused)"
|
||||
},
|
||||
"current": {
|
||||
"type": "integer",
|
||||
"description": "Current page number"
|
||||
},
|
||||
"total": {
|
||||
"type": "integer",
|
||||
"description": "Total number of pages"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Data returned from the job (null when it is in progress)"
|
||||
},
|
||||
"partial_data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/cancel/{jobId}": {
|
||||
"delete": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Cancel a crawl job",
|
||||
"operationId": "cancelCrawlJob",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Returns cancelled."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"components": {
|
||||
"securitySchemes": {
|
||||
"bearerAuth": {
|
||||
"type": "http",
|
||||
"scheme": "bearer"
|
||||
}
|
||||
},
|
||||
"schemas": {
|
||||
"ScrapeResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlStatusResponseObj": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"SearchResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
]
|
||||
}
|
|
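For orientation, here is a minimal client-side sketch of a request against the v0 spec above. The endpoint path and field names come from the spec itself; the API key and target URL are placeholder values.

// Minimal sketch: POST /v0/scrape as described by the spec above.
// FIRECRAWL_API_KEY and the target URL are placeholders.
const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://example.com",
    pageOptions: { onlyMainContent: true, includeHtml: false },
    timeout: 30000,
  }),
});
const { success, data } = await res.json(); // data.markdown, data.content, data.metadata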
@@ -18,8 +18,8 @@
|
|||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"summary": "Scrape a single URL",
|
||||
"operationId": "scrape",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
|
@@ -38,94 +38,47 @@
|
|||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
"formats": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
|
||||
},
|
||||
"description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
|
||||
"default": ["markdown"]
|
||||
},
|
||||
"extractorOptions": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"excludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": true
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
|
@@ -741,24 +694,42 @@
|
|||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Warning message to let you know of any issues."
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
|
@@ -780,27 +751,16 @@
|
|||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -810,24 +770,33 @@
|
|||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
|
@@ -849,11 +818,11 @@
|
|||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
|
@@ -871,34 +840,63 @@
|
|||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
"markdown": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -909,8 +907,15 @@
|
|||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
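The hunks above rework the /scrape request body for v1: the per-format booleans (includeHtml, includeRawHtml, screenshot, and so on) are replaced by a single formats array, onlyIncludeTags/removeTags become includeTags/excludeTags, and onlyMainContent now defaults to true. A sketch of the new request shape, using placeholder values:

// Sketch of the updated v1 scrape request body reflected in the diff above.
// The target URL and tag selectors are placeholder values.
const scrapeBody = {
  url: "https://example.com",
  formats: ["markdown", "html", "links"], // default is ["markdown"]
  includeTags: ["article", ".content"],
  excludeTags: ["script", ".ad", "#footer"],
  onlyMainContent: true, // default changed to true in this diff
  timeout: 30000,
  waitFor: 0,
};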
|
|
@@ -106,7 +106,7 @@
     "uuid": "^10.0.0",
     "wordpos": "^2.1.0",
     "xml2js": "^0.6.2",
-    "zod": "^3.23.4",
+    "zod": "^3.23.8",
     "zod-to-json-schema": "^3.23.1"
   },
   "nodemonConfig": {
|
|
|
@@ -189,7 +189,7 @@ importers:
         specifier: ^0.6.2
         version: 0.6.2
       zod:
-        specifier: ^3.23.4
+        specifier: ^3.23.8
         version: 3.23.8
       zod-to-json-schema:
         specifier: ^3.23.1
|
|
|
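The zod bump above goes with the commit's "create middleware" step. The middleware file itself is not part of this excerpt, so the following is only an assumed sketch of how a zod schema could be attached to a route; validateBody is a hypothetical helper name, while crawlRequestSchema is imported from ./types in the controller diffs below but its definition is not shown here.

// Assumed sketch only: a generic zod validation middleware for Express.
// The actual middleware added by this commit is not shown in this excerpt.
import { Request, Response, NextFunction } from "express";
import { ZodSchema } from "zod";

const validateBody =
  (schema: ZodSchema) =>
  (req: Request, res: Response, next: NextFunction) => {
    const parsed = schema.safeParse(req.body);
    if (!parsed.success) {
      // reject invalid bodies before they reach the controller
      return res.status(400).json({ success: false, error: parsed.error.issues });
    }
    req.body = parsed.data; // hand the parsed, defaulted body onward
    next();
  };

// e.g. app.post("/v1/crawl", validateBody(crawlRequestSchema), crawlController);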
@@ -1,86 +1,68 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { RequestWithAuth } from "./types";
|
||||
|
||||
export async function crawlStatusController(req: Request, res: Response) {
|
||||
// TODO: validate req.params.jobId
|
||||
export async function crawlStatusController(req: RequestWithAuth, res: Response) {
|
||||
// const job = await getWebScraperQueue().getJob(req.params.jobId);
|
||||
// if (!job) {
|
||||
// return res.status(404).json({ error: "Job not found" });
|
||||
// }
|
||||
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.CrawlStatus
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
// const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||
|
||||
// const job = await getWebScraperQueue().getJob(req.params.jobId);
|
||||
// if (!job) {
|
||||
// return res.status(404).json({ error: "Job not found" });
|
||||
// }
|
||||
// let data = job.returnvalue;
|
||||
// if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
// const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
|
||||
// const { current, current_url, total, current_step, partialDocs } = await job.progress();
|
||||
// if (supabaseData) {
|
||||
// data = supabaseData.docs;
|
||||
// }
|
||||
// }
|
||||
|
||||
// let data = job.returnvalue;
|
||||
// if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
// const supabaseData = await supabaseGetJobById(req.params.jobId);
|
||||
// const jobStatus = await job.getState();
|
||||
|
||||
// if (supabaseData) {
|
||||
// data = supabaseData.docs;
|
||||
// }
|
||||
// }
|
||||
|
||||
// const jobStatus = await job.getState();
|
||||
|
||||
// mock:
|
||||
const id = uuidv4();
|
||||
const result = {
|
||||
totalCount: 100,
|
||||
creditsUsed: 2,
|
||||
expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
|
||||
status: "scraping", // scraping, completed, failed
|
||||
next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
|
||||
data: [{
|
||||
markdown: "test",
|
||||
content: "test",
|
||||
html: "test",
|
||||
rawHtml: "test",
|
||||
linksOnPage: ["test1", "test2"],
|
||||
screenshot: "test",
|
||||
metadata: {
|
||||
title: "test",
|
||||
description: "test",
|
||||
language: "test",
|
||||
sourceURL: "test",
|
||||
statusCode: 200,
|
||||
error: "test"
|
||||
}
|
||||
},
|
||||
{
|
||||
markdown: "test",
|
||||
content: "test",
|
||||
html: "test",
|
||||
rawHtml: "test",
|
||||
linksOnPage: ["test1", "test2"],
|
||||
screenshot: "test",
|
||||
metadata: {
|
||||
title: "test",
|
||||
description: "test",
|
||||
language: "test",
|
||||
sourceURL: "test",
|
||||
statusCode: 200,
|
||||
error: "test"
|
||||
}
|
||||
}]
|
||||
}
|
||||
|
||||
res.status(200).json(result);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
// mock:
|
||||
const id = uuidv4();
|
||||
const result = {
|
||||
totalCount: 100,
|
||||
creditsUsed: 2,
|
||||
expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
|
||||
status: "scraping", // scraping, completed, failed
|
||||
next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
|
||||
data: [{
|
||||
markdown: "test",
|
||||
content: "test",
|
||||
html: "test",
|
||||
rawHtml: "test",
|
||||
linksOnPage: ["test1", "test2"],
|
||||
screenshot: "test",
|
||||
metadata: {
|
||||
title: "test",
|
||||
description: "test",
|
||||
language: "test",
|
||||
sourceURL: "test",
|
||||
statusCode: 200,
|
||||
error: "test"
|
||||
}
|
||||
},
|
||||
{
|
||||
markdown: "test",
|
||||
content: "test",
|
||||
html: "test",
|
||||
rawHtml: "test",
|
||||
linksOnPage: ["test1", "test2"],
|
||||
screenshot: "test",
|
||||
metadata: {
|
||||
title: "test",
|
||||
description: "test",
|
||||
language: "test",
|
||||
sourceURL: "test",
|
||||
statusCode: 200,
|
||||
error: "test"
|
||||
}
|
||||
}]
|
||||
}
|
||||
|
||||
res.status(200).json(result);
|
||||
}
|
||||
|
||||
|
|
|
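Both v1 controllers type their request as RequestWithAuth from ./types, which is not included in this diff. As a reading aid only, here is a plausible shape inferred from how req.auth.team_id and the response/request generics are used below; this is an assumption, not the committed definition.

// Assumed sketch of RequestWithAuth (the real type lives in ./types and is
// not part of this excerpt). Generic order mirrors the controllers below:
// RequestWithAuth<ResBody, ReqBody>.
import { Request } from "express";

type AuthInfo = {
  team_id: string;
  plan?: string;
};

type RequestWithAuth<ResBody = any, ReqBody = any> = Request<{}, ResBody, ReqBody> & {
  auth: AuthInfo;
};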
@@ -1,140 +1,87 @@
|
|||
import { Request, Response } from "express";
|
||||
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import { CrawlRequest, CrawlResponse } from "./types";
|
||||
import { CrawlRequest, crawlRequestSchema, CrawlResponse, legacyCrawlerOptions, legacyScrapeOptions, RequestWithAuth } from "./types";
|
||||
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
|
||||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob } from "../../services/queue-jobs";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
export async function crawlController(req: Request<{}, {}, CrawlRequest>, res: Response<CrawlResponse>) {
|
||||
// expected req.body
|
||||
export async function crawlController(req: RequestWithAuth<CrawlResponse, CrawlRequest>, res: Response<CrawlResponse>) {
|
||||
req.body = crawlRequestSchema.parse(req.body);
|
||||
|
||||
const id = uuidv4();
|
||||
|
||||
// req.body = {
|
||||
// url: string
|
||||
// crawlerOptions: {
|
||||
// includePaths: string[]
|
||||
// excludePaths: string[]
|
||||
// maxDepth: number
|
||||
// limit: number
|
||||
// allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME???
|
||||
// allowExternalLinks: boolean
|
||||
// ignoreSitemap: number
|
||||
// }
|
||||
// scrapeOptions: Exclude<Scrape, "url">
|
||||
// }
|
||||
await logCrawl(id, req.auth.team_id);
|
||||
|
||||
const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions),
|
||||
pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
};
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Crawl
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ success: false, error });
|
||||
}
|
||||
|
||||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
return res.status(409).json({ success: false, error: "Idempotency key already used" });
|
||||
}
|
||||
try {
|
||||
createIdempotencyKey(req);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
}
|
||||
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
return res.status(402).json({ success: false, error: "Insufficient credits" });
|
||||
}
|
||||
|
||||
let url = req.body.url;
|
||||
if (!url) {
|
||||
return res.status(400).json({ success: false, error: "Url is required" });
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res
|
||||
.status(403)
|
||||
.json({
|
||||
success: false,
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
url = checkAndUpdateURL(url).url;
|
||||
} catch (error) {
|
||||
return res.status(400).json({ success: false, error: 'Invalid Url' });
|
||||
}
|
||||
|
||||
// TODO: add job to queue
|
||||
|
||||
const id = uuidv4();
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
id,
|
||||
url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`,
|
||||
});
|
||||
|
||||
// const mode = req.body.mode ?? "crawl";
|
||||
|
||||
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
// try {
|
||||
// const a = new WebScraperDataProvider();
|
||||
// await a.setOptions({
|
||||
// jobId: uuidv4(),
|
||||
// mode: "single_urls",
|
||||
// urls: [url],
|
||||
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
// pageOptions: pageOptions,
|
||||
// });
|
||||
|
||||
// const docs = await a.getDocuments(false, (progress) => {
|
||||
// job.progress({
|
||||
// current: progress.current,
|
||||
// total: progress.total,
|
||||
// current_step: "SCRAPING",
|
||||
// current_url: progress.currentDocumentUrl,
|
||||
// });
|
||||
// });
|
||||
// return res.json({
|
||||
// success: true,
|
||||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
|
||||
// const job = await addWebScraperJob({
|
||||
// url: url,
|
||||
// mode: mode ?? "crawl", // fix for single urls not working
|
||||
// crawlerOptions: crawlerOptions,
|
||||
// team_id: team_id,
|
||||
// pageOptions: pageOptions,
|
||||
// origin: req.body.origin ?? defaultOrigin,
|
||||
// });
|
||||
|
||||
// await logCrawl(job.id.toString(), team_id);
|
||||
|
||||
// res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ success: false, error: error.message });
|
||||
sc.robots = await crawler.getRobotsTxt();
|
||||
} catch (e) {
|
||||
Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
|
||||
}
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
|
||||
|
||||
if (sitemap !== null) {
|
||||
const jobs = sitemap.map(x => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
team_id: req.auth.team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 20,
|
||||
}
|
||||
};
|
||||
})
|
||||
|
||||
await lockURLs(id, jobs.map(x => x.data.url));
|
||||
await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
} else {
|
||||
await lockURL(id, sc, req.body.url);
|
||||
const job = await addScrapeJob({
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: req.auth.team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
}, {
|
||||
priority: 15,
|
||||
});
|
||||
await addCrawlJob(id, job.id);
|
||||
}
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
id,
|
||||
url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`,
|
||||
});
|
||||
}
|
||||
|
|
|
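The crawl controller above validates the body with crawlRequestSchema, stores a StoredCrawl keyed by a new UUID, locks URLs in Redis, and either bulk-enqueues sitemap URLs or a single scrape job before returning the status URL. A sketch of the request a client would send to it, with field names taken from the "expected req.body" comment in the diff and placeholder values:

// Sketch: what a client request to the new v1 crawl endpoint might look like,
// based on the "expected req.body" comment in the controller diff above.
// The host, API key, and option values are placeholders.
const crawlRes = await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://example.com",
    crawlerOptions: {
      includePaths: ["/blog/*"],
      excludePaths: ["/admin/*"],
      maxDepth: 2,
      limit: 100,
      ignoreSitemap: false,
    },
    scrapeOptions: { formats: ["markdown"] },
  }),
});
const { success, id, url } = await crawlRes.json(); // poll `url` for crawl status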
@@ -1,12 +1,12 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import { MapRequest, MapResponse } from "./types";
|
||||
import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
|
||||
import { checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
|
||||
export async function mapController(req: Request<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
|
||||
export async function mapController(req: RequestWithAuth<MapResponse, MapRequest>, res: Response<MapResponse>) {
|
||||
req.body = mapRequestSchema.parse(req.body);
|
||||
console.log(req.body);
|
||||
// expected req.body
|
||||
|
||||
// req.body = {
|
||||
|
@@ -14,106 +14,53 @@ export async function mapController(req: Request<{}, MapResponse, MapRequest>, r
|
|||
// crawlerOptions:
|
||||
// }
|
||||
|
||||
try {
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Crawl
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ success: false, error });
|
||||
}
|
||||
|
||||
// if (req.headers["x-idempotency-key"]) {
|
||||
// const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
// if (!isIdempotencyValid) {
|
||||
// return res.status(409).json({ error: "Idempotency key already used" });
|
||||
// }
|
||||
// try {
|
||||
// createIdempotencyKey(req);
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
return res.status(200).json({ success: true, links: [ "test1", "test2" ] });
|
||||
|
||||
// const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
// await checkTeamCredits(team_id, 1);
|
||||
// if (!creditsCheckSuccess) {
|
||||
// return res.status(402).json({ error: "Insufficient credits" });
|
||||
// }
|
||||
// const mode = req.body.mode ?? "crawl";
|
||||
|
||||
let url = req.body.url;
|
||||
if (!url) {
|
||||
return res.status(400).json({ success: false, error: "Url is required" });
|
||||
}
|
||||
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return res
|
||||
.status(403)
|
||||
.json({
|
||||
success: false,
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
});
|
||||
}
|
||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
// try {
|
||||
// const a = new WebScraperDataProvider();
|
||||
// await a.setOptions({
|
||||
// jobId: uuidv4(),
|
||||
// mode: "single_urls",
|
||||
// urls: [url],
|
||||
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
// pageOptions: pageOptions,
|
||||
// });
|
||||
|
||||
try {
|
||||
url = checkAndUpdateURL(url).url;
|
||||
} catch (error) {
|
||||
return res.status(400).json({ success: false, error: 'Invalid Url' });
|
||||
}
|
||||
// const docs = await a.getDocuments(false, (progress) => {
|
||||
// job.progress({
|
||||
// current: progress.current,
|
||||
// total: progress.total,
|
||||
// current_step: "SCRAPING",
|
||||
// current_url: progress.currentDocumentUrl,
|
||||
// });
|
||||
// });
|
||||
// return res.json({
|
||||
// success: true,
|
||||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
|
||||
return res.status(200).json({ success: true, links: [ "test1", "test2" ] });
|
||||
// const job = await addWebScraperJob({
|
||||
// url: url,
|
||||
// mode: mode ?? "crawl", // fix for single urls not working
|
||||
// crawlerOptions: crawlerOptions,
|
||||
// team_id: team_id,
|
||||
// pageOptions: pageOptions,
|
||||
// origin: req.body.origin ?? defaultOrigin,
|
||||
// });
|
||||
|
||||
// const mode = req.body.mode ?? "crawl";
|
||||
// await logCrawl(job.id.toString(), team_id);
|
||||
|
||||
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
// try {
|
||||
// const a = new WebScraperDataProvider();
|
||||
// await a.setOptions({
|
||||
// jobId: uuidv4(),
|
||||
// mode: "single_urls",
|
||||
// urls: [url],
|
||||
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
// pageOptions: pageOptions,
|
||||
// });
|
||||
|
||||
// const docs = await a.getDocuments(false, (progress) => {
|
||||
// job.progress({
|
||||
// current: progress.current,
|
||||
// total: progress.total,
|
||||
// current_step: "SCRAPING",
|
||||
// current_url: progress.currentDocumentUrl,
|
||||
// });
|
||||
// });
|
||||
// return res.json({
|
||||
// success: true,
|
||||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
|
||||
// const job = await addWebScraperJob({
|
||||
// url: url,
|
||||
// mode: mode ?? "crawl", // fix for single urls not working
|
||||
// crawlerOptions: crawlerOptions,
|
||||
// team_id: team_id,
|
||||
// pageOptions: pageOptions,
|
||||
// origin: req.body.origin ?? defaultOrigin,
|
||||
// });
|
||||
|
||||
// await logCrawl(job.id.toString(), team_id);
|
||||
|
||||
// res.json({ jobId: job.id });
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
// res.json({ jobId: job.id });
|
||||
}
|
||||
|
|
|
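The map controller above is still stubbed: it validates the body with mapRequestSchema and returns a hard-coded link list. A sketch of calling it at this stage of the commit, with placeholder host and key:

// Sketch: POST /v1/map against the stubbed controller above.
// Host, API key, and URL are placeholders.
const mapRes = await fetch("https://api.firecrawl.dev/v1/map", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({ url: "https://example.com" }),
});
const { success, links } = await mapRes.json(); // currently { success: true, links: ["test1", "test2"] }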
@@ -1,26 +1,11 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { Logger } from '../../lib/logger';
|
||||
import { checkAndUpdateURL } from '../../lib/validateUrl';
|
||||
import { ScrapeRequest, ScrapeResponse } from "./types";
|
||||
import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
|
||||
|
||||
export async function scrapeController(req: Request<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
|
||||
let url = req.body.url;
|
||||
if (!url) {
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||
}
|
||||
|
||||
try {
|
||||
url = checkAndUpdateURL(url).url;
|
||||
} catch (error) {
|
||||
return { success: false, error: "Invalid URL", returnCode: 400 };
|
||||
}
|
||||
export async function scrapeController(req: RequestWithAuth<ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
|
||||
req.body = scrapeRequestSchema.parse(req.body);
|
||||
console.log(req.body);
|
||||
|
||||
// TODO: check req.body
|
||||
// mockup req.body
|
||||
|
@@ -37,137 +22,124 @@ export async function scrapeController(req: Request<{}, ScrapeResponse, ScrapeRe
  //   waitFor: number
  // }

  try {
    let earlyReturn = false;
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status, plan } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Scrape
    );
    if (!success) {
      return res.status(status).json({ success: false, error });
    }
  let earlyReturn = false;
  // make sure to authenticate user first, Bearer <token>

  // check credits

  const result: ScrapeResponse = {
    success: true,
    warning: "test",
    data: {
      markdown: "test",
      html: "test",
      rawHtml: "test",
      links: ["test1", "test2"],
      screenshot: "test",
      metadata: {
        title: "test",
        description: "test",
        language: "test",
        sourceURL: "test",
        statusCode: 200,
        error: "test"
      }
    }
  }

  return res.status(200).json(result);

  // const crawlerOptions = req.body.crawlerOptions ?? {};
  // const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
  // const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
  // const origin = req.body.origin ?? defaultOrigin;
  // let timeout = req.body.timeout ?? defaultTimeout;

  // if (extractorOptions.mode.includes("llm-extraction")) {
  //   pageOptions.onlyMainContent = true;
  //   timeout = req.body.timeout ?? 90000;
  // }

  // const checkCredits = async () => {
  //   try {
  //     const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
  //     if (!creditsCheckSuccess) {
  //       earlyReturn = true;
  //       return res.status(402).json({ error: "Insufficient credits" });
  //     }
  //   } catch (error) {
  //     Logger.error(error);
  //     earlyReturn = true;
  //     return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
  //   }
  // };

  // await checkCredits();

  // const jobId = uuidv4();

  // const startTime = new Date().getTime();
  // const result = await scrapeHelper(
  //   jobId,
  //   req,
  //   team_id,
  //   crawlerOptions,
  //   pageOptions,
  //   extractorOptions,
  //   timeout,
  //   plan
  // );
  // const endTime = new Date().getTime();
  // const timeTakenInSeconds = (endTime - startTime) / 1000;
  // const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;

  // if (result.success) {
  //   let creditsToBeBilled = 1; // Assuming 1 credit per document
  //   const creditsPerLLMExtract = 50;

  //   if (extractorOptions.mode.includes("llm-extraction")) {
  //     // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
  //     creditsToBeBilled += creditsPerLLMExtract;
  //   }

  //   let startTimeBilling = new Date().getTime();

  //   if (earlyReturn) {
  //     // Don't bill if we're early returning
  //     return;
  //   }
  //   const billingResult = await billTeam(
  //     team_id,
  //     creditsToBeBilled
  //   );
  //   if (!billingResult.success) {
  //     return res.status(402).json({
  //       success: false,
  //       error: "Failed to bill team. Insufficient credits or subscription not found.",
  //     });
  //   }
  // }

  // logJob({
  //   job_id: jobId,
  //   success: result.success,
  //   message: result.error,
  //   num_docs: 1,
  //   docs: [result.data],
  //   time_taken: timeTakenInSeconds,
  //   team_id: team_id,
  //   mode: "scrape",
  //   url: req.body.url,
  //   crawlerOptions: crawlerOptions,
  //   pageOptions: pageOptions,
  //   origin: origin,
  //   extractor_options: extractorOptions,
  //   num_tokens: numTokens,
  // });

  // return res.status(result.returnCode).json(result);
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ success: false, error: error.message });
  }

}

@@ -1,36 +1,96 @@
import { Request } from "express";
import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities";

export type Format = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "screenshot@fullPage";

export type ScrapeRequest = {
  url: string;
  formats?: Format[];
  headers?: { [K: string]: string };
  includeTags?: string[];
  excludeTags?: string[];
  onlyMainContent?: boolean;
  timeout?: number;
  waitFor?: number;
}
const url = z.preprocess(x => {
  if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
    if (x.startsWith("://")) {
      return "http" + x;
    } else {
      return "http://" + x;
    }
  } else {
    return x;
  }
}, z.string().url().regex(/^https?:\/\//, "URL uses unsupported protocol").refine(x => !isUrlBlocked(x), "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."));
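
For reference, a rough sketch of how this preprocessor behaves on a few sample inputs (the values below are illustrative, not taken from the commit):

// Illustrative only: expected behavior of the url schema defined above.
url.parse("example.com/docs");    // -> "http://example.com/docs" (scheme prepended, then validated)
url.parse("https://example.com"); // -> passes through unchanged
url.parse("ftp://example.com");   // -> throws ZodError: "URL uses unsupported protocol"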

export type CrawlerOptions = {
  includePaths?: string[];
  excludePaths?: string[];
  maxDepth?: number;
  limit?: number;
  allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
  allowExternalLinks?: boolean;
  ignoreSitemap?: boolean;
};

export const scrapeOptions = z.object({
  formats: z.enum(["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"])
    .array()
    .optional()
    .default(["markdown"]),
  headers: z.record(z.string(), z.string()).optional(),
  includeTags: z.string().array().optional(),
  excludeTags: z.string().array().optional(),
  onlyMainContent: z.boolean().default(true),
  timeout: z.number().int().positive().finite().safe().default(30000), // default?
  waitFor: z.number().int().nonnegative().finite().safe().default(0),
});
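
A quick hypothetical parse shows which defaults the schema fills in when a field is omitted:

// Illustrative only: defaults applied by scrapeOptions when fields are omitted.
const opts = scrapeOptions.parse({});
// opts => { formats: ["markdown"], onlyMainContent: true, timeout: 30000, waitFor: 0 }
// headers, includeTags and excludeTags stay undefined; unknown keys are stripped by z.object().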

export type CrawlRequest = {
  url: string;
  crawlerOptions?: CrawlerOptions;
  scrapeOptions?: Exclude<ScrapeRequest, "url">;
};

export type ScrapeOptions = z.infer<typeof scrapeOptions>;

export type MapRequest = {
  url: string;
  crawlerOptions?: CrawlerOptions;
};

export const scrapeRequestSchema = scrapeOptions.extend({ url });

// export type ScrapeRequest = {
//   url: string;
//   formats?: Format[];
//   headers?: { [K: string]: string };
//   includeTags?: string[];
//   excludeTags?: string[];
//   onlyMainContent?: boolean;
//   timeout?: number;
//   waitFor?: number;
// }

export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;

const crawlerOptions = z.object({
  includePaths: z.string().array().default([]),
  excludePaths: z.string().array().default([]),
  maxDepth: z.number().default(10), // default?
  limit: z.number().default(10000), // default?
  allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
  allowExternalLinks: z.boolean().default(false),
  ignoreSitemap: z.boolean().default(true),
});

// export type CrawlerOptions = {
//   includePaths?: string[];
//   excludePaths?: string[];
//   maxDepth?: number;
//   limit?: number;
//   allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
//   allowExternalLinks?: boolean;
//   ignoreSitemap?: boolean;
// };

export type CrawlerOptions = z.infer<typeof crawlerOptions>;

export const crawlRequestSchema = z.object({
  url,
  crawlerOptions: crawlerOptions.default({}),
  scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
});
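
Because both nested schemas carry .default({}), a bare URL should already be a complete crawl request; a hypothetical parse (annotations written by hand):

// Illustrative only: nested defaults for a minimal crawl request.
const crawlReq = crawlRequestSchema.parse({ url: "firecrawl.dev" });
// crawlReq.url            => "http://firecrawl.dev" (normalized by the url preprocessor above)
// crawlReq.crawlerOptions => { includePaths: [], excludePaths: [], maxDepth: 10, limit: 10000,
//                              allowBackwardLinks: false, allowExternalLinks: false, ignoreSitemap: true }
// crawlReq.scrapeOptions  => { formats: ["markdown"], onlyMainContent: true, waitFor: 0 } — timeout is omitted here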

// export type CrawlRequest = {
//   url: string;
//   crawlerOptions?: CrawlerOptions;
//   scrapeOptions?: Exclude<ScrapeRequest, "url">;
// };

export type CrawlRequest = z.infer<typeof crawlRequestSchema>;

export const mapRequestSchema = crawlerOptions.extend({ url });

// export type MapRequest = {
//   url: string;
//   crawlerOptions?: CrawlerOptions;
// };

export type MapRequest = z.infer<typeof mapRequestSchema>;

export type Document = {
  markdown?: string,

@@ -77,6 +137,7 @@ export type Document = {
export type ErrorResponse = {
  success: false;
  error: string;
  details?: any;
};

export type ScrapeResponse = ErrorResponse | {
@@ -95,3 +156,40 @@ export type MapResponse = ErrorResponse | {
  success: true;
  links: string[];
}

type AuthObject = {
  team_id: string;
  plan: string;
}

export interface RequestWithMaybeAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
  auth?: AuthObject;
}

export interface RequestWithAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
  auth: AuthObject;
}

export function legacyCrawlerOptions(x: CrawlerOptions) {
  return {
    includes: x.includePaths,
    excludes: x.excludePaths,
    maxCrawledLinks: x.limit,
    maxCrawledDepth: x.maxDepth,
    limit: x.limit,
    generateImgAltText: false,
    allowBackwardCrawling: x.allowBackwardLinks,
    allowExternalContentLinks: x.allowExternalLinks,
  };
}

export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
  return {
    includeHtml: x.formats.includes("html"),
    includeRawHtml: x.formats.includes("rawHtml"),
    onlyIncludeTags: x.includeTags,
    removeTags: x.excludeTags,
    onlyMainContent: x.onlyMainContent,
    waitFor: x.waitFor,
  };
}
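
To make the bridging role of these helpers concrete, here is a hypothetical v1 body mapped down to the legacy v0 PageOptions (output annotated by hand, assuming the schemas above):

// Illustrative only: how legacyScrapeOptions translates parsed v1 options into v0 PageOptions.
const pageOptions = legacyScrapeOptions(scrapeOptions.parse({ formats: ["markdown", "html"], waitFor: 500 }));
// pageOptions => { includeHtml: true, includeRawHtml: false, onlyIncludeTags: undefined,
//                  removeTags: undefined, onlyMainContent: true, waitFor: 500 }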

@@ -1,9 +1,18 @@
import express from "express";
import express, { NextFunction, Request, Response } from "express";
import { crawlController } from "../../src/controllers/v1/crawl";
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { scrapeController } from "../../src/controllers/v1/scrape";
import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { mapController } from "../../src/controllers/v1/map";
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
import { RateLimiterMode } from "../types";
import { authenticateUser } from "../controllers/v1/auth";
import { Logger } from "../lib/logger";
import { createIdempotencyKey } from "../services/idempotency/create";
import { validateIdempotencyKey } from "../services/idempotency/validate";
import { ZodError } from "zod";
import { checkTeamCredits } from "../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@@ -12,13 +21,96 @@ import { mapController } from "../../src/controllers/v1/map";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";

function checkCreditsMiddleware(minimum: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
  return (req, res, next) => {
    (async () => {
      if (!(await checkTeamCredits(req.auth.team_id, minimum)).success) {
        return res.status(402).json({ success: false, error: "Insufficient credits" });
      }
      next();
    })()
      .catch(err => next(err));
  };
}

function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
  return (req, res, next) => {
    (async () => {
      const { success, team_id, error, status, plan } = await authenticateUser(
        req,
        res,
        rateLimiterMode,
      );

      if (!success) {
        return res.status(status).json({ success: false, error });
      }

      req.auth = { team_id, plan };
      next();
    })()
      .catch(err => next(err));
  }
}

function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
  (async () => {
    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ success: false, error: "Idempotency key already used" });
      }
      // try {
        createIdempotencyKey(req);
      // } catch (error) {
      //   Logger.error(error);
      //   return res.status(500).json({ success: false, error: error.message });
      // }
    }
    next();
  })()
    .catch(err => next(err));
}

function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
  return (req, res, next) => {
    controller(req, res)
      .catch(err => next(err))
  }
}
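
One reading of the helper above (not stated in the commit): Express 4 does not hand rejected promises from async handlers to error middleware on its own, so wrap() turns any controller rejection, including the ZodError thrown by scrapeRequestSchema.parse, into a next(err) call that the error handler at the bottom of this router can catch. Roughly:

// Illustrative only: what wrap(scrapeController) expands to.
const handler = (req: Request, res: Response, next: NextFunction) => {
  scrapeController(req as any, res).catch(err => next(err)); // rejections reach the error middleware
};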

export const v1Router = express.Router();

v1Router.post("/v1/scrape", scrapeController);
v1Router.post("/v1/crawl", crawlController);
v1Router.get("/v1/crawl/:jobId", crawlStatusController);
v1Router.post(
  "/v1/scrape",
  authMiddleware(RateLimiterMode.Scrape),
  checkCreditsMiddleware(1),
  wrap(scrapeController)
);

v1Router.post(
  "/v1/crawl",
  authMiddleware(RateLimiterMode.Crawl),
  idempotencyMiddleware,
  checkCreditsMiddleware(1),
  wrap(crawlController)
);

v1Router.post(
  "/v1/map",
  authMiddleware(RateLimiterMode.Crawl),
  checkCreditsMiddleware(1),
  wrap(mapController)
);

v1Router.get(
  "/v1/crawl/:jobId",
  authMiddleware(RateLimiterMode.CrawlStatus),
  wrap(crawlStatusController)
);

// v1Router.post("/v1/crawlWebsitePreview", crawlPreviewController);
// v1Router.delete("/v1/crawl/cancel/:jobId", crawlCancelController);
// v1Router.delete("/v1/crawl/:jobId", crawlCancelController);
// v1Router.get("/v1/checkJobStatus/:jobId", crawlJobStatusPreviewController);

// // Auth route for key based authentication
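
For orientation, a hypothetical client call against the routes registered above; the base URL is assumed from the OpenAPI servers entry, and the API key, idempotency key and target URL are placeholders. The x-idempotency-key header is only consulted on /v1/crawl, where idempotencyMiddleware runs:

// Illustrative only: calling the new v1 crawl route from a TypeScript client.
const resp = await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    "Authorization": "Bearer fc-YOUR-API-KEY",                    // placeholder token
    "x-idempotency-key": "3f2c1a9e-0000-0000-0000-000000000000",  // optional; reusing a key returns 409
  },
  body: JSON.stringify({ url: "https://firecrawl.dev", crawlerOptions: { limit: 100 } }),
});
const body = await resp.json(); // at this stage of the branch: { success: true, links: ["test1", "test2"] }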

@@ -31,4 +123,12 @@ v1Router.get("/v1/crawl/:jobId", crawlStatusController);
// v1Router.get("/v1/health/liveness", livenessController);
// v1Router.get("/v1/health/readiness", readinessController);

v1Router.post("/v1/map", mapController);
v1Router.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
  if (err instanceof ZodError) {
    res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
  } else {
    const id = uuidv4();
    Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + JSON.stringify(err));
    res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + "" });
  }
});