mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Nick: fixes js and pydantic implementation
This commit is contained in:
parent
c89964b230
commit
e6dbbf1bab
|
@ -8,6 +8,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|||
});
|
||||
};
|
||||
import axios from "axios";
|
||||
import { z } from "zod";
|
||||
import { zodToJsonSchema } from "zod-to-json-schema";
|
||||
/**
|
||||
* Main class for interacting with the Firecrawl API.
|
||||
|
@ -38,7 +39,11 @@ export default class FirecrawlApp {
|
|||
};
|
||||
let jsonData = Object.assign({ url }, params);
|
||||
if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) {
|
||||
const schema = zodToJsonSchema(params.extractorOptions.extractionSchema);
|
||||
let schema = params.extractorOptions.extractionSchema;
|
||||
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
||||
if (schema instanceof z.ZodSchema) {
|
||||
schema = zodToJsonSchema(schema);
|
||||
}
|
||||
jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
|
||||
}
|
||||
try {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@mendable/firecrawl-js",
|
||||
"version": "0.0.17",
|
||||
"version": "0.0.19",
|
||||
"description": "JavaScript SDK for Firecrawl API",
|
||||
"main": "build/index.js",
|
||||
"types": "types/index.d.ts",
|
||||
|
|
|
@ -91,9 +91,11 @@ export default class FirecrawlApp {
|
|||
} as AxiosRequestHeaders;
|
||||
let jsonData: Params = { url, ...params };
|
||||
if (params?.extractorOptions?.extractionSchema) {
|
||||
const schema = zodToJsonSchema(
|
||||
params.extractorOptions.extractionSchema as z.ZodSchema
|
||||
);
|
||||
let schema = params.extractorOptions.extractionSchema;
|
||||
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
||||
if (schema instanceof z.ZodSchema) {
|
||||
schema = zodToJsonSchema(schema);
|
||||
}
|
||||
jsonData = {
|
||||
...jsonData,
|
||||
extractorOptions: {
|
||||
|
|
9
apps/js-sdk/package-lock.json
generated
9
apps/js-sdk/package-lock.json
generated
|
@ -9,7 +9,7 @@
|
|||
"version": "1.0.0",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^0.0.17-beta.8",
|
||||
"@mendable/firecrawl-js": "^0.0.19",
|
||||
"axios": "^1.6.8",
|
||||
"ts-node": "^10.9.2",
|
||||
"typescript": "^5.4.5",
|
||||
|
@ -421,11 +421,10 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@mendable/firecrawl-js": {
|
||||
"version": "0.0.17-beta.8",
|
||||
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.17-beta.8.tgz",
|
||||
"integrity": "sha512-d65AW+y4YUQ9oU4Jy8dqiuKBPr+QkAyOKYEwFev/GOpGbNfU6lBUGJlAujVXaVY6fDbUGkHoaEzUbuTsqZV+Ng==",
|
||||
"version": "0.0.19",
|
||||
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.19.tgz",
|
||||
"integrity": "sha512-u9BDVIN/bftDztxLlE2cf02Nz0si3+Vmy9cANDFHj/iriT3guzI8ITBk4uC81CyRmPzNyXrW6hSAG90g9ol4cA==",
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^0.0.17-beta.5",
|
||||
"axios": "^1.6.8",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.0"
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^0.0.17-beta.8",
|
||||
"@mendable/firecrawl-js": "^0.0.19",
|
||||
"axios": "^1.6.8",
|
||||
"ts-node": "^10.9.2",
|
||||
"typescript": "^5.4.5",
|
||||
|
|
|
@ -3,7 +3,7 @@ import { z } from "zod";
|
|||
|
||||
async function a() {
|
||||
const app = new FirecrawlApp({
|
||||
apiKey: "fc-YOUR_FIRECRAWL_API_KEY",
|
||||
apiKey: "fc-YOUR_API_KEY",
|
||||
});
|
||||
|
||||
// Define schema to extract contents into
|
||||
|
@ -20,7 +20,7 @@ async function a() {
|
|||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
|
||||
extractorOptions: { extractionSchema: schema },
|
||||
});
|
||||
console.log(scrapeResult.data["llm_extraction"]);
|
||||
|
|
|
@ -1,13 +1,36 @@
|
|||
from firecrawl import FirecrawlApp
|
||||
|
||||
|
||||
app = FirecrawlApp(api_key="YOUR_API_KEY")
|
||||
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
|
||||
|
||||
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
|
||||
print(crawl_result[0]['markdown'])
|
||||
# crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
|
||||
|
||||
job_id = crawl_result['jobId']
|
||||
print(job_id)
|
||||
# print(crawl_result[0]['markdown'])
|
||||
|
||||
# job_id = crawl_result['jobId']
|
||||
# print(job_id)
|
||||
|
||||
# status = app.check_crawl_status(job_id)
|
||||
# print(status)
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Optional
|
||||
|
||||
class ArticleSchema(BaseModel):
|
||||
title: str
|
||||
points: int
|
||||
by: str
|
||||
commentsURL: str
|
||||
|
||||
class TopArticlesSchema(BaseModel):
|
||||
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
||||
|
||||
a = app.scrape_url('https://news.ycombinator.com', {
|
||||
'extractorOptions': {
|
||||
'extractionSchema': TopArticlesSchema.model_json_schema(),
|
||||
'mode': 'llm-extraction'
|
||||
},
|
||||
'pageOptions':{
|
||||
'onlyMainContent': True
|
||||
}
|
||||
})
|
||||
|
||||
status = app.check_crawl_status(job_id)
|
||||
print(status)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
from typing import Any, Dict, Optional
|
||||
import requests
|
||||
import time
|
||||
|
||||
|
@ -8,26 +9,51 @@ class FirecrawlApp:
|
|||
if self.api_key is None:
|
||||
raise ValueError('No API key provided')
|
||||
|
||||
def scrape_url(self, url, params=None):
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
class ScrapeParams(BaseModel):
|
||||
url: str
|
||||
extractorOptions: Optional[Dict[str, Any]] = None
|
||||
|
||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
}
|
||||
json_data = {'url': url}
|
||||
# Prepare the base scrape parameters with the URL
|
||||
scrape_params = {'url': url}
|
||||
|
||||
# If there are additional params, process them
|
||||
if params:
|
||||
json_data.update(params)
|
||||
# Initialize extractorOptions if present
|
||||
extractor_options = params.get('extractorOptions', {})
|
||||
# Check and convert the extractionSchema if it's a Pydantic model
|
||||
if 'extractionSchema' in extractor_options:
|
||||
if hasattr(extractor_options['extractionSchema'], 'schema'):
|
||||
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
|
||||
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
|
||||
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
|
||||
# Update the scrape_params with the processed extractorOptions
|
||||
scrape_params['extractorOptions'] = extractor_options
|
||||
|
||||
# Include any other params directly at the top level of scrape_params
|
||||
for key, value in params.items():
|
||||
if key != 'extractorOptions':
|
||||
scrape_params[key] = value
|
||||
print(scrape_params)
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
'https://api.firecrawl.dev/v0/scrape',
|
||||
headers=headers,
|
||||
json=json_data
|
||||
json=scrape_params
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
if response['success'] == True:
|
||||
if response['success']:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
||||
|
||||
elif response.status_code in [402, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
|
||||
|
|
|
@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|||
|
||||
setup(
|
||||
name='firecrawl-py',
|
||||
version='0.0.6',
|
||||
version='0.0.7',
|
||||
url='https://github.com/mendableai/firecrawl',
|
||||
author='Mendable.ai',
|
||||
author_email='nick@mendable.ai',
|
||||
|
|
Loading…
Reference in New Issue
Block a user