Nick: fix extraction-schema handling in the JS SDK (convert only Zod schemas to JSON Schema) and the Python SDK (accept Pydantic models)

This commit is contained in:
Nicolas 2024-05-08 17:16:59 -07:00
parent c89964b230
commit e6dbbf1bab
9 changed files with 82 additions and 27 deletions

View File

@ -8,6 +8,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
});
};
import axios from "axios";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
/**
* Main class for interacting with the Firecrawl API.
@ -38,7 +39,11 @@ export default class FirecrawlApp {
};
let jsonData = Object.assign({ url }, params);
if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) {
const schema = zodToJsonSchema(params.extractorOptions.extractionSchema);
let schema = params.extractorOptions.extractionSchema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof z.ZodSchema) {
schema = zodToJsonSchema(schema);
}
jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
}
try {

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "0.0.17",
"version": "0.0.19",
"description": "JavaScript SDK for Firecrawl API",
"main": "build/index.js",
"types": "types/index.d.ts",

View File

@ -91,9 +91,11 @@ export default class FirecrawlApp {
} as AxiosRequestHeaders;
let jsonData: Params = { url, ...params };
if (params?.extractorOptions?.extractionSchema) {
const schema = zodToJsonSchema(
params.extractorOptions.extractionSchema as z.ZodSchema
);
let schema = params.extractorOptions.extractionSchema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof z.ZodSchema) {
schema = zodToJsonSchema(schema);
}
jsonData = {
...jsonData,
extractorOptions: {

View File

@ -9,7 +9,7 @@
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^0.0.17-beta.8",
"@mendable/firecrawl-js": "^0.0.19",
"axios": "^1.6.8",
"ts-node": "^10.9.2",
"typescript": "^5.4.5",
@ -421,11 +421,10 @@
}
},
"node_modules/@mendable/firecrawl-js": {
"version": "0.0.17-beta.8",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.17-beta.8.tgz",
"integrity": "sha512-d65AW+y4YUQ9oU4Jy8dqiuKBPr+QkAyOKYEwFev/GOpGbNfU6lBUGJlAujVXaVY6fDbUGkHoaEzUbuTsqZV+Ng==",
"version": "0.0.19",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.19.tgz",
"integrity": "sha512-u9BDVIN/bftDztxLlE2cf02Nz0si3+Vmy9cANDFHj/iriT3guzI8ITBk4uC81CyRmPzNyXrW6hSAG90g9ol4cA==",
"dependencies": {
"@mendable/firecrawl-js": "^0.0.17-beta.5",
"axios": "^1.6.8",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"

View File

@ -11,7 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^0.0.17-beta.8",
"@mendable/firecrawl-js": "^0.0.19",
"axios": "^1.6.8",
"ts-node": "^10.9.2",
"typescript": "^5.4.5",

View File

@ -3,7 +3,7 @@ import { z } from "zod";
async function a() {
const app = new FirecrawlApp({
apiKey: "fc-YOUR_FIRECRAWL_API_KEY",
apiKey: "fc-YOUR_API_KEY",
});
// Define schema to extract contents into
@ -20,7 +20,7 @@ async function a() {
.length(5)
.describe("Top 5 stories on Hacker News"),
});
const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {
const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
extractorOptions: { extractionSchema: schema },
});
console.log(scrapeResult.data["llm_extraction"]);

View File

@ -1,13 +1,36 @@
from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="YOUR_API_KEY")
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
print(crawl_result[0]['markdown'])
# crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
job_id = crawl_result['jobId']
print(job_id)
# print(crawl_result[0]['markdown'])
# job_id = crawl_result['jobId']
# print(job_id)
# status = app.check_crawl_status(job_id)
# print(status)
from pydantic import BaseModel, Field
from typing import List, Optional
class ArticleSchema(BaseModel):
title: str
points: int
by: str
commentsURL: str
class TopArticlesSchema(BaseModel):
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
a = app.scrape_url('https://news.ycombinator.com', {
'extractorOptions': {
'extractionSchema': TopArticlesSchema.model_json_schema(),
'mode': 'llm-extraction'
},
'pageOptions':{
'onlyMainContent': True
}
})
status = app.check_crawl_status(job_id)
print(status)

View File

@ -1,4 +1,5 @@
import os
from typing import Any, Dict, Optional
import requests
import time
@ -8,26 +9,51 @@ class FirecrawlApp:
if self.api_key is None:
raise ValueError('No API key provided')
def scrape_url(self, url, params=None):
from pydantic import BaseModel
from typing import Optional, Dict, Any
class ScrapeParams(BaseModel):
url: str
extractorOptions: Optional[Dict[str, Any]] = None
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}'
}
json_data = {'url': url}
# Prepare the base scrape parameters with the URL
scrape_params = {'url': url}
# If there are additional params, process them
if params:
json_data.update(params)
# Initialize extractorOptions if present
extractor_options = params.get('extractorOptions', {})
# Check and convert the extractionSchema if it's a Pydantic model
if 'extractionSchema' in extractor_options:
if hasattr(extractor_options['extractionSchema'], 'schema'):
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
# Update the scrape_params with the processed extractorOptions
scrape_params['extractorOptions'] = extractor_options
# Include any other params directly at the top level of scrape_params
for key, value in params.items():
if key != 'extractorOptions':
scrape_params[key] = value
print(scrape_params)
# Make the POST request with the prepared headers and JSON data
response = requests.post(
'https://api.firecrawl.dev/v0/scrape',
headers=headers,
json=json_data
json=scrape_params
)
if response.status_code == 200:
response = response.json()
if response['success'] == True:
if response['success']:
return response['data']
else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
elif response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')

View File

@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(
name='firecrawl-py',
version='0.0.6',
version='0.0.7',
url='https://github.com/mendableai/firecrawl',
author='Mendable.ai',
author_email='nick@mendable.ai',