mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Caleb: added ajv json schema validation.
This commit is contained in:
parent
667f740315
commit
4f7737c922
|
@ -51,6 +51,7 @@
|
|||
"@nangohq/node": "^0.36.33",
|
||||
"@sentry/node": "^7.48.0",
|
||||
"@supabase/supabase-js": "^2.7.1",
|
||||
"ajv": "^8.12.0",
|
||||
"async": "^3.2.5",
|
||||
"async-mutex": "^0.4.0",
|
||||
"axios": "^1.3.4",
|
||||
|
|
|
@ -35,6 +35,9 @@ dependencies:
|
|||
'@supabase/supabase-js':
|
||||
specifier: ^2.7.1
|
||||
version: 2.39.7
|
||||
ajv:
|
||||
specifier: ^8.12.0
|
||||
version: 8.12.0
|
||||
async:
|
||||
specifier: ^3.2.5
|
||||
version: 3.2.5
|
||||
|
@ -1820,6 +1823,15 @@ packages:
|
|||
humanize-ms: 1.2.1
|
||||
dev: false
|
||||
|
||||
/ajv@8.12.0:
|
||||
resolution: {integrity: sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==}
|
||||
dependencies:
|
||||
fast-deep-equal: 3.1.3
|
||||
json-schema-traverse: 1.0.0
|
||||
require-from-string: 2.0.2
|
||||
uri-js: 4.4.1
|
||||
dev: false
|
||||
|
||||
/ansi-escapes@4.3.2:
|
||||
resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==}
|
||||
engines: {node: '>=8'}
|
||||
|
@ -2926,6 +2938,10 @@ packages:
|
|||
- supports-color
|
||||
dev: false
|
||||
|
||||
/fast-deep-equal@3.1.3:
|
||||
resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==}
|
||||
dev: false
|
||||
|
||||
/fast-fifo@1.3.2:
|
||||
resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==}
|
||||
dev: false
|
||||
|
@ -3999,6 +4015,10 @@ packages:
|
|||
hasBin: true
|
||||
dev: false
|
||||
|
||||
/json-schema-traverse@1.0.0:
|
||||
resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==}
|
||||
dev: false
|
||||
|
||||
/json5@2.2.3:
|
||||
resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
|
||||
engines: {node: '>=6'}
|
||||
|
@ -5264,6 +5284,11 @@ packages:
|
|||
resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
|
||||
/require-from-string@2.0.2:
|
||||
resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
dev: false
|
||||
|
||||
/resolve-cwd@3.0.0:
|
||||
resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==}
|
||||
engines: {node: '>=8'}
|
||||
|
@ -5970,6 +5995,12 @@ packages:
|
|||
picocolors: 1.0.0
|
||||
dev: true
|
||||
|
||||
/uri-js@4.4.1:
|
||||
resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==}
|
||||
dependencies:
|
||||
punycode: 2.3.1
|
||||
dev: false
|
||||
|
||||
/urlpattern-polyfill@10.0.0:
|
||||
resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
|
||||
dev: false
|
||||
|
|
|
@ -285,6 +285,11 @@ describe("E2E Tests for API Routes", () => {
|
|||
});
|
||||
|
||||
|
||||
// Ensure that the job was successfully created before proceeding with LLM extraction
|
||||
expect(response.statusCode).toBe(200);
|
||||
|
||||
|
||||
|
||||
// Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
|
||||
let llmExtraction = response.body.data.llm_extraction;
|
||||
|
||||
|
|
|
@ -7,13 +7,14 @@ import { RateLimiterMode } from "../types";
|
|||
import { logJob } from "../services/logging/log_job";
|
||||
import { Document } from "../lib/entities";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import Ajv from 'ajv';
|
||||
|
||||
export async function scrapeHelper(
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
pageOptions: any,
|
||||
extractorOptions: any
|
||||
extractorOptions: ExtractorOptions
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
|
@ -29,6 +30,7 @@ export async function scrapeHelper(
|
|||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||
}
|
||||
|
||||
|
||||
const a = new WebScraperDataProvider();
|
||||
await a.setOptions({
|
||||
mode: "single_urls",
|
||||
|
|
|
@ -3,6 +3,8 @@ import OpenAI from 'openai'
|
|||
// import { LlamaModel } from 'node-llama-cpp'
|
||||
import { z } from 'zod'
|
||||
import { zodToJsonSchema } from 'zod-to-json-schema'
|
||||
import Ajv from 'ajv';
|
||||
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
||||
|
||||
import {
|
||||
ScraperCompletionResult,
|
||||
|
@ -22,20 +24,29 @@ export async function generateCompletions(
|
|||
|
||||
const switchVariable = "openAI" // Placeholder; we want to think more about how we abstract the model provider
|
||||
|
||||
|
||||
const completions = await Promise.all(documents.map(async (document: Document) => {
|
||||
switch (switchVariable) {
|
||||
case "openAI":
|
||||
const llm = new OpenAI();
|
||||
return await generateOpenAICompletions({
|
||||
const completionResult = await generateOpenAICompletions({
|
||||
client: llm,
|
||||
document: document,
|
||||
schema: schema,
|
||||
prompt: prompt
|
||||
});
|
||||
// Validate the JSON output against the schema using AJV
|
||||
const validate = ajv.compile(schema);
|
||||
if (!validate(completionResult.llm_extraction)) {
|
||||
throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
|
||||
}
|
||||
|
||||
return completionResult;
|
||||
default:
|
||||
throw new Error('Invalid client');
|
||||
}
|
||||
}));
|
||||
|
||||
|
||||
return completions;
|
||||
}
|
||||
|
|
|
@ -31,7 +31,7 @@ function prepareOpenAIDoc(
|
|||
return [{ type: 'text', text: document.markdown }]
|
||||
}
|
||||
|
||||
export async function generateOpenAICompletions<T>({
|
||||
export async function generateOpenAICompletions({
|
||||
client,
|
||||
model = 'gpt-3.5-turbo',
|
||||
document,
|
||||
|
|
|
@ -57,7 +57,7 @@ export class Document {
|
|||
url?: string; // Used only in /search for now
|
||||
content: string;
|
||||
markdown?: string;
|
||||
llm_extraction?: string;
|
||||
llm_extraction?: Record<string, any>;
|
||||
createdAt?: Date;
|
||||
updatedAt?: Date;
|
||||
type?: string;
|
||||
|
|
Loading…
Reference in New Issue
Block a user