Caleb: added AJV JSON schema validation.

This commit is contained in:
Caleb Peffer 2024-04-29 12:12:55 -07:00
parent 667f740315
commit 4f7737c922
7 changed files with 54 additions and 4 deletions

View File

@ -51,6 +51,7 @@
"@nangohq/node": "^0.36.33",
"@sentry/node": "^7.48.0",
"@supabase/supabase-js": "^2.7.1",
"ajv": "^8.12.0",
"async": "^3.2.5",
"async-mutex": "^0.4.0",
"axios": "^1.3.4",

View File

@ -35,6 +35,9 @@ dependencies:
'@supabase/supabase-js':
specifier: ^2.7.1
version: 2.39.7
ajv:
specifier: ^8.12.0
version: 8.12.0
async:
specifier: ^3.2.5
version: 3.2.5
@ -1820,6 +1823,15 @@ packages:
humanize-ms: 1.2.1
dev: false
/ajv@8.12.0:
resolution: {integrity: sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==}
dependencies:
fast-deep-equal: 3.1.3
json-schema-traverse: 1.0.0
require-from-string: 2.0.2
uri-js: 4.4.1
dev: false
/ansi-escapes@4.3.2:
resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==}
engines: {node: '>=8'}
@ -2926,6 +2938,10 @@ packages:
- supports-color
dev: false
/fast-deep-equal@3.1.3:
resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==}
dev: false
/fast-fifo@1.3.2:
resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==}
dev: false
@ -3999,6 +4015,10 @@ packages:
hasBin: true
dev: false
/json-schema-traverse@1.0.0:
resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==}
dev: false
/json5@2.2.3:
resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
engines: {node: '>=6'}
@ -5264,6 +5284,11 @@ packages:
resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
engines: {node: '>=0.10.0'}
/require-from-string@2.0.2:
resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==}
engines: {node: '>=0.10.0'}
dev: false
/resolve-cwd@3.0.0:
resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==}
engines: {node: '>=8'}
@ -5970,6 +5995,12 @@ packages:
picocolors: 1.0.0
dev: true
/uri-js@4.4.1:
resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==}
dependencies:
punycode: 2.3.1
dev: false
/urlpattern-polyfill@10.0.0:
resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
dev: false

View File

@ -285,6 +285,11 @@ describe("E2E Tests for API Routes", () => {
});
// Ensure that the job was successfully created before proceeding with LLM extraction
expect(response.statusCode).toBe(200);
// Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
let llmExtraction = response.body.data.llm_extraction;

View File

@ -7,13 +7,14 @@ import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import Ajv from 'ajv';
export async function scrapeHelper(
req: Request,
team_id: string,
crawlerOptions: any,
pageOptions: any,
extractorOptions: any
extractorOptions: ExtractorOptions
): Promise<{
success: boolean;
error?: string;
@ -29,6 +30,7 @@ export async function scrapeHelper(
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
}
const a = new WebScraperDataProvider();
await a.setOptions({
mode: "single_urls",

View File

@ -3,6 +3,8 @@ import OpenAI from 'openai'
// import { LlamaModel } from 'node-llama-cpp'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import Ajv from 'ajv';
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
import {
ScraperCompletionResult,
@ -22,20 +24,29 @@ export async function generateCompletions(
const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider
const completions = await Promise.all(documents.map(async (document: Document) => {
switch (switchVariable) {
case "openAI":
const llm = new OpenAI();
return await generateOpenAICompletions({
const completionResult = await generateOpenAICompletions({
client: llm,
document: document,
schema: schema,
prompt: prompt
});
// Validate the JSON output against the schema using AJV
const validate = ajv.compile(schema);
if (!validate(completionResult.llm_extraction)) {
throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
}
return completionResult;
default:
throw new Error('Invalid client');
}
}));
return completions;
}

View File

@ -31,7 +31,7 @@ function prepareOpenAIDoc(
return [{ type: 'text', text: document.markdown }]
}
export async function generateOpenAICompletions<T>({
export async function generateOpenAICompletions({
client,
model = 'gpt-3.5-turbo',
document,

View File

@ -57,7 +57,7 @@ export class Document {
url?: string; // Used only in /search for now
content: string;
markdown?: string;
llm_extraction?: string;
llm_extraction?: Record<string, any>;
createdAt?: Date;
updatedAt?: Date;
type?: string;