Caleb: added ajv json schema validation.

This commit is contained in:
Caleb Peffer 2024-04-29 12:12:55 -07:00
parent 667f740315
commit 4f7737c922
7 changed files with 54 additions and 4 deletions

View File

@ -51,6 +51,7 @@
"@nangohq/node": "^0.36.33", "@nangohq/node": "^0.36.33",
"@sentry/node": "^7.48.0", "@sentry/node": "^7.48.0",
"@supabase/supabase-js": "^2.7.1", "@supabase/supabase-js": "^2.7.1",
"ajv": "^8.12.0",
"async": "^3.2.5", "async": "^3.2.5",
"async-mutex": "^0.4.0", "async-mutex": "^0.4.0",
"axios": "^1.3.4", "axios": "^1.3.4",

View File

@ -35,6 +35,9 @@ dependencies:
'@supabase/supabase-js': '@supabase/supabase-js':
specifier: ^2.7.1 specifier: ^2.7.1
version: 2.39.7 version: 2.39.7
ajv:
specifier: ^8.12.0
version: 8.12.0
async: async:
specifier: ^3.2.5 specifier: ^3.2.5
version: 3.2.5 version: 3.2.5
@ -1820,6 +1823,15 @@ packages:
humanize-ms: 1.2.1 humanize-ms: 1.2.1
dev: false dev: false
/ajv@8.12.0:
resolution: {integrity: sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==}
dependencies:
fast-deep-equal: 3.1.3
json-schema-traverse: 1.0.0
require-from-string: 2.0.2
uri-js: 4.4.1
dev: false
/ansi-escapes@4.3.2: /ansi-escapes@4.3.2:
resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==} resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==}
engines: {node: '>=8'} engines: {node: '>=8'}
@ -2926,6 +2938,10 @@ packages:
- supports-color - supports-color
dev: false dev: false
/fast-deep-equal@3.1.3:
resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==}
dev: false
/fast-fifo@1.3.2: /fast-fifo@1.3.2:
resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==} resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==}
dev: false dev: false
@ -3999,6 +4015,10 @@ packages:
hasBin: true hasBin: true
dev: false dev: false
/json-schema-traverse@1.0.0:
resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==}
dev: false
/json5@2.2.3: /json5@2.2.3:
resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==} resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
engines: {node: '>=6'} engines: {node: '>=6'}
@ -5264,6 +5284,11 @@ packages:
resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
engines: {node: '>=0.10.0'} engines: {node: '>=0.10.0'}
/require-from-string@2.0.2:
resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==}
engines: {node: '>=0.10.0'}
dev: false
/resolve-cwd@3.0.0: /resolve-cwd@3.0.0:
resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==} resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==}
engines: {node: '>=8'} engines: {node: '>=8'}
@ -5970,6 +5995,12 @@ packages:
picocolors: 1.0.0 picocolors: 1.0.0
dev: true dev: true
/uri-js@4.4.1:
resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==}
dependencies:
punycode: 2.3.1
dev: false
/urlpattern-polyfill@10.0.0: /urlpattern-polyfill@10.0.0:
resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==} resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
dev: false dev: false

View File

@ -285,6 +285,11 @@ describe("E2E Tests for API Routes", () => {
}); });
// Ensure that the job was successfully created before proceeding with LLM extraction
expect(response.statusCode).toBe(200);
// Assuming the LLM extraction object is available in the response body under `data.llm_extraction` // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
let llmExtraction = response.body.data.llm_extraction; let llmExtraction = response.body.data.llm_extraction;

View File

@ -7,13 +7,14 @@ import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job"; import { logJob } from "../services/logging/log_job";
import { Document } from "../lib/entities"; import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import Ajv from 'ajv';
export async function scrapeHelper( export async function scrapeHelper(
req: Request, req: Request,
team_id: string, team_id: string,
crawlerOptions: any, crawlerOptions: any,
pageOptions: any, pageOptions: any,
extractorOptions: any extractorOptions: ExtractorOptions
): Promise<{ ): Promise<{
success: boolean; success: boolean;
error?: string; error?: string;
@ -29,6 +30,7 @@ export async function scrapeHelper(
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
} }
const a = new WebScraperDataProvider(); const a = new WebScraperDataProvider();
await a.setOptions({ await a.setOptions({
mode: "single_urls", mode: "single_urls",

View File

@ -3,6 +3,8 @@ import OpenAI from 'openai'
// import { LlamaModel } from 'node-llama-cpp' // import { LlamaModel } from 'node-llama-cpp'
import { z } from 'zod' import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema' import { zodToJsonSchema } from 'zod-to-json-schema'
import Ajv from 'ajv';
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
import { import {
ScraperCompletionResult, ScraperCompletionResult,
@ -22,20 +24,29 @@ export async function generateCompletions(
const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider
const completions = await Promise.all(documents.map(async (document: Document) => { const completions = await Promise.all(documents.map(async (document: Document) => {
switch (switchVariable) { switch (switchVariable) {
case "openAI": case "openAI":
const llm = new OpenAI(); const llm = new OpenAI();
return await generateOpenAICompletions({ const completionResult = await generateOpenAICompletions({
client: llm, client: llm,
document: document, document: document,
schema: schema, schema: schema,
prompt: prompt prompt: prompt
}); });
// Validate the JSON output against the schema using AJV
const validate = ajv.compile(schema);
if (!validate(completionResult.llm_extraction)) {
throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
}
return completionResult;
default: default:
throw new Error('Invalid client'); throw new Error('Invalid client');
} }
})); }));
return completions; return completions;
} }

View File

@ -31,7 +31,7 @@ function prepareOpenAIDoc(
return [{ type: 'text', text: document.markdown }] return [{ type: 'text', text: document.markdown }]
} }
export async function generateOpenAICompletions<T>({ export async function generateOpenAICompletions({
client, client,
model = 'gpt-3.5-turbo', model = 'gpt-3.5-turbo',
document, document,

View File

@ -57,7 +57,7 @@ export class Document {
url?: string; // Used only in /search for now url?: string; // Used only in /search for now
content: string; content: string;
markdown?: string; markdown?: string;
llm_extraction?: string; llm_extraction?: Record<string, any>;
createdAt?: Date; createdAt?: Date;
updatedAt?: Date; updatedAt?: Date;
type?: string; type?: string;