mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Caleb: added ajv json schema validation.
This commit is contained in:
parent
667f740315
commit
4f7737c922
|
@ -51,6 +51,7 @@
|
||||||
"@nangohq/node": "^0.36.33",
|
"@nangohq/node": "^0.36.33",
|
||||||
"@sentry/node": "^7.48.0",
|
"@sentry/node": "^7.48.0",
|
||||||
"@supabase/supabase-js": "^2.7.1",
|
"@supabase/supabase-js": "^2.7.1",
|
||||||
|
"ajv": "^8.12.0",
|
||||||
"async": "^3.2.5",
|
"async": "^3.2.5",
|
||||||
"async-mutex": "^0.4.0",
|
"async-mutex": "^0.4.0",
|
||||||
"axios": "^1.3.4",
|
"axios": "^1.3.4",
|
||||||
|
|
|
@ -35,6 +35,9 @@ dependencies:
|
||||||
'@supabase/supabase-js':
|
'@supabase/supabase-js':
|
||||||
specifier: ^2.7.1
|
specifier: ^2.7.1
|
||||||
version: 2.39.7
|
version: 2.39.7
|
||||||
|
ajv:
|
||||||
|
specifier: ^8.12.0
|
||||||
|
version: 8.12.0
|
||||||
async:
|
async:
|
||||||
specifier: ^3.2.5
|
specifier: ^3.2.5
|
||||||
version: 3.2.5
|
version: 3.2.5
|
||||||
|
@ -1820,6 +1823,15 @@ packages:
|
||||||
humanize-ms: 1.2.1
|
humanize-ms: 1.2.1
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
|
/ajv@8.12.0:
|
||||||
|
resolution: {integrity: sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==}
|
||||||
|
dependencies:
|
||||||
|
fast-deep-equal: 3.1.3
|
||||||
|
json-schema-traverse: 1.0.0
|
||||||
|
require-from-string: 2.0.2
|
||||||
|
uri-js: 4.4.1
|
||||||
|
dev: false
|
||||||
|
|
||||||
/ansi-escapes@4.3.2:
|
/ansi-escapes@4.3.2:
|
||||||
resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==}
|
resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==}
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
|
@ -2926,6 +2938,10 @@ packages:
|
||||||
- supports-color
|
- supports-color
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
|
/fast-deep-equal@3.1.3:
|
||||||
|
resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==}
|
||||||
|
dev: false
|
||||||
|
|
||||||
/fast-fifo@1.3.2:
|
/fast-fifo@1.3.2:
|
||||||
resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==}
|
resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==}
|
||||||
dev: false
|
dev: false
|
||||||
|
@ -3999,6 +4015,10 @@ packages:
|
||||||
hasBin: true
|
hasBin: true
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
|
/json-schema-traverse@1.0.0:
|
||||||
|
resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==}
|
||||||
|
dev: false
|
||||||
|
|
||||||
/json5@2.2.3:
|
/json5@2.2.3:
|
||||||
resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
|
resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
|
||||||
engines: {node: '>=6'}
|
engines: {node: '>=6'}
|
||||||
|
@ -5264,6 +5284,11 @@ packages:
|
||||||
resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
|
resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
|
||||||
engines: {node: '>=0.10.0'}
|
engines: {node: '>=0.10.0'}
|
||||||
|
|
||||||
|
/require-from-string@2.0.2:
|
||||||
|
resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==}
|
||||||
|
engines: {node: '>=0.10.0'}
|
||||||
|
dev: false
|
||||||
|
|
||||||
/resolve-cwd@3.0.0:
|
/resolve-cwd@3.0.0:
|
||||||
resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==}
|
resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==}
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
|
@ -5970,6 +5995,12 @@ packages:
|
||||||
picocolors: 1.0.0
|
picocolors: 1.0.0
|
||||||
dev: true
|
dev: true
|
||||||
|
|
||||||
|
/uri-js@4.4.1:
|
||||||
|
resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==}
|
||||||
|
dependencies:
|
||||||
|
punycode: 2.3.1
|
||||||
|
dev: false
|
||||||
|
|
||||||
/urlpattern-polyfill@10.0.0:
|
/urlpattern-polyfill@10.0.0:
|
||||||
resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
|
resolution: {integrity: sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==}
|
||||||
dev: false
|
dev: false
|
||||||
|
|
|
@ -285,6 +285,11 @@ describe("E2E Tests for API Routes", () => {
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
// Ensure that the job was successfully created before proceeding with LLM extraction
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
|
// Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
|
||||||
let llmExtraction = response.body.data.llm_extraction;
|
let llmExtraction = response.body.data.llm_extraction;
|
||||||
|
|
||||||
|
|
|
@ -7,13 +7,14 @@ import { RateLimiterMode } from "../types";
|
||||||
import { logJob } from "../services/logging/log_job";
|
import { logJob } from "../services/logging/log_job";
|
||||||
import { Document } from "../lib/entities";
|
import { Document } from "../lib/entities";
|
||||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
|
import Ajv from 'ajv';
|
||||||
|
|
||||||
export async function scrapeHelper(
|
export async function scrapeHelper(
|
||||||
req: Request,
|
req: Request,
|
||||||
team_id: string,
|
team_id: string,
|
||||||
crawlerOptions: any,
|
crawlerOptions: any,
|
||||||
pageOptions: any,
|
pageOptions: any,
|
||||||
extractorOptions: any
|
extractorOptions: ExtractorOptions
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
error?: string;
|
error?: string;
|
||||||
|
@ -29,6 +30,7 @@ export async function scrapeHelper(
|
||||||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
const a = new WebScraperDataProvider();
|
const a = new WebScraperDataProvider();
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
|
|
|
@ -3,6 +3,8 @@ import OpenAI from 'openai'
|
||||||
// import { LlamaModel } from 'node-llama-cpp'
|
// import { LlamaModel } from 'node-llama-cpp'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { zodToJsonSchema } from 'zod-to-json-schema'
|
import { zodToJsonSchema } from 'zod-to-json-schema'
|
||||||
|
import Ajv from 'ajv';
|
||||||
|
const ajv = new Ajv(); // Initialize AJV for JSON schema validation
|
||||||
|
|
||||||
import {
|
import {
|
||||||
ScraperCompletionResult,
|
ScraperCompletionResult,
|
||||||
|
@ -22,20 +24,29 @@ export async function generateCompletions(
|
||||||
|
|
||||||
const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider
|
const switchVariable = "openAI" // Placeholder, want to think more about how we abstract the model provider
|
||||||
|
|
||||||
|
|
||||||
const completions = await Promise.all(documents.map(async (document: Document) => {
|
const completions = await Promise.all(documents.map(async (document: Document) => {
|
||||||
switch (switchVariable) {
|
switch (switchVariable) {
|
||||||
case "openAI":
|
case "openAI":
|
||||||
const llm = new OpenAI();
|
const llm = new OpenAI();
|
||||||
return await generateOpenAICompletions({
|
const completionResult = await generateOpenAICompletions({
|
||||||
client: llm,
|
client: llm,
|
||||||
document: document,
|
document: document,
|
||||||
schema: schema,
|
schema: schema,
|
||||||
prompt: prompt
|
prompt: prompt
|
||||||
});
|
});
|
||||||
|
// Validate the JSON output against the schema using AJV
|
||||||
|
const validate = ajv.compile(schema);
|
||||||
|
if (!validate(completionResult.llm_extraction)) {
|
||||||
|
throw new Error(`LLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support. AJV error: ${validate.errors?.map(err => err.message).join(', ')}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return completionResult;
|
||||||
default:
|
default:
|
||||||
throw new Error('Invalid client');
|
throw new Error('Invalid client');
|
||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
|
||||||
return completions;
|
return completions;
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,7 +31,7 @@ function prepareOpenAIDoc(
|
||||||
return [{ type: 'text', text: document.markdown }]
|
return [{ type: 'text', text: document.markdown }]
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function generateOpenAICompletions<T>({
|
export async function generateOpenAICompletions({
|
||||||
client,
|
client,
|
||||||
model = 'gpt-3.5-turbo',
|
model = 'gpt-3.5-turbo',
|
||||||
document,
|
document,
|
||||||
|
|
|
@ -57,7 +57,7 @@ export class Document {
|
||||||
url?: string; // Used only in /search for now
|
url?: string; // Used only in /search for now
|
||||||
content: string;
|
content: string;
|
||||||
markdown?: string;
|
markdown?: string;
|
||||||
llm_extraction?: string;
|
llm_extraction?: Record<string, any>;
|
||||||
createdAt?: Date;
|
createdAt?: Date;
|
||||||
updatedAt?: Date;
|
updatedAt?: Date;
|
||||||
type?: string;
|
type?: string;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user