diff --git a/example/answerCorrectness.json b/example/answerCorrectness.json
new file mode 100644
index 0000000..52a8ec7
--- /dev/null
+++ b/example/answerCorrectness.json
@@ -0,0 +1,34 @@
+[
+  {
+    "input": {
+      "context": "Tesla's Model 3 was first unveiled on March 31, 2016, with the first deliveries beginning in July 2017. The base model has an EPA-rated range of 272 miles and can accelerate from 0-60 mph in 5.8 seconds.",
+      "question": "When did Tesla start delivering the Model 3?"
+    },
+    "output": "July 2017",
+    "criteria": "Answer must be exactly 'July 2017' based on the provided context"
+  },
+  {
+    "input": {
+      "context": "The Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the northeast coast of Australia. It consists of over 2,900 individual reefs and 900 islands. The reef was declared a World Heritage site in 1981.",
+      "question": "How long is the Great Barrier Reef?"
+    },
+    "output": "2,300 kilometers",
+    "criteria": "Answer must specify '2,300 kilometers' from the context, with units included"
+  },
+  {
+    "input": {
+      "context": "Python was created by Guido van Rossum and was first released in 1991. The language emphasizes code readability with its notable use of significant whitespace. Python 3.0 was released in 2008, with major improvements in handling text strings.",
+      "question": "Who created Python?"
+    },
+    "output": "Guido van Rossum",
+    "criteria": "Answer must be the full name 'Guido van Rossum' as stated in the context"
+  },
+  {
+    "input": {
+      "context": "The human brain consists of approximately 86 billion neurons. Each neuron can make connections with up to 10,000 other neurons, resulting in trillions of neural connections. The brain uses about 20% of the body's total energy consumption.",
+      "question": "What percentage of the body's energy does the brain use?"
+ }, + "output": "20%", + "criteria": "Answer must be '20%' based on the context, including the percentage symbol" + } +] diff --git a/example/closeqa.json b/example/closeqa.json new file mode 100644 index 0000000..d2062c6 --- /dev/null +++ b/example/closeqa.json @@ -0,0 +1,12 @@ +[ + { + "input": "List the first three prime numbers in ascending order, separated by commas.", + "output": "2,3,5", + "criteria": "Numbers must be in correct order, separated by commas with no spaces" + }, + { + "input": "What are the three states of matter, in alphabetical order?", + "output": "gas,liquid,solid", + "criteria": "States must be in alphabetical order, lowercase, separated by commas with no spaces" + } +] diff --git a/src/commands/run-eval.ts b/src/commands/run-eval.ts index b7cbdd5..89c60f4 100644 --- a/src/commands/run-eval.ts +++ b/src/commands/run-eval.ts @@ -1,17 +1,48 @@ import { Command } from "https://deno.land/x/cliffy@v0.25.4/command/mod.ts"; -import { Factuality, JSONDiff, ValidJSON } from "autoevals"; import { ApiKeyLoader } from "../services/openAi.ts"; -import { FactualityEvaluator } from "../services/faculty-evaluator.ts"; +import { FactualityEval, FactualityEvalItem } from "../services/faculty.ts"; +import { CloseCaseQA, QAEvalItem } from "../services/close-qa.ts"; +import { error, success } from "../utils/color.ts"; export const RunEval = new Command() .default("run-eval") .description("Run the evaluation") + .option("-t, --type ", "Type of evaluation to run") + .option("-p, --print ", "Print the results") .arguments("") - .action(async (_, file: string) => { - console.log(`Running evaluation from directory: ${JSON.stringify(file)}`); + .action(async ({ type, print }, file: string) => { const keyLoader = new ApiKeyLoader(); await keyLoader.loadApiKey(); - const v = new FactualityEvaluator(); - const a = await v.evaluateFromJson(file); - console.log(a); + let generalResult: string; + if (type === "factuality") { + const faculty = new FactualityEval(); + const items = await faculty.parseFile(file); + const results = await faculty.evaluateItems(items); + console.log(results); + generalResult = JSON.stringify(results, null, 2); + } else if (type == "closeqa") { + const closeQa = new CloseCaseQA(); + const items = await closeQa.parseFile(file); + const result = await closeQa.evaluateItems(items); + console.log(result); + generalResult = JSON.stringify(result, null, 2); + } else { + console.error(error("Invalid evaluation type")); + return; + } + + console.log(success("=====================================")); + + if (print) { + const downloads = Deno.env.get("HOME") + "/Downloads"; + const filename = file.split("/").pop(); + await Deno.writeTextFile( + `${downloads}/${filename}-result.json`, + generalResult, + ); + + console.log( + success(`Results saved to ${downloads}/${filename}-result.json`), + ); + } }); diff --git a/src/main.ts b/src/main.ts index 4c0faa2..565a72a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -13,7 +13,6 @@ const program = new Command() .description("AI powered burrito LLM evaluation CLI tool") .action(() => { const db = DatabaseService.getInstance(); - console.log(db.getAllSettings()); if (db.getAllSettings().length == 0) { console.log(info(burroTitle)); console.log( diff --git a/src/services/close-qa.ts b/src/services/close-qa.ts new file mode 100644 index 0000000..63e8c1c --- /dev/null +++ b/src/services/close-qa.ts @@ -0,0 +1,33 @@ +import { ClosedQA } from "autoevals"; +import { Evaluate } from "./evaluate.ts"; + +export interface QAEvalItem { + input: 
+  input: string;
+  output: string;
+  criteria: string;
+}
+
+export interface QAEvalResult {
+  name: string;
+  score: number;
+  metadata: {
+    rationale: string;
+    choice: string;
+  };
+}
+
+export class CloseCaseQA extends Evaluate {
+  async evaluateItems(items: QAEvalItem[]): Promise<QAEvalResult[]> {
+    const results: QAEvalResult[] = [];
+    for (const item of items) {
+      const result = await ClosedQA({
+        input: item.input,
+        criteria: item.criteria,
+        output: item.output,
+      });
+      results.push(result as QAEvalResult);
+    }
+
+    return results;
+  }
+}
diff --git a/src/services/evaluate.ts b/src/services/evaluate.ts
new file mode 100644
index 0000000..8ba4fee
--- /dev/null
+++ b/src/services/evaluate.ts
@@ -0,0 +1,12 @@
+export class Evaluate {
+  async parseFile<T>(jsonPath: string): Promise<T> {
+    try {
+      const jsonContent = await Deno.readTextFile(jsonPath);
+      const evalItems = JSON.parse(jsonContent);
+      return evalItems as T;
+    } catch (error) {
+      console.error(`Failed to parse eval JSON: ${error}`);
+      throw new Error("Failed to parse eval JSON");
+    }
+  }
+}
diff --git a/src/services/faculty-evaluator.ts b/src/services/faculty-evaluator.ts
deleted file mode 100644
index 338d149..0000000
--- a/src/services/faculty-evaluator.ts
+++ /dev/null
@@ -1,88 +0,0 @@
-import { Factuality } from "autoevals";
-
-interface EvalItem {
-  input: string;
-  output: string;
-  expected: string;
-}
-
-interface FactualityResult {
-  score: number;
-  metadata: Record<string, unknown>;
-}
-
-interface EvalResult extends EvalItem {
-  score: number;
-  metadata: Record<string, unknown>;
-}
-
-export class FactualityEvaluator {
-  constructor() {
-  }
-
-  async evaluateFromJson(jsonPath: string): Promise<EvalResult[]> {
-    try {
-      const jsonContent = await Deno.readTextFile(jsonPath);
-      const evalItems: EvalItem[] = JSON.parse(jsonContent);
-      return await this.evaluateItems(evalItems);
-    } catch (error) {
-      console.error(`Failed to evaluate from JSON: ${error}`);
-      throw new Error(`Failed to evaluate from JSON`);
-    }
-  }
-
-  private async evaluateItems(items: EvalItem[]): Promise<EvalResult[]> {
-    const results: EvalResult[] = [];
-
-    for await (const item of items) {
-      const result = await Factuality({
-        input: item.input,
-        output: item.output,
-        expected: item.expected,
-      });
-
-      results.push({
-        ...item,
-        score: result.score,
-        metadata: result,
-      });
-    }
-
-    return results;
-  }
-
-  async evaluateSingle(item: EvalItem): Promise<EvalResult> {
-    const result = await Factuality({
-      input: item.input,
-      output: item.output,
-      expected: item.expected,
-    });
-
-    return {
-      ...item,
-      score: result.score,
-      metadata: result,
-    };
-  }
-
-  // Helper to print results
-  static printResults(results: EvalResult[]): void {
-    console.log("\nFactuality Evaluation Results:");
-    console.log("=============================\n");
-
-    results.forEach((result, index) => {
-      console.log(`Item ${index + 1}:`);
-      console.log(`Input: ${result.input}`);
-      console.log(`Output: ${result.output}`);
-      console.log(`Expected: ${result.expected}`);
-      console.log(`Score: ${result.score}`);
-      console.log(`Metadata: ${JSON.stringify(result.metadata, null, 2)}`);
-      console.log("-----------------------------\n");
-    });
-
-    // Print summary statistics
-    const avgScore = results.reduce((sum, r) => sum + r.score, 0) /
-      results.length;
-    console.log(`Average Score: ${avgScore.toFixed(3)}`);
-  }
-}
diff --git a/src/services/faculty.ts b/src/services/faculty.ts
new file mode 100644
index 0000000..6d568ea
--- /dev/null
+++ b/src/services/faculty.ts
@@ -0,0 +1,56 @@
+import { Factuality } from "autoevals";
+import { Evaluate } from "./evaluate.ts";
+
+export interface FactualityEvalItem {
+  input: string;
+  output: string;
+  expected: string;
+}
+
+export interface FactualityEvalResult {
+  name: string;
+  score: number;
+  metadata: {
+    rationale: string;
+    choice: string;
+  };
+}
+
+export class FactualityEval extends Evaluate {
+  static printResults(results: FactualityEvalResult[]): void {
+    console.log("\nFactuality Evaluation Results:");
+    console.log("=============================\n");
+
+    results.forEach((result, index) => {
+      console.log(`Item ${index + 1}:`);
+      console.log(`Name: ${result.name}`);
+      console.log(`Score: ${result.score}`);
+      console.log(`Metadata: ${JSON.stringify(result.metadata, null, 2)}`);
+      console.log("-----------------------------\n");
+    });
+
+    // Print summary statistics
+    const avgScore = results.reduce((sum, r) => sum + r.score, 0) /
+      results.length;
+    console.log(`Average Score: ${avgScore.toFixed(3)}`);
+  }
+
+  async evaluateItems(
+    items: FactualityEvalItem[],
+  ): Promise<FactualityEvalResult[]> {
+    const results: FactualityEvalResult[] = [];
+
+    for (const item of items) {
+      const result = await Factuality({
+        input: item.input,
+        output: item.output,
+        expected: item.expected,
+      });
+
+      results.push(result as FactualityEvalResult);
+    }
+
+    return results;
+  }
+}
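
Usage sketch for the new command (assumptions: the CLI entrypoint is src/main.ts, an OpenAI key has already been configured via the settings flow, and the Cliffy option specs match the reconstructed `<type:string>`/`<print:boolean>` signatures above; flag values may differ in the actual build):

    # run the factuality eval and save results to ~/Downloads
    deno run -A src/main.ts run-eval --type factuality --print true example/answerCorrectness.json
    # run the closed-QA eval, printing results to stdout only
    deno run -A src/main.ts run-eval --type closeqa example/closeqa.json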