From 14c6c8ec75cf81e2be70f5b5f06e4b5dd15481fe Mon Sep 17 00:00:00 2001 From: Martin Patino Date: Wed, 15 Jan 2025 17:06:20 -0700 Subject: [PATCH 1/3] feat: enhance encryption utilities and update database service for initial setup --- README.md | 88 +++++++++++++++++++++++++++++- db/settings.db | Bin 0 -> 16384 bytes example/answerCorrectness.json | 34 ++++++++++++ example/closeqa.json | 12 ++++ src/commands/run-eval.ts | 1 - src/main.ts | 1 - src/services/faculty-evaluator.ts | 9 ++- src/services/qa.ts | 28 ++++++++++ 8 files changed, 165 insertions(+), 8 deletions(-) create mode 100644 example/answerCorrectness.json create mode 100644 example/closeqa.json create mode 100644 src/services/qa.ts diff --git a/README.md b/README.md index e50885d..8474957 100644 --- a/README.md +++ b/README.md @@ -1 +1,87 @@ -# Burro 🫏 +# Burro 🫏🌯 + +Burro is an AI-powered burrito LLM evaluation CLI tool. It helps you evaluate the factuality of responses generated by language models. + +## Features + +- Set and encrypt your OpenAI API key +- Run evaluations based on JSON input files +- View detailed evaluation results + +## Installation + +1. Clone the repository: + ```sh + git clone + cd + ``` + +2. Install dependencies: + ```sh + deno task check + ``` + +## Build Process + +To build the project, follow these steps: + +1. Ensure you have Deno installed. If not, you can install it from [here](https://deno.land/#installation). + +2. Run the following command to build the project: + ```sh + deno task build + ``` + +3. The build output will be available in the `dist` directory. + +## Usage + +### Set OpenAI API Key + +Before running evaluations, you need to set your OpenAI API key: + +```sh +deno task run set-openai-key +``` + +### Run Evaluation + +To run an evaluation based on a JSON input file: + +```sh +deno task run run-eval +``` + +### Example JSON Input + +The JSON input file should have the following structure: + +```json +[ + { + "input": "Which country has the highest population?", + "output": "People's Republic of China", + "expected": "China" + }, + { + "input": "What is the capital of France?", + "output": "The capital city of France is Paris", + "expected": "Paris" + }, + { + "input": "Who wrote Romeo and Juliet?", + "output": "The famous playwright William Shakespeare wrote Romeo and Juliet", + "expected": "William Shakespeare" + } +] +``` + +## Development + +### Run Tests + +To run the tests: + +```sh +deno test +``` diff --git a/db/settings.db b/db/settings.db index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..42275588340da494bfda70cd768e63b7a7e04473 100644 GIT binary patch literal 16384 zcmeI(O;6h}7zc1?MGVlOyCDvhd;}GizOsRYw9qg zR+CWV=eX}S#_hkw9C^|Hx*Y#bVkq~SNUXzv#&fYMKnpUZzS*P@nol?!_)si76SD3*!??#Ug zOF28?)Tqj0m&^D2WDfnP7m8**=n3EP>KzeH#{M$I3;f^g^n5R@k3=I3`VXcJdrfD& zuDtOsN)leGFj{1~W|uiF*p?-KVtw3s#I=ffe>B@D)oeDQ+L&dqp+1>kEhH4;`Au;( zLv@B>FlozEG1aT5Z@+?Hig;zc4i1Oy-e0SG_<0uX=z1Rwwb2tWV=e-OCd zQ5MMBH4|Ies>uwkq;>cKJIS4?qg-&+7m>?4r4Ap~*eIi&iB{*ptkp8z$TG4yJ<4?R z?;Am}U1qA9GV(pW5RKHgws-8BSIt3w|B#FD { const db = DatabaseService.getInstance(); - console.log(db.getAllSettings()); if (db.getAllSettings().length == 0) { console.log(info(burroTitle)); console.log( diff --git a/src/services/faculty-evaluator.ts b/src/services/faculty-evaluator.ts index 338d149..8b517e2 100644 --- a/src/services/faculty-evaluator.ts +++ b/src/services/faculty-evaluator.ts @@ -1,4 +1,4 @@ -import { Factuality } from "autoevals"; +import { Factuality, } from "autoevals"; interface EvalItem { input: string; @@ -35,16 +35,15 @@ export class FactualityEvaluator { const results: EvalResult[] = []; for await (const item of items) { - const result = await Factuality({ + const result = await Factuality({ // Assuming evaluate is a static method input: item.input, output: item.output, expected: item.expected, }); - results.push({ ...item, - score: result.score, - metadata: result, + score: result.score as number, + metadata: result || {}, }); } diff --git a/src/services/qa.ts b/src/services/qa.ts new file mode 100644 index 0000000..0650bff --- /dev/null +++ b/src/services/qa.ts @@ -0,0 +1,28 @@ +import { ClosedQA } from "autoevals"; + + +export class CloseCaseQA { + + private async evaluateItems(items: EvalItem[]): Promise { + const results: EvalResult[] = []; + + for await (const item of items) { + const result = await ClosedQA({ + input: item.input, + criteria: item.criteria, + }); + + console.log(result); + + // results.push({ + // ...item, + // score: result.score as number, + // metadata: result || {}, + // }); + } + + return results; + } + + +} \ No newline at end of file From 831305f927ade5f239fef7345a66c4dae2dadb59 Mon Sep 17 00:00:00 2001 From: Martin Patino Date: Thu, 16 Jan 2025 16:39:07 -0700 Subject: [PATCH 2/3] feat: implement evaluation classes for factuality and close case QA, and update run-eval command --- README.md | 26 ++++----- example/answerCorrectness.json | 60 ++++++++++----------- example/closeqa.json | 22 ++++---- src/commands/run-eval.ts | 44 +++++++++++++--- src/services/close-qa.ts | 30 +++++++++++ src/services/evaluate.ts | 12 +++++ src/services/faculty-evaluator.ts | 87 ------------------------------- src/services/faculty.ts | 63 ++++++++++++++++++++++ src/services/qa.ts | 28 ---------- 9 files changed, 198 insertions(+), 174 deletions(-) create mode 100644 src/services/close-qa.ts create mode 100644 src/services/evaluate.ts delete mode 100644 src/services/faculty-evaluator.ts create mode 100644 src/services/faculty.ts delete mode 100644 src/services/qa.ts diff --git a/README.md b/README.md index 8474957..52a2fc2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # Burro 🫏🌯 -Burro is an AI-powered burrito LLM evaluation CLI tool. It helps you evaluate the factuality of responses generated by language models. +Burro is an AI-powered burrito LLM evaluation CLI tool. It helps you evaluate +the factuality of responses generated by language models. ## Features @@ -11,26 +12,27 @@ Burro is an AI-powered burrito LLM evaluation CLI tool. It helps you evaluate th ## Installation 1. Clone the repository: - ```sh - git clone - cd - ``` + ```sh + git clone + cd + ``` 2. Install dependencies: - ```sh - deno task check - ``` + ```sh + deno task check + ``` ## Build Process To build the project, follow these steps: -1. Ensure you have Deno installed. If not, you can install it from [here](https://deno.land/#installation). +1. Ensure you have Deno installed. If not, you can install it from + [here](https://deno.land/#installation). 2. Run the following command to build the project: - ```sh - deno task build - ``` + ```sh + deno task build + ``` 3. The build output will be available in the `dist` directory. diff --git a/example/answerCorrectness.json b/example/answerCorrectness.json index 048b6af..52a8ec7 100644 --- a/example/answerCorrectness.json +++ b/example/answerCorrectness.json @@ -1,34 +1,34 @@ [ - { - "input": { - "context": "Tesla's Model 3 was first unveiled on March 31, 2016, with the first deliveries beginning in July 2017. The base model has an EPA-rated range of 272 miles and can accelerate from 0-60 mph in 5.8 seconds.", - "question": "When did Tesla start delivering the Model 3?" - }, - "output": "July 2017", - "criteria": "Answer must be exactly 'July 2017' based on the provided context" + { + "input": { + "context": "Tesla's Model 3 was first unveiled on March 31, 2016, with the first deliveries beginning in July 2017. The base model has an EPA-rated range of 272 miles and can accelerate from 0-60 mph in 5.8 seconds.", + "question": "When did Tesla start delivering the Model 3?" }, - { - "input": { - "context": "The Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the northeast coast of Australia. It consists of over 2,900 individual reefs and 900 islands. The reef was declared a World Heritage site in 1981.", - "question": "How long is the Great Barrier Reef?" - }, - "output": "2,300 kilometers", - "criteria": "Answer must specify '2,300 kilometers' from the context, with units included" + "output": "July 2017", + "criteria": "Answer must be exactly 'July 2017' based on the provided context" + }, + { + "input": { + "context": "The Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the northeast coast of Australia. It consists of over 2,900 individual reefs and 900 islands. The reef was declared a World Heritage site in 1981.", + "question": "How long is the Great Barrier Reef?" }, - { - "input": { - "context": "Python was created by Guido van Rossum and was first released in 1991. The language emphasizes code readability with its notable use of significant whitespace. Python 3.0 was released in 2008, with major improvements in handling text strings.", - "question": "Who created Python?" - }, - "output": "Guido van Rossum", - "criteria": "Answer must be the full name 'Guido van Rossum' as stated in the context" + "output": "2,300 kilometers", + "criteria": "Answer must specify '2,300 kilometers' from the context, with units included" + }, + { + "input": { + "context": "Python was created by Guido van Rossum and was first released in 1991. The language emphasizes code readability with its notable use of significant whitespace. Python 3.0 was released in 2008, with major improvements in handling text strings.", + "question": "Who created Python?" }, - { - "input": { - "context": "The human brain consists of approximately 86 billion neurons. Each neuron can make connections with up to 10,000 other neurons, resulting in trillions of neural connections. The brain uses about 20% of the body's total energy consumption.", - "question": "What percentage of the body's energy does the brain use?" - }, - "output": "20%", - "criteria": "Answer must be '20%' based on the context, including the percentage symbol" - } -] \ No newline at end of file + "output": "Guido van Rossum", + "criteria": "Answer must be the full name 'Guido van Rossum' as stated in the context" + }, + { + "input": { + "context": "The human brain consists of approximately 86 billion neurons. Each neuron can make connections with up to 10,000 other neurons, resulting in trillions of neural connections. The brain uses about 20% of the body's total energy consumption.", + "question": "What percentage of the body's energy does the brain use?" + }, + "output": "20%", + "criteria": "Answer must be '20%' based on the context, including the percentage symbol" + } +] diff --git a/example/closeqa.json b/example/closeqa.json index 7190f9b..d2062c6 100644 --- a/example/closeqa.json +++ b/example/closeqa.json @@ -1,12 +1,12 @@ [ - { - "input": "List the first three prime numbers in ascending order, separated by commas.", - "output": "2,3,5", - "criteria": "Numbers must be in correct order, separated by commas with no spaces" - }, - { - "input": "What are the three states of matter, in alphabetical order?", - "output": "gas,liquid,solid", - "criteria": "States must be in alphabetical order, lowercase, separated by commas with no spaces" - } -] \ No newline at end of file + { + "input": "List the first three prime numbers in ascending order, separated by commas.", + "output": "2,3,5", + "criteria": "Numbers must be in correct order, separated by commas with no spaces" + }, + { + "input": "What are the three states of matter, in alphabetical order?", + "output": "gas,liquid,solid", + "criteria": "States must be in alphabetical order, lowercase, separated by commas with no spaces" + } +] diff --git a/src/commands/run-eval.ts b/src/commands/run-eval.ts index 28ec021..89c60f4 100644 --- a/src/commands/run-eval.ts +++ b/src/commands/run-eval.ts @@ -1,16 +1,48 @@ import { Command } from "https://deno.land/x/cliffy@v0.25.4/command/mod.ts"; import { ApiKeyLoader } from "../services/openAi.ts"; -import { FactualityEvaluator } from "../services/faculty-evaluator.ts"; +import { FactualityEval, FactualityEvalItem } from "../services/faculty.ts"; +import { CloseCaseQA, QAEvalItem } from "../services/close-qa.ts"; +import { error, success } from "../utils/color.ts"; export const RunEval = new Command() .default("run-eval") .description("Run the evaluation") + .option("-t, --type ", "Type of evaluation to run") + .option("-p, --print ", "Print the results") .arguments("") - .action(async (_, file: string) => { - console.log(`Running evaluation from directory: ${JSON.stringify(file)}`); + .action(async ({ type, print }, file: string) => { const keyLoader = new ApiKeyLoader(); await keyLoader.loadApiKey(); - const v = new FactualityEvaluator(); - const a = await v.evaluateFromJson(file); - console.log(a); + let generalResult: string; + if (type === "factuality") { + const faculty = new FactualityEval(); + const items = await faculty.parseFile(file); + const results = await faculty.evaluateItems(items); + console.log(results); + generalResult = JSON.stringify(results, null, 2); + } else if (type == "closeqa") { + const closeQa = new CloseCaseQA(); + const items = await closeQa.parseFile(file); + const result = await closeQa.evaluateItems(items); + console.log(result); + generalResult = JSON.stringify(result, null, 2); + } else { + console.error(error("Invalid evaluation type")); + return; + } + + console.log(success("=====================================")); + + if (print) { + const downloads = Deno.env.get("HOME") + "/Downloads"; + const filename = file.split("/").pop(); + await Deno.writeTextFile( + `${downloads}/${filename}-result.json`, + generalResult, + ); + + console.log( + success(`Results saved to ${downloads}/${filename}-result.json`), + ); + } }); diff --git a/src/services/close-qa.ts b/src/services/close-qa.ts new file mode 100644 index 0000000..4007826 --- /dev/null +++ b/src/services/close-qa.ts @@ -0,0 +1,30 @@ +import { ClosedQA } from "autoevals"; +import { Evaluate } from "./evaluate.ts"; + +export interface QAEvalItem { + input: string; + output: string; + criteria: string; +} + +export interface QAEvalResult { + name: string; + score: number; + metetadata: Record; +} + +export class CloseCaseQA extends Evaluate { + async evaluateItems(items: QAEvalItem[]): Promise { + const results: QAEvalResult[] = []; + for await (const item of items) { + const result = await ClosedQA({ + input: item.input, + criteria: item.criteria, + item: item.output, + }); + results.push(result as QAEvalResult); + } + + return results; + } +} diff --git a/src/services/evaluate.ts b/src/services/evaluate.ts new file mode 100644 index 0000000..8ba4fee --- /dev/null +++ b/src/services/evaluate.ts @@ -0,0 +1,12 @@ +export class Evaluate { + async parseFile(jsonPath: string): Promise { + try { + const jsonContent = await Deno.readTextFile(jsonPath); + const evalItems = JSON.parse(jsonContent); + return evalItems as T; + } catch (error) { + console.error(`Failed to evaluate from JSON: ${error}`); + throw new Error(`Failed to evaluate from JSON`); + } + } +} diff --git a/src/services/faculty-evaluator.ts b/src/services/faculty-evaluator.ts deleted file mode 100644 index 8b517e2..0000000 --- a/src/services/faculty-evaluator.ts +++ /dev/null @@ -1,87 +0,0 @@ -import { Factuality, } from "autoevals"; - -interface EvalItem { - input: string; - output: string; - expected: string; -} - -interface FactualityResult { - score: number; - metadata: Record; -} - -interface EvalResult extends EvalItem { - score: number; - metadata: Record; -} - -export class FactualityEvaluator { - constructor() { - } - - async evaluateFromJson(jsonPath: string): Promise { - try { - const jsonContent = await Deno.readTextFile(jsonPath); - const evalItems: EvalItem[] = JSON.parse(jsonContent); - return await this.evaluateItems(evalItems); - } catch (error) { - console.error(`Failed to evaluate from JSON: ${error}`); - throw new Error(`Failed to evaluate from JSON`); - } - } - - private async evaluateItems(items: EvalItem[]): Promise { - const results: EvalResult[] = []; - - for await (const item of items) { - const result = await Factuality({ // Assuming evaluate is a static method - input: item.input, - output: item.output, - expected: item.expected, - }); - results.push({ - ...item, - score: result.score as number, - metadata: result || {}, - }); - } - - return results; - } - - async evaluateSingle(item: EvalItem): Promise { - const result = await Factuality({ - input: item.input, - output: item.output, - expected: item.expected, - }); - - return { - ...item, - score: result.score, - metadata: result, - }; - } - - // Helper to print results - static printResults(results: EvalResult[]): void { - console.log("\nFactuality Evaluation Results:"); - console.log("=============================\n"); - - results.forEach((result, index) => { - console.log(`Item ${index + 1}:`); - console.log(`Input: ${result.input}`); - console.log(`Output: ${result.output}`); - console.log(`Expected: ${result.expected}`); - console.log(`Score: ${result.score}`); - console.log(`Metadata: ${JSON.stringify(result.metadata, null, 2)}`); - console.log("-----------------------------\n"); - }); - - // Print summary statistics - const avgScore = results.reduce((sum, r) => sum + r.score, 0) / - results.length; - console.log(`Average Score: ${avgScore.toFixed(3)}`); - } -} diff --git a/src/services/faculty.ts b/src/services/faculty.ts new file mode 100644 index 0000000..3331081 --- /dev/null +++ b/src/services/faculty.ts @@ -0,0 +1,63 @@ +import { Factuality } from "autoevals"; +import { Evaluate } from "./evaluate.ts"; + +export interface FactualityEvalItem { + input: string; + output: string; + expected: string; +} + +interface FactualityEvalResult extends FactualityEvalItem { + score: number; + metadata: Record; +} + +export class FactualityEval extends Evaluate { + static printResults(results: FactualityEvalResult[]): void { + console.log("\nFactuality Evaluation Results:"); + console.log("=============================\n"); + + results.forEach((result, index) => { + console.log(`Item ${index + 1}:`); + console.log(`Input: ${result.input}`); + console.log(`Output: ${result.output}`); + console.log(`Expected: ${result.expected}`); + console.log(`Score: ${result.score}`); + console.log(`Metadata: ${JSON.stringify(result.metadata, null, 2)}`); + console.log("-----------------------------\n"); + }); + + // Print summary statistics + const avgScore = results.reduce((sum, r) => sum + r.score, 0) / + results.length; + console.log(`Average Score: ${avgScore.toFixed(3)}`); + } + + async evaluateItems( + items: FactualityEvalItem[], + ): Promise { + const results: FactualityEvalResult[] = []; + + for await (const item of items) { + const result = await this.evaluateSingle({ // Assuming evaluate is a static method + input: item.input, + output: item.output, + expected: item.expected, + }); + + results.push(result as FactualityEvalResult); + } + + return results; + } + + async evaluateSingle(item: FactualityEvalItem): Promise { + const result = await Factuality({ + input: item.input, + output: item.output, + expected: item.expected, + }); + + return result; + } +} diff --git a/src/services/qa.ts b/src/services/qa.ts deleted file mode 100644 index 0650bff..0000000 --- a/src/services/qa.ts +++ /dev/null @@ -1,28 +0,0 @@ -import { ClosedQA } from "autoevals"; - - -export class CloseCaseQA { - - private async evaluateItems(items: EvalItem[]): Promise { - const results: EvalResult[] = []; - - for await (const item of items) { - const result = await ClosedQA({ - input: item.input, - criteria: item.criteria, - }); - - console.log(result); - - // results.push({ - // ...item, - // score: result.score as number, - // metadata: result || {}, - // }); - } - - return results; - } - - -} \ No newline at end of file From 78a52a422fe65ce73dcb889d4bdb7464195b7783 Mon Sep 17 00:00:00 2001 From: Martin Patino Date: Thu, 16 Jan 2025 16:53:47 -0700 Subject: [PATCH 3/3] feat: update QAEvalResult and FactualityEvalResult interfaces to include rationale and choice metadata --- src/services/close-qa.ts | 9 ++++++--- src/services/faculty.ts | 27 ++++++++++----------------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/services/close-qa.ts b/src/services/close-qa.ts index 4007826..63e8c1c 100644 --- a/src/services/close-qa.ts +++ b/src/services/close-qa.ts @@ -10,7 +10,10 @@ export interface QAEvalItem { export interface QAEvalResult { name: string; score: number; - metetadata: Record; + metetadata: { + rationale : string; + choice: string; + } } export class CloseCaseQA extends Evaluate { @@ -18,10 +21,10 @@ export class CloseCaseQA extends Evaluate { const results: QAEvalResult[] = []; for await (const item of items) { const result = await ClosedQA({ - input: item.input, + input: item.input, criteria: item.criteria, item: item.output, - }); + } as any); results.push(result as QAEvalResult); } diff --git a/src/services/faculty.ts b/src/services/faculty.ts index 3331081..6d568ea 100644 --- a/src/services/faculty.ts +++ b/src/services/faculty.ts @@ -7,9 +7,13 @@ export interface FactualityEvalItem { expected: string; } -interface FactualityEvalResult extends FactualityEvalItem { +interface FactualityEvalResult { + name: string; score: number; - metadata: Record; + metadata: { + rationale: string + choice: string + } } export class FactualityEval extends Evaluate { @@ -19,9 +23,7 @@ export class FactualityEval extends Evaluate { results.forEach((result, index) => { console.log(`Item ${index + 1}:`); - console.log(`Input: ${result.input}`); - console.log(`Output: ${result.output}`); - console.log(`Expected: ${result.expected}`); + console.log(`Output: ${result.name}`); console.log(`Score: ${result.score}`); console.log(`Metadata: ${JSON.stringify(result.metadata, null, 2)}`); console.log("-----------------------------\n"); @@ -39,25 +41,16 @@ export class FactualityEval extends Evaluate { const results: FactualityEvalResult[] = []; for await (const item of items) { - const result = await this.evaluateSingle({ // Assuming evaluate is a static method + const result = await Factuality({ input: item.input, output: item.output, expected: item.expected, - }); + } as any); + results.push(result as FactualityEvalResult); } return results; } - - async evaluateSingle(item: FactualityEvalItem): Promise { - const result = await Factuality({ - input: item.input, - output: item.output, - expected: item.expected, - }); - - return result; - } }