Skip to content

Commit

Permalink
Merge pull request #2 from thisguymartin/update-prompt
Browse files Browse the repository at this point in the history
feat: enhance encryption utilities and update database service for in…
  • Loading branch information
thisguymartin authored Jan 17, 2025
2 parents e8d9d85 + 1dd5d73 commit 4b7fbc4
Show file tree
Hide file tree
Showing 8 changed files with 185 additions and 96 deletions.
34 changes: 34 additions & 0 deletions example/answerCorrectness.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
[
{
"input": {
"context": "Tesla's Model 3 was first unveiled on March 31, 2016, with the first deliveries beginning in July 2017. The base model has an EPA-rated range of 272 miles and can accelerate from 0-60 mph in 5.8 seconds.",
"question": "When did Tesla start delivering the Model 3?"
},
"output": "July 2017",
"criteria": "Answer must be exactly 'July 2017' based on the provided context"
},
{
"input": {
"context": "The Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the northeast coast of Australia. It consists of over 2,900 individual reefs and 900 islands. The reef was declared a World Heritage site in 1981.",
"question": "How long is the Great Barrier Reef?"
},
"output": "2,300 kilometers",
"criteria": "Answer must specify '2,300 kilometers' from the context, with units included"
},
{
"input": {
"context": "Python was created by Guido van Rossum and was first released in 1991. The language emphasizes code readability with its notable use of significant whitespace. Python 3.0 was released in 2008, with major improvements in handling text strings.",
"question": "Who created Python?"
},
"output": "Guido van Rossum",
"criteria": "Answer must be the full name 'Guido van Rossum' as stated in the context"
},
{
"input": {
"context": "The human brain consists of approximately 86 billion neurons. Each neuron can make connections with up to 10,000 other neurons, resulting in trillions of neural connections. The brain uses about 20% of the body's total energy consumption.",
"question": "What percentage of the body's energy does the brain use?"
},
"output": "20%",
"criteria": "Answer must be '20%' based on the context, including the percentage symbol"
}
]
12 changes: 12 additions & 0 deletions example/closeqa.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[
{
"input": "List the first three prime numbers in ascending order, separated by commas.",
"output": "2,3,5",
"criteria": "Numbers must be in correct order, separated by commas with no spaces"
},
{
"input": "What are the three states of matter, in alphabetical order?",
"output": "gas,liquid,solid",
"criteria": "States must be in alphabetical order, lowercase, separated by commas with no spaces"
}
]
45 changes: 38 additions & 7 deletions src/commands/run-eval.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,48 @@
import { Command } from "https://deno.land/x/cliffy@v0.25.4/command/mod.ts";
import { Factuality, JSONDiff, ValidJSON } from "autoevals";
import { ApiKeyLoader } from "../services/openAi.ts";
import { FactualityEvaluator } from "../services/faculty-evaluator.ts";
import { FactualityEval, FactualityEvalItem } from "../services/faculty.ts";
import { CloseCaseQA, QAEvalItem } from "../services/close-qa.ts";
import { error, success } from "../utils/color.ts";

/**
 * `run-eval` command: loads the API key, runs the evaluation selected by
 * `--type` ("factuality" or "closeqa") against the JSON file given as the
 * positional `<file>` argument, logs the results, and, when `--print` is
 * supplied, also writes them as pretty-printed JSON to
 * `$HOME/Downloads/<file name>-result.json`.
 *
 * An unknown or missing `--type` logs "Invalid evaluation type" and returns
 * without evaluating anything.
 */
export const RunEval = new Command()
  .default("run-eval")
  .description("Run the evaluation")
  .option("-t, --type <string>", "Type of evaluation to run")
  // NOTE(review): despite the help text, this flag controls saving the results
  // to a file (they are always logged to the console). `<bool>` looks like a
  // value placeholder rather than a boolean type, so presumably any supplied
  // value (even "false") is truthy below; confirm against the cliffy docs.
  .option("-p, --print <bool>", "Print the results")
  .arguments("<file>")
  .action(async ({ type, print }, file: string) => {
    const keyLoader = new ApiKeyLoader();
    await keyLoader.loadApiKey();

    // Serialized results, kept so they can be written to disk after logging.
    let generalResult: string;
    if (type === "factuality") {
      const faculty = new FactualityEval();
      const items = await faculty.parseFile<FactualityEvalItem[]>(file);
      const results = await faculty.evaluateItems(items);
      console.log(results);
      generalResult = JSON.stringify(results, null, 2);
    } else if (type === "closeqa") {
      const closeQa = new CloseCaseQA();
      const items = await closeQa.parseFile<QAEvalItem[]>(file);
      const result = await closeQa.evaluateItems(items);
      console.log(result);
      generalResult = JSON.stringify(result, null, 2);
    } else {
      console.error(error("Invalid evaluation type"));
      return;
    }

    console.log(success("====================================="));

    if (print) {
      // Without this guard an unset HOME would be stringified into the path
      // "undefined/Downloads".
      const home = Deno.env.get("HOME");
      if (!home) {
        console.error(
          error("Cannot save results: HOME environment variable is not set"),
        );
        return;
      }

      const downloads = home + "/Downloads";
      // Last path segment of the input file, used as the result-file prefix.
      const filename = file.split("/").pop();
      const target = `${downloads}/${filename}-result.json`;
      await Deno.writeTextFile(target, generalResult);

      console.log(success(`Results saved to ${target}`));
    }
  });
1 change: 0 additions & 1 deletion src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ const program = new Command()
.description("AI powered burrito LLM evaluation CLI tool")
.action(() => {
const db = DatabaseService.getInstance();
console.log(db.getAllSettings());
if (db.getAllSettings().length == 0) {
console.log(info(burroTitle));
console.log(
Expand Down
33 changes: 33 additions & 0 deletions src/services/close-qa.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { ClosedQA } from "autoevals";
import { Evaluate } from "./evaluate.ts";

/**
 * One entry of a closed-QA evaluation file, as consumed by
 * `CloseCaseQA.evaluateItems`.
 */
export interface QAEvalItem {
// The task or question text.
input: string;
// The answer text that is submitted for grading.
output: string;
// The grading criterion text handed to the scorer.
criteria: string;
}

/**
 * Shape that the value returned by the `ClosedQA` scorer is cast to in
 * `CloseCaseQA.evaluateItems`.
 *
 * The detail object is named `metadata`, matching the sibling
 * `FactualityEvalResult` interface; the previous spelling `metetadata` was a
 * typo, so typed reads of that field could not line up with the scorer's
 * actual result object.
 */
export interface QAEvalResult {
  name: string;
  score: number;
  metadata: {
    rationale: string;
    choice: string;
  };
}

/**
 * Evaluator that grades items with the autoevals `ClosedQA` scorer.
 * File loading (`parseFile`) is inherited from `Evaluate`.
 */
export class CloseCaseQA extends Evaluate {
  /**
   * Scores every item, one scorer call at a time, in input order.
   *
   * @param items Entries to grade.
   * @returns One result per item, in the same order as `items`.
   */
  async evaluateItems(items: QAEvalItem[]): Promise<QAEvalResult[]> {
    const results: QAEvalResult[] = [];
    // `items` is a plain array, so a regular `for...of` is enough; only the
    // scorer call itself needs awaiting.
    for (const item of items) {
      // The submission must be passed as `output`, the key the scorer reads;
      // it was previously sent as `item`, which the scorer does not read.
      // NOTE(review): the `as any` cast is kept because the installed
      // autoevals typings are not visible here; drop it once confirmed that
      // the argument object type-checks without it.
      const result = await ClosedQA({
        input: item.input,
        criteria: item.criteria,
        output: item.output,
      } as any);
      results.push(result as QAEvalResult);
    }

    return results;
  }
}
12 changes: 12 additions & 0 deletions src/services/evaluate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/**
 * Base class for evaluators; provides shared loading of evaluation files.
 */
export class Evaluate {
  /**
   * Reads the file at `jsonPath` and parses it as JSON.
   *
   * The parsed value is cast to `T` without any runtime validation, so the
   * caller is responsible for the file actually having that shape.
   *
   * @param jsonPath Path of the JSON file to read.
   * @returns The parsed content, typed as `T`.
   * @throws Error when the file cannot be read or is not valid JSON. The
   *   message keeps the original "Failed to evaluate from JSON" prefix and
   *   now also names the file and the underlying failure.
   */
  async parseFile<T>(jsonPath: string): Promise<T> {
    try {
      const jsonContent = await Deno.readTextFile(jsonPath);
      return JSON.parse(jsonContent) as T;
    } catch (error) {
      console.error(`Failed to evaluate from JSON: ${error}`);
      // Carry the path and cause in the thrown error itself so callers that
      // only see the exception can still tell what went wrong.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to evaluate from JSON (${jsonPath}): ${reason}`);
    }
  }
}
88 changes: 0 additions & 88 deletions src/services/faculty-evaluator.ts

This file was deleted.

56 changes: 56 additions & 0 deletions src/services/faculty.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import { Factuality } from "autoevals";
import { Evaluate } from "./evaluate.ts";

/**
 * One entry of a factuality evaluation file. `FactualityEval.evaluateItems`
 * forwards all three fields, under the same names, to the `Factuality` scorer.
 */
export interface FactualityEvalItem {
// Forwarded to the scorer as `input`.
input: string;
// Forwarded to the scorer as `output`.
output: string;
// Forwarded to the scorer as `expected`.
expected: string;
}

/**
 * Shape that each value returned by the `Factuality` scorer is cast to in
 * `FactualityEval.evaluateItems`, and that `FactualityEval.printResults`
 * reads when formatting its report.
 */
interface FactualityEvalResult {
name: string;
// Numeric score; `printResults` averages this field across all results.
score: number;
// Printed by `printResults` as pretty-printed JSON.
metadata: {
rationale: string
choice: string
}
}

/**
 * Evaluator that grades items with the autoevals `Factuality` scorer.
 * File loading (`parseFile`) is inherited from `Evaluate`.
 */
export class FactualityEval extends Evaluate {
  /**
   * Logs a per-item report followed by the average score.
   *
   * For an empty list the average is reported as "N/A"; dividing by the
   * zero length would otherwise print "NaN".
   *
   * @param results Results to report, e.g. the return value of `evaluateItems`.
   */
  static printResults(results: FactualityEvalResult[]): void {
    console.log("\nFactuality Evaluation Results:");
    console.log("=============================\n");

    results.forEach((result, index) => {
      console.log(`Item ${index + 1}:`);
      console.log(`Output: ${result.name}`);
      console.log(`Score: ${result.score}`);
      console.log(`Metadata: ${JSON.stringify(result.metadata, null, 2)}`);
      console.log("-----------------------------\n");
    });

    // Print summary statistics
    if (results.length === 0) {
      console.log("Average Score: N/A");
      return;
    }
    const totalScore = results.reduce((sum, r) => sum + r.score, 0);
    const avgScore = totalScore / results.length;
    console.log(`Average Score: ${avgScore.toFixed(3)}`);
  }

  /**
   * Scores every item, one scorer call at a time, in input order.
   *
   * @param items Entries to grade.
   * @returns One result per item, in the same order as `items`.
   */
  async evaluateItems(
    items: FactualityEvalItem[],
  ): Promise<FactualityEvalResult[]> {
    const results: FactualityEvalResult[] = [];

    // `items` is a plain array, so a regular `for...of` is enough; only the
    // scorer call itself needs awaiting.
    for (const item of items) {
      // NOTE(review): the `as any` cast is kept because the installed
      // autoevals typings are not visible here; confirm whether it is needed.
      const result = await Factuality({
        input: item.input,
        output: item.output,
        expected: item.expected,
      } as any);

      results.push(result as FactualityEvalResult);
    }

    return results;
  }
}

0 comments on commit 4b7fbc4

Please sign in to comment.