Merge pull request #2 from thisguymartin/update-prompt

feat: enhance encryption utilities and update database service for in…
thisguymartin · Jan 17, 2025 · 4b7fbc4 · 4b7fbc4
2 parents e8d9d85 + 1dd5d73
commit 4b7fbc4
Show file tree

Hide file tree

Showing 8 changed files with 185 additions and 96 deletions.
diff --git a/example/answerCorrectness.json b/example/answerCorrectness.json
@@ -0,0 +1,34 @@
+[
+  {
+    "input": {
+      "context": "Tesla's Model 3 was first unveiled on March 31, 2016, with the first deliveries beginning in July 2017. The base model has an EPA-rated range of 272 miles and can accelerate from 0-60 mph in 5.8 seconds.",
+      "question": "When did Tesla start delivering the Model 3?"
+    },
+    "output": "July 2017",
+    "criteria": "Answer must be exactly 'July 2017' based on the provided context"
+  },
+  {
+    "input": {
+      "context": "The Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the northeast coast of Australia. It consists of over 2,900 individual reefs and 900 islands. The reef was declared a World Heritage site in 1981.",
+      "question": "How long is the Great Barrier Reef?"
+    },
+    "output": "2,300 kilometers",
+    "criteria": "Answer must specify '2,300 kilometers' from the context, with units included"
+  },
+  {
+    "input": {
+      "context": "Python was created by Guido van Rossum and was first released in 1991. The language emphasizes code readability with its notable use of significant whitespace. Python 3.0 was released in 2008, with major improvements in handling text strings.",
+      "question": "Who created Python?"
+    },
+    "output": "Guido van Rossum",
+    "criteria": "Answer must be the full name 'Guido van Rossum' as stated in the context"
+  },
+  {
+    "input": {
+      "context": "The human brain consists of approximately 86 billion neurons. Each neuron can make connections with up to 10,000 other neurons, resulting in trillions of neural connections. The brain uses about 20% of the body's total energy consumption.",
+      "question": "What percentage of the body's energy does the brain use?"
+    },
+    "output": "20%",
+    "criteria": "Answer must be '20%' based on the context, including the percentage symbol"
+  }
+]
diff --git a/example/closeqa.json b/example/closeqa.json
@@ -0,0 +1,12 @@
+[
+  {
+    "input": "List the first three prime numbers in ascending order, separated by commas.",
+    "output": "2,3,5",
+    "criteria": "Numbers must be in correct order, separated by commas with no spaces"
+  },
+  {
+    "input": "What are the three states of matter, in alphabetical order?",
+    "output": "gas,liquid,solid",
+    "criteria": "States must be in alphabetical order, lowercase, separated by commas with no spaces"
+  }
+]
diff --git a/src/commands/run-eval.ts b/src/commands/run-eval.ts
@@ -1,17 +1,61 @@
 import { Command } from "https://deno.land/x/cliffy@v0.25.4/command/mod.ts";
-import { Factuality, JSONDiff, ValidJSON } from "autoevals";
 import { ApiKeyLoader } from "../services/openAi.ts";
-import { FactualityEvaluator } from "../services/faculty-evaluator.ts";
+import { FactualityEval, FactualityEvalItem } from "../services/faculty.ts";
+import { CloseCaseQA, QAEvalItem } from "../services/close-qa.ts";
+import { error, success } from "../utils/color.ts";
 
+/**
+ * CLI command that runs an LLM evaluation over the items in a JSON file.
+ *
+ * `--type` selects the evaluator: "factuality" or "closeqa"; any other value
+ * logs "Invalid evaluation type" and returns without evaluating.
+ * `--print true` additionally writes the pretty-printed results to
+ * `<home>/Downloads/<file name>-result.json`.
+ */
 export const RunEval = new Command()
   .default("run-eval")
   .description("Run the evaluation")
+  .option("-t, --type <string>", "Type of evaluation to run")
+  .option("-p, --print <bool:boolean>", "Print the results")
   .arguments("<file>")
-  .action(async (_, file: string) => {
-    console.log(`Running evaluation from directory: ${JSON.stringify(file)}`);
+  .action(async ({ type, print }, file: string) => {
     const keyLoader = new ApiKeyLoader();
     await keyLoader.loadApiKey();
-    const v = new FactualityEvaluator();
-    const a = await v.evaluateFromJson(file);
-    console.log(a);
+    let generalResult: string;
+    if (type === "factuality") {
+      const faculty = new FactualityEval();
+      const items = await faculty.parseFile<FactualityEvalItem[]>(file);
+      const results = await faculty.evaluateItems(items);
+      console.log(results);
+      generalResult = JSON.stringify(results, null, 2);
+    } else if (type === "closeqa") {
+      const closeQa = new CloseCaseQA();
+      const items = await closeQa.parseFile<QAEvalItem[]>(file);
+      const result = await closeQa.evaluateItems(items);
+      console.log(result);
+      generalResult = JSON.stringify(result, null, 2);
+    } else {
+      console.error(error("Invalid evaluation type"));
+      return;
+    }
+
+    console.log(success("====================================="));
+
+    if (print) {
+      // HOME may be unset (e.g. on Windows, where USERPROFILE is used), which
+      // would otherwise produce the literal path "undefined/Downloads".
+      const home = Deno.env.get("HOME") ?? Deno.env.get("USERPROFILE");
+      if (!home) {
+        console.error(
+          error("Unable to locate the home directory; results not saved"),
+        );
+        return;
+      }
+      // Keep only the base name, accepting either path separator.
+      const filename = file.split(/[\\/]/).pop();
+      const outputPath = `${home}/Downloads/${filename}-result.json`;
+      await Deno.writeTextFile(outputPath, generalResult);
+
+      console.log(success(`Results saved to ${outputPath}`));
+    }
   });
diff --git a/src/main.ts b/src/main.ts
@@ -13,7 +13,6 @@ const program = new Command()
   .description("AI powered burrito LLM evaluation CLI tool")
   .action(() => {
     const db = DatabaseService.getInstance();
-    console.log(db.getAllSettings());
     if (db.getAllSettings().length == 0) {
       console.log(info(burroTitle));
       console.log(

diff --git a/src/services/close-qa.ts b/src/services/close-qa.ts
@@ -0,0 +1,50 @@
+import { ClosedQA } from "autoevals";
+import { Evaluate } from "./evaluate.ts";
+
+/** One closed-QA test case as read from the evaluation JSON file. */
+export interface QAEvalItem {
+  /** The task or question (passed to the scorer as `input`). */
+  input: string;
+  /** The answer to be judged (passed to the scorer as `output`). */
+  output: string;
+  /** The criterion the answer is judged against. */
+  criteria: string;
+}
+
+/** Shape each `ClosedQA` result is cast to before being returned. */
+export interface QAEvalResult {
+  name: string;
+  score: number;
+  metadata: {
+    rationale: string;
+    choice: string;
+  };
+}
+
+/** Evaluator that scores items with the `ClosedQA` scorer from autoevals. */
+export class CloseCaseQA extends Evaluate {
+  /**
+   * Scores every item with `ClosedQA`, one request at a time, in input order.
+   *
+   * @param items Test cases to score.
+   * @returns One result per item, in the same order as `items`.
+   */
+  async evaluateItems(items: QAEvalItem[]): Promise<QAEvalResult[]> {
+    const results: QAEvalResult[] = [];
+    // `items` is a plain array, so a regular `for...of` is sufficient.
+    for (const item of items) {
+      const result = await ClosedQA({
+        input: item.input,
+        // Pass the answer as `output`, the same key FactualityEval uses for
+        // its scorer. NOTE(review): confirm against the ClosedQA arguments.
+        output: item.output,
+        criteria: item.criteria,
+        // NOTE(review): cast kept from the original; confirm whether the
+        // arguments type-check without it.
+      } as any);
+      results.push(result as QAEvalResult);
+    }
+
+    return results;
+  }
+}
diff --git a/src/services/evaluate.ts b/src/services/evaluate.ts
@@ -0,0 +1,28 @@
+/**
+ * Base class for evaluators; provides JSON file loading shared by subclasses.
+ */
+export class Evaluate {
+  /**
+   * Reads the file at `jsonPath` and parses its contents as JSON.
+   *
+   * The parsed value is returned as `T` without any runtime validation, so
+   * callers are responsible for passing a `T` that matches the file's shape.
+   *
+   * @param jsonPath Path of the JSON file to read.
+   * @returns The parsed JSON value, asserted to be `T`.
+   * @throws Error with message "Failed to evaluate from JSON" when the file
+   *   cannot be read or parsed; the underlying failure is attached as `cause`.
+   */
+  async parseFile<T>(jsonPath: string): Promise<T> {
+    try {
+      const jsonContent = await Deno.readTextFile(jsonPath);
+      const evalItems = JSON.parse(jsonContent);
+      return evalItems as T;
+    } catch (error) {
+      console.error(`Failed to evaluate from JSON (${jsonPath}): ${error}`);
+      // Keep the original message for callers, but chain the root cause so the
+      // read/parse failure and its stack are not lost.
+      throw new Error(`Failed to evaluate from JSON`, { cause: error });
+    }
+  }
+}
diff --git a/src/services/faculty-evaluator.ts b/src/services/faculty-evaluator.ts
diff --git a/src/services/faculty.ts b/src/services/faculty.ts
@@ -0,0 +1,75 @@
+import { Factuality } from "autoevals";
+import { Evaluate } from "./evaluate.ts";
+
+/** One factuality test case as read from the evaluation JSON file. */
+export interface FactualityEvalItem {
+  input: string;
+  output: string;
+  expected: string;
+}
+
+/** Shape each `Factuality` result is cast to before being returned. */
+interface FactualityEvalResult {
+  name: string;
+  score: number;
+  metadata: {
+    rationale: string;
+    choice: string;
+  };
+}
+
+/** Evaluator that scores items with the `Factuality` scorer from autoevals. */
+export class FactualityEval extends Evaluate {
+  /**
+   * Logs each result (name, score, metadata) followed by the average score.
+   *
+   * @param results Results to print; may be empty.
+   */
+  static printResults(results: FactualityEvalResult[]): void {
+    console.log("\nFactuality Evaluation Results:");
+    console.log("=============================\n");
+
+    results.forEach((result, index) => {
+      console.log(`Item ${index + 1}:`);
+      console.log(`Output: ${result.name}`);
+      console.log(`Score: ${result.score}`);
+      console.log(`Metadata: ${JSON.stringify(result.metadata, null, 2)}`);
+      console.log("-----------------------------\n");
+    });
+
+    // Print summary statistics. With no results the mean would be 0 / 0,
+    // i.e. "NaN", so report that explicitly instead.
+    if (results.length === 0) {
+      console.log("Average Score: N/A (no results)");
+      return;
+    }
+    const avgScore = results.reduce((sum, r) => sum + r.score, 0) /
+      results.length;
+    console.log(`Average Score: ${avgScore.toFixed(3)}`);
+  }
+
+  /**
+   * Scores every item with `Factuality`, one request at a time, in input order.
+   *
+   * @param items Test cases to score.
+   * @returns One result per item, in the same order as `items`.
+   */
+  async evaluateItems(
+    items: FactualityEvalItem[],
+  ): Promise<FactualityEvalResult[]> {
+    const results: FactualityEvalResult[] = [];
+
+    // `items` is a plain array, so a regular `for...of` is sufficient.
+    for (const item of items) {
+      const result = await Factuality({
+        input: item.input,
+        output: item.output,
+        expected: item.expected,
+      } as any);
+
+      results.push(result as FactualityEvalResult);
+    }
+
+    return results;
+  }
+}