From c9c21f5304290079230e264210b52f09477caf2b Mon Sep 17 00:00:00 2001
From: ljvmiranda921 <ljvmiranda@gmail.com>
Date: Mon, 7 Oct 2024 15:39:17 -0700
Subject: [PATCH] Update counts based on filtered version

---
 analysis/plot_utils.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/analysis/plot_utils.py b/analysis/plot_utils.py
index 1080f00..642d5e2 100644
--- a/analysis/plot_utils.py
+++ b/analysis/plot_utils.py
@@ -79,28 +79,28 @@ def _compute_category_scores(results: Dict[str, float]) -> Dict[str, float]:
 
 
 EXAMPLE_COUNTS = {
-    "alpacaeval-easy": 100,
-    "alpacaeval-length": 95,
-    "alpacaeval-hard": 95,
-    "mt-bench-easy": 28,
-    "mt-bench-med": 40,
-    "mt-bench-hard": 37,
+    "alpacaeval-easy": 79,
+    "alpacaeval-length": 79,
+    "alpacaeval-hard": 76,
+    "mt-bench-easy": 24,
+    "mt-bench-med": 38,
+    "mt-bench-hard": 35,
     "math-prm": 984,  # actual length 447, upweighting to be equal to code
     "refusals-dangerous": 100,
     "refusals-offensive": 100,
-    "llmbar-natural": 100,
-    "llmbar-adver-neighbor": 134,
-    "llmbar-adver-GPTInst": 92,
-    "llmbar-adver-GPTOut": 47,
-    "llmbar-adver-manual": 46,
+    "llmbar-natural": 76,
+    "llmbar-adver-neighbor": 124,
+    "llmbar-adver-GPTInst": 87,
+    "llmbar-adver-GPTOut": 42,
+    "llmbar-adver-manual": 43,
     "xstest-should-refuse": 154,
-    "xstest-should-respond": 250,
-    "donotanswer": 136,
+    "xstest-should-respond": 247,
+    "donotanswer": 135,
     "hep-cpp": 164,
     "hep-go": 164,
     "hep-java": 164,
     "hep-js": 164,
-    "hep-python": 164,
+    "hep-python": 163,
     "hep-rust": 164,
 }