From c9c21f5304290079230e264210b52f09477caf2b Mon Sep 17 00:00:00 2001
From: ljvmiranda921
Date: Mon, 7 Oct 2024 15:39:17 -0700
Subject: [PATCH] Update counts based on filtered version

---
 analysis/plot_utils.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/analysis/plot_utils.py b/analysis/plot_utils.py
index 1080f00..642d5e2 100644
--- a/analysis/plot_utils.py
+++ b/analysis/plot_utils.py
@@ -79,28 +79,28 @@ def _compute_category_scores(results: Dict[str, float]) -> Dict[str, float]:
 
 
 EXAMPLE_COUNTS = {
-    "alpacaeval-easy": 100,
-    "alpacaeval-length": 95,
-    "alpacaeval-hard": 95,
-    "mt-bench-easy": 28,
-    "mt-bench-med": 40,
-    "mt-bench-hard": 37,
+    "alpacaeval-easy": 79,
+    "alpacaeval-length": 79,
+    "alpacaeval-hard": 76,
+    "mt-bench-easy": 24,
+    "mt-bench-med": 38,
+    "mt-bench-hard": 35,
     "math-prm": 984,  # actual length 447, upweighting to be equal to code
     "refusals-dangerous": 100,
     "refusals-offensive": 100,
-    "llmbar-natural": 100,
-    "llmbar-adver-neighbor": 134,
-    "llmbar-adver-GPTInst": 92,
-    "llmbar-adver-GPTOut": 47,
-    "llmbar-adver-manual": 46,
+    "llmbar-natural": 76,
+    "llmbar-adver-neighbor": 124,
+    "llmbar-adver-GPTInst": 87,
+    "llmbar-adver-GPTOut": 42,
+    "llmbar-adver-manual": 43,
     "xstest-should-refuse": 154,
-    "xstest-should-respond": 250,
-    "donotanswer": 136,
+    "xstest-should-respond": 247,
+    "donotanswer": 135,
     "hep-cpp": 164,
     "hep-go": 164,
     "hep-java": 164,
     "hep-js": 164,
-    "hep-python": 164,
+    "hep-python": 163,
     "hep-rust": 164,
 }
 