3232from typing import Any , Dict , List , Optional , Union
3333
3434from lighteval .metrics .llm_as_judge import JudgeLM
35- from lighteval .metrics .metrics import Metric , MetricCategory , Metrics
36- from lighteval .metrics .utils .metric_utils import MetricUseCase
35+ from lighteval .metrics .metrics import Metric , Metrics
3736from lighteval .tasks .default_prompts import LETTER_INDICES
3837from lighteval .tasks .lighteval_task import LightevalTaskConfig
39- from lighteval .tasks .requests import Doc
38+ from lighteval .tasks .requests import Doc , SamplingMethod
4039
4140
4241# fmt: off
@@ -104,7 +103,7 @@ def __init__(
104103 hf_subset = hf_subset ,
105104 prompt_function = arabic_mmlu_pfn ,
106105 hf_repo = "MBZUAI/ArabicMMLU" ,
107- metric = [Metrics .loglikelihood_acc_norm ],
106+ metrics = [Metrics .loglikelihood_acc_norm ],
108107 hf_avail_splits = ["test" ],
109108 evaluation_splits = ["test" ],
110109 few_shots_split = ["dev" ],
@@ -166,7 +165,7 @@ def __init__(
166165 hf_subset = hf_subset ,
167166 prompt_function = arabic_mmlu_ht_pfn ,
168167 hf_repo = "MBZUAI/human_translated_arabic_mmlu" ,
169- metric = [Metrics .loglikelihood_acc_norm ],
168+ metrics = [Metrics .loglikelihood_acc_norm ],
170169 hf_avail_splits = ["test" ],
171170 evaluation_splits = ["test" ],
172171 few_shots_split = None ,
@@ -231,7 +230,7 @@ def __init__(
231230 hf_subset = hf_subset ,
232231 prompt_function = arabic_mmlu_mt_pfn ,
233232 hf_repo = "OALL/Arabic_MMLU" ,
234- metric = [Metrics .loglikelihood_acc_norm ],
233+ metrics = [Metrics .loglikelihood_acc_norm ],
235234 hf_avail_splits = ["test" , "dev" ],
236235 evaluation_splits = ["test" ],
237236 few_shots_split = "dev" ,
@@ -287,7 +286,7 @@ def __init__(
287286 hf_subset = hf_subset ,
288287 prompt_function = acva_pfn ,
289288 hf_repo = "OALL/ACVA" ,
290- metric = [Metrics .loglikelihood_acc_norm ],
289+ metrics = [Metrics .loglikelihood_acc_norm ],
291290 hf_avail_splits = ["test" , "validation" ],
292291 evaluation_splits = ["test" ],
293292 few_shots_split = "validation" ,
@@ -344,7 +343,7 @@ def __init__(
344343 hf_subset = hf_subset ,
345344 prompt_function = aratrust_pfn ,
346345 hf_repo = "asas-ai/AraTrust-categorized" ,
347- metric = [Metrics .loglikelihood_acc_norm ],
346+ metrics = [Metrics .loglikelihood_acc_norm ],
348347 hf_avail_splits = ["train" ],
349348 evaluation_splits = ["train" ],
350349 few_shots_split = None ,
@@ -393,7 +392,7 @@ def arabic_exams_pfn(line, task_name: str = None):
393392 evaluation_splits = ["test" ],
394393 few_shots_split = "validation" ,
395394 few_shots_select = "sequential" ,
396- metric = [Metrics .loglikelihood_acc_norm ],
395+ metrics = [Metrics .loglikelihood_acc_norm ],
397396 trust_dataset = True ,
398397 version = 0 ,
399398)
@@ -444,7 +443,7 @@ def __init__(
444443 hf_subset = hf_subset ,
445444 prompt_function = alghafa_pfn ,
446445 hf_repo = "OALL/AlGhafa-Arabic-LLM-Benchmark-Native" ,
447- metric = [Metrics .loglikelihood_acc_norm ],
446+ metrics = [Metrics .loglikelihood_acc_norm ],
448447 hf_avail_splits = ["test" , "validation" ],
449448 evaluation_splits = ["test" ],
450449 few_shots_split = "validation" ,
@@ -471,7 +470,7 @@ def __init__(
471470 evaluation_splits = ["test" ],
472471 few_shots_split = "validation" ,
473472 few_shots_select = "sequential" ,
474- metric = [Metrics .loglikelihood_acc_norm ],
473+ metrics = [Metrics .loglikelihood_acc_norm ],
475474 trust_dataset = True ,
476475 version = 0 ,
477476)
@@ -488,7 +487,7 @@ def __init__(
488487 evaluation_splits = ["test" ],
489488 few_shots_split = "validation" ,
490489 few_shots_select = "sequential" ,
491- metric = [Metrics .loglikelihood_acc_norm ],
490+ metrics = [Metrics .loglikelihood_acc_norm ],
492491 trust_dataset = True ,
493492 version = 0 ,
494493)
@@ -505,7 +504,7 @@ def __init__(
505504 evaluation_splits = ["test" ],
506505 few_shots_split = "validation" ,
507506 few_shots_select = "sequential" ,
508- metric = [Metrics .loglikelihood_acc_norm ],
507+ metrics = [Metrics .loglikelihood_acc_norm ],
509508 trust_dataset = True ,
510509 version = 0 ,
511510)
@@ -522,7 +521,7 @@ def __init__(
522521 evaluation_splits = ["test" ],
523522 few_shots_split = "validation" ,
524523 few_shots_select = "sequential" ,
525- metric = [Metrics .loglikelihood_acc_norm ],
524+ metrics = [Metrics .loglikelihood_acc_norm ],
526525 trust_dataset = True ,
527526 version = 0 ,
528527)
@@ -539,7 +538,7 @@ def __init__(
539538 evaluation_splits = ["test" ],
540539 few_shots_split = "validation" ,
541540 few_shots_select = "sequential" ,
542- metric = [Metrics .loglikelihood_acc_norm ],
541+ metrics = [Metrics .loglikelihood_acc_norm ],
543542 trust_dataset = True ,
544543 version = 0 ,
545544)
@@ -556,7 +555,7 @@ def __init__(
556555 evaluation_splits = ["test" ],
557556 few_shots_split = "validation" ,
558557 few_shots_select = "sequential" ,
559- metric = [Metrics .loglikelihood_acc_norm ],
558+ metrics = [Metrics .loglikelihood_acc_norm ],
560559 trust_dataset = True ,
561560 version = 0 ,
562561)
@@ -594,7 +593,7 @@ def boolq_arabic_pfn(line, task_name: str = None):
594593 evaluation_splits = ["test" ],
595594 few_shots_split = "validation" ,
596595 few_shots_select = "sequential" ,
597- metric = [Metrics .loglikelihood_acc_norm ],
596+ metrics = [Metrics .loglikelihood_acc_norm ],
598597 trust_dataset = True ,
599598 version = 0 ,
600599)
@@ -629,7 +628,7 @@ def copa_arabic_pfn(line, task_name: str = None):
629628 evaluation_splits = ["test" ],
630629 few_shots_split = "validation" ,
631630 few_shots_select = "sequential" ,
632- metric = [Metrics .loglikelihood_acc_norm ],
631+ metrics = [Metrics .loglikelihood_acc_norm ],
633632 trust_dataset = True ,
634633 version = 0 ,
635634)
@@ -673,7 +672,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
673672 evaluation_splits = ["test" ],
674673 few_shots_split = "validation" ,
675674 few_shots_select = "sequential" ,
676- metric = [Metrics .loglikelihood_acc_norm ],
675+ metrics = [Metrics .loglikelihood_acc_norm ],
677676 trust_dataset = True ,
678677 version = 0 ,
679678)
@@ -710,7 +709,7 @@ def toxigen_arabic_pfn(line, task_name: str = None):
710709 evaluation_splits = ["test" ],
711710 few_shots_split = "validation" ,
712711 few_shots_select = "sequential" ,
713- metric = [Metrics .loglikelihood_acc_norm ],
712+ metrics = [Metrics .loglikelihood_acc_norm ],
714713 trust_dataset = True ,
715714 version = 0 ,
716715)
@@ -761,7 +760,7 @@ def sciq_arabic_pfn(line, task_name: str = None):
761760 evaluation_splits = ["test" ],
762761 few_shots_split = "validation" ,
763762 few_shots_select = "sequential" ,
764- metric = [Metrics .loglikelihood_acc_norm ],
763+ metrics = [Metrics .loglikelihood_acc_norm ],
765764 trust_dataset = True ,
766765 version = 0 ,
767766)
@@ -819,7 +818,7 @@ def __init__(
819818 hf_subset = hf_subset ,
820819 prompt_function = madinah_qa_pfn ,
821820 hf_repo = "MBZUAI/MadinahQA" ,
822- metric = [Metrics .loglikelihood_acc_norm ],
821+ metrics = [Metrics .loglikelihood_acc_norm ],
823822 hf_avail_splits = ["test" ],
824823 evaluation_splits = ["test" ],
825824 few_shots_split = ["dev" ],
@@ -849,11 +848,10 @@ def __init__(self, judge: JudgeLM):
849848 """
850849 self .judge = judge
851850 self .metric_name = "llm_as_judge"
852- self .category = MetricCategory . LLM_AS_JUDGE
851+ self .category = SamplingMethod . GENERATIVE
853852 self .corpus_level_fn = self .aggregate_scores
854853 self .sample_level_fn = self ._sample_level_fn
855854 self .higher_is_better = True # Fixed tuple syntax
856- self .use_case = MetricUseCase .NONE
857855
858856 def compute (self , responses : list [str ], formatted_docs : list [Doc ], ** kwargs ) -> dict [str , float ]:
859857 """
@@ -1039,7 +1037,7 @@ def process_judge_response(response) -> float:
10391037 hf_subset = None ,
10401038 hf_avail_splits = ["train" ],
10411039 evaluation_splits = ["train" ],
1042- metric = [wrapped_judge ],
1040+ metrics = [wrapped_judge ],
10431041 trust_dataset = True ,
10441042 generation_size = 200 ,
10451043 stop_sequence = [],
0 commit comments