huggingface
diff --git a/community_tasks/_template.py b/community_tasks/_template.py
Lines changed: 6 additions & 10 deletions
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
Lines changed: 23 additions & 25 deletions
diff --git a/community_tasks/french_evals.py b/community_tasks/french_evals.py
Lines changed: 4 additions & 13 deletions
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
Lines changed: 3 additions & 1 deletion
diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx
Lines changed: 2 additions & 48 deletions
@@ -30,13 +30,10 @@
 """
 
 import numpy as np
-from aenum import extend_enum
 
-from lighteval.metrics.metrics import Metrics, SampleLevelMetric
-from lighteval.metrics.utils.metric_utils import MetricCategory, MetricUseCase
-from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.metrics.metrics import SampleLevelMetric
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
+from lighteval.tasks.requests import Doc, SamplingMethod
 
 
 # DEFINE YOUR PROMPT FUNCTIONS
@@ -49,7 +46,7 @@ def prompt_fn(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query="",
-        choices="",
+        choices=[""],
         gold_index=0,
         instruction="",
     )
@@ -68,7 +65,7 @@ def prompt_fn(line, task_name: str = None):
     evaluation_splits=[],
     few_shots_split="",
     few_shots_select="",
-    metric=[],  # select your metric in Metrics
+    metrics=[],  # select your metric in Metrics
 )
 
 # EVALS WITH SUBSET
@@ -91,7 +88,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
             hf_repo="",
-            metric=[custom_metric],  # select your metric in Metrics or use your custom_metric
+            metrics=[custom_metric],  # select your metric in Metrics or use your custom_metric
             hf_avail_splits=[],
             evaluation_splits=[],
             few_shots_split="",
@@ -111,8 +108,7 @@ def __init__(
 custom_metric = SampleLevelMetric(
     metric_name="my_custom_metric_name",
     higher_is_better=True,
-    category=MetricCategory.IGNORED,
-    use_case=MetricUseCase.NONE,
+    category=SamplingMethod.GENERATIVE,  # or LOGPROBS, PERPLEXITY, etc.
     sample_level_fn=lambda x: x,  # how to compute score for one sample
     corpus_level_fn=np.mean,  # aggregation
 )
@@ -32,11 +32,10 @@
 from typing import Any, Dict, List, Optional, Union
 
 from lighteval.metrics.llm_as_judge import JudgeLM
-from lighteval.metrics.metrics import Metric, MetricCategory, Metrics
-from lighteval.metrics.utils.metric_utils import MetricUseCase
+from lighteval.metrics.metrics import Metric, Metrics
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
+from lighteval.tasks.requests import Doc, SamplingMethod
 
 
 # fmt: off
@@ -104,7 +103,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_pfn,
             hf_repo="MBZUAI/ArabicMMLU",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
@@ -166,7 +165,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_ht_pfn,
             hf_repo="MBZUAI/human_translated_arabic_mmlu",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=None,
@@ -231,7 +230,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_mt_pfn,
             hf_repo="OALL/Arabic_MMLU",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test", "dev"],
             evaluation_splits=["test"],
             few_shots_split="dev",
@@ -287,7 +286,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=acva_pfn,
             hf_repo="OALL/ACVA",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",
@@ -344,7 +343,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=aratrust_pfn,
             hf_repo="asas-ai/AraTrust-categorized",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["train"],
             evaluation_splits=["train"],
             few_shots_split=None,
@@ -393,7 +392,7 @@ def arabic_exams_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -444,7 +443,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=alghafa_pfn,
             hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",
@@ -471,7 +470,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -488,7 +487,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -505,7 +504,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -522,7 +521,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -539,7 +538,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -556,7 +555,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -594,7 +593,7 @@ def boolq_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -629,7 +628,7 @@ def copa_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -673,7 +672,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -710,7 +709,7 @@ def toxigen_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -761,7 +760,7 @@ def sciq_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -819,7 +818,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=madinah_qa_pfn,
             hf_repo="MBZUAI/MadinahQA",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
@@ -849,11 +848,10 @@ def __init__(self, judge: JudgeLM):
         """
         self.judge = judge
         self.metric_name = "llm_as_judge"
-        self.category = MetricCategory.LLM_AS_JUDGE
+        self.category = SamplingMethod.GENERATIVE
         self.corpus_level_fn = self.aggregate_scores
         self.sample_level_fn = self._sample_level_fn
         self.higher_is_better = True  # Fixed tuple syntax
-        self.use_case = MetricUseCase.NONE
 
     def compute(self, responses: list[str], formatted_docs: list[Doc], **kwargs) -> dict[str, float]:
         """
@@ -1039,7 +1037,7 @@ def process_judge_response(response) -> float:
     hf_subset=None,
     hf_avail_splits=["train"],
     evaluation_splits=["train"],
-    metric=[wrapped_judge],
+    metrics=[wrapped_judge],
     trust_dataset=True,
     generation_size=200,
     stop_sequence=[],
 
@@ -32,16 +32,7 @@
 
 import random
 
-import numpy as np
-from aenum import extend_enum
-
-import lighteval.tasks.extended.ifeval.instructions_registry as instructions_registry
-from lighteval.metrics.metrics import Metrics, SampleLevelMetric
-from lighteval.metrics.utils.metric_utils import (
-    MetricCategory,
-    MetricUseCase,
-    SampleLevelMetricGrouping,
-)
+from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.extended.ifeval.main import ifeval_metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
@@ -106,7 +97,7 @@ def prompt_bac_fr(line, task_name: str = None):
     suite=["community"],
     hf_repo="fr-gouv-coordination-ia/IFEval-fr",
     hf_subset="default",
-    metric=[ifeval_metrics],
+    metrics=[ifeval_metrics],
     hf_avail_splits=["train"],
     evaluation_splits=["train"],
     few_shots_split="train",
@@ -128,7 +119,7 @@ def prompt_bac_fr(line, task_name: str = None):
     few_shots_split=None,
     few_shots_select="random_sampling",
     generation_size=1,
-    metric=[Metrics.loglikelihood_acc],
+    metrics=[Metrics.loglikelihood_acc],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
@@ -146,7 +137,7 @@ def prompt_bac_fr(line, task_name: str = None):
     few_shots_split=None,
     few_shots_select="random_sampling",
     generation_size=1,
-    metric=[Metrics.quasi_exact_match_math, Metrics.exact_match],
+    metrics=[Metrics.quasi_exact_match_math, Metrics.exact_match],
     stop_sequence=["\n"],
     trust_dataset=True,
     version=0,
 
@@ -41,9 +41,11 @@
     - local: package_reference/evaluation_tracker
       title: EvaluationTracker
     - local: package_reference/models
-      title: Models and ModelConfigs
+      title: Model Configs
     - local: package_reference/pipeline
       title: Pipeline
+    - local: package_reference/models_outputs
+      title: Model's Output
     title: Main classes
   - local: package_reference/metrics
     title: Metrics
 
@@ -41,7 +41,6 @@ def prompt_fn(line, task_name: str = None):
         query=line["question"],
         choices=[f" {c}" for c in line["choices"]],
         gold_index=line["gold"],
-        instruction="",
     )
 ```
 
@@ -53,8 +52,7 @@ in [`lighteval.metrics.metrics.Metrics`]) or [create a custom one](adding-a-new-
 custom_metric = SampleLevelMetric(
     metric_name="my_custom_metric_name",
     higher_is_better=True,
-    category=MetricCategory.IGNORED,
-    use_case=MetricUseCase.NONE,
+    category=SamplingMethod.GENERATIVE,  # or SamplingMethod.LOGPROBS
     sample_level_fn=lambda x: x,  # how to compute score for one sample
     corpus_level_fn=np.mean,  # How to aggregate the samples metrics
 )
@@ -77,7 +75,7 @@ task = LightevalTaskConfig(
     evaluation_splits=[],
     few_shots_split=None,
     few_shots_select=None,
-    metric=[],  # select your metric in Metrics
+    metrics=[],  # select your metric in Metrics
 )
 ```
 
@@ -111,50 +109,6 @@ class CustomSubsetTask(LightevalTaskConfig):
 SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
 ```
 
-Here is a list of the parameters and their meaning:
-
-- `name` (str), your evaluation name
-- `suite` (list), the suite(s) to which your evaluation should belong. This
-  field allows us to compare different task implementations and is used as a
-  task selection to differentiate the versions to launch. At the moment, you'll
-  find the keywords ["helm", "bigbench", "original", "lighteval", "community",
-  "custom"]; for core evals, please choose `lighteval`.
-- `prompt_function` (Callable), the prompt function you defined in the step
-  above
-- `hf_repo` (str), the path to your evaluation dataset on the hub
-- `hf_subset` (str), the specific subset you want to use for your evaluation
-  (note: when the dataset has no subset, fill this field with `"default"`, not
-  with `None` or `""`)
-- `hf_avail_splits` (list), all the splits available for your dataset (train,
-  valid or validation, test, other...)
-- `evaluation_splits` (list), the splits you want to use for evaluation
-- `few_shots_split` (str, can be `null`), the specific split from which you
-  want to select samples for your few-shot examples. It should be different
-  from the sets included in `evaluation_splits`
-- `few_shots_select` (str, can be `null`), the method that you will use to
-  select items for your few-shot examples. Can be `null`, or one of:
-    - `balanced` select examples from the `few_shots_split` with balanced
-      labels, to avoid skewing the few shot examples (hence the model
-      generations) toward one specific label
-    - `random` selects examples at random from the `few_shots_split`
-    - `random_sampling` selects new examples at random from the
-      `few_shots_split` for every new item, but if a sampled item is equal to
-      the current one, it is removed from the available samples
-    - `random_sampling_from_train` selects new examples at random from the
-      `few_shots_split` for every new item, but if a sampled item is equal to
-      the current one, it is kept! Only use this if you know what you are
-      doing.
-    - `sequential` selects the first `n` examples of the `few_shots_split`
-- `generation_size` (int), the maximum number of tokens allowed for a
-  generative evaluation. If your evaluation is a log likelihood evaluation
-  (multi-choice), this value should be -1
-- `stop_sequence` (list), a list of strings acting as end of sentence tokens
-  for your generation
-- `metric` (list), the metrics you want to use for your evaluation (see next
-  section for a detailed explanation)
-- `trust_dataset` (bool), set to True if you trust the dataset.
-
-
 Then you need to add your task to the `TASKS_TABLE` list.
 
 ```python