compressed some
Signed-off-by: dafnapension <dafnashein@yahoo.com>
dafnapension committed May 1, 2024
1 parent 2f9bcdd commit 400fa81
Showing 1 changed file with 20 additions and 84 deletions.
104 changes: 20 additions & 84 deletions src/unitxt/metrics.py
@@ -682,31 +682,30 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
# done from the individual groups' scores (True), as if each group is represented by one instance whose score
# is the group's aggregated score, or from the whole stream (False), where each resample is then split to
# groups, the score of which is then computed, and finally averaged with the other groups' scores.
grouping = None
grouping: dict = None

# how to aggregate over the scores in the instances. Each and every score_name in score_names is aggregated (over
# the instances in the stream or group) by this aggregating function.
# the None value must be overridden by the subclasses.
aggregating = None

# example: {"aggregating_function_name": "mean",
# "aggregating_function": MetricWithConfidenceInterval.average_item_scores}
# Potentially, to be overridden by the subclasses.
aggregating: dict = None
# if not set by subclasses, it is set here to {
# "aggregating_function_name": "mean",
# "aggregating_function": MetricWithConfidenceInterval.average_item_scores,
# }

# another example: the user specifies a callable that aggregates a list of floats into one float.
# also specified is a name of a field, hereafter named subgroup_column, that the value of which, indicates the
# also specified is a name of a field, hereafter named subgroup_column, whose value indicates the
# sub-list to which each instance (of the stream or group) belongs. Finally, one or two sets of values
# of that subgroup_column are specified, by which one or two sets of instances (from the stream or group)
# are identified. If one set: the aggregation is only carried over that set, and the result of that aggregation
# becomes the score of the group-or-stream. If two sets, the ratio between these two aggregations is the score
# of the group-or-stream:
# are identified. If one set, named the subgroup, the aggregation is only carried over that subgroup (of interest
# to the user), and the result of that aggregation becomes the score of the group-or-stream.
# If two sets, named control and comparison, the ratio between these two aggregations is set to be the score
# of the group-or-stream.

reference_field: str = NonPositionalField(default="references")
prediction_field: str = NonPositionalField(default="prediction")

def verify(self):
assert (
self.aggregating is not None
), "self.aggregating must be a specified input arg"
assert isinstance(self.aggregating, dict), "aggregating must be a dict"
assert len(self.aggregating) == 2, "aggregating must consist of two fields"
assert (
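The comment block in the hunk above describes a subgroup-based aggregation but the diff shows no concrete shape for it. The sketch below is a hypothetical illustration only: apart from aggregating_function_name and aggregating_function, which appear throughout this diff, every field name is invented and is not part of this commit's API.

# Hypothetical sketch -- only aggregating_function_name / aggregating_function
# are field names used in this commit; the remaining keys are invented here.
aggregating = {
    "aggregating_function_name": "paraphrase_to_original_ratio",
    # a user-supplied callable that reduces a list of floats to a single float:
    "aggregating_function": lambda scores: sum(scores) / len(scores) if scores else 0.0,
    # the subgroup_column: a field whose value assigns each instance to a sub-list
    "subgroup_column": "variant",
    # one set of values -> aggregate only over that subgroup;
    # two sets -> the group-or-stream score is the ratio of the two aggregations
    "control_values": ["original"],
    "comparison_values": ["paraphrase"],
}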
@@ -741,8 +740,12 @@ def verify(self):
), "'score_names' and 'to_score_names' must have the same length"

def prepare(self):
# in this PR, we take the input args as they are, in order to run on all the currently defined individual InstanceMetric subclasses,
# but we change the args right here to the shape we suggest they should have:
if self.aggregating is None:
self.aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

if self.score_names is None:
self.score_names = [self.main_score]
self.prefix = ""
@@ -991,10 +994,6 @@ def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> di
class Accuracy(InstanceMetric):
grouping = None
score_names = ["accuracy"]
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
main_score = "accuracy"
ci_scores = ["accuracy"]

@@ -1045,10 +1044,6 @@ def prepare(self):
class UnsortedListExactMatch(InstanceMetric):
main_score = "unsorted_list_exact_match"
ci_scores = ["unsorted_list_exact_match"]
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

def compute(
self, references: List[Any], prediction: Any, task_data: List[Dict]
@@ -1062,10 +1057,6 @@ def compute(
class StringContainment(InstanceMetric):
main_score = "string_containment"
ci_scores = ["string_containment"]
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

prediction_type = "Any" # string representation is compared
single_reference_per_prediction = False # multiple references allowed
@@ -1527,10 +1518,6 @@ def compute(self, references, predictions, task_data: List[Dict]):
# Computes char edit distance, ignoring whitespace
class CharEditDistance(InstanceMetric):
main_score = "char_edit_distance"
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
ci_scores = [main_score]
prediction_type = "str"
single_reference_per_prediction = True
@@ -1561,10 +1548,6 @@ def compute(self, references, prediction: str, task_data: List[Dict]) -> dict:

class CharEditDistanceAccuracy(CharEditDistance):
main_score = "char_edit_dist_accuracy"
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

ci_scores = [main_score]

@@ -1911,10 +1894,6 @@ def lower(text):

class TokenOverlap(InstanceMetric):
score_names = ["f1", "precision", "recall"]
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

main_score = "f1"
ci_scores = ["f1", "precision", "recall"]
@@ -2100,9 +2079,8 @@ class LlamaIndexCorrectness(InstanceMetric):
aggregating: dict = None

openai_models: List[str] = ["gpt-3.5-turbo"]
anthropic_models: List[
str
] = [] # this is here for the sake of documentation for future models
# anthropic_models is here for the sake of documentation for future models:
anthropic_models: List[str] = []
mock_models: List[str] = ["mock"]
external_api_models = openai_models + anthropic_models

@@ -2141,11 +2119,6 @@ def prepare(self):
f"correctness_llama_index_by_{self.model_name_normalized}_judge"
)

self.aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

super().prepare()

from llama_index.core.evaluation import CorrectnessEvaluator
@@ -2650,10 +2623,6 @@ def _compute(


class MRR(RetrievalMetric):
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
main_score = "mrr"
ci_scores = ["mrr"]

@@ -2670,10 +2639,6 @@ def _compute(


class MAP(RetrievalMetric):
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
main_score = "map"
ci_scores = ["map"]

@@ -2708,10 +2673,6 @@ def prepare(self):
for k in self.k_list
]
self.score_names = self.ci_scores
self.aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
super().prepare()

@staticmethod
@@ -3089,10 +3050,6 @@ class GroupMeanAccuracy(Accuracy):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": False,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}


class FixedGroupMeanAccuracy(Accuracy):
@@ -3101,10 +3058,6 @@ class FixedGroupMeanAccuracy(Accuracy):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": True,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
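For orientation (not part of this diff): after this change, a grouped variant of an existing instance metric, like the two classes above, only needs the grouping dict; the mean aggregation is supplied by the default that prepare() now sets. A hypothetical subclass, sketched under that assumption:

class GroupMeanUnsortedListExactMatch(UnsortedListExactMatch):
    # resample the CI from the whole stream, then split each resample into groups:
    grouping = {
        "group_by_field": "task_data/group_id",
        "ci_samples_from_groups_scores": False,
    }
    # no 'aggregating' here: prepare() fills in the "mean" default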


# same as above, now using StringContainment
@@ -3113,10 +3066,6 @@ class GroupMeanStringContainment(StringContainment):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": False,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}


class FixedGroupMeanStringContainment(StringContainment):
@@ -3125,10 +3074,6 @@ class FixedGroupMeanStringContainment(StringContainment):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": True,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": InstanceMetric.average_item_scores,
}


# take only the (fixed) group mean of baseline or other (paraphrases) scores
@@ -3232,10 +3177,6 @@ class GroupMeanTokenOverlap(TokenOverlap):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": False,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}


# using Cohen's h for proportions
@@ -3415,11 +3356,6 @@ class BinaryAccuracy(InstanceMetric):
prediction_type = "str"
single_reference_per_prediction = True

aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

def compute(
self, references: List[Any], prediction: Any, task_data: List[Dict]
) -> dict:
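The pattern repeated across this diff, summarized as a before/after sketch (illustration only; compute() and the other class members are omitted):

# before this commit:
class StringContainment(InstanceMetric):
    main_score = "string_containment"
    ci_scores = ["string_containment"]
    aggregating = {
        "aggregating_function_name": "mean",
        "aggregating_function": MetricWithConfidenceInterval.average_item_scores,
    }

# after this commit:
class StringContainment(InstanceMetric):
    main_score = "string_containment"
    ci_scores = ["string_containment"]
    # aggregating is left as None; prepare() defaults it to the "mean" aggregation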
