compressed some
Signed-off-by: dafnapension <dafnashein@yahoo.com>
dafnapension committed May 1, 2024
1 parent 2f9bcdd commit 400fa81
Showing 1 changed file with 20 additions and 84 deletions.
104 changes: 20 additions & 84 deletions src/unitxt/metrics.py
@@ -682,31 +682,30 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
# done from the individual groups' scores (True), as if each group is represented by one instance whose score
# is the group's aggregated score, or from the whole stream (False), where each resample is then split to
# groups, the score of which is then computed, and finally averaged with the other groups' scores.
grouping = None
grouping: dict = None

# how to aggregate over the scores in the instances. Each and every score_name in score_names is aggregated (over
# the instances in the stream or group) by this aggregating function.
# the None value must be overridden by the subclasses.
aggregating = None

# example: {"aggregating_function_name": "mean",
# "aggregating_function": MetricWithConfidenceInterval.average_item_scores}
# Potentially, to be overridden by the subclasses.
aggregating: dict = None
# if not set by subclasses, it is set here to {
# "aggregating_function_name": "mean",
# "aggregating_function": MetricWithConfidenceInterval.average_item_scores,
# }

# another example: the user specifies a callable that aggregates a list of floats into one float.
# also specified is a name of a field, hereafter named subgroup_column, that the value of which, indicates the
# also specified is a name of a field, hereafter named subgroup_column, whose value indicates the
# sub-list to which each instance (of the stream or group) belongs. Finally, one or two sets of values
# of that subgroup_column are specified, by which one or two sets of instances (from the stream or group)
# are identified. If one set: the aggregation is only carried over that set, and the result of that aggregation
# becomes the score of the group-or-stream. If two sets, the ratio between these two aggregations is the score
# of the group-or-stream:
# are identified. If one set, named the subgroup, the aggregation is only carried over that subgroup (of interest
# to the user), and the result of that aggregation becomes the score of the group-or-stream.
# If two sets, named control and comparison, the ratio between these two aggregations is set to be the score
# of the group-or-stream.

reference_field: str = NonPositionalField(default="references")
prediction_field: str = NonPositionalField(default="prediction")

def verify(self):
assert (
self.aggregating is not None
), "self.aggregating must be a specified input arg"
assert isinstance(self.aggregating, dict), "aggregating must be a dict"
assert len(self.aggregating) == 2, "aggregating must consist of two fields"
assert (
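The comment block in the hunk above describes a subgroup-based aggregation but the diff shows no concrete shape for it. The sketch below is a hypothetical illustration only: apart from aggregating_function_name and aggregating_function, which appear throughout this diff, every field name is invented and is not part of this commit's API.

# Hypothetical sketch -- only aggregating_function_name / aggregating_function
# are field names used in this commit; the remaining keys are invented here.
aggregating = {
    "aggregating_function_name": "paraphrase_to_original_ratio",
    # a user-supplied callable that reduces a list of floats to a single float:
    "aggregating_function": lambda scores: sum(scores) / len(scores) if scores else 0.0,
    # the subgroup_column: a field whose value assigns each instance to a sub-list
    "subgroup_column": "variant",
    # one set of values -> aggregate only over that subgroup;
    # two sets -> the group-or-stream score is the ratio of the two aggregations
    "control_values": ["original"],
    "comparison_values": ["paraphrase"],
}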
@@ -741,8 +740,12 @@ def verify(self):
), "'score_names' and 'to_score_names' must have the same length"

def prepare(self):
# in this PR, we take the input args as they are, in order to run on all the currently defined individual InstanceMetric subclasses,
# but we change the args right here to the shape we suggest they should have:
if self.aggregating is None:
self.aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

if self.score_names is None:
self.score_names = [self.main_score]
self.prefix = ""
@@ -991,10 +994,6 @@ def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> di
class Accuracy(InstanceMetric):
grouping = None
score_names = ["accuracy"]
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
main_score = "accuracy"
ci_scores = ["accuracy"]

@@ -1045,10 +1044,6 @@ def prepare(self):
class UnsortedListExactMatch(InstanceMetric):
main_score = "unsorted_list_exact_match"
ci_scores = ["unsorted_list_exact_match"]
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

def compute(
self, references: List[Any], prediction: Any, task_data: List[Dict]
@@ -1062,10 +1057,6 @@ def compute(
class StringContainment(InstanceMetric):
main_score = "string_containment"
ci_scores = ["string_containment"]
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

prediction_type = "Any" # string representation is compared
single_reference_per_prediction = False # multiple references allowed
@@ -1527,10 +1518,6 @@ def compute(self, references, predictions, task_data: List[Dict]):
# Computes char edit distance, ignoring whitespace
class CharEditDistance(InstanceMetric):
main_score = "char_edit_distance"
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
ci_scores = [main_score]
prediction_type = "str"
single_reference_per_prediction = True
@@ -1561,10 +1548,6 @@ def compute(self, references, prediction: str, task_data: List[Dict]) -> dict:

class CharEditDistanceAccuracy(CharEditDistance):
main_score = "char_edit_dist_accuracy"
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

ci_scores = [main_score]

@@ -1911,10 +1894,6 @@ def lower(text):

class TokenOverlap(InstanceMetric):
score_names = ["f1", "precision", "recall"]
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

main_score = "f1"
ci_scores = ["f1", "precision", "recall"]
@@ -2100,9 +2079,8 @@ class LlamaIndexCorrectness(InstanceMetric):
aggregating: dict = None

openai_models: List[str] = ["gpt-3.5-turbo"]
anthropic_models: List[
str
] = [] # this is here for the sake of documentation for future models
# anthropic_models is here for the sake of documentation for future models:
anthropic_models: List[str] = []
mock_models: List[str] = ["mock"]
external_api_models = openai_models + anthropic_models

@@ -2141,11 +2119,6 @@ def prepare(self):
f"correctness_llama_index_by_{self.model_name_normalized}_judge"
)

self.aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

super().prepare()

from llama_index.core.evaluation import CorrectnessEvaluator
@@ -2650,10 +2623,6 @@ def _compute(


class MRR(RetrievalMetric):
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
main_score = "mrr"
ci_scores = ["mrr"]

@@ -2670,10 +2639,6 @@ def _compute(


class MAP(RetrievalMetric):
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
main_score = "map"
ci_scores = ["map"]

@@ -2708,10 +2673,6 @@ def prepare(self):
for k in self.k_list
]
self.score_names = self.ci_scores
self.aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
super().prepare()

@staticmethod
@@ -3089,10 +3050,6 @@ class GroupMeanAccuracy(Accuracy):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": False,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}


class FixedGroupMeanAccuracy(Accuracy):
@@ -3101,10 +3058,6 @@ class FixedGroupMeanAccuracy(Accuracy):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": True,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}
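For orientation (not part of this diff): after this change, a grouped variant of an existing instance metric, like the two classes above, only needs the grouping dict; the mean aggregation is supplied by the default that prepare() now sets. A hypothetical subclass, sketched under that assumption:

class GroupMeanUnsortedListExactMatch(UnsortedListExactMatch):
    # resample the CI from the whole stream, then split each resample into groups:
    grouping = {
        "group_by_field": "task_data/group_id",
        "ci_samples_from_groups_scores": False,
    }
    # no 'aggregating' here: prepare() fills in the "mean" default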


# same as above, now using StringContainment
@@ -3113,10 +3066,6 @@ class GroupMeanStringContainment(StringContainment):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": False,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}


class FixedGroupMeanStringContainment(StringContainment):
@@ -3125,10 +3074,6 @@ class FixedGroupMeanStringContainment(StringContainment):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": True,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": InstanceMetric.average_item_scores,
}


# take only the (fixed) group mean of baseline or other (paraphrases) scores
@@ -3232,10 +3177,6 @@ class GroupMeanTokenOverlap(TokenOverlap):
"group_by_field": "task_data/group_id",
"ci_samples_from_groups_scores": False,
}
aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}


# using Cohen's h for proportions
@@ -3415,11 +3356,6 @@ class BinaryAccuracy(InstanceMetric):
prediction_type = "str"
single_reference_per_prediction = True

aggregating = {
"aggregating_function_name": "mean",
"aggregating_function": MetricWithConfidenceInterval.average_item_scores,
}

def compute(
self, references: List[Any], prediction: Any, task_data: List[Dict]
) -> dict:
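The pattern repeated across this diff, summarized as a before/after sketch (illustration only; compute() and the other class members are omitted):

# before this commit:
class StringContainment(InstanceMetric):
    main_score = "string_containment"
    ci_scores = ["string_containment"]
    aggregating = {
        "aggregating_function_name": "mean",
        "aggregating_function": MetricWithConfidenceInterval.average_item_scores,
    }

# after this commit:
class StringContainment(InstanceMetric):
    main_score = "string_containment"
    ci_scores = ["string_containment"]
    # aggregating is left as None; prepare() defaults it to the "mean" aggregation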
