Skip to content

Commit 8b0d2c9

Browse files
committed
add response variable name
Signed-off-by: Martín Santillán Cooper <msantillancooper@ibm.com>
1 parent e24eccb commit 8b0d2c9

File tree

2 files changed

+49
-18
lines changed

2 files changed

+49
-18
lines changed

src/unitxt/llm_as_judge.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ class LLMJudge(BulkInstanceMetric):
6363
generate_summaries: bool = True
6464
format = "formats.chat_api"
6565
include_prompts_in_result: bool = False
66+
response_variable_name_field: Optional[str] = None
67+
response_variable_name: str = "response"
6668
criteria_field: str = None
6769
criteria: Criteria = None
6870
logger = get_logger()
@@ -103,6 +105,16 @@ def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
103105
for td in task_data
104106
]
105107

108+
def get_response_variable_names(self, task_data: List[Dict[str, Any]]) -> List[str]:
109+
if self.response_variable_name_field is None:
110+
return [self.response_variable_name] * len(task_data)
111+
try:
112+
return [td[self.response_variable_name_field] for td in task_data]
113+
except KeyError as e:
114+
raise UnitxtError(
115+
f"The response variable name field `{self.response_variable_name_field}` was not found in the task data instance."
116+
) from e
117+
106118
def perform_evaluation_step(
107119
self,
108120
instances: list,
@@ -184,6 +196,8 @@ def prepare(self):
184196
"response": str,
185197
"criteria_description": str,
186198
"display_options_instruction": str,
199+
"response_variable_name": str,
200+
"response_variable_name_title": str,
187201
},
188202
reference_fields={},
189203
prediction_type=str,
@@ -202,6 +216,7 @@ def prepare(self):
202216
"criteria_description": str,
203217
"score_option_instruction": str,
204218
"options": list,
219+
"response_variable_name": str,
205220
},
206221
reference_fields={},
207222
prediction_type=str,
@@ -341,6 +356,7 @@ def compute(
341356
criterias = self.get_criterias(task_data, evaluations_count)
342357
self.set_main_score(criterias)
343358
contexts = self.get_contexts(task_data)
359+
response_variable_names = self.get_response_variable_names(task_data)
344360
if self.check_positional_bias:
345361
criterias += [
346362
CriteriaWithOptions(
@@ -352,6 +368,7 @@ def compute(
352368
for criteria in criterias
353369
]
354370
contexts += contexts
371+
response_variable_names += response_variable_names
355372
predictions += predictions
356373

357374
parsed_criterias = [
@@ -373,13 +390,16 @@ def compute(
373390
"response": prediction,
374391
"display_options_instruction": display_options_instruction,
375392
"criteria_description": criteria_description,
393+
"response_variable_name": response_variable_name,
394+
"response_variable_name_title": response_variable_name.capitalize(),
376395
"data_classification_policy": ["public"],
377396
}
378-
for context, prediction, criteria_description, display_options_instruction in zip(
397+
for context, prediction, criteria_description, display_options_instruction, response_variable_name in zip(
379398
contexts,
380399
predictions,
381400
criteria_description_list,
382401
display_options_instruction_list,
402+
response_variable_names
383403
)
384404
]
385405
assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step(
@@ -416,12 +436,14 @@ def compute(
416436
"criteria_description": criteria_description,
417437
"score_option_instruction": score_option_instruction,
418438
"options": criteria_option_names,
439+
"response_variable_name": response_variable_name,
419440
"data_classification_policy": ["public"],
420441
}
421-
for criteria_description, score_option_instruction, criteria_option_names in zip(
442+
for criteria_description, score_option_instruction, criteria_option_names, response_variable_name in zip(
422443
criteria_description_list,
423444
score_option_instruction_list,
424445
criteria_option_names_list,
446+
response_variable_names
425447
)
426448
]
427449

@@ -477,6 +499,8 @@ def prepare(self):
477499
"option_b": str,
478500
"criteria_name": str,
479501
"criteria_description": str,
502+
"response_variable_name": str,
503+
"response_variable_name_title": str,
480504
},
481505
reference_fields={},
482506
prediction_type=str,
@@ -494,6 +518,7 @@ def prepare(self):
494518
input_fields={
495519
"score_option_instruction": str,
496520
"options": list,
521+
"response_variable_name": str,
497522
},
498523
reference_fields={},
499524
prediction_type=str,
@@ -754,9 +779,11 @@ def compute(
754779

755780
criterias = self.get_criterias(task_data, instances_count)
756781
contexts = self.get_contexts(task_data)
782+
response_variable_names = self.get_response_variable_names(task_data)
757783
if self.check_positional_bias:
758784
criterias.extend(criterias)
759785
contexts.extend(contexts)
786+
response_variable_names.extend(response_variable_names)
760787
for response_pairs, option_pairs in zip(
761788
response_pairs_list, option_pairs_list
762789
):
@@ -776,10 +803,12 @@ def compute(
776803
"option_b": option_pair[1],
777804
"criteria_name": criterias[i].name,
778805
"criteria_description": criterias[i].description,
806+
"response_variable_name": response_variable_name,
807+
"response_variable_name_title": response_variable_name.capitalize(),
779808
"data_classification_policy": ["public"],
780809
}
781-
for i, (response_pairs, option_pairs) in enumerate(
782-
zip(response_pairs_list, option_pairs_list)
810+
for i, (response_pairs, option_pairs, response_variable_name) in enumerate(
811+
zip(response_pairs_list, option_pairs_list, response_variable_names)
783812
)
784813
for response_pair, option_pair in zip(response_pairs, option_pairs)
785814
]
@@ -853,15 +882,17 @@ def compute(
853882
{
854883
"options": [f"Response {option}" for option in option_pair],
855884
"score_option_instruction": score_option_instruction,
885+
"response_variable_name": response_variable_name,
856886
"data_classification_policy": ["public"],
857887
}
858-
for option_pair, score_option_instruction in zip(
888+
for option_pair, score_option_instruction, response_variable_name in zip(
859889
[
860890
option_pair
861891
for option_pairs in option_pairs_list
862892
for option_pair in option_pairs
863893
],
864894
score_option_instruction_list,
895+
response_variable_names
865896
)
866897
]
867898

src/unitxt/llm_as_judge_chat_templates.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
direct_template_dict = {
44
"assessment": InputOutputTemplate(
55
input_format="""
6-
You are presented with a response generated subject to a context.
7-
The context includes information relevant to the nature or generation of the response.
8-
You will assess the quality of the response subject to an evaluation criteria.
6+
You are presented with a {response_variable_name} generated subject to a context.
7+
The context includes information relevant to the nature or generation of the {response_variable_name}.
8+
You will assess the quality of the {response_variable_name} subject to an evaluation criteria.
99
###Context:
1010
{context_variables}
1111
12-
###Response:
12+
###{response_variable_name_title}:
1313
{response}
1414
1515
###Evaluation criteria:
1616
{criteria_description}
1717
{display_options_instruction}
1818
19-
Briefly assess the quality of the response subject to the evaluation criteria.
19+
Briefly assess the quality of the {response_variable_name} subject to the evaluation criteria.
2020
Focus on the evaluation criteria during assessment, do not provide a general assessment.
2121
Assessment:
2222
@@ -29,7 +29,7 @@
2929
Summary:"""
3030
),
3131
"answer": InputOutputTemplate(
32-
input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the response.
32+
input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the {response_variable_name}.
3333
###Evaluation criteria:
3434
{criteria_description}
3535
{score_option_instruction}
@@ -41,8 +41,8 @@
4141

4242
pairwise_template_dict = {
4343
"assessment": InputOutputTemplate(
44-
input_format="""You are provided a pair of responses (Response {option_a} and Response {option_b}) generated subject to a context.
45-
You will choose the better quality response subject to the evaluation criteria.
44+
input_format="""You are provided a pair of {response_variable_name}s ({response_variable_name_title} {option_a} and {response_variable_name_title} {option_b}) generated subject to a context.
45+
You will choose the better quality {response_variable_name} subject to the evaluation criteria.
4646
4747
This is the context:
4848
{context_variables}
@@ -51,25 +51,25 @@
5151
{criteria_name}
5252
{criteria_description}
5353
54-
Response {option_a}:
54+
{response_variable_name_title} {option_a}:
5555
{response_a}
56-
Response {option_b}:
56+
{response_variable_name_title} {option_b}:
5757
{response_b}
5858
59-
Keeping the evaluation criteria in mind, briefly assess which response is better.
59+
Keeping the evaluation criteria in mind, briefly assess which {response_variable_name} is better.
6060
Focus on the evaluation criteria during assessment, do not provide a general assessment.
6161
Assessment:
6262
6363
Let's think step by step """
6464
),
6565
"summarization": InputOutputTemplate(
66-
input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which response won.
66+
input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which {response_variable_name} won.
6767
6868
Assessment: {assessment}
6969
Summary:"""
7070
),
7171
"answer": InputOutputTemplate(
72-
input_format="""Now considering the evaluation criteria, which response is better quality? Only include the chosen response.
72+
input_format="""Now considering the evaluation criteria, which {response_variable_name} is better quality? Only include the chosen {response_variable_name}.
7373
{score_option_instruction}
7474
Answer: """,
7575
postprocessors=["processors.match_closest_option"],

0 commit comments

Comments
 (0)