Skip to content

Commit 8b0d2c9

Browse files
committed
add response variable name
Signed-off-by: Martín Santillán Cooper <msantillancooper@ibm.com>
1 parent e24eccb commit 8b0d2c9

File tree

2 files changed

+49
-18
lines changed

2 files changed

+49
-18
lines changed

src/unitxt/llm_as_judge.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ class LLMJudge(BulkInstanceMetric):
6363
generate_summaries: bool = True
6464
format = "formats.chat_api"
6565
include_prompts_in_result: bool = False
66+
response_variable_name_field: Optional[str] = None
67+
response_variable_name: str = "response"
6668
criteria_field: str = None
6769
criteria: Criteria = None
6870
logger = get_logger()
@@ -103,6 +105,16 @@ def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
103105
for td in task_data
104106
]
105107

108+
def get_response_variable_names(self, task_data: List[Dict[str, Any]]) -> List[str]:
109+
if self.response_variable_name_field is None:
110+
return [self.response_variable_name] * len(task_data)
111+
try:
112+
return [td[self.response_variable_name_field] for td in task_data]
113+
except KeyError as e:
114+
raise UnitxtError(
115+
f"The response variable name field `{self.response_variable_name_field}` was not found in the task data instance."
116+
) from e
117+
106118
def perform_evaluation_step(
107119
self,
108120
instances: list,
@@ -184,6 +196,8 @@ def prepare(self):
184196
"response": str,
185197
"criteria_description": str,
186198
"display_options_instruction": str,
199+
"response_variable_name": str,
200+
"response_variable_name_title": str,
187201
},
188202
reference_fields={},
189203
prediction_type=str,
@@ -202,6 +216,7 @@ def prepare(self):
202216
"criteria_description": str,
203217
"score_option_instruction": str,
204218
"options": list,
219+
"response_variable_name": str,
205220
},
206221
reference_fields={},
207222
prediction_type=str,
@@ -341,6 +356,7 @@ def compute(
341356
criterias = self.get_criterias(task_data, evaluations_count)
342357
self.set_main_score(criterias)
343358
contexts = self.get_contexts(task_data)
359+
response_variable_names = self.get_response_variable_names(task_data)
344360
if self.check_positional_bias:
345361
criterias += [
346362
CriteriaWithOptions(
@@ -352,6 +368,7 @@ def compute(
352368
for criteria in criterias
353369
]
354370
contexts += contexts
371+
response_variable_names += response_variable_names
355372
predictions += predictions
356373

357374
parsed_criterias = [
@@ -373,13 +390,16 @@ def compute(
373390
"response": prediction,
374391
"display_options_instruction": display_options_instruction,
375392
"criteria_description": criteria_description,
393+
"response_variable_name": response_variable_name,
394+
"response_variable_name_title": response_variable_name.capitalize(),
376395
"data_classification_policy": ["public"],
377396
}
378-
for context, prediction, criteria_description, display_options_instruction in zip(
397+
for context, prediction, criteria_description, display_options_instruction, response_variable_name in zip(
379398
contexts,
380399
predictions,
381400
criteria_description_list,
382401
display_options_instruction_list,
402+
response_variable_names
383403
)
384404
]
385405
assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step(
@@ -416,12 +436,14 @@ def compute(
416436
"criteria_description": criteria_description,
417437
"score_option_instruction": score_option_instruction,
418438
"options": criteria_option_names,
439+
"response_variable_name": response_variable_name,
419440
"data_classification_policy": ["public"],
420441
}
421-
for criteria_description, score_option_instruction, criteria_option_names in zip(
442+
for criteria_description, score_option_instruction, criteria_option_names, response_variable_name in zip(
422443
criteria_description_list,
423444
score_option_instruction_list,
424445
criteria_option_names_list,
446+
response_variable_names
425447
)
426448
]
427449

@@ -477,6 +499,8 @@ def prepare(self):
477499
"option_b": str,
478500
"criteria_name": str,
479501
"criteria_description": str,
502+
"response_variable_name": str,
503+
"response_variable_name_title": str,
480504
},
481505
reference_fields={},
482506
prediction_type=str,
@@ -494,6 +518,7 @@ def prepare(self):
494518
input_fields={
495519
"score_option_instruction": str,
496520
"options": list,
521+
"response_variable_name": str,
497522
},
498523
reference_fields={},
499524
prediction_type=str,
@@ -754,9 +779,11 @@ def compute(
754779

755780
criterias = self.get_criterias(task_data, instances_count)
756781
contexts = self.get_contexts(task_data)
782+
response_variable_names = self.get_response_variable_names(task_data)
757783
if self.check_positional_bias:
758784
criterias.extend(criterias)
759785
contexts.extend(contexts)
786+
response_variable_names.extend(response_variable_names)
760787
for response_pairs, option_pairs in zip(
761788
response_pairs_list, option_pairs_list
762789
):
@@ -776,10 +803,12 @@ def compute(
776803
"option_b": option_pair[1],
777804
"criteria_name": criterias[i].name,
778805
"criteria_description": criterias[i].description,
806+
"response_variable_name": response_variable_name,
807+
"response_variable_name_title": response_variable_name.capitalize(),
779808
"data_classification_policy": ["public"],
780809
}
781-
for i, (response_pairs, option_pairs) in enumerate(
782-
zip(response_pairs_list, option_pairs_list)
810+
for i, (response_pairs, option_pairs, response_variable_name) in enumerate(
811+
zip(response_pairs_list, option_pairs_list, response_variable_names)
783812
)
784813
for response_pair, option_pair in zip(response_pairs, option_pairs)
785814
]
@@ -853,15 +882,17 @@ def compute(
853882
{
854883
"options": [f"Response {option}" for option in option_pair],
855884
"score_option_instruction": score_option_instruction,
885+
"response_variable_name": response_variable_name,
856886
"data_classification_policy": ["public"],
857887
}
858-
for option_pair, score_option_instruction in zip(
888+
for option_pair, score_option_instruction, response_variable_name in zip(
859889
[
860890
option_pair
861891
for option_pairs in option_pairs_list
862892
for option_pair in option_pairs
863893
],
864894
score_option_instruction_list,
895+
response_variable_names
865896
)
866897
]
867898

src/unitxt/llm_as_judge_chat_templates.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
direct_template_dict = {
44
"assessment": InputOutputTemplate(
55
input_format="""
6-
You are presented with a response generated subject to a context.
7-
The context includes information relevant to the nature or generation of the response.
8-
You will assess the quality of the response subject to an evaluation criteria.
6+
You are presented with a {response_variable_name} generated subject to a context.
7+
The context includes information relevant to the nature or generation of the {response_variable_name}.
8+
You will assess the quality of the {response_variable_name} subject to an evaluation criteria.
99
###Context:
1010
{context_variables}
1111
12-
###Response:
12+
###{response_variable_name_title}:
1313
{response}
1414
1515
###Evaluation criteria:
1616
{criteria_description}
1717
{display_options_instruction}
1818
19-
Briefly assess the quality of the response subject to the evaluation criteria.
19+
Briefly assess the quality of the {response_variable_name} subject to the evaluation criteria.
2020
Focus on the evaluation criteria during assessment, do not provide a general assessment.
2121
Assessment:
2222
@@ -29,7 +29,7 @@
2929
Summary:"""
3030
),
3131
"answer": InputOutputTemplate(
32-
input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the response.
32+
input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the {response_variable_name}.
3333
###Evaluation criteria:
3434
{criteria_description}
3535
{score_option_instruction}
@@ -41,8 +41,8 @@
4141

4242
pairwise_template_dict = {
4343
"assessment": InputOutputTemplate(
44-
input_format="""You are provided a pair of responses (Response {option_a} and Response {option_b}) generated subject to a context.
45-
You will choose the better quality response subject to the evaluation criteria.
44+
input_format="""You are provided a pair of {response_variable_name}s ({response_variable_name_title} {option_a} and {response_variable_name_title} {option_b}) generated subject to a context.
45+
You will choose the better quality {response_variable_name} subject to the evaluation criteria.
4646
4747
This is the context:
4848
{context_variables}
@@ -51,25 +51,25 @@
5151
{criteria_name}
5252
{criteria_description}
5353
54-
Response {option_a}:
54+
{response_variable_name_title} {option_a}:
5555
{response_a}
56-
Response {option_b}:
56+
{response_variable_name_title} {option_b}:
5757
{response_b}
5858
59-
Keeping the evaluation criteria in mind, briefly assess which response is better.
59+
Keeping the evaluation criteria in mind, briefly assess which {response_variable_name} is better.
6060
Focus on the evaluation criteria during assessment, do not provide a general assessment.
6161
Assessment:
6262
6363
Let's think step by step """
6464
),
6565
"summarization": InputOutputTemplate(
66-
input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which response won.
66+
input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which {response_variable_name} won.
6767
6868
Assessment: {assessment}
6969
Summary:"""
7070
),
7171
"answer": InputOutputTemplate(
72-
input_format="""Now considering the evaluation criteria, which response is better quality? Only include the chosen response.
72+
input_format="""Now considering the evaluation criteria, which {response_variable_name} is better quality? Only include the chosen {response_variable_name}.
7373
{score_option_instruction}
7474
Answer: """,
7575
postprocessors=["processors.match_closest_option"],

0 commit comments

Comments
 (0)