IBM · lilacheden · Nov 18, 2024 · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024
diff --git a/examples/evaluate_external_rag_results_with_binary_llm_as_judge.py b/examples/evaluate_external_rag_results_with_binary_llm_as_judge.py
@@ -11,7 +11,7 @@
             "Supported foundation models available with watsonx.ai. Watsonx.ai offers numerous foundation models."
         ],
         "ground_truths": ["Many Large Language Models are supported by Watsonx.ai"],
-        "a123123": "",
+        "metadata": {"data_classification_policy": ["public"]},
     },
     {
         "question": "What foundation models are available in watsonx.ai ?",
@@ -20,6 +20,7 @@
             "Supported foundation models available with Meta. Meta AI offers numerous foundation models."
         ],
         "ground_truths": ["Many Large Language Models are supported by Watsonx.ai"],
+        "metadata": {"data_classification_policy": ["public"]},
     },
     {
         "question": "What foundation models are available in watsonx.ai ?",
@@ -28,6 +29,7 @@
             "Supported foundation models available with Meta. Meta AI offers numerous foundation models."
         ],
         "ground_truths": ["Many Large Language Models are supported by Watsonx.ai"],
+        "metadata": {"data_classification_policy": ["public"]},
     },
     {
         "question": "What foundation models are available in watsonx.ai ?",
@@ -36,6 +38,7 @@
             "Supported foundation models available with Meta. Meta AI offers numerous foundation models."
         ],
         "ground_truths": ["Many Large Language Models are supported by Watsonx.ai"],
+        "metadata": {"data_classification_policy": ["public"]},
     },
     {
         "question": "What foundation models are available in watsonx.ai ?",
@@ -44,28 +47,28 @@
             "Supported foundation models available with Meta. Meta AI offers numerous foundation models."
         ],
         "ground_truths": ["Many Large Language Models are supported by Watsonx.ai"],
+        "metadata": {"data_classification_policy": ["public"]},
     },
 ]
 
-# select the desired metrics.
-# all available metrics are under "catalog.metrics.llm_as_judge.binary"
+# Select the desired metric(s).
+# Each metric measures a certain aspect of the generated answer (answer_correctness, faithfulness,
+# answer_relevance, context_relevance and correctness_holistic).
+# All available metrics are under "catalog.metrics.rag"
+# Metrics whose names end in "logprobs" return a real-valued prediction in [0,1]; the others return a binary prediction.
+# By default, all judges use llama_3_1_70b_instruct_wml; the loop below overrides this via the inference_model argument.
 metric_names = [
-    "answer_correctness_q_a_gt_loose_logprobs",
-    "answer_correctness_q_a_gt_strict_logprobs",
-    "faithfulness_q_c_a_logprobs",
-    "faithfulness_c_a_logprobs",
-    "context_relevance_q_c_ares_logprobs",
-    "answer_relevance_q_a_logprobs",
+    "metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs",
+    "metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a_logprobs",
 ]
-metrics_path = "metrics.llm_as_judge.binary.llama_3_1_70b_instruct_wml_"
 
 # select the desired model.
 # all available models are under "catalog.engines.classification"
 model_names = [
-    "mixtral_8x7b_instruct_v01_wml",
-    # "gpt_4_turbo_openai",
+    "engines.classification.mixtral_8x7b_instruct_v01_wml",
+    "engines.classification.llama_3_1_70b_instruct_wml",
+    # "engines.classification.gpt_4_turbo_openai",
 ]
-models_path = "engines.classification"
 
 if __name__ == "__main__":
     multi_stream = MultiStream.from_iterables({"test": test_examples}, copying=True)
@@ -78,15 +81,14 @@
         for model_name in model_names:
             # override the metric with the inference model. the default model is llama_3_1_70b_instruct_wml so
             # no need to override when using it.
-            llmaj_metric_name = f"{metrics_path}{metric_name}[inference_model={models_path}.{model_name}]"
+            llmaj_metric_name = f"{metric_name}[inference_model={model_name}]"
 
             # apply the metric over the input
             metrics_operator = SequentialOperator(steps=[llmaj_metric_name])
             instances = metrics_operator(multi_stream)["test"]
             instances = list(instances)
 
-            # all scores will have this prefix
-            score_name = f"{model_name}_{metric_name}"
+            score_name = instances[0]["score"]["instance"]["score_name"]
             for i in range(len(instances)):
                 results[i][score_name] = instances[i]["score"]["instance"][score_name]
                 results[i][f"{score_name}_source"] = instances[i]["score"]["instance"][

diff --git a/prepare/metrics/llm_as_judge/binary_judge.py b/prepare/metrics/llm_as_judge/binary_judge.py
@@ -45,7 +45,7 @@ def get_prediction_field(metric_type):
                     inference_model=inference_model,
                     template=f"templates.rag_eval.{metric_type}.{template_name}{logprobs_label}",
                     task=task_name,
-                    format="formats.empty",
+                    format=None,
                     main_score=metric_label,
                     prediction_field=get_prediction_field(metric_type),
                     infer_log_probs=use_logprobs,

diff --git a/prepare/tasks/rag_eval.py b/prepare/tasks/rag_eval.py
@@ -100,7 +100,12 @@ def convert_to_dict_of_type(field_list):
             outputs=convert_to_dict_of_type(["is_correct", "number_val"]),
             metrics=rag_classification_metrics[binary_val],
             prediction_type="float",
-            defaults={"choices": ["yes", "no"], "is_correct": ["-"], "number_val": -1},
+            defaults={
+                "choices": ["yes", "no"],
+                "is_correct": ["-"],
+                "number_val": -1,
+                "contexts": ["-"],
+            },
         ),
         f"tasks.rag_eval.answer_correctness.{binary_val}",
         overwrite=True,

diff --git a/...metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_loose.json b/...metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_loose.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_loose",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...etrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_strict.json b/...etrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_strict.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.answer_correctness.judge_simplified_format",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_strict",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...xt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_relevance_q_a.json b/...xt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_relevance_q_a.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
     "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_relevance_q_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...alog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json b/...alog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
     "task": "tasks.rag_eval.context_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "context_relevance_q_c_ares",
     "prediction_field": null,
     "infer_log_probs": false

diff --git a/...alog/metrics/llm_as_judge/binary/generic_inference_engine_correctness_holistic_q_c_a.json b/...alog/metrics/llm_as_judge/binary/generic_inference_engine_correctness_holistic_q_c_a.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
     "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "correctness_holistic_q_c_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_c_a.json b/...unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_c_a.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
     "task": "tasks.rag_eval.faithfulness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "faithfulness_c_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...itxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_q_c_a.json b/...itxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_q_c_a.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
     "task": "tasks.rag_eval.faithfulness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "faithfulness_q_c_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...trics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json b/...trics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_loose",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/..._as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json b/..._as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_loose_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true

diff --git a/...rics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_strict.json b/...rics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_strict.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_simplified_format",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_strict",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_strict_logprobs.json b/...as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_strict_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_simplified_format_logprobs",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_strict_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true

diff --git a/.../catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json b/.../catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
     "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_relevance_q_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json b/...metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
     "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_relevance_q_a_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true

diff --git a/...og/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json b/...og/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
     "task": "tasks.rag_eval.context_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "context_relevance_q_c_ares",
     "prediction_field": null,
     "infer_log_probs": false

diff --git a/...s/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json b/...s/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
     "task": "tasks.rag_eval.context_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "context_relevance_q_c_ares_logprobs",
     "prediction_field": null,
     "infer_log_probs": true

diff --git a/...og/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json b/...og/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
     "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "correctness_holistic_q_c_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...s/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json b/...s/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
     "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "correctness_holistic_q_c_a_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true

diff --git a/...itxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json b/...itxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
     "task": "tasks.rag_eval.faithfulness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "faithfulness_c_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...log/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json b/...log/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
     "task": "tasks.rag_eval.faithfulness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "faithfulness_c_a_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true

diff --git a/...xt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json b/...xt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
     "task": "tasks.rag_eval.faithfulness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "faithfulness_q_c_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...g/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json b/...g/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
     "task": "tasks.rag_eval.faithfulness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "faithfulness_q_c_a_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true

diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/generic_inference_engine_q_a_gt_loose.json b/src/unitxt/catalog/metrics/rag/answer_correctness/generic_inference_engine_q_a_gt_loose.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_loose",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...unitxt/catalog/metrics/rag/answer_correctness/generic_inference_engine_q_a_gt_strict.json b/...unitxt/catalog/metrics/rag/answer_correctness/generic_inference_engine_q_a_gt_strict.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.answer_correctness.judge_simplified_format",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_strict",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...nitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json b/...nitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_loose",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...alog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json b/...alog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_loose_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true

diff --git a/...itxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_strict.json b/...itxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_strict.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_simplified_format",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_strict",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/...log/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_strict_logprobs.json b/...log/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_strict_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_correctness.judge_simplified_format_logprobs",
     "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_correctness_q_a_gt_strict_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true

diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/generic_inference_engine_q_a.json b/src/unitxt/catalog/metrics/rag/answer_relevance/generic_inference_engine_q_a.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
     "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_relevance_q_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
     "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_relevance_q_a",
     "prediction_field": "answer",
     "infer_log_probs": false

diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json
@@ -3,7 +3,7 @@
     "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
     "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
     "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "answer_relevance_q_a_logprobs",
     "prediction_field": "answer",
     "infer_log_probs": true

diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares.json b/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares.json
@@ -5,7 +5,7 @@
     },
     "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
     "task": "tasks.rag_eval.context_relevance.binary",
-    "format": "formats.empty",
+    "format": null,
     "main_score": "context_relevance_q_c_ares",
     "prediction_field": null,
     "infer_log_probs": false