2 changes: 1 addition & 1 deletion .github/workflows/catalog_consistency.yml
@@ -30,7 +30,7 @@ jobs:

- uses: actions/setup-python@v5
with:
python-version: '3.9'
python-version: '3.10'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system -e ".[tests]"
64 changes: 64 additions & 0 deletions examples/evaluate_evalassist_judge.py
@@ -0,0 +1,64 @@
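# Minimal example: score predictions with the EvalAssist direct LLM-as-judge.
# A CriteriaWithOptions defines the rubric, option_map maps each option to a
# numeric score, and the "question" field is passed to the judge as context.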
from unitxt.api import create_dataset, evaluate
from unitxt.evalassist_judge import EvalAssistLLMJudgeDirect
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge_constants import (
CriteriaWithOptions,
)

criteria = CriteriaWithOptions.from_obj(
{
"name": "Temperature in Fahrenheit and Celsius",
"description": "In the response, if there is a numerical temperature present, is it denominated in both Fahrenheit and Celsius?",
"options": [
{
"name": "Correct",
"description": "The temperature reading is provided in both Fahrenheit and Celsius.",
},
{
"name": "Partially Correct",
"description": "The temperature reading is provided either in Fahrenheit or Celsius, but not both.",
},
{
"name": "Incorrect",
"description": "There is no numerical temperature reading in the response.",
},
],
"option_map": {"Correct": 1.0, "Partially Correct": 0.5, "Incorrect": 0.0},
"context_fields": ["question"],
}
)


data = [
{"question": "How is the weather?"},
{"question": "How is the weather?"},
{"question": "How is the weather?"},
]
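# Three copies of the same question; the predictions further below cover the three
# rubric options (both units, Fahrenheit only, no numerical temperature).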

metric = EvalAssistLLMJudgeDirect(
inference_engine=CrossProviderInferenceEngine(
model="llama-3-3-70b-instruct",
max_tokens=1024,
data_classification_policy=["public"],
),
criteria=criteria,
)


dataset = create_dataset(
task="tasks.qa.open", test_set=data, metrics=[metric], split="test"
)

predictions = [
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
]

results = evaluate(predictions=predictions, data=dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores)
151 changes: 151 additions & 0 deletions examples/evaluate_faithfulness_metrics.py
@@ -0,0 +1,151 @@
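# Compare several faithfulness judges (LLM-as-judge and EvalAssist variants, plus a
# constant baseline) on RAGBench subsets: each judge's score is used as a prediction
# and evaluated against the gold faithfulness labels with binary F1.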
import json

import pandas as pd
import unitxt
from unitxt.api import evaluate, load_dataset
from unitxt.benchmark import Benchmark
from unitxt.inference import MetricInferenceEngine
from unitxt.standard import DatasetRecipe
from unitxt.templates import InputOutputTemplate

unitxt.settings.allow_unverified_code = True
unitxt.settings.dataset_cache_default = True

card_subsets = [
"covidqa",
"cuad",
"delucionqa",
"emanual",
"expertqa",
"finqa",
"hagrid",
"hotpotqa",
"msmarco",
"pubmedqa",
"tatqa",
# "techqa" # Fails due to bad char in text
]

# card_subsets = ["covidqa"]
card = "cards.rag_eval.faithfulness.ragbench"

template = InputOutputTemplate(
output_format="{number_val}",
input_format="{question}", # "CONTEXTS:{contexts}\n\n\n\QUESTION:{question}\n\n\nANSWER:{answer}",
postprocessors=["processors.cast_to_float_return_0_5_if_failed"],
)

subsets = {
card_subset: DatasetRecipe(
card=f"{card}.{card_subset}",
template=template,
metrics=[
"metrics.f1_binary",
"metrics.f1_binary[average=macro,score_prefix=macro_]",
],
)
for card_subset in card_subsets
}

benchmark = Benchmark(
format="formats.empty",
max_samples_per_subset=40,
loader_limit=300,
subsets=subsets,
)

dataset = load_dataset(
benchmark,
split="test",
)
for instance in dataset:
task_data = json.loads(instance["task_data"])


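# Map each judge metric (a catalog reference, optionally with overrides) to the
# score name under which it reports its per-instance result.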
metrics_to_score_names = {}

criterion = "metrics.llm_as_judge.direct.criteria.reference_document_faithfulness"
llm_as_judge_metric = f"metrics.llm_as_judge.direct.rits.llama3_3_70b[check_positional_bias=False,criteria={criterion}, context_fields=[contexts,question]]"
llm_score_name = "reference_document_faithfulness"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

llm_as_judge_metric = f"metrics.llm_as_judge.direct.watsonx.llama3_3_70b[check_positional_bias=False,criteria={criterion}, context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

llm_as_judge_metric = f"metrics.llm_as_judge.evalassist.direct.rits.llama3_3_70b[criteria={criterion},context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

llm_as_judge_metric = f"metrics.llm_as_judge.evalassist.direct.watsonx.llama3_3_70b[criteria={criterion},context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

criterion = "metrics.llm_as_judge.direct.criteria.reference_document_faithfulness2"
llm_score_name = "reference_document_faithfulness2"
llm_as_judge_metric = f"metrics.llm_as_judge.evalassist.direct.rits.llama3_3_70b[criteria={criterion},context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

llm_as_judge_metric = f"metrics.llm_as_judge.evalassist.direct.watsonx.llama3_3_70b[criteria={criterion},context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name


llm_as_judge_metric = (
"metrics.rag.external_rag.faithfulness.llama_3_3_70b_instruct_watsonx_judge"
)
llm_score_name = "faithfulness_judge"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name
metrics_to_score_names["all_one"] = "score"
df = pd.DataFrame(
columns=[
"metric",
"f1_macro",
"f1_faithful",
"f1_not_faithful",
"num_of_instances",
]
)

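# Run each judge as a MetricInferenceEngine so its per-instance score becomes the
# prediction, then evaluate those predictions with the f1_binary metrics defined in
# the recipes. "all_one" is a trivial always-faithful baseline.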
for metric, score_name in metrics_to_score_names.items():
# print(json.dumps(task_data,indent=4))
# print(json.dumps(instance,indent=4))
# print(instance["references"])

if metric == "all_one":
new_predictions = [1.0] * len(dataset)
else:
model = MetricInferenceEngine(metric=metric, prediction_field="answer")
predictions = model(dataset)
new_predictions = [prediction[score_name] for prediction in predictions]
results = evaluate(
predictions=new_predictions, data=dataset, calc_confidence_intervals=False
)

sums = {}
counts = {}

for _, inner_dict in results.subsets_scores.items():
if isinstance(inner_dict, dict):
for key, value in inner_dict.items():
if isinstance(value, float):
sums[key] = sums.get(key, 0) + value
counts[key] = counts.get(key, 0) + 1
averages = {key: sums[key] / counts[key] for key in sums}

df.loc[len(df)] = [
str(metric),
averages["macro_f1_binary"],
averages["f1_binary"],
averages["f1_binary_neg"],
results.global_scores["num_of_instances"],
]

print("Instance Results:")
print(results.instance_scores.summary)

print("Subsets Results (details):")
print(results.subsets_scores)

print("Subsets Results :")
print(results.subsets_scores.summary)

df = df.round(decimals=2)
print(df.to_markdown())
24 changes: 24 additions & 0 deletions prepare/metrics/llm_as_judge/evalassist_judge.py
@@ -0,0 +1,24 @@
from unitxt import add_to_catalog
from unitxt.evalassist_judge import EvalAssistLLMJudgeDirect
from unitxt.inference import CrossProviderInferenceEngine

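# Register an EvalAssist direct judge in the catalog for each provider/model pair,
# e.g. metrics.llm_as_judge.evalassist.direct.watsonx.llama3_3_70b.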
for provider in ["watsonx", "rits"]:
for model in ["llama-3-3-70b-instruct"]:
eval_assist_judge = EvalAssistLLMJudgeDirect(
inference_engine=CrossProviderInferenceEngine(
provider=provider,
model=model,
max_tokens=1024,
temperature=0.0,
)
)
if model == "llama-3-3-70b-instruct":
catalog_model = "llama3_3_70b"
else:
raise ValueError(f"Model {model} not supported")

add_to_catalog(
eval_assist_judge,
f"metrics.llm_as_judge.evalassist.direct.{provider}.{catalog_model}",
overwrite=True,
)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -108,7 +108,8 @@ tests = [
"sqlparse",
"diskcache",
"pydantic",
"jsonschema_rs"
"jsonschema_rs",
"evalassist"
]
ui = [
"gradio",
@@ -0,0 +1,37 @@
{
"__type__": "criteria_with_options",
"name": "reference_document_faithfulness2",
"description": "\n Is the prediction grounded in the reference document?\n\n To be grounded in the reference document, all the information of the prediction must either be present in the reference documentor deducible from the reference document.\n\nBase your answer only on the information in the reference document If the prediction is correct but not present in the reference document then it is not grounded.\n ",
"prediction_field": "response",
"context_fields": [
"reference_document"
],
"options": [
{
"__type__": "criteria_option",
"name": "Completely grounded",
"description": "The prediction is fully grounded in the reference document."
},
{
"__type__": "criteria_option",
"name": "Mostly grounded",
"description": "the vast majority of the information in the prediction is grounded in the reference document, but there is a small or negligible part of the prediction which is not present in the reference document"
},
{
"__type__": "criteria_option",
"name": "Somewhat grounded",
"description": "Some of the information in the prediction is grounded in the reference document."
},
{
"__type__": "criteria_option",
"name": "Not grounded",
"description": "Most or all of the information in the prediction is not grounded in the reference documemnt"
}
],
"option_map": {
"Completely Grounded": 1.0,
"Mostly grounded": 0.75,
"Somewhat grounded": 0.25,
"Not grounded": 0.0
}
}
@@ -0,0 +1,10 @@
{
"__type__": "eval_assist_llm_judge_direct",
"inference_engine": {
"__type__": "cross_provider_inference_engine",
"provider": "rits",
"model": "llama-3-3-70b-instruct",
"max_tokens": 1024,
"temperature": 0.0
}
}
@@ -0,0 +1,10 @@
{
"__type__": "eval_assist_llm_judge_direct",
"inference_engine": {
"__type__": "cross_provider_inference_engine",
"provider": "watsonx",
"model": "llama-3-3-70b-instruct",
"max_tokens": 1024,
"temperature": 0.0
}
}