2 changes: 1 addition & 1 deletion .github/workflows/catalog_consistency.yml
@@ -30,7 +30,7 @@ jobs:

- uses: actions/setup-python@v5
with:
python-version: '3.9'
python-version: '3.10'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system -e ".[tests]"
64 changes: 64 additions & 0 deletions examples/evaluate_evalassist_judge.py
@@ -0,0 +1,64 @@
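# Minimal example: score predictions with the EvalAssist direct LLM-as-judge.
# A CriteriaWithOptions defines the rubric, option_map maps each option to a
# numeric score, and the "question" field is passed to the judge as context.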
from unitxt.api import create_dataset, evaluate
from unitxt.evalassist_judge import EvalAssistLLMJudgeDirect
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge_constants import (
CriteriaWithOptions,
)

criteria = CriteriaWithOptions.from_obj(
{
"name": "Temperature in Fahrenheit and Celsius",
"description": "In the response, if there is a numerical temperature present, is it denominated in both Fahrenheit and Celsius?",
"options": [
{
"name": "Correct",
"description": "The temperature reading is provided in both Fahrenheit and Celsius.",
},
{
"name": "Partially Correct",
"description": "The temperature reading is provided either in Fahrenheit or Celsius, but not both.",
},
{
"name": "Incorrect",
"description": "There is no numerical temperature reading in the response.",
},
],
"option_map": {"Correct": 1.0, "Partially Correct": 0.5, "Incorrect": 0.0},
"context_fields": ["question"],
}
)


data = [
{"question": "How is the weather?"},
{"question": "How is the weather?"},
{"question": "How is the weather?"},
]
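# Three copies of the same question; the predictions further below cover the three
# rubric options (both units, Fahrenheit only, no numerical temperature).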

metric = EvalAssistLLMJudgeDirect(
inference_engine=CrossProviderInferenceEngine(
model="llama-3-3-70b-instruct",
max_tokens=1024,
data_classification_policy=["public"],
),
criteria=criteria,
)


dataset = create_dataset(
task="tasks.qa.open", test_set=data, metrics=[metric], split="test"
)

predictions = [
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
]

results = evaluate(predictions=predictions, data=dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores)
151 changes: 151 additions & 0 deletions examples/evaluate_faithfulness_metrics.py
@@ -0,0 +1,151 @@
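# Compare several faithfulness judges (LLM-as-judge and EvalAssist variants, plus a
# constant baseline) on RAGBench subsets: each judge's score is used as a prediction
# and evaluated against the gold faithfulness labels with binary F1.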
import json

import pandas as pd
import unitxt
from unitxt.api import evaluate, load_dataset
from unitxt.benchmark import Benchmark
from unitxt.inference import MetricInferenceEngine
from unitxt.standard import DatasetRecipe
from unitxt.templates import InputOutputTemplate

unitxt.settings.allow_unverified_code = True
unitxt.settings.dataset_cache_default = True

card_subsets = [
"covidqa",
"cuad",
"delucionqa",
"emanual",
"expertqa",
"finqa",
"hagrid",
"hotpotqa",
"msmarco",
"pubmedqa",
"tatqa",
# "techqa" # Fails due to bad char in text
]

# card_subsets = ["covidqa"]
card = "cards.rag_eval.faithfulness.ragbench"

template = InputOutputTemplate(
output_format="{number_val}",
input_format="{question}", # "CONTEXTS:{contexts}\n\n\n\QUESTION:{question}\n\n\nANSWER:{answer}",
postprocessors=["processors.cast_to_float_return_0_5_if_failed"],
)

subsets = {
card_subset: DatasetRecipe(
card=f"{card}.{card_subset}",
template=template,
metrics=[
"metrics.f1_binary",
"metrics.f1_binary[average=macro,score_prefix=macro_]",
],
)
for card_subset in card_subsets
}

benchmark = Benchmark(
format="formats.empty",
max_samples_per_subset=40,
loader_limit=300,
subsets=subsets,
)

dataset = load_dataset(
benchmark,
split="test",
)
for instance in dataset:
task_data = json.loads(instance["task_data"])


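# Map each judge metric (a catalog reference, optionally with overrides) to the
# score name under which it reports its per-instance result.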
metrics_to_score_names = {}

criterion = "metrics.llm_as_judge.direct.criteria.reference_document_faithfulness"
llm_as_judge_metric = f"metrics.llm_as_judge.direct.rits.llama3_3_70b[check_positional_bias=False,criteria={criterion}, context_fields=[contexts,question]]"
llm_score_name = "reference_document_faithfulness"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

llm_as_judge_metric = f"metrics.llm_as_judge.direct.watsonx.llama3_3_70b[check_positional_bias=False,criteria={criterion}, context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

llm_as_judge_metric = f"metrics.llm_as_judge.evalassist.direct.rits.llama3_3_70b[criteria={criterion},context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

llm_as_judge_metric = f"metrics.llm_as_judge.evalassist.direct.watsonx.llama3_3_70b[criteria={criterion},context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

criterion = "metrics.llm_as_judge.direct.criteria.reference_document_faithfulness2"
llm_score_name = "reference_document_faithfulness2"
llm_as_judge_metric = f"metrics.llm_as_judge.evalassist.direct.rits.llama3_3_70b[criteria={criterion},context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name

llm_as_judge_metric = f"metrics.llm_as_judge.evalassist.direct.watsonx.llama3_3_70b[criteria={criterion},context_fields=[contexts,question]]"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name


llm_as_judge_metric = (
"metrics.rag.external_rag.faithfulness.llama_3_3_70b_instruct_watsonx_judge"
)
llm_score_name = "faithfulness_judge"
metrics_to_score_names[llm_as_judge_metric] = llm_score_name
metrics_to_score_names["all_one"] = "score"
df = pd.DataFrame(
columns=[
"metric",
"f1_macro",
"f1_faithful",
"f1_not_faithful",
"num_of_instances",
]
)

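# Run each judge as a MetricInferenceEngine so its per-instance score becomes the
# prediction, then evaluate those predictions with the f1_binary metrics defined in
# the recipes. "all_one" is a trivial always-faithful baseline.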
for metric, score_name in metrics_to_score_names.items():
# print(json.dumps(task_data,indent=4))
# print(json.dumps(instance,indent=4))
# print(instance["references"])

if metric == "all_one":
new_predictions = [1.0] * len(dataset)
else:
model = MetricInferenceEngine(metric=metric, prediction_field="answer")
predictions = model(dataset)
new_predictions = [prediction[score_name] for prediction in predictions]
results = evaluate(
predictions=new_predictions, data=dataset, calc_confidence_intervals=False
)

sums = {}
counts = {}

for _, inner_dict in results.subsets_scores.items():
if isinstance(inner_dict, dict):
for key, value in inner_dict.items():
if isinstance(value, float):
sums[key] = sums.get(key, 0) + value
counts[key] = counts.get(key, 0) + 1
averages = {key: sums[key] / counts[key] for key in sums}

df.loc[len(df)] = [
str(metric),
averages["macro_f1_binary"],
averages["f1_binary"],
averages["f1_binary_neg"],
results.global_scores["num_of_instances"],
]

print("Instance Results:")
print(results.instance_scores.summary)

print("Subsets Results (details):")
print(results.subsets_scores)

print("Subsets Results :")
print(results.subsets_scores.summary)

df = df.round(decimals=2)
print(df.to_markdown())
24 changes: 24 additions & 0 deletions prepare/metrics/llm_as_judge/evalassist_judge.py
@@ -0,0 +1,24 @@
from unitxt import add_to_catalog
from unitxt.evalassist_judge import EvalAssistLLMJudgeDirect
from unitxt.inference import CrossProviderInferenceEngine

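# Register an EvalAssist direct judge in the catalog for each provider/model pair,
# e.g. metrics.llm_as_judge.evalassist.direct.watsonx.llama3_3_70b.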
for provider in ["watsonx", "rits"]:
for model in ["llama-3-3-70b-instruct"]:
eval_assist_judge = EvalAssistLLMJudgeDirect(
inference_engine=CrossProviderInferenceEngine(
provider=provider,
model=model,
max_tokens=1024,
temperature=0.0,
)
)
if model == "llama-3-3-70b-instruct":
catalog_model = "llama3_3_70b"
else:
raise ValueError(f"Model {model} not supported")

add_to_catalog(
eval_assist_judge,
f"metrics.llm_as_judge.evalassist.direct.{provider}.{catalog_model}",
overwrite=True,
)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -108,7 +108,8 @@ tests = [
"sqlparse",
"diskcache",
"pydantic",
"jsonschema_rs"
"jsonschema_rs",
"evalassist"
]
ui = [
"gradio",
@@ -0,0 +1,37 @@
{
"__type__": "criteria_with_options",
"name": "reference_document_faithfulness2",
"description": "\n Is the prediction grounded in the reference document?\n\n To be grounded in the reference document, all the information of the prediction must either be present in the reference documentor deducible from the reference document.\n\nBase your answer only on the information in the reference document If the prediction is correct but not present in the reference document then it is not grounded.\n ",
"prediction_field": "response",
"context_fields": [
"reference_document"
],
"options": [
{
"__type__": "criteria_option",
"name": "Completely grounded",
"description": "The prediction is fully grounded in the reference document."
},
{
"__type__": "criteria_option",
"name": "Mostly grounded",
"description": "the vast majority of the information in the prediction is grounded in the reference document, but there is a small or negligible part of the prediction which is not present in the reference document"
},
{
"__type__": "criteria_option",
"name": "Somewhat grounded",
"description": "Some of the information in the prediction is grounded in the reference document."
},
{
"__type__": "criteria_option",
"name": "Not grounded",
"description": "Most or all of the information in the prediction is not grounded in the reference documemnt"
}
],
"option_map": {
"Completely Grounded": 1.0,
"Mostly grounded": 0.75,
"Somewhat grounded": 0.25,
"Not grounded": 0.0
}
}
@@ -0,0 +1,10 @@
{
"__type__": "eval_assist_llm_judge_direct",
"inference_engine": {
"__type__": "cross_provider_inference_engine",
"provider": "rits",
"model": "llama-3-3-70b-instruct",
"max_tokens": 1024,
"temperature": 0.0
}
}
@@ -0,0 +1,10 @@
{
"__type__": "eval_assist_llm_judge_direct",
"inference_engine": {
"__type__": "cross_provider_inference_engine",
"provider": "watsonx",
"model": "llama-3-3-70b-instruct",
"max_tokens": 1024,
"temperature": 0.0
}
}