10 changes: 10 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
@@ -14,6 +14,7 @@

from __future__ import annotations

import abc
from enum import Enum
from typing import Optional
from typing import Union
@@ -362,3 +363,12 @@ class MetricInfo(EvalBaseModel):
metric_value_info: MetricValueInfo = Field(
description="Information on the nature of values supported by the metric."
)


class MetricInfoProvider(abc.ABC):
"""Interface for providing MetricInfo."""

@abc.abstractmethod
def get_metric_info(self) -> MetricInfo:
"""Returns MetricInfo for a given metric."""
raise NotImplementedError
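
For orientation, a concrete provider only has to implement get_metric_info(). The sketch below is illustrative rather than part of this change: the class name, metric name, and description are hypothetical, and it assumes the models above are importable from google.adk.evaluation.eval_metrics.

from google.adk.evaluation.eval_metrics import Interval
from google.adk.evaluation.eval_metrics import MetricInfo
from google.adk.evaluation.eval_metrics import MetricInfoProvider
from google.adk.evaluation.eval_metrics import MetricValueInfo


class MyCustomMetricInfoProvider(MetricInfoProvider):
  """Hypothetical provider for a custom [0, 1]-valued metric."""

  def get_metric_info(self) -> MetricInfo:
    # The metric name and description are placeholders for illustration.
    return MetricInfo(
        metric_name="my_custom_metric",
        description="Example custom metric; values closer to 1 are better.",
        metric_value_info=MetricValueInfo(
            interval=Interval(min_value=0.0, max_value=1.0)
        ),
    )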
18 changes: 0 additions & 18 deletions src/google/adk/evaluation/final_response_match_v1.py
@@ -23,10 +23,6 @@
from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import Interval
from .eval_metrics import MetricInfo
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
@@ -42,20 +38,6 @@ class RougeEvaluator(Evaluator):
def __init__(self, eval_metric: EvalMetric):
self._eval_metric = eval_metric

@staticmethod
def get_metric_info() -> MetricInfo:
return MetricInfo(
metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
description=(
"This metric evaluates if the agent's final response matches a"
" golden/expected final response using Rouge_1 metric. Value range"
" for this metric is [0,1], with values closer to 1 more desirable."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)

@override
def evaluate_invocations(
self,
18 changes: 0 additions & 18 deletions src/google/adk/evaluation/final_response_match_v2.py
@@ -26,11 +26,7 @@
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import EvalStatus
from .eval_metrics import Interval
from .eval_metrics import LlmAsAJudgeCriterion
from .eval_metrics import MetricInfo
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .evaluator import EvaluationResult
from .evaluator import PerInvocationResult
from .llm_as_judge import AutoRaterScore
@@ -154,20 +150,6 @@ def __init__(
)
self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT

@staticmethod
def get_metric_info() -> MetricInfo:
return MetricInfo(
metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
description=(
"This metric evaluates if the agent's final response matches a"
" golden/expected final response using LLM as a judge. Value range"
" for this metric is [0,1], with values closer to 1 more desirable."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)

@override
def format_auto_rater_prompt(
self,
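
Call sites that previously used the removed staticmethod would now fetch the same MetricInfo from the provider module introduced later in this change; a minimal sketch, assuming such external callers exist:

# Before this change (staticmethod removed above):
# info = FinalResponseMatchV2Evaluator.get_metric_info()

# After this change, the metadata comes from the provider module.
from google.adk.evaluation.metric_info_providers import FinalResponseMatchV2EvaluatorMetricInfoProvider

info = FinalResponseMatchV2EvaluatorMetricInfoProvider().get_metric_info()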
19 changes: 0 additions & 19 deletions src/google/adk/evaluation/hallucinations_v1.py
@@ -40,10 +40,6 @@
from .eval_case import InvocationEvents
from .eval_metrics import EvalMetric
from .eval_metrics import HallucinationsCriterion
from .eval_metrics import Interval
from .eval_metrics import MetricInfo
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
@@ -310,21 +306,6 @@ def _setup_auto_rater(self) -> BaseLlm:
llm_class = llm_registry.resolve(model_id)
return llm_class(model=model_id)

@staticmethod
def get_metric_info() -> MetricInfo:
return MetricInfo(
metric_name=PrebuiltMetrics.HALLUCINATIONS_V1.value,
description=(
"This metric assesses whether a model response contains any false,"
" contradictory, or unsupported claims using a LLM as judge. Value"
" range for this metric is [0,1], with values closer to 1 more"
" desirable."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)

def _create_context_for_step(
self,
app_details: Optional[AppDetails],
30 changes: 19 additions & 11 deletions src/google/adk/evaluation/metric_evaluator_registry.py
@@ -24,6 +24,14 @@
from .evaluator import Evaluator
from .final_response_match_v2 import FinalResponseMatchV2Evaluator
from .hallucinations_v1 import HallucinationsV1Evaluator
+from .metric_info_providers import FinalResponseMatchV2EvaluatorMetricInfoProvider
+from .metric_info_providers import HallucinationsV1EvaluatorMetricInfoProvider
+from .metric_info_providers import PerTurnUserSimulatorQualityV1MetricInfoProvider
+from .metric_info_providers import ResponseEvaluatorMetricInfoProvider
+from .metric_info_providers import RubricBasedFinalResponseQualityV1EvaluatorMetricInfoProvider
+from .metric_info_providers import RubricBasedToolUseV1EvaluatorMetricInfoProvider
+from .metric_info_providers import SafetyEvaluatorV1MetricInfoProvider
+from .metric_info_providers import TrajectoryEvaluatorMetricInfoProvider
from .response_evaluator import ResponseEvaluator
from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
from .rubric_based_tool_use_quality_v1 import RubricBasedToolUseV1Evaluator
@@ -91,44 +99,44 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
metric_evaluator_registry = MetricEvaluatorRegistry()

metric_evaluator_registry.register_evaluator(
-      metric_info=TrajectoryEvaluator.get_metric_info(),
+      metric_info=TrajectoryEvaluatorMetricInfoProvider().get_metric_info(),
evaluator=TrajectoryEvaluator,
)

metric_evaluator_registry.register_evaluator(
-      metric_info=ResponseEvaluator.get_metric_info(
+      metric_info=ResponseEvaluatorMetricInfoProvider(
PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value
-      ),
+      ).get_metric_info(),
evaluator=ResponseEvaluator,
)
metric_evaluator_registry.register_evaluator(
-      metric_info=ResponseEvaluator.get_metric_info(
+      metric_info=ResponseEvaluatorMetricInfoProvider(
PrebuiltMetrics.RESPONSE_MATCH_SCORE.value
-      ),
+      ).get_metric_info(),
evaluator=ResponseEvaluator,
)
metric_evaluator_registry.register_evaluator(
-      metric_info=SafetyEvaluatorV1.get_metric_info(),
+      metric_info=SafetyEvaluatorV1MetricInfoProvider().get_metric_info(),
evaluator=SafetyEvaluatorV1,
)
metric_evaluator_registry.register_evaluator(
-      metric_info=FinalResponseMatchV2Evaluator.get_metric_info(),
+      metric_info=FinalResponseMatchV2EvaluatorMetricInfoProvider().get_metric_info(),
evaluator=FinalResponseMatchV2Evaluator,
)
metric_evaluator_registry.register_evaluator(
-      metric_info=RubricBasedFinalResponseQualityV1Evaluator.get_metric_info(),
+      metric_info=RubricBasedFinalResponseQualityV1EvaluatorMetricInfoProvider().get_metric_info(),
evaluator=RubricBasedFinalResponseQualityV1Evaluator,
)
metric_evaluator_registry.register_evaluator(
-      metric_info=HallucinationsV1Evaluator.get_metric_info(),
+      metric_info=HallucinationsV1EvaluatorMetricInfoProvider().get_metric_info(),
evaluator=HallucinationsV1Evaluator,
)
metric_evaluator_registry.register_evaluator(
-      metric_info=RubricBasedToolUseV1Evaluator.get_metric_info(),
+      metric_info=RubricBasedToolUseV1EvaluatorMetricInfoProvider().get_metric_info(),
evaluator=RubricBasedToolUseV1Evaluator,
)
metric_evaluator_registry.register_evaluator(
-      metric_info=PerTurnUserSimulatorQualityV1.get_metric_info(),
+      metric_info=PerTurnUserSimulatorQualityV1MetricInfoProvider().get_metric_info(),
evaluator=PerTurnUserSimulatorQualityV1,
)
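
For illustration, the same registration pattern should also work outside the default helper; a minimal sketch, assuming MetricEvaluatorRegistry can be constructed standalone and reusing only classes that appear in this diff:

from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.metric_evaluator_registry import MetricEvaluatorRegistry
from google.adk.evaluation.metric_info_providers import ResponseEvaluatorMetricInfoProvider
from google.adk.evaluation.response_evaluator import ResponseEvaluator

# Build a registry and register ResponseEvaluator for the response-match
# metric, mirroring _get_default_metric_evaluator_registry above.
registry = MetricEvaluatorRegistry()
registry.register_evaluator(
    metric_info=ResponseEvaluatorMetricInfoProvider(
        PrebuiltMetrics.RESPONSE_MATCH_SCORE.value
    ).get_metric_info(),
    evaluator=ResponseEvaluator,
)

This keeps evaluator classes free of registry metadata: the provider owns the MetricInfo, and the registry simply pairs the two at registration time.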

185 changes: 185 additions & 0 deletions src/google/adk/evaluation/metric_info_providers.py
@@ -0,0 +1,185 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from .eval_metrics import Interval
from .eval_metrics import MetricInfo
from .eval_metrics import MetricInfoProvider
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics


class TrajectoryEvaluatorMetricInfoProvider(MetricInfoProvider):
"""Metric info provider for TrajectoryEvaluator."""

def get_metric_info(self) -> MetricInfo:
return MetricInfo(
metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
description=(
"This metric compares two tool call trajectories (expected vs."
" actual) for the same user interaction. It performs an exact match"
" on the tool name and arguments for each step in the trajectory."
" A score of 1.0 indicates a perfect match, while 0.0 indicates a"
" mismatch. Higher values are better."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)


class ResponseEvaluatorMetricInfoProvider(MetricInfoProvider):
"""Metric info provider for ResponseEvaluator."""

def __init__(self, metric_name: str):
self._metric_name = metric_name

def get_metric_info(self) -> MetricInfo:
"""Returns MetricInfo for the given metric name."""
if PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value == self._metric_name:
return MetricInfo(
metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
description=(
"This metric evaluates how coherent agent's response was. Value"
" range of this metric is [1,5], with values closer to 5 more"
" desirable."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=1.0, max_value=5.0)
),
)
elif PrebuiltMetrics.RESPONSE_MATCH_SCORE.value == self._metric_name:
return MetricInfo(
metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
description=(
"This metric evaluates if the agent's final response matches a"
" golden/expected final response using Rouge_1 metric. Value"
" range for this metric is [0,1], with values closer to 1 more"
" desirable."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)
else:
raise ValueError(f"`{self._metric_name}` is not supported.")


class SafetyEvaluatorV1MetricInfoProvider(MetricInfoProvider):
"""Metric info provider for SafetyEvaluatorV1."""

def get_metric_info(self) -> MetricInfo:
return MetricInfo(
metric_name=PrebuiltMetrics.SAFETY_V1.value,
description=(
"This metric evaluates the safety (harmlessness) of an Agent's"
" Response. Value range of the metric is [0, 1], with values closer"
" to 1 to be more desirable (safe)."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)


class FinalResponseMatchV2EvaluatorMetricInfoProvider(MetricInfoProvider):
"""Metric info provider for FinalResponseMatchV2Evaluator."""

def get_metric_info(self) -> MetricInfo:
return MetricInfo(
metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
description=(
"This metric evaluates if the agent's final response matches a"
" golden/expected final response using LLM as a judge. Value range"
" for this metric is [0,1], with values closer to 1 more desirable."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)


class RubricBasedFinalResponseQualityV1EvaluatorMetricInfoProvider(
MetricInfoProvider
):
"""Metric info provider for RubricBasedFinalResponseQualityV1Evaluator."""

def get_metric_info(self) -> MetricInfo:
return MetricInfo(
metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
description=(
"This metric assess if the agent's final response against a set of"
" rubrics using LLM as a judge. Value range for this metric is"
" [0,1], with values closer to 1 more desirable."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)


class HallucinationsV1EvaluatorMetricInfoProvider(MetricInfoProvider):
"""Metric info provider for HallucinationsV1Evaluator."""

def get_metric_info(self) -> MetricInfo:
return MetricInfo(
metric_name=PrebuiltMetrics.HALLUCINATIONS_V1.value,
description=(
"This metric assesses whether a model response contains any false,"
" contradictory, or unsupported claims using a LLM as judge. Value"
" range for this metric is [0,1], with values closer to 1 more"
" desirable."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)


class RubricBasedToolUseV1EvaluatorMetricInfoProvider(MetricInfoProvider):
"""Metric info provider for RubricBasedToolUseV1Evaluator."""

def get_metric_info(self) -> MetricInfo:
return MetricInfo(
metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value,
description=(
"This metric assess if the agent's usage of tools against a set of"
" rubrics using LLM as a judge. Value range for this metric is"
" [0,1], with values closer to 1 more desirable."
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)


class PerTurnUserSimulatorQualityV1MetricInfoProvider(MetricInfoProvider):
"""Metric info provider for PerTurnUserSimulatorQualityV1."""

def get_metric_info(self) -> MetricInfo:
return MetricInfo(
        metric_name=PrebuiltMetrics.PER_TURN_USER_SIMULATOR_QUALITY_V1.value,
description=(
"This metric evaluates if the user messages generated by a "
"user simulator follow the given conversation scenario. It "
"validates each message separately. The resulting metric "
"computes the percentage of user messages that we mark as "
"valid. The value range for this metric is [0,1], with values "
"closer to 1 more desirable. "
),
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)
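
As a usage note, callers can read metric metadata straight from a provider. The sketch below only touches fields defined by the MetricInfo model in this change (metric_name, metric_value_info.interval) and uses a deliberately unsupported metric name to show the ValueError path; it assumes the package is importable as google.adk.

from google.adk.evaluation.metric_info_providers import ResponseEvaluatorMetricInfoProvider
from google.adk.evaluation.metric_info_providers import TrajectoryEvaluatorMetricInfoProvider

# Pull the metadata for the tool-trajectory metric and inspect its bounds.
info = TrajectoryEvaluatorMetricInfoProvider().get_metric_info()
print(info.metric_name)
interval = info.metric_value_info.interval
print(interval.min_value, interval.max_value)

# The response provider is parameterized by metric name; an unsupported
# name raises ValueError, per the else branch above.
try:
  ResponseEvaluatorMetricInfoProvider("not_a_real_metric").get_metric_info()
except ValueError as err:
  print(err)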