15 changes: 15 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
@@ -57,6 +57,8 @@ class PrebuiltMetrics(Enum):

RUBRIC_BASED_TOOL_USE_QUALITY_V1 = "rubric_based_tool_use_quality_v1"

PER_TURN_USER_SIMULATOR_QUALITY_V1 = "per_turn_user_simulator_quality_v1"


MetricName: TypeAlias = Union[str, PrebuiltMetrics]
Threshold: TypeAlias = float
@@ -223,6 +225,19 @@ class MatchType(Enum):
)


class LlmBackedUserSimulatorCriterion(LlmAsAJudgeCriterion):
"""Criterion for LLM-backed User Simulator Evaluators."""

stop_signal: str = Field(
default="</finished>",
description=(
"Stop signal to validate the successful completion of a conversation."
" For optimal performance, this should match the one in the User"
" Simulator."
),
)


class EvalMetric(EvalBaseModel):
"""A metric used to evaluate a particular aspect of an eval case."""

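For orientation, a minimal sketch of how the new metric name and criterion might be wired into an EvalMetric; the criterion keyword and the 0.8 threshold are illustrative assumptions, not values taken from this diff.

from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import LlmBackedUserSimulatorCriterion
from google.adk.evaluation.eval_metrics import PrebuiltMetrics

# The stop signal should match the one configured on the User Simulator;
# "</finished>" is simply the default declared on the criterion above.
criterion = LlmBackedUserSimulatorCriterion(stop_signal="</finished>")

# Assumed wiring: EvalMetric is taken here to accept a criterion alongside the
# usual metric_name and threshold fields.
eval_metric = EvalMetric(
    metric_name=PrebuiltMetrics.PER_TURN_USER_SIMULATOR_QUALITY_V1.value,
    threshold=0.8,
    criterion=criterion,
)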
4 changes: 4 additions & 0 deletions src/google/adk/evaluation/evaluator.py
@@ -20,6 +20,7 @@
from pydantic import BaseModel
from typing_extensions import TypeAlias

from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalStatus
@@ -62,6 +63,7 @@ def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
conversation_scenario: Optional[ConversationScenario],
) -> EvaluationResult:
"""Returns EvaluationResult after performing evaluations using actual and expected invocations.

@@ -72,5 +74,7 @@
usually act as a benchmark/golden response. If these are specified
usually the expectation is that the length of this list and actual
invocation is the same.
conversation_scenario: An optional conversation scenario for multi-turn
conversations.
"""
raise NotImplementedError()
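For third-party metrics, the new argument simply extends the abstract signature shown above; a sketch of a subclass honoring it, where the class name and body are illustrative rather than part of this change:

from typing import Optional

from typing_extensions import override

from google.adk.evaluation.eval_case import ConversationScenario
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import Evaluator


class ScenarioAwareEvaluator(Evaluator):
  """Illustrative evaluator that accepts the conversation scenario."""

  @override
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: Optional[list[Invocation]],
      conversation_scenario: Optional[ConversationScenario],
  ) -> EvaluationResult:
    # Metrics that do not need the scenario can simply ignore it, which is
    # what the single-invocation metrics updated in this change do.
    del expected_invocations, conversation_scenario
    # ... score actual_invocations here ...
    return EvaluationResult()  # assumes EvaluationResult's fields all default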
2 changes: 2 additions & 0 deletions src/google/adk/evaluation/final_response_match_v1.py
@@ -20,6 +20,7 @@
from typing_extensions import override

from ..dependencies.rouge_scorer import rouge_scorer
from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import Interval
@@ -60,6 +61,7 @@ def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
_: Optional[ConversationScenario] = None,
) -> EvaluationResult:
if expected_invocations is None:
raise ValueError("expected_invocations is required for this metric.")
2 changes: 2 additions & 0 deletions src/google/adk/evaluation/hallucinations_v1.py
@@ -34,6 +34,7 @@
from ..utils.feature_decorator import experimental
from ._retry_options_utils import add_default_retry_options_if_not_present
from .app_details import AppDetails
from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_case import InvocationEvent
from .eval_case import InvocationEvents
@@ -720,6 +721,7 @@ async def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
_: Optional[ConversationScenario] = None,
) -> EvaluationResult:
# expected_invocations are not required by the metric and if they are not
# supplied, we provide a list of None to rest of the code.
2 changes: 2 additions & 0 deletions src/google/adk/evaluation/llm_as_judge.py
@@ -29,6 +29,7 @@
from ..utils.feature_decorator import experimental
from ._retry_options_utils import add_default_retry_options_if_not_present
from .common import EvalBaseModel
from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalMetric
@@ -118,6 +119,7 @@ async def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
_: Optional[ConversationScenario] = None,
) -> EvaluationResult:
if self._expected_invocations_required and expected_invocations is None:
raise ValueError("expected_invocations is needed by this metric.")
4 changes: 4 additions & 0 deletions src/google/adk/evaluation/local_eval_service.py
@@ -40,6 +40,7 @@
from .base_eval_service import InferenceRequest
from .base_eval_service import InferenceResult
from .base_eval_service import InferenceStatus
from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import EvalMetricResult
@@ -256,6 +257,7 @@ async def _evaluate_single_inference_result(
eval_metric=eval_metric,
actual_invocations=inference_result.inferences,
expected_invocations=eval_case.conversation,
conversation_scenario=eval_case.conversation_scenario,
)
except Exception as e:
# We intentionally catch the Exception as we don't want failures to
@@ -345,6 +347,7 @@ async def _evaluate_metric(
eval_metric: EvalMetric,
actual_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
conversation_scenario: Optional[ConversationScenario],
) -> EvaluationResult:
"""Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""

@@ -359,6 +362,7 @@
return await metric_evaluator.evaluate_invocations(
actual_invocations=actual_invocations,
expected_invocations=expected_invocations,
conversation_scenario=conversation_scenario,
)
else:
# Metrics that perform computation synchronously, mostly these don't
5 changes: 5 additions & 0 deletions src/google/adk/evaluation/metric_evaluator_registry.py
@@ -28,6 +28,7 @@
from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
from .rubric_based_tool_use_quality_v1 import RubricBasedToolUseV1Evaluator
from .safety_evaluator import SafetyEvaluatorV1
from .simulation.per_turn_user_simulator_quality_v1 import PerTurnUserSimulatorQualityV1
from .trajectory_evaluator import TrajectoryEvaluator

logger = logging.getLogger("google_adk." + __name__)
@@ -126,6 +127,10 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
metric_info=RubricBasedToolUseV1Evaluator.get_metric_info(),
evaluator=RubricBasedToolUseV1Evaluator,
)
metric_evaluator_registry.register_evaluator(
metric_info=PerTurnUserSimulatorQualityV1.get_metric_info(),
evaluator=PerTurnUserSimulatorQualityV1,
)

return metric_evaluator_registry

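Downstream registrations follow the same pattern as the built-in ones added above. A sketch, reusing the ScenarioAwareEvaluator outlined earlier and assuming it exposes a get_metric_info() classmethod like the built-ins; note that _get_default_metric_evaluator_registry is an internal helper:

from google.adk.evaluation.metric_evaluator_registry import _get_default_metric_evaluator_registry

metric_evaluator_registry = _get_default_metric_evaluator_registry()
metric_evaluator_registry.register_evaluator(
    metric_info=ScenarioAwareEvaluator.get_metric_info(),
    evaluator=ScenarioAwareEvaluator,
)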
2 changes: 2 additions & 0 deletions src/google/adk/evaluation/response_evaluator.py
@@ -18,6 +18,7 @@

from typing_extensions import override

from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import Interval
@@ -100,6 +101,7 @@ def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
_: Optional[ConversationScenario] = None,
) -> EvaluationResult:
# If the metric is response_match_score, just use the RougeEvaluator.
if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value: