2 changes: 1 addition & 1 deletion .github/workflows/check-file-contents.yml
@@ -96,7 +96,7 @@ jobs:
echo ""

set +e
FILES_WITH_FORBIDDEN_IMPORT=$(grep -lE '^from.*cli.*import.*$' $CHANGED_FILES)
FILES_WITH_FORBIDDEN_IMPORT=$(grep -lE '^from.*\bcli\b.*import.*$' $CHANGED_FILES)
GREP_EXIT_CODE=$?
set -e

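For context on the regex change above, a minimal sketch of the difference the `\b` word boundaries make; the sample import lines are hypothetical, not taken from the repository, and assume GNU grep:

```bash
# Old pattern: plain substring match, so "cli_helpers" is a false positive.
echo "from cli_helpers import foo" | grep -cE '^from.*cli.*import.*$'               # prints 1
# New pattern: \b requires "cli" to stand alone as a path segment.
echo "from cli_helpers import foo" | grep -cE '^from.*\bcli\b.*import.*$'           # prints 0
echo "from ..cli.utils import agent_loader" | grep -cE '^from.*\bcli\b.*import.*$'  # prints 1
```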
4 changes: 2 additions & 2 deletions contributing/samples/toolbox_agent/README.md
@@ -26,10 +26,10 @@ Install SQLite from [https://sqlite.org/](https://sqlite.org/)

### 3. Install Required Python Dependencies

**Important**: The ADK's `ToolboxToolset` class requires the `toolbox-core` package, which is not automatically installed with the ADK. Install it using:
**Important**: The ADK's `ToolboxToolset` class requires the `toolbox-adk` package, which is not automatically installed with the ADK. Install it using:

```bash
pip install toolbox-core
pip install google-adk[toolbox]
```

### 4. Create Database (Optional)
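As a quick follow-up to the install step above, a hedged sanity check; the import path mirrors the `tools.toolbox_toolset.ToolboxToolset` comment in pyproject.toml, and the `[toolbox]` extra name matches the new pyproject.toml entry below:

```bash
pip install "google-adk[toolbox]"
# Should print "ok" once toolbox-adk has been pulled in by the extra.
python -c "from google.adk.tools.toolbox_toolset import ToolboxToolset; print('ok')"
```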
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -157,11 +157,12 @@ extensions = [
"llama-index-readers-file>=0.4.0", # For retrieval using LlamaIndex.
"llama-index-embeddings-google-genai>=0.3.0", # For files retrieval using LlamaIndex.
"lxml>=5.3.0", # For load_web_page tool.
"toolbox-adk>=0.1.0", # For tools.toolbox_toolset.ToolboxToolset
"toolbox-adk>=0.5.7, <0.6.0", # For tools.toolbox_toolset.ToolboxToolset
]

otel-gcp = ["opentelemetry-instrumentation-google-genai>=0.3b0, <1.0.0"]

toolbox = ["toolbox-adk>=0.5.7, <0.6.0"]

[tool.pyink]
# Format py files following Google style-guide
3 changes: 2 additions & 1 deletion src/google/adk/agents/remote_a2a_agent.py
@@ -443,7 +443,8 @@ async def _handle_a2a_response(
and event.content is not None
and event.content.parts
):
event.content.parts[0].thought = True
for part in event.content.parts:
part.thought = True
elif (
isinstance(update, A2ATaskStatusUpdateEvent)
and update.status
1 change: 1 addition & 0 deletions src/google/adk/cli/adk_web_server.py
@@ -330,6 +330,7 @@ class AppInfo(common.BaseModel):
root_agent_name: str
description: str
language: Literal["yaml", "python"]
is_computer_use: bool = False


class ListAppsResponse(common.BaseModel):
16 changes: 16 additions & 0 deletions src/google/adk/cli/cli_eval.py
@@ -34,6 +34,9 @@
from ..evaluation.eval_case import get_all_tool_calls
from ..evaluation.eval_case import IntermediateDataType
from ..evaluation.eval_metrics import EvalMetric
from ..evaluation.eval_metrics import Interval
from ..evaluation.eval_metrics import MetricInfo
from ..evaluation.eval_metrics import MetricValueInfo
from ..evaluation.eval_result import EvalCaseResult
from ..evaluation.eval_sets_manager import EvalSetsManager
from ..utils.context_utils import Aclosing
@@ -70,6 +73,19 @@ def _get_agent_module(agent_module_file_path: str):
return _import_from_path(module_name, file_path)


def get_default_metric_info(
metric_name: str, description: str = ""
) -> MetricInfo:
"""Returns a default MetricInfo for a metric."""
return MetricInfo(
metric_name=metric_name,
description=description,
metric_value_info=MetricValueInfo(
interval=Interval(min_value=0.0, max_value=1.0)
),
)


def get_root_agent(agent_module_file_path: str) -> Agent:
"""Returns root agent given the agent module."""
agent_module = _get_agent_module(agent_module_file_path)
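Illustrative usage of the helper above for a custom metric that declares no `MetricInfo` of its own; the metric name is a placeholder, and the assertions assume the field names used in the constructor:

```python
from google.adk.cli.cli_eval import get_default_metric_info

info = get_default_metric_info("word_count", description="Counts response words.")
# Falls back to a [0.0, 1.0] score interval when no metric_info is configured.
assert info.metric_name == "word_count"
assert info.metric_value_info.interval.min_value == 0.0
assert info.metric_value_info.interval.max_value == 1.0
```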
24 changes: 24 additions & 0 deletions src/google/adk/cli/cli_tools_click.py
@@ -712,8 +712,11 @@ def cli_eval(
logs.setup_adk_logger(getattr(logging, log_level.upper()))

try:
import importlib

from ..evaluation.base_eval_service import InferenceConfig
from ..evaluation.base_eval_service import InferenceRequest
from ..evaluation.custom_metric_evaluator import _CustomMetricEvaluator
from ..evaluation.eval_config import get_eval_metrics_from_config
from ..evaluation.eval_config import get_evaluation_criteria_or_default
from ..evaluation.eval_result import EvalCaseResult
@@ -723,9 +726,11 @@
from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
from ..evaluation.local_eval_sets_manager import load_eval_set_from_file
from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
from ..evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
from ..evaluation.simulation.user_simulator_provider import UserSimulatorProvider
from .cli_eval import _collect_eval_results
from .cli_eval import _collect_inferences
from .cli_eval import get_default_metric_info
from .cli_eval import get_root_agent
from .cli_eval import parse_and_get_evals_to_run
from .cli_eval import pretty_print_eval_result
@@ -818,11 +823,30 @@
)

try:
metric_evaluator_registry = DEFAULT_METRIC_EVALUATOR_REGISTRY
if eval_config.custom_metrics:
for (
metric_name,
config,
) in eval_config.custom_metrics.items():
if config.metric_info:
metric_info = config.metric_info.model_copy()
metric_info.metric_name = metric_name
else:
metric_info = get_default_metric_info(
metric_name=metric_name, description=config.description
)

metric_evaluator_registry.register_evaluator(
metric_info, _CustomMetricEvaluator
)

eval_service = LocalEvalService(
root_agent=root_agent,
eval_sets_manager=eval_sets_manager,
eval_set_results_manager=eval_set_results_manager,
user_simulator_provider=user_simulator_provider,
metric_evaluator_registry=metric_evaluator_registry,
)

inference_results = asyncio.run(
6 changes: 6 additions & 0 deletions src/google/adk/cli/utils/agent_loader.py
@@ -32,6 +32,7 @@
from ...agents import config_agent_utils
from ...agents.base_agent import BaseAgent
from ...apps.app import App
from ...tools.computer_use.computer_use_toolset import ComputerUseToolset
from ...utils.feature_decorator import experimental
from .base_agent_loader import BaseAgentLoader

@@ -358,12 +359,17 @@ def list_agents_detailed(self) -> list[dict[str, Any]]:
agent = loaded

language = self._determine_agent_language(agent_name)
is_computer_use = any(
isinstance(t, ComputerUseToolset)
for t in getattr(agent, "tools", [])
)

app_info = {
"name": agent_name,
"root_agent_name": agent.name,
"description": agent.description,
"language": language,
"is_computer_use": is_computer_use,
}
apps_info.append(app_info)

79 changes: 79 additions & 0 deletions src/google/adk/evaluation/custom_metric_evaluator.py
@@ -0,0 +1,79 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import importlib
import inspect
from typing import Callable
from typing import Optional

from typing_extensions import override

from .eval_case import ConversationScenario
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator


def _get_metric_function(
custom_function_path: str,
) -> Callable[..., EvaluationResult]:
"""Returns the custom metric function from the given path."""
try:
module_name, function_name = custom_function_path.rsplit(".", 1)
module = importlib.import_module(module_name)
metric_function = getattr(module, function_name)
return metric_function
except (ImportError, AttributeError, ValueError) as e:
raise ImportError(
f"Could not import custom metric function from {custom_function_path}"
) from e


def _get_eval_status(score: Optional[float], threshold: float) -> EvalStatus:
if score is None:
return EvalStatus.NOT_EVALUATED
return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED


class _CustomMetricEvaluator(Evaluator):
"""Evaluator for custom metrics."""

def __init__(self, eval_metric: EvalMetric, custom_function_path: str):
self._eval_metric = eval_metric
self._metric_function = _get_metric_function(custom_function_path)

@override
async def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
conversation_scenario: Optional[ConversationScenario] = None,
) -> EvaluationResult:
if inspect.iscoroutinefunction(self._metric_function):
eval_result = await self._metric_function(
actual_invocations, expected_invocations, conversation_scenario
)
else:
eval_result = self._metric_function(
actual_invocations, expected_invocations, conversation_scenario
)

eval_result.overall_eval_status = _get_eval_status(
eval_result.overall_score, self._eval_metric.threshold
)
return eval_result
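To make the new evaluator concrete, here is a minimal sketch of a custom metric function it could load through `code_config.name`. The function name and scoring logic are illustrative; it assumes `Invocation` exposes `final_response` and that `EvaluationResult.overall_score` defaults to `None`, which `_get_eval_status` maps to `NOT_EVALUATED`:

```python
from typing import Optional

from google.adk.evaluation.eval_case import ConversationScenario
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.evaluator import EvaluationResult


def non_empty_response_metric(
    actual_invocations: list[Invocation],
    expected_invocations: Optional[list[Invocation]],
    conversation_scenario: Optional[ConversationScenario] = None,
) -> EvaluationResult:
  """Scores the fraction of invocations that produced a final response."""
  if not actual_invocations:
    # Leaving overall_score unset yields NOT_EVALUATED after thresholding.
    return EvaluationResult()
  scored = sum(
      1 for inv in actual_invocations if inv.final_response is not None
  )
  return EvaluationResult(overall_score=scored / len(actual_invocations))
```

Async functions work the same way, since `evaluate_invocations` awaits coroutine functions before applying the threshold.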
87 changes: 64 additions & 23 deletions src/google/adk/evaluation/eval_config.py
@@ -28,12 +28,46 @@
from ..agents.common_configs import CodeConfig
from ..evaluation.eval_metrics import EvalMetric
from .eval_metrics import BaseCriterion
from .eval_metrics import MetricInfo
from .eval_metrics import Threshold
from .simulation.user_simulator import BaseUserSimulatorConfig

logger = logging.getLogger("google_adk." + __name__)


class CustomMetricConfig(BaseModel):
"""Configuration for a custom metric."""

model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
)

code_config: CodeConfig = Field(
description=(
"Code config for the custom metric, used to locate the custom metric"
" function."
)
)
metric_info: Optional[MetricInfo] = Field(
default=None,
description="Metric info for the custom metric.",
)
description: str = Field(
default="",
description="Description for the custom metric info.",
)

@model_validator(mode="after")
def check_code_config_args(self) -> "CustomMetricConfig":
"""Checks that the code config does not have args."""
if self.code_config.args:
raise ValueError(
"args field in CodeConfig for custom metric is not supported."
)
return self


class EvalConfig(BaseModel):
"""Configurations needed to run an Eval.

@@ -74,24 +108,43 @@ class EvalConfig(BaseModel):
""",
)

custom_metrics: Optional[dict[str, CodeConfig]] = Field(
custom_metrics: Optional[dict[str, CustomMetricConfig]] = Field(
default=None,
description="""A dictionary mapping custom metric names to CodeConfig
objects, which specify the path to the function for each custom metric.
description="""A dictionary mapping custom metric names to
a CustomMetricConfig object.

If a metric name in `criteria` is also present in `custom_metrics`, the
corresponding `CodeConfig`'s `name` field will be used to locate the custom
metric implementation. The `name` field should contain the fully qualified
path to the custom metric function, e.g., `my.custom.metrics.metric_function`.
`code_config` in `CustomMetricConfig` will be used to locate the custom metric
implementation.

The `metric_info` field in `CustomMetricConfig` can be used to provide metric
information such as a `description` and a score interval (`min_value`,
`max_value`). If `metric_info` is not provided, a default `MetricInfo` will be
created, using `description` from `CustomMetricConfig` if provided, and a
default interval of `min_value` 0.0 to `max_value` 1.0.

Example:
{
"criteria": {
"my_custom_metric": 0.5
"my_custom_metric": 0.5,
"my_simple_metric": 0.8
},
"custom_metrics": {
"my_simple_metric": {
"code_config": {
"name": "path.to.my.simple.metric.function"
}
},
"my_custom_metric": {
"name": "path.to.my.custom.metric.function"
"code_config": {
"name": "path.to.my.custom.metric.function"
},
"metric": {
"metric_name": "my_custom_metric",
"min_value": -10.0,
"max_value": 10.0,
"description": "My custom metric."
}
}
}
}
@@ -103,17 +156,6 @@ class EvalConfig(BaseModel):
description="Config to be used by the user simulator.",
)

@model_validator(mode="after")
def check_custom_metrics_code_config_args(self) -> "EvalConfig":
if self.custom_metrics:
for metric_name, metric_config in self.custom_metrics.items():
if metric_config.args:
raise ValueError(
f"args field in CodeConfig for custom metric '{metric_name}' is"
" not supported."
)
return self


_DEFAULT_EVAL_CONFIG = EvalConfig(
criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
@@ -144,11 +186,10 @@ def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
if eval_config.criteria:
for metric_name, criterion in eval_config.criteria.items():
custom_function_path = None
if (
eval_config.custom_metrics
and metric_name in eval_config.custom_metrics
if eval_config.custom_metrics and (
config := eval_config.custom_metrics.get(metric_name)
):
custom_function_path = eval_config.custom_metrics[metric_name].name
custom_function_path = config.code_config.name

if isinstance(criterion, float):
eval_metric_list.append(
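Putting the config and CLI changes together, a hypothetical end-to-end run; the agent path, eval-set path, metric name, and module path are placeholders, and the `--config_file_path` flag is assumed from the existing `adk eval` options:

```bash
# test_config.json -- threshold lives in "criteria", implementation in "custom_metrics":
# {
#   "criteria": {"non_empty_response": 0.9},
#   "custom_metrics": {
#     "non_empty_response": {
#       "code_config": {"name": "my_metrics.non_empty_response_metric"}
#     }
#   }
# }
adk eval path/to/my_agent path/to/my.evalset.json \
  --config_file_path=path/to/test_config.json
```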