diff --git a/.github/workflows/check-file-contents.yml b/.github/workflows/check-file-contents.yml index 6c02d904c7..974f3816a1 100644 --- a/.github/workflows/check-file-contents.yml +++ b/.github/workflows/check-file-contents.yml @@ -96,7 +96,7 @@ jobs: echo "" set +e - FILES_WITH_FORBIDDEN_IMPORT=$(grep -lE '^from.*cli.*import.*$' $CHANGED_FILES) + FILES_WITH_FORBIDDEN_IMPORT=$(grep -lE '^from.*\bcli\b.*import.*$' $CHANGED_FILES) GREP_EXIT_CODE=$? set -e diff --git a/contributing/samples/toolbox_agent/README.md b/contributing/samples/toolbox_agent/README.md index 1c94731ac5..56a4fe089e 100644 --- a/contributing/samples/toolbox_agent/README.md +++ b/contributing/samples/toolbox_agent/README.md @@ -26,10 +26,10 @@ Install SQLite from [https://sqlite.org/](https://sqlite.org/) ### 3. Install Required Python Dependencies -**Important**: The ADK's `ToolboxToolset` class requires the `toolbox-core` package, which is not automatically installed with the ADK. Install it using: +**Important**: The ADK's `ToolboxToolset` class requires the `toolbox-adk` package, which is not automatically installed with the ADK. Install it using: ```bash -pip install toolbox-core +pip install google-adk[toolbox] ``` ### 4. Create Database (Optional) diff --git a/pyproject.toml b/pyproject.toml index f612ef4df2..1af967046f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,11 +157,12 @@ extensions = [ "llama-index-readers-file>=0.4.0", # For retrieval using LlamaIndex. "llama-index-embeddings-google-genai>=0.3.0", # For files retrieval using LlamaIndex. "lxml>=5.3.0", # For load_web_page tool. - "toolbox-adk>=0.1.0", # For tools.toolbox_toolset.ToolboxToolset + "toolbox-adk>=0.5.7, <0.6.0", # For tools.toolbox_toolset.ToolboxToolset ] otel-gcp = ["opentelemetry-instrumentation-google-genai>=0.3b0, <1.0.0"] +toolbox = ["toolbox-adk>=0.5.7, <0.6.0"] [tool.pyink] # Format py files following Google style-guide diff --git a/src/google/adk/agents/remote_a2a_agent.py b/src/google/adk/agents/remote_a2a_agent.py index 23a9b47554..0ac47c0236 100644 --- a/src/google/adk/agents/remote_a2a_agent.py +++ b/src/google/adk/agents/remote_a2a_agent.py @@ -443,7 +443,8 @@ async def _handle_a2a_response( and event.content is not None and event.content.parts ): - event.content.parts[0].thought = True + for part in event.content.parts: + part.thought = True elif ( isinstance(update, A2ATaskStatusUpdateEvent) and update.status diff --git a/src/google/adk/cli/adk_web_server.py b/src/google/adk/cli/adk_web_server.py index 0f0657ee0c..752af89c34 100644 --- a/src/google/adk/cli/adk_web_server.py +++ b/src/google/adk/cli/adk_web_server.py @@ -330,6 +330,7 @@ class AppInfo(common.BaseModel): root_agent_name: str description: str language: Literal["yaml", "python"] + is_computer_use: bool = False class ListAppsResponse(common.BaseModel): diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py index 7176199b9f..2555f3429b 100644 --- a/src/google/adk/cli/cli_eval.py +++ b/src/google/adk/cli/cli_eval.py @@ -34,6 +34,9 @@ from ..evaluation.eval_case import get_all_tool_calls from ..evaluation.eval_case import IntermediateDataType from ..evaluation.eval_metrics import EvalMetric +from ..evaluation.eval_metrics import Interval +from ..evaluation.eval_metrics import MetricInfo +from ..evaluation.eval_metrics import MetricValueInfo from ..evaluation.eval_result import EvalCaseResult from ..evaluation.eval_sets_manager import EvalSetsManager from ..utils.context_utils import Aclosing @@ -70,6 +73,19 @@ def 
_get_agent_module(agent_module_file_path: str): return _import_from_path(module_name, file_path) +def get_default_metric_info( + metric_name: str, description: str = "" +) -> MetricInfo: + """Returns a default MetricInfo for a metric.""" + return MetricInfo( + metric_name=metric_name, + description=description, + metric_value_info=MetricValueInfo( + interval=Interval(min_value=0.0, max_value=1.0) + ), + ) + + def get_root_agent(agent_module_file_path: str) -> Agent: """Returns root agent given the agent module.""" agent_module = _get_agent_module(agent_module_file_path) diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py index 241c696351..0875f2523d 100644 --- a/src/google/adk/cli/cli_tools_click.py +++ b/src/google/adk/cli/cli_tools_click.py @@ -712,8 +712,11 @@ def cli_eval( logs.setup_adk_logger(getattr(logging, log_level.upper())) try: + import importlib + from ..evaluation.base_eval_service import InferenceConfig from ..evaluation.base_eval_service import InferenceRequest + from ..evaluation.custom_metric_evaluator import _CustomMetricEvaluator from ..evaluation.eval_config import get_eval_metrics_from_config from ..evaluation.eval_config import get_evaluation_criteria_or_default from ..evaluation.eval_result import EvalCaseResult @@ -723,9 +726,11 @@ def cli_eval( from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager from ..evaluation.local_eval_sets_manager import load_eval_set_from_file from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager + from ..evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY from ..evaluation.simulation.user_simulator_provider import UserSimulatorProvider from .cli_eval import _collect_eval_results from .cli_eval import _collect_inferences + from .cli_eval import get_default_metric_info from .cli_eval import get_root_agent from .cli_eval import parse_and_get_evals_to_run from .cli_eval import pretty_print_eval_result @@ -818,11 +823,30 @@ def cli_eval( ) try: + metric_evaluator_registry = DEFAULT_METRIC_EVALUATOR_REGISTRY + if eval_config.custom_metrics: + for ( + metric_name, + config, + ) in eval_config.custom_metrics.items(): + if config.metric_info: + metric_info = config.metric_info.model_copy() + metric_info.metric_name = metric_name + else: + metric_info = get_default_metric_info( + metric_name=metric_name, description=config.description + ) + + metric_evaluator_registry.register_evaluator( + metric_info, _CustomMetricEvaluator + ) + eval_service = LocalEvalService( root_agent=root_agent, eval_sets_manager=eval_sets_manager, eval_set_results_manager=eval_set_results_manager, user_simulator_provider=user_simulator_provider, + metric_evaluator_registry=metric_evaluator_registry, ) inference_results = asyncio.run( diff --git a/src/google/adk/cli/utils/agent_loader.py b/src/google/adk/cli/utils/agent_loader.py index d6965e5bbb..5b86adeffa 100644 --- a/src/google/adk/cli/utils/agent_loader.py +++ b/src/google/adk/cli/utils/agent_loader.py @@ -32,6 +32,7 @@ from ...agents import config_agent_utils from ...agents.base_agent import BaseAgent from ...apps.app import App +from ...tools.computer_use.computer_use_toolset import ComputerUseToolset from ...utils.feature_decorator import experimental from .base_agent_loader import BaseAgentLoader @@ -358,12 +359,17 @@ def list_agents_detailed(self) -> list[dict[str, Any]]: agent = loaded language = self._determine_agent_language(agent_name) + is_computer_use = any( + isinstance(t, ComputerUseToolset) + for 
t in getattr(agent, "tools", []) + ) app_info = { "name": agent_name, "root_agent_name": agent.name, "description": agent.description, "language": language, + "is_computer_use": is_computer_use, } apps_info.append(app_info) diff --git a/src/google/adk/evaluation/custom_metric_evaluator.py b/src/google/adk/evaluation/custom_metric_evaluator.py new file mode 100644 index 0000000000..1eb8666db8 --- /dev/null +++ b/src/google/adk/evaluation/custom_metric_evaluator.py @@ -0,0 +1,79 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import importlib +import inspect +from typing import Callable +from typing import Optional + +from typing_extensions import override + +from .eval_case import ConversationScenario +from .eval_case import Invocation +from .eval_metrics import EvalMetric +from .eval_metrics import EvalStatus +from .evaluator import EvaluationResult +from .evaluator import Evaluator + + +def _get_metric_function( + custom_function_path: str, +) -> Callable[..., EvaluationResult]: + """Returns the custom metric function from the given path.""" + try: + module_name, function_name = custom_function_path.rsplit(".", 1) + module = importlib.import_module(module_name) + metric_function = getattr(module, function_name) + return metric_function + except (ImportError, AttributeError, ValueError) as e: + raise ImportError( + f"Could not import custom metric function from {custom_function_path}" + ) from e + + +def _get_eval_status(score: Optional[float], threshold: float) -> EvalStatus: + if score is None: + return EvalStatus.NOT_EVALUATED + return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED + + +class _CustomMetricEvaluator(Evaluator): + """Evaluator for custom metrics.""" + + def __init__(self, eval_metric: EvalMetric, custom_function_path: str): + self._eval_metric = eval_metric + self._metric_function = _get_metric_function(custom_function_path) + + @override + async def evaluate_invocations( + self, + actual_invocations: list[Invocation], + expected_invocations: Optional[list[Invocation]], + conversation_scenario: Optional[ConversationScenario] = None, + ) -> EvaluationResult: + if inspect.iscoroutinefunction(self._metric_function): + eval_result = await self._metric_function( + actual_invocations, expected_invocations, conversation_scenario + ) + else: + eval_result = self._metric_function( + actual_invocations, expected_invocations, conversation_scenario + ) + + eval_result.overall_eval_status = _get_eval_status( + eval_result.overall_score, self._eval_metric.threshold + ) + return eval_result diff --git a/src/google/adk/evaluation/eval_config.py b/src/google/adk/evaluation/eval_config.py index 92b61ac57c..3cc5672ca9 100644 --- a/src/google/adk/evaluation/eval_config.py +++ b/src/google/adk/evaluation/eval_config.py @@ -28,12 +28,46 @@ from ..agents.common_configs import CodeConfig from ..evaluation.eval_metrics import EvalMetric from .eval_metrics import BaseCriterion +from .eval_metrics import MetricInfo 
from .eval_metrics import Threshold from .simulation.user_simulator import BaseUserSimulatorConfig logger = logging.getLogger("google_adk." + __name__) +class CustomMetricConfig(BaseModel): + """Configuration for a custom metric.""" + + model_config = ConfigDict( + alias_generator=alias_generators.to_camel, + populate_by_name=True, + ) + + code_config: CodeConfig = Field( + description=( + "Code config for the custom metric, used to locate the custom metric" + " function." + ) + ) + metric_info: Optional[MetricInfo] = Field( + default=None, + description="Metric info for the custom metric.", + ) + description: str = Field( + default="", + description="Description for the custom metric info.", + ) + + @model_validator(mode="after") + def check_code_config_args(self) -> "CustomMetricConfig": + """Checks that the code config does not have args.""" + if self.code_config.args: + raise ValueError( + "args field in CodeConfig for custom metric is not supported." + ) + return self + + class EvalConfig(BaseModel): """Configurations needed to run an Eval. @@ -74,24 +108,43 @@ class EvalConfig(BaseModel): """, ) - custom_metrics: Optional[dict[str, CodeConfig]] = Field( + custom_metrics: Optional[dict[str, CustomMetricConfig]] = Field( default=None, - description="""A dictionary mapping custom metric names to CodeConfig -objects, which specify the path to the function for each custom metric. + description="""A dictionary mapping custom metric names to +a CustomMetricConfig object. If a metric name in `criteria` is also present in `custom_metrics`, the -corresponding `CodeConfig`'s `name` field will be used to locate the custom -metric implementation. The `name` field should contain the fully qualified -path to the custom metric function, e.g., `my.custom.metrics.metric_function`. +`code_config` in `CustomMetricConfig` will be used to locate the custom metric +implementation. + +The `metric_info` field in `CustomMetricConfig` can be used to provide metric +information such as the metric's value range and description. If `metric_info` +is not provided, a default `MetricInfo` will be created, using +`description` from `CustomMetricConfig` if provided, and a default value +range of 0.0 to 1.0. Example: { "criteria": { - "my_custom_metric": 0.5 + "my_custom_metric": 0.5, + "my_simple_metric": 0.8 }, "custom_metrics": { + "my_simple_metric": { + "code_config": { + "name": "path.to.my.simple.metric.function" + } + }, "my_custom_metric": { - "name": "path.to.my.custom.metric.function" + "code_config": { + "name": "path.to.my.custom.metric.function" + }, + "metric_info": { + "metric_name": "my_custom_metric", + "description": "My custom metric.", + "metric_value_info": { + "interval": {"min_value": -10.0, "max_value": 10.0} + } + } } } } @@ -103,17 +156,6 @@ class EvalConfig(BaseModel): description="Config to be used by the user simulator.", ) - @model_validator(mode="after") - def check_custom_metrics_code_config_args(self) -> "EvalConfig": - if self.custom_metrics: - for metric_name, metric_config in self.custom_metrics.items(): - if metric_config.args: - raise ValueError( - f"args field in CodeConfig for custom metric '{metric_name}' is" - " not supported."
- ) - return self - _DEFAULT_EVAL_CONFIG = EvalConfig( criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8} @@ -144,11 +186,10 @@ def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]: if eval_config.criteria: for metric_name, criterion in eval_config.criteria.items(): custom_function_path = None - if ( - eval_config.custom_metrics - and metric_name in eval_config.custom_metrics + if eval_config.custom_metrics and ( + config := eval_config.custom_metrics.get(metric_name) ): - custom_function_path = eval_config.custom_metrics[metric_name].name + custom_function_path = config.code_config.name if isinstance(criterion, float): eval_metric_list.append( diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index 7031266e27..5b8cd21690 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -268,6 +268,22 @@ async def _evaluate_single_inference_result( else 'test_user_id' ) + if ( + inference_result.status == InferenceStatus.FAILURE + or inference_result.inferences is None + ): + logger.error( + 'Evaluation attempted on failed inference for eval case `%s`.' + ' Error: %s', + inference_result.eval_case_id, + inference_result.error_message, + ) + eval_case_result = await self._build_not_evaluated_eval_case_result( + inference_result=inference_result, + user_id=user_id, + ) + return (inference_result, eval_case_result) + if eval_case.conversation_scenario is None and len( inference_result.inferences ) != len(eval_case.conversation): @@ -464,6 +480,31 @@ def _generate_final_eval_status( return final_eval_status + async def _build_not_evaluated_eval_case_result( + self, + *, + inference_result: InferenceResult, + user_id: str, + ) -> EvalCaseResult: + """Constructs an EvalCaseResult for cases that could not be evaluated.""" + session_details = await self._session_service.get_session( + app_name=inference_result.app_name, + user_id=user_id, + session_id=inference_result.session_id, + ) + + return EvalCaseResult( + eval_set_file=inference_result.eval_set_id, + eval_set_id=inference_result.eval_set_id, + eval_id=inference_result.eval_case_id, + final_eval_status=EvalStatus.NOT_EVALUATED, + overall_eval_metric_results=[], + eval_metric_result_per_invocation=[], + session_id=inference_result.session_id, + session_details=session_details, + user_id=user_id, + ) + async def _perform_inference_single_eval_item( self, app_name: str, diff --git a/src/google/adk/evaluation/metric_evaluator_registry.py b/src/google/adk/evaluation/metric_evaluator_registry.py index 9e1fc6c23b..c1010e5ddf 100644 --- a/src/google/adk/evaluation/metric_evaluator_registry.py +++ b/src/google/adk/evaluation/metric_evaluator_registry.py @@ -18,6 +18,7 @@ from ..errors.not_found_error import NotFoundError from ..utils.feature_decorator import experimental +from .custom_metric_evaluator import _CustomMetricEvaluator from .eval_metrics import EvalMetric from .eval_metrics import MetricInfo from .eval_metrics import PrebuiltMetrics @@ -62,7 +63,13 @@ def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator: if eval_metric.metric_name not in self._registry: raise NotFoundError(f"{eval_metric.metric_name} not found in registry.") - return self._registry[eval_metric.metric_name][0](eval_metric=eval_metric) + evaluator_type = self._registry[eval_metric.metric_name][0] + if issubclass(evaluator_type, _CustomMetricEvaluator): + return evaluator_type( + eval_metric=eval_metric, + 
custom_function_path=eval_metric.custom_function_path, + ) + return evaluator_type(eval_metric=eval_metric) def register_evaluator( self, diff --git a/src/google/adk/tools/toolbox_toolset.py b/src/google/adk/tools/toolbox_toolset.py index 73f27f3fc2..e1e7e576d6 100644 --- a/src/google/adk/tools/toolbox_toolset.py +++ b/src/google/adk/tools/toolbox_toolset.py @@ -35,19 +35,9 @@ class ToolboxToolset(BaseToolset): """A class that provides access to toolbox toolsets. - This class acts as a bridge to the `toolbox-adk` package. - You must install `toolbox-adk` to use this class. - Example: ```python - from toolbox_adk import CredentialStrategy - - toolbox_toolset = ToolboxToolset( - server_url="http://127.0.0.1:5000", - # toolset_name and tool_names are optional. If omitted, all tools are - loaded. - credentials=CredentialStrategy.toolbox_identity() - ) + toolbox_toolset = ToolboxToolset("http://127.0.0.1:5000") ``` """ @@ -64,29 +54,37 @@ def __init__( additional_headers: Optional[Mapping[str, str]] = None, **kwargs, ): - """Args: - - server_url: The URL of the toolbox server. - toolset_name: The name of the toolbox toolset to load. - tool_names: The names of the tools to load. - auth_token_getters: (Deprecated) Map of auth token getters. - bound_params: Parameters to bind to the tools. - credentials: (Optional) toolbox_adk.CredentialConfig object. - additional_headers: (Optional) Static headers dictionary. - **kwargs: Additional arguments passed to the underlying - toolbox_adk.ToolboxToolset. + """Initializes the ToolboxToolset. + + Args: + server_url: The URL of the toolbox server. + toolset_name: (Optional) The name of the toolbox toolset to load. + tool_names: (Optional) The names of the tools to load. + auth_token_getters: (Optional) A mapping of authentication service names + to callables that return the corresponding authentication token. see: + https://github.com/googleapis/mcp-toolbox-sdk-python/tree/main/packages/toolbox-core#authenticating-tools + for details. + bound_params: (Optional) A mapping of parameter names to bind to specific + values or callables that are called to produce values as needed. see: + https://github.com/googleapis/mcp-toolbox-sdk-python/tree/main/packages/toolbox-core#binding-parameter-values + for details. + credentials: (Optional) toolbox_adk.CredentialConfig object. + additional_headers: (Optional) Static headers mapping. + **kwargs: Additional arguments passed to the underlying + toolbox_adk.ToolboxToolset. + + The resulting ToolboxToolset will contain both tools loaded by tool_names + and toolset_name. + + Note: toolset_name and tool_names are optional. + If both are omitted, all tools are loaded. """ - if not toolset_name and not tool_names: - raise ValueError( - "Either 'toolset_name' or 'tool_names' must be provided." - ) - try: from toolbox_adk import ToolboxToolset as RealToolboxToolset # pylint: disable=import-outside-toplevel except ImportError as exc: raise ImportError( "ToolboxToolset requires the 'toolbox-adk' package. " - "Please install it using `pip install toolbox-adk`." + "Please install it using `pip install google-adk[toolbox]`." 
) from exc super().__init__() @@ -95,10 +93,10 @@ def __init__( server_url=server_url, toolset_name=toolset_name, tool_names=tool_names, + auth_token_getters=auth_token_getters, + bound_params=bound_params, credentials=credentials, additional_headers=additional_headers, - bound_params=bound_params, - auth_token_getters=auth_token_getters, **kwargs, ) diff --git a/tests/unittests/cli/test_fast_api.py b/tests/unittests/cli/test_fast_api.py index b7a9773072..6a98f75a88 100755 --- a/tests/unittests/cli/test_fast_api.py +++ b/tests/unittests/cli/test_fast_api.py @@ -201,6 +201,7 @@ def list_agents_detailed(self): "root_agent_name": "test_agent", "description": "A test agent for unit testing", "language": "python", + "is_computer_use": False, }] return MockAgentLoader(".") @@ -735,6 +736,8 @@ def test_list_apps_detailed(test_app): assert "description" in app assert "language" in app assert app["language"] in ["yaml", "python"] + assert "isComputerUse" in app + assert not app["isComputerUse"] logger.info(f"Listed apps: {data}") diff --git a/tests/unittests/cli/utils/test_agent_loader.py b/tests/unittests/cli/utils/test_agent_loader.py index 4950fecbd3..130fd72229 100644 --- a/tests/unittests/cli/utils/test_agent_loader.py +++ b/tests/unittests/cli/utils/test_agent_loader.py @@ -20,6 +20,7 @@ import sys import tempfile from textwrap import dedent +from unittest import mock from google.adk.cli.utils import agent_loader as agent_loader_module from google.adk.cli.utils.agent_loader import AgentLoader @@ -49,7 +50,8 @@ def create_agent_structure( Args: temp_dir: The temporary directory to create the agent in agent_name: Name of the agent - structure_type: One of 'module', 'package_with_root', 'package_with_agent_module' + structure_type: One of 'module', 'package_with_root', + 'package_with_agent_module' """ if structure_type == "module": # Structure: agents_dir/agent_name.py @@ -928,3 +930,66 @@ def test_yaml_config_agents_dir_parameter(self): # Verify they are different agents assert default_agent.name != custom_agent.name assert explicit_agent.name == default_agent.name + + def test_list_agents_detailed_identifies_computer_use(self): + """Test that list_agents_detailed correctly identifies computer use capability.""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + agent_name = "computer_use_agent" + + agent_dir = temp_path / agent_name + agent_dir.mkdir() + + (agent_dir / "__init__.py").write_text(dedent(f""" + from typing import Any + from unittest.mock import MagicMock + from google.adk.agents.base_agent import BaseAgent + from google.adk.tools.computer_use.computer_use_toolset import ComputerUseToolset + from google.adk.tools.computer_use.base_computer import BaseComputer + + class {agent_name.title()}Agent(BaseAgent): + tools: list[Any] = [] + + def __init__(self): + super().__init__(name="{agent_name}") + self.tools = [ComputerUseToolset(computer=MagicMock(spec=BaseComputer))] + + root_agent = {agent_name.title()}Agent() + """)) + + loader = AgentLoader(str(temp_path)) + detailed_list = loader.list_agents_detailed() + + assert len(detailed_list) == 1 + assert detailed_list[0]["name"] == agent_name + assert detailed_list[0]["is_computer_use"] + + def test_list_agents_detailed_detects_no_computer_use(self): + """Test that list_agents_detailed sets is_computer_use to False when toolset is absent.""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + agent_name = "standard_agent" + + agent_dir = temp_path / agent_name + 
agent_dir.mkdir() + + (agent_dir / "__init__.py").write_text(dedent(f""" + from typing import Any + from google.adk.agents.base_agent import BaseAgent + + class {agent_name.title()}Agent(BaseAgent): + tools: list[Any] = [] + + def __init__(self): + super().__init__(name="{agent_name}") + self.tools = [] + + root_agent = {agent_name.title()}Agent() + """)) + + loader = AgentLoader(str(temp_path)) + detailed_list = loader.list_agents_detailed() + + assert len(detailed_list) == 1 + assert detailed_list[0]["name"] == agent_name + assert not detailed_list[0]["is_computer_use"] diff --git a/tests/unittests/evaluation/test_eval_config.py b/tests/unittests/evaluation/test_eval_config.py index fd1a7938eb..54f22b5066 100644 --- a/tests/unittests/evaluation/test_eval_config.py +++ b/tests/unittests/evaluation/test_eval_config.py @@ -109,8 +109,12 @@ def test_get_eval_metrics_from_config_with_custom_metrics(): }, }, custom_metrics={ - "custom_metric_1": {"name": "path/to/custom/metric_1"}, - "custom_metric_2": {"name": "path/to/custom/metric_2"}, + "custom_metric_1": { + "code_config": {"name": "path/to/custom/metric_1"}, + }, + "custom_metric_2": { + "code_config": {"name": "path/to/custom/metric_2"}, + }, }, ) eval_metrics = get_eval_metrics_from_config(eval_config) @@ -128,10 +132,12 @@ def test_get_eval_metrics_from_config_with_custom_metrics(): def test_custom_metric_code_config_with_args_raises_error(): with pytest.raises(ValueError): - eval_config = EvalConfig( + _ = EvalConfig( criteria={"custom_metric": 1.0}, custom_metrics={ - "custom_metric": {"name": "name", "args": [{"value": 1}]} + "custom_metric": { + "code_config": {"name": "name", "args": [{"value": 1}]}, + } }, ) diff --git a/tests/unittests/evaluation/test_local_eval_service.py b/tests/unittests/evaluation/test_local_eval_service.py index 08ef2aa8b0..4ba91711ee 100644 --- a/tests/unittests/evaluation/test_local_eval_service.py +++ b/tests/unittests/evaluation/test_local_eval_service.py @@ -325,6 +325,82 @@ async def test_evaluate_success( assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2 +@pytest.mark.asyncio +async def test_evaluate_skips_failed_inference_results( + eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker +): + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="test user content.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="test final response.")] + ), + ) + inference_results = [ + InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case_failure", + inferences=None, + session_id="session_fail", + status=InferenceStatus.FAILURE, + error_message="simulated failure", + ), + InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case_success", + inferences=[invocation.model_copy(deep=True)], + session_id="session_success", + status=InferenceStatus.SUCCESS, + ), + InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case_unknown", + inferences=[invocation.model_copy(deep=True)], + session_id="session_unknown", + status=InferenceStatus.UNKNOWN, + ), + ] + eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5) + evaluate_request = EvaluateRequest( + inference_results=inference_results, + evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2), + ) + + mock_eval_case = mocker.MagicMock(spec=EvalCase) + mock_eval_case.conversation = [invocation.model_copy(deep=True)] + 
mock_eval_case.conversation_scenario = None + mock_eval_case.session_input = None + mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case + + results = [] + async for result in eval_service.evaluate(evaluate_request): + results.append(result) + + assert len(results) == 3 + results_by_case = {result.eval_id: result for result in results} + + failure_result = results_by_case["case_failure"] + assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED + assert failure_result.overall_eval_metric_results == [] + assert failure_result.eval_metric_result_per_invocation == [] + + for case_id in ["case_success", "case_unknown"]: + case_result = results_by_case[case_id] + assert case_result.final_eval_status == EvalStatus.PASSED + assert len(case_result.overall_eval_metric_results) == 1 + assert ( + case_result.overall_eval_metric_results[0].metric_name == "fake_metric" + ) + assert case_result.overall_eval_metric_results[0].score == 0.9 + + assert mock_eval_sets_manager.get_eval_case.call_count == 3 + assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3 + + @pytest.mark.asyncio async def test_evaluate_eval_case_not_found( eval_service, @@ -418,6 +494,93 @@ async def test_evaluate_single_inference_result( assert metric_result.eval_status == EvalStatus.PASSED +@pytest.mark.asyncio +async def test_evaluate_single_inference_result_handles_failed_inference( + eval_service, mock_eval_sets_manager, mocker +): + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="test user content.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="test final response.")] + ), + ) + inference_result = InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case1", + inferences=None, + session_id="session1", + status=InferenceStatus.FAILURE, + error_message="simulated inference failure", + ) + eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5) + evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1) + + mock_eval_case = mocker.MagicMock(spec=EvalCase) + mock_eval_case.conversation = [invocation.model_copy(deep=True)] + mock_eval_case.conversation_scenario = None + mock_eval_case.session_input = None + mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case + + _, result = await eval_service._evaluate_single_inference_result( + inference_result=inference_result, evaluate_config=evaluate_config + ) + + assert isinstance(result, EvalCaseResult) + assert result.eval_id == "case1" + assert result.final_eval_status == EvalStatus.NOT_EVALUATED + assert result.overall_eval_metric_results == [] + assert result.eval_metric_result_per_invocation == [] + mock_eval_sets_manager.get_eval_case.assert_called_once_with( + app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1" + ) + + +@pytest.mark.asyncio +async def test_evaluate_single_inference_result_handles_missing_inferences( + eval_service, mock_eval_sets_manager, mocker +): + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="test user content.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="test final response.")] + ), + ) + inference_result = InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case1", + inferences=None, + session_id="session1", + status=InferenceStatus.SUCCESS, + ) + eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5) + evaluate_config = 
EvaluateConfig(eval_metrics=[eval_metric], parallelism=1) + + mock_eval_case = mocker.MagicMock(spec=EvalCase) + mock_eval_case.conversation = [invocation.model_copy(deep=True)] + mock_eval_case.conversation_scenario = None + mock_eval_case.session_input = None + mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case + + _, result = await eval_service._evaluate_single_inference_result( + inference_result=inference_result, evaluate_config=evaluate_config + ) + + assert isinstance(result, EvalCaseResult) + assert result.eval_id == "case1" + assert result.final_eval_status == EvalStatus.NOT_EVALUATED + assert result.overall_eval_metric_results == [] + assert result.eval_metric_result_per_invocation == [] + mock_eval_sets_manager.get_eval_case.assert_called_once_with( + app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1" + ) + + @pytest.mark.asyncio async def test_evaluate_single_inference_result_for_conversation_scenario( eval_service, mock_eval_sets_manager, mocker
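Taken together, the eval changes above let a config entry point at a plain Python function: `get_eval_metrics_from_config` copies `code_config.name` onto the metric, and `_CustomMetricEvaluator` imports that dotted path and calls it with `(actual_invocations, expected_invocations, conversation_scenario)`, deriving pass/fail from the returned `overall_score` and the metric's threshold. A minimal sketch of such a function follows; the module name `my_metrics`, the function name, and the exact-match scoring are illustrative assumptions, not part of this change.

```python
# my_metrics.py -- hypothetical module, referenced from the eval config via
# CustomMetricConfig.code_config.name = "my_metrics.exact_match_score".
from typing import Optional

from google.adk.evaluation.eval_case import ConversationScenario
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.evaluator import EvaluationResult


def _final_text(invocation: Invocation) -> Optional[str]:
  """Returns the text of the first final-response part, if present."""
  content = invocation.final_response
  if content and content.parts and content.parts[0].text:
    return content.parts[0].text
  return None


def exact_match_score(
    actual_invocations: list[Invocation],
    expected_invocations: Optional[list[Invocation]],
    conversation_scenario: Optional[ConversationScenario] = None,
) -> EvaluationResult:
  """Scores the fraction of final responses that exactly match the reference."""
  if not expected_invocations:
    # No reference to compare against; leaving overall_score unset makes the
    # evaluator report EvalStatus.NOT_EVALUATED.
    return EvaluationResult()

  matches = sum(
      1
      for actual, expected in zip(actual_invocations, expected_invocations)
      if _final_text(actual) is not None
      and _final_text(actual) == _final_text(expected)
  )
  return EvaluationResult(overall_score=matches / len(expected_invocations))
```

Wired into the eval config, this would look roughly like `"criteria": {"exact_match_score": 0.8}` plus `"custom_metrics": {"exact_match_score": {"code_config": {"name": "my_metrics.exact_match_score"}}}`. The function may also be declared `async`, since `_CustomMetricEvaluator` awaits coroutine functions.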
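On the toolbox side, the sample README, the `ImportError` message, and `pyproject.toml` now converge on the optional `toolbox` extra, and `ToolboxToolset` no longer requires `toolset_name` or `tool_names`. A minimal usage sketch under those assumptions (a Toolbox server already running locally; the agent name, model, and instruction are illustrative):

```python
# Requires the optional dependency: pip install "google-adk[toolbox]"
from google.adk.agents import LlmAgent
from google.adk.tools.toolbox_toolset import ToolboxToolset

# toolset_name and tool_names are now optional; omitting both loads every
# tool exposed by the Toolbox server.
toolbox_toolset = ToolboxToolset(server_url="http://127.0.0.1:5000")

root_agent = LlmAgent(
    model="gemini-2.5-flash",  # illustrative model name
    name="toolbox_agent",
    instruction="Use the Toolbox tools to answer the user's questions.",
    tools=[toolbox_toolset],
)
```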