diff --git a/.github/workflows/check-file-contents.yml b/.github/workflows/check-file-contents.yml index 6c02d904c7..974f3816a1 100644 --- a/.github/workflows/check-file-contents.yml +++ b/.github/workflows/check-file-contents.yml @@ -96,7 +96,7 @@ jobs: echo "" set +e - FILES_WITH_FORBIDDEN_IMPORT=$(grep -lE '^from.*cli.*import.*$' $CHANGED_FILES) + FILES_WITH_FORBIDDEN_IMPORT=$(grep -lE '^from.*\bcli\b.*import.*$' $CHANGED_FILES) GREP_EXIT_CODE=$? set -e diff --git a/contributing/samples/toolbox_agent/README.md b/contributing/samples/toolbox_agent/README.md index 1c94731ac5..56a4fe089e 100644 --- a/contributing/samples/toolbox_agent/README.md +++ b/contributing/samples/toolbox_agent/README.md @@ -26,10 +26,10 @@ Install SQLite from [https://sqlite.org/](https://sqlite.org/) ### 3. Install Required Python Dependencies -**Important**: The ADK's `ToolboxToolset` class requires the `toolbox-core` package, which is not automatically installed with the ADK. Install it using: +**Important**: The ADK's `ToolboxToolset` class requires the `toolbox-adk` package, which is not automatically installed with the ADK. Install it using: ```bash -pip install toolbox-core +pip install google-adk[toolbox] ``` ### 4. Create Database (Optional) diff --git a/pyproject.toml b/pyproject.toml index f612ef4df2..1af967046f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,11 +157,12 @@ extensions = [ "llama-index-readers-file>=0.4.0", # For retrieval using LlamaIndex. "llama-index-embeddings-google-genai>=0.3.0", # For files retrieval using LlamaIndex. "lxml>=5.3.0", # For load_web_page tool. - "toolbox-adk>=0.1.0", # For tools.toolbox_toolset.ToolboxToolset + "toolbox-adk>=0.5.7, <0.6.0", # For tools.toolbox_toolset.ToolboxToolset ] otel-gcp = ["opentelemetry-instrumentation-google-genai>=0.3b0, <1.0.0"] +toolbox = ["toolbox-adk>=0.5.7, <0.6.0"] [tool.pyink] # Format py files following Google style-guide diff --git a/src/google/adk/agents/remote_a2a_agent.py b/src/google/adk/agents/remote_a2a_agent.py index 23a9b47554..0ac47c0236 100644 --- a/src/google/adk/agents/remote_a2a_agent.py +++ b/src/google/adk/agents/remote_a2a_agent.py @@ -443,7 +443,8 @@ async def _handle_a2a_response( and event.content is not None and event.content.parts ): - event.content.parts[0].thought = True + for part in event.content.parts: + part.thought = True elif ( isinstance(update, A2ATaskStatusUpdateEvent) and update.status diff --git a/src/google/adk/cli/adk_web_server.py b/src/google/adk/cli/adk_web_server.py index 0f0657ee0c..752af89c34 100644 --- a/src/google/adk/cli/adk_web_server.py +++ b/src/google/adk/cli/adk_web_server.py @@ -330,6 +330,7 @@ class AppInfo(common.BaseModel): root_agent_name: str description: str language: Literal["yaml", "python"] + is_computer_use: bool = False class ListAppsResponse(common.BaseModel): diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py index 7176199b9f..2555f3429b 100644 --- a/src/google/adk/cli/cli_eval.py +++ b/src/google/adk/cli/cli_eval.py @@ -34,6 +34,9 @@ from ..evaluation.eval_case import get_all_tool_calls from ..evaluation.eval_case import IntermediateDataType from ..evaluation.eval_metrics import EvalMetric +from ..evaluation.eval_metrics import Interval +from ..evaluation.eval_metrics import MetricInfo +from ..evaluation.eval_metrics import MetricValueInfo from ..evaluation.eval_result import EvalCaseResult from ..evaluation.eval_sets_manager import EvalSetsManager from ..utils.context_utils import Aclosing @@ -70,6 +73,19 @@ def 
_get_agent_module(agent_module_file_path: str): return _import_from_path(module_name, file_path) +def get_default_metric_info( + metric_name: str, description: str = "" +) -> MetricInfo: + """Returns a default MetricInfo for a metric.""" + return MetricInfo( + metric_name=metric_name, + description=description, + metric_value_info=MetricValueInfo( + interval=Interval(min_value=0.0, max_value=1.0) + ), + ) + + def get_root_agent(agent_module_file_path: str) -> Agent: """Returns root agent given the agent module.""" agent_module = _get_agent_module(agent_module_file_path) diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py index 241c696351..0875f2523d 100644 --- a/src/google/adk/cli/cli_tools_click.py +++ b/src/google/adk/cli/cli_tools_click.py @@ -712,8 +712,11 @@ def cli_eval( logs.setup_adk_logger(getattr(logging, log_level.upper())) try: + import importlib + from ..evaluation.base_eval_service import InferenceConfig from ..evaluation.base_eval_service import InferenceRequest + from ..evaluation.custom_metric_evaluator import _CustomMetricEvaluator from ..evaluation.eval_config import get_eval_metrics_from_config from ..evaluation.eval_config import get_evaluation_criteria_or_default from ..evaluation.eval_result import EvalCaseResult @@ -723,9 +726,11 @@ def cli_eval( from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager from ..evaluation.local_eval_sets_manager import load_eval_set_from_file from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager + from ..evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY from ..evaluation.simulation.user_simulator_provider import UserSimulatorProvider from .cli_eval import _collect_eval_results from .cli_eval import _collect_inferences + from .cli_eval import get_default_metric_info from .cli_eval import get_root_agent from .cli_eval import parse_and_get_evals_to_run from .cli_eval import pretty_print_eval_result @@ -818,11 +823,30 @@ def cli_eval( ) try: + metric_evaluator_registry = DEFAULT_METRIC_EVALUATOR_REGISTRY + if eval_config.custom_metrics: + for ( + metric_name, + config, + ) in eval_config.custom_metrics.items(): + if config.metric_info: + metric_info = config.metric_info.model_copy() + metric_info.metric_name = metric_name + else: + metric_info = get_default_metric_info( + metric_name=metric_name, description=config.description + ) + + metric_evaluator_registry.register_evaluator( + metric_info, _CustomMetricEvaluator + ) + eval_service = LocalEvalService( root_agent=root_agent, eval_sets_manager=eval_sets_manager, eval_set_results_manager=eval_set_results_manager, user_simulator_provider=user_simulator_provider, + metric_evaluator_registry=metric_evaluator_registry, ) inference_results = asyncio.run( diff --git a/src/google/adk/cli/utils/agent_loader.py b/src/google/adk/cli/utils/agent_loader.py index d6965e5bbb..5b86adeffa 100644 --- a/src/google/adk/cli/utils/agent_loader.py +++ b/src/google/adk/cli/utils/agent_loader.py @@ -32,6 +32,7 @@ from ...agents import config_agent_utils from ...agents.base_agent import BaseAgent from ...apps.app import App +from ...tools.computer_use.computer_use_toolset import ComputerUseToolset from ...utils.feature_decorator import experimental from .base_agent_loader import BaseAgentLoader @@ -358,12 +359,17 @@ def list_agents_detailed(self) -> list[dict[str, Any]]: agent = loaded language = self._determine_agent_language(agent_name) + is_computer_use = any( + isinstance(t, ComputerUseToolset) + for 
t in getattr(agent, "tools", []) + ) app_info = { "name": agent_name, "root_agent_name": agent.name, "description": agent.description, "language": language, + "is_computer_use": is_computer_use, } apps_info.append(app_info) diff --git a/src/google/adk/evaluation/custom_metric_evaluator.py b/src/google/adk/evaluation/custom_metric_evaluator.py new file mode 100644 index 0000000000..1eb8666db8 --- /dev/null +++ b/src/google/adk/evaluation/custom_metric_evaluator.py @@ -0,0 +1,79 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import importlib +import inspect +from typing import Callable +from typing import Optional + +from typing_extensions import override + +from .eval_case import ConversationScenario +from .eval_case import Invocation +from .eval_metrics import EvalMetric +from .eval_metrics import EvalStatus +from .evaluator import EvaluationResult +from .evaluator import Evaluator + + +def _get_metric_function( + custom_function_path: str, +) -> Callable[..., EvaluationResult]: + """Returns the custom metric function from the given path.""" + try: + module_name, function_name = custom_function_path.rsplit(".", 1) + module = importlib.import_module(module_name) + metric_function = getattr(module, function_name) + return metric_function + except (ImportError, AttributeError, ValueError) as e: + raise ImportError( + f"Could not import custom metric function from {custom_function_path}" + ) from e + + +def _get_eval_status(score: Optional[float], threshold: float) -> EvalStatus: + if score is None: + return EvalStatus.NOT_EVALUATED + return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED + + +class _CustomMetricEvaluator(Evaluator): + """Evaluator for custom metrics.""" + + def __init__(self, eval_metric: EvalMetric, custom_function_path: str): + self._eval_metric = eval_metric + self._metric_function = _get_metric_function(custom_function_path) + + @override + async def evaluate_invocations( + self, + actual_invocations: list[Invocation], + expected_invocations: Optional[list[Invocation]], + conversation_scenario: Optional[ConversationScenario] = None, + ) -> EvaluationResult: + if inspect.iscoroutinefunction(self._metric_function): + eval_result = await self._metric_function( + actual_invocations, expected_invocations, conversation_scenario + ) + else: + eval_result = self._metric_function( + actual_invocations, expected_invocations, conversation_scenario + ) + + eval_result.overall_eval_status = _get_eval_status( + eval_result.overall_score, self._eval_metric.threshold + ) + return eval_result diff --git a/src/google/adk/evaluation/eval_config.py b/src/google/adk/evaluation/eval_config.py index 92b61ac57c..3cc5672ca9 100644 --- a/src/google/adk/evaluation/eval_config.py +++ b/src/google/adk/evaluation/eval_config.py @@ -28,12 +28,46 @@ from ..agents.common_configs import CodeConfig from ..evaluation.eval_metrics import EvalMetric from .eval_metrics import BaseCriterion +from .eval_metrics import MetricInfo 
from .eval_metrics import Threshold from .simulation.user_simulator import BaseUserSimulatorConfig logger = logging.getLogger("google_adk." + __name__) +class CustomMetricConfig(BaseModel): + """Configuration for a custom metric.""" + + model_config = ConfigDict( + alias_generator=alias_generators.to_camel, + populate_by_name=True, + ) + + code_config: CodeConfig = Field( + description=( + "Code config for the custom metric, used to locate the custom metric" + " function." + ) + ) + metric_info: Optional[MetricInfo] = Field( + default=None, + description="Metric info for the custom metric.", + ) + description: str = Field( + default="", + description="Description for the custom metric info.", + ) + + @model_validator(mode="after") + def check_code_config_args(self) -> "CustomMetricConfig": + """Checks that the code config does not have args.""" + if self.code_config.args: + raise ValueError( + "args field in CodeConfig for custom metric is not supported." + ) + return self + + class EvalConfig(BaseModel): """Configurations needed to run an Eval. @@ -74,24 +108,43 @@ class EvalConfig(BaseModel): """, ) - custom_metrics: Optional[dict[str, CodeConfig]] = Field( + custom_metrics: Optional[dict[str, CustomMetricConfig]] = Field( default=None, - description="""A dictionary mapping custom metric names to CodeConfig -objects, which specify the path to the function for each custom metric. + description="""A dictionary mapping custom metric names to +a CustomMetricConfig object. If a metric name in `criteria` is also present in `custom_metrics`, the -corresponding `CodeConfig`'s `name` field will be used to locate the custom -metric implementation. The `name` field should contain the fully qualified -path to the custom metric function, e.g., `my.custom.metrics.metric_function`. +`code_config` in `CustomMetricConfig` will be used to locate the custom metric +implementation. + +The `metric_info` field in `CustomMetricConfig` can be used to provide metric +information such as the metric's value range and description. If `metric_info` +is not provided, a default `MetricInfo` will be created, using +`description` from `CustomMetricConfig` if provided, and a default value +range of 0.0 to 1.0. Example: { "criteria": { - "my_custom_metric": 0.5 + "my_custom_metric": 0.5, + "my_simple_metric": 0.8 }, "custom_metrics": { + "my_simple_metric": { + "code_config": { + "name": "path.to.my.simple.metric.function" + } + }, "my_custom_metric": { - "name": "path.to.my.custom.metric.function" + "code_config": { + "name": "path.to.my.custom.metric.function" + }, + "metric_info": { + "metric_name": "my_custom_metric", + "description": "My custom metric.", + "metric_value_info": { + "interval": {"min_value": -10.0, "max_value": 10.0} + } + } } } } @@ -103,17 +156,6 @@ class EvalConfig(BaseModel): description="Config to be used by the user simulator.", ) - @model_validator(mode="after") - def check_custom_metrics_code_config_args(self) -> "EvalConfig": - if self.custom_metrics: - for metric_name, metric_config in self.custom_metrics.items(): - if metric_config.args: - raise ValueError( - f"args field in CodeConfig for custom metric '{metric_name}' is" - " not supported."
- ) - return self - _DEFAULT_EVAL_CONFIG = EvalConfig( criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8} @@ -144,11 +186,10 @@ def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]: if eval_config.criteria: for metric_name, criterion in eval_config.criteria.items(): custom_function_path = None - if ( - eval_config.custom_metrics - and metric_name in eval_config.custom_metrics + if eval_config.custom_metrics and ( + config := eval_config.custom_metrics.get(metric_name) ): - custom_function_path = eval_config.custom_metrics[metric_name].name + custom_function_path = config.code_config.name if isinstance(criterion, float): eval_metric_list.append( diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index 7031266e27..5b8cd21690 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -268,6 +268,22 @@ async def _evaluate_single_inference_result( else 'test_user_id' ) + if ( + inference_result.status == InferenceStatus.FAILURE + or inference_result.inferences is None + ): + logger.error( + 'Evaluation attempted on failed inference for eval case `%s`.' + ' Error: %s', + inference_result.eval_case_id, + inference_result.error_message, + ) + eval_case_result = await self._build_not_evaluated_eval_case_result( + inference_result=inference_result, + user_id=user_id, + ) + return (inference_result, eval_case_result) + if eval_case.conversation_scenario is None and len( inference_result.inferences ) != len(eval_case.conversation): @@ -464,6 +480,31 @@ def _generate_final_eval_status( return final_eval_status + async def _build_not_evaluated_eval_case_result( + self, + *, + inference_result: InferenceResult, + user_id: str, + ) -> EvalCaseResult: + """Constructs an EvalCaseResult for cases that could not be evaluated.""" + session_details = await self._session_service.get_session( + app_name=inference_result.app_name, + user_id=user_id, + session_id=inference_result.session_id, + ) + + return EvalCaseResult( + eval_set_file=inference_result.eval_set_id, + eval_set_id=inference_result.eval_set_id, + eval_id=inference_result.eval_case_id, + final_eval_status=EvalStatus.NOT_EVALUATED, + overall_eval_metric_results=[], + eval_metric_result_per_invocation=[], + session_id=inference_result.session_id, + session_details=session_details, + user_id=user_id, + ) + async def _perform_inference_single_eval_item( self, app_name: str, diff --git a/src/google/adk/evaluation/metric_evaluator_registry.py b/src/google/adk/evaluation/metric_evaluator_registry.py index 9e1fc6c23b..c1010e5ddf 100644 --- a/src/google/adk/evaluation/metric_evaluator_registry.py +++ b/src/google/adk/evaluation/metric_evaluator_registry.py @@ -18,6 +18,7 @@ from ..errors.not_found_error import NotFoundError from ..utils.feature_decorator import experimental +from .custom_metric_evaluator import _CustomMetricEvaluator from .eval_metrics import EvalMetric from .eval_metrics import MetricInfo from .eval_metrics import PrebuiltMetrics @@ -62,7 +63,13 @@ def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator: if eval_metric.metric_name not in self._registry: raise NotFoundError(f"{eval_metric.metric_name} not found in registry.") - return self._registry[eval_metric.metric_name][0](eval_metric=eval_metric) + evaluator_type = self._registry[eval_metric.metric_name][0] + if issubclass(evaluator_type, _CustomMetricEvaluator): + return evaluator_type( + eval_metric=eval_metric, + 
custom_function_path=eval_metric.custom_function_path, + ) + return evaluator_type(eval_metric=eval_metric) def register_evaluator( self, diff --git a/src/google/adk/tools/toolbox_toolset.py b/src/google/adk/tools/toolbox_toolset.py index 73f27f3fc2..e1e7e576d6 100644 --- a/src/google/adk/tools/toolbox_toolset.py +++ b/src/google/adk/tools/toolbox_toolset.py @@ -35,19 +35,9 @@ class ToolboxToolset(BaseToolset): """A class that provides access to toolbox toolsets. - This class acts as a bridge to the `toolbox-adk` package. - You must install `toolbox-adk` to use this class. - Example: ```python - from toolbox_adk import CredentialStrategy - - toolbox_toolset = ToolboxToolset( - server_url="http://127.0.0.1:5000", - # toolset_name and tool_names are optional. If omitted, all tools are - loaded. - credentials=CredentialStrategy.toolbox_identity() - ) + toolbox_toolset = ToolboxToolset("http://127.0.0.1:5000") ``` """ @@ -64,29 +54,37 @@ def __init__( additional_headers: Optional[Mapping[str, str]] = None, **kwargs, ): - """Args: - - server_url: The URL of the toolbox server. - toolset_name: The name of the toolbox toolset to load. - tool_names: The names of the tools to load. - auth_token_getters: (Deprecated) Map of auth token getters. - bound_params: Parameters to bind to the tools. - credentials: (Optional) toolbox_adk.CredentialConfig object. - additional_headers: (Optional) Static headers dictionary. - **kwargs: Additional arguments passed to the underlying - toolbox_adk.ToolboxToolset. + """Initializes the ToolboxToolset. + + Args: + server_url: The URL of the toolbox server. + toolset_name: (Optional) The name of the toolbox toolset to load. + tool_names: (Optional) The names of the tools to load. + auth_token_getters: (Optional) A mapping of authentication service names + to callables that return the corresponding authentication token. see: + https://github.com/googleapis/mcp-toolbox-sdk-python/tree/main/packages/toolbox-core#authenticating-tools + for details. + bound_params: (Optional) A mapping of parameter names to bind to specific + values or callables that are called to produce values as needed. see: + https://github.com/googleapis/mcp-toolbox-sdk-python/tree/main/packages/toolbox-core#binding-parameter-values + for details. + credentials: (Optional) toolbox_adk.CredentialConfig object. + additional_headers: (Optional) Static headers mapping. + **kwargs: Additional arguments passed to the underlying + toolbox_adk.ToolboxToolset. + + The resulting ToolboxToolset will contain both tools loaded by tool_names + and toolset_name. + + Note: toolset_name and tool_names are optional. + If both are omitted, all tools are loaded. """ - if not toolset_name and not tool_names: - raise ValueError( - "Either 'toolset_name' or 'tool_names' must be provided." - ) - try: from toolbox_adk import ToolboxToolset as RealToolboxToolset # pylint: disable=import-outside-toplevel except ImportError as exc: raise ImportError( "ToolboxToolset requires the 'toolbox-adk' package. " - "Please install it using `pip install toolbox-adk`." + "Please install it using `pip install google-adk[toolbox]`." 
) from exc super().__init__() @@ -95,10 +93,10 @@ def __init__( server_url=server_url, toolset_name=toolset_name, tool_names=tool_names, + auth_token_getters=auth_token_getters, + bound_params=bound_params, credentials=credentials, additional_headers=additional_headers, - bound_params=bound_params, - auth_token_getters=auth_token_getters, **kwargs, ) diff --git a/tests/unittests/cli/test_fast_api.py b/tests/unittests/cli/test_fast_api.py index b7a9773072..6a98f75a88 100755 --- a/tests/unittests/cli/test_fast_api.py +++ b/tests/unittests/cli/test_fast_api.py @@ -201,6 +201,7 @@ def list_agents_detailed(self): "root_agent_name": "test_agent", "description": "A test agent for unit testing", "language": "python", + "is_computer_use": False, }] return MockAgentLoader(".") @@ -735,6 +736,8 @@ def test_list_apps_detailed(test_app): assert "description" in app assert "language" in app assert app["language"] in ["yaml", "python"] + assert "isComputerUse" in app + assert not app["isComputerUse"] logger.info(f"Listed apps: {data}") diff --git a/tests/unittests/cli/utils/test_agent_loader.py b/tests/unittests/cli/utils/test_agent_loader.py index 4950fecbd3..130fd72229 100644 --- a/tests/unittests/cli/utils/test_agent_loader.py +++ b/tests/unittests/cli/utils/test_agent_loader.py @@ -20,6 +20,7 @@ import sys import tempfile from textwrap import dedent +from unittest import mock from google.adk.cli.utils import agent_loader as agent_loader_module from google.adk.cli.utils.agent_loader import AgentLoader @@ -49,7 +50,8 @@ def create_agent_structure( Args: temp_dir: The temporary directory to create the agent in agent_name: Name of the agent - structure_type: One of 'module', 'package_with_root', 'package_with_agent_module' + structure_type: One of 'module', 'package_with_root', + 'package_with_agent_module' """ if structure_type == "module": # Structure: agents_dir/agent_name.py @@ -928,3 +930,66 @@ def test_yaml_config_agents_dir_parameter(self): # Verify they are different agents assert default_agent.name != custom_agent.name assert explicit_agent.name == default_agent.name + + def test_list_agents_detailed_identifies_computer_use(self): + """Test that list_agents_detailed correctly identifies computer use capability.""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + agent_name = "computer_use_agent" + + agent_dir = temp_path / agent_name + agent_dir.mkdir() + + (agent_dir / "__init__.py").write_text(dedent(f""" + from typing import Any + from unittest.mock import MagicMock + from google.adk.agents.base_agent import BaseAgent + from google.adk.tools.computer_use.computer_use_toolset import ComputerUseToolset + from google.adk.tools.computer_use.base_computer import BaseComputer + + class {agent_name.title()}Agent(BaseAgent): + tools: list[Any] = [] + + def __init__(self): + super().__init__(name="{agent_name}") + self.tools = [ComputerUseToolset(computer=MagicMock(spec=BaseComputer))] + + root_agent = {agent_name.title()}Agent() + """)) + + loader = AgentLoader(str(temp_path)) + detailed_list = loader.list_agents_detailed() + + assert len(detailed_list) == 1 + assert detailed_list[0]["name"] == agent_name + assert detailed_list[0]["is_computer_use"] + + def test_list_agents_detailed_detects_no_computer_use(self): + """Test that list_agents_detailed sets is_computer_use to False when toolset is absent.""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + agent_name = "standard_agent" + + agent_dir = temp_path / agent_name + 
agent_dir.mkdir() + + (agent_dir / "__init__.py").write_text(dedent(f""" + from typing import Any + from google.adk.agents.base_agent import BaseAgent + + class {agent_name.title()}Agent(BaseAgent): + tools: list[Any] = [] + + def __init__(self): + super().__init__(name="{agent_name}") + self.tools = [] + + root_agent = {agent_name.title()}Agent() + """)) + + loader = AgentLoader(str(temp_path)) + detailed_list = loader.list_agents_detailed() + + assert len(detailed_list) == 1 + assert detailed_list[0]["name"] == agent_name + assert not detailed_list[0]["is_computer_use"] diff --git a/tests/unittests/evaluation/test_eval_config.py b/tests/unittests/evaluation/test_eval_config.py index fd1a7938eb..54f22b5066 100644 --- a/tests/unittests/evaluation/test_eval_config.py +++ b/tests/unittests/evaluation/test_eval_config.py @@ -109,8 +109,12 @@ def test_get_eval_metrics_from_config_with_custom_metrics(): }, }, custom_metrics={ - "custom_metric_1": {"name": "path/to/custom/metric_1"}, - "custom_metric_2": {"name": "path/to/custom/metric_2"}, + "custom_metric_1": { + "code_config": {"name": "path/to/custom/metric_1"}, + }, + "custom_metric_2": { + "code_config": {"name": "path/to/custom/metric_2"}, + }, }, ) eval_metrics = get_eval_metrics_from_config(eval_config) @@ -128,10 +132,12 @@ def test_get_eval_metrics_from_config_with_custom_metrics(): def test_custom_metric_code_config_with_args_raises_error(): with pytest.raises(ValueError): - eval_config = EvalConfig( + _ = EvalConfig( criteria={"custom_metric": 1.0}, custom_metrics={ - "custom_metric": {"name": "name", "args": [{"value": 1}]} + "custom_metric": { + "code_config": {"name": "name", "args": [{"value": 1}]}, + } }, ) diff --git a/tests/unittests/evaluation/test_local_eval_service.py b/tests/unittests/evaluation/test_local_eval_service.py index 08ef2aa8b0..4ba91711ee 100644 --- a/tests/unittests/evaluation/test_local_eval_service.py +++ b/tests/unittests/evaluation/test_local_eval_service.py @@ -325,6 +325,82 @@ async def test_evaluate_success( assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2 +@pytest.mark.asyncio +async def test_evaluate_skips_failed_inference_results( + eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker +): + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="test user content.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="test final response.")] + ), + ) + inference_results = [ + InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case_failure", + inferences=None, + session_id="session_fail", + status=InferenceStatus.FAILURE, + error_message="simulated failure", + ), + InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case_success", + inferences=[invocation.model_copy(deep=True)], + session_id="session_success", + status=InferenceStatus.SUCCESS, + ), + InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case_unknown", + inferences=[invocation.model_copy(deep=True)], + session_id="session_unknown", + status=InferenceStatus.UNKNOWN, + ), + ] + eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5) + evaluate_request = EvaluateRequest( + inference_results=inference_results, + evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2), + ) + + mock_eval_case = mocker.MagicMock(spec=EvalCase) + mock_eval_case.conversation = [invocation.model_copy(deep=True)] + 
mock_eval_case.conversation_scenario = None + mock_eval_case.session_input = None + mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case + + results = [] + async for result in eval_service.evaluate(evaluate_request): + results.append(result) + + assert len(results) == 3 + results_by_case = {result.eval_id: result for result in results} + + failure_result = results_by_case["case_failure"] + assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED + assert failure_result.overall_eval_metric_results == [] + assert failure_result.eval_metric_result_per_invocation == [] + + for case_id in ["case_success", "case_unknown"]: + case_result = results_by_case[case_id] + assert case_result.final_eval_status == EvalStatus.PASSED + assert len(case_result.overall_eval_metric_results) == 1 + assert ( + case_result.overall_eval_metric_results[0].metric_name == "fake_metric" + ) + assert case_result.overall_eval_metric_results[0].score == 0.9 + + assert mock_eval_sets_manager.get_eval_case.call_count == 3 + assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3 + + @pytest.mark.asyncio async def test_evaluate_eval_case_not_found( eval_service, @@ -418,6 +494,93 @@ async def test_evaluate_single_inference_result( assert metric_result.eval_status == EvalStatus.PASSED +@pytest.mark.asyncio +async def test_evaluate_single_inference_result_handles_failed_inference( + eval_service, mock_eval_sets_manager, mocker +): + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="test user content.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="test final response.")] + ), + ) + inference_result = InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case1", + inferences=None, + session_id="session1", + status=InferenceStatus.FAILURE, + error_message="simulated inference failure", + ) + eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5) + evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1) + + mock_eval_case = mocker.MagicMock(spec=EvalCase) + mock_eval_case.conversation = [invocation.model_copy(deep=True)] + mock_eval_case.conversation_scenario = None + mock_eval_case.session_input = None + mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case + + _, result = await eval_service._evaluate_single_inference_result( + inference_result=inference_result, evaluate_config=evaluate_config + ) + + assert isinstance(result, EvalCaseResult) + assert result.eval_id == "case1" + assert result.final_eval_status == EvalStatus.NOT_EVALUATED + assert result.overall_eval_metric_results == [] + assert result.eval_metric_result_per_invocation == [] + mock_eval_sets_manager.get_eval_case.assert_called_once_with( + app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1" + ) + + +@pytest.mark.asyncio +async def test_evaluate_single_inference_result_handles_missing_inferences( + eval_service, mock_eval_sets_manager, mocker +): + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="test user content.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="test final response.")] + ), + ) + inference_result = InferenceResult( + app_name="test_app", + eval_set_id="test_eval_set", + eval_case_id="case1", + inferences=None, + session_id="session1", + status=InferenceStatus.SUCCESS, + ) + eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5) + evaluate_config = 
EvaluateConfig(eval_metrics=[eval_metric], parallelism=1) + + mock_eval_case = mocker.MagicMock(spec=EvalCase) + mock_eval_case.conversation = [invocation.model_copy(deep=True)] + mock_eval_case.conversation_scenario = None + mock_eval_case.session_input = None + mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case + + _, result = await eval_service._evaluate_single_inference_result( + inference_result=inference_result, evaluate_config=evaluate_config + ) + + assert isinstance(result, EvalCaseResult) + assert result.eval_id == "case1" + assert result.final_eval_status == EvalStatus.NOT_EVALUATED + assert result.overall_eval_metric_results == [] + assert result.eval_metric_result_per_invocation == [] + mock_eval_sets_manager.get_eval_case.assert_called_once_with( + app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1" + ) + + @pytest.mark.asyncio async def test_evaluate_single_inference_result_for_conversation_scenario( eval_service, mock_eval_sets_manager, mocker
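Taken together, the eval changes above let a config entry point at a plain Python function: `get_eval_metrics_from_config` copies `code_config.name` onto the metric, and `_CustomMetricEvaluator` imports that dotted path and calls it with `(actual_invocations, expected_invocations, conversation_scenario)`, deriving pass/fail from the returned `overall_score` and the metric's threshold. A minimal sketch of such a function follows; the module name `my_metrics`, the function name, and the exact-match scoring are illustrative assumptions, not part of this change.

```python
# my_metrics.py -- hypothetical module, referenced from the eval config via
# CustomMetricConfig.code_config.name = "my_metrics.exact_match_score".
from typing import Optional

from google.adk.evaluation.eval_case import ConversationScenario
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.evaluator import EvaluationResult


def _final_text(invocation: Invocation) -> Optional[str]:
  """Returns the text of the first final-response part, if present."""
  content = invocation.final_response
  if content and content.parts and content.parts[0].text:
    return content.parts[0].text
  return None


def exact_match_score(
    actual_invocations: list[Invocation],
    expected_invocations: Optional[list[Invocation]],
    conversation_scenario: Optional[ConversationScenario] = None,
) -> EvaluationResult:
  """Scores the fraction of final responses that exactly match the reference."""
  if not expected_invocations:
    # No reference to compare against; leaving overall_score unset makes the
    # evaluator report EvalStatus.NOT_EVALUATED.
    return EvaluationResult()

  matches = sum(
      1
      for actual, expected in zip(actual_invocations, expected_invocations)
      if _final_text(actual) is not None
      and _final_text(actual) == _final_text(expected)
  )
  return EvaluationResult(overall_score=matches / len(expected_invocations))
```

Wired into the eval config, this would look roughly like `"criteria": {"exact_match_score": 0.8}` plus `"custom_metrics": {"exact_match_score": {"code_config": {"name": "my_metrics.exact_match_score"}}}`. The function may also be declared `async`, since `_CustomMetricEvaluator` awaits coroutine functions.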
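On the toolbox side, the sample README, the `ImportError` message, and `pyproject.toml` now converge on the optional `toolbox` extra, and `ToolboxToolset` no longer requires `toolset_name` or `tool_names`. A minimal usage sketch under those assumptions (a Toolbox server already running locally; the agent name, model, and instruction are illustrative):

```python
# Requires the optional dependency: pip install "google-adk[toolbox]"
from google.adk.agents import LlmAgent
from google.adk.tools.toolbox_toolset import ToolboxToolset

# toolset_name and tool_names are now optional; omitting both loads every
# tool exposed by the Toolbox server.
toolbox_toolset = ToolboxToolset(server_url="http://127.0.0.1:5000")

root_agent = LlmAgent(
    model="gemini-2.5-flash",  # illustrative model name
    name="toolbox_agent",
    instruction="Use the Toolbox tools to answer the user's questions.",
    tools=[toolbox_toolset],
)
```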