From 81eaeb5eba6d40cde0cf6147d96921ed1bf7bb31 Mon Sep 17 00:00:00 2001
From: Google Team Member
Date: Fri, 16 Jan 2026 15:47:07 -0800
Subject: [PATCH] fix: Remove custom metadata from A2A response events

PiperOrigin-RevId: 857331378
---
 src/google/adk/agents/remote_a2a_agent.py |   2 -
 .../adk/evaluation/local_eval_service.py  |  41 -----
 .../evaluation/test_local_eval_service.py | 163 ------------------
 3 files changed, 206 deletions(-)

diff --git a/src/google/adk/agents/remote_a2a_agent.py b/src/google/adk/agents/remote_a2a_agent.py
index 0ac47c0236..dbc51c6dbf 100644
--- a/src/google/adk/agents/remote_a2a_agent.py
+++ b/src/google/adk/agents/remote_a2a_agent.py
@@ -504,8 +504,6 @@ async def _handle_a2a_response(
           invocation_id=ctx.invocation_id,
           branch=ctx.branch,
       )
-      event.custom_metadata = event.custom_metadata or {}
-      event.custom_metadata[A2A_METADATA_PREFIX + "response"] = True
       return event
     except A2AClientError as e:
       logger.error("Failed to handle A2A response: %s", e)
diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py
index 5b8cd21690..7031266e27 100644
--- a/src/google/adk/evaluation/local_eval_service.py
+++ b/src/google/adk/evaluation/local_eval_service.py
@@ -268,22 +268,6 @@ async def _evaluate_single_inference_result(
         else 'test_user_id'
     )
 
-    if (
-        inference_result.status == InferenceStatus.FAILURE
-        or inference_result.inferences is None
-    ):
-      logger.error(
-          'Evaluation attempted on failed inference for eval case `%s`.'
-          ' Error: %s',
-          inference_result.eval_case_id,
-          inference_result.error_message,
-      )
-      eval_case_result = await self._build_not_evaluated_eval_case_result(
-          inference_result=inference_result,
-          user_id=user_id,
-      )
-      return (inference_result, eval_case_result)
-
     if eval_case.conversation_scenario is None and len(
         inference_result.inferences
     ) != len(eval_case.conversation):
@@ -480,31 +464,6 @@ def _generate_final_eval_status(
 
     return final_eval_status
 
-  async def _build_not_evaluated_eval_case_result(
-      self,
-      *,
-      inference_result: InferenceResult,
-      user_id: str,
-  ) -> EvalCaseResult:
-    """Constructs an EvalCaseResult for cases that could not be evaluated."""
-    session_details = await self._session_service.get_session(
-        app_name=inference_result.app_name,
-        user_id=user_id,
-        session_id=inference_result.session_id,
-    )
-
-    return EvalCaseResult(
-        eval_set_file=inference_result.eval_set_id,
-        eval_set_id=inference_result.eval_set_id,
-        eval_id=inference_result.eval_case_id,
-        final_eval_status=EvalStatus.NOT_EVALUATED,
-        overall_eval_metric_results=[],
-        eval_metric_result_per_invocation=[],
-        session_id=inference_result.session_id,
-        session_details=session_details,
-        user_id=user_id,
-    )
-
   async def _perform_inference_single_eval_item(
       self,
       app_name: str,
diff --git a/tests/unittests/evaluation/test_local_eval_service.py b/tests/unittests/evaluation/test_local_eval_service.py
index 4ba91711ee..08ef2aa8b0 100644
--- a/tests/unittests/evaluation/test_local_eval_service.py
+++ b/tests/unittests/evaluation/test_local_eval_service.py
@@ -325,82 +325,6 @@ async def test_evaluate_success(
   assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2
 
 
-@pytest.mark.asyncio
-async def test_evaluate_skips_failed_inference_results(
-    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
-):
-  invocation = Invocation(
-      user_content=genai_types.Content(
-          parts=[genai_types.Part(text="test user content.")]
-      ),
-      final_response=genai_types.Content(
-          parts=[genai_types.Part(text="test final response.")]
-      ),
-  )
-  inference_results = [
-      InferenceResult(
-          app_name="test_app",
-          eval_set_id="test_eval_set",
-          eval_case_id="case_failure",
-          inferences=None,
-          session_id="session_fail",
-          status=InferenceStatus.FAILURE,
-          error_message="simulated failure",
-      ),
-      InferenceResult(
-          app_name="test_app",
-          eval_set_id="test_eval_set",
-          eval_case_id="case_success",
-          inferences=[invocation.model_copy(deep=True)],
-          session_id="session_success",
-          status=InferenceStatus.SUCCESS,
-      ),
-      InferenceResult(
-          app_name="test_app",
-          eval_set_id="test_eval_set",
-          eval_case_id="case_unknown",
-          inferences=[invocation.model_copy(deep=True)],
-          session_id="session_unknown",
-          status=InferenceStatus.UNKNOWN,
-      ),
-  ]
-  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
-  evaluate_request = EvaluateRequest(
-      inference_results=inference_results,
-      evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
-  )
-
-  mock_eval_case = mocker.MagicMock(spec=EvalCase)
-  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
-  mock_eval_case.conversation_scenario = None
-  mock_eval_case.session_input = None
-  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
-
-  results = []
-  async for result in eval_service.evaluate(evaluate_request):
-    results.append(result)
-
-  assert len(results) == 3
-  results_by_case = {result.eval_id: result for result in results}
-
-  failure_result = results_by_case["case_failure"]
-  assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
-  assert failure_result.overall_eval_metric_results == []
-  assert failure_result.eval_metric_result_per_invocation == []
-
-  for case_id in ["case_success", "case_unknown"]:
-    case_result = results_by_case[case_id]
-    assert case_result.final_eval_status == EvalStatus.PASSED
-    assert len(case_result.overall_eval_metric_results) == 1
-    assert (
-        case_result.overall_eval_metric_results[0].metric_name == "fake_metric"
-    )
-    assert case_result.overall_eval_metric_results[0].score == 0.9
-
-  assert mock_eval_sets_manager.get_eval_case.call_count == 3
-  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
-
-
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
     eval_service,
@@ -494,93 +418,6 @@ async def test_evaluate_single_inference_result(
   assert metric_result.eval_status == EvalStatus.PASSED
 
 
-@pytest.mark.asyncio
-async def test_evaluate_single_inference_result_handles_failed_inference(
-    eval_service, mock_eval_sets_manager, mocker
-):
-  invocation = Invocation(
-      user_content=genai_types.Content(
-          parts=[genai_types.Part(text="test user content.")]
-      ),
-      final_response=genai_types.Content(
-          parts=[genai_types.Part(text="test final response.")]
-      ),
-  )
-  inference_result = InferenceResult(
-      app_name="test_app",
-      eval_set_id="test_eval_set",
-      eval_case_id="case1",
-      inferences=None,
-      session_id="session1",
-      status=InferenceStatus.FAILURE,
-      error_message="simulated inference failure",
-  )
-  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
-  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
-
-  mock_eval_case = mocker.MagicMock(spec=EvalCase)
-  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
-  mock_eval_case.conversation_scenario = None
-  mock_eval_case.session_input = None
-  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
-
-  _, result = await eval_service._evaluate_single_inference_result(
-      inference_result=inference_result, evaluate_config=evaluate_config
-  )
-
-  assert isinstance(result, EvalCaseResult)
-  assert result.eval_id == "case1"
-  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
-  assert result.overall_eval_metric_results == []
-  assert result.eval_metric_result_per_invocation == []
-  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
-      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
-  )
-
-
-@pytest.mark.asyncio
-async def test_evaluate_single_inference_result_handles_missing_inferences(
-    eval_service, mock_eval_sets_manager, mocker
-):
-  invocation = Invocation(
-      user_content=genai_types.Content(
-          parts=[genai_types.Part(text="test user content.")]
-      ),
-      final_response=genai_types.Content(
-          parts=[genai_types.Part(text="test final response.")]
-      ),
-  )
-  inference_result = InferenceResult(
-      app_name="test_app",
-      eval_set_id="test_eval_set",
-      eval_case_id="case1",
-      inferences=None,
-      session_id="session1",
-      status=InferenceStatus.SUCCESS,
-  )
-  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
-  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
-
-  mock_eval_case = mocker.MagicMock(spec=EvalCase)
-  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
-  mock_eval_case.conversation_scenario = None
-  mock_eval_case.session_input = None
-  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
-
-  _, result = await eval_service._evaluate_single_inference_result(
-      inference_result=inference_result, evaluate_config=evaluate_config
-  )
-
-  assert isinstance(result, EvalCaseResult)
-  assert result.eval_id == "case1"
-  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
-  assert result.overall_eval_metric_results == []
-  assert result.eval_metric_result_per_invocation == []
-  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
-      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
-  )
-
-
 @pytest.mark.asyncio
 async def test_evaluate_single_inference_result_for_conversation_scenario(
     eval_service, mock_eval_sets_manager, mocker