Skip to content

Commit 2e4457c

Browse files
fix(openai-agents): also emit spans for MCP tool calls done by the LLM (#4875)
### Description We cannot directly intercept MCP Tool calls, as they are done remotely by the LLM and not in the Agent itself. However, we see when such a tool call took place, so we can emit a zero-length span with the tool call specifics. It will start at the same time as the parent span. Closes https://linear.app/getsentry/issue/TET-1192/openai-agents-hosted-mcp-calls-cannot-be-wrapped-in-an-execute-tool <!-- CURSOR_SUMMARY --> --- > [!NOTE] > Emit execute_tool spans for MCP tool calls detected in agent results, with tool metadata, input/output (PII-gated), and error status. > > - **Tracing/Spans (openai_agents)**: > - Add `utils._create_mcp_execute_tool_spans` to emit `OP.GEN_AI_EXECUTE_TOOL` spans for MCP tool calls (`McpCall`) found in `result.output`. > - Sets `GEN_AI_TOOL_TYPE=mcp`, `GEN_AI_TOOL_NAME`, propagates input/output when PII allowed, and marks `SPANSTATUS.ERROR` on error. > - Spans start at the parent span's start time (zero-length representation of remote call). > - Wire into `spans/ai_client.update_ai_client_span` to create these tool spans after setting usage/input/output data. > - Update imports to include `SPANSTATUS` and `OP`. > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 96df8c1. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup> <!-- /CURSOR_SUMMARY --> --------- Co-authored-by: Ivana Kellyer <ivana.kellyer@sentry.io>
1 parent b838765 commit 2e4457c

File tree

3 files changed

+329
-1
lines changed

3 files changed

+329
-1
lines changed

sentry_sdk/integrations/openai_agents/spans/ai_client.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
     _set_input_data,
     _set_output_data,
     _set_usage_data,
+    _create_mcp_execute_tool_spans,
 )

 from typing import TYPE_CHECKING
@@ -38,3 +39,4 @@ def update_ai_client_span(span, agent, get_response_kwargs, result):
     _set_usage_data(span, result.usage)
     _set_input_data(span, get_response_kwargs)
     _set_output_data(span, result)
+    _create_mcp_execute_tool_spans(span, result)

sentry_sdk/integrations/openai_agents/utils.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
 import sentry_sdk
 from sentry_sdk.ai.utils import set_data_normalized
-from sentry_sdk.consts import SPANDATA
+from sentry_sdk.consts import SPANDATA, SPANSTATUS, OP
 from sentry_sdk.integrations import DidNotEnable
 from sentry_sdk.scope import should_send_default_pii
 from sentry_sdk.tracing_utils import set_span_errored
@@ -156,3 +156,27 @@ def _set_output_data(span, result):
         set_data_normalized(
             span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_messages["response"]
         )
+
+
+def _create_mcp_execute_tool_spans(span, result):
+    # type: (sentry_sdk.tracing.Span, agents.Result) -> None
+    for output in result.output:
+        if output.__class__.__name__ == "McpCall":
+            with sentry_sdk.start_span(
+                op=OP.GEN_AI_EXECUTE_TOOL,
+                description=f"execute_tool {output.name}",
+                start_timestamp=span.start_timestamp,
+            ) as execute_tool_span:
+                set_data_normalized(execute_tool_span, SPANDATA.GEN_AI_TOOL_TYPE, "mcp")
+                set_data_normalized(
+                    execute_tool_span, SPANDATA.GEN_AI_TOOL_NAME, output.name
+                )
+                if should_send_default_pii():
+                    execute_tool_span.set_data(
+                        SPANDATA.GEN_AI_TOOL_INPUT, output.arguments
+                    )
+                    execute_tool_span.set_data(
+                        SPANDATA.GEN_AI_TOOL_OUTPUT, output.output
+                    )
+                if output.error:
+                    execute_tool_span.set_status(SPANSTATUS.ERROR)

tests/integrations/openai_agents/test_openai_agents.py

Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
ModelSettings,
1616
)
1717
from agents.items import (
18+
McpCall,
1819
ResponseOutputMessage,
1920
ResponseOutputText,
2021
ResponseFunctionToolCall,
@@ -683,6 +684,307 @@ async def test_span_status_error(sentry_init, capture_events, test_agent):
683684
assert transaction["contexts"]["trace"]["status"] == "error"
684685

685686

687+
@pytest.mark.asyncio
688+
async def test_mcp_tool_execution_spans(sentry_init, capture_events, test_agent):
689+
"""
690+
Test that MCP (Model Context Protocol) tool calls create execute_tool spans.
691+
"""
692+
693+
with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
694+
with patch(
695+
"agents.models.openai_responses.OpenAIResponsesModel.get_response"
696+
) as mock_get_response:
697+
# Create a McpCall object
698+
mcp_call = McpCall(
699+
id="mcp_call_123",
700+
name="test_mcp_tool",
701+
arguments='{"query": "search term"}',
702+
output="MCP tool executed successfully",
703+
error=None,
704+
type="mcp_call",
705+
server_label="test_server",
706+
)
707+
708+
# Create a ModelResponse with an McpCall in the output
709+
mcp_response = ModelResponse(
710+
output=[mcp_call],
711+
usage=Usage(
712+
requests=1,
713+
input_tokens=10,
714+
output_tokens=5,
715+
total_tokens=15,
716+
),
717+
response_id="resp_mcp_123",
718+
)
719+
720+
# Final response after MCP tool execution
721+
final_response = ModelResponse(
722+
output=[
723+
ResponseOutputMessage(
724+
id="msg_final",
725+
type="message",
726+
status="completed",
727+
content=[
728+
ResponseOutputText(
729+
text="Task completed using MCP tool",
730+
type="output_text",
731+
annotations=[],
732+
)
733+
],
734+
role="assistant",
735+
)
736+
],
737+
usage=Usage(
738+
requests=1,
739+
input_tokens=15,
740+
output_tokens=10,
741+
total_tokens=25,
742+
),
743+
response_id="resp_final_123",
744+
)
745+
746+
mock_get_response.side_effect = [mcp_response, final_response]
747+
748+
sentry_init(
749+
integrations=[OpenAIAgentsIntegration()],
750+
traces_sample_rate=1.0,
751+
send_default_pii=True,
752+
)
753+
754+
events = capture_events()
755+
756+
await agents.Runner.run(
757+
test_agent,
758+
"Please use MCP tool",
759+
run_config=test_run_config,
760+
)
761+
762+
(transaction,) = events
763+
spans = transaction["spans"]
764+
765+
# Find the MCP execute_tool span
766+
mcp_tool_span = None
767+
for span in spans:
768+
if (
769+
span.get("description") == "execute_tool test_mcp_tool"
770+
and span.get("data", {}).get("gen_ai.tool.type") == "mcp"
771+
):
772+
mcp_tool_span = span
773+
break
774+
775+
# Verify the MCP tool span was created
776+
assert mcp_tool_span is not None, "MCP execute_tool span was not created"
777+
assert mcp_tool_span["description"] == "execute_tool test_mcp_tool"
778+
assert mcp_tool_span["data"]["gen_ai.tool.type"] == "mcp"
779+
assert mcp_tool_span["data"]["gen_ai.tool.name"] == "test_mcp_tool"
780+
assert mcp_tool_span["data"]["gen_ai.tool.input"] == '{"query": "search term"}'
781+
assert (
782+
mcp_tool_span["data"]["gen_ai.tool.output"] == "MCP tool executed successfully"
783+
)
784+
785+
# Verify no error status since error was None
786+
assert mcp_tool_span.get("tags", {}).get("status") != "error"
787+
788+
789+
@pytest.mark.asyncio
790+
async def test_mcp_tool_execution_with_error(sentry_init, capture_events, test_agent):
791+
"""
792+
Test that MCP tool calls with errors are tracked with error status.
793+
"""
794+
795+
with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
796+
with patch(
797+
"agents.models.openai_responses.OpenAIResponsesModel.get_response"
798+
) as mock_get_response:
799+
# Create a McpCall object with an error
800+
mcp_call_with_error = McpCall(
801+
id="mcp_call_error_123",
802+
name="failing_mcp_tool",
803+
arguments='{"query": "test"}',
804+
output=None,
805+
error="MCP tool execution failed",
806+
type="mcp_call",
807+
server_label="test_server",
808+
)
809+
810+
# Create a ModelResponse with a failing McpCall
811+
mcp_response = ModelResponse(
812+
output=[mcp_call_with_error],
813+
usage=Usage(
814+
requests=1,
815+
input_tokens=10,
816+
output_tokens=5,
817+
total_tokens=15,
818+
),
819+
response_id="resp_mcp_error_123",
820+
)
821+
822+
# Final response after error
823+
final_response = ModelResponse(
824+
output=[
825+
ResponseOutputMessage(
826+
id="msg_final",
827+
type="message",
828+
status="completed",
829+
content=[
830+
ResponseOutputText(
831+
text="The MCP tool encountered an error",
832+
type="output_text",
833+
annotations=[],
834+
)
835+
],
836+
role="assistant",
837+
)
838+
],
839+
usage=Usage(
840+
requests=1,
841+
input_tokens=15,
842+
output_tokens=10,
843+
total_tokens=25,
844+
),
845+
response_id="resp_final_error_123",
846+
)
847+
848+
mock_get_response.side_effect = [mcp_response, final_response]
849+
850+
sentry_init(
851+
integrations=[OpenAIAgentsIntegration()],
852+
traces_sample_rate=1.0,
853+
send_default_pii=True,
854+
)
855+
856+
events = capture_events()
857+
858+
await agents.Runner.run(
859+
test_agent,
860+
"Please use failing MCP tool",
861+
run_config=test_run_config,
862+
)
863+
864+
(transaction,) = events
865+
spans = transaction["spans"]
866+
867+
# Find the MCP execute_tool span with error
868+
mcp_tool_span = None
869+
for span in spans:
870+
if (
871+
span.get("description") == "execute_tool failing_mcp_tool"
872+
and span.get("data", {}).get("gen_ai.tool.type") == "mcp"
873+
):
874+
mcp_tool_span = span
875+
break
876+
877+
# Verify the MCP tool span was created with error status
878+
assert mcp_tool_span is not None, "MCP execute_tool span was not created"
879+
assert mcp_tool_span["description"] == "execute_tool failing_mcp_tool"
880+
assert mcp_tool_span["data"]["gen_ai.tool.type"] == "mcp"
881+
assert mcp_tool_span["data"]["gen_ai.tool.name"] == "failing_mcp_tool"
882+
assert mcp_tool_span["data"]["gen_ai.tool.input"] == '{"query": "test"}'
883+
assert mcp_tool_span["data"]["gen_ai.tool.output"] is None
884+
885+
# Verify error status was set
886+
assert mcp_tool_span["tags"]["status"] == "error"
887+
888+
889+
@pytest.mark.asyncio
890+
async def test_mcp_tool_execution_without_pii(sentry_init, capture_events, test_agent):
891+
"""
892+
Test that MCP tool input/output are not included when send_default_pii is False.
893+
"""
894+
895+
with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
896+
with patch(
897+
"agents.models.openai_responses.OpenAIResponsesModel.get_response"
898+
) as mock_get_response:
899+
# Create a McpCall object
900+
mcp_call = McpCall(
901+
id="mcp_call_pii_123",
902+
name="test_mcp_tool",
903+
arguments='{"query": "sensitive data"}',
904+
output="Result with sensitive info",
905+
error=None,
906+
type="mcp_call",
907+
server_label="test_server",
908+
)
909+
910+
# Create a ModelResponse with an McpCall
911+
mcp_response = ModelResponse(
912+
output=[mcp_call],
913+
usage=Usage(
914+
requests=1,
915+
input_tokens=10,
916+
output_tokens=5,
917+
total_tokens=15,
918+
),
919+
response_id="resp_mcp_123",
920+
)
921+
922+
# Final response
923+
final_response = ModelResponse(
924+
output=[
925+
ResponseOutputMessage(
926+
id="msg_final",
927+
type="message",
928+
status="completed",
929+
content=[
930+
ResponseOutputText(
931+
text="Task completed",
932+
type="output_text",
933+
annotations=[],
934+
)
935+
],
936+
role="assistant",
937+
)
938+
],
939+
usage=Usage(
940+
requests=1,
941+
input_tokens=15,
942+
output_tokens=10,
943+
total_tokens=25,
944+
),
945+
response_id="resp_final_123",
946+
)
947+
948+
mock_get_response.side_effect = [mcp_response, final_response]
949+
950+
sentry_init(
951+
integrations=[OpenAIAgentsIntegration()],
952+
traces_sample_rate=1.0,
953+
send_default_pii=False, # PII disabled
954+
)
955+
956+
events = capture_events()
957+
958+
await agents.Runner.run(
959+
test_agent,
960+
"Please use MCP tool",
961+
run_config=test_run_config,
962+
)
963+
964+
(transaction,) = events
965+
spans = transaction["spans"]
966+
967+
# Find the MCP execute_tool span
968+
mcp_tool_span = None
969+
for span in spans:
970+
if (
971+
span.get("description") == "execute_tool test_mcp_tool"
972+
and span.get("data", {}).get("gen_ai.tool.type") == "mcp"
973+
):
974+
mcp_tool_span = span
975+
break
976+
977+
# Verify the MCP tool span was created but without input/output
978+
assert mcp_tool_span is not None, "MCP execute_tool span was not created"
979+
assert mcp_tool_span["description"] == "execute_tool test_mcp_tool"
980+
assert mcp_tool_span["data"]["gen_ai.tool.type"] == "mcp"
981+
assert mcp_tool_span["data"]["gen_ai.tool.name"] == "test_mcp_tool"
982+
983+
# Verify input and output are not included when send_default_pii is False
984+
assert "gen_ai.tool.input" not in mcp_tool_span["data"]
985+
assert "gen_ai.tool.output" not in mcp_tool_span["data"]
986+
987+
686988
@pytest.mark.asyncio
687989
async def test_multiple_agents_asyncio(
688990
sentry_init, capture_events, test_agent, mock_model_response

0 commit comments

Comments
 (0)