From 6038dbbddcb4b438603f3fdeaf0ac0d34a313619 Mon Sep 17 00:00:00 2001
From: majiayu000 <1835304752@qq.com>
Date: Mon, 22 Dec 2025 18:47:22 +0800
Subject: [PATCH] fix: clean internal format markers from agent output when
 parsing fails

When parsing of an LLM response fails, format_answer() falls back to
returning the raw answer as the agent's output. That raw text may
contain internal ReAct format markers (Thought:, Action:,
Action Input:, Observation:) that should not appear in user-facing
output.

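- Add _clean_raw_output() to drop lines that start with an internal
  format marker
- Return only the text after the last "Final Answer:" if one is present
- Fall back to the raw answer if cleaning would leave nothing
- Keep the original text in AgentFinish.text for debugging; only
  AgentFinish.output is cleaned
- Add tests for the cleaning function and the format_answer() fallback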

Fixes #3873
---
 .../src/crewai/utilities/agent_utils.py       |  51 ++++++-
 .../tests/utilities/test_agent_utils.py       | 127 ++++++++++++++++++
 2 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 lib/crewai/tests/utilities/test_agent_utils.py

diff --git a/lib/crewai/src/crewai/utilities/agent_utils.py b/lib/crewai/src/crewai/utilities/agent_utils.py
index 973ad55960..a68f654a4b 100644
--- a/lib/crewai/src/crewai/utilities/agent_utils.py
+++ b/lib/crewai/src/crewai/utilities/agent_utils.py
@@ -198,6 +198,53 @@ def format_message_for_llm(
     return {"role": role, "content": prompt}
 
 
+def _clean_raw_output(answer: str) -> str:
+    """Clean raw LLM output by removing internal ReAct format markers.
+
+    When parsing fails, the raw answer may contain internal format markers
+    like 'Thought:', 'Action:', 'Action Input:' that should not appear in
+    the final user-facing output.
+
+    Args:
+        answer: The raw response from the LLM
+
+    Returns:
+        Cleaned output with internal format markers removed
+    """
+    # Check if answer contains "Final Answer:" and extract that part
+    if "Final Answer:" in answer:
+        # Keep only the text after the last "Final Answer:"
+        parts = answer.split("Final Answer:")
+        if len(parts) > 1:
+            return parts[-1].strip()
+
+    # Otherwise drop every line that starts with an internal format marker
+    lines = answer.split("\n")
+    cleaned_lines = []
+    skip_until_content = False
+
+    for line in lines:
+        stripped = line.strip()
+        # Skip lines that are internal format markers
+        if stripped.startswith("Thought:"):
+            skip_until_content = True
+            continue
+        if stripped.startswith("Action:") or stripped.startswith("Action Input:"):
+            skip_until_content = True
+            continue
+        if stripped.startswith("Observation:"):
+            skip_until_content = True
+            continue
+
+        if skip_until_content and stripped:
+            skip_until_content = False
+        if not skip_until_content:
+            cleaned_lines.append(line)
+
+    result = "\n".join(cleaned_lines).strip()
+    return result if result else answer
+
+
 def format_answer(answer: str) -> AgentAction | AgentFinish:
     """Format a response from the LLM into an AgentAction or AgentFinish.
 
@@ -210,9 +257,11 @@ def format_answer(answer: str) -> AgentAction | AgentFinish:
     try:
         return parse(answer)
     except Exception:
+        # Clean the output to remove internal format markers
+        cleaned_output = _clean_raw_output(answer)
         return AgentFinish(
             thought="Failed to parse LLM response",
-            output=answer,
+            output=cleaned_output,
             text=answer,
         )
 
diff --git a/lib/crewai/tests/utilities/test_agent_utils.py b/lib/crewai/tests/utilities/test_agent_utils.py
new file mode 100644
index 0000000000..9155e47824
--- /dev/null
+++ b/lib/crewai/tests/utilities/test_agent_utils.py
@@ -0,0 +1,127 @@
+"""Tests for agent utility functions."""
+
+import pytest
+
+from crewai.agents.parser import AgentFinish
+from crewai.utilities.agent_utils import _clean_raw_output, format_answer
+
+
+class TestCleanRawOutput:
+    """Tests for _clean_raw_output function."""
+
+    def test_extracts_final_answer_when_present(self):
+        """Test that Final Answer content is properly extracted."""
+        answer = """Thought: I need to process this request.
+Action: search
+Action Input: {"query": "test"}
+Observation: search results here
+Thought: Now I have the answer.
+Final Answer: The search returned positive results."""
+
+        result = _clean_raw_output(answer)
+        assert result == "The search returned positive results."
+
+    def test_removes_thought_prefix(self):
+        """Test that Thought: prefix lines are removed."""
+        answer = """Thought: I'm thinking about the problem.
+This is the actual content.
+More content here."""
+
+        result = _clean_raw_output(answer)
+        assert "Thought:" not in result
+        assert "This is the actual content." in result
+
+    def test_removes_action_lines(self):
+        """Test that Action: and Action Input: lines are removed."""
+        answer = """Some content here.
+Action: tool_name
+Action Input: {"param": "value"}
+More content after."""
+
+        result = _clean_raw_output(answer)
+        assert "Action:" not in result
+        assert "Action Input:" not in result
+        assert "Some content here." in result
+
+    def test_removes_observation_lines(self):
+        """Test that Observation: lines are removed."""
+        answer = """Content before.
+Observation: tool output here
+Content after observation."""
+
+        result = _clean_raw_output(answer)
+        assert "Observation:" not in result
+        assert "Content before." in result
+
+    def test_returns_original_if_no_content_left(self):
+        """Test that original is returned if cleaning removes everything."""
+        answer = """Thought: Only thought here
+Action: some_action"""
+
+        result = _clean_raw_output(answer)
+        # When cleaning results in empty content, return original
+        assert result == answer
+
+    def test_handles_plain_text(self):
+        """Test that plain text without markers is returned as-is."""
+        answer = "This is a simple response without any markers."
+        result = _clean_raw_output(answer)
+        assert result == answer
+
+    def test_handles_multiline_final_answer(self):
+        """Test that multiline Final Answer is properly extracted."""
+        answer = """Thought: Processing...
+Final Answer: This is line one.
+This is line two.
+And line three."""
+
+        result = _clean_raw_output(answer)
+        assert "This is line one." in result
+        assert "This is line two." in result
+        assert "And line three." in result
+
+
+class TestFormatAnswer:
+    """Tests for format_answer function."""
+
+    def test_returns_agent_finish_on_parse_failure(self):
+        """Test that AgentFinish is returned when parsing fails."""
+        # Invalid format that will fail parsing
+        answer = """Thought: Some thought here
+This is not a valid format."""
+
+        result = format_answer(answer)
+        assert isinstance(result, AgentFinish)
+        assert result.thought == "Failed to parse LLM response"
+
+    def test_cleans_output_on_parse_failure(self):
+        """Test that output is cleaned when parsing fails."""
+        answer = """Thought: I need to respond.
+Action: invalid_action
+The actual response content here."""
+
+        result = format_answer(answer)
+        assert isinstance(result, AgentFinish)
+        # The cleaned output should not contain internal markers
+        assert "Thought:" not in result.output
+        assert "Action:" not in result.output
+
+    def test_preserves_original_text(self):
+        """Test that original text is preserved in the text field."""
+        answer = """Thought: Some thought.
+Action: tool
+The response."""
+
+        result = format_answer(answer)
+        assert isinstance(result, AgentFinish)
+        # Original text should be preserved
+        assert result.text == answer
+
+    def test_valid_final_answer_format(self):
+        """Test that valid Final Answer format is properly parsed."""
+        answer = """Thought: I have the answer.
+Final Answer: This is the correct response."""
+
+        result = format_answer(answer)
+        assert isinstance(result, AgentFinish)
+        assert result.output == "This is the correct response."