From ece944b7cf81c9370a776b769c694c21eb04d0e8 Mon Sep 17 00:00:00 2001
From: Qi Ni <pomelonicky@gmail.com>
Date: Wed, 1 Oct 2025 15:40:21 +1000
Subject: [PATCH 1/2] fix: Handle non-tty scenario correctly for aks agent
 evals

---
 src/aks-agent/HISTORY.rst                     |   6 +
 src/aks-agent/azext_aks_agent/agent/agent.py  |  29 ++--
 .../tests/evals/test_ask_agent.py             | 129 ++++++++++++++++--
 .../tests/latest/test_aks_agent.py            |   6 +-
 src/aks-agent/setup.cfg                       |   8 ++
 src/aks-agent/setup.py                        |   2 +-
 6 files changed, 153 insertions(+), 27 deletions(-)

diff --git a/src/aks-agent/HISTORY.rst b/src/aks-agent/HISTORY.rst
index 1cebda08a46..fdd6be33207 100644
--- a/src/aks-agent/HISTORY.rst
+++ b/src/aks-agent/HISTORY.rst
@@ -12,6 +12,12 @@ To release a new version, please select a new version number (usually plus 1 to
 Pending
 +++++++
 
+1.0.0b5
++++++++
+* Fix stdin reading hang in CI/CD pipelines by using select with timeout for non-interactive mode.
+* Update pytest marker registration and fix datetime.utcnow() deprecation warning in tests.
+* Improve test framework with real-time stderr output visibility and subprocess timeout.
+
 1.0.0b4
 +++++++
 * Fix the --aks-mcp flag to allow true/false values.
diff --git a/src/aks-agent/azext_aks_agent/agent/agent.py b/src/aks-agent/azext_aks_agent/agent/agent.py
index f8ec41002be..55e00c0859a 100644
--- a/src/aks-agent/azext_aks_agent/agent/agent.py
+++ b/src/aks-agent/azext_aks_agent/agent/agent.py
@@ -5,6 +5,7 @@
 
 import logging
 import os
+import select
 import sys
 
 from azext_aks_agent._consts import (
@@ -112,7 +113,7 @@ def _should_refresh_toolsets(requested_mode: str, user_refresh_request: bool) ->
     return False
 
 
-# pylint: disable=too-many-locals
+# pylint: disable=too-many-locals,too-many-branches
 def aks_agent(
     cmd,
     resource_group_name,
@@ -168,13 +169,25 @@ def aks_agent(
 
         # Detect and read piped input
         piped_data = None
-        if not sys.stdin.isatty():
-            piped_data = sys.stdin.read().strip()
-            if interactive:
-                console.print(
-                    "[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]"
-                )
-                interactive = False
+        # Skip stdin entirely when interactive mode is off: reading it can hang
+        # in CI/CD environments. Only read stdin when BOTH conditions hold:
+        # 1. stdin is not a TTY (indicating piped input)
+        # 2. Interactive mode is enabled (it is turned off once piped data is read)
+        should_check_stdin = not sys.stdin.isatty() and interactive
+
+        if should_check_stdin:
+            try:
+                # Poll stdin for up to 100ms and read only if data is already
+                # available. NOTE: on Windows select() only accepts sockets (see except below).
+                if select.select([sys.stdin], [], [], 0.1)[0]:
+                    piped_data = sys.stdin.read().strip()
+                    console.print(
+                        "[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]"
+                    )
+                    interactive = False
+            except Exception:  # pylint: disable=broad-exception-caught
+                # Continue without piped data if stdin reading fails
+                pass
 
         # Determine MCP mode and smart refresh logic
         use_aks_mcp = bool(use_aks_mcp)
diff --git a/src/aks-agent/azext_aks_agent/tests/evals/test_ask_agent.py b/src/aks-agent/azext_aks_agent/tests/evals/test_ask_agent.py
index 5b91cf5b7c7..c4e6beffb5d 100644
--- a/src/aks-agent/azext_aks_agent/tests/evals/test_ask_agent.py
+++ b/src/aks-agent/azext_aks_agent/tests/evals/test_ask_agent.py
@@ -8,7 +8,12 @@
 import os
 import shlex
 import subprocess
+import textwrap
+import sys
+import threading
+from datetime import datetime, timezone
 from pathlib import Path
+from time import perf_counter
 from typing import Iterable
 
 
@@ -40,6 +45,26 @@
 ITERATIONS = int(os.environ.get("ITERATIONS", "1"))
 BRAINTRUST_UPLOADER = BraintrustUploader(os.environ)
 
+
+def _log(message: str) -> None:
+    """Emit a timestamped log line that pytest `-s` will surface immediately."""
+    timestamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
+    print(f"[{timestamp}] {message}", flush=True)
+
+
+def _summarise_command(parts: Iterable[str]) -> str:
+    """Return a shell-style command string for debugging output."""
+    sequence = parts if isinstance(parts, list) else list(parts)
+    if hasattr(shlex, "join"):
+        return shlex.join(sequence)
+    # ``shlex.join`` was added in Python 3.8; keep a safe fallback just in case.
+    return " ".join(shlex.quote(part) for part in sequence)
+
+
+def _preview_output(output: str, *, limit: int = 400) -> str:
+    """Provide a trimmed preview of command output for quick debugging."""
+    return textwrap.shorten(output.strip(), width=limit, placeholder=" …")
+
 pytestmark = [
     pytest.mark.skipif(
         not RUN_LIVE,
@@ -90,14 +115,59 @@ def _build_command(prompt: str, model: str, resource_group: str, cluster_name: s
 
 
 def _run_cli(command: Iterable[str], env: dict[str, str]) -> str:
+    command_list = list(command)
+    command_display = _summarise_command(command_list)
+    _log(f"Invoking AKS Agent CLI: {command_display}")
+    start = perf_counter()
+
+    timeout_seconds = 600  # 10 minutes timeout
+
     try:
-        result = subprocess.run(  # noqa: S603
-            list(command),
-            check=True,
-            capture_output=True,
+        # Use Popen for real-time output visibility
+        process = subprocess.Popen(  # noqa: S603
+            command_list,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
             text=True,
             env=env,
         )
+
+        # Thread to print stderr in real-time
+        stderr_lines = []
+        def print_stderr():
+            for line in iter(process.stderr.readline, ''):
+                if line:
+                    print(f"[STDERR] {line.rstrip()}", file=sys.stderr, flush=True)
+                    stderr_lines.append(line)
+
+        stderr_thread = threading.Thread(target=print_stderr, daemon=True)
+        stderr_thread.start()
+
+        # Wait with timeout. NOTE(review): communicate() also reads stderr, racing print_stderr.
+        try:
+            stdout, _ = process.communicate(timeout=timeout_seconds)
+            stderr_thread.join(timeout=1)
+            stderr = ''.join(stderr_lines)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            stdout, stderr_remainder = process.communicate()
+            stderr = ''.join(stderr_lines) + (stderr_remainder or '')
+            _log(f"[ERROR] CLI command timed out after {timeout_seconds}s")
+            pytest.fail(
+                f"AKS Agent CLI call timed out after {timeout_seconds}s\n"
+                f"Command: {command_display}\n"
+                f"Stdout: {stdout}\n"
+                f"Stderr: {stderr}"
+            )
+
+        if process.returncode != 0:
+            raise subprocess.CalledProcessError(
+                process.returncode, command_list, stdout, stderr
+            )
+
+        result = subprocess.CompletedProcess(
+            command_list, process.returncode, stdout, stderr
+        )
     except subprocess.CalledProcessError as exc:  # pragma: no cover - live failure path
         output = exc.stdout or ""
         stderr = exc.stderr or ""
@@ -109,13 +179,28 @@ def _run_cli(command: Iterable[str], env: dict[str, str]) -> str:
             f"Stdout: {output}\n"
             f"Stderr: {stderr}"
         )
+    duration = perf_counter() - start
+    stdout_preview = _preview_output(result.stdout)
+    stderr_preview = _preview_output(result.stderr) if result.stderr else None
+    _log(
+        f"AKS Agent CLI completed in {duration:.1f}s with stdout preview: {stdout_preview}"
+    )
+    if stderr_preview:
+        _log(
+            f"AKS Agent CLI stderr preview: {stderr_preview}"
+        )
     return result.stdout
 
 
 def _run_commands(
     commands: list[str], env: dict[str, str], label: str, scenario: Scenario
 ) -> None:
+    if not commands:
+        _log(f"[{label}] {scenario.name}: no commands to run")
+        return
     for cmd in commands:
+        _log(f"[{label}] {scenario.name}: running shell command: {cmd}")
+        start = perf_counter()
         try:
             completed = subprocess.run(  # noqa: S603
                 cmd,
@@ -137,9 +222,25 @@ def _run_commands(
                 f"Stderr: {stderr}"
             )
         else:
+            duration = perf_counter() - start
             # Provide quick visibility into command results when debugging failures.
             if completed.stdout:
-                print(f"[{label}] {scenario.name}: {completed.stdout.strip()}")
+                stdout_preview = _preview_output(completed.stdout)
+                _log(
+                    f"[{label}] {scenario.name}: succeeded in {duration:.1f}s; stdout preview: {stdout_preview}"
+                )
+            else:
+                _log(
+                    f"[{label}] {scenario.name}: succeeded in {duration:.1f}s; no stdout produced"
+                )
+            if completed.stderr:
+                stderr_preview = _preview_output(completed.stderr)
+                _log(
+                    f"[{label}] {scenario.name}: stderr preview: {stderr_preview}"
+                )
+    _log(
+        f"[{label}] {scenario.name}: completed {len(commands)} command(s)"
+    )
 
 
 def _scenario_params() -> list:
@@ -165,6 +266,7 @@ def test_ask_agent_live(
     request: pytest.FixtureRequest,
 ) -> None:
     iteration_label = f"[iteration {iteration + 1}/{ITERATIONS}]"
+    _log(f"{iteration_label} starting scenario {scenario.name}")
     if RUN_LIVE:
         env = _load_env()
 
@@ -178,7 +280,7 @@ def test_ask_agent_live(
             env.update(scenario.env_overrides)
 
         if iteration == 0 and scenario.before_commands and not aks_skip_setup:
-            print(f"{iteration_label} running setup commands for {scenario.name}")
+            _log(f"{iteration_label} running setup commands for {scenario.name}")
             _run_commands(scenario.before_commands, env, "setup", scenario)
 
         command = _build_command(
@@ -188,7 +290,7 @@ def test_ask_agent_live(
             cluster_name=cluster_name,
         )
 
-        print(f"{iteration_label} invoking AKS Agent CLI for {scenario.name}")
+        _log(f"{iteration_label} invoking AKS Agent CLI for {scenario.name}")
         try:
             raw_output = _run_cli(command, env)
             answer = ""
@@ -216,11 +318,11 @@ def test_ask_agent_live(
                     classifier_rationale = classifier_result.metadata.get(
                         "rationale", ""
                     )
-                    print(
+                    _log(
                         f"{iteration_label} classifier score for {scenario.name}: {classifier_score}"
                     )
                     if classifier_score is None:
-                        print(
+                        _log(
                             f"{iteration_label} classifier returned no score for {scenario.name}; falling back to substring checks"
                         )
                     else:
@@ -230,7 +332,7 @@ def test_ask_agent_live(
                             if not error_message:
                                 error_message = "Classifier judged answer incorrect"
                 else:
-                    print(
+                    _log(
                         f"{iteration_label} classifier unavailable for {scenario.name}; falling back to substring checks"
                     )
 
@@ -280,21 +382,21 @@ def test_ask_agent_live(
 
             if GENERATE_MOCKS:
                 mock_path = save_mock_answer(scenario.mock_path, answer)
-                print(f"{iteration_label} [mock] wrote response to {mock_path}")
+                _log(f"{iteration_label} [mock] wrote response to {mock_path}")
         finally:
             if (
                 iteration == ITERATIONS - 1
                 and scenario.after_commands
                 and not aks_skip_cleanup
             ):
-                print(f"{iteration_label} running cleanup commands for {scenario.name}")
+                _log(f"{iteration_label} running cleanup commands for {scenario.name}")
                 _run_commands(scenario.after_commands, env, "cleanup", scenario)
     else:
         if GENERATE_MOCKS:
             pytest.fail("GENERATE_MOCKS requires RUN_LIVE=true")
         try:
             answer = load_mock_answer(scenario.mock_path)
-            print(f"{iteration_label} replayed mock response for {scenario.name}")
+            _log(f"{iteration_label} replayed mock response for {scenario.name}")
         except FileNotFoundError:
             pytest.skip(f"Mock response missing for scenario {scenario.name}; rerun with RUN_LIVE=true GENERATE_MOCKS=true")
 
@@ -328,5 +430,6 @@ def test_ask_agent_live(
                 _set_user_property(request, 'braintrust_root_span_id', str(root_span_id))
             if url:
                 _set_user_property(request, 'braintrust_experiment_url', str(url))
+        _log(f"{iteration_label} completed scenario {scenario.name} (passed={passed})")
         if not passed:
             pytest.fail(f"Scenario {scenario.name}: {error}\nAI answer:\n{answer}")
diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py
index eb0082b840e..bde484b0ca2 100644
--- a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py
+++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py
@@ -6,12 +6,8 @@
 import os
 import sys
 import unittest
-from types import SimpleNamespace
-from unittest.mock import MagicMock, Mock, call, patch
+from unittest.mock import Mock, patch
 
-from azext_aks_agent._consts import (CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY,
-                                     CONST_AGENT_NAME,
-                                     CONST_AGENT_NAME_ENV_KEY)
 from azext_aks_agent.agent.agent import aks_agent, init_log
 from azure.cli.core.util import CLIError
 
diff --git a/src/aks-agent/setup.cfg b/src/aks-agent/setup.cfg
index 3c6e79cf31d..6fc1bb2faa5 100644
--- a/src/aks-agent/setup.cfg
+++ b/src/aks-agent/setup.cfg
@@ -1,2 +1,10 @@
 [bdist_wheel]
 universal=1
+
+[tool:pytest]
+markers =
+    easy: Regression AKS Agent evals that should always pass
+    medium: Stretch AKS Agent evals that may fail occasionally
+    hard: Challenging AKS Agent evals reserved for complex scenarios
+    kubernetes: AKS Agent evals that exercise Kubernetes-focused flows
+    aks_eval: AKS Agent evaluation tests
diff --git a/src/aks-agent/setup.py b/src/aks-agent/setup.py
index 1206b6f475f..34ebaab343e 100644
--- a/src/aks-agent/setup.py
+++ b/src/aks-agent/setup.py
@@ -9,7 +9,7 @@
 
 from setuptools import find_packages, setup
 
-VERSION = "1.0.0b4"
+VERSION = "1.0.0b5"
 
 CLASSIFIERS = [
     "Development Status :: 4 - Beta",

From f60b8c532742eb778d9e17469bd7330ee987a05b Mon Sep 17 00:00:00 2001
From: Qi Ni <niqi@microsoft.com>
Date: Fri, 17 Oct 2025 11:03:33 +1100
Subject: [PATCH 2/2] do not release a new version

---
 src/aks-agent/HISTORY.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/aks-agent/HISTORY.rst b/src/aks-agent/HISTORY.rst
index 1f0fba1f74f..ef7f92bca76 100644
--- a/src/aks-agent/HISTORY.rst
+++ b/src/aks-agent/HISTORY.rst
@@ -11,9 +11,6 @@ To release a new version, please select a new version number (usually plus 1 to
 
 Pending
 +++++++
-
-1.0.0b7
-+++++++
 * Fix stdin reading hang in CI/CD pipelines by using select with timeout for non-interactive mode.
 * Update pytest marker registration and fix datetime.utcnow() deprecation warning in tests.
 * Improve test framework with real-time stderr output visibility and subprocess timeout.