From ece944b7cf81c9370a776b769c694c21eb04d0e8 Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Wed, 1 Oct 2025 15:40:21 +1000 Subject: [PATCH 1/2] fix: Handle non-tty scenario correctly for aks agent evals --- src/aks-agent/HISTORY.rst | 6 + src/aks-agent/azext_aks_agent/agent/agent.py | 29 ++-- .../tests/evals/test_ask_agent.py | 129 ++++++++++++++++-- .../tests/latest/test_aks_agent.py | 6 +- src/aks-agent/setup.cfg | 8 ++ src/aks-agent/setup.py | 2 +- 6 files changed, 153 insertions(+), 27 deletions(-) diff --git a/src/aks-agent/HISTORY.rst b/src/aks-agent/HISTORY.rst index 1cebda08a46..fdd6be33207 100644 --- a/src/aks-agent/HISTORY.rst +++ b/src/aks-agent/HISTORY.rst @@ -12,6 +12,12 @@ To release a new version, please select a new version number (usually plus 1 to Pending +++++++ +1.0.0b5 ++++++++ +* Fix stdin reading hang in CI/CD pipelines by using select with timeout for non-interactive mode. +* Update pytest marker registration and fix datetime.utcnow() deprecation warning in tests. +* Improve test framework with real-time stderr output visibility and subprocess timeout. + 1.0.0b4 +++++++ * Fix the --aks-mcp flag to allow true/false values. diff --git a/src/aks-agent/azext_aks_agent/agent/agent.py b/src/aks-agent/azext_aks_agent/agent/agent.py index f8ec41002be..55e00c0859a 100644 --- a/src/aks-agent/azext_aks_agent/agent/agent.py +++ b/src/aks-agent/azext_aks_agent/agent/agent.py @@ -5,6 +5,7 @@ import logging import os +import select import sys from azext_aks_agent._consts import ( @@ -112,7 +113,7 @@ def _should_refresh_toolsets(requested_mode: str, user_refresh_request: bool) -> return False -# pylint: disable=too-many-locals +# pylint: disable=too-many-locals,too-many-branches def aks_agent( cmd, resource_group_name, @@ -168,13 +169,25 @@ def aks_agent( # Detect and read piped input piped_data = None - if not sys.stdin.isatty(): - piped_data = sys.stdin.read().strip() - if interactive: - console.print( - "[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]" - ) - interactive = False + # In non-interactive mode with a prompt, we shouldn't try to read stdin + # as it may hang in CI/CD environments. Only read stdin if: + # 1. Not a TTY (indicating piped input) + # 2. Interactive mode is enabled (allows stdin reading) + should_check_stdin = not sys.stdin.isatty() and interactive + + if should_check_stdin: + try: + # Use select with timeout to avoid hanging + # Check if data is available with 100ms timeout + if select.select([sys.stdin], [], [], 0.1)[0]: + piped_data = sys.stdin.read().strip() + console.print( + "[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]" + ) + interactive = False + except Exception: # pylint: disable=broad-exception-caught + # Continue without piped data if stdin reading fails + pass # Determine MCP mode and smart refresh logic use_aks_mcp = bool(use_aks_mcp) diff --git a/src/aks-agent/azext_aks_agent/tests/evals/test_ask_agent.py b/src/aks-agent/azext_aks_agent/tests/evals/test_ask_agent.py index 5b91cf5b7c7..c4e6beffb5d 100644 --- a/src/aks-agent/azext_aks_agent/tests/evals/test_ask_agent.py +++ b/src/aks-agent/azext_aks_agent/tests/evals/test_ask_agent.py @@ -8,7 +8,12 @@ import os import shlex import subprocess +import textwrap +import sys +import threading +from datetime import datetime, timezone from pathlib import Path +from time import perf_counter from typing import Iterable @@ -40,6 +45,26 @@ ITERATIONS = int(os.environ.get("ITERATIONS", "1")) BRAINTRUST_UPLOADER = BraintrustUploader(os.environ) + +def _log(message: str) -> None: + """Emit a timestamped log line that pytest `-s` will surface immediately.""" + timestamp = datetime.now(timezone.utc).isoformat(timespec="seconds") + print(f"[{timestamp}] {message}", flush=True) + + +def _summarise_command(parts: Iterable[str]) -> str: + """Return a shell-style command string for debugging output.""" + sequence = parts if isinstance(parts, list) else list(parts) + if hasattr(shlex, "join"): + return shlex.join(sequence) + # ``shlex.join`` was added in Python 3.8; keep a safe fallback just in case. + return " ".join(shlex.quote(part) for part in sequence) + + +def _preview_output(output: str, *, limit: int = 400) -> str: + """Provide a trimmed preview of command output for quick debugging.""" + return textwrap.shorten(output.strip(), width=limit, placeholder=" …") + pytestmark = [ pytest.mark.skipif( not RUN_LIVE, @@ -90,14 +115,59 @@ def _build_command(prompt: str, model: str, resource_group: str, cluster_name: s def _run_cli(command: Iterable[str], env: dict[str, str]) -> str: + command_list = list(command) + command_display = _summarise_command(command_list) + _log(f"Invoking AKS Agent CLI: {command_display}") + start = perf_counter() + + timeout_seconds = 600 # 10 minutes timeout + try: - result = subprocess.run( # noqa: S603 - list(command), - check=True, - capture_output=True, + # Use Popen for real-time output visibility + process = subprocess.Popen( # noqa: S603 + command_list, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, env=env, ) + + # Thread to print stderr in real-time + stderr_lines = [] + def print_stderr(): + for line in iter(process.stderr.readline, ''): + if line: + print(f"[STDERR] {line.rstrip()}", file=sys.stderr, flush=True) + stderr_lines.append(line) + + stderr_thread = threading.Thread(target=print_stderr, daemon=True) + stderr_thread.start() + + # Wait with timeout + try: + stdout, _ = process.communicate(timeout=timeout_seconds) + stderr_thread.join(timeout=1) + stderr = ''.join(stderr_lines) + except subprocess.TimeoutExpired: + process.kill() + stdout, stderr_remainder = process.communicate() + stderr = ''.join(stderr_lines) + (stderr_remainder or '') + _log(f"[ERROR] CLI command timed out after {timeout_seconds}s") + pytest.fail( + f"AKS Agent CLI call timed out after {timeout_seconds}s\n" + f"Command: {command_display}\n" + f"Stdout: {stdout}\n" + f"Stderr: {stderr}" + ) + + if process.returncode != 0: + raise subprocess.CalledProcessError( + process.returncode, command_list, stdout, stderr + ) + + result = subprocess.CompletedProcess( + command_list, process.returncode, stdout, stderr + ) except subprocess.CalledProcessError as exc: # pragma: no cover - live failure path output = exc.stdout or "" stderr = exc.stderr or "" @@ -109,13 +179,28 @@ def _run_cli(command: Iterable[str], env: dict[str, str]) -> str: f"Stdout: {output}\n" f"Stderr: {stderr}" ) + duration = perf_counter() - start + stdout_preview = _preview_output(result.stdout) + stderr_preview = _preview_output(result.stderr) if result.stderr else None + _log( + f"AKS Agent CLI completed in {duration:.1f}s with stdout preview: {stdout_preview}" + ) + if stderr_preview: + _log( + f"AKS Agent CLI stderr preview: {stderr_preview}" + ) return result.stdout def _run_commands( commands: list[str], env: dict[str, str], label: str, scenario: Scenario ) -> None: + if not commands: + _log(f"[{label}] {scenario.name}: no commands to run") + return for cmd in commands: + _log(f"[{label}] {scenario.name}: running shell command: {cmd}") + start = perf_counter() try: completed = subprocess.run( # noqa: S603 cmd, @@ -137,9 +222,25 @@ def _run_commands( f"Stderr: {stderr}" ) else: + duration = perf_counter() - start # Provide quick visibility into command results when debugging failures. if completed.stdout: - print(f"[{label}] {scenario.name}: {completed.stdout.strip()}") + stdout_preview = _preview_output(completed.stdout) + _log( + f"[{label}] {scenario.name}: succeeded in {duration:.1f}s; stdout preview: {stdout_preview}" + ) + else: + _log( + f"[{label}] {scenario.name}: succeeded in {duration:.1f}s; no stdout produced" + ) + if completed.stderr: + stderr_preview = _preview_output(completed.stderr) + _log( + f"[{label}] {scenario.name}: stderr preview: {stderr_preview}" + ) + _log( + f"[{label}] {scenario.name}: completed {len(commands)} command(s)" + ) def _scenario_params() -> list: @@ -165,6 +266,7 @@ def test_ask_agent_live( request: pytest.FixtureRequest, ) -> None: iteration_label = f"[iteration {iteration + 1}/{ITERATIONS}]" + _log(f"{iteration_label} starting scenario {scenario.name}") if RUN_LIVE: env = _load_env() @@ -178,7 +280,7 @@ def test_ask_agent_live( env.update(scenario.env_overrides) if iteration == 0 and scenario.before_commands and not aks_skip_setup: - print(f"{iteration_label} running setup commands for {scenario.name}") + _log(f"{iteration_label} running setup commands for {scenario.name}") _run_commands(scenario.before_commands, env, "setup", scenario) command = _build_command( @@ -188,7 +290,7 @@ def test_ask_agent_live( cluster_name=cluster_name, ) - print(f"{iteration_label} invoking AKS Agent CLI for {scenario.name}") + _log(f"{iteration_label} invoking AKS Agent CLI for {scenario.name}") try: raw_output = _run_cli(command, env) answer = "" @@ -216,11 +318,11 @@ def test_ask_agent_live( classifier_rationale = classifier_result.metadata.get( "rationale", "" ) - print( + _log( f"{iteration_label} classifier score for {scenario.name}: {classifier_score}" ) if classifier_score is None: - print( + _log( f"{iteration_label} classifier returned no score for {scenario.name}; falling back to substring checks" ) else: @@ -230,7 +332,7 @@ def test_ask_agent_live( if not error_message: error_message = "Classifier judged answer incorrect" else: - print( + _log( f"{iteration_label} classifier unavailable for {scenario.name}; falling back to substring checks" ) @@ -280,21 +382,21 @@ def test_ask_agent_live( if GENERATE_MOCKS: mock_path = save_mock_answer(scenario.mock_path, answer) - print(f"{iteration_label} [mock] wrote response to {mock_path}") + _log(f"{iteration_label} [mock] wrote response to {mock_path}") finally: if ( iteration == ITERATIONS - 1 and scenario.after_commands and not aks_skip_cleanup ): - print(f"{iteration_label} running cleanup commands for {scenario.name}") + _log(f"{iteration_label} running cleanup commands for {scenario.name}") _run_commands(scenario.after_commands, env, "cleanup", scenario) else: if GENERATE_MOCKS: pytest.fail("GENERATE_MOCKS requires RUN_LIVE=true") try: answer = load_mock_answer(scenario.mock_path) - print(f"{iteration_label} replayed mock response for {scenario.name}") + _log(f"{iteration_label} replayed mock response for {scenario.name}") except FileNotFoundError: pytest.skip(f"Mock response missing for scenario {scenario.name}; rerun with RUN_LIVE=true GENERATE_MOCKS=true") @@ -328,5 +430,6 @@ def test_ask_agent_live( _set_user_property(request, 'braintrust_root_span_id', str(root_span_id)) if url: _set_user_property(request, 'braintrust_experiment_url', str(url)) + _log(f"{iteration_label} completed scenario {scenario.name} (passed={passed})") if not passed: pytest.fail(f"Scenario {scenario.name}: {error}\nAI answer:\n{answer}") diff --git a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py index eb0082b840e..bde484b0ca2 100644 --- a/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py +++ b/src/aks-agent/azext_aks_agent/tests/latest/test_aks_agent.py @@ -6,12 +6,8 @@ import os import sys import unittest -from types import SimpleNamespace -from unittest.mock import MagicMock, Mock, call, patch +from unittest.mock import Mock, patch -from azext_aks_agent._consts import (CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY, - CONST_AGENT_NAME, - CONST_AGENT_NAME_ENV_KEY) from azext_aks_agent.agent.agent import aks_agent, init_log from azure.cli.core.util import CLIError diff --git a/src/aks-agent/setup.cfg b/src/aks-agent/setup.cfg index 3c6e79cf31d..6fc1bb2faa5 100644 --- a/src/aks-agent/setup.cfg +++ b/src/aks-agent/setup.cfg @@ -1,2 +1,10 @@ [bdist_wheel] universal=1 + +[tool:pytest] +markers = + easy: Regression AKS Agent evals that should always pass + medium: Stretch AKS Agent evals that may fail occasionally + hard: Challenging AKS Agent evals reserved for complex scenarios + kubernetes: AKS Agent evals that exercise Kubernetes-focused flows + aks_eval: AKS Agent evaluation tests diff --git a/src/aks-agent/setup.py b/src/aks-agent/setup.py index 1206b6f475f..34ebaab343e 100644 --- a/src/aks-agent/setup.py +++ b/src/aks-agent/setup.py @@ -9,7 +9,7 @@ from setuptools import find_packages, setup -VERSION = "1.0.0b4" +VERSION = "1.0.0b5" CLASSIFIERS = [ "Development Status :: 4 - Beta", From f60b8c532742eb778d9e17469bd7330ee987a05b Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Fri, 17 Oct 2025 11:03:33 +1100 Subject: [PATCH 2/2] do not release a new version --- src/aks-agent/HISTORY.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/aks-agent/HISTORY.rst b/src/aks-agent/HISTORY.rst index 1f0fba1f74f..ef7f92bca76 100644 --- a/src/aks-agent/HISTORY.rst +++ b/src/aks-agent/HISTORY.rst @@ -11,9 +11,6 @@ To release a new version, please select a new version number (usually plus 1 to Pending +++++++ - -1.0.0b7 -+++++++ * Fix stdin reading hang in CI/CD pipelines by using select with timeout for non-interactive mode. * Update pytest marker registration and fix datetime.utcnow() deprecation warning in tests. * Improve test framework with real-time stderr output visibility and subprocess timeout.