From 084390de771f26e568e18f472986cf749402f55e Mon Sep 17 00:00:00 2001 From: Dipesh Mittal Date: Mon, 16 Feb 2026 17:59:20 +0530 Subject: [PATCH 1/2] added benchmarking changes for drdroid agent --- benchmarks/.gitignore | 10 + benchmarks/README.md | 442 ++++++++++ benchmarks/__init__.py | 54 ++ benchmarks/agent.py | 688 +++++++++++++++ benchmarks/config.py | 228 +++++ benchmarks/config/.gitkeep | 0 benchmarks/config/credentials.yaml.template | 78 ++ benchmarks/config/mcp_servers.json.template | 42 + benchmarks/dashboard.py | 652 ++++++++++++++ benchmarks/executor.py | 817 ++++++++++++++++++ benchmarks/reporter.py | 626 ++++++++++++++ benchmarks/results/.gitkeep | 0 .../01_how_many_pods/test_case.yaml | 12 +- .../02_what_is_wrong_with_pod/test_case.yaml | 2 +- .../04_related_k8s_events/test_case.yaml | 2 +- .../05_image_version/test_case.yaml | 2 +- .../07_high_latency/test_case.yaml | 2 +- .../09_crashpod/test_case.yaml | 2 +- .../10_image_pull_backoff/test_case.yaml | 2 +- .../11_init_containers/test_case.yaml | 2 +- .../12_job_crashing/test_case.yaml | 6 +- 21 files changed, 3653 insertions(+), 16 deletions(-) create mode 100644 benchmarks/.gitignore create mode 100644 benchmarks/README.md create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/agent.py create mode 100644 benchmarks/config.py create mode 100644 benchmarks/config/.gitkeep create mode 100644 benchmarks/config/credentials.yaml.template create mode 100644 benchmarks/config/mcp_servers.json.template create mode 100644 benchmarks/dashboard.py create mode 100644 benchmarks/executor.py create mode 100644 benchmarks/reporter.py create mode 100644 benchmarks/results/.gitkeep diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 0000000000..e3ee75b93b --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,10 @@ +# Credentials (contains secrets) +config/credentials.yaml +config/mcp_servers.json + +# Results directory (can be large) +results/*.json + +# Keep the directories +!config/.gitkeep +!results/.gitkeep diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000000..951fe3cc94 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,442 @@ +# Benchmark Test Suite + +A comprehensive test suite for evaluating LLM models against HolmesGPT test cases. +Results are tracked by **model** and **test case (use case)** for easy comparison. + +## Directory Structure + +``` +benchmarks/ +├── config/ +│ ├── credentials.yaml # Your credentials (git-ignored) +│ └── credentials.yaml.template # Template for credentials +├── results/ # Test results (JSON files) +├── agent.py # Agent implementations +├── executor.py # Test execution engine +├── config.py # Configuration management +├── reporter.py # CLI report generation +├── dashboard.py # Streamlit dashboard +└── README.md +``` + +## Quick Start + +```bash +# 1. Setup credentials +cp benchmarks/config/credentials.yaml.template benchmarks/config/credentials.yaml +# Edit credentials.yaml with your API keys + +# 2. List available tests +python benchmarks/executor.py --list-tests + +# 3. Run a test (model is REQUIRED) +python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods + +# 4. 
View results (choose one) +python benchmarks/reporter.py --summary # CLI summary +python benchmarks/reporter.py --compare-models # Model comparison +streamlit run benchmarks/dashboard.py # Interactive dashboard +``` + +## Running Tests + +### Single Test +```bash +python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods +``` + +### Multiple Tests +```bash +python benchmarks/executor.py --model gpt5.2 \ + --test-id 01_how_many_pods \ + --test-id 02_what_is_wrong_with_pod +``` + +### All Tests +```bash +python benchmarks/executor.py --model sonnet4.5 --all +``` + +### Tests by Tag +```bash +# Run all kubernetes tests +python benchmarks/executor.py --model sonnet4.5 --tag kubernetes + +# Run easy tests +python benchmarks/executor.py --model gpt5.2 --tag easy + +# Multiple tags (OR logic) +python benchmarks/executor.py --model sonnet4.5 --tag kubernetes --tag prometheus +``` + +### Skip Setup/Cleanup +```bash +# Skip infrastructure setup (useful for debugging) +python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods --skip-setup + +# Skip cleanup (keep infrastructure running) +python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods --skip-cleanup +``` + +## Available Agents + +| Agent | Description | +|-------|-------------| +| `drdroid` | DrDroid Investigation API (default) | +| `claudecode` | Local Claude Code CLI with read-only kubectl | +| `holmes` | HolmesGPT ToolCallingLLM | +| `openai` | Simple OpenAI completion (no tools) | + +### Claude Code Agent + +The `claudecode` agent runs prompts through your local Claude Code CLI installation. +It's restricted to read-only kubectl commands for safe investigation. + +```bash +# Run with Claude Code agent +python benchmarks/executor.py --model claude-sonnet --agent claudecode --test-id 01_how_many_pods + +# With custom model +CLAUDE_MODEL=claude-sonnet-4-20250514 python benchmarks/executor.py --model sonnet4 --agent claudecode --all +``` + +**Requirements:** +- Claude Code CLI installed and authenticated (`claude` command in PATH) +- kubectl configured with appropriate cluster context + +**Restrictions (enforced via system prompt):** +- Only read-only kubectl commands allowed: `get`, `describe`, `logs`, `top`, `explain`, `api-resources`, `cluster-info` +- Write commands forbidden: `apply`, `create`, `delete`, `edit`, `patch`, `exec`, etc. + +### Adding a Custom Agent + +Edit `benchmarks/agent.py`: + +```python +@register_agent("my_agent") +def my_custom_agent(test_case: TestCase) -> AgentResult: + """My custom agent implementation.""" + + # Your agent logic here + response = call_my_api(test_case.user_prompt) + + return AgentResult( + output=response, + tool_calls=["tool1", "tool2"], # optional + metadata={"custom": "data"}, # optional + ) +``` + +## Credentials Configuration + +Create `benchmarks/config/credentials.yaml`: + +```yaml +# Kubernetes +kubernetes: + kubeconfig: ~/.kube/config + context: my-cluster + +# Monitoring tools +datadog: + api_key: your-api-key + app_key: your-app-key + +prometheus: + url: http://localhost:9090 + +grafana: + url: http://localhost:3000 + api_key: your-api-key + +# LLM +openai: + api_key: sk-... + +# Judge +judge: + model: gpt-4.1 + +# Custom +custom: + drdroid: + api_url: http://localhost:8000 + api_key: your-key +``` + +Environment variables override file values: +- `OPENAI_API_KEY` +- `CLASSIFIER_MODEL` +- `DRDROID_API_URL` +- `DRDROID_API_KEY` +- etc. 
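+
+For example, a one-off run can point the suite at a different judge model or DrDroid endpoint without editing `credentials.yaml` (the values below are illustrative placeholders):
+
+```bash
+export OPENAI_API_KEY=sk-...
+export CLASSIFIER_MODEL=gpt-4.1
+export DRDROID_API_URL=http://localhost:8000
+export DRDROID_API_KEY=your-key
+
+python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods
+```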
+ +## Results Storage + +Each test run is saved to `benchmarks/results/` as a JSON file named by model: + +``` +results/ +├── sonnet4.5_01_how_many_pods_20260130_163000.json +├── sonnet4.5_02_what_is_wrong_with_pod_20260130_163100.json +├── gpt5.2_01_how_many_pods_20260130_164000.json +└── ... +``` + +### Result File Format + +```json +{ + "test_id": "01_how_many_pods", + "agent": "drdroid", + "model": "sonnet4.5", + "run_id": "sonnet4.5_01_how_many_pods_20260130_163000", + "status": "passed", + "user_prompt": "How many pods are in the app-01 namespace?", + "expected_output": ["There are 14 pods in the app-01 namespace"], + "actual_output": "There are 14 pods running in namespace app-01.", + "score": 1.0, + "judge_rationale": "The output correctly states 14 pods...", + "judge_model": "gpt-4.1", + "setup_time": 45.2, + "agent_time": 3.5, + "judge_time": 2.1, + "cleanup_time": 5.0, + "total_time": 55.8, + "tool_calls": ["kubectl_get_pods"], + "agent_metadata": { + "investigation_id": "inv-123", + "tokens": 150 + }, + "started_at": "2026-01-30T16:30:00", + "completed_at": "2026-01-30T16:30:55" +} +``` + +## Generating Reports + +### Interactive Dashboard (Recommended) + +```bash +streamlit run benchmarks/dashboard.py +``` + +The dashboard provides: + +- **Overview**: Total runs, pass rate, cost, tokens summary +- **Model Comparison**: Side-by-side comparison of all models +- **Test Cases**: Analysis by test case with per-model breakdown +- **Use Case x Model Matrix**: Pivot table showing status/metrics for every combination +- **Raw Results**: Detailed view with filtering and drill-down + +Features: + +- Auto-refresh with "Refresh Data" button +- Filter by model, status, and date range +- Download CSV exports +- View detailed output, rationale, and errors for any run + +### CLI Reports + +#### Summary Report +```bash +python benchmarks/reporter.py --summary +``` + +Output: +``` +====================================================================== +BENCHMARK SUMMARY REPORT +====================================================================== + +Overall Statistics: + Total Runs: 50 + Passed: 42 ✅ + Failed: 5 ❌ + Setup Failed: 2 🔧 + Errors: 1 ⚠️ + Pass Rate: 84.0% + +Timing: + Avg Total Time: 45.30s + Avg Agent Time: 3.20s + +Coverage: + Unique Models: 2 + Unique Tests: 20 + Models: sonnet4.5, gpt5.2 +``` + +#### Model Comparison +```bash +python benchmarks/reporter.py --compare-models +``` + +Output: +``` +====================================================================== +MODEL COMPARISON REPORT +====================================================================== + +Model Runs Pass Fail Rate Avg Time +---------------------------------------------------------------------- +sonnet4.5 5 4 1 80.0% 42.50s +gpt5.2 5 3 2 60.0% 38.20s +====================================================================== +``` + +#### Test Case Report (by Use Case) +```bash +python benchmarks/reporter.py --by-test +``` + +Output: +``` +====================================================================== +TEST CASE REPORT (by Use Case) +====================================================================== + +01_how_many_pods [kubernetes, easy] + Prompt: How many pods are in the app-01 namespace?... + Model Runs Pass Rate Time + -------------------------------------------------- + sonnet4.5 3 3 100.0% 42.50s + gpt5.2 2 1 50.0% 38.20s + +02_what_is_wrong_with_pod [kubernetes] + ... 
+``` + +#### Test-Specific Report +```bash +python benchmarks/reporter.py --test-id 01_how_many_pods +``` + +#### Export Reports +```bash +# JSON export +python benchmarks/reporter.py --summary --output report.json + +# CSV export +python benchmarks/reporter.py --compare-models --output comparison.csv +python benchmarks/reporter.py --by-test --output tests.csv +``` + +#### Filter Results +```bash +# Results for specific model +python benchmarks/reporter.py --summary --model sonnet4.5 + +# Results since a date +python benchmarks/reporter.py --summary --since 2026-01-30 + +# Failed tests only +python benchmarks/reporter.py --detailed --status failed +``` + +## Test Execution Flow + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ 1. Load credentials from config/credentials.yaml │ +│ 2. Load test case from fixtures (user_prompt, expected_output) │ +│ 3. Run before_test bash script (setup infrastructure) │ +│ 4. Call agent with test_case.user_prompt (model passed via env) │ +│ 5. LLM Judge evaluates actual vs expected output │ +│ 6. Run after_test bash script (cleanup) │ +│ 7. Save result to results/{model}_{test_id}_{timestamp}.json │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +## Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `OPENAI_API_KEY` | OpenAI API key for judge | - | +| `CLASSIFIER_MODEL` | Model for LLM judge | `gpt-4.1` | +| `DRDROID_API_URL` | DrDroid API URL | `http://localhost:8000` | +| `DRDROID_API_KEY` | DrDroid API key | - | +| `DRDROID_MODEL` | Model to use (set by --model flag) | - | + +## CLI Reference + +### executor.py + +``` +python benchmarks/executor.py [OPTIONS] + +Options: + --model TEXT Model to use (REQUIRED). Examples: sonnet4.5, gpt5.2 + --agent TEXT Agent to use (default: drdroid) + --test-id TEXT Test ID(s) to run (repeatable) + --all Run all tests + --tag TEXT Filter by tag (repeatable) + --skip-setup Skip before_test scripts + --skip-cleanup Skip after_test scripts + --classifier-model LLM judge model + --credentials PATH Path to credentials file + --list-tests List available tests + --list-agents List registered agents + -v, --verbose Verbose output +``` + +### reporter.py + +``` +python benchmarks/reporter.py [OPTIONS] + +Options: + --summary Generate summary report + --compare-models Compare model performance + --by-test Report grouped by test case (use case) + --detailed Show detailed results + --model TEXT Filter by model + --test-id TEXT Report on specific test + --status TEXT Filter by status + --since TEXT Filter by date (ISO format) + --output, -o PATH Output file (JSON or CSV) + --results-dir PATH Results directory +``` + +### dashboard.py + +```bash +# Launch interactive dashboard +streamlit run benchmarks/dashboard.py + +# Or with custom port +streamlit run benchmarks/dashboard.py --server.port 8501 +``` + +### agent.py + +```bash +# Test an agent directly +python benchmarks/agent.py --agent drdroid --prompt "How many pods?" 
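+
+# Any other registered agent can be exercised the same way, e.g. the no-tools OpenAI agent
+python benchmarks/agent.py --agent openai --prompt "How many pods?"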
+``` + +## Integration with CI/CD + +```bash +#!/bin/bash +# Run benchmarks and fail if pass rate < 80% + +python benchmarks/executor.py --model sonnet4.5 --all + +# Check results +python benchmarks/reporter.py --summary --output results.json + +PASS_RATE=$(jq -r '.summary.pass_rate' results.json | tr -d '%') +if (( $(echo "$PASS_RATE < 80" | bc -l) )); then + echo "Pass rate $PASS_RATE% is below threshold" + exit 1 +fi +``` + +## Requirements + +The dashboard requires Streamlit and Pandas: + +```bash +pip install streamlit pandas +``` diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 0000000000..b48cde3be1 --- /dev/null +++ b/benchmarks/__init__.py @@ -0,0 +1,54 @@ +""" +Benchmark Test Suite + +A comprehensive test suite for evaluating LLM agents against HolmesGPT test cases. + +Modules: + - agent.py: Agent implementations and registry + - executor.py: Test execution engine + - config.py: Credentials and configuration management + - reporter.py: Report generation from results + +Usage: + # Run tests + python benchmarks/executor.py --agent drdroid --test-id 01_how_many_pods + + # Generate reports + python benchmarks/reporter.py --summary + python benchmarks/reporter.py --compare-agents +""" + +from benchmarks.agent import ( + AgentResult, + TestCase, + get_agent, + list_agents, + register_agent, +) +from benchmarks.config import ( + Credentials, + load_credentials, +) +from benchmarks.executor import ( + BenchmarkExecutor, + TestResult, + discover_tests, + load_test_case, +) + +__all__ = [ + # Agent + "AgentResult", + "TestCase", + "get_agent", + "list_agents", + "register_agent", + # Config + "Credentials", + "load_credentials", + # Executor + "BenchmarkExecutor", + "TestResult", + "discover_tests", + "load_test_case", +] diff --git a/benchmarks/agent.py b/benchmarks/agent.py new file mode 100644 index 0000000000..218aaa1bba --- /dev/null +++ b/benchmarks/agent.py @@ -0,0 +1,688 @@ +#!/usr/bin/env python3 +""" +Agent Implementations + +Register your agents here. Each agent must be registered with a unique name. +The executor requires an agent type to be specified - there is no default. 
+ +Usage: + python benchmarks/executor.py --agent drdroid --test-id 01_how_many_pods + python benchmarks/executor.py --agent holmes --test-id 01_how_many_pods +""" + +import json +import os +import re +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional + +import requests + +# Add project root to path +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(PROJECT_ROOT)) + + +# ============================================================================= +# Data Models +# ============================================================================= + + +@dataclass +class AgentResult: + """Result returned by an agent.""" + + output: str # The agent's answer (REQUIRED) + tool_calls: List[str] = field(default_factory=list) # Tools called + metadata: Dict[str, Any] = field(default_factory=dict) # Extra data + + +@dataclass +class TestCase: + """Test case input to agents.""" + + id: str # e.g., "01_how_many_pods" + folder: str # Path to test folder + user_prompt: str # The question to answer + expected_output: List[str] # What the judge checks for + before_test: Optional[str] = None # Setup script + after_test: Optional[str] = None # Cleanup script + tags: List[str] = field(default_factory=list) + setup_timeout: int = 300 + + +# ============================================================================= +# Agent Registry +# ============================================================================= + +# Type for agent functions +AgentFunction = Callable[[TestCase], AgentResult] + +# Registry of all available agents +_AGENT_REGISTRY: Dict[str, AgentFunction] = {} + + +def register_agent(name: str): + """Decorator to register an agent.""" + + def decorator(func: AgentFunction) -> AgentFunction: + _AGENT_REGISTRY[name] = func + return func + + return decorator + + +def get_agent(name: str) -> AgentFunction: + """Get an agent by name.""" + if name not in _AGENT_REGISTRY: + available = ", ".join(sorted(_AGENT_REGISTRY.keys())) + raise ValueError( + f"Unknown agent: '{name}'. Available agents: {available}" + ) + return _AGENT_REGISTRY[name] + + +def list_agents() -> List[str]: + """List all registered agent names.""" + return sorted(_AGENT_REGISTRY.keys()) + + +# ============================================================================= +# Agent Implementations +# ============================================================================= + + +@register_agent("drdroid") +def drdroid_agent(test_case: TestCase) -> AgentResult: + """ + DrDroid Investigation Agent via API. + + Environment variables: + DRDROID_API_URL: API endpoint (default: http://localhost:8000) + DRDROID_API_KEY: API key for authentication + DRDROID_MODEL: Model override (optional). 
Options: + - "sonnet4.5" - Claude Sonnet 4.5 via Portkey + - "gpt5.2" - GPT 5.2 via Azure Foundry + - "codex" - GPT 5.2 Codex via Azure Foundry + """ + api_url = os.getenv("DRDROID_API_URL", "http://localhost:8000") + api_key = os.getenv("DRDROID_API_KEY") + model = os.getenv("DRDROID_MODEL") # Optional: "sonnet4.5" or "gpt5.2" + + if not api_key: + raise ValueError("DRDROID_API_KEY environment variable is required") + + url = f"{api_url}/api/external/investigate" + + headers = { + "Content-Type": "application/json", + "X-API-Key": api_key, + } + + payload = { + "message": test_case.user_prompt, + "metadata": { + "test_id": test_case.id, + "source": "benchmark", + }, + } + + # Add model override if specified + if model: + payload["model"] = model + + print(f"[drdroid] Calling API: {url}") + print(f"[drdroid] Model: {model or 'default (credits-based)'}") + print(f"[drdroid] Prompt: {test_case.user_prompt[:100]}...") + + try: + response = requests.post(url, json=payload, headers=headers, timeout=300) + response.raise_for_status() + result = response.json() + + output = result.get("response", "") + investigation_id = result.get("investigation_id") + total_tokens = result.get("total_tokens", 0) + model_used = result.get("model") + credits_used = result.get("credits_used") + + print(f"[drdroid] Investigation ID: {investigation_id}") + print(f"[drdroid] Model: {model_used}") + print(f"[drdroid] Total tokens: {total_tokens}") + print(f"[drdroid] Credits used: {credits_used}") + print(f"[drdroid] Output: {output[:200]}...") + + return AgentResult( + output=output, + metadata={ + "investigation_id": investigation_id, + "status": result.get("status"), + "total_tokens": total_tokens, + "model": model_used, + "credits_used": credits_used, + }, + ) + except requests.exceptions.RequestException as e: + print(f"[drdroid] Error: {str(e)}") + return AgentResult( + output=f"Error calling DrDroid API: {str(e)}", + metadata={"error": str(e)}, + ) + + +@register_agent("holmes") +def holmes_agent(test_case: TestCase) -> AgentResult: + """ + HolmesGPT Agent using ToolCallingLLM. 
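+    Builds a ToolExecutor from HolmesGPT's built-in toolsets and runs ToolCallingLLM
+    with max_steps=20, so the model can call tools while answering the prompt.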
+ + Environment variables: + OPENAI_API_KEY: OpenAI API key + MODEL: Model to use (default: gpt-4.1) + """ + from holmes.config import Config + from holmes.core.llm import DefaultLLM + from holmes.core.tool_calling_llm import ToolCallingLLM + from holmes.core.tools import ToolExecutor + from holmes.core.toolset_manager import ToolsetManager + + model = os.getenv("MODEL", "gpt-4.1") + + print(f"[holmes] Using model: {model}") + print(f"[holmes] Prompt: {test_case.user_prompt[:100]}...") + + config = Config() + llm = DefaultLLM(model=model) + + toolset_manager = ToolsetManager(config=config) + toolsets = toolset_manager.load_builtin_toolsets() + toolset_manager.add_toolsets(toolsets) + tool_executor = ToolExecutor(toolset_manager.get_enabled_toolsets()) + + ai = ToolCallingLLM(llm=llm, tool_executor=tool_executor, max_steps=20) + + messages = [ + {"role": "system", "content": "You are a Kubernetes troubleshooting assistant."}, + {"role": "user", "content": test_case.user_prompt}, + ] + + result = ai.messages_call(messages=messages) + + tool_calls = [] + if result.tool_calls: + tool_calls = [tc.description for tc in result.tool_calls] + + print(f"[holmes] Tool calls: {len(tool_calls)}") + print(f"[holmes] Output: {(result.result or '')[:200]}...") + + return AgentResult( + output=result.result or "", + tool_calls=tool_calls, + metadata={ + "num_llm_calls": result.num_llm_calls, + "total_tokens": result.total_tokens, + "prompt_tokens": result.prompt_tokens, + "completion_tokens": result.completion_tokens, + "cost": result.cost, + "model": model, + }, + ) + + +@register_agent("openai") +def openai_agent(test_case: TestCase) -> AgentResult: + """ + Simple OpenAI chat completion (no tools). + + Environment variables: + OPENAI_API_KEY: OpenAI API key + MODEL: Model to use (default: gpt-4.1) + """ + import openai + + model = os.getenv("MODEL", "gpt-4.1") + + print(f"[openai] Using model: {model}") + print(f"[openai] Prompt: {test_case.user_prompt[:100]}...") + + client = openai.OpenAI() + + response = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": test_case.user_prompt}], + ) + + output = response.choices[0].message.content or "" + + print(f"[openai] Output: {output[:200]}...") + + return AgentResult( + output=output, + metadata={ + "model": model, + "total_tokens": response.usage.total_tokens if response.usage else 0, + "prompt_tokens": response.usage.prompt_tokens if response.usage else 0, + "completion_tokens": response.usage.completion_tokens if response.usage else 0, + }, + ) + + +def _fetch_investigation_prompt(investigation_id: str) -> dict: + """Fetch prompt details from Investigation API. 
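+    Reads INVESTIGATION_API_URL and INVESTIGATION_API_KEY from the environment and
+    raises ValueError when the API key is missing.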
+ + Returns dict with: + - prompt: The investigation prompt + - context: Additional context (optional) + - metadata: Any metadata from the API + """ + api_url = os.getenv("INVESTIGATION_API_URL", "https://api.drdroid.io") + api_key = os.getenv("INVESTIGATION_API_KEY", "") + + if not api_key: + raise ValueError( + "INVESTIGATION_API_KEY environment variable required when using investigation_id" + ) + + url = f"{api_url}/api/investigations/{investigation_id}/prompt" + + print(f"[claudecode] Fetching prompt from Investigation API...") + print(f"[claudecode] URL: {url}") + + headers = { + "Content-Type": "application/json", + "X-API-Key": api_key, + } + + try: + response = requests.get(url, headers=headers, timeout=30) + response.raise_for_status() + data = response.json() + + print(f"[claudecode] Status: {response.status_code} OK") + print(f"[claudecode] Prompt length: {len(data.get('prompt', ''))} chars") + + return { + "prompt": data.get("prompt", ""), + "context": data.get("context", ""), + "metadata": data.get("metadata", {}), + } + except requests.exceptions.RequestException as e: + print(f"[claudecode] Error fetching prompt: {e}") + raise ValueError(f"Failed to fetch investigation prompt: {e}") + + +@register_agent("claudecode") +def claudecode_agent(test_case: TestCase) -> AgentResult: + """ + Claude Code Agent - runs prompts through local Claude Code CLI. + + This agent invokes Claude Code in non-interactive mode to investigate + issues using kubectl commands on the current cluster context. + + Environment variables: + CLAUDE_CODE_PATH: Path to claude CLI (optional, auto-detected) + CLAUDE_MODEL: Model to use (optional, uses Claude Code default) + CLAUDE_MCP_CONFIG: Path to MCP servers config JSON file + INVESTIGATION_ID: Investigation ID to fetch prompt from API (optional) + INVESTIGATION_API_URL: Base URL for investigation API + INVESTIGATION_API_KEY: API key for investigation API + + Requirements: + - Claude Code CLI installed and authenticated (`claude` command available) + - kubectl configured with appropriate cluster context + - MCP servers configured (optional, for additional tool access) + """ + import shutil + import subprocess + import time + from pathlib import Path + + print(f"[claudecode] {'=' * 60}") + print(f"[claudecode] CLAUDE CODE AGENT STARTING") + print(f"[claudecode] {'=' * 60}") + + # Check if claude CLI is available + # First check environment variable, then PATH, then common locations + claude_path = os.getenv("CLAUDE_CODE_PATH") + + if not claude_path: + claude_path = shutil.which("claude") + + if not claude_path: + # Check common installation locations + common_paths = [ + Path.home() / ".local" / "bin" / "claude", + Path("/usr/local/bin/claude"), + Path.home() / "bin" / "claude", + ] + for path in common_paths: + if path.exists() and path.is_file(): + claude_path = str(path) + break + + if not claude_path: + raise ValueError( + "Claude Code CLI not found. Checked PATH and ~/.local/bin/claude. " + "Set CLAUDE_CODE_PATH environment variable or ensure 'claude' is in PATH. 
" + "Install from: https://docs.anthropic.com/en/docs/claude-code" + ) + + model = os.getenv("CLAUDE_MODEL", "") # Empty means use Claude Code default + mcp_config = os.getenv("CLAUDE_MCP_CONFIG", "") # Path to MCP servers config + investigation_id = os.getenv("INVESTIGATION_ID", "") # Optional investigation ID + + print(f"[claudecode] Configuration:") + print(f"[claudecode] CLI Path: {claude_path}") + print(f"[claudecode] Model: {model or 'default'}") + print(f"[claudecode] MCP Config: {mcp_config or 'none'}") + print(f"[claudecode] Investigation ID: {investigation_id or 'none'}") + + # Determine the prompt to use + investigation_metadata = {} + if investigation_id: + # Fetch prompt from Investigation API + print(f"[claudecode] {'─' * 60}") + print(f"[claudecode] FETCHING INVESTIGATION PROMPT") + inv_data = _fetch_investigation_prompt(investigation_id) + user_prompt = inv_data["prompt"] + investigation_metadata = inv_data.get("metadata", {}) + + if inv_data.get("context"): + print(f"[claudecode] Additional context: {len(inv_data['context'])} chars") + user_prompt = f"{inv_data['context']}\n\n{user_prompt}" + + print(f"[claudecode] Final prompt: {user_prompt[:150]}...") + else: + # Use prompt from test case + user_prompt = test_case.user_prompt + print(f"[claudecode] Using test case prompt: {user_prompt[:100]}...") + + # Build the prompt with read-only kubectl restrictions + system_instructions = """You are a Kubernetes troubleshooting assistant. + +IMPORTANT RESTRICTIONS: +- You may ONLY use kubectl commands that are READ-ONLY +- ALLOWED kubectl commands: get, describe, logs, top, explain, api-resources, api-versions, cluster-info, config view, config get-contexts +- FORBIDDEN kubectl commands: apply, create, delete, edit, patch, replace, scale, rollout, exec, cp, port-forward, run, set, label, annotate, taint, cordon, uncordon, drain +- If you need to run a forbidden command, explain what you would do instead of running it +- Focus on gathering information and diagnosing issues, not making changes + +Investigate the following and provide your findings:""" + + full_prompt = f"{system_instructions}\n\n{user_prompt}" + + # Build claude command + # Using --print (-p) for non-interactive mode that prints the result + # Using --verbose to see tool calls and commands + # Using --output-format json to get structured output with token counts + cmd = [claude_path, "-p", full_prompt, "--verbose", "--output-format", "json"] + + # Add model override if specified + if model: + cmd.extend(["--model", model]) + + # Add MCP config if specified + if mcp_config: + mcp_config_path = Path(mcp_config) + # If relative path, resolve from benchmarks directory + if not mcp_config_path.is_absolute(): + mcp_config_path = Path(__file__).parent / mcp_config + if mcp_config_path.exists(): + cmd.extend(["--mcp-config", str(mcp_config_path)]) + print(f"[claudecode] MCP config loaded: {mcp_config_path}") + + # Log MCP servers being used + try: + with open(mcp_config_path, "r") as f: + mcp_data = json.load(f) + servers = mcp_data.get("mcpServers", {}) + print(f"[claudecode] MCP servers: {', '.join(servers.keys())}") + except Exception as e: + print(f"[claudecode] Warning: Could not parse MCP config: {e}") + else: + print(f"[claudecode] Warning: MCP config not found: {mcp_config_path}") + + # Add dangerously skip permissions to avoid interactive prompts + # This is safe because we're restricting to read-only in the prompt + cmd.append("--dangerously-skip-permissions") + + print(f"[claudecode] Running Claude Code...") + 
print(f"[claudecode] Command: {' '.join(cmd[:3])}... (prompt truncated)") + start_time = time.time() + + try: + # Use Popen to stream output in real-time + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, # Merge stderr into stdout + text=True, + cwd=test_case.folder, + bufsize=1, # Line buffered + ) + + output_lines = [] + tool_calls = [] + token_info = { + "total_tokens": 0, + "prompt_tokens": 0, + "completion_tokens": 0, + "cost": 0.0, + } + + print(f"[claudecode] {'─' * 60}") + + # Stream output line by line + while True: + line = process.stdout.readline() + if not line and process.poll() is not None: + break + if line: + line_stripped = line.rstrip() + print(f" │ {line_stripped}") + output_lines.append(line) + + # Try to detect tool calls from output + # Claude Code typically shows tool usage like "Running: kubectl ..." or "Tool: ..." + line_lower = line_stripped.lower() + if any(indicator in line_lower for indicator in [ + "running:", "executing:", "tool:", "$ kubectl", "bash:", + "> kubectl", "command:", "running command" + ]): + tool_calls.append(line_stripped) + + # Try to extract token information from output + # Look for patterns like "tokens: 1234", "total_tokens", "input/output tokens" + + # Pattern: "X tokens" or "tokens: X" or "total_tokens: X" + token_patterns = [ + r'total[_\s]?tokens[:\s]+(\d+)', + r'input[_\s]?tokens[:\s]+(\d+)', + r'output[_\s]?tokens[:\s]+(\d+)', + r'prompt[_\s]?tokens[:\s]+(\d+)', + r'completion[_\s]?tokens[:\s]+(\d+)', + r'(\d+)\s+tokens?\s+used', + r'tokens?\s+used[:\s]+(\d+)', + r'cost[:\s]+\$?([\d.]+)', + ] + + for pattern in token_patterns: + match = re.search(pattern, line_lower) + if match: + value = match.group(1) + if 'total' in pattern: + token_info["total_tokens"] = int(value) + elif 'input' in pattern or 'prompt' in pattern: + token_info["prompt_tokens"] = int(value) + elif 'output' in pattern or 'completion' in pattern: + token_info["completion_tokens"] = int(value) + elif 'cost' in pattern: + token_info["cost"] = float(value) + + # Check timeout + if time.time() - start_time > 300: + process.kill() + process.wait() + print(f"[claudecode] {'─' * 60}") + print(f"[claudecode] Timeout after 300s") + return AgentResult( + output="Claude Code timed out after 300 seconds", + metadata={"error": "timeout", "timeout": 300}, + ) + + process.wait() + elapsed = time.time() - start_time + + print(f"[claudecode] {'─' * 60}") + print(f"[claudecode] Completed in {elapsed:.2f}s") + print(f"[claudecode] Tool calls detected: {len(tool_calls)}") + + if tool_calls: + print(f"[claudecode] Tools used:") + for tc in tool_calls[:10]: # Show first 10 + print(f" • {tc[:100]}") + if len(tool_calls) > 10: + print(f" ... 
and {len(tool_calls) - 10} more") + + raw_output = "".join(output_lines).strip() + + # Try to parse JSON output for structured data including tokens + output = raw_output + try: + # Look for JSON in the output (might be at the end or the whole output) + json_match = re.search(r'\{[^{}]*"result"[^{}]*\}|\{[^{}]*"output"[^{}]*\}', raw_output, re.DOTALL) + if json_match: + json_data = json.loads(json_match.group()) + # Extract result/output from JSON + output = json_data.get("result") or json_data.get("output") or raw_output + # Extract token info from JSON + if "usage" in json_data: + usage = json_data["usage"] + token_info["total_tokens"] = usage.get("total_tokens", 0) + token_info["prompt_tokens"] = usage.get("prompt_tokens") or usage.get("input_tokens", 0) + token_info["completion_tokens"] = usage.get("completion_tokens") or usage.get("output_tokens", 0) + if "cost" in json_data: + token_info["cost"] = json_data["cost"] + if "total_tokens" in json_data: + token_info["total_tokens"] = json_data["total_tokens"] + except (json.JSONDecodeError, AttributeError): + pass # Not JSON or parsing failed, use raw output + + # Print token info + if token_info["total_tokens"] > 0: + print(f"[claudecode] Tokens: {token_info['total_tokens']} total " + f"({token_info['prompt_tokens']} prompt, {token_info['completion_tokens']} completion)") + if token_info["cost"] > 0: + print(f"[claudecode] Cost: ${token_info['cost']:.4f}") + + print(f"[claudecode] {'=' * 60}") + print(f"[claudecode] CLAUDE CODE AGENT COMPLETED") + print(f"[claudecode] {'=' * 60}") + + if process.returncode != 0: + print(f"[claudecode] Error: exit code {process.returncode}") + return AgentResult( + output=f"Claude Code error (exit {process.returncode}): {output}", + tool_calls=tool_calls, + metadata={ + "error": f"exit code {process.returncode}", + "return_code": process.returncode, + "elapsed_time": elapsed, + "total_tokens": token_info["total_tokens"], + "prompt_tokens": token_info["prompt_tokens"], + "completion_tokens": token_info["completion_tokens"], + "cost": token_info["cost"], + "investigation_id": investigation_id or None, + "investigation_metadata": investigation_metadata, + "mcp_config": mcp_config or None, + }, + ) + + return AgentResult( + output=output, + tool_calls=tool_calls, + metadata={ + "model": model or "claude-code-default", + "elapsed_time": elapsed, + "return_code": process.returncode, + "num_tool_calls": len(tool_calls), + "total_tokens": token_info["total_tokens"], + "prompt_tokens": token_info["prompt_tokens"], + "completion_tokens": token_info["completion_tokens"], + "cost": token_info["cost"], + "investigation_id": investigation_id or None, + "investigation_metadata": investigation_metadata, + "mcp_config": mcp_config or None, + }, + ) + + except subprocess.TimeoutExpired: + print(f"[claudecode] Timeout after 300s") + return AgentResult( + output="Claude Code timed out after 300 seconds", + metadata={"error": "timeout", "timeout": 300}, + ) + except Exception as e: + print(f"[claudecode] Exception: {str(e)}") + return AgentResult( + output=f"Claude Code exception: {str(e)}", + metadata={"error": str(e)}, + ) + + +# ============================================================================= +# Add your custom agents below +# ============================================================================= + + +# @register_agent("my_agent") +# def my_custom_agent(test_case: TestCase) -> AgentResult: +# """Your custom agent implementation.""" +# # Your code here +# return AgentResult(output="response") + + +# 
============================================================================= +# CLI for testing agents directly +# ============================================================================= + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Test an agent directly") + parser.add_argument( + "--agent", + required=True, + choices=list_agents(), + help="Agent to test", + ) + parser.add_argument( + "--prompt", + default="How many pods are in the app-01 namespace?", + help="Test prompt", + ) + + args = parser.parse_args() + + print(f"Testing agent: {args.agent}") + print(f"Prompt: {args.prompt}") + print("-" * 50) + + test = TestCase( + id="cli-test", + folder=".", + user_prompt=args.prompt, + expected_output=["test"], + ) + + try: + agent_fn = get_agent(args.agent) + result = agent_fn(test) + print("\n" + "=" * 50) + print("RESULT:") + print(f"Output: {result.output}") + print(f"Tool calls: {result.tool_calls}") + print(f"Metadata: {result.metadata}") + except Exception as e: + print(f"Error: {e}") diff --git a/benchmarks/config.py b/benchmarks/config.py new file mode 100644 index 0000000000..a346e9c1b2 --- /dev/null +++ b/benchmarks/config.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +""" +Configuration and Credentials Management + +Loads credentials from a unified YAML file for all infrastructure and monitoring tools. +""" + +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Optional + +import yaml + +BENCHMARKS_DIR = Path(__file__).parent +PROJECT_ROOT = BENCHMARKS_DIR.parent +CONFIG_DIR = BENCHMARKS_DIR / "config" +RESULTS_DIR = BENCHMARKS_DIR / "results" +FIXTURES_DIR = PROJECT_ROOT / "tests" / "llm" / "fixtures" / "test_ask_holmes" + +# Default credentials file location +DEFAULT_CREDENTIALS_FILE = CONFIG_DIR / "credentials.yaml" + + +@dataclass +class KubernetesCredentials: + """Kubernetes cluster credentials.""" + + kubeconfig: Optional[str] = None + context: Optional[str] = None + namespace: Optional[str] = None + + +@dataclass +class DatadogCredentials: + """Datadog credentials.""" + + api_key: Optional[str] = None + app_key: Optional[str] = None + site: str = "datadoghq.com" + + +@dataclass +class NewRelicCredentials: + """New Relic credentials.""" + + api_key: Optional[str] = None + account_id: Optional[str] = None + region: str = "US" + + +@dataclass +class PrometheusCredentials: + """Prometheus credentials.""" + + url: Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None + + +@dataclass +class GrafanaCredentials: + """Grafana credentials.""" + + url: Optional[str] = None + api_key: Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None + + +@dataclass +class LokiCredentials: + """Loki credentials.""" + + url: Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None + + +@dataclass +class ElasticsearchCredentials: + """Elasticsearch/OpenSearch credentials.""" + + url: Optional[str] = None + api_key: Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None + + +@dataclass +class OpenAICredentials: + """OpenAI/LLM credentials.""" + + api_key: Optional[str] = None + org_id: Optional[str] = None + base_url: Optional[str] = None + + +@dataclass +class JudgeConfig: + """LLM Judge configuration.""" + + model: str = "gpt-4.1" + api_key: Optional[str] = None + + +@dataclass +class Credentials: + """All credentials for benchmark testing.""" + + kubernetes: 
KubernetesCredentials = field(default_factory=KubernetesCredentials) + datadog: DatadogCredentials = field(default_factory=DatadogCredentials) + newrelic: NewRelicCredentials = field(default_factory=NewRelicCredentials) + prometheus: PrometheusCredentials = field(default_factory=PrometheusCredentials) + grafana: GrafanaCredentials = field(default_factory=GrafanaCredentials) + loki: LokiCredentials = field(default_factory=LokiCredentials) + elasticsearch: ElasticsearchCredentials = field(default_factory=ElasticsearchCredentials) + openai: OpenAICredentials = field(default_factory=OpenAICredentials) + judge: JudgeConfig = field(default_factory=JudgeConfig) + custom: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_yaml(cls, path: Path) -> "Credentials": + """Load credentials from YAML file.""" + if not path.exists(): + return cls() + + with open(path, "r") as f: + data = yaml.safe_load(f) or {} + + return cls( + kubernetes=KubernetesCredentials(**data.get("kubernetes", {})), + datadog=DatadogCredentials(**data.get("datadog", {})), + newrelic=NewRelicCredentials(**data.get("newrelic", {})), + prometheus=PrometheusCredentials(**data.get("prometheus", {})), + grafana=GrafanaCredentials(**data.get("grafana", {})), + loki=LokiCredentials(**data.get("loki", {})), + elasticsearch=ElasticsearchCredentials(**data.get("elasticsearch", {})), + openai=OpenAICredentials(**data.get("openai", {})), + judge=JudgeConfig(**data.get("judge", {})), + custom=data.get("custom", {}), + ) + + def to_env_vars(self) -> Dict[str, str]: + """Convert credentials to environment variables.""" + env_vars = {} + + # Kubernetes + if self.kubernetes.kubeconfig: + env_vars["KUBECONFIG"] = self.kubernetes.kubeconfig + if self.kubernetes.context: + env_vars["KUBE_CONTEXT"] = self.kubernetes.context + + # Datadog + if self.datadog.api_key: + env_vars["DD_API_KEY"] = self.datadog.api_key + if self.datadog.app_key: + env_vars["DD_APP_KEY"] = self.datadog.app_key + if self.datadog.site: + env_vars["DD_SITE"] = self.datadog.site + + # New Relic + if self.newrelic.api_key: + env_vars["NEW_RELIC_API_KEY"] = self.newrelic.api_key + if self.newrelic.account_id: + env_vars["NEW_RELIC_ACCOUNT_ID"] = self.newrelic.account_id + + # Prometheus + if self.prometheus.url: + env_vars["PROMETHEUS_URL"] = self.prometheus.url + if self.prometheus.username: + env_vars["PROMETHEUS_USERNAME"] = self.prometheus.username + if self.prometheus.password: + env_vars["PROMETHEUS_PASSWORD"] = self.prometheus.password + + # Grafana + if self.grafana.url: + env_vars["GRAFANA_URL"] = self.grafana.url + if self.grafana.api_key: + env_vars["GRAFANA_API_KEY"] = self.grafana.api_key + + # Loki + if self.loki.url: + env_vars["LOKI_URL"] = self.loki.url + + # Elasticsearch + if self.elasticsearch.url: + env_vars["ELASTICSEARCH_URL"] = self.elasticsearch.url + if self.elasticsearch.api_key: + env_vars["ELASTICSEARCH_API_KEY"] = self.elasticsearch.api_key + + # OpenAI + if self.openai.api_key: + env_vars["OPENAI_API_KEY"] = self.openai.api_key + if self.openai.org_id: + env_vars["OPENAI_ORG_ID"] = self.openai.org_id + if self.openai.base_url: + env_vars["OPENAI_BASE_URL"] = self.openai.base_url + + return env_vars + + def apply_to_env(self) -> None: + """Apply credentials to current environment.""" + for key, value in self.to_env_vars().items(): + os.environ[key] = value + + +def load_credentials(path: Optional[Path] = None) -> Credentials: + """Load credentials from file or environment.""" + if path is None: + path = DEFAULT_CREDENTIALS_FILE + + 
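+    # Credentials.from_yaml tolerates a missing credentials file and falls back to
+    # default (empty) values; the environment variable overrides below then take precedence.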
# Load from file if exists + credentials = Credentials.from_yaml(path) + + # Override with environment variables + if os.environ.get("OPENAI_API_KEY"): + credentials.openai.api_key = os.environ["OPENAI_API_KEY"] + if os.environ.get("CLASSIFIER_MODEL"): + credentials.judge.model = os.environ["CLASSIFIER_MODEL"] + + return credentials + + +def ensure_directories(): + """Ensure required directories exist.""" + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + RESULTS_DIR.mkdir(parents=True, exist_ok=True) diff --git a/benchmarks/config/.gitkeep b/benchmarks/config/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/config/credentials.yaml.template b/benchmarks/config/credentials.yaml.template new file mode 100644 index 0000000000..6af33fedec --- /dev/null +++ b/benchmarks/config/credentials.yaml.template @@ -0,0 +1,78 @@ +# Benchmark Credentials Configuration +# Copy this file to credentials.yaml and fill in your values +# Environment variables can override these values + +# Kubernetes Configuration +kubernetes: + kubeconfig: ~/.kube/config # Path to kubeconfig file + context: null # Kubernetes context to use (null = current) + namespace: default # Default namespace + +# Datadog +datadog: + api_key: null + app_key: null + site: datadoghq.com # or datadoghq.eu, us3.datadoghq.com, etc. + +# New Relic +newrelic: + api_key: null + account_id: null + region: US # or EU + +# Prometheus +prometheus: + url: http://localhost:9090 + username: null + password: null + +# Grafana +grafana: + url: http://localhost:3000 + api_key: null + username: admin + password: null + +# Loki +loki: + url: http://localhost:3100 + username: null + password: null + +# Elasticsearch / OpenSearch +elasticsearch: + url: http://localhost:9200 + api_key: null + username: null + password: null + +# OpenAI / LLM Provider +openai: + api_key: null # Or set OPENAI_API_KEY env var + org_id: null + base_url: null # For custom endpoints + +# LLM Judge Configuration +judge: + model: gpt-4.1 # Model used for evaluating test results + api_key: null # If different from openai.api_key + +# Investigation API (for fetching prompts by investigation ID) +investigation_api: + url: https://api.drdroid.io # Base URL for investigation API + api_key: null # API key for authentication + # Or set INVESTIGATION_API_URL and INVESTIGATION_API_KEY env vars + +# Claude Code Configuration +claudecode: + mcp_config: config/mcp_servers.json # Path to MCP servers config (relative to benchmarks/) + # Or set CLAUDE_MCP_CONFIG env var + +# Custom credentials for additional integrations +custom: + drdroid: + api_url: http://localhost:8000 + api_key: null # your-api-key + # splunk: + # url: https://splunk.example.com + # token: your-token diff --git a/benchmarks/config/mcp_servers.json.template b/benchmarks/config/mcp_servers.json.template new file mode 100644 index 0000000000..b81af7c759 --- /dev/null +++ b/benchmarks/config/mcp_servers.json.template @@ -0,0 +1,42 @@ +{ + "mcpServers": { + "kubernetes": { + "command": "npx", + "args": ["-y", "@anthropic/mcp-server-kubernetes"], + "env": { + "KUBECONFIG": "~/.kube/config" + } + }, + "prometheus": { + "command": "npx", + "args": ["-y", "@anthropic/mcp-server-prometheus"], + "env": { + "PROMETHEUS_URL": "http://localhost:9090" + } + }, + "grafana": { + "command": "npx", + "args": ["-y", "@anthropic/mcp-server-grafana"], + "env": { + "GRAFANA_URL": "http://localhost:3000", + "GRAFANA_API_KEY": "your-grafana-api-key" + } + }, + "datadog": { + "command": "npx", + "args": ["-y", 
"@anthropic/mcp-server-datadog"], + "env": { + "DD_API_KEY": "your-datadog-api-key", + "DD_APP_KEY": "your-datadog-app-key" + } + }, + "elasticsearch": { + "command": "npx", + "args": ["-y", "@anthropic/mcp-server-elasticsearch"], + "env": { + "ELASTICSEARCH_URL": "http://localhost:9200", + "ELASTICSEARCH_API_KEY": "your-elasticsearch-api-key" + } + } + } +} diff --git a/benchmarks/dashboard.py b/benchmarks/dashboard.py new file mode 100644 index 0000000000..7c40778194 --- /dev/null +++ b/benchmarks/dashboard.py @@ -0,0 +1,652 @@ +#!/usr/bin/env python3 +""" +Benchmark Results Dashboard + +A Streamlit app to visualize benchmark test results with use-case x model level reporting. + +Usage: + streamlit run benchmarks/dashboard.py +""" + +import json +from collections import defaultdict +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pandas as pd +import streamlit as st + +# Results directory +RESULTS_DIR = Path(__file__).parent / "results" + + +# ============================================================================= +# Data Loading +# ============================================================================= + + +@st.cache_data(ttl=30) # Cache for 30 seconds, then refresh +def load_all_results() -> List[Dict[str, Any]]: + """Load all results from JSON files.""" + results = [] + + if not RESULTS_DIR.exists(): + return results + + for filepath in RESULTS_DIR.glob("*.json"): + try: + with open(filepath, "r") as f: + data = json.load(f) + data["_filepath"] = str(filepath) + results.append(data) + except Exception as e: + st.warning(f"Failed to load {filepath.name}: {e}") + + return sorted(results, key=lambda r: r.get("started_at", ""), reverse=True) + + +def extract_metrics(result: Dict[str, Any]) -> Dict[str, Any]: + """Extract key metrics from a result.""" + metadata = result.get("agent_metadata", {}) + + # Build investigation URL if investigation_id exists + investigation_id = metadata.get("investigation_id") + investigation_url = None + if investigation_id: + investigation_url = f"https://aiops.drdroid.io/investigations/{investigation_id}" + + return { + "test_id": result.get("test_id", "unknown"), + "model": result.get("model", "unknown"), + "agent": result.get("agent", "unknown"), + "status": result.get("status", "unknown"), + "score": result.get("score"), + "total_time": result.get("total_time", 0), + "setup_time": result.get("setup_time", 0), + "agent_time": result.get("agent_time", 0), + "judge_time": result.get("judge_time", 0), + "cleanup_time": result.get("cleanup_time", 0), + "cost": metadata.get("cost", 0) or 0, + "total_tokens": metadata.get("total_tokens", 0) or 0, + "prompt_tokens": metadata.get("prompt_tokens", 0) or 0, + "completion_tokens": metadata.get("completion_tokens", 0) or 0, + "num_llm_calls": metadata.get("num_llm_calls", 0) or 0, + "tags": result.get("tags", []), + "started_at": result.get("started_at", ""), + "run_id": result.get("run_id", ""), + "user_prompt": result.get("user_prompt", ""), + "actual_output": result.get("actual_output", ""), + "expected_output": result.get("expected_output", []), + "judge_rationale": result.get("judge_rationale", ""), + "error_message": result.get("error_message", ""), + "investigation_id": investigation_id, + "investigation_url": investigation_url, + } + + +def build_dataframe(results: List[Dict[str, Any]]) -> pd.DataFrame: + """Build a pandas DataFrame from results.""" + if not results: + return pd.DataFrame() + + metrics = [extract_metrics(r) for r in 
results] + df = pd.DataFrame(metrics) + + # Convert started_at to datetime + if "started_at" in df.columns: + df["started_at"] = pd.to_datetime(df["started_at"], errors="coerce") + + return df + + +# ============================================================================= +# Aggregation Functions +# ============================================================================= + + +def aggregate_by_model(df: pd.DataFrame) -> pd.DataFrame: + """Aggregate metrics by model.""" + if df.empty: + return pd.DataFrame() + + agg = df.groupby("model").agg( + total_runs=("test_id", "count"), + passed=("status", lambda x: (x == "passed").sum()), + failed=("status", lambda x: (x == "failed").sum()), + setup_failed=("status", lambda x: (x == "setup_failed").sum()), + errors=("status", lambda x: (x == "error").sum()), + avg_total_time=("total_time", "mean"), + avg_agent_time=("agent_time", "mean"), + total_cost=("cost", "sum"), + avg_cost=("cost", "mean"), + total_tokens=("total_tokens", "sum"), + avg_tokens=("total_tokens", "mean"), + unique_tests=("test_id", "nunique"), + ).reset_index() + + agg["pass_rate"] = (agg["passed"] / agg["total_runs"] * 100).round(1) + + return agg.sort_values("pass_rate", ascending=False) + + +def aggregate_by_test(df: pd.DataFrame) -> pd.DataFrame: + """Aggregate metrics by test case.""" + if df.empty: + return pd.DataFrame() + + agg = df.groupby("test_id").agg( + total_runs=("model", "count"), + passed=("status", lambda x: (x == "passed").sum()), + failed=("status", lambda x: (x == "failed").sum()), + avg_total_time=("total_time", "mean"), + avg_agent_time=("agent_time", "mean"), + total_cost=("cost", "sum"), + avg_cost=("cost", "mean"), + total_tokens=("total_tokens", "sum"), + models_tested=("model", "nunique"), + ).reset_index() + + agg["pass_rate"] = (agg["passed"] / agg["total_runs"] * 100).round(1) + + return agg.sort_values("test_id") + + +def create_pivot_table(df: pd.DataFrame, value_col: str, aggfunc: str = "mean") -> pd.DataFrame: + """Create a pivot table with test_id as rows and model as columns.""" + if df.empty: + return pd.DataFrame() + + pivot = pd.pivot_table( + df, + values=value_col, + index="test_id", + columns="model", + aggfunc=aggfunc, + fill_value=0, + ) + + return pivot + + +def create_status_pivot(df: pd.DataFrame) -> pd.DataFrame: + """Create a pivot table showing pass/fail status per test x model.""" + if df.empty: + return pd.DataFrame() + + # Get latest result for each test_id x model combination + latest = df.sort_values("started_at", ascending=False).drop_duplicates( + subset=["test_id", "model"] + ) + + # Create status mapping with icons + status_map = { + "passed": "✅", + "failed": "❌", + "setup_failed": "🔧", + "error": "⚠️", + } + latest["status_icon"] = latest["status"].map(status_map).fillna("?") + + pivot = pd.pivot_table( + latest, + values="status_icon", + index="test_id", + columns="model", + aggfunc="first", + fill_value="-", + ) + + return pivot + + +def get_latest_results_matrix(df: pd.DataFrame) -> Dict[str, Dict[str, Dict[str, Any]]]: + """Get latest results organized as test_id -> model -> result_data. + + Returns a nested dict for building an interactive status matrix. 
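+    Only the most recent run for each (test_id, model) pair is kept.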
+ """ + if df.empty: + return {} + + # Get latest result for each test_id x model combination + latest = df.sort_values("started_at", ascending=False).drop_duplicates( + subset=["test_id", "model"] + ) + + # Status mapping with icons + status_map = { + "passed": "✅", + "failed": "❌", + "setup_failed": "🔧", + "error": "⚠️", + } + + # Build nested dict + matrix: Dict[str, Dict[str, Dict[str, Any]]] = {} + for _, row in latest.iterrows(): + test_id = row["test_id"] + model = row["model"] + + if test_id not in matrix: + matrix[test_id] = {} + + matrix[test_id][model] = { + "status": row["status"], + "status_icon": status_map.get(row["status"], "?"), + "score": row["score"], + "judge_rationale": row["judge_rationale"] or "No rationale available", + "agent_time": row["agent_time"], + "cost": row["cost"], + "total_tokens": row["total_tokens"], + "error_message": row["error_message"], + "run_id": row["run_id"], + "investigation_url": row.get("investigation_url"), + } + + return matrix + + +# ============================================================================= +# Streamlit App +# ============================================================================= + + +def main(): + st.set_page_config( + page_title="Benchmark Dashboard", + page_icon="📊", + layout="wide", + ) + + st.title("📊 Benchmark Results Dashboard") + + # Sidebar + with st.sidebar: + st.header("Controls") + + if st.button("🔄 Refresh Data"): + st.cache_data.clear() + st.rerun() + + st.markdown("---") + + # Load data + results = load_all_results() + + if not results: + st.warning("No results found") + st.info(f"Results directory: {RESULTS_DIR}") + return + + df = build_dataframe(results) + + st.metric("Total Results", len(df)) + st.metric("Unique Models", df["model"].nunique()) + st.metric("Unique Tests", df["test_id"].nunique()) + + st.markdown("---") + + # Filters + st.subheader("Filters") + + models = ["All"] + sorted(df["model"].unique().tolist()) + selected_model = st.selectbox("Model", models) + + statuses = ["All"] + sorted(df["status"].unique().tolist()) + selected_status = st.selectbox("Status", statuses) + + # Date filter + if not df["started_at"].isna().all(): + min_date = df["started_at"].min().date() + max_date = df["started_at"].max().date() + date_range = st.date_input( + "Date Range", + value=(min_date, max_date), + min_value=min_date, + max_value=max_date, + ) + else: + date_range = None + + # Apply filters + filtered_df = df.copy() + + if selected_model != "All": + filtered_df = filtered_df[filtered_df["model"] == selected_model] + + if selected_status != "All": + filtered_df = filtered_df[filtered_df["status"] == selected_status] + + if date_range and len(date_range) == 2: + start_date, end_date = date_range + filtered_df = filtered_df[ + (filtered_df["started_at"].dt.date >= start_date) + & (filtered_df["started_at"].dt.date <= end_date) + ] + + # Main content + tab1, tab2, tab3, tab4, tab5 = st.tabs([ + "📈 Overview", + "🤖 Model Comparison", + "📋 Test Cases", + "🔀 Use Case x Model", + "📄 Raw Results", + ]) + + # Tab 1: Overview + with tab1: + st.header("Overview") + + col1, col2, col3, col4 = st.columns(4) + + total = len(filtered_df) + passed = (filtered_df["status"] == "passed").sum() + failed = (filtered_df["status"] == "failed").sum() + pass_rate = (passed / total * 100) if total > 0 else 0 + + col1.metric("Total Runs", total) + col2.metric("Passed", f"{passed} ✅") + col3.metric("Failed", f"{failed} ❌") + col4.metric("Pass Rate", f"{pass_rate:.1f}%") + + col5, col6, col7, col8 = st.columns(4) + + 
col5.metric("Avg Time", f"{filtered_df['total_time'].mean():.1f}s") + col6.metric("Total Cost", f"${filtered_df['cost'].sum():.4f}") + col7.metric("Total Tokens", f"{filtered_df['total_tokens'].sum():,}") + col8.metric("Avg Tokens/Run", f"{filtered_df['total_tokens'].mean():,.0f}") + + st.markdown("---") + + # Status distribution + col1, col2 = st.columns(2) + + with col1: + st.subheader("Status Distribution") + status_counts = filtered_df["status"].value_counts() + st.bar_chart(status_counts) + + with col2: + st.subheader("Runs by Model") + model_counts = filtered_df["model"].value_counts() + st.bar_chart(model_counts) + + # Tab 2: Model Comparison + with tab2: + st.header("Model Comparison") + + model_agg = aggregate_by_model(filtered_df) + + if not model_agg.empty: + # Summary table + st.subheader("Summary by Model") + display_cols = [ + "model", "total_runs", "passed", "failed", "pass_rate", + "avg_agent_time", "total_cost", "avg_cost", "total_tokens", "avg_tokens", + ] + display_df = model_agg[display_cols].copy() + display_df.columns = [ + "Model", "Runs", "Passed", "Failed", "Pass Rate %", + "Avg Agent Time (s)", "Total Cost ($)", "Avg Cost ($)", "Total Tokens", "Avg Tokens", + ] + + # Format numeric columns + display_df["Avg Agent Time (s)"] = display_df["Avg Agent Time (s)"].round(2) + display_df["Total Cost ($)"] = display_df["Total Cost ($)"].round(4) + display_df["Avg Cost ($)"] = display_df["Avg Cost ($)"].round(4) + display_df["Avg Tokens"] = display_df["Avg Tokens"].round(0).astype(int) + + st.dataframe(display_df, use_container_width=True, hide_index=True) + + st.markdown("---") + + # Charts + col1, col2 = st.columns(2) + + with col1: + st.subheader("Pass Rate by Model") + chart_data = model_agg.set_index("model")["pass_rate"] + st.bar_chart(chart_data) + + with col2: + st.subheader("Avg Cost by Model") + chart_data = model_agg.set_index("model")["avg_cost"] + st.bar_chart(chart_data) + + # Tab 3: Test Cases + with tab3: + st.header("Test Case Analysis") + + test_agg = aggregate_by_test(filtered_df) + + if not test_agg.empty: + st.subheader("Summary by Test Case") + display_cols = [ + "test_id", "total_runs", "passed", "failed", "pass_rate", + "models_tested", "avg_agent_time", "avg_cost", + ] + display_df = test_agg[display_cols].copy() + display_df.columns = [ + "Test ID", "Runs", "Passed", "Failed", "Pass Rate %", + "Models", "Avg Time (s)", "Avg Cost ($)", + ] + display_df["Avg Time (s)"] = display_df["Avg Time (s)"].round(2) + display_df["Avg Cost ($)"] = display_df["Avg Cost ($)"].round(4) + + st.dataframe(display_df, use_container_width=True, hide_index=True) + + st.markdown("---") + + # Failing tests + failing_tests = test_agg[test_agg["pass_rate"] < 100].sort_values("pass_rate") + if not failing_tests.empty: + st.subheader("Tests with Failures") + st.dataframe( + failing_tests[["test_id", "total_runs", "passed", "failed", "pass_rate"]], + use_container_width=True, + hide_index=True, + ) + + # Tab 4: Use Case x Model Matrix + with tab4: + st.header("Use Case x Model Matrix") + + st.subheader("Status Matrix (Latest Run)") + st.caption("Click the ℹ️ icon to view judge rationale and details") + + # Get the matrix data for interactive display + results_matrix = get_latest_results_matrix(filtered_df) + models = sorted(filtered_df["model"].unique().tolist()) + test_ids = sorted(results_matrix.keys()) + + if results_matrix and models: + # Build header row + header_cols = st.columns([2] + [1] * len(models)) + header_cols[0].markdown("**Test Case**") + for i, model in 
enumerate(models): + header_cols[i + 1].markdown(f"**{model}**") + + # Build data rows + for test_id in test_ids: + row_cols = st.columns([2] + [1] * len(models)) + row_cols[0].write(test_id) + + for i, model in enumerate(models): + result_data = results_matrix.get(test_id, {}).get(model) + + if result_data: + status_icon = result_data["status_icon"] + + # Create a container with status icon and info popover + with row_cols[i + 1]: + col_status, col_info = st.columns([1, 1]) + col_status.write(status_icon) + + with col_info.popover("ℹ️"): + st.markdown(f"**Test:** {test_id}") + st.markdown(f"**Model:** {model}") + st.markdown(f"**Status:** {result_data['status']} {status_icon}") + st.markdown(f"**Score:** {result_data['score']}") + + st.markdown("---") + st.markdown("**Timing & Cost:**") + st.write(f"Agent Time: {result_data['agent_time']:.2f}s") + st.write(f"Cost: ${result_data['cost']:.4f}") + st.write(f"Tokens: {result_data['total_tokens']:,}") + + st.markdown("---") + st.markdown("**Judge Rationale:**") + st.write(result_data["judge_rationale"]) + + if result_data.get("error_message"): + st.markdown("---") + st.error(f"Error: {result_data['error_message']}") + + if result_data.get("investigation_url"): + st.markdown("---") + st.markdown( + f"[Open Investigation]({result_data['investigation_url']})" + ) + else: + row_cols[i + 1].write("-") + + st.markdown("---") + + # Metric selection for pivot + metric = st.selectbox( + "Select Metric for Heatmap", + ["pass_rate", "agent_time", "cost", "total_tokens"], + ) + + if metric == "pass_rate": + # Calculate pass rate per test x model + pass_df = filtered_df.copy() + pass_df["is_passed"] = (pass_df["status"] == "passed").astype(int) + pivot = create_pivot_table(pass_df, "is_passed", "mean") * 100 + st.subheader("Pass Rate % (Test x Model)") + elif metric == "agent_time": + pivot = create_pivot_table(filtered_df, "agent_time", "mean") + st.subheader("Avg Agent Time in seconds (Test x Model)") + elif metric == "cost": + pivot = create_pivot_table(filtered_df, "cost", "mean") + st.subheader("Avg Cost in $ (Test x Model)") + else: + pivot = create_pivot_table(filtered_df, "total_tokens", "mean") + st.subheader("Avg Tokens (Test x Model)") + + if not pivot.empty: + # Round values + pivot = pivot.round(2 if metric in ["agent_time", "cost"] else 1) + st.dataframe(pivot, use_container_width=True) + + # Download button + csv = pivot.to_csv() + st.download_button( + "Download as CSV", + csv, + f"benchmark_{metric}_matrix.csv", + "text/csv", + ) + + # Tab 5: Raw Results + with tab5: + st.header("Raw Results") + + # Sort options + sort_col = st.selectbox( + "Sort by", + ["started_at", "test_id", "model", "status", "total_time", "cost"], + ) + sort_order = st.radio("Order", ["Descending", "Ascending"], horizontal=True) + + sorted_df = filtered_df.sort_values( + sort_col, ascending=(sort_order == "Ascending") + ) + + # Create display dataframe with investigation links + display_df = sorted_df[["test_id", "model", "status", "score", "agent_time", "cost", "total_tokens", "started_at", "investigation_url"]].copy() + + display_df.columns = [ + "Test ID", "Model", "Status", "Score", "Agent Time (s)", + "Cost ($)", "Tokens", "Started At", "Investigation", + ] + + # Use LinkColumn for clickable investigation links + st.dataframe( + display_df, + use_container_width=True, + hide_index=True, + column_config={ + "Investigation": st.column_config.LinkColumn( + "Investigation", + help="Open investigation in DrDroid (opens in new tab)", + display_text="Open", + ), + }, + 
) + + st.markdown("---") + + # Detailed view for selected result + st.subheader("Result Details") + + result_options = sorted_df["run_id"].tolist() + if result_options: + selected_run = st.selectbox("Select Run", result_options) + + if selected_run: + result_row = sorted_df[sorted_df["run_id"] == selected_run].iloc[0] + + col1, col2 = st.columns(2) + + with col1: + st.markdown("**Test Info**") + st.write(f"**Test ID:** {result_row['test_id']}") + st.write(f"**Model:** {result_row['model']}") + st.write(f"**Status:** {result_row['status']}") + st.write(f"**Score:** {result_row['score']}") + + # Investigation link + if result_row.get("investigation_url"): + st.markdown( + f"**Investigation:** [Open in DrDroid]({result_row['investigation_url']})" + ) + + st.markdown("**Timing**") + st.write(f"Setup: {result_row['setup_time']:.2f}s") + st.write(f"Agent: {result_row['agent_time']:.2f}s") + st.write(f"Judge: {result_row['judge_time']:.2f}s") + st.write(f"Total: {result_row['total_time']:.2f}s") + + with col2: + st.markdown("**Costs & Tokens**") + st.write(f"Cost: ${result_row['cost']:.4f}") + st.write(f"Total Tokens: {result_row['total_tokens']:,}") + st.write(f"Prompt Tokens: {result_row['prompt_tokens']:,}") + st.write(f"Completion Tokens: {result_row['completion_tokens']:,}") + + st.markdown("**User Prompt**") + st.code(result_row["user_prompt"], language=None) + + st.markdown("**Expected Output**") + expected = result_row["expected_output"] + if isinstance(expected, list): + for exp in expected: + st.write(f"- {exp}") + else: + st.write(expected) + + st.markdown("**Actual Output**") + st.code(result_row["actual_output"] or "N/A", language=None) + + if result_row["judge_rationale"]: + st.markdown("**Judge Rationale**") + st.write(result_row["judge_rationale"]) + + if result_row["error_message"]: + st.markdown("**Error**") + st.error(result_row["error_message"]) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/executor.py b/benchmarks/executor.py new file mode 100644 index 0000000000..a5e28f52bc --- /dev/null +++ b/benchmarks/executor.py @@ -0,0 +1,817 @@ +#!/usr/bin/env python3 +""" +Benchmark Test Suite Executor + +A comprehensive test suite for running LLM evaluation tests against different agents. +Each test run is saved to a JSON file in the results directory for analysis. 
+
+Usage:
+    # Run single test
+    python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods
+
+    # Run multiple tests
+    python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods --test-id 02_what_is_wrong_with_pod
+
+    # Run all tests
+    python benchmarks/executor.py --model sonnet4.5 --agent drdroid --all
+
+    # Run tests by tag
+    python benchmarks/executor.py --model sonnet4.5 --agent drdroid --tag kubernetes --tag easy
+
+    # List available tests and agents
+    python benchmarks/executor.py --list-tests
+    python benchmarks/executor.py --list-agents
+"""
+
+import argparse
+import json
+import logging
+import os
+import subprocess
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import yaml
+
+# Add project root to path
+PROJECT_ROOT = Path(__file__).parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+from benchmarks.agent import AgentResult, TestCase, get_agent, list_agents
+from benchmarks.config import (
+    FIXTURES_DIR,
+    RESULTS_DIR,
+    Credentials,
+    ensure_directories,
+    load_credentials,
+)
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Data Models
+# =============================================================================
+
+
+@dataclass
+class TestResult:
+    """Complete result of a single test execution."""
+
+    # Identifiers
+    test_id: str
+    agent: str
+    model: str  # Model used (e.g., "sonnet4.5", "gpt5.2")
+    run_id: str  # Unique ID for this run
+
+    # Test case info
+    user_prompt: str
+    expected_output: List[str]
+    tags: List[str]
+
+    # Execution results
+    status: str  # "passed", "failed", "setup_failed", "error"
+    actual_output: Optional[str] = None
+    tool_calls: List[str] = field(default_factory=list)
+
+    # Judge evaluation
+    score: Optional[float] = None
+    judge_rationale: Optional[str] = None
+    judge_model: Optional[str] = None
+
+    # Timing
+    setup_time: float = 0.0
+    agent_time: float = 0.0
+    judge_time: float = 0.0
+    cleanup_time: float = 0.0
+    total_time: float = 0.0
+
+    # Agent metadata
+    agent_metadata: Dict[str, Any] = field(default_factory=dict)
+
+    # Error info
+    error_message: Optional[str] = None
+    error_type: Optional[str] = None
+
+    # Timestamps
+    started_at: str = field(default_factory=lambda: datetime.now().isoformat())
+    completed_at: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return asdict(self)
+
+
+# =============================================================================
+# Test Case Loading
+# =============================================================================
+
+
+def load_test_case(test_id: str, fixtures_path: Path = FIXTURES_DIR) -> TestCase:
+    """Load a test case from YAML."""
+    folder = fixtures_path / test_id
+    yaml_path = folder / "test_case.yaml"
+
+    if not yaml_path.exists():
+        raise ValueError(f"Test case not found: {test_id}")
+
+    with open(yaml_path, "r") as f:
+        data = yaml.safe_load(f)
+
+    expected = data.get("expected_output", [])
+    if isinstance(expected, str):
+        expected = [expected]
+
+    user_prompt = data.get("user_prompt", "")
+    if isinstance(user_prompt, list):
+        user_prompt = user_prompt[0]
+
+    return TestCase(
+        id=test_id,
+        folder=str(folder),
+        user_prompt=user_prompt,
+        
expected_output=expected, + before_test=data.get("before_test"), + after_test=data.get("after_test"), + tags=data.get("tags", []), + setup_timeout=data.get("setup_timeout", 300), + ) + + +def discover_tests( + fixtures_path: Path = FIXTURES_DIR, + tags: Optional[List[str]] = None, +) -> List[str]: + """Discover all test IDs, optionally filtered by tags.""" + test_ids = [] + + for item in fixtures_path.iterdir(): + if item.is_dir() and not item.name.startswith("."): + yaml_path = item / "test_case.yaml" + if yaml_path.exists(): + # Filter by tags if specified + if tags: + with open(yaml_path, "r") as f: + data = yaml.safe_load(f) + test_tags = data.get("tags", []) + if not any(t in test_tags for t in tags): + continue + + test_ids.append(item.name) + + return sorted(test_ids) + + +# ============================================================================= +# Setup/Cleanup Execution +# ============================================================================= + + +def run_bash_script( + script: str, + cwd: str, + timeout: int = 300, + credentials: Optional[Credentials] = None, + stream_output: bool = True, +) -> tuple[bool, str, float]: + """Run a bash script with credentials applied to environment. + + Args: + script: Bash script to run + cwd: Working directory + timeout: Timeout in seconds + credentials: Credentials to apply to environment + stream_output: If True, stream output to console in real-time + """ + if not script or not script.strip(): + return True, "", 0.0 + + # Prepare environment with credentials + env = os.environ.copy() + if credentials: + env.update(credentials.to_env_vars()) + + start_time = time.time() + + try: + if stream_output: + # Use Popen to stream output in real-time + process = subprocess.Popen( + script, + shell=True, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, # Merge stderr into stdout + text=True, + cwd=cwd, + env=env, + bufsize=1, # Line buffered + ) + + output_lines = [] + try: + # Read and print output line by line + while True: + line = process.stdout.readline() + if not line and process.poll() is not None: + break + if line: + print(f" | {line.rstrip()}") + output_lines.append(line) + + # Check timeout + if time.time() - start_time > timeout: + process.kill() + process.wait() + elapsed = time.time() - start_time + return False, f"Timeout after {timeout}s", elapsed + + process.wait() + elapsed = time.time() - start_time + output = "".join(output_lines) + + if process.returncode != 0: + return False, f"Exit code {process.returncode}\n{output}", elapsed + + return True, output, elapsed + + except Exception as e: + process.kill() + process.wait() + raise e + else: + # Original behavior: capture output without streaming + result = subprocess.run( + script, + shell=True, + executable="/bin/bash", + capture_output=True, + text=True, + cwd=cwd, + timeout=timeout, + env=env, + ) + elapsed = time.time() - start_time + output = f"stdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" + + if result.returncode != 0: + return False, f"Exit code {result.returncode}\n{output}", elapsed + + return True, output, elapsed + + except subprocess.TimeoutExpired: + elapsed = time.time() - start_time + return False, f"Timeout after {timeout}s", elapsed + except Exception as e: + elapsed = time.time() - start_time + return False, f"Error: {str(e)}", elapsed + + +# ============================================================================= +# LLM Judge +# ============================================================================= + + 
+def evaluate_with_llm_judge( + expected_elements: List[str], + actual_output: str, + classifier_model: str = "gpt-4.1", +) -> tuple[float, str]: + """Evaluate output using LLM-as-judge.""" + try: + from autoevals import LLMClassifier + except ImportError: + logger.error("autoevals not installed. Run: pip install autoevals") + return 0.0, "autoevals not installed" + + expected_str = "\n- ".join(expected_elements) + + prompt_template = """ +You are evaluating the correctness of an OUTPUT given by a LLM. You must return a score that +represents the correctness of that OUTPUT. + +The correctness is defined by the presence of EXPECTED ELEMENTS in the OUTPUT. +Make a judgement call whether each ELEMENT sufficiently matches the OUTPUT. ELEMENTS do +not need to appear verbatim or be a perfect match but their essence should be +present in the whole OUTPUT, even if it spans multiple sentences. + +# EXPECTED ELEMENTS + +- {{expected}} + +# OUTPUT + +{{output}} + + +Return a choice based on the number of EXPECTED ELEMENTS present in the OUTPUT. +Possible choices: +- A: All elements are present +- B: Either no element is present or only some but not all elements are present +""" + + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + logger.warning("OPENAI_API_KEY not set, skipping LLM judge") + return 0.0, "No API key for judge" + + try: + classifier = LLMClassifier( + name="Correctness", + prompt_template=prompt_template, + choice_scores={"A": 1, "B": 0}, + use_cot=True, + model=classifier_model, + api_key=api_key, + ) + + result = classifier( + input=prompt_template, + output=actual_output, + expected=expected_str, + ) + + return result.score, result.metadata.get("rationale", "") + + except Exception as e: + logger.error(f"Judge error: {e}") + return 0.0, f"Judge error: {str(e)}" + + +# ============================================================================= +# Result Storage +# ============================================================================= + + +def generate_run_id(model: str, test_id: str) -> str: + """Generate a unique run ID based on model and test.""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"{model}_{test_id}_{timestamp}" + + +def save_result(result: TestResult, results_dir: Path = RESULTS_DIR) -> Path: + """Save a test result to JSON file.""" + ensure_directories() + + filename = f"{result.run_id}.json" + filepath = results_dir / filename + + with open(filepath, "w") as f: + json.dump(result.to_dict(), f, indent=2, default=str) + + return filepath + + +def load_results( + results_dir: Path = RESULTS_DIR, + agent: Optional[str] = None, + test_id: Optional[str] = None, +) -> List[TestResult]: + """Load results from JSON files, optionally filtered.""" + results = [] + + if not results_dir.exists(): + return results + + for filepath in results_dir.glob("*.json"): + try: + with open(filepath, "r") as f: + data = json.load(f) + + # Filter by agent + if agent and data.get("agent") != agent: + continue + + # Filter by test_id + if test_id and data.get("test_id") != test_id: + continue + + results.append(TestResult(**data)) + except Exception as e: + logger.warning(f"Failed to load {filepath}: {e}") + + return sorted(results, key=lambda r: r.started_at, reverse=True) + + +# ============================================================================= +# Test Executor +# ============================================================================= + + +class BenchmarkExecutor: + """Executes benchmark tests against registered agents.""" + + def 
__init__(
+        self,
+        agent: str,
+        model: str,
+        credentials: Optional[Credentials] = None,
+        skip_setup: bool = False,
+        skip_cleanup: bool = False,
+        classifier_model: str = "gpt-4.1",
+        quiet: bool = False,
+    ):
+        if not agent:
+            raise ValueError(
+                "Agent must be specified. Use --agent <agent_name>. "
+                f"Available agents: {', '.join(list_agents())}"
+            )
+
+        if not model:
+            raise ValueError(
+                "Model must be specified. Use --model <model_name>. "
+                "Examples: sonnet4.5, gpt5.2, gpt-4.1"
+            )
+
+        self.agent_name = agent
+        self.model = model
+        self.agent_fn = get_agent(agent)
+        self.credentials = credentials or load_credentials()
+        self.skip_setup = skip_setup
+        self.skip_cleanup = skip_cleanup
+        self.classifier_model = classifier_model
+        self.quiet = quiet  # If True, don't stream script output to console
+        self.results: List[TestResult] = []
+
+        # Apply credentials to environment
+        self.credentials.apply_to_env()
+
+        # Set model in environment for agent to use
+        os.environ["DRDROID_MODEL"] = model
+
+    def run_test(self, test_case: TestCase) -> TestResult:
+        """Run a single test case."""
+        run_id = generate_run_id(self.model, test_case.id)
+        start_time = time.time()
+
+        logger.info(f"{'='*60}")
+        logger.info(f"Test: {test_case.id}")
+        logger.info(f"Agent: {self.agent_name}")
+        logger.info(f"Model: {self.model}")
+        logger.info(f"Run ID: {run_id}")
+        logger.info(f"{'='*60}")
+
+        result = TestResult(
+            test_id=test_case.id,
+            agent=self.agent_name,
+            model=self.model,
+            run_id=run_id,
+            user_prompt=test_case.user_prompt,
+            expected_output=test_case.expected_output,
+            tags=test_case.tags,
+            status="error",
+        )
+
+        # 1. Run setup
+        if not self.skip_setup and test_case.before_test:
+            logger.info("Running setup...")
+            success, output, elapsed = run_bash_script(
+                test_case.before_test,
+                test_case.folder,
+                timeout=test_case.setup_timeout,
+                credentials=self.credentials,
+                stream_output=not self.quiet,  # Stream output unless --quiet
+            )
+            result.setup_time = elapsed
+
+            if not success:
+                logger.error(f"Setup failed:\n{output[:500]}...")
+                result.status = "setup_failed"
+                result.error_message = output
+                result.error_type = "setup_failure"
+                result.completed_at = datetime.now().isoformat()
+                result.total_time = time.time() - start_time
+                save_result(result)
+                return result
+
+            logger.info(f"Setup completed in {elapsed:.2f}s")
+
+        # 2. Run agent
+        try:
+            logger.info(f"Running agent [{self.agent_name}]...")
+            agent_start = time.time()
+
+            agent_result = self.agent_fn(test_case)
+
+            result.agent_time = time.time() - agent_start
+            result.actual_output = agent_result.output
+            result.tool_calls = agent_result.tool_calls
+            result.agent_metadata = agent_result.metadata
+
+            logger.info(f"Agent completed in {result.agent_time:.2f}s")
+
+        except Exception as e:
+            logger.error(f"Agent error: {e}")
+            result.status = "error"
+            result.error_message = str(e)
+            result.error_type = type(e).__name__
+            self._run_cleanup(test_case, result)
+            result.completed_at = datetime.now().isoformat()
+            result.total_time = time.time() - start_time
+            save_result(result)
+            return result
+
+        # 3. 
Run LLM judge + try: + logger.info("Running LLM judge...") + judge_start = time.time() + + score, rationale = evaluate_with_llm_judge( + test_case.expected_output, + result.actual_output, + self.classifier_model, + ) + + result.judge_time = time.time() - judge_start + result.score = score + result.judge_rationale = rationale + result.judge_model = self.classifier_model + result.status = "passed" if score == 1 else "failed" + + logger.info(f"Judge score: {score} ({result.status})") + + except Exception as e: + logger.error(f"Judge error: {e}") + result.status = "error" + result.error_message = f"Judge error: {str(e)}" + result.error_type = "judge_error" + + # 4. Run cleanup + self._run_cleanup(test_case, result) + + # Finalize + result.completed_at = datetime.now().isoformat() + result.total_time = time.time() - start_time + + # Save result + filepath = save_result(result) + logger.info(f"Result saved to: {filepath}") + + return result + + def _run_cleanup(self, test_case: TestCase, result: TestResult) -> None: + """Run cleanup after test.""" + if not self.skip_cleanup and test_case.after_test: + logger.info("Running cleanup...") + success, output, elapsed = run_bash_script( + test_case.after_test, + test_case.folder, + timeout=120, + credentials=self.credentials, + stream_output=False, # Cleanup output is less important + ) + result.cleanup_time = elapsed + if not success: + logger.warning(f"Cleanup failed (non-fatal)") + else: + logger.info(f"Cleanup completed in {elapsed:.2f}s") + + def run_tests(self, test_ids: List[str]) -> List[TestResult]: + """Run multiple tests.""" + results = [] + + for i, test_id in enumerate(test_ids, 1): + logger.info(f"\n[{i}/{len(test_ids)}] Running test: {test_id}") + try: + test_case = load_test_case(test_id) + result = self.run_test(test_case) + results.append(result) + self.results.append(result) + except Exception as e: + logger.error(f"Error loading test {test_id}: {e}") + + return results + + def print_summary(self) -> None: + """Print summary of all results.""" + total = len(self.results) + passed = sum(1 for r in self.results if r.status == "passed") + failed = sum(1 for r in self.results if r.status == "failed") + setup_failed = sum(1 for r in self.results if r.status == "setup_failed") + errors = sum(1 for r in self.results if r.status == "error") + + total_time = sum(r.total_time for r in self.results) + + print("\n" + "=" * 70) + print(f"BENCHMARK RESULTS") + print(f"Model: {self.model}") + print("=" * 70) + print(f"Total: {total}") + print(f"Passed: {passed} ✅") + print(f"Failed: {failed} ❌") + print(f"Setup Failed: {setup_failed} 🔧") + print(f"Errors: {errors} ⚠️") + print(f"Pass Rate: {(passed/total*100):.1f}%" if total > 0 else "N/A") + print(f"Total Time: {total_time:.2f}s") + print("=" * 70) + + print("\nDetailed Results:") + print("-" * 70) + for r in self.results: + icon = {"passed": "✅", "failed": "❌", "setup_failed": "🔧", "error": "⚠️"}.get( + r.status, "?" 
+ ) + print(f"{icon} {r.test_id}: {r.status.upper()} (score={r.score}, time={r.total_time:.1f}s)") + if r.error_message: + print(f" Error: {r.error_message[:80]}...") + print() + + +# ============================================================================= +# CLI +# ============================================================================= + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark Test Suite Executor", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run single test with model + python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods + + # Run multiple tests + python benchmarks/executor.py --model gpt5.2 --test-id 01_how_many_pods --test-id 02_what_is_wrong_with_pod + + # Run all tests + python benchmarks/executor.py --model sonnet4.5 --all + + # Run tests by tag + python benchmarks/executor.py --model sonnet4.5 --tag kubernetes --tag easy + + # List available tests + python benchmarks/executor.py --list-tests + python benchmarks/executor.py --list-tests --tag kubernetes + """, + ) + + # Model selection (REQUIRED) + parser.add_argument( + "--model", + help="Model to use for testing (REQUIRED). Examples: sonnet4.5, gpt5.2", + ) + + # Agent selection (defaults to drdroid) + parser.add_argument( + "--agent", + default="drdroid", + help="Agent to use (default: drdroid)", + ) + + # Test selection + parser.add_argument( + "--test-id", + action="append", + dest="test_ids", + help="Test ID(s) to run (can specify multiple)", + ) + parser.add_argument( + "--all", + action="store_true", + help="Run all available tests", + ) + parser.add_argument( + "--tag", + action="append", + dest="tags", + help="Filter tests by tag (can specify multiple)", + ) + + # Execution options + parser.add_argument( + "--skip-setup", + action="store_true", + help="Skip before_test scripts", + ) + parser.add_argument( + "--skip-cleanup", + action="store_true", + help="Skip after_test scripts", + ) + parser.add_argument( + "--classifier-model", + default=os.environ.get("CLASSIFIER_MODEL", "gpt-4.1"), + help="Model for LLM judge (default: gpt-4.1)", + ) + parser.add_argument( + "--credentials", + type=Path, + help="Path to credentials YAML file", + ) + + # List commands + parser.add_argument( + "--list-tests", + action="store_true", + help="List all available test IDs", + ) + parser.add_argument( + "--list-agents", + action="store_true", + help="List all registered agents", + ) + + # Output options + parser.add_argument( + "-v", "--verbose", + action="store_true", + help="Verbose output", + ) + parser.add_argument( + "-q", "--quiet", + action="store_true", + help="Suppress setup/cleanup script output (don't stream to console)", + ) + + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + # Handle list commands + if args.list_agents: + print("Available agents:") + for name in list_agents(): + print(f" - {name}") + return + + if args.list_tests: + test_ids = discover_tests(tags=args.tags) + print(f"Available tests ({len(test_ids)}):") + for tid in test_ids: + test_case = load_test_case(tid) + tags_str = f" [{', '.join(test_case.tags)}]" if test_case.tags else "" + print(f" - {tid}{tags_str}") + return + + # Validate model is specified for test execution + if not args.model: + parser.print_help() + print("\n" + "=" * 60) + print("ERROR: --model is required for test execution") + print("Examples: --model sonnet4.5, --model gpt5.2") + print("=" * 60) + sys.exit(1) + + # Load credentials + 
credentials = load_credentials(args.credentials) + + # Create executor + executor = BenchmarkExecutor( + agent=args.agent, + model=args.model, + credentials=credentials, + skip_setup=args.skip_setup, + skip_cleanup=args.skip_cleanup, + classifier_model=args.classifier_model, + quiet=args.quiet, + ) + + # Determine tests to run + if args.all: + test_ids = discover_tests(tags=args.tags) + elif args.test_ids: + test_ids = args.test_ids + elif args.tags: + test_ids = discover_tests(tags=args.tags) + else: + parser.print_help() + print("\nError: Specify --test-id, --all, or --tag") + sys.exit(1) + + if not test_ids: + print("No tests found matching criteria") + sys.exit(1) + + # Run tests + logger.info(f"Running {len(test_ids)} test(s) with model '{args.model}'...") + executor.run_tests(test_ids) + + # Print summary + executor.print_summary() + + # Exit code based on results + report = { + "passed": sum(1 for r in executor.results if r.status == "passed"), + "total": len(executor.results), + } + + if report["passed"] == report["total"] and report["total"] > 0: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/reporter.py b/benchmarks/reporter.py new file mode 100644 index 0000000000..306d89a6a8 --- /dev/null +++ b/benchmarks/reporter.py @@ -0,0 +1,626 @@ +#!/usr/bin/env python3 +""" +Benchmark Report Generator + +Generates reports from saved test results focusing on MODEL and TEST CASE analysis. + +Usage: + # Summary report + python benchmarks/reporter.py --summary + + # Compare models + python benchmarks/reporter.py --compare-models + + # Report by test case (use case) + python benchmarks/reporter.py --by-test + + # Report for specific model + python benchmarks/reporter.py --model sonnet4.5 + + # Report for specific test case + python benchmarks/reporter.py --test-id 01_how_many_pods + + # Export to JSON/CSV + python benchmarks/reporter.py --compare-models --output report.json +""" + +import argparse +import csv +import json +import sys +from collections import defaultdict +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +# Add project root to path +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(PROJECT_ROOT)) + +from benchmarks.config import RESULTS_DIR + + +def load_all_results(results_dir: Path = RESULTS_DIR) -> List[Dict[str, Any]]: + """Load all results from JSON files.""" + results = [] + + if not results_dir.exists(): + return results + + for filepath in results_dir.glob("*.json"): + try: + with open(filepath, "r") as f: + data = json.load(f) + results.append(data) + except Exception as e: + print(f"Warning: Failed to load {filepath}: {e}") + + return sorted(results, key=lambda r: r.get("started_at", ""), reverse=True) + + +def filter_results( + results: List[Dict[str, Any]], + model: Optional[str] = None, + test_id: Optional[str] = None, + status: Optional[str] = None, + since: Optional[str] = None, +) -> List[Dict[str, Any]]: + """Filter results by criteria.""" + filtered = results + + if model: + filtered = [r for r in filtered if r.get("model") == model] + + if test_id: + filtered = [r for r in filtered if r.get("test_id") == test_id] + + if status: + filtered = [r for r in filtered if r.get("status") == status] + + if since: + filtered = [r for r in filtered if r.get("started_at", "") >= since] + + return filtered + + +def generate_summary_report(results: List[Dict[str, Any]]) -> Dict[str, Any]: + """Generate overall summary report.""" + if not results: + 
return {"error": "No results found"} + + total = len(results) + passed = sum(1 for r in results if r.get("status") == "passed") + failed = sum(1 for r in results if r.get("status") == "failed") + setup_failed = sum(1 for r in results if r.get("status") == "setup_failed") + errors = sum(1 for r in results if r.get("status") == "error") + + # Timing stats + total_times = [r.get("total_time", 0) for r in results if r.get("total_time")] + agent_times = [r.get("agent_time", 0) for r in results if r.get("agent_time")] + + # Unique counts + unique_models = set(r.get("model") for r in results if r.get("model")) + unique_tests = set(r.get("test_id") for r in results if r.get("test_id")) + + return { + "summary": { + "total_runs": total, + "passed": passed, + "failed": failed, + "setup_failed": setup_failed, + "errors": errors, + "pass_rate": f"{(passed / total * 100):.1f}%" if total > 0 else "N/A", + }, + "timing": { + "avg_total_time": round(sum(total_times) / len(total_times), 2) if total_times else 0, + "avg_agent_time": round(sum(agent_times) / len(agent_times), 2) if agent_times else 0, + "max_total_time": round(max(total_times), 2) if total_times else 0, + "min_total_time": round(min(total_times), 2) if total_times else 0, + }, + "coverage": { + "unique_models": len(unique_models), + "unique_tests": len(unique_tests), + "models": sorted(unique_models), + }, + "generated_at": datetime.now().isoformat(), + } + + +def generate_model_comparison(results: List[Dict[str, Any]]) -> Dict[str, Any]: + """Generate model comparison report.""" + model_stats = defaultdict(lambda: { + "total": 0, + "passed": 0, + "failed": 0, + "setup_failed": 0, + "errors": 0, + "total_time": 0, + "agent_time": 0, + "tests": set(), + }) + + for r in results: + model = r.get("model", "unknown") + status = r.get("status", "unknown") + + model_stats[model]["total"] += 1 + model_stats[model]["tests"].add(r.get("test_id")) + + if status == "passed": + model_stats[model]["passed"] += 1 + elif status == "failed": + model_stats[model]["failed"] += 1 + elif status == "setup_failed": + model_stats[model]["setup_failed"] += 1 + else: + model_stats[model]["errors"] += 1 + + model_stats[model]["total_time"] += r.get("total_time", 0) + model_stats[model]["agent_time"] += r.get("agent_time", 0) + + # Convert to serializable format + comparison = {} + for model, stats in sorted(model_stats.items()): + total = stats["total"] + comparison[model] = { + "total_runs": total, + "passed": stats["passed"], + "failed": stats["failed"], + "setup_failed": stats["setup_failed"], + "errors": stats["errors"], + "pass_rate": f"{(stats['passed'] / total * 100):.1f}%" if total > 0 else "N/A", + "pass_rate_numeric": round(stats['passed'] / total * 100, 1) if total > 0 else 0, + "avg_total_time": round(stats["total_time"] / total, 2) if total > 0 else 0, + "avg_agent_time": round(stats["agent_time"] / total, 2) if total > 0 else 0, + "unique_tests": len(stats["tests"]), + } + + return { + "model_comparison": comparison, + "generated_at": datetime.now().isoformat(), + } + + +def generate_test_case_report(results: List[Dict[str, Any]]) -> Dict[str, Any]: + """Generate report grouped by test case (use case).""" + test_stats = defaultdict(lambda: { + "models": defaultdict(lambda: { + "runs": 0, + "passed": 0, + "failed": 0, + "avg_time": 0, + "total_time": 0, + }), + "tags": [], + "user_prompt": "", + }) + + for r in results: + test_id = r.get("test_id", "unknown") + model = r.get("model", "unknown") + status = r.get("status", "unknown") + + 
test_stats[test_id]["models"][model]["runs"] += 1 + test_stats[test_id]["models"][model]["total_time"] += r.get("total_time", 0) + + if status == "passed": + test_stats[test_id]["models"][model]["passed"] += 1 + elif status == "failed": + test_stats[test_id]["models"][model]["failed"] += 1 + + # Store test metadata + if not test_stats[test_id]["user_prompt"]: + test_stats[test_id]["user_prompt"] = r.get("user_prompt", "") + test_stats[test_id]["tags"] = r.get("tags", []) + + # Convert to serializable format + report = {} + for test_id, stats in sorted(test_stats.items()): + model_results = {} + for model, model_stats in stats["models"].items(): + runs = model_stats["runs"] + model_results[model] = { + "runs": runs, + "passed": model_stats["passed"], + "failed": model_stats["failed"], + "pass_rate": f"{(model_stats['passed'] / runs * 100):.1f}%" if runs > 0 else "N/A", + "avg_time": round(model_stats["total_time"] / runs, 2) if runs > 0 else 0, + } + + report[test_id] = { + "tags": stats["tags"], + "user_prompt": stats["user_prompt"][:100] + "..." if len(stats["user_prompt"]) > 100 else stats["user_prompt"], + "models": model_results, + } + + return { + "test_case_report": report, + "generated_at": datetime.now().isoformat(), + } + + +def generate_single_test_report( + results: List[Dict[str, Any]], + test_id: str, +) -> Dict[str, Any]: + """Generate detailed report for a specific test case.""" + test_results = [r for r in results if r.get("test_id") == test_id] + + if not test_results: + return {"error": f"No results found for test: {test_id}"} + + # Group by model + by_model = defaultdict(list) + for r in test_results: + by_model[r.get("model", "unknown")].append(r) + + model_performance = {} + for model, runs in sorted(by_model.items()): + passed = sum(1 for r in runs if r.get("status") == "passed") + failed = sum(1 for r in runs if r.get("status") == "failed") + total = len(runs) + + # Get latest run details + latest = runs[0] if runs else {} + + model_performance[model] = { + "runs": total, + "passed": passed, + "failed": failed, + "pass_rate": f"{(passed / total * 100):.1f}%" if total > 0 else "N/A", + "avg_time": round(sum(r.get("total_time", 0) for r in runs) / total, 2) if total > 0 else 0, + "latest_status": latest.get("status"), + "latest_score": latest.get("score"), + "latest_run_id": latest.get("run_id"), + "latest_judge_rationale": latest.get("judge_rationale", "")[:200] if latest.get("judge_rationale") else None, + } + + return { + "test_id": test_id, + "total_runs": len(test_results), + "models_tested": sorted(by_model.keys()), + "tags": test_results[0].get("tags", []) if test_results else [], + "user_prompt": test_results[0].get("user_prompt") if test_results else None, + "expected_output": test_results[0].get("expected_output") if test_results else None, + "model_performance": model_performance, + "generated_at": datetime.now().isoformat(), + } + + +def generate_detailed_report(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Generate detailed report with all results.""" + return [ + { + "test_id": r.get("test_id"), + "model": r.get("model"), + "run_id": r.get("run_id"), + "status": r.get("status"), + "score": r.get("score"), + "total_time": r.get("total_time"), + "agent_time": r.get("agent_time"), + "setup_time": r.get("setup_time"), + "judge_time": r.get("judge_time"), + "started_at": r.get("started_at"), + "error_message": r.get("error_message"), + } + for r in results + ] + + +# ============================================================================= 
+# Console Output Functions +# ============================================================================= + + +def print_summary_report(report: Dict[str, Any]) -> None: + """Print summary report to console.""" + summary = report.get("summary", {}) + timing = report.get("timing", {}) + coverage = report.get("coverage", {}) + + print("\n" + "=" * 70) + print("BENCHMARK SUMMARY REPORT") + print("=" * 70) + + print("\nOverall Statistics:") + print(f" Total Runs: {summary.get('total_runs', 0)}") + print(f" Passed: {summary.get('passed', 0)} ✅") + print(f" Failed: {summary.get('failed', 0)} ❌") + print(f" Setup Failed: {summary.get('setup_failed', 0)} 🔧") + print(f" Errors: {summary.get('errors', 0)} ⚠️") + print(f" Pass Rate: {summary.get('pass_rate', 'N/A')}") + + print("\nTiming:") + print(f" Avg Total Time: {timing.get('avg_total_time', 0):.2f}s") + print(f" Avg Agent Time: {timing.get('avg_agent_time', 0):.2f}s") + + print("\nCoverage:") + print(f" Unique Models: {coverage.get('unique_models', 0)}") + print(f" Unique Tests: {coverage.get('unique_tests', 0)}") + print(f" Models: {', '.join(coverage.get('models', []))}") + + print("=" * 70 + "\n") + + +def print_model_comparison(report: Dict[str, Any]) -> None: + """Print model comparison to console.""" + comparison = report.get("model_comparison", {}) + + print("\n" + "=" * 70) + print("MODEL COMPARISON REPORT") + print("=" * 70) + + # Header + print(f"\n{'Model':<20} {'Runs':>6} {'Pass':>6} {'Fail':>6} {'Rate':>8} {'Avg Time':>10}") + print("-" * 70) + + for model, stats in sorted(comparison.items(), key=lambda x: -x[1].get("pass_rate_numeric", 0)): + print( + f"{model:<20} " + f"{stats.get('total_runs', 0):>6} " + f"{stats.get('passed', 0):>6} " + f"{stats.get('failed', 0):>6} " + f"{stats.get('pass_rate', 'N/A'):>8} " + f"{stats.get('avg_agent_time', 0):>9.2f}s" + ) + + print("=" * 70 + "\n") + + +def print_test_case_report(report: Dict[str, Any]) -> None: + """Print test case report to console.""" + test_cases = report.get("test_case_report", {}) + + print("\n" + "=" * 70) + print("TEST CASE REPORT (by Use Case)") + print("=" * 70) + + for test_id, data in sorted(test_cases.items()): + tags_str = f" [{', '.join(data.get('tags', []))}]" if data.get('tags') else "" + print(f"\n{test_id}{tags_str}") + print(f" Prompt: {data.get('user_prompt', 'N/A')}") + + models = data.get("models", {}) + if models: + print(f" {'Model':<18} {'Runs':>5} {'Pass':>5} {'Rate':>8} {'Time':>8}") + print(f" {'-'*50}") + for model, stats in sorted(models.items()): + print( + f" {model:<18} " + f"{stats.get('runs', 0):>5} " + f"{stats.get('passed', 0):>5} " + f"{stats.get('pass_rate', 'N/A'):>8} " + f"{stats.get('avg_time', 0):>7.2f}s" + ) + + print("\n" + "=" * 70 + "\n") + + +def print_single_test_report(report: Dict[str, Any]) -> None: + """Print single test report to console.""" + if "error" in report: + print(f"Error: {report['error']}") + return + + print("\n" + "=" * 70) + print(f"TEST REPORT: {report.get('test_id', 'Unknown')}") + print("=" * 70) + + print(f"\nTotal Runs: {report.get('total_runs', 0)}") + print(f"Models Tested: {', '.join(report.get('models_tested', []))}") + print(f"Tags: {', '.join(report.get('tags', []))}") + + print(f"\nPrompt: {report.get('user_prompt', 'N/A')}") + + print("\nExpected Output:") + for exp in report.get("expected_output", []): + print(f" - {exp}") + + print("\nModel Performance:") + print(f" {'Model':<18} {'Runs':>5} {'Pass':>5} {'Rate':>8} {'Latest':>10} {'Score':>6}") + print(f" {'-'*60}") + + for model, perf in 
sorted(report.get("model_performance", {}).items()): + print( + f" {model:<18} " + f"{perf.get('runs', 0):>5} " + f"{perf.get('passed', 0):>5} " + f"{perf.get('pass_rate', 'N/A'):>8} " + f"{perf.get('latest_status', 'N/A'):>10} " + f"{perf.get('latest_score', 'N/A'):>6}" + ) + + print("\n" + "=" * 70 + "\n") + + +def export_to_file(data: Any, filepath: Path) -> None: + """Export report to JSON or CSV file.""" + if filepath.suffix == ".json": + with open(filepath, "w") as f: + json.dump(data, f, indent=2, default=str) + print(f"Report saved to: {filepath}") + + elif filepath.suffix == ".csv": + if isinstance(data, list): + rows = data + elif isinstance(data, dict) and "model_comparison" in data: + rows = [ + {"model": model, **stats} + for model, stats in data["model_comparison"].items() + ] + elif isinstance(data, dict) and "test_case_report" in data: + rows = [] + for test_id, test_data in data["test_case_report"].items(): + for model, model_stats in test_data.get("models", {}).items(): + rows.append({ + "test_id": test_id, + "model": model, + "tags": ", ".join(test_data.get("tags", [])), + **model_stats, + }) + else: + print("Cannot export this report type to CSV") + return + + if rows: + with open(filepath, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=rows[0].keys()) + writer.writeheader() + writer.writerows(rows) + print(f"Report saved to: {filepath}") + else: + print(f"Unsupported file format: {filepath.suffix}") + + +# ============================================================================= +# CLI +# ============================================================================= + + +def main(): + parser = argparse.ArgumentParser( + description="Generate benchmark reports (Model + Test Case level)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Summary report + python benchmarks/reporter.py --summary + + # Compare models side-by-side + python benchmarks/reporter.py --compare-models + + # Report by test case (use case) + python benchmarks/reporter.py --by-test + + # Report for specific model + python benchmarks/reporter.py --model sonnet4.5 + + # Report for specific test case + python benchmarks/reporter.py --test-id 01_how_many_pods + + # Export to file + python benchmarks/reporter.py --compare-models --output comparison.json + python benchmarks/reporter.py --by-test --output tests.csv + """, + ) + + # Report types + parser.add_argument( + "--summary", + action="store_true", + help="Generate summary report", + ) + parser.add_argument( + "--compare-models", + action="store_true", + help="Compare performance across models", + ) + parser.add_argument( + "--by-test", + action="store_true", + help="Report grouped by test case (use case)", + ) + parser.add_argument( + "--detailed", + action="store_true", + help="Show detailed results list", + ) + + # Filters + parser.add_argument( + "--model", + help="Filter by model", + ) + parser.add_argument( + "--test-id", + help="Report on specific test case", + ) + parser.add_argument( + "--status", + choices=["passed", "failed", "setup_failed", "error"], + help="Filter by status", + ) + parser.add_argument( + "--since", + help="Filter results since date (ISO format)", + ) + + # Output + parser.add_argument( + "--output", "-o", + type=Path, + help="Output file (JSON or CSV)", + ) + parser.add_argument( + "--results-dir", + type=Path, + default=RESULTS_DIR, + help="Results directory", + ) + + args = parser.parse_args() + + # Load results + results = load_all_results(args.results_dir) + + if 
not results: + print("No results found in results directory") + print(f"Results directory: {args.results_dir}") + sys.exit(1) + + # Apply filters (except test_id if doing single test report) + filtered = filter_results( + results, + model=args.model, + test_id=None, # Don't filter by test_id for filtering, handled separately + status=args.status, + since=args.since, + ) + + # Generate requested report + if args.test_id: + # Single test report + report = generate_single_test_report(results, args.test_id) + if args.output: + export_to_file(report, args.output) + else: + print_single_test_report(report) + + elif args.compare_models: + report = generate_model_comparison(filtered) + if args.output: + export_to_file(report, args.output) + else: + print_model_comparison(report) + + elif args.by_test: + report = generate_test_case_report(filtered) + if args.output: + export_to_file(report, args.output) + else: + print_test_case_report(report) + + elif args.detailed: + report = generate_detailed_report(filtered) + if args.output: + export_to_file(report, args.output) + else: + for r in report[:30]: + status_icon = {"passed": "✅", "failed": "❌", "setup_failed": "🔧", "error": "⚠️"}.get( + r["status"], "?" + ) + print(f"{status_icon} {r['model']}/{r['test_id']}: {r['status']} (score={r['score']}, time={r['total_time']:.1f}s)") + if len(report) > 30: + print(f"... and {len(report) - 30} more results") + + else: + # Default to summary + report = generate_summary_report(filtered) + if args.output: + export_to_file(report, args.output) + else: + print_summary_report(report) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/results/.gitkeep b/benchmarks/results/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/llm/fixtures/test_ask_holmes/01_how_many_pods/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/01_how_many_pods/test_case.yaml index 3c3ce7e942..7fc6b2ec63 100644 --- a/tests/llm/fixtures/test_ask_holmes/01_how_many_pods/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/01_how_many_pods/test_case.yaml @@ -1,23 +1,23 @@ -user_prompt: "How many pods are in the app-01 namespace?" +user_prompt: "How many pods are in the app-01 namespace in Azure Prod cluster?" expected_output: - There are 14 pods in the app-01 namespace before_test: | kubectl apply -f manifests.yaml # Wait for 14 pods to be created and ready (60s total) - MUST succeed or test fails PODS_READY=false - for i in {1..12}; do + for i in {1..25}; do POD_COUNT=$(kubectl get pods -l app=test-pod -n app-01 --no-headers 2>/dev/null | wc -l) - if [ "$POD_COUNT" -eq 14 ] && kubectl wait --for=condition=ready pod -l app=test-pod -n app-01 --timeout=5s 2>/dev/null; then + if [ "$POD_COUNT" -eq 14 ]; then echo "✅ All 14 pods created and ready!" PODS_READY=true break else - echo "⏳ Attempt $i/12: $POD_COUNT/14 pods found, waiting 5s..." - sleep 5 + echo "⏳ Attempt $i/25: $POD_COUNT/14 pods found, waiting 10s..." 
+ sleep 10 fi done if [ "$PODS_READY" = false ]; then - echo "❌ 14 ready pods not achieved after 60s" + echo "❌ 14 ready pods not achieved after 250s" kubectl get pods -n app-01 exit 1 fi diff --git a/tests/llm/fixtures/test_ask_holmes/02_what_is_wrong_with_pod/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/02_what_is_wrong_with_pod/test_case.yaml index 9c8e8f023d..6d2c72f1fb 100644 --- a/tests/llm/fixtures/test_ask_holmes/02_what_is_wrong_with_pod/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/02_what_is_wrong_with_pod/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: 'What is wrong with << { "type": "pod", "name": "giant-narwhal-6958c5bdd8-69gtn", "namespace": "app-02" } >> ?' +user_prompt: 'What is wrong with << { "type": "pod", "name": "giant-narwhal-6958c5bdd8-69gtn", "namespace": "app-02" } >> in Azure Prod cluster?' expected_output: - The pod was killed due to it being out of memory diff --git a/tests/llm/fixtures/test_ask_holmes/04_related_k8s_events/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/04_related_k8s_events/test_case.yaml index 53c596d72e..5a7dec215e 100644 --- a/tests/llm/fixtures/test_ask_holmes/04_related_k8s_events/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/04_related_k8s_events/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "What kubernetes events are related to the nginx pod in app-04?" +user_prompt: "What kubernetes events are related to the nginx pod in app-04 in Azure Prod cluster?" expected_output: - "Container is failing or crashing (CrashLoopBackOff or similar restart events)" - "Events showing repeated container restarts or failures" diff --git a/tests/llm/fixtures/test_ask_holmes/05_image_version/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/05_image_version/test_case.yaml index bc2b15b971..88b671e4d5 100644 --- a/tests/llm/fixtures/test_ask_holmes/05_image_version/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/05_image_version/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: 'what is the image version of << { "type": "pod", "name": "sea-turtle" } >>' +user_prompt: 'what is the image version of << { "type": "pod", "name": "sea-turtle" } >> in Azure Prod cluster ?' expected_output: - The image version of the `sea-turtle` pod is either `1.23.4` or `nginx:1.23.4`. tags: diff --git a/tests/llm/fixtures/test_ask_holmes/07_high_latency/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/07_high_latency/test_case.yaml index 90eb9a908c..753d80a6bb 100644 --- a/tests/llm/fixtures/test_ask_holmes/07_high_latency/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/07_high_latency/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "Why is there high latency with the customer-orders deployment?" +user_prompt: "Why is there high latency with the customer-orders deployment in Azure Prod Cluster?" expected_output: - High latency in the `customer-orders` deployment is due to slow database calls - Database operations are taking between 5 to 10 seconds to complete diff --git a/tests/llm/fixtures/test_ask_holmes/09_crashpod/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/09_crashpod/test_case.yaml index 249ac71f0c..0d0368b98e 100644 --- a/tests/llm/fixtures/test_ask_holmes/09_crashpod/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/09_crashpod/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "What is the issue with payment-processing-worker?" +user_prompt: "What is the issue with payment-processing-worker in Azure Prod cluster?" 
expected_output: - The `DEPLOY_ENV` environment variable is undefined or missing before_test: | diff --git a/tests/llm/fixtures/test_ask_holmes/10_image_pull_backoff/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/10_image_pull_backoff/test_case.yaml index 40f6ca32e3..94ba06fb52 100644 --- a/tests/llm/fixtures/test_ask_holmes/10_image_pull_backoff/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/10_image_pull_backoff/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: What is the issue with customer-relations-webapp? +user_prompt: What is the issue with customer-relations-webapp in Azure Prod cluster? expected_output: - The Docker image `yourcompany/crw:latest` cannot be pulled before_test: | diff --git a/tests/llm/fixtures/test_ask_holmes/11_init_containers/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/11_init_containers/test_case.yaml index 4f2ed9fad9..947d5cbb19 100644 --- a/tests/llm/fixtures/test_ask_holmes/11_init_containers/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/11_init_containers/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "What is the issue with logging-agent?" +user_prompt: "What is the issue with logging-agent in Azure Prod cluster?" expected_output: - The command `wge` is not found - The answer should suggets `wge` may be a typo for `wget`. diff --git a/tests/llm/fixtures/test_ask_holmes/12_job_crashing/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/12_job_crashing/test_case.yaml index fe7dd1d7ab..28fdc6a3c8 100644 --- a/tests/llm/fixtures/test_ask_holmes/12_job_crashing/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/12_job_crashing/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "What is the issue with job java-api-checker in namespace app-12" +user_prompt: "What is the issue with job java-api-checker in namespace app-12 in Azure Prod cluster?" expected_output: - The `java-api-checker` job repeatedly fails to connect to the database at `prod-db:3333` before_test: | @@ -18,8 +18,8 @@ before_test: | LOGS_READY=true break else - echo "⏳ Attempt $i/20: waiting for job pod with specific log lines, checking in 3s..." - sleep 3 + echo "⏳ Attempt $i/20: waiting for job pod with specific log lines, checking in 10s..." + sleep 10 fi done if [ "$LOGS_READY" = false ]; then From ad70f2565f1ebcc90a4b8eb7a96efe059882b6db Mon Sep 17 00:00:00 2001 From: Dipesh Mittal Date: Mon, 16 Feb 2026 18:18:57 +0530 Subject: [PATCH 2/2] added benchmarking guides --- benchmarks/BENCHMARKING_GUIDE.md | 267 +++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 benchmarks/BENCHMARKING_GUIDE.md diff --git a/benchmarks/BENCHMARKING_GUIDE.md b/benchmarks/BENCHMARKING_GUIDE.md new file mode 100644 index 0000000000..2f260b733b --- /dev/null +++ b/benchmarks/BENCHMARKING_GUIDE.md @@ -0,0 +1,267 @@ +# Benchmarking Guide + +This guide covers how to run benchmark tests using the DrDroid agent against HolmesGPT's evaluation test cases, generate reports, and review results in the Streamlit dashboard. + +## Prerequisites + +- Python 3.10+ +- A running Kubernetes cluster with `kubectl` configured +- API keys for the LLM judge (OpenAI) and DrDroid agent +- Install extra dependencies: + ```bash + pip install streamlit pandas pyyaml requests + ``` + +## 1. 
Setup Credentials + +```bash +cp benchmarks/config/credentials.yaml.template benchmarks/config/credentials.yaml +``` + +Edit `benchmarks/config/credentials.yaml` and fill in: + +```yaml +# Required for the DrDroid agent +custom: + drdroid: + api_url: http://your-drdroid-api-url + api_key: your-drdroid-api-key + +# Required for the LLM judge that scores results +openai: + api_key: sk-... + +judge: + model: gpt-4.1 + +# Required for Kubernetes-based tests +kubernetes: + kubeconfig: ~/.kube/config + context: your-cluster-context +``` + +Alternatively, set environment variables (these override the YAML file): + +```bash +export DRDROID_API_URL=http://your-drdroid-api-url +export DRDROID_API_KEY=your-drdroid-api-key +export OPENAI_API_KEY=sk-... +export CLASSIFIER_MODEL=gpt-4.1 +``` + +## 2. Running Benchmark Tests + +### List Available Tests + +```bash +python benchmarks/executor.py --list-tests +``` + +### Run a Single Test + +```bash +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods +``` + +The `--model` flag is **required** and labels which model the agent is using (used for tracking/comparison). The `--agent` flag selects the agent implementation (defaults to `drdroid`). + +### Run Multiple Specific Tests + +```bash +python benchmarks/executor.py --model sonnet4.5 --agent drdroid \ + --test-id 01_how_many_pods \ + --test-id 02_what_is_wrong_with_pod \ + --test-id 09_crashpod +``` + +### Run All Tests + +```bash +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --all +``` + +### Run Tests by Tag + +```bash +# Run only Kubernetes tests +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --tag kubernetes + +# Run easy/regression tests +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --tag easy + +# Multiple tags (OR logic) +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --tag kubernetes --tag prometheus +``` + +### Skip Setup or Cleanup + +Useful for iterative debugging: + +```bash +# Skip infrastructure setup (if resources are already running) +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods --skip-setup + +# Skip cleanup (keep infrastructure running after test) +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods --skip-cleanup +``` + +## 3. Available Agents + +| Agent | Description | +|------------- |--------------------------------------------------| +| `drdroid` | DrDroid Investigation API | +| `holmes` | HolmesGPT ToolCallingLLM | +| `claudecode` | Local Claude Code CLI with read-only kubectl | +| `openai` | Simple OpenAI completion (no tools) | + +To compare agents, run the same tests with different `--agent` flags: + +```bash +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --all +python benchmarks/executor.py --model sonnet4.5 --agent holmes --all +``` + +## 4. Generating Reports + +### CLI Summary + +```bash +python benchmarks/reporter.py --summary +``` + +Shows overall pass rate, timing, and coverage stats. + +### Model Comparison + +```bash +python benchmarks/reporter.py --compare-models +``` + +Side-by-side comparison of pass rates across models. + +### Report by Test Case + +```bash +python benchmarks/reporter.py --by-test +``` + +Breakdown of results per test case, showing which models passed/failed each one. 
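+
+### Scripted Reports
+
+The report CLI is a thin wrapper over plain functions in `benchmarks/reporter.py`, so reports can also be generated from your own scripts (for example in CI). A minimal sketch, assuming the `benchmarks` package is importable from the repository root; the `since` value is only an example filter:
+
+```python
+from benchmarks.reporter import (
+    load_all_results,
+    filter_results,
+    generate_model_comparison,
+    print_model_comparison,
+)
+
+# Load every result JSON from benchmarks/results/
+results = load_all_results()
+
+# Keep only runs since a given date (ISO format, same as --since)
+recent = filter_results(results, since="2026-01-30")
+
+# Build and print the same table as --compare-models
+report = generate_model_comparison(recent)
+print_model_comparison(report)
+```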
### Filter Results

```bash
# Results for a specific model only
python benchmarks/reporter.py --summary --model sonnet4.5

# Results since a specific date
python benchmarks/reporter.py --summary --since 2026-01-30

# Only failed tests
python benchmarks/reporter.py --detailed --status failed

# Specific test case
python benchmarks/reporter.py --test-id 01_how_many_pods
```

### Export to File

```bash
# JSON export
python benchmarks/reporter.py --summary --output report.json

# CSV export
python benchmarks/reporter.py --compare-models --output comparison.csv
python benchmarks/reporter.py --by-test --output tests.csv
```

## 5. Reviewing Results in the Streamlit Dashboard

Launch the interactive dashboard:

```bash
streamlit run benchmarks/dashboard.py
```

Or pin the port explicitly (8501 is Streamlit's default):

```bash
streamlit run benchmarks/dashboard.py --server.port 8501
```

### Dashboard Sections

- **Overview** - Total runs, pass rate, cost, and token usage summary
- **Model Comparison** - Side-by-side comparison of all models tested
- **Test Cases** - Per-test-case analysis with per-model breakdown
- **Use Case x Model Matrix** - Pivot table showing status/metrics for every test-model combination
- **Raw Results** - Detailed view with filtering and drill-down into individual runs

### Dashboard Features

- **Refresh Data** button to reload latest results
- Filter by model, status, and date range
- Download CSV exports directly from the UI
- View detailed output, judge rationale, and errors for any run

## 6. Understanding Results

Each test run produces a JSON file in `benchmarks/results/`:

```
results/
  sonnet4.5_01_how_many_pods_20260130_163000.json
  sonnet4.5_02_what_is_wrong_with_pod_20260130_163100.json
  ...
```

Key fields in each result:

| Field             | Description                                 |
|-------------------|---------------------------------------------|
| `status`          | `passed`, `failed`, `setup_failed`, `error` |
| `score`           | 0.0 to 1.0 score from the LLM judge         |
| `judge_rationale` | The judge's explanation for the score       |
| `actual_output`   | The agent's raw response                    |
| `agent_time`      | Time the agent took to respond              |
| `setup_time`      | Time for infrastructure setup               |
| `tool_calls`      | Tools the agent invoked                     |

## 7. Adding More Tests

Test cases live in `tests/llm/fixtures/test_ask_holmes/`. Each test is a directory containing a `test_case.yaml`:

```yaml
user_prompt: "Your question here?"
expected_output:
  - "What the judge should check for"
  - "Another expected fact"
tags:
  - kubernetes
  - easy
before_test: |
  # Bash script to set up infrastructure
  kubectl apply -f manifests.yaml
after_test: |
  # Bash script to clean up
  kubectl delete -f manifests.yaml
```

After adding a test, verify it appears:

```bash
python benchmarks/executor.py --list-tests
```

Then run it:

```bash
python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id your_new_test
```

## 8. Branch-Specific Changes

The `benchmarking-drdroid-agent` branch includes these changes over `master`:

1. **New `benchmarks/` module** - Complete benchmarking framework with executor, agent registry, reporter, config management, and Streamlit dashboard
2. **Updated test prompts** - Added "in Azure Prod cluster" context to test prompts (tests 01, 02, 04, 05, 07, 09, 10, 11, 12) for more realistic DrDroid agent evaluation
3. **Increased setup timeouts** - Longer wait times in `before_test` scripts for tests 01 and 12 to handle slower cluster environments (see the sketch below)
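For context on item 3, the `before_test` scripts already poll and retry while waiting for workloads to become ready; the branch only lengthens the sleep interval. The sketch below reconstructs that pattern from the test 12 diff. The `kubectl logs ... | grep` condition and the failure handling are assumptions for illustration, not copied from the branch:

```bash
# Sketch of the wait-loop pattern used in before_test scripts (based on test 12).
# The job name, namespace, and log pattern below are assumptions for illustration.
LOGS_READY=false
for i in $(seq 1 20); do
  if kubectl logs job/java-api-checker -n app-12 2>/dev/null | grep -q "prod-db:3333"; then
    LOGS_READY=true
    break
  else
    echo "⏳ Attempt $i/20: waiting for job pod with specific log lines, checking in 10s..."
    sleep 10
  fi
done

if [ "$LOGS_READY" = false ]; then
  echo "Job logs never appeared; failing setup" >&2
  exit 1
fi
```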