From 084390de771f26e568e18f472986cf749402f55e Mon Sep 17 00:00:00 2001 From: Dipesh Mittal Date: Mon, 16 Feb 2026 17:59:20 +0530 Subject: [PATCH 1/2] added benchmarking changes for drdroid agent --- benchmarks/.gitignore | 10 + benchmarks/README.md | 442 ++++++++++ benchmarks/__init__.py | 54 ++ benchmarks/agent.py | 688 +++++++++++++++ benchmarks/config.py | 228 +++++ benchmarks/config/.gitkeep | 0 benchmarks/config/credentials.yaml.template | 78 ++ benchmarks/config/mcp_servers.json.template | 42 + benchmarks/dashboard.py | 652 ++++++++++++++ benchmarks/executor.py | 817 ++++++++++++++++++ benchmarks/reporter.py | 626 ++++++++++++++ benchmarks/results/.gitkeep | 0 .../01_how_many_pods/test_case.yaml | 12 +- .../02_what_is_wrong_with_pod/test_case.yaml | 2 +- .../04_related_k8s_events/test_case.yaml | 2 +- .../05_image_version/test_case.yaml | 2 +- .../07_high_latency/test_case.yaml | 2 +- .../09_crashpod/test_case.yaml | 2 +- .../10_image_pull_backoff/test_case.yaml | 2 +- .../11_init_containers/test_case.yaml | 2 +- .../12_job_crashing/test_case.yaml | 6 +- 21 files changed, 3653 insertions(+), 16 deletions(-) create mode 100644 benchmarks/.gitignore create mode 100644 benchmarks/README.md create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/agent.py create mode 100644 benchmarks/config.py create mode 100644 benchmarks/config/.gitkeep create mode 100644 benchmarks/config/credentials.yaml.template create mode 100644 benchmarks/config/mcp_servers.json.template create mode 100644 benchmarks/dashboard.py create mode 100644 benchmarks/executor.py create mode 100644 benchmarks/reporter.py create mode 100644 benchmarks/results/.gitkeep diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 0000000000..e3ee75b93b --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,10 @@ +# Credentials (contains secrets) +config/credentials.yaml +config/mcp_servers.json + +# Results directory (can be large) +results/*.json + +# Keep the directories +!config/.gitkeep +!results/.gitkeep diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000000..951fe3cc94 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,442 @@ +# Benchmark Test Suite + +A comprehensive test suite for evaluating LLM models against HolmesGPT test cases. +Results are tracked by **model** and **test case (use case)** for easy comparison. + +## Directory Structure + +``` +benchmarks/ +├── config/ +│ ├── credentials.yaml # Your credentials (git-ignored) +│ └── credentials.yaml.template # Template for credentials +├── results/ # Test results (JSON files) +├── agent.py # Agent implementations +├── executor.py # Test execution engine +├── config.py # Configuration management +├── reporter.py # CLI report generation +├── dashboard.py # Streamlit dashboard +└── README.md +``` + +## Quick Start + +```bash +# 1. Setup credentials +cp benchmarks/config/credentials.yaml.template benchmarks/config/credentials.yaml +# Edit credentials.yaml with your API keys + +# 2. List available tests +python benchmarks/executor.py --list-tests + +# 3. Run a test (model is REQUIRED) +python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods + +# 4. 
View results (choose one) +python benchmarks/reporter.py --summary # CLI summary +python benchmarks/reporter.py --compare-models # Model comparison +streamlit run benchmarks/dashboard.py # Interactive dashboard +``` + +## Running Tests + +### Single Test +```bash +python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods +``` + +### Multiple Tests +```bash +python benchmarks/executor.py --model gpt5.2 \ + --test-id 01_how_many_pods \ + --test-id 02_what_is_wrong_with_pod +``` + +### All Tests +```bash +python benchmarks/executor.py --model sonnet4.5 --all +``` + +### Tests by Tag +```bash +# Run all kubernetes tests +python benchmarks/executor.py --model sonnet4.5 --tag kubernetes + +# Run easy tests +python benchmarks/executor.py --model gpt5.2 --tag easy + +# Multiple tags (OR logic) +python benchmarks/executor.py --model sonnet4.5 --tag kubernetes --tag prometheus +``` + +### Skip Setup/Cleanup +```bash +# Skip infrastructure setup (useful for debugging) +python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods --skip-setup + +# Skip cleanup (keep infrastructure running) +python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods --skip-cleanup +``` + +## Available Agents + +| Agent | Description | +|-------|-------------| +| `drdroid` | DrDroid Investigation API (default) | +| `claudecode` | Local Claude Code CLI with read-only kubectl | +| `holmes` | HolmesGPT ToolCallingLLM | +| `openai` | Simple OpenAI completion (no tools) | + +### Claude Code Agent + +The `claudecode` agent runs prompts through your local Claude Code CLI installation. +It's restricted to read-only kubectl commands for safe investigation. + +```bash +# Run with Claude Code agent +python benchmarks/executor.py --model claude-sonnet --agent claudecode --test-id 01_how_many_pods + +# With custom model +CLAUDE_MODEL=claude-sonnet-4-20250514 python benchmarks/executor.py --model sonnet4 --agent claudecode --all +``` + +**Requirements:** +- Claude Code CLI installed and authenticated (`claude` command in PATH) +- kubectl configured with appropriate cluster context + +**Restrictions (enforced via system prompt):** +- Only read-only kubectl commands allowed: `get`, `describe`, `logs`, `top`, `explain`, `api-resources`, `cluster-info` +- Write commands forbidden: `apply`, `create`, `delete`, `edit`, `patch`, `exec`, etc. + +### Adding a Custom Agent + +Edit `benchmarks/agent.py`: + +```python +@register_agent("my_agent") +def my_custom_agent(test_case: TestCase) -> AgentResult: + """My custom agent implementation.""" + + # Your agent logic here + response = call_my_api(test_case.user_prompt) + + return AgentResult( + output=response, + tool_calls=["tool1", "tool2"], # optional + metadata={"custom": "data"}, # optional + ) +``` + +## Credentials Configuration + +Create `benchmarks/config/credentials.yaml`: + +```yaml +# Kubernetes +kubernetes: + kubeconfig: ~/.kube/config + context: my-cluster + +# Monitoring tools +datadog: + api_key: your-api-key + app_key: your-app-key + +prometheus: + url: http://localhost:9090 + +grafana: + url: http://localhost:3000 + api_key: your-api-key + +# LLM +openai: + api_key: sk-... + +# Judge +judge: + model: gpt-4.1 + +# Custom +custom: + drdroid: + api_url: http://localhost:8000 + api_key: your-key +``` + +Environment variables override file values: +- `OPENAI_API_KEY` +- `CLASSIFIER_MODEL` +- `DRDROID_API_URL` +- `DRDROID_API_KEY` +- etc. 
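+
+For example, a one-off run can point the suite at a different judge model or DrDroid endpoint without editing `credentials.yaml` (the values below are illustrative placeholders):
+
+```bash
+export OPENAI_API_KEY=sk-...
+export CLASSIFIER_MODEL=gpt-4.1
+export DRDROID_API_URL=http://localhost:8000
+export DRDROID_API_KEY=your-key
+
+python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods
+```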
+ +## Results Storage + +Each test run is saved to `benchmarks/results/` as a JSON file named by model: + +``` +results/ +├── sonnet4.5_01_how_many_pods_20260130_163000.json +├── sonnet4.5_02_what_is_wrong_with_pod_20260130_163100.json +├── gpt5.2_01_how_many_pods_20260130_164000.json +└── ... +``` + +### Result File Format + +```json +{ + "test_id": "01_how_many_pods", + "agent": "drdroid", + "model": "sonnet4.5", + "run_id": "sonnet4.5_01_how_many_pods_20260130_163000", + "status": "passed", + "user_prompt": "How many pods are in the app-01 namespace?", + "expected_output": ["There are 14 pods in the app-01 namespace"], + "actual_output": "There are 14 pods running in namespace app-01.", + "score": 1.0, + "judge_rationale": "The output correctly states 14 pods...", + "judge_model": "gpt-4.1", + "setup_time": 45.2, + "agent_time": 3.5, + "judge_time": 2.1, + "cleanup_time": 5.0, + "total_time": 55.8, + "tool_calls": ["kubectl_get_pods"], + "agent_metadata": { + "investigation_id": "inv-123", + "tokens": 150 + }, + "started_at": "2026-01-30T16:30:00", + "completed_at": "2026-01-30T16:30:55" +} +``` + +## Generating Reports + +### Interactive Dashboard (Recommended) + +```bash +streamlit run benchmarks/dashboard.py +``` + +The dashboard provides: + +- **Overview**: Total runs, pass rate, cost, tokens summary +- **Model Comparison**: Side-by-side comparison of all models +- **Test Cases**: Analysis by test case with per-model breakdown +- **Use Case x Model Matrix**: Pivot table showing status/metrics for every combination +- **Raw Results**: Detailed view with filtering and drill-down + +Features: + +- Auto-refresh with "Refresh Data" button +- Filter by model, status, and date range +- Download CSV exports +- View detailed output, rationale, and errors for any run + +### CLI Reports + +#### Summary Report +```bash +python benchmarks/reporter.py --summary +``` + +Output: +``` +====================================================================== +BENCHMARK SUMMARY REPORT +====================================================================== + +Overall Statistics: + Total Runs: 50 + Passed: 42 ✅ + Failed: 5 ❌ + Setup Failed: 2 🔧 + Errors: 1 ⚠️ + Pass Rate: 84.0% + +Timing: + Avg Total Time: 45.30s + Avg Agent Time: 3.20s + +Coverage: + Unique Models: 2 + Unique Tests: 20 + Models: sonnet4.5, gpt5.2 +``` + +#### Model Comparison +```bash +python benchmarks/reporter.py --compare-models +``` + +Output: +``` +====================================================================== +MODEL COMPARISON REPORT +====================================================================== + +Model Runs Pass Fail Rate Avg Time +---------------------------------------------------------------------- +sonnet4.5 5 4 1 80.0% 42.50s +gpt5.2 5 3 2 60.0% 38.20s +====================================================================== +``` + +#### Test Case Report (by Use Case) +```bash +python benchmarks/reporter.py --by-test +``` + +Output: +``` +====================================================================== +TEST CASE REPORT (by Use Case) +====================================================================== + +01_how_many_pods [kubernetes, easy] + Prompt: How many pods are in the app-01 namespace?... + Model Runs Pass Rate Time + -------------------------------------------------- + sonnet4.5 3 3 100.0% 42.50s + gpt5.2 2 1 50.0% 38.20s + +02_what_is_wrong_with_pod [kubernetes] + ... 
+``` + +#### Test-Specific Report +```bash +python benchmarks/reporter.py --test-id 01_how_many_pods +``` + +#### Export Reports +```bash +# JSON export +python benchmarks/reporter.py --summary --output report.json + +# CSV export +python benchmarks/reporter.py --compare-models --output comparison.csv +python benchmarks/reporter.py --by-test --output tests.csv +``` + +#### Filter Results +```bash +# Results for specific model +python benchmarks/reporter.py --summary --model sonnet4.5 + +# Results since a date +python benchmarks/reporter.py --summary --since 2026-01-30 + +# Failed tests only +python benchmarks/reporter.py --detailed --status failed +``` + +## Test Execution Flow + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ 1. Load credentials from config/credentials.yaml │ +│ 2. Load test case from fixtures (user_prompt, expected_output) │ +│ 3. Run before_test bash script (setup infrastructure) │ +│ 4. Call agent with test_case.user_prompt (model passed via env) │ +│ 5. LLM Judge evaluates actual vs expected output │ +│ 6. Run after_test bash script (cleanup) │ +│ 7. Save result to results/{model}_{test_id}_{timestamp}.json │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +## Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `OPENAI_API_KEY` | OpenAI API key for judge | - | +| `CLASSIFIER_MODEL` | Model for LLM judge | `gpt-4.1` | +| `DRDROID_API_URL` | DrDroid API URL | `http://localhost:8000` | +| `DRDROID_API_KEY` | DrDroid API key | - | +| `DRDROID_MODEL` | Model to use (set by --model flag) | - | + +## CLI Reference + +### executor.py + +``` +python benchmarks/executor.py [OPTIONS] + +Options: + --model TEXT Model to use (REQUIRED). Examples: sonnet4.5, gpt5.2 + --agent TEXT Agent to use (default: drdroid) + --test-id TEXT Test ID(s) to run (repeatable) + --all Run all tests + --tag TEXT Filter by tag (repeatable) + --skip-setup Skip before_test scripts + --skip-cleanup Skip after_test scripts + --classifier-model LLM judge model + --credentials PATH Path to credentials file + --list-tests List available tests + --list-agents List registered agents + -v, --verbose Verbose output +``` + +### reporter.py + +``` +python benchmarks/reporter.py [OPTIONS] + +Options: + --summary Generate summary report + --compare-models Compare model performance + --by-test Report grouped by test case (use case) + --detailed Show detailed results + --model TEXT Filter by model + --test-id TEXT Report on specific test + --status TEXT Filter by status + --since TEXT Filter by date (ISO format) + --output, -o PATH Output file (JSON or CSV) + --results-dir PATH Results directory +``` + +### dashboard.py + +```bash +# Launch interactive dashboard +streamlit run benchmarks/dashboard.py + +# Or with custom port +streamlit run benchmarks/dashboard.py --server.port 8501 +``` + +### agent.py + +```bash +# Test an agent directly +python benchmarks/agent.py --agent drdroid --prompt "How many pods?" 
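+
+# Any other registered agent can be exercised the same way, e.g. the no-tools OpenAI agent
+python benchmarks/agent.py --agent openai --prompt "How many pods?"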
+``` + +## Integration with CI/CD + +```bash +#!/bin/bash +# Run benchmarks and fail if pass rate < 80% + +python benchmarks/executor.py --model sonnet4.5 --all + +# Check results +python benchmarks/reporter.py --summary --output results.json + +PASS_RATE=$(jq -r '.summary.pass_rate' results.json | tr -d '%') +if (( $(echo "$PASS_RATE < 80" | bc -l) )); then + echo "Pass rate $PASS_RATE% is below threshold" + exit 1 +fi +``` + +## Requirements + +The dashboard requires Streamlit and Pandas: + +```bash +pip install streamlit pandas +``` diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 0000000000..b48cde3be1 --- /dev/null +++ b/benchmarks/__init__.py @@ -0,0 +1,54 @@ +""" +Benchmark Test Suite + +A comprehensive test suite for evaluating LLM agents against HolmesGPT test cases. + +Modules: + - agent.py: Agent implementations and registry + - executor.py: Test execution engine + - config.py: Credentials and configuration management + - reporter.py: Report generation from results + +Usage: + # Run tests + python benchmarks/executor.py --agent drdroid --test-id 01_how_many_pods + + # Generate reports + python benchmarks/reporter.py --summary + python benchmarks/reporter.py --compare-agents +""" + +from benchmarks.agent import ( + AgentResult, + TestCase, + get_agent, + list_agents, + register_agent, +) +from benchmarks.config import ( + Credentials, + load_credentials, +) +from benchmarks.executor import ( + BenchmarkExecutor, + TestResult, + discover_tests, + load_test_case, +) + +__all__ = [ + # Agent + "AgentResult", + "TestCase", + "get_agent", + "list_agents", + "register_agent", + # Config + "Credentials", + "load_credentials", + # Executor + "BenchmarkExecutor", + "TestResult", + "discover_tests", + "load_test_case", +] diff --git a/benchmarks/agent.py b/benchmarks/agent.py new file mode 100644 index 0000000000..218aaa1bba --- /dev/null +++ b/benchmarks/agent.py @@ -0,0 +1,688 @@ +#!/usr/bin/env python3 +""" +Agent Implementations + +Register your agents here. Each agent must be registered with a unique name. +The executor requires an agent type to be specified - there is no default. 
+ +Usage: + python benchmarks/executor.py --agent drdroid --test-id 01_how_many_pods + python benchmarks/executor.py --agent holmes --test-id 01_how_many_pods +""" + +import json +import os +import re +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional + +import requests + +# Add project root to path +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(PROJECT_ROOT)) + + +# ============================================================================= +# Data Models +# ============================================================================= + + +@dataclass +class AgentResult: + """Result returned by an agent.""" + + output: str # The agent's answer (REQUIRED) + tool_calls: List[str] = field(default_factory=list) # Tools called + metadata: Dict[str, Any] = field(default_factory=dict) # Extra data + + +@dataclass +class TestCase: + """Test case input to agents.""" + + id: str # e.g., "01_how_many_pods" + folder: str # Path to test folder + user_prompt: str # The question to answer + expected_output: List[str] # What the judge checks for + before_test: Optional[str] = None # Setup script + after_test: Optional[str] = None # Cleanup script + tags: List[str] = field(default_factory=list) + setup_timeout: int = 300 + + +# ============================================================================= +# Agent Registry +# ============================================================================= + +# Type for agent functions +AgentFunction = Callable[[TestCase], AgentResult] + +# Registry of all available agents +_AGENT_REGISTRY: Dict[str, AgentFunction] = {} + + +def register_agent(name: str): + """Decorator to register an agent.""" + + def decorator(func: AgentFunction) -> AgentFunction: + _AGENT_REGISTRY[name] = func + return func + + return decorator + + +def get_agent(name: str) -> AgentFunction: + """Get an agent by name.""" + if name not in _AGENT_REGISTRY: + available = ", ".join(sorted(_AGENT_REGISTRY.keys())) + raise ValueError( + f"Unknown agent: '{name}'. Available agents: {available}" + ) + return _AGENT_REGISTRY[name] + + +def list_agents() -> List[str]: + """List all registered agent names.""" + return sorted(_AGENT_REGISTRY.keys()) + + +# ============================================================================= +# Agent Implementations +# ============================================================================= + + +@register_agent("drdroid") +def drdroid_agent(test_case: TestCase) -> AgentResult: + """ + DrDroid Investigation Agent via API. + + Environment variables: + DRDROID_API_URL: API endpoint (default: http://localhost:8000) + DRDROID_API_KEY: API key for authentication + DRDROID_MODEL: Model override (optional). 
Options: + - "sonnet4.5" - Claude Sonnet 4.5 via Portkey + - "gpt5.2" - GPT 5.2 via Azure Foundry + - "codex" - GPT 5.2 Codex via Azure Foundry + """ + api_url = os.getenv("DRDROID_API_URL", "http://localhost:8000") + api_key = os.getenv("DRDROID_API_KEY") + model = os.getenv("DRDROID_MODEL") # Optional: "sonnet4.5" or "gpt5.2" + + if not api_key: + raise ValueError("DRDROID_API_KEY environment variable is required") + + url = f"{api_url}/api/external/investigate" + + headers = { + "Content-Type": "application/json", + "X-API-Key": api_key, + } + + payload = { + "message": test_case.user_prompt, + "metadata": { + "test_id": test_case.id, + "source": "benchmark", + }, + } + + # Add model override if specified + if model: + payload["model"] = model + + print(f"[drdroid] Calling API: {url}") + print(f"[drdroid] Model: {model or 'default (credits-based)'}") + print(f"[drdroid] Prompt: {test_case.user_prompt[:100]}...") + + try: + response = requests.post(url, json=payload, headers=headers, timeout=300) + response.raise_for_status() + result = response.json() + + output = result.get("response", "") + investigation_id = result.get("investigation_id") + total_tokens = result.get("total_tokens", 0) + model_used = result.get("model") + credits_used = result.get("credits_used") + + print(f"[drdroid] Investigation ID: {investigation_id}") + print(f"[drdroid] Model: {model_used}") + print(f"[drdroid] Total tokens: {total_tokens}") + print(f"[drdroid] Credits used: {credits_used}") + print(f"[drdroid] Output: {output[:200]}...") + + return AgentResult( + output=output, + metadata={ + "investigation_id": investigation_id, + "status": result.get("status"), + "total_tokens": total_tokens, + "model": model_used, + "credits_used": credits_used, + }, + ) + except requests.exceptions.RequestException as e: + print(f"[drdroid] Error: {str(e)}") + return AgentResult( + output=f"Error calling DrDroid API: {str(e)}", + metadata={"error": str(e)}, + ) + + +@register_agent("holmes") +def holmes_agent(test_case: TestCase) -> AgentResult: + """ + HolmesGPT Agent using ToolCallingLLM. 
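+    Builds a ToolExecutor from HolmesGPT's built-in toolsets and runs ToolCallingLLM
+    with max_steps=20, so the model can call tools while answering the prompt.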
+ + Environment variables: + OPENAI_API_KEY: OpenAI API key + MODEL: Model to use (default: gpt-4.1) + """ + from holmes.config import Config + from holmes.core.llm import DefaultLLM + from holmes.core.tool_calling_llm import ToolCallingLLM + from holmes.core.tools import ToolExecutor + from holmes.core.toolset_manager import ToolsetManager + + model = os.getenv("MODEL", "gpt-4.1") + + print(f"[holmes] Using model: {model}") + print(f"[holmes] Prompt: {test_case.user_prompt[:100]}...") + + config = Config() + llm = DefaultLLM(model=model) + + toolset_manager = ToolsetManager(config=config) + toolsets = toolset_manager.load_builtin_toolsets() + toolset_manager.add_toolsets(toolsets) + tool_executor = ToolExecutor(toolset_manager.get_enabled_toolsets()) + + ai = ToolCallingLLM(llm=llm, tool_executor=tool_executor, max_steps=20) + + messages = [ + {"role": "system", "content": "You are a Kubernetes troubleshooting assistant."}, + {"role": "user", "content": test_case.user_prompt}, + ] + + result = ai.messages_call(messages=messages) + + tool_calls = [] + if result.tool_calls: + tool_calls = [tc.description for tc in result.tool_calls] + + print(f"[holmes] Tool calls: {len(tool_calls)}") + print(f"[holmes] Output: {(result.result or '')[:200]}...") + + return AgentResult( + output=result.result or "", + tool_calls=tool_calls, + metadata={ + "num_llm_calls": result.num_llm_calls, + "total_tokens": result.total_tokens, + "prompt_tokens": result.prompt_tokens, + "completion_tokens": result.completion_tokens, + "cost": result.cost, + "model": model, + }, + ) + + +@register_agent("openai") +def openai_agent(test_case: TestCase) -> AgentResult: + """ + Simple OpenAI chat completion (no tools). + + Environment variables: + OPENAI_API_KEY: OpenAI API key + MODEL: Model to use (default: gpt-4.1) + """ + import openai + + model = os.getenv("MODEL", "gpt-4.1") + + print(f"[openai] Using model: {model}") + print(f"[openai] Prompt: {test_case.user_prompt[:100]}...") + + client = openai.OpenAI() + + response = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": test_case.user_prompt}], + ) + + output = response.choices[0].message.content or "" + + print(f"[openai] Output: {output[:200]}...") + + return AgentResult( + output=output, + metadata={ + "model": model, + "total_tokens": response.usage.total_tokens if response.usage else 0, + "prompt_tokens": response.usage.prompt_tokens if response.usage else 0, + "completion_tokens": response.usage.completion_tokens if response.usage else 0, + }, + ) + + +def _fetch_investigation_prompt(investigation_id: str) -> dict: + """Fetch prompt details from Investigation API. 
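+    Reads INVESTIGATION_API_URL and INVESTIGATION_API_KEY from the environment and
+    raises ValueError when the API key is missing.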
+ + Returns dict with: + - prompt: The investigation prompt + - context: Additional context (optional) + - metadata: Any metadata from the API + """ + api_url = os.getenv("INVESTIGATION_API_URL", "https://api.drdroid.io") + api_key = os.getenv("INVESTIGATION_API_KEY", "") + + if not api_key: + raise ValueError( + "INVESTIGATION_API_KEY environment variable required when using investigation_id" + ) + + url = f"{api_url}/api/investigations/{investigation_id}/prompt" + + print(f"[claudecode] Fetching prompt from Investigation API...") + print(f"[claudecode] URL: {url}") + + headers = { + "Content-Type": "application/json", + "X-API-Key": api_key, + } + + try: + response = requests.get(url, headers=headers, timeout=30) + response.raise_for_status() + data = response.json() + + print(f"[claudecode] Status: {response.status_code} OK") + print(f"[claudecode] Prompt length: {len(data.get('prompt', ''))} chars") + + return { + "prompt": data.get("prompt", ""), + "context": data.get("context", ""), + "metadata": data.get("metadata", {}), + } + except requests.exceptions.RequestException as e: + print(f"[claudecode] Error fetching prompt: {e}") + raise ValueError(f"Failed to fetch investigation prompt: {e}") + + +@register_agent("claudecode") +def claudecode_agent(test_case: TestCase) -> AgentResult: + """ + Claude Code Agent - runs prompts through local Claude Code CLI. + + This agent invokes Claude Code in non-interactive mode to investigate + issues using kubectl commands on the current cluster context. + + Environment variables: + CLAUDE_CODE_PATH: Path to claude CLI (optional, auto-detected) + CLAUDE_MODEL: Model to use (optional, uses Claude Code default) + CLAUDE_MCP_CONFIG: Path to MCP servers config JSON file + INVESTIGATION_ID: Investigation ID to fetch prompt from API (optional) + INVESTIGATION_API_URL: Base URL for investigation API + INVESTIGATION_API_KEY: API key for investigation API + + Requirements: + - Claude Code CLI installed and authenticated (`claude` command available) + - kubectl configured with appropriate cluster context + - MCP servers configured (optional, for additional tool access) + """ + import shutil + import subprocess + import time + from pathlib import Path + + print(f"[claudecode] {'=' * 60}") + print(f"[claudecode] CLAUDE CODE AGENT STARTING") + print(f"[claudecode] {'=' * 60}") + + # Check if claude CLI is available + # First check environment variable, then PATH, then common locations + claude_path = os.getenv("CLAUDE_CODE_PATH") + + if not claude_path: + claude_path = shutil.which("claude") + + if not claude_path: + # Check common installation locations + common_paths = [ + Path.home() / ".local" / "bin" / "claude", + Path("/usr/local/bin/claude"), + Path.home() / "bin" / "claude", + ] + for path in common_paths: + if path.exists() and path.is_file(): + claude_path = str(path) + break + + if not claude_path: + raise ValueError( + "Claude Code CLI not found. Checked PATH and ~/.local/bin/claude. " + "Set CLAUDE_CODE_PATH environment variable or ensure 'claude' is in PATH. 
" + "Install from: https://docs.anthropic.com/en/docs/claude-code" + ) + + model = os.getenv("CLAUDE_MODEL", "") # Empty means use Claude Code default + mcp_config = os.getenv("CLAUDE_MCP_CONFIG", "") # Path to MCP servers config + investigation_id = os.getenv("INVESTIGATION_ID", "") # Optional investigation ID + + print(f"[claudecode] Configuration:") + print(f"[claudecode] CLI Path: {claude_path}") + print(f"[claudecode] Model: {model or 'default'}") + print(f"[claudecode] MCP Config: {mcp_config or 'none'}") + print(f"[claudecode] Investigation ID: {investigation_id or 'none'}") + + # Determine the prompt to use + investigation_metadata = {} + if investigation_id: + # Fetch prompt from Investigation API + print(f"[claudecode] {'─' * 60}") + print(f"[claudecode] FETCHING INVESTIGATION PROMPT") + inv_data = _fetch_investigation_prompt(investigation_id) + user_prompt = inv_data["prompt"] + investigation_metadata = inv_data.get("metadata", {}) + + if inv_data.get("context"): + print(f"[claudecode] Additional context: {len(inv_data['context'])} chars") + user_prompt = f"{inv_data['context']}\n\n{user_prompt}" + + print(f"[claudecode] Final prompt: {user_prompt[:150]}...") + else: + # Use prompt from test case + user_prompt = test_case.user_prompt + print(f"[claudecode] Using test case prompt: {user_prompt[:100]}...") + + # Build the prompt with read-only kubectl restrictions + system_instructions = """You are a Kubernetes troubleshooting assistant. + +IMPORTANT RESTRICTIONS: +- You may ONLY use kubectl commands that are READ-ONLY +- ALLOWED kubectl commands: get, describe, logs, top, explain, api-resources, api-versions, cluster-info, config view, config get-contexts +- FORBIDDEN kubectl commands: apply, create, delete, edit, patch, replace, scale, rollout, exec, cp, port-forward, run, set, label, annotate, taint, cordon, uncordon, drain +- If you need to run a forbidden command, explain what you would do instead of running it +- Focus on gathering information and diagnosing issues, not making changes + +Investigate the following and provide your findings:""" + + full_prompt = f"{system_instructions}\n\n{user_prompt}" + + # Build claude command + # Using --print (-p) for non-interactive mode that prints the result + # Using --verbose to see tool calls and commands + # Using --output-format json to get structured output with token counts + cmd = [claude_path, "-p", full_prompt, "--verbose", "--output-format", "json"] + + # Add model override if specified + if model: + cmd.extend(["--model", model]) + + # Add MCP config if specified + if mcp_config: + mcp_config_path = Path(mcp_config) + # If relative path, resolve from benchmarks directory + if not mcp_config_path.is_absolute(): + mcp_config_path = Path(__file__).parent / mcp_config + if mcp_config_path.exists(): + cmd.extend(["--mcp-config", str(mcp_config_path)]) + print(f"[claudecode] MCP config loaded: {mcp_config_path}") + + # Log MCP servers being used + try: + with open(mcp_config_path, "r") as f: + mcp_data = json.load(f) + servers = mcp_data.get("mcpServers", {}) + print(f"[claudecode] MCP servers: {', '.join(servers.keys())}") + except Exception as e: + print(f"[claudecode] Warning: Could not parse MCP config: {e}") + else: + print(f"[claudecode] Warning: MCP config not found: {mcp_config_path}") + + # Add dangerously skip permissions to avoid interactive prompts + # This is safe because we're restricting to read-only in the prompt + cmd.append("--dangerously-skip-permissions") + + print(f"[claudecode] Running Claude Code...") + 
print(f"[claudecode] Command: {' '.join(cmd[:3])}... (prompt truncated)") + start_time = time.time() + + try: + # Use Popen to stream output in real-time + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, # Merge stderr into stdout + text=True, + cwd=test_case.folder, + bufsize=1, # Line buffered + ) + + output_lines = [] + tool_calls = [] + token_info = { + "total_tokens": 0, + "prompt_tokens": 0, + "completion_tokens": 0, + "cost": 0.0, + } + + print(f"[claudecode] {'─' * 60}") + + # Stream output line by line + while True: + line = process.stdout.readline() + if not line and process.poll() is not None: + break + if line: + line_stripped = line.rstrip() + print(f" │ {line_stripped}") + output_lines.append(line) + + # Try to detect tool calls from output + # Claude Code typically shows tool usage like "Running: kubectl ..." or "Tool: ..." + line_lower = line_stripped.lower() + if any(indicator in line_lower for indicator in [ + "running:", "executing:", "tool:", "$ kubectl", "bash:", + "> kubectl", "command:", "running command" + ]): + tool_calls.append(line_stripped) + + # Try to extract token information from output + # Look for patterns like "tokens: 1234", "total_tokens", "input/output tokens" + + # Pattern: "X tokens" or "tokens: X" or "total_tokens: X" + token_patterns = [ + r'total[_\s]?tokens[:\s]+(\d+)', + r'input[_\s]?tokens[:\s]+(\d+)', + r'output[_\s]?tokens[:\s]+(\d+)', + r'prompt[_\s]?tokens[:\s]+(\d+)', + r'completion[_\s]?tokens[:\s]+(\d+)', + r'(\d+)\s+tokens?\s+used', + r'tokens?\s+used[:\s]+(\d+)', + r'cost[:\s]+\$?([\d.]+)', + ] + + for pattern in token_patterns: + match = re.search(pattern, line_lower) + if match: + value = match.group(1) + if 'total' in pattern: + token_info["total_tokens"] = int(value) + elif 'input' in pattern or 'prompt' in pattern: + token_info["prompt_tokens"] = int(value) + elif 'output' in pattern or 'completion' in pattern: + token_info["completion_tokens"] = int(value) + elif 'cost' in pattern: + token_info["cost"] = float(value) + + # Check timeout + if time.time() - start_time > 300: + process.kill() + process.wait() + print(f"[claudecode] {'─' * 60}") + print(f"[claudecode] Timeout after 300s") + return AgentResult( + output="Claude Code timed out after 300 seconds", + metadata={"error": "timeout", "timeout": 300}, + ) + + process.wait() + elapsed = time.time() - start_time + + print(f"[claudecode] {'─' * 60}") + print(f"[claudecode] Completed in {elapsed:.2f}s") + print(f"[claudecode] Tool calls detected: {len(tool_calls)}") + + if tool_calls: + print(f"[claudecode] Tools used:") + for tc in tool_calls[:10]: # Show first 10 + print(f" • {tc[:100]}") + if len(tool_calls) > 10: + print(f" ... 
and {len(tool_calls) - 10} more") + + raw_output = "".join(output_lines).strip() + + # Try to parse JSON output for structured data including tokens + output = raw_output + try: + # Look for JSON in the output (might be at the end or the whole output) + json_match = re.search(r'\{[^{}]*"result"[^{}]*\}|\{[^{}]*"output"[^{}]*\}', raw_output, re.DOTALL) + if json_match: + json_data = json.loads(json_match.group()) + # Extract result/output from JSON + output = json_data.get("result") or json_data.get("output") or raw_output + # Extract token info from JSON + if "usage" in json_data: + usage = json_data["usage"] + token_info["total_tokens"] = usage.get("total_tokens", 0) + token_info["prompt_tokens"] = usage.get("prompt_tokens") or usage.get("input_tokens", 0) + token_info["completion_tokens"] = usage.get("completion_tokens") or usage.get("output_tokens", 0) + if "cost" in json_data: + token_info["cost"] = json_data["cost"] + if "total_tokens" in json_data: + token_info["total_tokens"] = json_data["total_tokens"] + except (json.JSONDecodeError, AttributeError): + pass # Not JSON or parsing failed, use raw output + + # Print token info + if token_info["total_tokens"] > 0: + print(f"[claudecode] Tokens: {token_info['total_tokens']} total " + f"({token_info['prompt_tokens']} prompt, {token_info['completion_tokens']} completion)") + if token_info["cost"] > 0: + print(f"[claudecode] Cost: ${token_info['cost']:.4f}") + + print(f"[claudecode] {'=' * 60}") + print(f"[claudecode] CLAUDE CODE AGENT COMPLETED") + print(f"[claudecode] {'=' * 60}") + + if process.returncode != 0: + print(f"[claudecode] Error: exit code {process.returncode}") + return AgentResult( + output=f"Claude Code error (exit {process.returncode}): {output}", + tool_calls=tool_calls, + metadata={ + "error": f"exit code {process.returncode}", + "return_code": process.returncode, + "elapsed_time": elapsed, + "total_tokens": token_info["total_tokens"], + "prompt_tokens": token_info["prompt_tokens"], + "completion_tokens": token_info["completion_tokens"], + "cost": token_info["cost"], + "investigation_id": investigation_id or None, + "investigation_metadata": investigation_metadata, + "mcp_config": mcp_config or None, + }, + ) + + return AgentResult( + output=output, + tool_calls=tool_calls, + metadata={ + "model": model or "claude-code-default", + "elapsed_time": elapsed, + "return_code": process.returncode, + "num_tool_calls": len(tool_calls), + "total_tokens": token_info["total_tokens"], + "prompt_tokens": token_info["prompt_tokens"], + "completion_tokens": token_info["completion_tokens"], + "cost": token_info["cost"], + "investigation_id": investigation_id or None, + "investigation_metadata": investigation_metadata, + "mcp_config": mcp_config or None, + }, + ) + + except subprocess.TimeoutExpired: + print(f"[claudecode] Timeout after 300s") + return AgentResult( + output="Claude Code timed out after 300 seconds", + metadata={"error": "timeout", "timeout": 300}, + ) + except Exception as e: + print(f"[claudecode] Exception: {str(e)}") + return AgentResult( + output=f"Claude Code exception: {str(e)}", + metadata={"error": str(e)}, + ) + + +# ============================================================================= +# Add your custom agents below +# ============================================================================= + + +# @register_agent("my_agent") +# def my_custom_agent(test_case: TestCase) -> AgentResult: +# """Your custom agent implementation.""" +# # Your code here +# return AgentResult(output="response") + + +# 
============================================================================= +# CLI for testing agents directly +# ============================================================================= + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Test an agent directly") + parser.add_argument( + "--agent", + required=True, + choices=list_agents(), + help="Agent to test", + ) + parser.add_argument( + "--prompt", + default="How many pods are in the app-01 namespace?", + help="Test prompt", + ) + + args = parser.parse_args() + + print(f"Testing agent: {args.agent}") + print(f"Prompt: {args.prompt}") + print("-" * 50) + + test = TestCase( + id="cli-test", + folder=".", + user_prompt=args.prompt, + expected_output=["test"], + ) + + try: + agent_fn = get_agent(args.agent) + result = agent_fn(test) + print("\n" + "=" * 50) + print("RESULT:") + print(f"Output: {result.output}") + print(f"Tool calls: {result.tool_calls}") + print(f"Metadata: {result.metadata}") + except Exception as e: + print(f"Error: {e}") diff --git a/benchmarks/config.py b/benchmarks/config.py new file mode 100644 index 0000000000..a346e9c1b2 --- /dev/null +++ b/benchmarks/config.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +""" +Configuration and Credentials Management + +Loads credentials from a unified YAML file for all infrastructure and monitoring tools. +""" + +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Optional + +import yaml + +BENCHMARKS_DIR = Path(__file__).parent +PROJECT_ROOT = BENCHMARKS_DIR.parent +CONFIG_DIR = BENCHMARKS_DIR / "config" +RESULTS_DIR = BENCHMARKS_DIR / "results" +FIXTURES_DIR = PROJECT_ROOT / "tests" / "llm" / "fixtures" / "test_ask_holmes" + +# Default credentials file location +DEFAULT_CREDENTIALS_FILE = CONFIG_DIR / "credentials.yaml" + + +@dataclass +class KubernetesCredentials: + """Kubernetes cluster credentials.""" + + kubeconfig: Optional[str] = None + context: Optional[str] = None + namespace: Optional[str] = None + + +@dataclass +class DatadogCredentials: + """Datadog credentials.""" + + api_key: Optional[str] = None + app_key: Optional[str] = None + site: str = "datadoghq.com" + + +@dataclass +class NewRelicCredentials: + """New Relic credentials.""" + + api_key: Optional[str] = None + account_id: Optional[str] = None + region: str = "US" + + +@dataclass +class PrometheusCredentials: + """Prometheus credentials.""" + + url: Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None + + +@dataclass +class GrafanaCredentials: + """Grafana credentials.""" + + url: Optional[str] = None + api_key: Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None + + +@dataclass +class LokiCredentials: + """Loki credentials.""" + + url: Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None + + +@dataclass +class ElasticsearchCredentials: + """Elasticsearch/OpenSearch credentials.""" + + url: Optional[str] = None + api_key: Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None + + +@dataclass +class OpenAICredentials: + """OpenAI/LLM credentials.""" + + api_key: Optional[str] = None + org_id: Optional[str] = None + base_url: Optional[str] = None + + +@dataclass +class JudgeConfig: + """LLM Judge configuration.""" + + model: str = "gpt-4.1" + api_key: Optional[str] = None + + +@dataclass +class Credentials: + """All credentials for benchmark testing.""" + + kubernetes: 
KubernetesCredentials = field(default_factory=KubernetesCredentials) + datadog: DatadogCredentials = field(default_factory=DatadogCredentials) + newrelic: NewRelicCredentials = field(default_factory=NewRelicCredentials) + prometheus: PrometheusCredentials = field(default_factory=PrometheusCredentials) + grafana: GrafanaCredentials = field(default_factory=GrafanaCredentials) + loki: LokiCredentials = field(default_factory=LokiCredentials) + elasticsearch: ElasticsearchCredentials = field(default_factory=ElasticsearchCredentials) + openai: OpenAICredentials = field(default_factory=OpenAICredentials) + judge: JudgeConfig = field(default_factory=JudgeConfig) + custom: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_yaml(cls, path: Path) -> "Credentials": + """Load credentials from YAML file.""" + if not path.exists(): + return cls() + + with open(path, "r") as f: + data = yaml.safe_load(f) or {} + + return cls( + kubernetes=KubernetesCredentials(**data.get("kubernetes", {})), + datadog=DatadogCredentials(**data.get("datadog", {})), + newrelic=NewRelicCredentials(**data.get("newrelic", {})), + prometheus=PrometheusCredentials(**data.get("prometheus", {})), + grafana=GrafanaCredentials(**data.get("grafana", {})), + loki=LokiCredentials(**data.get("loki", {})), + elasticsearch=ElasticsearchCredentials(**data.get("elasticsearch", {})), + openai=OpenAICredentials(**data.get("openai", {})), + judge=JudgeConfig(**data.get("judge", {})), + custom=data.get("custom", {}), + ) + + def to_env_vars(self) -> Dict[str, str]: + """Convert credentials to environment variables.""" + env_vars = {} + + # Kubernetes + if self.kubernetes.kubeconfig: + env_vars["KUBECONFIG"] = self.kubernetes.kubeconfig + if self.kubernetes.context: + env_vars["KUBE_CONTEXT"] = self.kubernetes.context + + # Datadog + if self.datadog.api_key: + env_vars["DD_API_KEY"] = self.datadog.api_key + if self.datadog.app_key: + env_vars["DD_APP_KEY"] = self.datadog.app_key + if self.datadog.site: + env_vars["DD_SITE"] = self.datadog.site + + # New Relic + if self.newrelic.api_key: + env_vars["NEW_RELIC_API_KEY"] = self.newrelic.api_key + if self.newrelic.account_id: + env_vars["NEW_RELIC_ACCOUNT_ID"] = self.newrelic.account_id + + # Prometheus + if self.prometheus.url: + env_vars["PROMETHEUS_URL"] = self.prometheus.url + if self.prometheus.username: + env_vars["PROMETHEUS_USERNAME"] = self.prometheus.username + if self.prometheus.password: + env_vars["PROMETHEUS_PASSWORD"] = self.prometheus.password + + # Grafana + if self.grafana.url: + env_vars["GRAFANA_URL"] = self.grafana.url + if self.grafana.api_key: + env_vars["GRAFANA_API_KEY"] = self.grafana.api_key + + # Loki + if self.loki.url: + env_vars["LOKI_URL"] = self.loki.url + + # Elasticsearch + if self.elasticsearch.url: + env_vars["ELASTICSEARCH_URL"] = self.elasticsearch.url + if self.elasticsearch.api_key: + env_vars["ELASTICSEARCH_API_KEY"] = self.elasticsearch.api_key + + # OpenAI + if self.openai.api_key: + env_vars["OPENAI_API_KEY"] = self.openai.api_key + if self.openai.org_id: + env_vars["OPENAI_ORG_ID"] = self.openai.org_id + if self.openai.base_url: + env_vars["OPENAI_BASE_URL"] = self.openai.base_url + + return env_vars + + def apply_to_env(self) -> None: + """Apply credentials to current environment.""" + for key, value in self.to_env_vars().items(): + os.environ[key] = value + + +def load_credentials(path: Optional[Path] = None) -> Credentials: + """Load credentials from file or environment.""" + if path is None: + path = DEFAULT_CREDENTIALS_FILE + + 
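+    # Credentials.from_yaml tolerates a missing credentials file and falls back to
+    # default (empty) values; the environment variable overrides below then take precedence.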
# Load from file if exists + credentials = Credentials.from_yaml(path) + + # Override with environment variables + if os.environ.get("OPENAI_API_KEY"): + credentials.openai.api_key = os.environ["OPENAI_API_KEY"] + if os.environ.get("CLASSIFIER_MODEL"): + credentials.judge.model = os.environ["CLASSIFIER_MODEL"] + + return credentials + + +def ensure_directories(): + """Ensure required directories exist.""" + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + RESULTS_DIR.mkdir(parents=True, exist_ok=True) diff --git a/benchmarks/config/.gitkeep b/benchmarks/config/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/config/credentials.yaml.template b/benchmarks/config/credentials.yaml.template new file mode 100644 index 0000000000..6af33fedec --- /dev/null +++ b/benchmarks/config/credentials.yaml.template @@ -0,0 +1,78 @@ +# Benchmark Credentials Configuration +# Copy this file to credentials.yaml and fill in your values +# Environment variables can override these values + +# Kubernetes Configuration +kubernetes: + kubeconfig: ~/.kube/config # Path to kubeconfig file + context: null # Kubernetes context to use (null = current) + namespace: default # Default namespace + +# Datadog +datadog: + api_key: null + app_key: null + site: datadoghq.com # or datadoghq.eu, us3.datadoghq.com, etc. + +# New Relic +newrelic: + api_key: null + account_id: null + region: US # or EU + +# Prometheus +prometheus: + url: http://localhost:9090 + username: null + password: null + +# Grafana +grafana: + url: http://localhost:3000 + api_key: null + username: admin + password: null + +# Loki +loki: + url: http://localhost:3100 + username: null + password: null + +# Elasticsearch / OpenSearch +elasticsearch: + url: http://localhost:9200 + api_key: null + username: null + password: null + +# OpenAI / LLM Provider +openai: + api_key: null # Or set OPENAI_API_KEY env var + org_id: null + base_url: null # For custom endpoints + +# LLM Judge Configuration +judge: + model: gpt-4.1 # Model used for evaluating test results + api_key: null # If different from openai.api_key + +# Investigation API (for fetching prompts by investigation ID) +investigation_api: + url: https://api.drdroid.io # Base URL for investigation API + api_key: null # API key for authentication + # Or set INVESTIGATION_API_URL and INVESTIGATION_API_KEY env vars + +# Claude Code Configuration +claudecode: + mcp_config: config/mcp_servers.json # Path to MCP servers config (relative to benchmarks/) + # Or set CLAUDE_MCP_CONFIG env var + +# Custom credentials for additional integrations +custom: + drdroid: + api_url: http://localhost:8000 + api_key: null # your-api-key + # splunk: + # url: https://splunk.example.com + # token: your-token diff --git a/benchmarks/config/mcp_servers.json.template b/benchmarks/config/mcp_servers.json.template new file mode 100644 index 0000000000..b81af7c759 --- /dev/null +++ b/benchmarks/config/mcp_servers.json.template @@ -0,0 +1,42 @@ +{ + "mcpServers": { + "kubernetes": { + "command": "npx", + "args": ["-y", "@anthropic/mcp-server-kubernetes"], + "env": { + "KUBECONFIG": "~/.kube/config" + } + }, + "prometheus": { + "command": "npx", + "args": ["-y", "@anthropic/mcp-server-prometheus"], + "env": { + "PROMETHEUS_URL": "http://localhost:9090" + } + }, + "grafana": { + "command": "npx", + "args": ["-y", "@anthropic/mcp-server-grafana"], + "env": { + "GRAFANA_URL": "http://localhost:3000", + "GRAFANA_API_KEY": "your-grafana-api-key" + } + }, + "datadog": { + "command": "npx", + "args": ["-y", 
"@anthropic/mcp-server-datadog"], + "env": { + "DD_API_KEY": "your-datadog-api-key", + "DD_APP_KEY": "your-datadog-app-key" + } + }, + "elasticsearch": { + "command": "npx", + "args": ["-y", "@anthropic/mcp-server-elasticsearch"], + "env": { + "ELASTICSEARCH_URL": "http://localhost:9200", + "ELASTICSEARCH_API_KEY": "your-elasticsearch-api-key" + } + } + } +} diff --git a/benchmarks/dashboard.py b/benchmarks/dashboard.py new file mode 100644 index 0000000000..7c40778194 --- /dev/null +++ b/benchmarks/dashboard.py @@ -0,0 +1,652 @@ +#!/usr/bin/env python3 +""" +Benchmark Results Dashboard + +A Streamlit app to visualize benchmark test results with use-case x model level reporting. + +Usage: + streamlit run benchmarks/dashboard.py +""" + +import json +from collections import defaultdict +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pandas as pd +import streamlit as st + +# Results directory +RESULTS_DIR = Path(__file__).parent / "results" + + +# ============================================================================= +# Data Loading +# ============================================================================= + + +@st.cache_data(ttl=30) # Cache for 30 seconds, then refresh +def load_all_results() -> List[Dict[str, Any]]: + """Load all results from JSON files.""" + results = [] + + if not RESULTS_DIR.exists(): + return results + + for filepath in RESULTS_DIR.glob("*.json"): + try: + with open(filepath, "r") as f: + data = json.load(f) + data["_filepath"] = str(filepath) + results.append(data) + except Exception as e: + st.warning(f"Failed to load {filepath.name}: {e}") + + return sorted(results, key=lambda r: r.get("started_at", ""), reverse=True) + + +def extract_metrics(result: Dict[str, Any]) -> Dict[str, Any]: + """Extract key metrics from a result.""" + metadata = result.get("agent_metadata", {}) + + # Build investigation URL if investigation_id exists + investigation_id = metadata.get("investigation_id") + investigation_url = None + if investigation_id: + investigation_url = f"https://aiops.drdroid.io/investigations/{investigation_id}" + + return { + "test_id": result.get("test_id", "unknown"), + "model": result.get("model", "unknown"), + "agent": result.get("agent", "unknown"), + "status": result.get("status", "unknown"), + "score": result.get("score"), + "total_time": result.get("total_time", 0), + "setup_time": result.get("setup_time", 0), + "agent_time": result.get("agent_time", 0), + "judge_time": result.get("judge_time", 0), + "cleanup_time": result.get("cleanup_time", 0), + "cost": metadata.get("cost", 0) or 0, + "total_tokens": metadata.get("total_tokens", 0) or 0, + "prompt_tokens": metadata.get("prompt_tokens", 0) or 0, + "completion_tokens": metadata.get("completion_tokens", 0) or 0, + "num_llm_calls": metadata.get("num_llm_calls", 0) or 0, + "tags": result.get("tags", []), + "started_at": result.get("started_at", ""), + "run_id": result.get("run_id", ""), + "user_prompt": result.get("user_prompt", ""), + "actual_output": result.get("actual_output", ""), + "expected_output": result.get("expected_output", []), + "judge_rationale": result.get("judge_rationale", ""), + "error_message": result.get("error_message", ""), + "investigation_id": investigation_id, + "investigation_url": investigation_url, + } + + +def build_dataframe(results: List[Dict[str, Any]]) -> pd.DataFrame: + """Build a pandas DataFrame from results.""" + if not results: + return pd.DataFrame() + + metrics = [extract_metrics(r) for r in 
results] + df = pd.DataFrame(metrics) + + # Convert started_at to datetime + if "started_at" in df.columns: + df["started_at"] = pd.to_datetime(df["started_at"], errors="coerce") + + return df + + +# ============================================================================= +# Aggregation Functions +# ============================================================================= + + +def aggregate_by_model(df: pd.DataFrame) -> pd.DataFrame: + """Aggregate metrics by model.""" + if df.empty: + return pd.DataFrame() + + agg = df.groupby("model").agg( + total_runs=("test_id", "count"), + passed=("status", lambda x: (x == "passed").sum()), + failed=("status", lambda x: (x == "failed").sum()), + setup_failed=("status", lambda x: (x == "setup_failed").sum()), + errors=("status", lambda x: (x == "error").sum()), + avg_total_time=("total_time", "mean"), + avg_agent_time=("agent_time", "mean"), + total_cost=("cost", "sum"), + avg_cost=("cost", "mean"), + total_tokens=("total_tokens", "sum"), + avg_tokens=("total_tokens", "mean"), + unique_tests=("test_id", "nunique"), + ).reset_index() + + agg["pass_rate"] = (agg["passed"] / agg["total_runs"] * 100).round(1) + + return agg.sort_values("pass_rate", ascending=False) + + +def aggregate_by_test(df: pd.DataFrame) -> pd.DataFrame: + """Aggregate metrics by test case.""" + if df.empty: + return pd.DataFrame() + + agg = df.groupby("test_id").agg( + total_runs=("model", "count"), + passed=("status", lambda x: (x == "passed").sum()), + failed=("status", lambda x: (x == "failed").sum()), + avg_total_time=("total_time", "mean"), + avg_agent_time=("agent_time", "mean"), + total_cost=("cost", "sum"), + avg_cost=("cost", "mean"), + total_tokens=("total_tokens", "sum"), + models_tested=("model", "nunique"), + ).reset_index() + + agg["pass_rate"] = (agg["passed"] / agg["total_runs"] * 100).round(1) + + return agg.sort_values("test_id") + + +def create_pivot_table(df: pd.DataFrame, value_col: str, aggfunc: str = "mean") -> pd.DataFrame: + """Create a pivot table with test_id as rows and model as columns.""" + if df.empty: + return pd.DataFrame() + + pivot = pd.pivot_table( + df, + values=value_col, + index="test_id", + columns="model", + aggfunc=aggfunc, + fill_value=0, + ) + + return pivot + + +def create_status_pivot(df: pd.DataFrame) -> pd.DataFrame: + """Create a pivot table showing pass/fail status per test x model.""" + if df.empty: + return pd.DataFrame() + + # Get latest result for each test_id x model combination + latest = df.sort_values("started_at", ascending=False).drop_duplicates( + subset=["test_id", "model"] + ) + + # Create status mapping with icons + status_map = { + "passed": "✅", + "failed": "❌", + "setup_failed": "🔧", + "error": "⚠️", + } + latest["status_icon"] = latest["status"].map(status_map).fillna("?") + + pivot = pd.pivot_table( + latest, + values="status_icon", + index="test_id", + columns="model", + aggfunc="first", + fill_value="-", + ) + + return pivot + + +def get_latest_results_matrix(df: pd.DataFrame) -> Dict[str, Dict[str, Dict[str, Any]]]: + """Get latest results organized as test_id -> model -> result_data. + + Returns a nested dict for building an interactive status matrix. 
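+    Only the most recent run for each (test_id, model) pair is kept.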
+ """ + if df.empty: + return {} + + # Get latest result for each test_id x model combination + latest = df.sort_values("started_at", ascending=False).drop_duplicates( + subset=["test_id", "model"] + ) + + # Status mapping with icons + status_map = { + "passed": "✅", + "failed": "❌", + "setup_failed": "🔧", + "error": "⚠️", + } + + # Build nested dict + matrix: Dict[str, Dict[str, Dict[str, Any]]] = {} + for _, row in latest.iterrows(): + test_id = row["test_id"] + model = row["model"] + + if test_id not in matrix: + matrix[test_id] = {} + + matrix[test_id][model] = { + "status": row["status"], + "status_icon": status_map.get(row["status"], "?"), + "score": row["score"], + "judge_rationale": row["judge_rationale"] or "No rationale available", + "agent_time": row["agent_time"], + "cost": row["cost"], + "total_tokens": row["total_tokens"], + "error_message": row["error_message"], + "run_id": row["run_id"], + "investigation_url": row.get("investigation_url"), + } + + return matrix + + +# ============================================================================= +# Streamlit App +# ============================================================================= + + +def main(): + st.set_page_config( + page_title="Benchmark Dashboard", + page_icon="📊", + layout="wide", + ) + + st.title("📊 Benchmark Results Dashboard") + + # Sidebar + with st.sidebar: + st.header("Controls") + + if st.button("🔄 Refresh Data"): + st.cache_data.clear() + st.rerun() + + st.markdown("---") + + # Load data + results = load_all_results() + + if not results: + st.warning("No results found") + st.info(f"Results directory: {RESULTS_DIR}") + return + + df = build_dataframe(results) + + st.metric("Total Results", len(df)) + st.metric("Unique Models", df["model"].nunique()) + st.metric("Unique Tests", df["test_id"].nunique()) + + st.markdown("---") + + # Filters + st.subheader("Filters") + + models = ["All"] + sorted(df["model"].unique().tolist()) + selected_model = st.selectbox("Model", models) + + statuses = ["All"] + sorted(df["status"].unique().tolist()) + selected_status = st.selectbox("Status", statuses) + + # Date filter + if not df["started_at"].isna().all(): + min_date = df["started_at"].min().date() + max_date = df["started_at"].max().date() + date_range = st.date_input( + "Date Range", + value=(min_date, max_date), + min_value=min_date, + max_value=max_date, + ) + else: + date_range = None + + # Apply filters + filtered_df = df.copy() + + if selected_model != "All": + filtered_df = filtered_df[filtered_df["model"] == selected_model] + + if selected_status != "All": + filtered_df = filtered_df[filtered_df["status"] == selected_status] + + if date_range and len(date_range) == 2: + start_date, end_date = date_range + filtered_df = filtered_df[ + (filtered_df["started_at"].dt.date >= start_date) + & (filtered_df["started_at"].dt.date <= end_date) + ] + + # Main content + tab1, tab2, tab3, tab4, tab5 = st.tabs([ + "📈 Overview", + "🤖 Model Comparison", + "📋 Test Cases", + "🔀 Use Case x Model", + "📄 Raw Results", + ]) + + # Tab 1: Overview + with tab1: + st.header("Overview") + + col1, col2, col3, col4 = st.columns(4) + + total = len(filtered_df) + passed = (filtered_df["status"] == "passed").sum() + failed = (filtered_df["status"] == "failed").sum() + pass_rate = (passed / total * 100) if total > 0 else 0 + + col1.metric("Total Runs", total) + col2.metric("Passed", f"{passed} ✅") + col3.metric("Failed", f"{failed} ❌") + col4.metric("Pass Rate", f"{pass_rate:.1f}%") + + col5, col6, col7, col8 = st.columns(4) + + 
col5.metric("Avg Time", f"{filtered_df['total_time'].mean():.1f}s") + col6.metric("Total Cost", f"${filtered_df['cost'].sum():.4f}") + col7.metric("Total Tokens", f"{filtered_df['total_tokens'].sum():,}") + col8.metric("Avg Tokens/Run", f"{filtered_df['total_tokens'].mean():,.0f}") + + st.markdown("---") + + # Status distribution + col1, col2 = st.columns(2) + + with col1: + st.subheader("Status Distribution") + status_counts = filtered_df["status"].value_counts() + st.bar_chart(status_counts) + + with col2: + st.subheader("Runs by Model") + model_counts = filtered_df["model"].value_counts() + st.bar_chart(model_counts) + + # Tab 2: Model Comparison + with tab2: + st.header("Model Comparison") + + model_agg = aggregate_by_model(filtered_df) + + if not model_agg.empty: + # Summary table + st.subheader("Summary by Model") + display_cols = [ + "model", "total_runs", "passed", "failed", "pass_rate", + "avg_agent_time", "total_cost", "avg_cost", "total_tokens", "avg_tokens", + ] + display_df = model_agg[display_cols].copy() + display_df.columns = [ + "Model", "Runs", "Passed", "Failed", "Pass Rate %", + "Avg Agent Time (s)", "Total Cost ($)", "Avg Cost ($)", "Total Tokens", "Avg Tokens", + ] + + # Format numeric columns + display_df["Avg Agent Time (s)"] = display_df["Avg Agent Time (s)"].round(2) + display_df["Total Cost ($)"] = display_df["Total Cost ($)"].round(4) + display_df["Avg Cost ($)"] = display_df["Avg Cost ($)"].round(4) + display_df["Avg Tokens"] = display_df["Avg Tokens"].round(0).astype(int) + + st.dataframe(display_df, use_container_width=True, hide_index=True) + + st.markdown("---") + + # Charts + col1, col2 = st.columns(2) + + with col1: + st.subheader("Pass Rate by Model") + chart_data = model_agg.set_index("model")["pass_rate"] + st.bar_chart(chart_data) + + with col2: + st.subheader("Avg Cost by Model") + chart_data = model_agg.set_index("model")["avg_cost"] + st.bar_chart(chart_data) + + # Tab 3: Test Cases + with tab3: + st.header("Test Case Analysis") + + test_agg = aggregate_by_test(filtered_df) + + if not test_agg.empty: + st.subheader("Summary by Test Case") + display_cols = [ + "test_id", "total_runs", "passed", "failed", "pass_rate", + "models_tested", "avg_agent_time", "avg_cost", + ] + display_df = test_agg[display_cols].copy() + display_df.columns = [ + "Test ID", "Runs", "Passed", "Failed", "Pass Rate %", + "Models", "Avg Time (s)", "Avg Cost ($)", + ] + display_df["Avg Time (s)"] = display_df["Avg Time (s)"].round(2) + display_df["Avg Cost ($)"] = display_df["Avg Cost ($)"].round(4) + + st.dataframe(display_df, use_container_width=True, hide_index=True) + + st.markdown("---") + + # Failing tests + failing_tests = test_agg[test_agg["pass_rate"] < 100].sort_values("pass_rate") + if not failing_tests.empty: + st.subheader("Tests with Failures") + st.dataframe( + failing_tests[["test_id", "total_runs", "passed", "failed", "pass_rate"]], + use_container_width=True, + hide_index=True, + ) + + # Tab 4: Use Case x Model Matrix + with tab4: + st.header("Use Case x Model Matrix") + + st.subheader("Status Matrix (Latest Run)") + st.caption("Click the ℹ️ icon to view judge rationale and details") + + # Get the matrix data for interactive display + results_matrix = get_latest_results_matrix(filtered_df) + models = sorted(filtered_df["model"].unique().tolist()) + test_ids = sorted(results_matrix.keys()) + + if results_matrix and models: + # Build header row + header_cols = st.columns([2] + [1] * len(models)) + header_cols[0].markdown("**Test Case**") + for i, model in 
enumerate(models): + header_cols[i + 1].markdown(f"**{model}**") + + # Build data rows + for test_id in test_ids: + row_cols = st.columns([2] + [1] * len(models)) + row_cols[0].write(test_id) + + for i, model in enumerate(models): + result_data = results_matrix.get(test_id, {}).get(model) + + if result_data: + status_icon = result_data["status_icon"] + + # Create a container with status icon and info popover + with row_cols[i + 1]: + col_status, col_info = st.columns([1, 1]) + col_status.write(status_icon) + + with col_info.popover("ℹ️"): + st.markdown(f"**Test:** {test_id}") + st.markdown(f"**Model:** {model}") + st.markdown(f"**Status:** {result_data['status']} {status_icon}") + st.markdown(f"**Score:** {result_data['score']}") + + st.markdown("---") + st.markdown("**Timing & Cost:**") + st.write(f"Agent Time: {result_data['agent_time']:.2f}s") + st.write(f"Cost: ${result_data['cost']:.4f}") + st.write(f"Tokens: {result_data['total_tokens']:,}") + + st.markdown("---") + st.markdown("**Judge Rationale:**") + st.write(result_data["judge_rationale"]) + + if result_data.get("error_message"): + st.markdown("---") + st.error(f"Error: {result_data['error_message']}") + + if result_data.get("investigation_url"): + st.markdown("---") + st.markdown( + f"[Open Investigation]({result_data['investigation_url']})" + ) + else: + row_cols[i + 1].write("-") + + st.markdown("---") + + # Metric selection for pivot + metric = st.selectbox( + "Select Metric for Heatmap", + ["pass_rate", "agent_time", "cost", "total_tokens"], + ) + + if metric == "pass_rate": + # Calculate pass rate per test x model + pass_df = filtered_df.copy() + pass_df["is_passed"] = (pass_df["status"] == "passed").astype(int) + pivot = create_pivot_table(pass_df, "is_passed", "mean") * 100 + st.subheader("Pass Rate % (Test x Model)") + elif metric == "agent_time": + pivot = create_pivot_table(filtered_df, "agent_time", "mean") + st.subheader("Avg Agent Time in seconds (Test x Model)") + elif metric == "cost": + pivot = create_pivot_table(filtered_df, "cost", "mean") + st.subheader("Avg Cost in $ (Test x Model)") + else: + pivot = create_pivot_table(filtered_df, "total_tokens", "mean") + st.subheader("Avg Tokens (Test x Model)") + + if not pivot.empty: + # Round values + pivot = pivot.round(2 if metric in ["agent_time", "cost"] else 1) + st.dataframe(pivot, use_container_width=True) + + # Download button + csv = pivot.to_csv() + st.download_button( + "Download as CSV", + csv, + f"benchmark_{metric}_matrix.csv", + "text/csv", + ) + + # Tab 5: Raw Results + with tab5: + st.header("Raw Results") + + # Sort options + sort_col = st.selectbox( + "Sort by", + ["started_at", "test_id", "model", "status", "total_time", "cost"], + ) + sort_order = st.radio("Order", ["Descending", "Ascending"], horizontal=True) + + sorted_df = filtered_df.sort_values( + sort_col, ascending=(sort_order == "Ascending") + ) + + # Create display dataframe with investigation links + display_df = sorted_df[["test_id", "model", "status", "score", "agent_time", "cost", "total_tokens", "started_at", "investigation_url"]].copy() + + display_df.columns = [ + "Test ID", "Model", "Status", "Score", "Agent Time (s)", + "Cost ($)", "Tokens", "Started At", "Investigation", + ] + + # Use LinkColumn for clickable investigation links + st.dataframe( + display_df, + use_container_width=True, + hide_index=True, + column_config={ + "Investigation": st.column_config.LinkColumn( + "Investigation", + help="Open investigation in DrDroid (opens in new tab)", + display_text="Open", + ), + }, + 
) + + st.markdown("---") + + # Detailed view for selected result + st.subheader("Result Details") + + result_options = sorted_df["run_id"].tolist() + if result_options: + selected_run = st.selectbox("Select Run", result_options) + + if selected_run: + result_row = sorted_df[sorted_df["run_id"] == selected_run].iloc[0] + + col1, col2 = st.columns(2) + + with col1: + st.markdown("**Test Info**") + st.write(f"**Test ID:** {result_row['test_id']}") + st.write(f"**Model:** {result_row['model']}") + st.write(f"**Status:** {result_row['status']}") + st.write(f"**Score:** {result_row['score']}") + + # Investigation link + if result_row.get("investigation_url"): + st.markdown( + f"**Investigation:** [Open in DrDroid]({result_row['investigation_url']})" + ) + + st.markdown("**Timing**") + st.write(f"Setup: {result_row['setup_time']:.2f}s") + st.write(f"Agent: {result_row['agent_time']:.2f}s") + st.write(f"Judge: {result_row['judge_time']:.2f}s") + st.write(f"Total: {result_row['total_time']:.2f}s") + + with col2: + st.markdown("**Costs & Tokens**") + st.write(f"Cost: ${result_row['cost']:.4f}") + st.write(f"Total Tokens: {result_row['total_tokens']:,}") + st.write(f"Prompt Tokens: {result_row['prompt_tokens']:,}") + st.write(f"Completion Tokens: {result_row['completion_tokens']:,}") + + st.markdown("**User Prompt**") + st.code(result_row["user_prompt"], language=None) + + st.markdown("**Expected Output**") + expected = result_row["expected_output"] + if isinstance(expected, list): + for exp in expected: + st.write(f"- {exp}") + else: + st.write(expected) + + st.markdown("**Actual Output**") + st.code(result_row["actual_output"] or "N/A", language=None) + + if result_row["judge_rationale"]: + st.markdown("**Judge Rationale**") + st.write(result_row["judge_rationale"]) + + if result_row["error_message"]: + st.markdown("**Error**") + st.error(result_row["error_message"]) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/executor.py b/benchmarks/executor.py new file mode 100644 index 0000000000..a5e28f52bc --- /dev/null +++ b/benchmarks/executor.py @@ -0,0 +1,817 @@ +#!/usr/bin/env python3 +""" +Benchmark Test Suite Executor + +A comprehensive test suite for running LLM evaluation tests against different agents. +Each test run is saved to a JSON file in the results directory for analysis. 
+
+Usage:
+    # Run single test
+    python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods
+
+    # Run multiple tests
+    python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods --test-id 02_what_is_wrong_with_pod
+
+    # Run all tests
+    python benchmarks/executor.py --model sonnet4.5 --agent drdroid --all
+
+    # Run tests by tag
+    python benchmarks/executor.py --model sonnet4.5 --agent drdroid --tag kubernetes --tag easy
+
+    # List available tests and agents
+    python benchmarks/executor.py --list-tests
+    python benchmarks/executor.py --list-agents
+"""
+
+import argparse
+import json
+import logging
+import os
+import subprocess
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import yaml
+
+# Add project root to path
+PROJECT_ROOT = Path(__file__).parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+from benchmarks.agent import AgentResult, TestCase, get_agent, list_agents
+from benchmarks.config import (
+    FIXTURES_DIR,
+    RESULTS_DIR,
+    Credentials,
+    ensure_directories,
+    load_credentials,
+)
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Data Models
+# =============================================================================
+
+
+@dataclass
+class TestResult:
+    """Complete result of a single test execution."""
+
+    # Identifiers
+    test_id: str
+    agent: str
+    model: str  # Model used (e.g., "sonnet4.5", "gpt5.2")
+    run_id: str  # Unique ID for this run
+
+    # Test case info
+    user_prompt: str
+    expected_output: List[str]
+    tags: List[str]
+
+    # Execution results
+    status: str  # "passed", "failed", "setup_failed", "error"
+    actual_output: Optional[str] = None
+    tool_calls: List[str] = field(default_factory=list)
+
+    # Judge evaluation
+    score: Optional[float] = None
+    judge_rationale: Optional[str] = None
+    judge_model: Optional[str] = None
+
+    # Timing
+    setup_time: float = 0.0
+    agent_time: float = 0.0
+    judge_time: float = 0.0
+    cleanup_time: float = 0.0
+    total_time: float = 0.0
+
+    # Agent metadata
+    agent_metadata: Dict[str, Any] = field(default_factory=dict)
+
+    # Error info
+    error_message: Optional[str] = None
+    error_type: Optional[str] = None
+
+    # Timestamps
+    started_at: str = field(default_factory=lambda: datetime.now().isoformat())
+    completed_at: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return asdict(self)
+
+
+# =============================================================================
+# Test Case Loading
+# =============================================================================
+
+
+def load_test_case(test_id: str, fixtures_path: Path = FIXTURES_DIR) -> TestCase:
+    """Load a test case from YAML."""
+    folder = fixtures_path / test_id
+    yaml_path = folder / "test_case.yaml"
+
+    if not yaml_path.exists():
+        raise ValueError(f"Test case not found: {test_id}")
+
+    with open(yaml_path, "r") as f:
+        data = yaml.safe_load(f)
+
+    expected = data.get("expected_output", [])
+    if isinstance(expected, str):
+        expected = [expected]
+
+    user_prompt = data.get("user_prompt", "")
+    if isinstance(user_prompt, list):
+        user_prompt = user_prompt[0]
+
+    return TestCase(
+        id=test_id,
+        folder=str(folder),
+        user_prompt=user_prompt,
+        
expected_output=expected, + before_test=data.get("before_test"), + after_test=data.get("after_test"), + tags=data.get("tags", []), + setup_timeout=data.get("setup_timeout", 300), + ) + + +def discover_tests( + fixtures_path: Path = FIXTURES_DIR, + tags: Optional[List[str]] = None, +) -> List[str]: + """Discover all test IDs, optionally filtered by tags.""" + test_ids = [] + + for item in fixtures_path.iterdir(): + if item.is_dir() and not item.name.startswith("."): + yaml_path = item / "test_case.yaml" + if yaml_path.exists(): + # Filter by tags if specified + if tags: + with open(yaml_path, "r") as f: + data = yaml.safe_load(f) + test_tags = data.get("tags", []) + if not any(t in test_tags for t in tags): + continue + + test_ids.append(item.name) + + return sorted(test_ids) + + +# ============================================================================= +# Setup/Cleanup Execution +# ============================================================================= + + +def run_bash_script( + script: str, + cwd: str, + timeout: int = 300, + credentials: Optional[Credentials] = None, + stream_output: bool = True, +) -> tuple[bool, str, float]: + """Run a bash script with credentials applied to environment. + + Args: + script: Bash script to run + cwd: Working directory + timeout: Timeout in seconds + credentials: Credentials to apply to environment + stream_output: If True, stream output to console in real-time + """ + if not script or not script.strip(): + return True, "", 0.0 + + # Prepare environment with credentials + env = os.environ.copy() + if credentials: + env.update(credentials.to_env_vars()) + + start_time = time.time() + + try: + if stream_output: + # Use Popen to stream output in real-time + process = subprocess.Popen( + script, + shell=True, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, # Merge stderr into stdout + text=True, + cwd=cwd, + env=env, + bufsize=1, # Line buffered + ) + + output_lines = [] + try: + # Read and print output line by line + while True: + line = process.stdout.readline() + if not line and process.poll() is not None: + break + if line: + print(f" | {line.rstrip()}") + output_lines.append(line) + + # Check timeout + if time.time() - start_time > timeout: + process.kill() + process.wait() + elapsed = time.time() - start_time + return False, f"Timeout after {timeout}s", elapsed + + process.wait() + elapsed = time.time() - start_time + output = "".join(output_lines) + + if process.returncode != 0: + return False, f"Exit code {process.returncode}\n{output}", elapsed + + return True, output, elapsed + + except Exception as e: + process.kill() + process.wait() + raise e + else: + # Original behavior: capture output without streaming + result = subprocess.run( + script, + shell=True, + executable="/bin/bash", + capture_output=True, + text=True, + cwd=cwd, + timeout=timeout, + env=env, + ) + elapsed = time.time() - start_time + output = f"stdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" + + if result.returncode != 0: + return False, f"Exit code {result.returncode}\n{output}", elapsed + + return True, output, elapsed + + except subprocess.TimeoutExpired: + elapsed = time.time() - start_time + return False, f"Timeout after {timeout}s", elapsed + except Exception as e: + elapsed = time.time() - start_time + return False, f"Error: {str(e)}", elapsed + + +# ============================================================================= +# LLM Judge +# ============================================================================= + + 
+def evaluate_with_llm_judge( + expected_elements: List[str], + actual_output: str, + classifier_model: str = "gpt-4.1", +) -> tuple[float, str]: + """Evaluate output using LLM-as-judge.""" + try: + from autoevals import LLMClassifier + except ImportError: + logger.error("autoevals not installed. Run: pip install autoevals") + return 0.0, "autoevals not installed" + + expected_str = "\n- ".join(expected_elements) + + prompt_template = """ +You are evaluating the correctness of an OUTPUT given by a LLM. You must return a score that +represents the correctness of that OUTPUT. + +The correctness is defined by the presence of EXPECTED ELEMENTS in the OUTPUT. +Make a judgement call whether each ELEMENT sufficiently matches the OUTPUT. ELEMENTS do +not need to appear verbatim or be a perfect match but their essence should be +present in the whole OUTPUT, even if it spans multiple sentences. + +# EXPECTED ELEMENTS + +- {{expected}} + +# OUTPUT + +{{output}} + + +Return a choice based on the number of EXPECTED ELEMENTS present in the OUTPUT. +Possible choices: +- A: All elements are present +- B: Either no element is present or only some but not all elements are present +""" + + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + logger.warning("OPENAI_API_KEY not set, skipping LLM judge") + return 0.0, "No API key for judge" + + try: + classifier = LLMClassifier( + name="Correctness", + prompt_template=prompt_template, + choice_scores={"A": 1, "B": 0}, + use_cot=True, + model=classifier_model, + api_key=api_key, + ) + + result = classifier( + input=prompt_template, + output=actual_output, + expected=expected_str, + ) + + return result.score, result.metadata.get("rationale", "") + + except Exception as e: + logger.error(f"Judge error: {e}") + return 0.0, f"Judge error: {str(e)}" + + +# ============================================================================= +# Result Storage +# ============================================================================= + + +def generate_run_id(model: str, test_id: str) -> str: + """Generate a unique run ID based on model and test.""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"{model}_{test_id}_{timestamp}" + + +def save_result(result: TestResult, results_dir: Path = RESULTS_DIR) -> Path: + """Save a test result to JSON file.""" + ensure_directories() + + filename = f"{result.run_id}.json" + filepath = results_dir / filename + + with open(filepath, "w") as f: + json.dump(result.to_dict(), f, indent=2, default=str) + + return filepath + + +def load_results( + results_dir: Path = RESULTS_DIR, + agent: Optional[str] = None, + test_id: Optional[str] = None, +) -> List[TestResult]: + """Load results from JSON files, optionally filtered.""" + results = [] + + if not results_dir.exists(): + return results + + for filepath in results_dir.glob("*.json"): + try: + with open(filepath, "r") as f: + data = json.load(f) + + # Filter by agent + if agent and data.get("agent") != agent: + continue + + # Filter by test_id + if test_id and data.get("test_id") != test_id: + continue + + results.append(TestResult(**data)) + except Exception as e: + logger.warning(f"Failed to load {filepath}: {e}") + + return sorted(results, key=lambda r: r.started_at, reverse=True) + + +# ============================================================================= +# Test Executor +# ============================================================================= + + +class BenchmarkExecutor: + """Executes benchmark tests against registered agents.""" + + def 
__init__(
+        self,
+        agent: str,
+        model: str,
+        credentials: Optional[Credentials] = None,
+        skip_setup: bool = False,
+        skip_cleanup: bool = False,
+        classifier_model: str = "gpt-4.1",
+        quiet: bool = False,
+    ):
+        if not agent:
+            raise ValueError(
+                "Agent must be specified. Use --agent <agent_name>. "
+                f"Available agents: {', '.join(list_agents())}"
+            )
+
+        if not model:
+            raise ValueError(
+                "Model must be specified. Use --model <model_name>. "
+                "Examples: sonnet4.5, gpt5.2, gpt-4.1"
+            )
+
+        self.agent_name = agent
+        self.model = model
+        self.agent_fn = get_agent(agent)
+        self.credentials = credentials or load_credentials()
+        self.skip_setup = skip_setup
+        self.skip_cleanup = skip_cleanup
+        self.classifier_model = classifier_model
+        self.quiet = quiet  # If True, don't stream script output to console
+        self.results: List[TestResult] = []
+
+        # Apply credentials to environment
+        self.credentials.apply_to_env()
+
+        # Set model in environment for agent to use
+        os.environ["DRDROID_MODEL"] = model
+
+    def run_test(self, test_case: TestCase) -> TestResult:
+        """Run a single test case."""
+        run_id = generate_run_id(self.model, test_case.id)
+        start_time = time.time()
+
+        logger.info(f"{'='*60}")
+        logger.info(f"Test: {test_case.id}")
+        logger.info(f"Agent: {self.agent_name}")
+        logger.info(f"Model: {self.model}")
+        logger.info(f"Run ID: {run_id}")
+        logger.info(f"{'='*60}")
+
+        result = TestResult(
+            test_id=test_case.id,
+            agent=self.agent_name,
+            model=self.model,
+            run_id=run_id,
+            user_prompt=test_case.user_prompt,
+            expected_output=test_case.expected_output,
+            tags=test_case.tags,
+            status="error",
+        )
+
+        # 1. Run setup
+        if not self.skip_setup and test_case.before_test:
+            logger.info("Running setup...")
+            success, output, elapsed = run_bash_script(
+                test_case.before_test,
+                test_case.folder,
+                timeout=test_case.setup_timeout,
+                credentials=self.credentials,
+                stream_output=not self.quiet,  # Stream output unless --quiet
+            )
+            result.setup_time = elapsed
+
+            if not success:
+                logger.error(f"Setup failed:\n{output[:500]}...")
+                result.status = "setup_failed"
+                result.error_message = output
+                result.error_type = "setup_failure"
+                result.completed_at = datetime.now().isoformat()
+                result.total_time = time.time() - start_time
+                save_result(result)
+                return result
+
+            logger.info(f"Setup completed in {elapsed:.2f}s")
+
+        # 2. Run agent
+        try:
+            logger.info(f"Running agent [{self.agent_name}]...")
+            agent_start = time.time()
+
+            agent_result = self.agent_fn(test_case)
+
+            result.agent_time = time.time() - agent_start
+            result.actual_output = agent_result.output
+            result.tool_calls = agent_result.tool_calls
+            result.agent_metadata = agent_result.metadata
+
+            logger.info(f"Agent completed in {result.agent_time:.2f}s")
+
+        except Exception as e:
+            logger.error(f"Agent error: {e}")
+            result.status = "error"
+            result.error_message = str(e)
+            result.error_type = type(e).__name__
+            self._run_cleanup(test_case, result)
+            result.completed_at = datetime.now().isoformat()
+            result.total_time = time.time() - start_time
+            save_result(result)
+            return result
+
+        # 3. 
Run LLM judge + try: + logger.info("Running LLM judge...") + judge_start = time.time() + + score, rationale = evaluate_with_llm_judge( + test_case.expected_output, + result.actual_output, + self.classifier_model, + ) + + result.judge_time = time.time() - judge_start + result.score = score + result.judge_rationale = rationale + result.judge_model = self.classifier_model + result.status = "passed" if score == 1 else "failed" + + logger.info(f"Judge score: {score} ({result.status})") + + except Exception as e: + logger.error(f"Judge error: {e}") + result.status = "error" + result.error_message = f"Judge error: {str(e)}" + result.error_type = "judge_error" + + # 4. Run cleanup + self._run_cleanup(test_case, result) + + # Finalize + result.completed_at = datetime.now().isoformat() + result.total_time = time.time() - start_time + + # Save result + filepath = save_result(result) + logger.info(f"Result saved to: {filepath}") + + return result + + def _run_cleanup(self, test_case: TestCase, result: TestResult) -> None: + """Run cleanup after test.""" + if not self.skip_cleanup and test_case.after_test: + logger.info("Running cleanup...") + success, output, elapsed = run_bash_script( + test_case.after_test, + test_case.folder, + timeout=120, + credentials=self.credentials, + stream_output=False, # Cleanup output is less important + ) + result.cleanup_time = elapsed + if not success: + logger.warning(f"Cleanup failed (non-fatal)") + else: + logger.info(f"Cleanup completed in {elapsed:.2f}s") + + def run_tests(self, test_ids: List[str]) -> List[TestResult]: + """Run multiple tests.""" + results = [] + + for i, test_id in enumerate(test_ids, 1): + logger.info(f"\n[{i}/{len(test_ids)}] Running test: {test_id}") + try: + test_case = load_test_case(test_id) + result = self.run_test(test_case) + results.append(result) + self.results.append(result) + except Exception as e: + logger.error(f"Error loading test {test_id}: {e}") + + return results + + def print_summary(self) -> None: + """Print summary of all results.""" + total = len(self.results) + passed = sum(1 for r in self.results if r.status == "passed") + failed = sum(1 for r in self.results if r.status == "failed") + setup_failed = sum(1 for r in self.results if r.status == "setup_failed") + errors = sum(1 for r in self.results if r.status == "error") + + total_time = sum(r.total_time for r in self.results) + + print("\n" + "=" * 70) + print(f"BENCHMARK RESULTS") + print(f"Model: {self.model}") + print("=" * 70) + print(f"Total: {total}") + print(f"Passed: {passed} ✅") + print(f"Failed: {failed} ❌") + print(f"Setup Failed: {setup_failed} 🔧") + print(f"Errors: {errors} ⚠️") + print(f"Pass Rate: {(passed/total*100):.1f}%" if total > 0 else "N/A") + print(f"Total Time: {total_time:.2f}s") + print("=" * 70) + + print("\nDetailed Results:") + print("-" * 70) + for r in self.results: + icon = {"passed": "✅", "failed": "❌", "setup_failed": "🔧", "error": "⚠️"}.get( + r.status, "?" 
+ ) + print(f"{icon} {r.test_id}: {r.status.upper()} (score={r.score}, time={r.total_time:.1f}s)") + if r.error_message: + print(f" Error: {r.error_message[:80]}...") + print() + + +# ============================================================================= +# CLI +# ============================================================================= + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark Test Suite Executor", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run single test with model + python benchmarks/executor.py --model sonnet4.5 --test-id 01_how_many_pods + + # Run multiple tests + python benchmarks/executor.py --model gpt5.2 --test-id 01_how_many_pods --test-id 02_what_is_wrong_with_pod + + # Run all tests + python benchmarks/executor.py --model sonnet4.5 --all + + # Run tests by tag + python benchmarks/executor.py --model sonnet4.5 --tag kubernetes --tag easy + + # List available tests + python benchmarks/executor.py --list-tests + python benchmarks/executor.py --list-tests --tag kubernetes + """, + ) + + # Model selection (REQUIRED) + parser.add_argument( + "--model", + help="Model to use for testing (REQUIRED). Examples: sonnet4.5, gpt5.2", + ) + + # Agent selection (defaults to drdroid) + parser.add_argument( + "--agent", + default="drdroid", + help="Agent to use (default: drdroid)", + ) + + # Test selection + parser.add_argument( + "--test-id", + action="append", + dest="test_ids", + help="Test ID(s) to run (can specify multiple)", + ) + parser.add_argument( + "--all", + action="store_true", + help="Run all available tests", + ) + parser.add_argument( + "--tag", + action="append", + dest="tags", + help="Filter tests by tag (can specify multiple)", + ) + + # Execution options + parser.add_argument( + "--skip-setup", + action="store_true", + help="Skip before_test scripts", + ) + parser.add_argument( + "--skip-cleanup", + action="store_true", + help="Skip after_test scripts", + ) + parser.add_argument( + "--classifier-model", + default=os.environ.get("CLASSIFIER_MODEL", "gpt-4.1"), + help="Model for LLM judge (default: gpt-4.1)", + ) + parser.add_argument( + "--credentials", + type=Path, + help="Path to credentials YAML file", + ) + + # List commands + parser.add_argument( + "--list-tests", + action="store_true", + help="List all available test IDs", + ) + parser.add_argument( + "--list-agents", + action="store_true", + help="List all registered agents", + ) + + # Output options + parser.add_argument( + "-v", "--verbose", + action="store_true", + help="Verbose output", + ) + parser.add_argument( + "-q", "--quiet", + action="store_true", + help="Suppress setup/cleanup script output (don't stream to console)", + ) + + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + # Handle list commands + if args.list_agents: + print("Available agents:") + for name in list_agents(): + print(f" - {name}") + return + + if args.list_tests: + test_ids = discover_tests(tags=args.tags) + print(f"Available tests ({len(test_ids)}):") + for tid in test_ids: + test_case = load_test_case(tid) + tags_str = f" [{', '.join(test_case.tags)}]" if test_case.tags else "" + print(f" - {tid}{tags_str}") + return + + # Validate model is specified for test execution + if not args.model: + parser.print_help() + print("\n" + "=" * 60) + print("ERROR: --model is required for test execution") + print("Examples: --model sonnet4.5, --model gpt5.2") + print("=" * 60) + sys.exit(1) + + # Load credentials + 
credentials = load_credentials(args.credentials) + + # Create executor + executor = BenchmarkExecutor( + agent=args.agent, + model=args.model, + credentials=credentials, + skip_setup=args.skip_setup, + skip_cleanup=args.skip_cleanup, + classifier_model=args.classifier_model, + quiet=args.quiet, + ) + + # Determine tests to run + if args.all: + test_ids = discover_tests(tags=args.tags) + elif args.test_ids: + test_ids = args.test_ids + elif args.tags: + test_ids = discover_tests(tags=args.tags) + else: + parser.print_help() + print("\nError: Specify --test-id, --all, or --tag") + sys.exit(1) + + if not test_ids: + print("No tests found matching criteria") + sys.exit(1) + + # Run tests + logger.info(f"Running {len(test_ids)} test(s) with model '{args.model}'...") + executor.run_tests(test_ids) + + # Print summary + executor.print_summary() + + # Exit code based on results + report = { + "passed": sum(1 for r in executor.results if r.status == "passed"), + "total": len(executor.results), + } + + if report["passed"] == report["total"] and report["total"] > 0: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/reporter.py b/benchmarks/reporter.py new file mode 100644 index 0000000000..306d89a6a8 --- /dev/null +++ b/benchmarks/reporter.py @@ -0,0 +1,626 @@ +#!/usr/bin/env python3 +""" +Benchmark Report Generator + +Generates reports from saved test results focusing on MODEL and TEST CASE analysis. + +Usage: + # Summary report + python benchmarks/reporter.py --summary + + # Compare models + python benchmarks/reporter.py --compare-models + + # Report by test case (use case) + python benchmarks/reporter.py --by-test + + # Report for specific model + python benchmarks/reporter.py --model sonnet4.5 + + # Report for specific test case + python benchmarks/reporter.py --test-id 01_how_many_pods + + # Export to JSON/CSV + python benchmarks/reporter.py --compare-models --output report.json +""" + +import argparse +import csv +import json +import sys +from collections import defaultdict +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +# Add project root to path +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(PROJECT_ROOT)) + +from benchmarks.config import RESULTS_DIR + + +def load_all_results(results_dir: Path = RESULTS_DIR) -> List[Dict[str, Any]]: + """Load all results from JSON files.""" + results = [] + + if not results_dir.exists(): + return results + + for filepath in results_dir.glob("*.json"): + try: + with open(filepath, "r") as f: + data = json.load(f) + results.append(data) + except Exception as e: + print(f"Warning: Failed to load {filepath}: {e}") + + return sorted(results, key=lambda r: r.get("started_at", ""), reverse=True) + + +def filter_results( + results: List[Dict[str, Any]], + model: Optional[str] = None, + test_id: Optional[str] = None, + status: Optional[str] = None, + since: Optional[str] = None, +) -> List[Dict[str, Any]]: + """Filter results by criteria.""" + filtered = results + + if model: + filtered = [r for r in filtered if r.get("model") == model] + + if test_id: + filtered = [r for r in filtered if r.get("test_id") == test_id] + + if status: + filtered = [r for r in filtered if r.get("status") == status] + + if since: + filtered = [r for r in filtered if r.get("started_at", "") >= since] + + return filtered + + +def generate_summary_report(results: List[Dict[str, Any]]) -> Dict[str, Any]: + """Generate overall summary report.""" + if not results: + 
return {"error": "No results found"} + + total = len(results) + passed = sum(1 for r in results if r.get("status") == "passed") + failed = sum(1 for r in results if r.get("status") == "failed") + setup_failed = sum(1 for r in results if r.get("status") == "setup_failed") + errors = sum(1 for r in results if r.get("status") == "error") + + # Timing stats + total_times = [r.get("total_time", 0) for r in results if r.get("total_time")] + agent_times = [r.get("agent_time", 0) for r in results if r.get("agent_time")] + + # Unique counts + unique_models = set(r.get("model") for r in results if r.get("model")) + unique_tests = set(r.get("test_id") for r in results if r.get("test_id")) + + return { + "summary": { + "total_runs": total, + "passed": passed, + "failed": failed, + "setup_failed": setup_failed, + "errors": errors, + "pass_rate": f"{(passed / total * 100):.1f}%" if total > 0 else "N/A", + }, + "timing": { + "avg_total_time": round(sum(total_times) / len(total_times), 2) if total_times else 0, + "avg_agent_time": round(sum(agent_times) / len(agent_times), 2) if agent_times else 0, + "max_total_time": round(max(total_times), 2) if total_times else 0, + "min_total_time": round(min(total_times), 2) if total_times else 0, + }, + "coverage": { + "unique_models": len(unique_models), + "unique_tests": len(unique_tests), + "models": sorted(unique_models), + }, + "generated_at": datetime.now().isoformat(), + } + + +def generate_model_comparison(results: List[Dict[str, Any]]) -> Dict[str, Any]: + """Generate model comparison report.""" + model_stats = defaultdict(lambda: { + "total": 0, + "passed": 0, + "failed": 0, + "setup_failed": 0, + "errors": 0, + "total_time": 0, + "agent_time": 0, + "tests": set(), + }) + + for r in results: + model = r.get("model", "unknown") + status = r.get("status", "unknown") + + model_stats[model]["total"] += 1 + model_stats[model]["tests"].add(r.get("test_id")) + + if status == "passed": + model_stats[model]["passed"] += 1 + elif status == "failed": + model_stats[model]["failed"] += 1 + elif status == "setup_failed": + model_stats[model]["setup_failed"] += 1 + else: + model_stats[model]["errors"] += 1 + + model_stats[model]["total_time"] += r.get("total_time", 0) + model_stats[model]["agent_time"] += r.get("agent_time", 0) + + # Convert to serializable format + comparison = {} + for model, stats in sorted(model_stats.items()): + total = stats["total"] + comparison[model] = { + "total_runs": total, + "passed": stats["passed"], + "failed": stats["failed"], + "setup_failed": stats["setup_failed"], + "errors": stats["errors"], + "pass_rate": f"{(stats['passed'] / total * 100):.1f}%" if total > 0 else "N/A", + "pass_rate_numeric": round(stats['passed'] / total * 100, 1) if total > 0 else 0, + "avg_total_time": round(stats["total_time"] / total, 2) if total > 0 else 0, + "avg_agent_time": round(stats["agent_time"] / total, 2) if total > 0 else 0, + "unique_tests": len(stats["tests"]), + } + + return { + "model_comparison": comparison, + "generated_at": datetime.now().isoformat(), + } + + +def generate_test_case_report(results: List[Dict[str, Any]]) -> Dict[str, Any]: + """Generate report grouped by test case (use case).""" + test_stats = defaultdict(lambda: { + "models": defaultdict(lambda: { + "runs": 0, + "passed": 0, + "failed": 0, + "avg_time": 0, + "total_time": 0, + }), + "tags": [], + "user_prompt": "", + }) + + for r in results: + test_id = r.get("test_id", "unknown") + model = r.get("model", "unknown") + status = r.get("status", "unknown") + + 
test_stats[test_id]["models"][model]["runs"] += 1 + test_stats[test_id]["models"][model]["total_time"] += r.get("total_time", 0) + + if status == "passed": + test_stats[test_id]["models"][model]["passed"] += 1 + elif status == "failed": + test_stats[test_id]["models"][model]["failed"] += 1 + + # Store test metadata + if not test_stats[test_id]["user_prompt"]: + test_stats[test_id]["user_prompt"] = r.get("user_prompt", "") + test_stats[test_id]["tags"] = r.get("tags", []) + + # Convert to serializable format + report = {} + for test_id, stats in sorted(test_stats.items()): + model_results = {} + for model, model_stats in stats["models"].items(): + runs = model_stats["runs"] + model_results[model] = { + "runs": runs, + "passed": model_stats["passed"], + "failed": model_stats["failed"], + "pass_rate": f"{(model_stats['passed'] / runs * 100):.1f}%" if runs > 0 else "N/A", + "avg_time": round(model_stats["total_time"] / runs, 2) if runs > 0 else 0, + } + + report[test_id] = { + "tags": stats["tags"], + "user_prompt": stats["user_prompt"][:100] + "..." if len(stats["user_prompt"]) > 100 else stats["user_prompt"], + "models": model_results, + } + + return { + "test_case_report": report, + "generated_at": datetime.now().isoformat(), + } + + +def generate_single_test_report( + results: List[Dict[str, Any]], + test_id: str, +) -> Dict[str, Any]: + """Generate detailed report for a specific test case.""" + test_results = [r for r in results if r.get("test_id") == test_id] + + if not test_results: + return {"error": f"No results found for test: {test_id}"} + + # Group by model + by_model = defaultdict(list) + for r in test_results: + by_model[r.get("model", "unknown")].append(r) + + model_performance = {} + for model, runs in sorted(by_model.items()): + passed = sum(1 for r in runs if r.get("status") == "passed") + failed = sum(1 for r in runs if r.get("status") == "failed") + total = len(runs) + + # Get latest run details + latest = runs[0] if runs else {} + + model_performance[model] = { + "runs": total, + "passed": passed, + "failed": failed, + "pass_rate": f"{(passed / total * 100):.1f}%" if total > 0 else "N/A", + "avg_time": round(sum(r.get("total_time", 0) for r in runs) / total, 2) if total > 0 else 0, + "latest_status": latest.get("status"), + "latest_score": latest.get("score"), + "latest_run_id": latest.get("run_id"), + "latest_judge_rationale": latest.get("judge_rationale", "")[:200] if latest.get("judge_rationale") else None, + } + + return { + "test_id": test_id, + "total_runs": len(test_results), + "models_tested": sorted(by_model.keys()), + "tags": test_results[0].get("tags", []) if test_results else [], + "user_prompt": test_results[0].get("user_prompt") if test_results else None, + "expected_output": test_results[0].get("expected_output") if test_results else None, + "model_performance": model_performance, + "generated_at": datetime.now().isoformat(), + } + + +def generate_detailed_report(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Generate detailed report with all results.""" + return [ + { + "test_id": r.get("test_id"), + "model": r.get("model"), + "run_id": r.get("run_id"), + "status": r.get("status"), + "score": r.get("score"), + "total_time": r.get("total_time"), + "agent_time": r.get("agent_time"), + "setup_time": r.get("setup_time"), + "judge_time": r.get("judge_time"), + "started_at": r.get("started_at"), + "error_message": r.get("error_message"), + } + for r in results + ] + + +# ============================================================================= 
+# Console Output Functions +# ============================================================================= + + +def print_summary_report(report: Dict[str, Any]) -> None: + """Print summary report to console.""" + summary = report.get("summary", {}) + timing = report.get("timing", {}) + coverage = report.get("coverage", {}) + + print("\n" + "=" * 70) + print("BENCHMARK SUMMARY REPORT") + print("=" * 70) + + print("\nOverall Statistics:") + print(f" Total Runs: {summary.get('total_runs', 0)}") + print(f" Passed: {summary.get('passed', 0)} ✅") + print(f" Failed: {summary.get('failed', 0)} ❌") + print(f" Setup Failed: {summary.get('setup_failed', 0)} 🔧") + print(f" Errors: {summary.get('errors', 0)} ⚠️") + print(f" Pass Rate: {summary.get('pass_rate', 'N/A')}") + + print("\nTiming:") + print(f" Avg Total Time: {timing.get('avg_total_time', 0):.2f}s") + print(f" Avg Agent Time: {timing.get('avg_agent_time', 0):.2f}s") + + print("\nCoverage:") + print(f" Unique Models: {coverage.get('unique_models', 0)}") + print(f" Unique Tests: {coverage.get('unique_tests', 0)}") + print(f" Models: {', '.join(coverage.get('models', []))}") + + print("=" * 70 + "\n") + + +def print_model_comparison(report: Dict[str, Any]) -> None: + """Print model comparison to console.""" + comparison = report.get("model_comparison", {}) + + print("\n" + "=" * 70) + print("MODEL COMPARISON REPORT") + print("=" * 70) + + # Header + print(f"\n{'Model':<20} {'Runs':>6} {'Pass':>6} {'Fail':>6} {'Rate':>8} {'Avg Time':>10}") + print("-" * 70) + + for model, stats in sorted(comparison.items(), key=lambda x: -x[1].get("pass_rate_numeric", 0)): + print( + f"{model:<20} " + f"{stats.get('total_runs', 0):>6} " + f"{stats.get('passed', 0):>6} " + f"{stats.get('failed', 0):>6} " + f"{stats.get('pass_rate', 'N/A'):>8} " + f"{stats.get('avg_agent_time', 0):>9.2f}s" + ) + + print("=" * 70 + "\n") + + +def print_test_case_report(report: Dict[str, Any]) -> None: + """Print test case report to console.""" + test_cases = report.get("test_case_report", {}) + + print("\n" + "=" * 70) + print("TEST CASE REPORT (by Use Case)") + print("=" * 70) + + for test_id, data in sorted(test_cases.items()): + tags_str = f" [{', '.join(data.get('tags', []))}]" if data.get('tags') else "" + print(f"\n{test_id}{tags_str}") + print(f" Prompt: {data.get('user_prompt', 'N/A')}") + + models = data.get("models", {}) + if models: + print(f" {'Model':<18} {'Runs':>5} {'Pass':>5} {'Rate':>8} {'Time':>8}") + print(f" {'-'*50}") + for model, stats in sorted(models.items()): + print( + f" {model:<18} " + f"{stats.get('runs', 0):>5} " + f"{stats.get('passed', 0):>5} " + f"{stats.get('pass_rate', 'N/A'):>8} " + f"{stats.get('avg_time', 0):>7.2f}s" + ) + + print("\n" + "=" * 70 + "\n") + + +def print_single_test_report(report: Dict[str, Any]) -> None: + """Print single test report to console.""" + if "error" in report: + print(f"Error: {report['error']}") + return + + print("\n" + "=" * 70) + print(f"TEST REPORT: {report.get('test_id', 'Unknown')}") + print("=" * 70) + + print(f"\nTotal Runs: {report.get('total_runs', 0)}") + print(f"Models Tested: {', '.join(report.get('models_tested', []))}") + print(f"Tags: {', '.join(report.get('tags', []))}") + + print(f"\nPrompt: {report.get('user_prompt', 'N/A')}") + + print("\nExpected Output:") + for exp in report.get("expected_output", []): + print(f" - {exp}") + + print("\nModel Performance:") + print(f" {'Model':<18} {'Runs':>5} {'Pass':>5} {'Rate':>8} {'Latest':>10} {'Score':>6}") + print(f" {'-'*60}") + + for model, perf in 
sorted(report.get("model_performance", {}).items()): + print( + f" {model:<18} " + f"{perf.get('runs', 0):>5} " + f"{perf.get('passed', 0):>5} " + f"{perf.get('pass_rate', 'N/A'):>8} " + f"{perf.get('latest_status', 'N/A'):>10} " + f"{perf.get('latest_score', 'N/A'):>6}" + ) + + print("\n" + "=" * 70 + "\n") + + +def export_to_file(data: Any, filepath: Path) -> None: + """Export report to JSON or CSV file.""" + if filepath.suffix == ".json": + with open(filepath, "w") as f: + json.dump(data, f, indent=2, default=str) + print(f"Report saved to: {filepath}") + + elif filepath.suffix == ".csv": + if isinstance(data, list): + rows = data + elif isinstance(data, dict) and "model_comparison" in data: + rows = [ + {"model": model, **stats} + for model, stats in data["model_comparison"].items() + ] + elif isinstance(data, dict) and "test_case_report" in data: + rows = [] + for test_id, test_data in data["test_case_report"].items(): + for model, model_stats in test_data.get("models", {}).items(): + rows.append({ + "test_id": test_id, + "model": model, + "tags": ", ".join(test_data.get("tags", [])), + **model_stats, + }) + else: + print("Cannot export this report type to CSV") + return + + if rows: + with open(filepath, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=rows[0].keys()) + writer.writeheader() + writer.writerows(rows) + print(f"Report saved to: {filepath}") + else: + print(f"Unsupported file format: {filepath.suffix}") + + +# ============================================================================= +# CLI +# ============================================================================= + + +def main(): + parser = argparse.ArgumentParser( + description="Generate benchmark reports (Model + Test Case level)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Summary report + python benchmarks/reporter.py --summary + + # Compare models side-by-side + python benchmarks/reporter.py --compare-models + + # Report by test case (use case) + python benchmarks/reporter.py --by-test + + # Report for specific model + python benchmarks/reporter.py --model sonnet4.5 + + # Report for specific test case + python benchmarks/reporter.py --test-id 01_how_many_pods + + # Export to file + python benchmarks/reporter.py --compare-models --output comparison.json + python benchmarks/reporter.py --by-test --output tests.csv + """, + ) + + # Report types + parser.add_argument( + "--summary", + action="store_true", + help="Generate summary report", + ) + parser.add_argument( + "--compare-models", + action="store_true", + help="Compare performance across models", + ) + parser.add_argument( + "--by-test", + action="store_true", + help="Report grouped by test case (use case)", + ) + parser.add_argument( + "--detailed", + action="store_true", + help="Show detailed results list", + ) + + # Filters + parser.add_argument( + "--model", + help="Filter by model", + ) + parser.add_argument( + "--test-id", + help="Report on specific test case", + ) + parser.add_argument( + "--status", + choices=["passed", "failed", "setup_failed", "error"], + help="Filter by status", + ) + parser.add_argument( + "--since", + help="Filter results since date (ISO format)", + ) + + # Output + parser.add_argument( + "--output", "-o", + type=Path, + help="Output file (JSON or CSV)", + ) + parser.add_argument( + "--results-dir", + type=Path, + default=RESULTS_DIR, + help="Results directory", + ) + + args = parser.parse_args() + + # Load results + results = load_all_results(args.results_dir) + + if 
not results: + print("No results found in results directory") + print(f"Results directory: {args.results_dir}") + sys.exit(1) + + # Apply filters (except test_id if doing single test report) + filtered = filter_results( + results, + model=args.model, + test_id=None, # Don't filter by test_id for filtering, handled separately + status=args.status, + since=args.since, + ) + + # Generate requested report + if args.test_id: + # Single test report + report = generate_single_test_report(results, args.test_id) + if args.output: + export_to_file(report, args.output) + else: + print_single_test_report(report) + + elif args.compare_models: + report = generate_model_comparison(filtered) + if args.output: + export_to_file(report, args.output) + else: + print_model_comparison(report) + + elif args.by_test: + report = generate_test_case_report(filtered) + if args.output: + export_to_file(report, args.output) + else: + print_test_case_report(report) + + elif args.detailed: + report = generate_detailed_report(filtered) + if args.output: + export_to_file(report, args.output) + else: + for r in report[:30]: + status_icon = {"passed": "✅", "failed": "❌", "setup_failed": "🔧", "error": "⚠️"}.get( + r["status"], "?" + ) + print(f"{status_icon} {r['model']}/{r['test_id']}: {r['status']} (score={r['score']}, time={r['total_time']:.1f}s)") + if len(report) > 30: + print(f"... and {len(report) - 30} more results") + + else: + # Default to summary + report = generate_summary_report(filtered) + if args.output: + export_to_file(report, args.output) + else: + print_summary_report(report) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/results/.gitkeep b/benchmarks/results/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/llm/fixtures/test_ask_holmes/01_how_many_pods/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/01_how_many_pods/test_case.yaml index 3c3ce7e942..7fc6b2ec63 100644 --- a/tests/llm/fixtures/test_ask_holmes/01_how_many_pods/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/01_how_many_pods/test_case.yaml @@ -1,23 +1,23 @@ -user_prompt: "How many pods are in the app-01 namespace?" +user_prompt: "How many pods are in the app-01 namespace in Azure Prod cluster?" expected_output: - There are 14 pods in the app-01 namespace before_test: | kubectl apply -f manifests.yaml # Wait for 14 pods to be created and ready (60s total) - MUST succeed or test fails PODS_READY=false - for i in {1..12}; do + for i in {1..25}; do POD_COUNT=$(kubectl get pods -l app=test-pod -n app-01 --no-headers 2>/dev/null | wc -l) - if [ "$POD_COUNT" -eq 14 ] && kubectl wait --for=condition=ready pod -l app=test-pod -n app-01 --timeout=5s 2>/dev/null; then + if [ "$POD_COUNT" -eq 14 ]; then echo "✅ All 14 pods created and ready!" PODS_READY=true break else - echo "⏳ Attempt $i/12: $POD_COUNT/14 pods found, waiting 5s..." - sleep 5 + echo "⏳ Attempt $i/25: $POD_COUNT/14 pods found, waiting 10s..." 
+ sleep 10 fi done if [ "$PODS_READY" = false ]; then - echo "❌ 14 ready pods not achieved after 60s" + echo "❌ 14 ready pods not achieved after 250s" kubectl get pods -n app-01 exit 1 fi diff --git a/tests/llm/fixtures/test_ask_holmes/02_what_is_wrong_with_pod/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/02_what_is_wrong_with_pod/test_case.yaml index 9c8e8f023d..6d2c72f1fb 100644 --- a/tests/llm/fixtures/test_ask_holmes/02_what_is_wrong_with_pod/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/02_what_is_wrong_with_pod/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: 'What is wrong with << { "type": "pod", "name": "giant-narwhal-6958c5bdd8-69gtn", "namespace": "app-02" } >> ?' +user_prompt: 'What is wrong with << { "type": "pod", "name": "giant-narwhal-6958c5bdd8-69gtn", "namespace": "app-02" } >> in Azure Prod cluster?' expected_output: - The pod was killed due to it being out of memory diff --git a/tests/llm/fixtures/test_ask_holmes/04_related_k8s_events/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/04_related_k8s_events/test_case.yaml index 53c596d72e..5a7dec215e 100644 --- a/tests/llm/fixtures/test_ask_holmes/04_related_k8s_events/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/04_related_k8s_events/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "What kubernetes events are related to the nginx pod in app-04?" +user_prompt: "What kubernetes events are related to the nginx pod in app-04 in Azure Prod cluster?" expected_output: - "Container is failing or crashing (CrashLoopBackOff or similar restart events)" - "Events showing repeated container restarts or failures" diff --git a/tests/llm/fixtures/test_ask_holmes/05_image_version/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/05_image_version/test_case.yaml index bc2b15b971..88b671e4d5 100644 --- a/tests/llm/fixtures/test_ask_holmes/05_image_version/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/05_image_version/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: 'what is the image version of << { "type": "pod", "name": "sea-turtle" } >>' +user_prompt: 'what is the image version of << { "type": "pod", "name": "sea-turtle" } >> in Azure Prod cluster ?' expected_output: - The image version of the `sea-turtle` pod is either `1.23.4` or `nginx:1.23.4`. tags: diff --git a/tests/llm/fixtures/test_ask_holmes/07_high_latency/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/07_high_latency/test_case.yaml index 90eb9a908c..753d80a6bb 100644 --- a/tests/llm/fixtures/test_ask_holmes/07_high_latency/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/07_high_latency/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "Why is there high latency with the customer-orders deployment?" +user_prompt: "Why is there high latency with the customer-orders deployment in Azure Prod Cluster?" expected_output: - High latency in the `customer-orders` deployment is due to slow database calls - Database operations are taking between 5 to 10 seconds to complete diff --git a/tests/llm/fixtures/test_ask_holmes/09_crashpod/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/09_crashpod/test_case.yaml index 249ac71f0c..0d0368b98e 100644 --- a/tests/llm/fixtures/test_ask_holmes/09_crashpod/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/09_crashpod/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "What is the issue with payment-processing-worker?" +user_prompt: "What is the issue with payment-processing-worker in Azure Prod cluster?" 
expected_output: - The `DEPLOY_ENV` environment variable is undefined or missing before_test: | diff --git a/tests/llm/fixtures/test_ask_holmes/10_image_pull_backoff/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/10_image_pull_backoff/test_case.yaml index 40f6ca32e3..94ba06fb52 100644 --- a/tests/llm/fixtures/test_ask_holmes/10_image_pull_backoff/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/10_image_pull_backoff/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: What is the issue with customer-relations-webapp? +user_prompt: What is the issue with customer-relations-webapp in Azure Prod cluster? expected_output: - The Docker image `yourcompany/crw:latest` cannot be pulled before_test: | diff --git a/tests/llm/fixtures/test_ask_holmes/11_init_containers/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/11_init_containers/test_case.yaml index 4f2ed9fad9..947d5cbb19 100644 --- a/tests/llm/fixtures/test_ask_holmes/11_init_containers/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/11_init_containers/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "What is the issue with logging-agent?" +user_prompt: "What is the issue with logging-agent in Azure Prod cluster?" expected_output: - The command `wge` is not found - The answer should suggets `wge` may be a typo for `wget`. diff --git a/tests/llm/fixtures/test_ask_holmes/12_job_crashing/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/12_job_crashing/test_case.yaml index fe7dd1d7ab..28fdc6a3c8 100644 --- a/tests/llm/fixtures/test_ask_holmes/12_job_crashing/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/12_job_crashing/test_case.yaml @@ -1,4 +1,4 @@ -user_prompt: "What is the issue with job java-api-checker in namespace app-12" +user_prompt: "What is the issue with job java-api-checker in namespace app-12 in Azure Prod cluster?" expected_output: - The `java-api-checker` job repeatedly fails to connect to the database at `prod-db:3333` before_test: | @@ -18,8 +18,8 @@ before_test: | LOGS_READY=true break else - echo "⏳ Attempt $i/20: waiting for job pod with specific log lines, checking in 3s..." - sleep 3 + echo "⏳ Attempt $i/20: waiting for job pod with specific log lines, checking in 10s..." + sleep 10 fi done if [ "$LOGS_READY" = false ]; then From ad70f2565f1ebcc90a4b8eb7a96efe059882b6db Mon Sep 17 00:00:00 2001 From: Dipesh Mittal Date: Mon, 16 Feb 2026 18:18:57 +0530 Subject: [PATCH 2/2] added benchmarking guides --- benchmarks/BENCHMARKING_GUIDE.md | 267 +++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 benchmarks/BENCHMARKING_GUIDE.md diff --git a/benchmarks/BENCHMARKING_GUIDE.md b/benchmarks/BENCHMARKING_GUIDE.md new file mode 100644 index 0000000000..2f260b733b --- /dev/null +++ b/benchmarks/BENCHMARKING_GUIDE.md @@ -0,0 +1,267 @@ +# Benchmarking Guide + +This guide covers how to run benchmark tests using the DrDroid agent against HolmesGPT's evaluation test cases, generate reports, and review results in the Streamlit dashboard. + +## Prerequisites + +- Python 3.10+ +- A running Kubernetes cluster with `kubectl` configured +- API keys for the LLM judge (OpenAI) and DrDroid agent +- Install extra dependencies: + ```bash + pip install streamlit pandas pyyaml requests + ``` + +## 1. 
Setup Credentials + +```bash +cp benchmarks/config/credentials.yaml.template benchmarks/config/credentials.yaml +``` + +Edit `benchmarks/config/credentials.yaml` and fill in: + +```yaml +# Required for the DrDroid agent +custom: + drdroid: + api_url: http://your-drdroid-api-url + api_key: your-drdroid-api-key + +# Required for the LLM judge that scores results +openai: + api_key: sk-... + +judge: + model: gpt-4.1 + +# Required for Kubernetes-based tests +kubernetes: + kubeconfig: ~/.kube/config + context: your-cluster-context +``` + +Alternatively, set environment variables (these override the YAML file): + +```bash +export DRDROID_API_URL=http://your-drdroid-api-url +export DRDROID_API_KEY=your-drdroid-api-key +export OPENAI_API_KEY=sk-... +export CLASSIFIER_MODEL=gpt-4.1 +``` + +## 2. Running Benchmark Tests + +### List Available Tests + +```bash +python benchmarks/executor.py --list-tests +``` + +### Run a Single Test + +```bash +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods +``` + +The `--model` flag is **required** and labels which model the agent is using (used for tracking/comparison). The `--agent` flag selects the agent implementation (defaults to `drdroid`). + +### Run Multiple Specific Tests + +```bash +python benchmarks/executor.py --model sonnet4.5 --agent drdroid \ + --test-id 01_how_many_pods \ + --test-id 02_what_is_wrong_with_pod \ + --test-id 09_crashpod +``` + +### Run All Tests + +```bash +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --all +``` + +### Run Tests by Tag + +```bash +# Run only Kubernetes tests +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --tag kubernetes + +# Run easy/regression tests +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --tag easy + +# Multiple tags (OR logic) +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --tag kubernetes --tag prometheus +``` + +### Skip Setup or Cleanup + +Useful for iterative debugging: + +```bash +# Skip infrastructure setup (if resources are already running) +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods --skip-setup + +# Skip cleanup (keep infrastructure running after test) +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id 01_how_many_pods --skip-cleanup +``` + +## 3. Available Agents + +| Agent | Description | +|------------- |--------------------------------------------------| +| `drdroid` | DrDroid Investigation API | +| `holmes` | HolmesGPT ToolCallingLLM | +| `claudecode` | Local Claude Code CLI with read-only kubectl | +| `openai` | Simple OpenAI completion (no tools) | + +To compare agents, run the same tests with different `--agent` flags: + +```bash +python benchmarks/executor.py --model sonnet4.5 --agent drdroid --all +python benchmarks/executor.py --model sonnet4.5 --agent holmes --all +``` + +## 4. Generating Reports + +### CLI Summary + +```bash +python benchmarks/reporter.py --summary +``` + +Shows overall pass rate, timing, and coverage stats. + +### Model Comparison + +```bash +python benchmarks/reporter.py --compare-models +``` + +Side-by-side comparison of pass rates across models. + +### Report by Test Case + +```bash +python benchmarks/reporter.py --by-test +``` + +Breakdown of results per test case, showing which models passed/failed each one. 
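+
+### Scripted Reports
+
+The report CLI is a thin wrapper over plain functions in `benchmarks/reporter.py`, so reports can also be generated from your own scripts (for example in CI). A minimal sketch, assuming the `benchmarks` package is importable from the repository root; the `since` value is only an example filter:
+
+```python
+from benchmarks.reporter import (
+    load_all_results,
+    filter_results,
+    generate_model_comparison,
+    print_model_comparison,
+)
+
+# Load every result JSON from benchmarks/results/
+results = load_all_results()
+
+# Keep only runs since a given date (ISO format, same as --since)
+recent = filter_results(results, since="2026-01-30")
+
+# Build and print the same table as --compare-models
+report = generate_model_comparison(recent)
+print_model_comparison(report)
+```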
### Filter Results

```bash
# Results for a specific model only
python benchmarks/reporter.py --summary --model sonnet4.5

# Results since a specific date
python benchmarks/reporter.py --summary --since 2026-01-30

# Only failed tests
python benchmarks/reporter.py --detailed --status failed

# Specific test case
python benchmarks/reporter.py --test-id 01_how_many_pods
```

### Export to File

```bash
# JSON export
python benchmarks/reporter.py --summary --output report.json

# CSV export
python benchmarks/reporter.py --compare-models --output comparison.csv
python benchmarks/reporter.py --by-test --output tests.csv
```

## 5. Reviewing Results in the Streamlit Dashboard

Launch the interactive dashboard:

```bash
streamlit run benchmarks/dashboard.py
```

Or pin the port explicitly (8501 is Streamlit's default):

```bash
streamlit run benchmarks/dashboard.py --server.port 8501
```

### Dashboard Sections

- **Overview** - Total runs, pass rate, cost, and token usage summary
- **Model Comparison** - Side-by-side comparison of all models tested
- **Test Cases** - Per-test-case analysis with per-model breakdown
- **Use Case x Model Matrix** - Pivot table showing status/metrics for every test-model combination
- **Raw Results** - Detailed view with filtering and drill-down into individual runs

### Dashboard Features

- **Refresh Data** button to reload latest results
- Filter by model, status, and date range
- Download CSV exports directly from the UI
- View detailed output, judge rationale, and errors for any run

## 6. Understanding Results

Each test run produces a JSON file in `benchmarks/results/`:

```
results/
  sonnet4.5_01_how_many_pods_20260130_163000.json
  sonnet4.5_02_what_is_wrong_with_pod_20260130_163100.json
  ...
```

Key fields in each result:

| Field             | Description                                 |
|-------------------|---------------------------------------------|
| `status`          | `passed`, `failed`, `setup_failed`, `error` |
| `score`           | 0.0 to 1.0 score from the LLM judge         |
| `judge_rationale` | The judge's explanation for the score       |
| `actual_output`   | The agent's raw response                    |
| `agent_time`      | Time the agent took to respond              |
| `setup_time`      | Time for infrastructure setup               |
| `tool_calls`      | Tools the agent invoked                     |

## 7. Adding More Tests

Test cases live in `tests/llm/fixtures/test_ask_holmes/`. Each test is a directory containing a `test_case.yaml`:

```yaml
user_prompt: "Your question here?"
expected_output:
  - "What the judge should check for"
  - "Another expected fact"
tags:
  - kubernetes
  - easy
before_test: |
  # Bash script to set up infrastructure
  kubectl apply -f manifests.yaml
after_test: |
  # Bash script to clean up
  kubectl delete -f manifests.yaml
```

After adding a test, verify it appears:

```bash
python benchmarks/executor.py --list-tests
```

Then run it:

```bash
python benchmarks/executor.py --model sonnet4.5 --agent drdroid --test-id your_new_test
```

## 8. Branch-Specific Changes

The `benchmarking-drdroid-agent` branch includes these changes over `master`:

1. **New `benchmarks/` module** - Complete benchmarking framework with executor, agent registry, reporter, config management, and Streamlit dashboard
2. **Updated test prompts** - Added "in Azure Prod cluster" context to test prompts (tests 01, 02, 04, 05, 07, 09, 10, 11, 12) for more realistic DrDroid agent evaluation
3. **Increased setup timeouts** - Longer wait times in `before_test` scripts for tests 01 and 12 to handle slower cluster environments (see the sketch below)
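For context on item 3, the `before_test` scripts already poll and retry while waiting for workloads to become ready; the branch only lengthens the sleep interval. The sketch below reconstructs that pattern from the test 12 diff. The `kubectl logs ... | grep` condition and the failure handling are assumptions for illustration, not copied from the branch:

```bash
# Sketch of the wait-loop pattern used in before_test scripts (based on test 12).
# The job name, namespace, and log pattern below are assumptions for illustration.
LOGS_READY=false
for i in $(seq 1 20); do
  if kubectl logs job/java-api-checker -n app-12 2>/dev/null | grep -q "prod-db:3333"; then
    LOGS_READY=true
    break
  else
    echo "⏳ Attempt $i/20: waiting for job pod with specific log lines, checking in 10s..."
    sleep 10
  fi
done

if [ "$LOGS_READY" = false ]; then
  echo "Job logs never appeared; failing setup" >&2
  exit 1
fi
```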