From 69f1fa7914d87b11120aa3ff3ee10cfc3542db89 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 14:27:50 +0100 Subject: [PATCH 1/8] AI-2588: add KaiBench evaluation workflow to CI pipeline Add kaibench.yml reusable workflow that builds MCP server from the PR branch, starts the full stack (MCP server + kai-assistant + Postgres + Redis), and runs KaiBench evaluations. Triggered as a non-blocking job in ci.yml after the build passes (same-repo pushes only). Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 8 + .github/workflows/kaibench.yml | 399 +++++++++++++++++++++++++++++++++ 2 files changed, 407 insertions(+) create mode 100644 .github/workflows/kaibench.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 645e1af7..1245abd9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -147,6 +147,14 @@ jobs: path: ./integtest-results.xml reporter: 'java-junit' + kaibench: + name: KaiBench Evaluation + needs: build + # Run only for same-repo pushes (not fork PRs, which lack secrets) + if: github.event_name == 'push' && github.repository == 'keboola/keboola-mcp-server' + uses: ./.github/workflows/kaibench.yml + secrets: inherit + deploy_to_pypi: name: Deploy to pypi.org needs: diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml new file mode 100644 index 00000000..670c300d --- /dev/null +++ b/.github/workflows/kaibench.yml @@ -0,0 +1,399 @@ +name: KaiBench Evaluation + +on: + workflow_dispatch: + inputs: + question_types: + description: 'Question types (comma-separated, or "all")' + default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning' + type: string + regression_only: + description: 'Only regression-flagged questions' + type: boolean + default: false + kai_assistant_image_tag: + description: 'Pre-built kai-assistant Docker Hub tag (leave empty to build from main)' + required: false + type: string + default: '' + workflow_call: + inputs: + question_types: + description: 'Question types (comma-separated)' + required: false + type: string + default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning' + regression_only: + description: 'Only regression-flagged questions' + required: false + type: boolean + default: false + kai_assistant_image_tag: + description: 'Pre-built kai-assistant Docker Hub tag (leave empty to build from main)' + required: false + type: string + default: '' + secrets: + KAIBENCH_STATIC_TOKEN: + required: true + KAIBENCH_MANAGEMENT_TOKEN: + required: false + KAIBENCH_API_URL: + required: true + KAIBENCH_REPO_TOKEN: + required: true + DOCKERHUB_TOKEN: + required: true + KAI_GOOGLE_VERTEX_CREDENTIALS: + required: true + KAI_GOOGLE_VERTEX_PROJECT: + required: true + KAI_GOOGLE_VERTEX_LOCATION: + required: true + TURBO_TOKEN: + required: false + outputs: + status: + description: 'Test status (passed, failed, error)' + value: ${{ jobs.evaluate.outputs.status }} + passed: + value: ${{ jobs.evaluate.outputs.passed }} + failed: + value: ${{ jobs.evaluate.outputs.failed }} + total: + value: ${{ jobs.evaluate.outputs.total }} + pass_rate: + value: ${{ jobs.evaluate.outputs.pass_rate }} + duration: + value: ${{ jobs.evaluate.outputs.duration }} + +jobs: + evaluate: + name: Run KaiBench evaluation + runs-on: ubuntu-latest + timeout-minutes: 60 + outputs: + status: ${{ steps.parse.outputs.status }} + passed: ${{ steps.parse.outputs.passed }} + failed: ${{ steps.parse.outputs.failed }} + total: ${{ steps.parse.outputs.total }} + pass_rate: ${{ 
steps.parse.outputs.pass_rate }} + duration: ${{ steps.parse.outputs.duration }} + + services: + postgres: + image: postgres:16 + env: + POSTGRES_DB: kai_db + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + ports: + - 5432:5432 + options: --health-cmd="pg_isready -U postgres" --health-interval=10s --health-timeout=5s --health-retries=5 + redis: + image: redis:7-alpine + ports: + - 6379:6379 + options: --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=5 + + steps: + # ── Checkouts ────────────────────────────────────────────── + - name: Checkout MCP server + uses: actions/checkout@v4 + with: + path: mcp-server + + - name: Checkout KaiBench + uses: actions/checkout@v4 + with: + repository: keboola-rnd/KaiBench + token: ${{ secrets.KAIBENCH_REPO_TOKEN }} + path: kaibench + + - name: Checkout UI repo (for kai-assistant build) + if: inputs.kai_assistant_image_tag == '' + uses: actions/checkout@v4 + with: + repository: keboola/ui + token: ${{ secrets.KAIBENCH_REPO_TOKEN }} + ref: main + path: ui + + # ── Docker setup ─────────────────────────────────────────── + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Docker Hub login + uses: docker/login-action@v3 + with: + username: keboolabot + password: ${{ secrets.DOCKERHUB_TOKEN }} + + # ── Build MCP server from this PR ────────────────────────── + - name: Build MCP server image + uses: docker/build-push-action@v6 + with: + load: true + tags: keboola/mcp-server:kaibench-test + context: ./mcp-server + file: ./mcp-server/Dockerfile + cache-from: type=gha + cache-to: type=gha,mode=max + + # ── kai-assistant image ──────────────────────────────────── + - name: Build kai-assistant from main + if: inputs.kai_assistant_image_tag == '' + uses: docker/build-push-action@v6 + with: + load: true + tags: keboola/kai-assistant:kaibench-test + context: ./ui + file: ./ui/apps/kai-assistant-backend/Dockerfile + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + VERSION=kaibench-test + KEBOOLA_STACK=dev-keboola-canary-orion + KEBOOLA_STORAGE_API_URL=${{ secrets.KAIBENCH_API_URL }} + secrets: | + turbo_token=${{ secrets.TURBO_TOKEN }} + + - name: Pull pre-built kai-assistant image + if: inputs.kai_assistant_image_tag != '' + run: docker pull keboola/kai-assistant:${{ inputs.kai_assistant_image_tag }} + + - name: Resolve kai-assistant image tag + id: resolve-kai-image + run: | + if [ -n "${{ inputs.kai_assistant_image_tag }}" ]; then + echo "tag=${{ inputs.kai_assistant_image_tag }}" >> $GITHUB_OUTPUT + else + echo "tag=kaibench-test" >> $GITHUB_OUTPUT + fi + + # ── Start services ───────────────────────────────────────── + - name: Start MCP server + run: | + docker run -d --name mcp-server --network host \ + keboola/mcp-server:kaibench-test \ + --transport streamable-http --host 0.0.0.0 --port 8000 + echo "Waiting for MCP server..." 
+ for i in $(seq 1 30); do + if curl -so /dev/null http://localhost:8000/mcp 2>&1; then + echo "MCP server is ready" + break + fi + if [ $i -eq 30 ]; then + echo "::error::MCP server failed to start" + docker logs mcp-server + exit 1 + fi + sleep 2 + done + + - name: Run database migrations + env: + KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} + run: | + docker run --rm --network host \ + -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ + --workdir /app/apps/kai-assistant-backend \ + keboola/kai-assistant:$KAI_IMAGE_TAG \ + node dist/lib/db/migrate.js + + - name: Start kai-assistant + env: + KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} + KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} + GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.KAI_GOOGLE_VERTEX_CREDENTIALS }} + GOOGLE_VERTEX_PROJECT: ${{ secrets.KAI_GOOGLE_VERTEX_PROJECT }} + GOOGLE_VERTEX_LOCATION: ${{ secrets.KAI_GOOGLE_VERTEX_LOCATION }} + run: | + AUTH_SECRET=$(openssl rand -base64 32) + docker run -d --name kai-assistant --network host \ + -e AUTH_SECRET="$AUTH_SECRET" \ + -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ + -e REDIS_URL="redis://localhost:6379" \ + -e MCP_SERVER_URL="http://localhost:8000/mcp" \ + -e KEBOOLA_STORAGE_API_URL="$KAIBENCH_API_URL" \ + -e KEBOOLA_STACK="dev-keboola-canary-orion" \ + -e CLOUD_LLM_PROVIDER="google-vertex" \ + -e GOOGLE_SERVICE_ACCOUNT_JSON="$GOOGLE_SERVICE_ACCOUNT_JSON" \ + -e GOOGLE_VERTEX_PROJECT="$GOOGLE_VERTEX_PROJECT" \ + -e GOOGLE_VERTEX_LOCATION="$GOOGLE_VERTEX_LOCATION" \ + keboola/kai-assistant:$KAI_IMAGE_TAG + echo "Waiting for kai-assistant..." + for i in $(seq 1 60); do + if curl -sf http://localhost:3000/ping > /dev/null 2>&1; then + echo "kai-assistant is ready" + break + fi + if [ $i -eq 60 ]; then + echo "::error::kai-assistant failed to start" + docker logs kai-assistant + exit 1 + fi + sleep 2 + done + + # ── Run evaluation ───────────────────────────────────────── + - name: Setup uv + uses: astral-sh/setup-uv@v5 + + - name: Install Python + run: uv python install 3.11 + + - name: Install dependencies + working-directory: kaibench + run: uv sync + + - name: Run evaluation + working-directory: kaibench + env: + KAIBENCH_STATIC_TOKEN: ${{ secrets.KAIBENCH_STATIC_TOKEN }} + KAIBENCH_STATIC_HOST: connection.canary-orion.keboola.dev + KAIBENCH_STATIC_PROJECT_ID: '293' + KAIBENCH_MANAGEMENT_TOKEN: ${{ secrets.KAIBENCH_MANAGEMENT_TOKEN }} + KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} + KAIBENCH_ORGANIZATION_ID: '58' + KAIBENCH_EVAL_PARALLEL_WORKERS: '4' + KAIBENCH_EVAL_KAI_BACKEND_URL: http://localhost:3000 + run: | + TYPES="${{ inputs.question_types }}" + CMD="uv run kaibench run" + # Split comma-separated types into -t flags + IFS=',' read -ra TYPE_ARRAY <<< "$TYPES" + for t in "${TYPE_ARRAY[@]}"; do + trimmed=$(echo "$t" | xargs) + CMD="$CMD -t \"$trimmed\"" + done + if [ "${{ inputs.regression_only }}" = "true" ]; then + CMD="$CMD --regression-only" + fi + eval $CMD + continue-on-error: true + + # ── Results ──────────────────────────────────────────────── + - name: Parse results + id: parse + if: always() + working-directory: kaibench + run: | + LATEST=$(ls -td results/run_* 2>/dev/null | head -1) + if [ -z "$LATEST" ]; then + echo "status=error" >> $GITHUB_OUTPUT + echo "passed=0" >> $GITHUB_OUTPUT + echo "failed=0" >> $GITHUB_OUTPUT + echo "total=0" >> $GITHUB_OUTPUT + echo "pass_rate=0" >> $GITHUB_OUTPUT + echo "duration=0" >> $GITHUB_OUTPUT + exit 0 + fi + python3 << 'PYEOF' >> $GITHUB_OUTPUT 
+ import json + from pathlib import Path + + latest = sorted(Path('results').glob('run_*'))[-1] + s = json.loads((latest / 'summary.json').read_text()) + m = s['metrics'] + + print(f"passed={m['passed']}") + print(f"failed={m['failed']}") + print(f"total={m['total_questions']}") + print(f"pass_rate={m['overall_pass_rate']:.2f}") + print(f"duration={m['duration_seconds']:.0f}") + status = 'passed' if m['failed'] == 0 and m['errors'] == 0 else 'failed' + print(f"status={status}") + PYEOF + + - name: Step summary + if: always() + working-directory: kaibench + run: | + LATEST=$(ls -td results/run_* 2>/dev/null | head -1) + [ -z "$LATEST" ] && exit 0 + python3 << 'PYEOF' >> $GITHUB_STEP_SUMMARY + import json, os + from pathlib import Path + + latest = sorted(Path('results').glob('run_*'))[-1] + s = json.loads((latest / 'summary.json').read_text()) + m = s['metrics'] + + results = [] + results_file = latest / 'results.jsonl' + if results_file.exists(): + for line in results_file.read_text().splitlines(): + if line.strip(): + results.append(json.loads(line)) + + evaluated = [r for r in results if r.get('status') != 'skipped'] + total_tools = sum(len(r.get('trace', {}).get('tool_calls', [])) for r in evaluated) + avg_tools = total_tools / len(evaluated) if evaluated else 0 + avg_duration = sum(r.get('duration_ms', 0) for r in evaluated) / len(evaluated) / 1000 if evaluated else 0 + total_tokens = sum(r.get('trace', {}).get('total_tokens', 0) or 0 for r in evaluated) + avg_tokens = total_tokens / len(evaluated) if evaluated else 0 + + print('## KaiBench Evaluation Results') + print() + print('| Metric | Value |') + print('|--------|-------|') + print(f'| Run ID | `{s["run_id"]}` |') + print(f'| MCP Server | `PR build` |') + print(f'| Duration | {m["duration_seconds"]:.0f}s |') + print(f'| **Total** | **{m["total_questions"]}** |') + print(f'| Passed | {m["passed"]} |') + print(f'| Failed | {m["failed"]} |') + print(f'| Skipped | {m["skipped"]} |') + print(f'| Errors | {m["errors"]} |') + print(f'| **Pass Rate** | **{m["overall_pass_rate"]:.1%}** |') + print(f'| Total Tool Calls | {total_tools} |') + print(f'| Avg Tool Calls/Question | {avg_tools:.1f} |') + print(f'| Avg Duration/Question | {avg_duration:.0f}s |') + print(f'| Total Tokens | {total_tokens:,} |') + print(f'| Avg Tokens/Question | {avg_tokens:,.0f} |') + + for t in s.get('by_question_type', []): + print() + print(f'### {t["question_type"]}') + print(f'{t["passed_count"]}/{t["total_count"]} passed ({t["pass_rate"]:.1%})') + if t['skipped_count']: + print(f'_{t["skipped_count"]} skipped_') + + if evaluated: + print() + print('### Per-Question Results') + print() + print('| Q | Type | Status | Tools | Tokens | Duration | Expected | Extracted | Notes |') + print('|---|------|--------|-------|--------|----------|----------|-----------|-------|') + for r in sorted(evaluated, key=lambda x: int(x.get('question_id', 0))): + qid = r.get('question_id', '?') + qtype = (r.get('question_type') or '')[:12] + status = r.get('status', '?') + emoji = {'passed': ':white_check_mark:', 'failed': ':x:', 'error': ':warning:'}.get(status, status) + tools = len(r.get('trace', {}).get('tool_calls', [])) + tokens = r.get('trace', {}).get('total_tokens', 0) or 0 + tokens_str = f'{tokens:,}' if tokens else '-' + dur = f'{r.get("duration_ms", 0)/1000:.0f}s' + expected = str(r.get('expected_answer') or '-')[:25] + extracted = str(r.get('extracted_answer') or '-')[:25] + notes = (r.get('verification', {}).get('notes') or '')[:40] + print(f'| {qid} | {qtype} | {emoji} 
| {tools} | {tokens_str} | {dur} | {expected} | {extracted} | {notes} |') + PYEOF + + - name: Dump container logs + if: always() + run: | + echo "=== kai-assistant logs ===" + docker logs kai-assistant 2>&1 || true + echo "" + echo "=== mcp-server logs ===" + docker logs mcp-server 2>&1 || true + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: kaibench-results-${{ github.run_id }} + path: kaibench/results/ + retention-days: 90 From 18b8dd1a8a2a44169cecc2ddd7eb32b46d8c9521 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 14:35:25 +0100 Subject: [PATCH 2/8] AI-2588: add permissions block to kaibench workflow Restricts GITHUB_TOKEN to read-only contents access to satisfy CodeQL security policy. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/kaibench.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 670c300d..76ffb263 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -1,5 +1,8 @@ name: KaiBench Evaluation +permissions: + contents: read + on: workflow_dispatch: inputs: From 58d339ef6a76394a771b877aa684458a459d9b0f Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 16:58:39 +0100 Subject: [PATCH 3/8] AI-2588: add missing requests dependency cli.py imports requests (for requests.JSONDecodeError handler) but it was not declared in pyproject.toml. This caused test collection to fail on Python 3.11 in CI where no transitive dependency pulls it in. Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 1 + uv.lock | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e3c7f14b..cc90be67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "sqlglot ~= 28.5", "toon-format ~= 0.9.0b1", "pyyaml ~= 6.0", + "requests ~= 2.32", ] [project.optional-dependencies] codestyle = [ diff --git a/uv.lock b/uv.lock index 20ed7b74..5f221532 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -1237,6 +1237,7 @@ dependencies = [ { name = "pydantic" }, { name = "pyjwt" }, { name = "pyyaml" }, + { name = "requests" }, { name = "sqlglot" }, { name = "toon-format" }, ] @@ -1303,6 +1304,7 @@ requires-dist = [ { name = "python-dateutil", marker = "extra == 'tests'", specifier = "~=2.9" }, { name = "python-dotenv", marker = "extra == 'tests'", specifier = "~=1.2" }, { name = "pyyaml", specifier = "~=6.0" }, + { name = "requests", specifier = "~=2.32" }, { name = "sqlglot", specifier = "~=28.5" }, { name = "toon-format", specifier = "~=0.9.0b1" }, { name = "tox", marker = "extra == 'dev'", specifier = "~=4.32" }, @@ -2489,8 +2491,8 @@ name = "taskgroup" version = "0.2.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "exceptiongroup" }, - { name = "typing-extensions" }, + { name = "exceptiongroup", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f0/8d/e218e0160cc1b692e6e0e5ba34e8865dbb171efeb5fc9a704544b3020605/taskgroup-0.2.2.tar.gz", hash = "sha256:078483ac3e78f2e3f973e2edbf6941374fbea81b9c5d0a96f51d297717f4752d", size = 11504, upload-time = "2025-01-03T09:24:13.761Z" } wheels = [ From 4304763d31edff3834326ae2935018483e63e3ef Mon Sep 17 00:00:00 2001 From: 
jordanrburger Date: Sat, 21 Feb 2026 17:03:22 +0100 Subject: [PATCH 4/8] AI-2588: fix repo name in kaibench job condition The repo was renamed from keboola/keboola-mcp-server to keboola/mcp-server, causing the if condition to evaluate false. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1245abd9..58706e3c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -151,7 +151,7 @@ jobs: name: KaiBench Evaluation needs: build # Run only for same-repo pushes (not fork PRs, which lack secrets) - if: github.event_name == 'push' && github.repository == 'keboola/keboola-mcp-server' + if: github.event_name == 'push' && github.repository == 'keboola/mcp-server' uses: ./.github/workflows/kaibench.yml secrets: inherit From a9ac2ff2fa40b5386fb4b13e2586c675ea140de3 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 17:24:37 +0100 Subject: [PATCH 5/8] AI-2588: remove kai-assistant build-from-source path Always use pre-built kai-assistant Docker image tag instead of building from the UI repo source. This removes the need for UI repo access and TURBO_TOKEN. The image tag is provided via vars.KAI_ASSISTANT_IMAGE_TAG. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 2 ++ .github/workflows/kaibench.yml | 52 ++++------------------------------ 2 files changed, 8 insertions(+), 46 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 58706e3c..cac7de0a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,6 +153,8 @@ jobs: # Run only for same-repo pushes (not fork PRs, which lack secrets) if: github.event_name == 'push' && github.repository == 'keboola/mcp-server' uses: ./.github/workflows/kaibench.yml + with: + kai_assistant_image_tag: ${{ vars.KAI_ASSISTANT_IMAGE_TAG }} secrets: inherit deploy_to_pypi: diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 76ffb263..97ba6d53 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -15,10 +15,9 @@ on: type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag (leave empty to build from main)' - required: false + description: 'Pre-built kai-assistant Docker Hub tag' + required: true type: string - default: '' workflow_call: inputs: question_types: @@ -32,10 +31,9 @@ on: type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag (leave empty to build from main)' - required: false + description: 'Pre-built kai-assistant Docker Hub tag' + required: true type: string - default: '' secrets: KAIBENCH_STATIC_TOKEN: required: true @@ -53,8 +51,6 @@ on: required: true KAI_GOOGLE_VERTEX_LOCATION: required: true - TURBO_TOKEN: - required: false outputs: status: description: 'Test status (passed, failed, error)' @@ -113,15 +109,6 @@ jobs: token: ${{ secrets.KAIBENCH_REPO_TOKEN }} path: kaibench - - name: Checkout UI repo (for kai-assistant build) - if: inputs.kai_assistant_image_tag == '' - uses: actions/checkout@v4 - with: - repository: keboola/ui - token: ${{ secrets.KAIBENCH_REPO_TOKEN }} - ref: main - path: ui - # ── Docker setup ─────────────────────────────────────────── - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -144,36 +131,9 @@ jobs: cache-to: type=gha,mode=max # ── kai-assistant image ──────────────────────────────────── - - name: Build kai-assistant from 
main - if: inputs.kai_assistant_image_tag == '' - uses: docker/build-push-action@v6 - with: - load: true - tags: keboola/kai-assistant:kaibench-test - context: ./ui - file: ./ui/apps/kai-assistant-backend/Dockerfile - cache-from: type=gha - cache-to: type=gha,mode=max - build-args: | - VERSION=kaibench-test - KEBOOLA_STACK=dev-keboola-canary-orion - KEBOOLA_STORAGE_API_URL=${{ secrets.KAIBENCH_API_URL }} - secrets: | - turbo_token=${{ secrets.TURBO_TOKEN }} - - name: Pull pre-built kai-assistant image - if: inputs.kai_assistant_image_tag != '' run: docker pull keboola/kai-assistant:${{ inputs.kai_assistant_image_tag }} - - name: Resolve kai-assistant image tag - id: resolve-kai-image - run: | - if [ -n "${{ inputs.kai_assistant_image_tag }}" ]; then - echo "tag=${{ inputs.kai_assistant_image_tag }}" >> $GITHUB_OUTPUT - else - echo "tag=kaibench-test" >> $GITHUB_OUTPUT - fi - # ── Start services ───────────────────────────────────────── - name: Start MCP server run: | @@ -196,7 +156,7 @@ jobs: - name: Run database migrations env: - KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} + KAI_IMAGE_TAG: ${{ inputs.kai_assistant_image_tag }} run: | docker run --rm --network host \ -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ @@ -206,7 +166,7 @@ jobs: - name: Start kai-assistant env: - KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} + KAI_IMAGE_TAG: ${{ inputs.kai_assistant_image_tag }} KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.KAI_GOOGLE_VERTEX_CREDENTIALS }} GOOGLE_VERTEX_PROJECT: ${{ secrets.KAI_GOOGLE_VERTEX_PROJECT }} From 1bbd159da228a25f63d668838b78ce9329bc2ca0 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 17:28:17 +0100 Subject: [PATCH 6/8] AI-2588: auto-resolve latest kai-assistant tag from Docker Hub Instead of requiring a static image tag, the workflow now queries Docker Hub for the latest production-kai-assi-* tag when none is provided. This ensures CI always tests against the newest kai-assistant without manual variable updates. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 2 -- .github/workflows/kaibench.yml | 42 +++++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cac7de0a..58706e3c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,8 +153,6 @@ jobs: # Run only for same-repo pushes (not fork PRs, which lack secrets) if: github.event_name == 'push' && github.repository == 'keboola/mcp-server' uses: ./.github/workflows/kaibench.yml - with: - kai_assistant_image_tag: ${{ vars.KAI_ASSISTANT_IMAGE_TAG }} secrets: inherit deploy_to_pypi: diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 97ba6d53..37302330 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -15,9 +15,10 @@ on: type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag' - required: true + description: 'Pre-built kai-assistant Docker Hub tag (leave empty to auto-resolve latest production tag)' + required: false type: string + default: '' workflow_call: inputs: question_types: @@ -31,9 +32,10 @@ on: type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag' - required: true + description: 'Pre-built kai-assistant Docker Hub tag (leave empty to auto-resolve latest production tag)' + required: false type: string + default: '' secrets: KAIBENCH_STATIC_TOKEN: required: true @@ -131,8 +133,32 @@ jobs: cache-to: type=gha,mode=max # ── kai-assistant image ──────────────────────────────────── - - name: Pull pre-built kai-assistant image - run: docker pull keboola/kai-assistant:${{ inputs.kai_assistant_image_tag }} + - name: Resolve kai-assistant image tag + id: resolve-kai-image + env: + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + run: | + if [ -n "${{ inputs.kai_assistant_image_tag }}" ]; then + echo "tag=${{ inputs.kai_assistant_image_tag }}" >> $GITHUB_OUTPUT + echo "Using provided tag: ${{ inputs.kai_assistant_image_tag }}" + else + echo "No tag provided, resolving latest production tag from Docker Hub..." 
+ TOKEN=$(curl -sf -H "Content-Type: application/json" \ + -X POST -d "{\"username\":\"keboolabot\",\"password\":\"$DOCKERHUB_TOKEN\"}" \ + https://hub.docker.com/v2/users/login/ | jq -r .token) + TAG=$(curl -sf -H "Authorization: JWT $TOKEN" \ + "https://hub.docker.com/v2/repositories/keboola/kai-assistant/tags/?page_size=1&ordering=-last_updated&name=production-kai-assi" \ + | jq -r '.results[0].name') + if [ -z "$TAG" ] || [ "$TAG" = "null" ]; then + echo "::error::Failed to resolve latest kai-assistant production tag" + exit 1 + fi + echo "tag=$TAG" >> $GITHUB_OUTPUT + echo "Resolved latest production tag: $TAG" + fi + + - name: Pull kai-assistant image + run: docker pull keboola/kai-assistant:${{ steps.resolve-kai-image.outputs.tag }} # ── Start services ───────────────────────────────────────── - name: Start MCP server @@ -156,7 +182,7 @@ jobs: - name: Run database migrations env: - KAI_IMAGE_TAG: ${{ inputs.kai_assistant_image_tag }} + KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} run: | docker run --rm --network host \ -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ @@ -166,7 +192,7 @@ jobs: - name: Start kai-assistant env: - KAI_IMAGE_TAG: ${{ inputs.kai_assistant_image_tag }} + KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.KAI_GOOGLE_VERTEX_CREDENTIALS }} GOOGLE_VERTEX_PROJECT: ${{ secrets.KAI_GOOGLE_VERTEX_PROJECT }} From 8eb32d9a1b96ca96e630428147b9bdcabcdc9671 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 17:36:08 +0100 Subject: [PATCH 7/8] AI-2588: add debug output to Docker Hub tag resolution Remove silent curl flags to surface errors when Docker Hub API calls fail during kai-assistant tag resolution. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/kaibench.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 37302330..4131ae49 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -143,14 +143,22 @@ jobs: echo "Using provided tag: ${{ inputs.kai_assistant_image_tag }}" else echo "No tag provided, resolving latest production tag from Docker Hub..." 
- TOKEN=$(curl -sf -H "Content-Type: application/json" \ + LOGIN_RESPONSE=$(curl -s -H "Content-Type: application/json" \ -X POST -d "{\"username\":\"keboolabot\",\"password\":\"$DOCKERHUB_TOKEN\"}" \ - https://hub.docker.com/v2/users/login/ | jq -r .token) - TAG=$(curl -sf -H "Authorization: JWT $TOKEN" \ - "https://hub.docker.com/v2/repositories/keboola/kai-assistant/tags/?page_size=1&ordering=-last_updated&name=production-kai-assi" \ - | jq -r '.results[0].name') + https://hub.docker.com/v2/users/login/) + TOKEN=$(echo "$LOGIN_RESPONSE" | jq -r .token) + if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then + echo "::error::Docker Hub login failed" + echo "Response: $LOGIN_RESPONSE" | head -c 200 + exit 1 + fi + echo "Docker Hub login successful" + TAGS_RESPONSE=$(curl -s -H "Authorization: JWT $TOKEN" \ + "https://hub.docker.com/v2/repositories/keboola/kai-assistant/tags/?page_size=1&ordering=-last_updated&name=production-kai-assi") + TAG=$(echo "$TAGS_RESPONSE" | jq -r '.results[0].name') if [ -z "$TAG" ] || [ "$TAG" = "null" ]; then echo "::error::Failed to resolve latest kai-assistant production tag" + echo "Response: $TAGS_RESPONSE" | head -c 500 exit 1 fi echo "tag=$TAG" >> $GITHUB_OUTPUT From 6ad0e8278551e7ccdca9c3b81310538f412424f2 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 18:50:36 +0100 Subject: [PATCH 8/8] AI-2588: replace local eval with KaiBench dispatch trigger Instead of running the full evaluation locally (which needs UI repo access, TURBO_TOKEN, and registry credentials), dispatch the eval to the KaiBench repo where all secrets are centralized. Results are posted back as a commit status on the MCP server repo. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/kaibench.yml | 394 ++------------------------------- 1 file changed, 23 insertions(+), 371 deletions(-) diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 4131ae49..b674d9ae 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -1,5 +1,9 @@ name: KaiBench Evaluation +# Thin trigger that dispatches the full evaluation workflow in the KaiBench repo. +# All infrastructure (kai-assistant build, eval framework, secrets) lives there. +# Results are posted back as a commit status on this repo. 
+ permissions: contents: read @@ -8,389 +12,37 @@ on: inputs: question_types: description: 'Question types (comma-separated, or "all")' - default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning' + default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning,MCP Tool Validation' type: string regression_only: description: 'Only regression-flagged questions' type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag (leave empty to auto-resolve latest production tag)' + description: 'Pre-built kai-assistant image tag (leave empty to build from UI main)' required: false type: string default: '' workflow_call: - inputs: - question_types: - description: 'Question types (comma-separated)' - required: false - type: string - default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning' - regression_only: - description: 'Only regression-flagged questions' - required: false - type: boolean - default: false - kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag (leave empty to auto-resolve latest production tag)' - required: false - type: string - default: '' - secrets: - KAIBENCH_STATIC_TOKEN: - required: true - KAIBENCH_MANAGEMENT_TOKEN: - required: false - KAIBENCH_API_URL: - required: true - KAIBENCH_REPO_TOKEN: - required: true - DOCKERHUB_TOKEN: - required: true - KAI_GOOGLE_VERTEX_CREDENTIALS: - required: true - KAI_GOOGLE_VERTEX_PROJECT: - required: true - KAI_GOOGLE_VERTEX_LOCATION: - required: true - outputs: - status: - description: 'Test status (passed, failed, error)' - value: ${{ jobs.evaluate.outputs.status }} - passed: - value: ${{ jobs.evaluate.outputs.passed }} - failed: - value: ${{ jobs.evaluate.outputs.failed }} - total: - value: ${{ jobs.evaluate.outputs.total }} - pass_rate: - value: ${{ jobs.evaluate.outputs.pass_rate }} - duration: - value: ${{ jobs.evaluate.outputs.duration }} jobs: - evaluate: - name: Run KaiBench evaluation + trigger: + name: Trigger KaiBench evaluation runs-on: ubuntu-latest - timeout-minutes: 60 - outputs: - status: ${{ steps.parse.outputs.status }} - passed: ${{ steps.parse.outputs.passed }} - failed: ${{ steps.parse.outputs.failed }} - total: ${{ steps.parse.outputs.total }} - pass_rate: ${{ steps.parse.outputs.pass_rate }} - duration: ${{ steps.parse.outputs.duration }} - - services: - postgres: - image: postgres:16 - env: - POSTGRES_DB: kai_db - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - ports: - - 5432:5432 - options: --health-cmd="pg_isready -U postgres" --health-interval=10s --health-timeout=5s --health-retries=5 - redis: - image: redis:7-alpine - ports: - - 6379:6379 - options: --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=5 - steps: - # ── Checkouts ────────────────────────────────────────────── - - name: Checkout MCP server - uses: actions/checkout@v4 - with: - path: mcp-server - - - name: Checkout KaiBench - uses: actions/checkout@v4 - with: - repository: keboola-rnd/KaiBench - token: ${{ secrets.KAIBENCH_REPO_TOKEN }} - path: kaibench - - # ── Docker setup ─────────────────────────────────────────── - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Docker Hub login - uses: docker/login-action@v3 - with: - username: keboolabot - password: ${{ secrets.DOCKERHUB_TOKEN }} - - # ── Build MCP server from this PR ────────────────────────── - - name: Build MCP server image - uses: docker/build-push-action@v6 - with: - load: true - 
tags: keboola/mcp-server:kaibench-test - context: ./mcp-server - file: ./mcp-server/Dockerfile - cache-from: type=gha - cache-to: type=gha,mode=max - - # ── kai-assistant image ──────────────────────────────────── - - name: Resolve kai-assistant image tag - id: resolve-kai-image - env: - DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} - run: | - if [ -n "${{ inputs.kai_assistant_image_tag }}" ]; then - echo "tag=${{ inputs.kai_assistant_image_tag }}" >> $GITHUB_OUTPUT - echo "Using provided tag: ${{ inputs.kai_assistant_image_tag }}" - else - echo "No tag provided, resolving latest production tag from Docker Hub..." - LOGIN_RESPONSE=$(curl -s -H "Content-Type: application/json" \ - -X POST -d "{\"username\":\"keboolabot\",\"password\":\"$DOCKERHUB_TOKEN\"}" \ - https://hub.docker.com/v2/users/login/) - TOKEN=$(echo "$LOGIN_RESPONSE" | jq -r .token) - if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then - echo "::error::Docker Hub login failed" - echo "Response: $LOGIN_RESPONSE" | head -c 200 - exit 1 - fi - echo "Docker Hub login successful" - TAGS_RESPONSE=$(curl -s -H "Authorization: JWT $TOKEN" \ - "https://hub.docker.com/v2/repositories/keboola/kai-assistant/tags/?page_size=1&ordering=-last_updated&name=production-kai-assi") - TAG=$(echo "$TAGS_RESPONSE" | jq -r '.results[0].name') - if [ -z "$TAG" ] || [ "$TAG" = "null" ]; then - echo "::error::Failed to resolve latest kai-assistant production tag" - echo "Response: $TAGS_RESPONSE" | head -c 500 - exit 1 - fi - echo "tag=$TAG" >> $GITHUB_OUTPUT - echo "Resolved latest production tag: $TAG" - fi - - - name: Pull kai-assistant image - run: docker pull keboola/kai-assistant:${{ steps.resolve-kai-image.outputs.tag }} - - # ── Start services ───────────────────────────────────────── - - name: Start MCP server - run: | - docker run -d --name mcp-server --network host \ - keboola/mcp-server:kaibench-test \ - --transport streamable-http --host 0.0.0.0 --port 8000 - echo "Waiting for MCP server..." 
- for i in $(seq 1 30); do - if curl -so /dev/null http://localhost:8000/mcp 2>&1; then - echo "MCP server is ready" - break - fi - if [ $i -eq 30 ]; then - echo "::error::MCP server failed to start" - docker logs mcp-server - exit 1 - fi - sleep 2 - done - - - name: Run database migrations - env: - KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} - run: | - docker run --rm --network host \ - -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ - --workdir /app/apps/kai-assistant-backend \ - keboola/kai-assistant:$KAI_IMAGE_TAG \ - node dist/lib/db/migrate.js - - - name: Start kai-assistant - env: - KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} - KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} - GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.KAI_GOOGLE_VERTEX_CREDENTIALS }} - GOOGLE_VERTEX_PROJECT: ${{ secrets.KAI_GOOGLE_VERTEX_PROJECT }} - GOOGLE_VERTEX_LOCATION: ${{ secrets.KAI_GOOGLE_VERTEX_LOCATION }} - run: | - AUTH_SECRET=$(openssl rand -base64 32) - docker run -d --name kai-assistant --network host \ - -e AUTH_SECRET="$AUTH_SECRET" \ - -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ - -e REDIS_URL="redis://localhost:6379" \ - -e MCP_SERVER_URL="http://localhost:8000/mcp" \ - -e KEBOOLA_STORAGE_API_URL="$KAIBENCH_API_URL" \ - -e KEBOOLA_STACK="dev-keboola-canary-orion" \ - -e CLOUD_LLM_PROVIDER="google-vertex" \ - -e GOOGLE_SERVICE_ACCOUNT_JSON="$GOOGLE_SERVICE_ACCOUNT_JSON" \ - -e GOOGLE_VERTEX_PROJECT="$GOOGLE_VERTEX_PROJECT" \ - -e GOOGLE_VERTEX_LOCATION="$GOOGLE_VERTEX_LOCATION" \ - keboola/kai-assistant:$KAI_IMAGE_TAG - echo "Waiting for kai-assistant..." - for i in $(seq 1 60); do - if curl -sf http://localhost:3000/ping > /dev/null 2>&1; then - echo "kai-assistant is ready" - break - fi - if [ $i -eq 60 ]; then - echo "::error::kai-assistant failed to start" - docker logs kai-assistant - exit 1 - fi - sleep 2 - done - - # ── Run evaluation ───────────────────────────────────────── - - name: Setup uv - uses: astral-sh/setup-uv@v5 - - - name: Install Python - run: uv python install 3.11 - - - name: Install dependencies - working-directory: kaibench - run: uv sync - - - name: Run evaluation - working-directory: kaibench + - name: Dispatch evaluation to KaiBench repo env: - KAIBENCH_STATIC_TOKEN: ${{ secrets.KAIBENCH_STATIC_TOKEN }} - KAIBENCH_STATIC_HOST: connection.canary-orion.keboola.dev - KAIBENCH_STATIC_PROJECT_ID: '293' - KAIBENCH_MANAGEMENT_TOKEN: ${{ secrets.KAIBENCH_MANAGEMENT_TOKEN }} - KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} - KAIBENCH_ORGANIZATION_ID: '58' - KAIBENCH_EVAL_PARALLEL_WORKERS: '4' - KAIBENCH_EVAL_KAI_BACKEND_URL: http://localhost:3000 - run: | - TYPES="${{ inputs.question_types }}" - CMD="uv run kaibench run" - # Split comma-separated types into -t flags - IFS=',' read -ra TYPE_ARRAY <<< "$TYPES" - for t in "${TYPE_ARRAY[@]}"; do - trimmed=$(echo "$t" | xargs) - CMD="$CMD -t \"$trimmed\"" - done - if [ "${{ inputs.regression_only }}" = "true" ]; then - CMD="$CMD --regression-only" - fi - eval $CMD - continue-on-error: true - - # ── Results ──────────────────────────────────────────────── - - name: Parse results - id: parse - if: always() - working-directory: kaibench - run: | - LATEST=$(ls -td results/run_* 2>/dev/null | head -1) - if [ -z "$LATEST" ]; then - echo "status=error" >> $GITHUB_OUTPUT - echo "passed=0" >> $GITHUB_OUTPUT - echo "failed=0" >> $GITHUB_OUTPUT - echo "total=0" >> $GITHUB_OUTPUT - echo "pass_rate=0" >> $GITHUB_OUTPUT - echo "duration=0" >> $GITHUB_OUTPUT - exit 
0 - fi - python3 << 'PYEOF' >> $GITHUB_OUTPUT - import json - from pathlib import Path - - latest = sorted(Path('results').glob('run_*'))[-1] - s = json.loads((latest / 'summary.json').read_text()) - m = s['metrics'] - - print(f"passed={m['passed']}") - print(f"failed={m['failed']}") - print(f"total={m['total_questions']}") - print(f"pass_rate={m['overall_pass_rate']:.2f}") - print(f"duration={m['duration_seconds']:.0f}") - status = 'passed' if m['failed'] == 0 and m['errors'] == 0 else 'failed' - print(f"status={status}") - PYEOF - - - name: Step summary - if: always() - working-directory: kaibench - run: | - LATEST=$(ls -td results/run_* 2>/dev/null | head -1) - [ -z "$LATEST" ] && exit 0 - python3 << 'PYEOF' >> $GITHUB_STEP_SUMMARY - import json, os - from pathlib import Path - - latest = sorted(Path('results').glob('run_*'))[-1] - s = json.loads((latest / 'summary.json').read_text()) - m = s['metrics'] - - results = [] - results_file = latest / 'results.jsonl' - if results_file.exists(): - for line in results_file.read_text().splitlines(): - if line.strip(): - results.append(json.loads(line)) - - evaluated = [r for r in results if r.get('status') != 'skipped'] - total_tools = sum(len(r.get('trace', {}).get('tool_calls', [])) for r in evaluated) - avg_tools = total_tools / len(evaluated) if evaluated else 0 - avg_duration = sum(r.get('duration_ms', 0) for r in evaluated) / len(evaluated) / 1000 if evaluated else 0 - total_tokens = sum(r.get('trace', {}).get('total_tokens', 0) or 0 for r in evaluated) - avg_tokens = total_tokens / len(evaluated) if evaluated else 0 - - print('## KaiBench Evaluation Results') - print() - print('| Metric | Value |') - print('|--------|-------|') - print(f'| Run ID | `{s["run_id"]}` |') - print(f'| MCP Server | `PR build` |') - print(f'| Duration | {m["duration_seconds"]:.0f}s |') - print(f'| **Total** | **{m["total_questions"]}** |') - print(f'| Passed | {m["passed"]} |') - print(f'| Failed | {m["failed"]} |') - print(f'| Skipped | {m["skipped"]} |') - print(f'| Errors | {m["errors"]} |') - print(f'| **Pass Rate** | **{m["overall_pass_rate"]:.1%}** |') - print(f'| Total Tool Calls | {total_tools} |') - print(f'| Avg Tool Calls/Question | {avg_tools:.1f} |') - print(f'| Avg Duration/Question | {avg_duration:.0f}s |') - print(f'| Total Tokens | {total_tokens:,} |') - print(f'| Avg Tokens/Question | {avg_tokens:,.0f} |') - - for t in s.get('by_question_type', []): - print() - print(f'### {t["question_type"]}') - print(f'{t["passed_count"]}/{t["total_count"]} passed ({t["pass_rate"]:.1%})') - if t['skipped_count']: - print(f'_{t["skipped_count"]} skipped_') - - if evaluated: - print() - print('### Per-Question Results') - print() - print('| Q | Type | Status | Tools | Tokens | Duration | Expected | Extracted | Notes |') - print('|---|------|--------|-------|--------|----------|----------|-----------|-------|') - for r in sorted(evaluated, key=lambda x: int(x.get('question_id', 0))): - qid = r.get('question_id', '?') - qtype = (r.get('question_type') or '')[:12] - status = r.get('status', '?') - emoji = {'passed': ':white_check_mark:', 'failed': ':x:', 'error': ':warning:'}.get(status, status) - tools = len(r.get('trace', {}).get('tool_calls', [])) - tokens = r.get('trace', {}).get('total_tokens', 0) or 0 - tokens_str = f'{tokens:,}' if tokens else '-' - dur = f'{r.get("duration_ms", 0)/1000:.0f}s' - expected = str(r.get('expected_answer') or '-')[:25] - extracted = str(r.get('extracted_answer') or '-')[:25] - notes = (r.get('verification', {}).get('notes') or 
'')[:40] - print(f'| {qid} | {qtype} | {emoji} | {tools} | {tokens_str} | {dur} | {expected} | {extracted} | {notes} |') - PYEOF - - - name: Dump container logs - if: always() - run: | - echo "=== kai-assistant logs ===" - docker logs kai-assistant 2>&1 || true - echo "" - echo "=== mcp-server logs ===" - docker logs mcp-server 2>&1 || true - - - name: Upload results - if: always() - uses: actions/upload-artifact@v4 - with: - name: kaibench-results-${{ github.run_id }} - path: kaibench/results/ - retention-days: 90 + GH_TOKEN: ${{ secrets.KAIBENCH_REPO_TOKEN }} + run: | + gh workflow run evaluate.yml \ + --repo keboola-rnd/KaiBench \ + --field mcp_server_repo="${{ github.repository }}" \ + --field mcp_server_ref="${{ github.sha }}" \ + --field callback_repo="${{ github.repository }}" \ + --field callback_sha="${{ github.sha }}" \ + --field question_types="${{ inputs.question_types || 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning,MCP Tool Validation' }}" \ + --field regression_only="${{ inputs.regression_only || 'false' }}" \ + --field kai_assistant_image_tag="${{ inputs.kai_assistant_image_tag || '' }}" + echo "Dispatched KaiBench evaluation" + echo "Results will appear as a commit status on ${{ github.sha }}" + echo "Monitor at: https://github.com/keboola-rnd/KaiBench/actions"
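
Note on the commit-status callback mentioned in the final commit message: the posting side lives in keboola-rnd/KaiBench and is not part of this series, so the snippet below is only a sketch of what that callback might look like. It assumes the receiving evaluate.yml workflow exposes the callback_repo/callback_sha fields dispatched above as environment variables, that GH_TOKEN carries repo:status scope on the MCP server repo, and that PASSED, TOTAL, and RUN_URL are hypothetical result variables; the context and description values are placeholders, not the actual KaiBench implementation.

    # Hypothetical KaiBench-side step (not in this patch series): report the
    # evaluation outcome back to the MCP server repo as a commit status.
    # CALLBACK_REPO / CALLBACK_SHA correspond to the callback_repo / callback_sha
    # inputs dispatched by the trigger job above.
    gh api --method POST \
      "repos/${CALLBACK_REPO}/statuses/${CALLBACK_SHA}" \
      -f state="success" \
      -f context="KaiBench Evaluation" \
      -f description="${PASSED}/${TOTAL} questions passed" \
      -f target_url="${RUN_URL}"

A failing run would post state="failure" against the same SHA, which is what makes the dispatched evaluation visible on the originating commit without the MCP server repo holding any of the KaiBench secrets.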