From 69f1fa7914d87b11120aa3ff3ee10cfc3542db89 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 14:27:50 +0100 Subject: [PATCH 1/8] AI-2588: add KaiBench evaluation workflow to CI pipeline Add kaibench.yml reusable workflow that builds MCP server from the PR branch, starts the full stack (MCP server + kai-assistant + Postgres + Redis), and runs KaiBench evaluations. Triggered as a non-blocking job in ci.yml after the build passes (same-repo pushes only). Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 8 + .github/workflows/kaibench.yml | 399 +++++++++++++++++++++++++++++++++ 2 files changed, 407 insertions(+) create mode 100644 .github/workflows/kaibench.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 645e1af7..1245abd9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -147,6 +147,14 @@ jobs: path: ./integtest-results.xml reporter: 'java-junit' + kaibench: + name: KaiBench Evaluation + needs: build + # Run only for same-repo pushes (not fork PRs, which lack secrets) + if: github.event_name == 'push' && github.repository == 'keboola/keboola-mcp-server' + uses: ./.github/workflows/kaibench.yml + secrets: inherit + deploy_to_pypi: name: Deploy to pypi.org needs: diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml new file mode 100644 index 00000000..670c300d --- /dev/null +++ b/.github/workflows/kaibench.yml @@ -0,0 +1,399 @@ +name: KaiBench Evaluation + +on: + workflow_dispatch: + inputs: + question_types: + description: 'Question types (comma-separated, or "all")' + default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning' + type: string + regression_only: + description: 'Only regression-flagged questions' + type: boolean + default: false + kai_assistant_image_tag: + description: 'Pre-built kai-assistant Docker Hub tag (leave empty to build from main)' + required: false + type: string + default: '' + workflow_call: + inputs: + question_types: + description: 'Question types (comma-separated)' + required: false + type: string + default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning' + regression_only: + description: 'Only regression-flagged questions' + required: false + type: boolean + default: false + kai_assistant_image_tag: + description: 'Pre-built kai-assistant Docker Hub tag (leave empty to build from main)' + required: false + type: string + default: '' + secrets: + KAIBENCH_STATIC_TOKEN: + required: true + KAIBENCH_MANAGEMENT_TOKEN: + required: false + KAIBENCH_API_URL: + required: true + KAIBENCH_REPO_TOKEN: + required: true + DOCKERHUB_TOKEN: + required: true + KAI_GOOGLE_VERTEX_CREDENTIALS: + required: true + KAI_GOOGLE_VERTEX_PROJECT: + required: true + KAI_GOOGLE_VERTEX_LOCATION: + required: true + TURBO_TOKEN: + required: false + outputs: + status: + description: 'Test status (passed, failed, error)' + value: ${{ jobs.evaluate.outputs.status }} + passed: + value: ${{ jobs.evaluate.outputs.passed }} + failed: + value: ${{ jobs.evaluate.outputs.failed }} + total: + value: ${{ jobs.evaluate.outputs.total }} + pass_rate: + value: ${{ jobs.evaluate.outputs.pass_rate }} + duration: + value: ${{ jobs.evaluate.outputs.duration }} + +jobs: + evaluate: + name: Run KaiBench evaluation + runs-on: ubuntu-latest + timeout-minutes: 60 + outputs: + status: ${{ steps.parse.outputs.status }} + passed: ${{ steps.parse.outputs.passed }} + failed: ${{ steps.parse.outputs.failed }} + total: ${{ steps.parse.outputs.total }} + pass_rate: ${{ 
steps.parse.outputs.pass_rate }} + duration: ${{ steps.parse.outputs.duration }} + + services: + postgres: + image: postgres:16 + env: + POSTGRES_DB: kai_db + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + ports: + - 5432:5432 + options: --health-cmd="pg_isready -U postgres" --health-interval=10s --health-timeout=5s --health-retries=5 + redis: + image: redis:7-alpine + ports: + - 6379:6379 + options: --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=5 + + steps: + # ── Checkouts ────────────────────────────────────────────── + - name: Checkout MCP server + uses: actions/checkout@v4 + with: + path: mcp-server + + - name: Checkout KaiBench + uses: actions/checkout@v4 + with: + repository: keboola-rnd/KaiBench + token: ${{ secrets.KAIBENCH_REPO_TOKEN }} + path: kaibench + + - name: Checkout UI repo (for kai-assistant build) + if: inputs.kai_assistant_image_tag == '' + uses: actions/checkout@v4 + with: + repository: keboola/ui + token: ${{ secrets.KAIBENCH_REPO_TOKEN }} + ref: main + path: ui + + # ── Docker setup ─────────────────────────────────────────── + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Docker Hub login + uses: docker/login-action@v3 + with: + username: keboolabot + password: ${{ secrets.DOCKERHUB_TOKEN }} + + # ── Build MCP server from this PR ────────────────────────── + - name: Build MCP server image + uses: docker/build-push-action@v6 + with: + load: true + tags: keboola/mcp-server:kaibench-test + context: ./mcp-server + file: ./mcp-server/Dockerfile + cache-from: type=gha + cache-to: type=gha,mode=max + + # ── kai-assistant image ──────────────────────────────────── + - name: Build kai-assistant from main + if: inputs.kai_assistant_image_tag == '' + uses: docker/build-push-action@v6 + with: + load: true + tags: keboola/kai-assistant:kaibench-test + context: ./ui + file: ./ui/apps/kai-assistant-backend/Dockerfile + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + VERSION=kaibench-test + KEBOOLA_STACK=dev-keboola-canary-orion + KEBOOLA_STORAGE_API_URL=${{ secrets.KAIBENCH_API_URL }} + secrets: | + turbo_token=${{ secrets.TURBO_TOKEN }} + + - name: Pull pre-built kai-assistant image + if: inputs.kai_assistant_image_tag != '' + run: docker pull keboola/kai-assistant:${{ inputs.kai_assistant_image_tag }} + + - name: Resolve kai-assistant image tag + id: resolve-kai-image + run: | + if [ -n "${{ inputs.kai_assistant_image_tag }}" ]; then + echo "tag=${{ inputs.kai_assistant_image_tag }}" >> $GITHUB_OUTPUT + else + echo "tag=kaibench-test" >> $GITHUB_OUTPUT + fi + + # ── Start services ───────────────────────────────────────── + - name: Start MCP server + run: | + docker run -d --name mcp-server --network host \ + keboola/mcp-server:kaibench-test \ + --transport streamable-http --host 0.0.0.0 --port 8000 + echo "Waiting for MCP server..." 
+ for i in $(seq 1 30); do + if curl -so /dev/null http://localhost:8000/mcp 2>&1; then + echo "MCP server is ready" + break + fi + if [ $i -eq 30 ]; then + echo "::error::MCP server failed to start" + docker logs mcp-server + exit 1 + fi + sleep 2 + done + + - name: Run database migrations + env: + KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} + run: | + docker run --rm --network host \ + -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ + --workdir /app/apps/kai-assistant-backend \ + keboola/kai-assistant:$KAI_IMAGE_TAG \ + node dist/lib/db/migrate.js + + - name: Start kai-assistant + env: + KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} + KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} + GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.KAI_GOOGLE_VERTEX_CREDENTIALS }} + GOOGLE_VERTEX_PROJECT: ${{ secrets.KAI_GOOGLE_VERTEX_PROJECT }} + GOOGLE_VERTEX_LOCATION: ${{ secrets.KAI_GOOGLE_VERTEX_LOCATION }} + run: | + AUTH_SECRET=$(openssl rand -base64 32) + docker run -d --name kai-assistant --network host \ + -e AUTH_SECRET="$AUTH_SECRET" \ + -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ + -e REDIS_URL="redis://localhost:6379" \ + -e MCP_SERVER_URL="http://localhost:8000/mcp" \ + -e KEBOOLA_STORAGE_API_URL="$KAIBENCH_API_URL" \ + -e KEBOOLA_STACK="dev-keboola-canary-orion" \ + -e CLOUD_LLM_PROVIDER="google-vertex" \ + -e GOOGLE_SERVICE_ACCOUNT_JSON="$GOOGLE_SERVICE_ACCOUNT_JSON" \ + -e GOOGLE_VERTEX_PROJECT="$GOOGLE_VERTEX_PROJECT" \ + -e GOOGLE_VERTEX_LOCATION="$GOOGLE_VERTEX_LOCATION" \ + keboola/kai-assistant:$KAI_IMAGE_TAG + echo "Waiting for kai-assistant..." + for i in $(seq 1 60); do + if curl -sf http://localhost:3000/ping > /dev/null 2>&1; then + echo "kai-assistant is ready" + break + fi + if [ $i -eq 60 ]; then + echo "::error::kai-assistant failed to start" + docker logs kai-assistant + exit 1 + fi + sleep 2 + done + + # ── Run evaluation ───────────────────────────────────────── + - name: Setup uv + uses: astral-sh/setup-uv@v5 + + - name: Install Python + run: uv python install 3.11 + + - name: Install dependencies + working-directory: kaibench + run: uv sync + + - name: Run evaluation + working-directory: kaibench + env: + KAIBENCH_STATIC_TOKEN: ${{ secrets.KAIBENCH_STATIC_TOKEN }} + KAIBENCH_STATIC_HOST: connection.canary-orion.keboola.dev + KAIBENCH_STATIC_PROJECT_ID: '293' + KAIBENCH_MANAGEMENT_TOKEN: ${{ secrets.KAIBENCH_MANAGEMENT_TOKEN }} + KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} + KAIBENCH_ORGANIZATION_ID: '58' + KAIBENCH_EVAL_PARALLEL_WORKERS: '4' + KAIBENCH_EVAL_KAI_BACKEND_URL: http://localhost:3000 + run: | + TYPES="${{ inputs.question_types }}" + CMD="uv run kaibench run" + # Split comma-separated types into -t flags + IFS=',' read -ra TYPE_ARRAY <<< "$TYPES" + for t in "${TYPE_ARRAY[@]}"; do + trimmed=$(echo "$t" | xargs) + CMD="$CMD -t \"$trimmed\"" + done + if [ "${{ inputs.regression_only }}" = "true" ]; then + CMD="$CMD --regression-only" + fi + eval $CMD + continue-on-error: true + + # ── Results ──────────────────────────────────────────────── + - name: Parse results + id: parse + if: always() + working-directory: kaibench + run: | + LATEST=$(ls -td results/run_* 2>/dev/null | head -1) + if [ -z "$LATEST" ]; then + echo "status=error" >> $GITHUB_OUTPUT + echo "passed=0" >> $GITHUB_OUTPUT + echo "failed=0" >> $GITHUB_OUTPUT + echo "total=0" >> $GITHUB_OUTPUT + echo "pass_rate=0" >> $GITHUB_OUTPUT + echo "duration=0" >> $GITHUB_OUTPUT + exit 0 + fi + python3 << 'PYEOF' >> $GITHUB_OUTPUT 
+ import json + from pathlib import Path + + latest = sorted(Path('results').glob('run_*'))[-1] + s = json.loads((latest / 'summary.json').read_text()) + m = s['metrics'] + + print(f"passed={m['passed']}") + print(f"failed={m['failed']}") + print(f"total={m['total_questions']}") + print(f"pass_rate={m['overall_pass_rate']:.2f}") + print(f"duration={m['duration_seconds']:.0f}") + status = 'passed' if m['failed'] == 0 and m['errors'] == 0 else 'failed' + print(f"status={status}") + PYEOF + + - name: Step summary + if: always() + working-directory: kaibench + run: | + LATEST=$(ls -td results/run_* 2>/dev/null | head -1) + [ -z "$LATEST" ] && exit 0 + python3 << 'PYEOF' >> $GITHUB_STEP_SUMMARY + import json, os + from pathlib import Path + + latest = sorted(Path('results').glob('run_*'))[-1] + s = json.loads((latest / 'summary.json').read_text()) + m = s['metrics'] + + results = [] + results_file = latest / 'results.jsonl' + if results_file.exists(): + for line in results_file.read_text().splitlines(): + if line.strip(): + results.append(json.loads(line)) + + evaluated = [r for r in results if r.get('status') != 'skipped'] + total_tools = sum(len(r.get('trace', {}).get('tool_calls', [])) for r in evaluated) + avg_tools = total_tools / len(evaluated) if evaluated else 0 + avg_duration = sum(r.get('duration_ms', 0) for r in evaluated) / len(evaluated) / 1000 if evaluated else 0 + total_tokens = sum(r.get('trace', {}).get('total_tokens', 0) or 0 for r in evaluated) + avg_tokens = total_tokens / len(evaluated) if evaluated else 0 + + print('## KaiBench Evaluation Results') + print() + print('| Metric | Value |') + print('|--------|-------|') + print(f'| Run ID | `{s["run_id"]}` |') + print(f'| MCP Server | `PR build` |') + print(f'| Duration | {m["duration_seconds"]:.0f}s |') + print(f'| **Total** | **{m["total_questions"]}** |') + print(f'| Passed | {m["passed"]} |') + print(f'| Failed | {m["failed"]} |') + print(f'| Skipped | {m["skipped"]} |') + print(f'| Errors | {m["errors"]} |') + print(f'| **Pass Rate** | **{m["overall_pass_rate"]:.1%}** |') + print(f'| Total Tool Calls | {total_tools} |') + print(f'| Avg Tool Calls/Question | {avg_tools:.1f} |') + print(f'| Avg Duration/Question | {avg_duration:.0f}s |') + print(f'| Total Tokens | {total_tokens:,} |') + print(f'| Avg Tokens/Question | {avg_tokens:,.0f} |') + + for t in s.get('by_question_type', []): + print() + print(f'### {t["question_type"]}') + print(f'{t["passed_count"]}/{t["total_count"]} passed ({t["pass_rate"]:.1%})') + if t['skipped_count']: + print(f'_{t["skipped_count"]} skipped_') + + if evaluated: + print() + print('### Per-Question Results') + print() + print('| Q | Type | Status | Tools | Tokens | Duration | Expected | Extracted | Notes |') + print('|---|------|--------|-------|--------|----------|----------|-----------|-------|') + for r in sorted(evaluated, key=lambda x: int(x.get('question_id', 0))): + qid = r.get('question_id', '?') + qtype = (r.get('question_type') or '')[:12] + status = r.get('status', '?') + emoji = {'passed': ':white_check_mark:', 'failed': ':x:', 'error': ':warning:'}.get(status, status) + tools = len(r.get('trace', {}).get('tool_calls', [])) + tokens = r.get('trace', {}).get('total_tokens', 0) or 0 + tokens_str = f'{tokens:,}' if tokens else '-' + dur = f'{r.get("duration_ms", 0)/1000:.0f}s' + expected = str(r.get('expected_answer') or '-')[:25] + extracted = str(r.get('extracted_answer') or '-')[:25] + notes = (r.get('verification', {}).get('notes') or '')[:40] + print(f'| {qid} | {qtype} | {emoji} 
| {tools} | {tokens_str} | {dur} | {expected} | {extracted} | {notes} |') + PYEOF + + - name: Dump container logs + if: always() + run: | + echo "=== kai-assistant logs ===" + docker logs kai-assistant 2>&1 || true + echo "" + echo "=== mcp-server logs ===" + docker logs mcp-server 2>&1 || true + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: kaibench-results-${{ github.run_id }} + path: kaibench/results/ + retention-days: 90 From 18b8dd1a8a2a44169cecc2ddd7eb32b46d8c9521 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 14:35:25 +0100 Subject: [PATCH 2/8] AI-2588: add permissions block to kaibench workflow Restricts GITHUB_TOKEN to read-only contents access to satisfy CodeQL security policy. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/kaibench.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 670c300d..76ffb263 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -1,5 +1,8 @@ name: KaiBench Evaluation +permissions: + contents: read + on: workflow_dispatch: inputs: From 58d339ef6a76394a771b877aa684458a459d9b0f Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 16:58:39 +0100 Subject: [PATCH 3/8] AI-2588: add missing requests dependency cli.py imports requests (for requests.JSONDecodeError handler) but it was not declared in pyproject.toml. This caused test collection to fail on Python 3.11 in CI where no transitive dependency pulls it in. Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 1 + uv.lock | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e3c7f14b..cc90be67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "sqlglot ~= 28.5", "toon-format ~= 0.9.0b1", "pyyaml ~= 6.0", + "requests ~= 2.32", ] [project.optional-dependencies] codestyle = [ diff --git a/uv.lock b/uv.lock index 20ed7b74..5f221532 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -1237,6 +1237,7 @@ dependencies = [ { name = "pydantic" }, { name = "pyjwt" }, { name = "pyyaml" }, + { name = "requests" }, { name = "sqlglot" }, { name = "toon-format" }, ] @@ -1303,6 +1304,7 @@ requires-dist = [ { name = "python-dateutil", marker = "extra == 'tests'", specifier = "~=2.9" }, { name = "python-dotenv", marker = "extra == 'tests'", specifier = "~=1.2" }, { name = "pyyaml", specifier = "~=6.0" }, + { name = "requests", specifier = "~=2.32" }, { name = "sqlglot", specifier = "~=28.5" }, { name = "toon-format", specifier = "~=0.9.0b1" }, { name = "tox", marker = "extra == 'dev'", specifier = "~=4.32" }, @@ -2489,8 +2491,8 @@ name = "taskgroup" version = "0.2.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "exceptiongroup" }, - { name = "typing-extensions" }, + { name = "exceptiongroup", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f0/8d/e218e0160cc1b692e6e0e5ba34e8865dbb171efeb5fc9a704544b3020605/taskgroup-0.2.2.tar.gz", hash = "sha256:078483ac3e78f2e3f973e2edbf6941374fbea81b9c5d0a96f51d297717f4752d", size = 11504, upload-time = "2025-01-03T09:24:13.761Z" } wheels = [ From 4304763d31edff3834326ae2935018483e63e3ef Mon Sep 17 00:00:00 2001 From: 
jordanrburger Date: Sat, 21 Feb 2026 17:03:22 +0100 Subject: [PATCH 4/8] AI-2588: fix repo name in kaibench job condition The repo was renamed from keboola/keboola-mcp-server to keboola/mcp-server, causing the if condition to evaluate false. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1245abd9..58706e3c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -151,7 +151,7 @@ jobs: name: KaiBench Evaluation needs: build # Run only for same-repo pushes (not fork PRs, which lack secrets) - if: github.event_name == 'push' && github.repository == 'keboola/keboola-mcp-server' + if: github.event_name == 'push' && github.repository == 'keboola/mcp-server' uses: ./.github/workflows/kaibench.yml secrets: inherit From a9ac2ff2fa40b5386fb4b13e2586c675ea140de3 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 17:24:37 +0100 Subject: [PATCH 5/8] AI-2588: remove kai-assistant build-from-source path Always use pre-built kai-assistant Docker image tag instead of building from the UI repo source. This removes the need for UI repo access and TURBO_TOKEN. The image tag is provided via vars.KAI_ASSISTANT_IMAGE_TAG. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 2 ++ .github/workflows/kaibench.yml | 52 ++++------------------------------ 2 files changed, 8 insertions(+), 46 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 58706e3c..cac7de0a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,6 +153,8 @@ jobs: # Run only for same-repo pushes (not fork PRs, which lack secrets) if: github.event_name == 'push' && github.repository == 'keboola/mcp-server' uses: ./.github/workflows/kaibench.yml + with: + kai_assistant_image_tag: ${{ vars.KAI_ASSISTANT_IMAGE_TAG }} secrets: inherit deploy_to_pypi: diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 76ffb263..97ba6d53 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -15,10 +15,9 @@ on: type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag (leave empty to build from main)' - required: false + description: 'Pre-built kai-assistant Docker Hub tag' + required: true type: string - default: '' workflow_call: inputs: question_types: @@ -32,10 +31,9 @@ on: type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag (leave empty to build from main)' - required: false + description: 'Pre-built kai-assistant Docker Hub tag' + required: true type: string - default: '' secrets: KAIBENCH_STATIC_TOKEN: required: true @@ -53,8 +51,6 @@ on: required: true KAI_GOOGLE_VERTEX_LOCATION: required: true - TURBO_TOKEN: - required: false outputs: status: description: 'Test status (passed, failed, error)' @@ -113,15 +109,6 @@ jobs: token: ${{ secrets.KAIBENCH_REPO_TOKEN }} path: kaibench - - name: Checkout UI repo (for kai-assistant build) - if: inputs.kai_assistant_image_tag == '' - uses: actions/checkout@v4 - with: - repository: keboola/ui - token: ${{ secrets.KAIBENCH_REPO_TOKEN }} - ref: main - path: ui - # ── Docker setup ─────────────────────────────────────────── - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -144,36 +131,9 @@ jobs: cache-to: type=gha,mode=max # ── kai-assistant image ──────────────────────────────────── - - name: Build kai-assistant from 
main - if: inputs.kai_assistant_image_tag == '' - uses: docker/build-push-action@v6 - with: - load: true - tags: keboola/kai-assistant:kaibench-test - context: ./ui - file: ./ui/apps/kai-assistant-backend/Dockerfile - cache-from: type=gha - cache-to: type=gha,mode=max - build-args: | - VERSION=kaibench-test - KEBOOLA_STACK=dev-keboola-canary-orion - KEBOOLA_STORAGE_API_URL=${{ secrets.KAIBENCH_API_URL }} - secrets: | - turbo_token=${{ secrets.TURBO_TOKEN }} - - name: Pull pre-built kai-assistant image - if: inputs.kai_assistant_image_tag != '' run: docker pull keboola/kai-assistant:${{ inputs.kai_assistant_image_tag }} - - name: Resolve kai-assistant image tag - id: resolve-kai-image - run: | - if [ -n "${{ inputs.kai_assistant_image_tag }}" ]; then - echo "tag=${{ inputs.kai_assistant_image_tag }}" >> $GITHUB_OUTPUT - else - echo "tag=kaibench-test" >> $GITHUB_OUTPUT - fi - # ── Start services ───────────────────────────────────────── - name: Start MCP server run: | @@ -196,7 +156,7 @@ jobs: - name: Run database migrations env: - KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} + KAI_IMAGE_TAG: ${{ inputs.kai_assistant_image_tag }} run: | docker run --rm --network host \ -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ @@ -206,7 +166,7 @@ jobs: - name: Start kai-assistant env: - KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} + KAI_IMAGE_TAG: ${{ inputs.kai_assistant_image_tag }} KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.KAI_GOOGLE_VERTEX_CREDENTIALS }} GOOGLE_VERTEX_PROJECT: ${{ secrets.KAI_GOOGLE_VERTEX_PROJECT }} From 1bbd159da228a25f63d668838b78ce9329bc2ca0 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 17:28:17 +0100 Subject: [PATCH 6/8] AI-2588: auto-resolve latest kai-assistant tag from Docker Hub Instead of requiring a static image tag, the workflow now queries Docker Hub for the latest production-kai-assi-* tag when none is provided. This ensures CI always tests against the newest kai-assistant without manual variable updates. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 2 -- .github/workflows/kaibench.yml | 42 +++++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cac7de0a..58706e3c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,8 +153,6 @@ jobs: # Run only for same-repo pushes (not fork PRs, which lack secrets) if: github.event_name == 'push' && github.repository == 'keboola/mcp-server' uses: ./.github/workflows/kaibench.yml - with: - kai_assistant_image_tag: ${{ vars.KAI_ASSISTANT_IMAGE_TAG }} secrets: inherit deploy_to_pypi: diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 97ba6d53..37302330 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -15,9 +15,10 @@ on: type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag' - required: true + description: 'Pre-built kai-assistant Docker Hub tag (leave empty to auto-resolve latest production tag)' + required: false type: string + default: '' workflow_call: inputs: question_types: @@ -31,9 +32,10 @@ on: type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag' - required: true + description: 'Pre-built kai-assistant Docker Hub tag (leave empty to auto-resolve latest production tag)' + required: false type: string + default: '' secrets: KAIBENCH_STATIC_TOKEN: required: true @@ -131,8 +133,32 @@ jobs: cache-to: type=gha,mode=max # ── kai-assistant image ──────────────────────────────────── - - name: Pull pre-built kai-assistant image - run: docker pull keboola/kai-assistant:${{ inputs.kai_assistant_image_tag }} + - name: Resolve kai-assistant image tag + id: resolve-kai-image + env: + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + run: | + if [ -n "${{ inputs.kai_assistant_image_tag }}" ]; then + echo "tag=${{ inputs.kai_assistant_image_tag }}" >> $GITHUB_OUTPUT + echo "Using provided tag: ${{ inputs.kai_assistant_image_tag }}" + else + echo "No tag provided, resolving latest production tag from Docker Hub..." 
+ TOKEN=$(curl -sf -H "Content-Type: application/json" \ + -X POST -d "{\"username\":\"keboolabot\",\"password\":\"$DOCKERHUB_TOKEN\"}" \ + https://hub.docker.com/v2/users/login/ | jq -r .token) + TAG=$(curl -sf -H "Authorization: JWT $TOKEN" \ + "https://hub.docker.com/v2/repositories/keboola/kai-assistant/tags/?page_size=1&ordering=-last_updated&name=production-kai-assi" \ + | jq -r '.results[0].name') + if [ -z "$TAG" ] || [ "$TAG" = "null" ]; then + echo "::error::Failed to resolve latest kai-assistant production tag" + exit 1 + fi + echo "tag=$TAG" >> $GITHUB_OUTPUT + echo "Resolved latest production tag: $TAG" + fi + + - name: Pull kai-assistant image + run: docker pull keboola/kai-assistant:${{ steps.resolve-kai-image.outputs.tag }} # ── Start services ───────────────────────────────────────── - name: Start MCP server @@ -156,7 +182,7 @@ jobs: - name: Run database migrations env: - KAI_IMAGE_TAG: ${{ inputs.kai_assistant_image_tag }} + KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} run: | docker run --rm --network host \ -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ @@ -166,7 +192,7 @@ jobs: - name: Start kai-assistant env: - KAI_IMAGE_TAG: ${{ inputs.kai_assistant_image_tag }} + KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.KAI_GOOGLE_VERTEX_CREDENTIALS }} GOOGLE_VERTEX_PROJECT: ${{ secrets.KAI_GOOGLE_VERTEX_PROJECT }} From 8eb32d9a1b96ca96e630428147b9bdcabcdc9671 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 17:36:08 +0100 Subject: [PATCH 7/8] AI-2588: add debug output to Docker Hub tag resolution Remove silent curl flags to surface errors when Docker Hub API calls fail during kai-assistant tag resolution. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/kaibench.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 37302330..4131ae49 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -143,14 +143,22 @@ jobs: echo "Using provided tag: ${{ inputs.kai_assistant_image_tag }}" else echo "No tag provided, resolving latest production tag from Docker Hub..." 
- TOKEN=$(curl -sf -H "Content-Type: application/json" \ + LOGIN_RESPONSE=$(curl -s -H "Content-Type: application/json" \ -X POST -d "{\"username\":\"keboolabot\",\"password\":\"$DOCKERHUB_TOKEN\"}" \ - https://hub.docker.com/v2/users/login/ | jq -r .token) - TAG=$(curl -sf -H "Authorization: JWT $TOKEN" \ - "https://hub.docker.com/v2/repositories/keboola/kai-assistant/tags/?page_size=1&ordering=-last_updated&name=production-kai-assi" \ - | jq -r '.results[0].name') + https://hub.docker.com/v2/users/login/) + TOKEN=$(echo "$LOGIN_RESPONSE" | jq -r .token) + if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then + echo "::error::Docker Hub login failed" + echo "Response: $LOGIN_RESPONSE" | head -c 200 + exit 1 + fi + echo "Docker Hub login successful" + TAGS_RESPONSE=$(curl -s -H "Authorization: JWT $TOKEN" \ + "https://hub.docker.com/v2/repositories/keboola/kai-assistant/tags/?page_size=1&ordering=-last_updated&name=production-kai-assi") + TAG=$(echo "$TAGS_RESPONSE" | jq -r '.results[0].name') if [ -z "$TAG" ] || [ "$TAG" = "null" ]; then echo "::error::Failed to resolve latest kai-assistant production tag" + echo "Response: $TAGS_RESPONSE" | head -c 500 exit 1 fi echo "tag=$TAG" >> $GITHUB_OUTPUT From 6ad0e8278551e7ccdca9c3b81310538f412424f2 Mon Sep 17 00:00:00 2001 From: jordanrburger Date: Sat, 21 Feb 2026 18:50:36 +0100 Subject: [PATCH 8/8] AI-2588: replace local eval with KaiBench dispatch trigger Instead of running the full evaluation locally (which needs UI repo access, TURBO_TOKEN, and registry credentials), dispatch the eval to the KaiBench repo where all secrets are centralized. Results are posted back as a commit status on the MCP server repo. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/kaibench.yml | 394 ++------------------------------- 1 file changed, 23 insertions(+), 371 deletions(-) diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml index 4131ae49..b674d9ae 100644 --- a/.github/workflows/kaibench.yml +++ b/.github/workflows/kaibench.yml @@ -1,5 +1,9 @@ name: KaiBench Evaluation +# Thin trigger that dispatches the full evaluation workflow in the KaiBench repo. +# All infrastructure (kai-assistant build, eval framework, secrets) lives there. +# Results are posted back as a commit status on this repo. 
+ permissions: contents: read @@ -8,389 +12,37 @@ on: inputs: question_types: description: 'Question types (comma-separated, or "all")' - default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning' + default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning,MCP Tool Validation' type: string regression_only: description: 'Only regression-flagged questions' type: boolean default: false kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag (leave empty to auto-resolve latest production tag)' + description: 'Pre-built kai-assistant image tag (leave empty to build from UI main)' required: false type: string default: '' workflow_call: - inputs: - question_types: - description: 'Question types (comma-separated)' - required: false - type: string - default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning' - regression_only: - description: 'Only regression-flagged questions' - required: false - type: boolean - default: false - kai_assistant_image_tag: - description: 'Pre-built kai-assistant Docker Hub tag (leave empty to auto-resolve latest production tag)' - required: false - type: string - default: '' - secrets: - KAIBENCH_STATIC_TOKEN: - required: true - KAIBENCH_MANAGEMENT_TOKEN: - required: false - KAIBENCH_API_URL: - required: true - KAIBENCH_REPO_TOKEN: - required: true - DOCKERHUB_TOKEN: - required: true - KAI_GOOGLE_VERTEX_CREDENTIALS: - required: true - KAI_GOOGLE_VERTEX_PROJECT: - required: true - KAI_GOOGLE_VERTEX_LOCATION: - required: true - outputs: - status: - description: 'Test status (passed, failed, error)' - value: ${{ jobs.evaluate.outputs.status }} - passed: - value: ${{ jobs.evaluate.outputs.passed }} - failed: - value: ${{ jobs.evaluate.outputs.failed }} - total: - value: ${{ jobs.evaluate.outputs.total }} - pass_rate: - value: ${{ jobs.evaluate.outputs.pass_rate }} - duration: - value: ${{ jobs.evaluate.outputs.duration }} jobs: - evaluate: - name: Run KaiBench evaluation + trigger: + name: Trigger KaiBench evaluation runs-on: ubuntu-latest - timeout-minutes: 60 - outputs: - status: ${{ steps.parse.outputs.status }} - passed: ${{ steps.parse.outputs.passed }} - failed: ${{ steps.parse.outputs.failed }} - total: ${{ steps.parse.outputs.total }} - pass_rate: ${{ steps.parse.outputs.pass_rate }} - duration: ${{ steps.parse.outputs.duration }} - - services: - postgres: - image: postgres:16 - env: - POSTGRES_DB: kai_db - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - ports: - - 5432:5432 - options: --health-cmd="pg_isready -U postgres" --health-interval=10s --health-timeout=5s --health-retries=5 - redis: - image: redis:7-alpine - ports: - - 6379:6379 - options: --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=5 - steps: - # ── Checkouts ────────────────────────────────────────────── - - name: Checkout MCP server - uses: actions/checkout@v4 - with: - path: mcp-server - - - name: Checkout KaiBench - uses: actions/checkout@v4 - with: - repository: keboola-rnd/KaiBench - token: ${{ secrets.KAIBENCH_REPO_TOKEN }} - path: kaibench - - # ── Docker setup ─────────────────────────────────────────── - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Docker Hub login - uses: docker/login-action@v3 - with: - username: keboolabot - password: ${{ secrets.DOCKERHUB_TOKEN }} - - # ── Build MCP server from this PR ────────────────────────── - - name: Build MCP server image - uses: docker/build-push-action@v6 - with: - load: true - 
tags: keboola/mcp-server:kaibench-test - context: ./mcp-server - file: ./mcp-server/Dockerfile - cache-from: type=gha - cache-to: type=gha,mode=max - - # ── kai-assistant image ──────────────────────────────────── - - name: Resolve kai-assistant image tag - id: resolve-kai-image - env: - DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} - run: | - if [ -n "${{ inputs.kai_assistant_image_tag }}" ]; then - echo "tag=${{ inputs.kai_assistant_image_tag }}" >> $GITHUB_OUTPUT - echo "Using provided tag: ${{ inputs.kai_assistant_image_tag }}" - else - echo "No tag provided, resolving latest production tag from Docker Hub..." - LOGIN_RESPONSE=$(curl -s -H "Content-Type: application/json" \ - -X POST -d "{\"username\":\"keboolabot\",\"password\":\"$DOCKERHUB_TOKEN\"}" \ - https://hub.docker.com/v2/users/login/) - TOKEN=$(echo "$LOGIN_RESPONSE" | jq -r .token) - if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then - echo "::error::Docker Hub login failed" - echo "Response: $LOGIN_RESPONSE" | head -c 200 - exit 1 - fi - echo "Docker Hub login successful" - TAGS_RESPONSE=$(curl -s -H "Authorization: JWT $TOKEN" \ - "https://hub.docker.com/v2/repositories/keboola/kai-assistant/tags/?page_size=1&ordering=-last_updated&name=production-kai-assi") - TAG=$(echo "$TAGS_RESPONSE" | jq -r '.results[0].name') - if [ -z "$TAG" ] || [ "$TAG" = "null" ]; then - echo "::error::Failed to resolve latest kai-assistant production tag" - echo "Response: $TAGS_RESPONSE" | head -c 500 - exit 1 - fi - echo "tag=$TAG" >> $GITHUB_OUTPUT - echo "Resolved latest production tag: $TAG" - fi - - - name: Pull kai-assistant image - run: docker pull keboola/kai-assistant:${{ steps.resolve-kai-image.outputs.tag }} - - # ── Start services ───────────────────────────────────────── - - name: Start MCP server - run: | - docker run -d --name mcp-server --network host \ - keboola/mcp-server:kaibench-test \ - --transport streamable-http --host 0.0.0.0 --port 8000 - echo "Waiting for MCP server..." 
- for i in $(seq 1 30); do - if curl -so /dev/null http://localhost:8000/mcp 2>&1; then - echo "MCP server is ready" - break - fi - if [ $i -eq 30 ]; then - echo "::error::MCP server failed to start" - docker logs mcp-server - exit 1 - fi - sleep 2 - done - - - name: Run database migrations - env: - KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} - run: | - docker run --rm --network host \ - -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ - --workdir /app/apps/kai-assistant-backend \ - keboola/kai-assistant:$KAI_IMAGE_TAG \ - node dist/lib/db/migrate.js - - - name: Start kai-assistant - env: - KAI_IMAGE_TAG: ${{ steps.resolve-kai-image.outputs.tag }} - KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} - GOOGLE_SERVICE_ACCOUNT_JSON: ${{ secrets.KAI_GOOGLE_VERTEX_CREDENTIALS }} - GOOGLE_VERTEX_PROJECT: ${{ secrets.KAI_GOOGLE_VERTEX_PROJECT }} - GOOGLE_VERTEX_LOCATION: ${{ secrets.KAI_GOOGLE_VERTEX_LOCATION }} - run: | - AUTH_SECRET=$(openssl rand -base64 32) - docker run -d --name kai-assistant --network host \ - -e AUTH_SECRET="$AUTH_SECRET" \ - -e POSTGRES_URL="postgresql://postgres:postgres@localhost:5432/kai_db" \ - -e REDIS_URL="redis://localhost:6379" \ - -e MCP_SERVER_URL="http://localhost:8000/mcp" \ - -e KEBOOLA_STORAGE_API_URL="$KAIBENCH_API_URL" \ - -e KEBOOLA_STACK="dev-keboola-canary-orion" \ - -e CLOUD_LLM_PROVIDER="google-vertex" \ - -e GOOGLE_SERVICE_ACCOUNT_JSON="$GOOGLE_SERVICE_ACCOUNT_JSON" \ - -e GOOGLE_VERTEX_PROJECT="$GOOGLE_VERTEX_PROJECT" \ - -e GOOGLE_VERTEX_LOCATION="$GOOGLE_VERTEX_LOCATION" \ - keboola/kai-assistant:$KAI_IMAGE_TAG - echo "Waiting for kai-assistant..." - for i in $(seq 1 60); do - if curl -sf http://localhost:3000/ping > /dev/null 2>&1; then - echo "kai-assistant is ready" - break - fi - if [ $i -eq 60 ]; then - echo "::error::kai-assistant failed to start" - docker logs kai-assistant - exit 1 - fi - sleep 2 - done - - # ── Run evaluation ───────────────────────────────────────── - - name: Setup uv - uses: astral-sh/setup-uv@v5 - - - name: Install Python - run: uv python install 3.11 - - - name: Install dependencies - working-directory: kaibench - run: uv sync - - - name: Run evaluation - working-directory: kaibench + - name: Dispatch evaluation to KaiBench repo env: - KAIBENCH_STATIC_TOKEN: ${{ secrets.KAIBENCH_STATIC_TOKEN }} - KAIBENCH_STATIC_HOST: connection.canary-orion.keboola.dev - KAIBENCH_STATIC_PROJECT_ID: '293' - KAIBENCH_MANAGEMENT_TOKEN: ${{ secrets.KAIBENCH_MANAGEMENT_TOKEN }} - KAIBENCH_API_URL: ${{ secrets.KAIBENCH_API_URL }} - KAIBENCH_ORGANIZATION_ID: '58' - KAIBENCH_EVAL_PARALLEL_WORKERS: '4' - KAIBENCH_EVAL_KAI_BACKEND_URL: http://localhost:3000 - run: | - TYPES="${{ inputs.question_types }}" - CMD="uv run kaibench run" - # Split comma-separated types into -t flags - IFS=',' read -ra TYPE_ARRAY <<< "$TYPES" - for t in "${TYPE_ARRAY[@]}"; do - trimmed=$(echo "$t" | xargs) - CMD="$CMD -t \"$trimmed\"" - done - if [ "${{ inputs.regression_only }}" = "true" ]; then - CMD="$CMD --regression-only" - fi - eval $CMD - continue-on-error: true - - # ── Results ──────────────────────────────────────────────── - - name: Parse results - id: parse - if: always() - working-directory: kaibench - run: | - LATEST=$(ls -td results/run_* 2>/dev/null | head -1) - if [ -z "$LATEST" ]; then - echo "status=error" >> $GITHUB_OUTPUT - echo "passed=0" >> $GITHUB_OUTPUT - echo "failed=0" >> $GITHUB_OUTPUT - echo "total=0" >> $GITHUB_OUTPUT - echo "pass_rate=0" >> $GITHUB_OUTPUT - echo "duration=0" >> $GITHUB_OUTPUT - exit 
0 - fi - python3 << 'PYEOF' >> $GITHUB_OUTPUT - import json - from pathlib import Path - - latest = sorted(Path('results').glob('run_*'))[-1] - s = json.loads((latest / 'summary.json').read_text()) - m = s['metrics'] - - print(f"passed={m['passed']}") - print(f"failed={m['failed']}") - print(f"total={m['total_questions']}") - print(f"pass_rate={m['overall_pass_rate']:.2f}") - print(f"duration={m['duration_seconds']:.0f}") - status = 'passed' if m['failed'] == 0 and m['errors'] == 0 else 'failed' - print(f"status={status}") - PYEOF - - - name: Step summary - if: always() - working-directory: kaibench - run: | - LATEST=$(ls -td results/run_* 2>/dev/null | head -1) - [ -z "$LATEST" ] && exit 0 - python3 << 'PYEOF' >> $GITHUB_STEP_SUMMARY - import json, os - from pathlib import Path - - latest = sorted(Path('results').glob('run_*'))[-1] - s = json.loads((latest / 'summary.json').read_text()) - m = s['metrics'] - - results = [] - results_file = latest / 'results.jsonl' - if results_file.exists(): - for line in results_file.read_text().splitlines(): - if line.strip(): - results.append(json.loads(line)) - - evaluated = [r for r in results if r.get('status') != 'skipped'] - total_tools = sum(len(r.get('trace', {}).get('tool_calls', [])) for r in evaluated) - avg_tools = total_tools / len(evaluated) if evaluated else 0 - avg_duration = sum(r.get('duration_ms', 0) for r in evaluated) / len(evaluated) / 1000 if evaluated else 0 - total_tokens = sum(r.get('trace', {}).get('total_tokens', 0) or 0 for r in evaluated) - avg_tokens = total_tokens / len(evaluated) if evaluated else 0 - - print('## KaiBench Evaluation Results') - print() - print('| Metric | Value |') - print('|--------|-------|') - print(f'| Run ID | `{s["run_id"]}` |') - print(f'| MCP Server | `PR build` |') - print(f'| Duration | {m["duration_seconds"]:.0f}s |') - print(f'| **Total** | **{m["total_questions"]}** |') - print(f'| Passed | {m["passed"]} |') - print(f'| Failed | {m["failed"]} |') - print(f'| Skipped | {m["skipped"]} |') - print(f'| Errors | {m["errors"]} |') - print(f'| **Pass Rate** | **{m["overall_pass_rate"]:.1%}** |') - print(f'| Total Tool Calls | {total_tools} |') - print(f'| Avg Tool Calls/Question | {avg_tools:.1f} |') - print(f'| Avg Duration/Question | {avg_duration:.0f}s |') - print(f'| Total Tokens | {total_tokens:,} |') - print(f'| Avg Tokens/Question | {avg_tokens:,.0f} |') - - for t in s.get('by_question_type', []): - print() - print(f'### {t["question_type"]}') - print(f'{t["passed_count"]}/{t["total_count"]} passed ({t["pass_rate"]:.1%})') - if t['skipped_count']: - print(f'_{t["skipped_count"]} skipped_') - - if evaluated: - print() - print('### Per-Question Results') - print() - print('| Q | Type | Status | Tools | Tokens | Duration | Expected | Extracted | Notes |') - print('|---|------|--------|-------|--------|----------|----------|-----------|-------|') - for r in sorted(evaluated, key=lambda x: int(x.get('question_id', 0))): - qid = r.get('question_id', '?') - qtype = (r.get('question_type') or '')[:12] - status = r.get('status', '?') - emoji = {'passed': ':white_check_mark:', 'failed': ':x:', 'error': ':warning:'}.get(status, status) - tools = len(r.get('trace', {}).get('tool_calls', [])) - tokens = r.get('trace', {}).get('total_tokens', 0) or 0 - tokens_str = f'{tokens:,}' if tokens else '-' - dur = f'{r.get("duration_ms", 0)/1000:.0f}s' - expected = str(r.get('expected_answer') or '-')[:25] - extracted = str(r.get('extracted_answer') or '-')[:25] - notes = (r.get('verification', {}).get('notes') or 
'')[:40] - print(f'| {qid} | {qtype} | {emoji} | {tools} | {tokens_str} | {dur} | {expected} | {extracted} | {notes} |') - PYEOF - - - name: Dump container logs - if: always() - run: | - echo "=== kai-assistant logs ===" - docker logs kai-assistant 2>&1 || true - echo "" - echo "=== mcp-server logs ===" - docker logs mcp-server 2>&1 || true - - - name: Upload results - if: always() - uses: actions/upload-artifact@v4 - with: - name: kaibench-results-${{ github.run_id }} - path: kaibench/results/ - retention-days: 90 + GH_TOKEN: ${{ secrets.KAIBENCH_REPO_TOKEN }} + run: | + gh workflow run evaluate.yml \ + --repo keboola-rnd/KaiBench \ + --field mcp_server_repo="${{ github.repository }}" \ + --field mcp_server_ref="${{ github.sha }}" \ + --field callback_repo="${{ github.repository }}" \ + --field callback_sha="${{ github.sha }}" \ + --field question_types="${{ inputs.question_types || 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning,MCP Tool Validation' }}" \ + --field regression_only="${{ inputs.regression_only || 'false' }}" \ + --field kai_assistant_image_tag="${{ inputs.kai_assistant_image_tag || '' }}" + echo "Dispatched KaiBench evaluation" + echo "Results will appear as a commit status on ${{ github.sha }}" + echo "Monitor at: https://github.com/keboola-rnd/KaiBench/actions"
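
Note on the commit-status callback mentioned in the final commit message: the posting side lives in keboola-rnd/KaiBench and is not part of this series, so the snippet below is only a sketch of what that callback might look like. It assumes the receiving evaluate.yml workflow exposes the callback_repo/callback_sha fields dispatched above as environment variables, that GH_TOKEN carries repo:status scope on the MCP server repo, and that PASSED, TOTAL, and RUN_URL are hypothetical result variables; the context and description values are placeholders, not the actual KaiBench implementation.

    # Hypothetical KaiBench-side step (not in this patch series): report the
    # evaluation outcome back to the MCP server repo as a commit status.
    # CALLBACK_REPO / CALLBACK_SHA correspond to the callback_repo / callback_sha
    # inputs dispatched by the trigger job above.
    gh api --method POST \
      "repos/${CALLBACK_REPO}/statuses/${CALLBACK_SHA}" \
      -f state="success" \
      -f context="KaiBench Evaluation" \
      -f description="${PASSED}/${TOTAL} questions passed" \
      -f target_url="${RUN_URL}"

A failing run would post state="failure" against the same SHA, which is what makes the dispatched evaluation visible on the originating commit without the MCP server repo holding any of the KaiBench secrets.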