diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 645e1af7..58706e3c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -147,6 +147,14 @@ jobs:
           path: ./integtest-results.xml
           reporter: 'java-junit'
 
+  kaibench:
+    name: KaiBench Evaluation
+    needs: build
+    # Run only for same-repo pushes (not fork PRs, which lack secrets)
+    if: github.event_name == 'push' && github.repository == 'keboola/mcp-server'
+    uses: ./.github/workflows/kaibench.yml
+    secrets: inherit
+
   deploy_to_pypi:
     name: Deploy to pypi.org
     needs:
diff --git a/.github/workflows/kaibench.yml b/.github/workflows/kaibench.yml
new file mode 100644
index 00000000..b674d9ae
--- /dev/null
+++ b/.github/workflows/kaibench.yml
@@ -0,0 +1,56 @@
+name: KaiBench Evaluation
+
+# Thin trigger that dispatches the full evaluation workflow in the KaiBench repo.
+# All infrastructure (kai-assistant build, eval framework, secrets) lives there.
+# Results are posted back as a commit status on this repo.
+
+permissions:
+  contents: read
+
+on:
+  workflow_dispatch:
+    inputs:
+      question_types:
+        description: 'Question types (comma-separated, or "all")'
+        default: 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning,MCP Tool Validation'
+        type: string
+      regression_only:
+        description: 'Only regression-flagged questions'
+        type: boolean
+        default: false
+      kai_assistant_image_tag:
+        description: 'Pre-built kai-assistant image tag (leave empty to build from UI main)'
+        required: false
+        type: string
+        default: ''
+  workflow_call:
+
+jobs:
+  trigger:
+    name: Trigger KaiBench evaluation
+    runs-on: ubuntu-latest
+    steps:
+      - name: Dispatch evaluation to KaiBench repo
+        # Every expression is handed to the script through env vars instead of
+        # being expanded inline, so input text is never parsed as shell syntax.
+        # The `||` fallbacks apply when workflow_call supplies no inputs.
+        env:
+          GH_TOKEN: ${{ secrets.KAIBENCH_REPO_TOKEN }}
+          MCP_SERVER_REPO: ${{ github.repository }}
+          COMMIT_SHA: ${{ github.sha }}
+          QUESTION_TYPES: ${{ inputs.question_types || 'Data Analysis Query,Configuration Reasoning,Storage Object Reasoning,MCP Tool Validation' }}
+          REGRESSION_ONLY: ${{ inputs.regression_only || 'false' }}
+          KAI_ASSISTANT_IMAGE_TAG: ${{ inputs.kai_assistant_image_tag || '' }}
+        run: |
+          gh workflow run evaluate.yml \
+            --repo keboola-rnd/KaiBench \
+            --field mcp_server_repo="${MCP_SERVER_REPO}" \
+            --field mcp_server_ref="${COMMIT_SHA}" \
+            --field callback_repo="${MCP_SERVER_REPO}" \
+            --field callback_sha="${COMMIT_SHA}" \
+            --field question_types="${QUESTION_TYPES}" \
+            --field regression_only="${REGRESSION_ONLY}" \
+            --field kai_assistant_image_tag="${KAI_ASSISTANT_IMAGE_TAG}"
+          echo "Dispatched KaiBench evaluation"
+          echo "Results will appear as a commit status on ${COMMIT_SHA}"
+          echo "Monitor at: https://github.com/keboola-rnd/KaiBench/actions"
diff --git a/pyproject.toml b/pyproject.toml
index e3c7f14b..cc90be67 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
     "sqlglot ~= 28.5",
     "toon-format ~= 0.9.0b1",
     "pyyaml ~= 6.0",
+    "requests ~= 2.32",
 ]
 [project.optional-dependencies]
 codestyle = [
diff --git a/uv.lock b/uv.lock
index 20ed7b74..5f221532 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.13'",
@@ -1237,6 +1237,7 @@ dependencies = [
     { name = "pydantic" },
     { name = "pyjwt" },
     { name = "pyyaml" },
+    { name = "requests" },
     { name = "sqlglot" },
     { name = "toon-format" },
 ]
@@ -1303,6 +1304,7 @@ requires-dist = [
     { name = "python-dateutil", marker = "extra == 'tests'", specifier = "~=2.9" },
     { name = "python-dotenv", marker = "extra == 'tests'", specifier = "~=1.2" },
     { name = "pyyaml", specifier = "~=6.0" },
+    { name = "requests", specifier = "~=2.32" },
     { name = "sqlglot", specifier = "~=28.5" },
     { name = "toon-format", specifier = "~=0.9.0b1" },
     { name = "tox", marker = "extra == 'dev'", specifier = "~=4.32" },
@@ -2489,8 +2491,8 @@ name = "taskgroup"
 version = "0.2.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "exceptiongroup" },
-    { name = "typing-extensions" },
+    { name = "exceptiongroup", marker = "python_full_version < '3.13'" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/f0/8d/e218e0160cc1b692e6e0e5ba34e8865dbb171efeb5fc9a704544b3020605/taskgroup-0.2.2.tar.gz", hash = "sha256:078483ac3e78f2e3f973e2edbf6941374fbea81b9c5d0a96f51d297717f4752d", size = 11504, upload-time = "2025-01-03T09:24:13.761Z" }
 wheels = [