# feat: MCP Server Architecture for Checkpoint-Based Workflow Execution #632
# Workflow file for this run.
# Viewer notice: this file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review it, open the file in an editor that reveals hidden Unicode characters.
# (See GitHub's documentation on bidirectional Unicode characters to learn more.)
name: Claude Code Integration Test

on:
  # Manual trigger for testing
  workflow_dispatch:
    inputs:
      # NOTE(review): no step visible in this workflow reads `inputs.debug`
      # yet; confirm whether it should be wired into the Claude invocations.
      debug:
        description: 'Enable debug logging'
        required: false
        # Boolean literal (not the string 'false') to match `type: boolean`.
        default: false
        type: boolean
  # Run on all PRs (shows as check, but steps skip unless in merge queue)
  pull_request:
    branches: [main]
  # Run in the merge queue to validate before merging
  merge_group:
    branches: [main]

# Ensure only one instance runs at a time per PR/branch.
# `github.head_ref` is only set for pull_request events; other events fall
# back to `github.ref`.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: true

# Minimal permissions for this workflow
permissions:
  contents: read
jobs:
  # Job 1: Validate skill generation from fixtures (no API key needed)
  # Runs on all events, but actual work only happens in merge_group/workflow_dispatch
  # This ensures the check name exists for PRs (needed for GitHub's merge queue)
  validate-generation:
    runs-on: ubuntu-latest
    steps:
      # For PRs: just pass quickly (actual tests run in merge queue)
      - name: Skip on PR
        if: github.event_name == 'pull_request'
        run: echo "Validation will run in merge queue. Passing for PR."

      - uses: actions/checkout@v4
        if: github.event_name != 'pull_request'

      - name: Install uv
        if: github.event_name != 'pull_request'
        uses: astral-sh/setup-uv@v4
        with:
          version: "latest"

      - name: Set up Python
        if: github.event_name != 'pull_request'
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install dependencies
        if: github.event_name != 'pull_request'
        run: uv sync --extra dev

      - name: Validate fruits fixture parses and install generates correct structure
        if: github.event_name != 'pull_request'
        run: |
          # Verify the fruits fixture parses correctly via deepwork's parser
          uv run python -c "
          from pathlib import Path
          from deepwork.core.parser import parse_job_definition
          job = parse_job_definition(Path('tests/fixtures/jobs/fruits'))
          assert job.name == 'fruits'
          assert job.version == '1.0.0'
          assert len(job.steps) == 2
          assert [s.id for s in job.steps] == ['identify', 'classify']
          # Identify step: user input -> file output
          identify = job.steps[0]
          assert identify.inputs[0].is_user_input()
          assert identify.inputs[0].name == 'raw_items'
          assert identify.outputs[0].name == 'identified_fruits.md'
          assert identify.dependencies == []
          # Classify step: file input from identify -> file output
          classify = job.steps[1]
          assert classify.inputs[0].is_file_input()
          assert classify.inputs[0].file == 'identified_fruits.md'
          assert classify.inputs[0].from_step == 'identify'
          assert classify.outputs[0].name == 'classified_fruits.md'
          assert classify.dependencies == ['identify']
          # Workflow definition
          assert len(job.workflows) == 1
          assert job.workflows[0].name == 'full'
          assert job.workflows[0].steps == ['identify', 'classify']
          # Validations pass
          job.validate_dependencies()
          job.validate_file_inputs()
          job.validate_workflows()
          print('All fruits fixture validations passed!')
          "

      - name: Generate skills and validate structure
        if: github.event_name != 'pull_request'
        run: |
          # Create a test environment
          mkdir -p test_project/.deepwork/jobs
          mkdir -p test_project/.claude  # Required for platform detection
          cp -r tests/fixtures/jobs/fruits test_project/.deepwork/jobs/
          # Set up git repo in test project
          cd test_project
          git init
          git config user.email "test@test.com"
          git config user.name "Test"
          echo "# Test" > README.md
          git add . && git commit -m "init"
          cd ..
          # Run deepwork install to set up the project (this also runs sync)
          uv run deepwork install --platform claude --path test_project
          # Validate generated skills exist
          echo "Checking generated skills..."
          ls -la test_project/.claude/skills/
          # MCP variant: only the /deepwork entry point skill is generated
          # (per-step skills are no longer created; MCP server handles orchestration)
          test -f test_project/.claude/skills/deepwork/SKILL.md || (echo "Missing deepwork MCP entry point skill" && exit 1)
          # Verify the deepwork skill mentions deepwork (case-insensitive);
          # print a reason on failure instead of exiting silently.
          grep -qi "deepwork" test_project/.claude/skills/deepwork/SKILL.md || (echo "deepwork skill does not reference deepwork" && exit 1)
          echo "Skill generation validated successfully!"

  # Job 2: Full end-to-end test with Claude Code
  # Tests the COMPLETE workflow: create the fruits job via /deepwork, run it,
  # and validate the output files.
  # Runs on all events, but actual work only happens in merge_group/workflow_dispatch
  # This ensures the check name exists for PRs (needed for GitHub's merge queue)
  claude-code-e2e:
    runs-on: ubuntu-latest
    needs: validate-generation
    env:
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    steps:
      # For PRs: just pass quickly (actual tests run in merge queue)
      - name: Skip on PR
        if: github.event_name == 'pull_request'
        run: echo "E2E tests will run in merge queue. Passing for PR."

      - uses: actions/checkout@v4
        if: github.event_name != 'pull_request'

      # Every later step is gated on this step's `has_key` output, so on PRs
      # (where this step is skipped and the output is empty) they are skipped too.
      - name: Check for API key
        if: github.event_name != 'pull_request'
        id: check-key
        run: |
          if [ -z "$ANTHROPIC_API_KEY" ]; then
            echo "has_key=false" >> "$GITHUB_OUTPUT"
            echo "::warning::ANTHROPIC_API_KEY not set, skipping Claude Code e2e test"
          else
            echo "has_key=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Install Node.js (for Claude Code CLI)
        if: steps.check-key.outputs.has_key == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Install Claude Code CLI
        if: steps.check-key.outputs.has_key == 'true'
        run: npm install -g @anthropic-ai/claude-code

      - name: Install uv
        if: steps.check-key.outputs.has_key == 'true'
        uses: astral-sh/setup-uv@v4
        with:
          version: "latest"

      - name: Set up Python
        if: steps.check-key.outputs.has_key == 'true'
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install deepwork
        if: steps.check-key.outputs.has_key == 'true'
        run: |
          uv sync
          # Add the uv virtualenv bin directory to PATH for all subsequent steps.
          #
          # Why: `deepwork install` registers an MCP server in .mcp.json with
          # the command "deepwork serve --path .". When Claude Code starts, it
          # reads .mcp.json and spawns the MCP server as a subprocess using the
          # bare "deepwork" command. Without this PATH addition, that subprocess
          # fails because "deepwork" only exists inside the uv virtualenv
          # (accessible via "uv run deepwork" but not as a bare command).
          #
          # Without the MCP server running, Claude cannot use the DeepWork MCP
          # tools (get_workflows, start_workflow, finished_step) and falls back
          # to ad-hoc file creation, which produces the wrong output format.
          echo "$(pwd)/.venv/bin" >> "$GITHUB_PATH"

      - name: Set up fresh test project
        if: steps.check-key.outputs.has_key == 'true'
        run: |
          # Create a fresh project with NO pre-existing job definitions
          mkdir -p test_project/.claude
          cd test_project
          git init
          git config user.email "test@test.com"
          git config user.name "Test"
          echo "# CI Test Project - DeepWork E2E Test" > README.md
          git add . && git commit -m "init"
          cd ..
          # Install deepwork (this sets up .deepwork/ with standard jobs only)
          uv run deepwork install --platform claude --path test_project
          # Merge broad CI permissions into settings.json WITHOUT overwriting it.
          #
          # Why merge instead of overwrite: `deepwork install` writes MCP tool
          # permissions (mcp__deepwork__get_workflows, mcp__deepwork__start_workflow,
          # etc.) into settings.json. These are required for Claude to call the
          # DeepWork MCP server tools. Overwriting settings.json with only generic
          # permissions (Bash, Read, Write, etc.) removes the MCP permissions,
          # causing Claude to silently fail when trying to use /deepwork.
          python3 -c "
          import json
          settings_path = 'test_project/.claude/settings.json'
          with open(settings_path) as f:
              settings = json.load(f)
          ci_permissions = ['Bash(*)', 'Read(./**)', 'Edit(./**)', 'Write(./**)', 'Skill(*)']
          for perm in ci_permissions:
              if perm not in settings.setdefault('permissions', {}).setdefault('allow', []):
                  settings['permissions']['allow'].append(perm)
          with open(settings_path, 'w') as f:
              json.dump(settings, f, indent=2)
          "
          echo "Fresh test project setup complete"
          echo "Available skills:"
          ls -la test_project/.claude/skills/

      # STEP 1: Use /deepwork to CREATE the fruits job via MCP workflow
      #
      # This invokes Claude with the /deepwork skill, which uses MCP tools to
      # walk through the deepwork_jobs/new_job workflow (define → implement →
      # test → iterate). The workflow includes quality gates that spawn Claude
      # subprocesses, so it needs a generous timeout.
      - name: Create job with /deepwork
        if: steps.check-key.outputs.has_key == 'true'
        working-directory: test_project
        timeout-minutes: 10
        run: |
          echo "=== Running /deepwork to create fruits job ==="
          # -p keeps the step re-runnable if the directory already exists.
          mkdir -p fruits
          # Use --debug to capture detailed logs for diagnosing failures.
          # The debug log is dumped in the failure handler below.
          claude --print --debug --model claude-sonnet-4-5 <<'PROMPT_EOF'
          /deepwork I want to create a simple job called "fruits" for identifying and classifying fruits.
          Here are the EXACT specifications.
          Intent: A simple workflow that takes a list of mixed items, identifies which are fruits, then classifies them by category. Designed for CI testing.
          Steps:
          1. Step: identify
             Name: Identify Fruits
             Description: Filter a list of items to include only the fruits
             **CRITICAL**: The output MUST be stored in `fruits/identified_fruits.md`.
          2. Step: classify
             Name: Classify Fruits
             Description: Organize identified fruits into categories (citrus, tropical, berries, etc.).
             **CRITICAL**: must put the classified fruit list in `./fruits/classified_fruits.md`.
          **Key Instructions:**
          - Do not ask questions - just make the job
          - Rules are explicitly not desired. Tell the review agents that.
          - Do not give long commentary of what you did - just make the job with no commentary.
          - IMPORTANT: Once the job.yml and step instruction files have been created (i.e. after the "define" and "implement" steps are done), STOP. Do NOT continue into the "test" or "iterate" steps. Abort the workflow at that point. We only need the job definition files created, not the full workflow run.
          PROMPT_EOF
          # Verify the job.yml was created
          echo "=== Checking job.yml was created ==="
          if [ -f ".deepwork/jobs/fruits/job.yml" ]; then
            echo "SUCCESS: job.yml created"
            cat .deepwork/jobs/fruits/job.yml
          else
            echo "ERROR: job.yml was not created"
            echo "Contents of .deepwork/jobs/:"
            ls -la .deepwork/jobs/ || echo "No jobs directory"
            exit 1
          fi
          # Verify step files were created
          echo "=== Checking step files were created ==="
          if [ -f ".deepwork/jobs/fruits/steps/identify.md" ] && [ -f ".deepwork/jobs/fruits/steps/classify.md" ]; then
            echo "SUCCESS: Step instruction files created"
            echo "--- identify.md ---"
            cat .deepwork/jobs/fruits/steps/identify.md
            echo ""
            echo "--- classify.md ---"
            cat .deepwork/jobs/fruits/steps/classify.md
          else
            echo "ERROR: Step files were not created"
            ls -la .deepwork/jobs/fruits/steps/ || echo "No steps directory"
            exit 1
          fi
          # Run sync to regenerate skills after new job was created
          echo "=== Running deepwork sync to regenerate skills ==="
          cd ..
          uv run deepwork sync --path test_project
          echo "=== Checking generated skills ==="
          ls -la test_project/.claude/skills/
          # MCP variant: only the /deepwork entry point skill is generated
          if [ -f "test_project/.claude/skills/deepwork/SKILL.md" ]; then
            echo "SUCCESS: /deepwork MCP entry point skill generated"
          else
            echo "ERROR: /deepwork skill was not generated"
            exit 1
          fi

      # Dump Claude debug log if the job creation step failed or timed out.
      # This captures MCP server communication, tool calls, and error details.
      - name: Dump Claude debug log on failure
        if: failure() && steps.check-key.outputs.has_key == 'true'
        working-directory: test_project
        run: |
          echo "=== Claude debug log ==="
          # Claude --debug writes to ~/.claude/debug.log
          if [ -f "$HOME/.claude/debug.log" ]; then
            echo "--- Last 200 lines of debug.log ---"
            tail -200 "$HOME/.claude/debug.log"
          else
            echo "No debug.log found at ~/.claude/debug.log"
            echo "Searching for debug logs..."
            find "$HOME/.claude" -name "*.log" -type f 2>/dev/null || echo "No log files found"
          fi
          echo ""
          echo "=== MCP server config ==="
          cat .mcp.json 2>/dev/null || echo "No .mcp.json found"
          echo ""
          echo "=== Settings.json ==="
          cat .claude/settings.json 2>/dev/null || echo "No settings.json found"
          echo ""
          echo "=== DeepWork session state ==="
          ls -la .deepwork/tmp/ 2>/dev/null || echo "No tmp directory"
          # Use an explicit `if` rather than `[ -f ] && ...`: when the glob
          # matches nothing, the `&&` form leaves the loop (and therefore this
          # script, since it is the last command) with exit status 1, which
          # would mark this diagnostic step itself as failed.
          for f in .deepwork/tmp/session_*.json; do
            if [ -f "$f" ]; then
              echo "--- $f ---"
              cat "$f"
            fi
          done

      # STEP 2: Execute the fruits workflow via /deepwork MCP entry point
      - name: Run Workflow
        if: steps.check-key.outputs.has_key == 'true'
        working-directory: test_project
        timeout-minutes: 3
        run: |
          echo "=== Running fruits workflow with test input via /deepwork ==="
          claude --print --model claude-sonnet-4-5 <<'PROMPT_EOF'
          /deepwork Run the fruits full workflow. Process the list to the file and don't give any extra commentary or text output.
          raw_items: apple, car, banana, chair, orange, table, mango, laptop, grape, bicycle
          PROMPT_EOF
          echo "=== Workflow finished - looking for output file ==="
          # Verify both outputs were created
          if [ -f "fruits/identified_fruits.md" ]; then
            echo "SUCCESS: identified_fruits.md created"
            echo "--- Output ---"
            cat fruits/identified_fruits.md
          else
            echo "ERROR: identified_fruits.md was not created"
            exit 1
          fi
          if [ -f "fruits/classified_fruits.md" ]; then
            echo "SUCCESS: classified_fruits.md created"
            echo "--- Output ---"
            cat fruits/classified_fruits.md
          else
            echo "ERROR: classified_fruits.md was not created"
            exit 1
          fi

      # STEP 3: Validate the complete workflow output
      - name: Validate Workflow Output
        if: steps.check-key.outputs.has_key == 'true'
        working-directory: test_project/fruits
        run: |
          echo "=== Validating complete workflow ==="
          # Check identified_fruits.md contains expected fruits
          echo "Checking identified_fruits.md..."
          grep -qi "apple" identified_fruits.md || (echo "Missing: apple" && exit 1)
          grep -qi "banana" identified_fruits.md || (echo "Missing: banana" && exit 1)
          grep -qi "orange" identified_fruits.md || (echo "Missing: orange" && exit 1)
          grep -qi "mango" identified_fruits.md || (echo "Missing: mango" && exit 1)
          grep -qi "grape" identified_fruits.md || (echo "Missing: grape" && exit 1)
          echo " ✓ All expected fruits found in identified_fruits.md"
          # Check classified_fruits.md has expected structure
          echo "Checking classified_fruits.md..."
          grep -qi "citrus\|tropical\|pome\|berr" classified_fruits.md || (echo "Missing fruit categories" && exit 1)
          echo " ✓ Fruit categories found in classified_fruits.md"
          echo ""
          echo "=========================================="
          echo " ALL E2E TESTS PASSED SUCCESSFULLY!"
          echo "=========================================="
          echo ""
          echo "Workflow tested: /deepwork fruits full - Executed full fruits workflow (identify + classify)"
          echo ""

      # `always()` keeps the upload running after an earlier step has failed,
      # so partial outputs are still available for inspection.
      - name: Upload test artifacts
        if: steps.check-key.outputs.has_key == 'true' && always()
        uses: actions/upload-artifact@v4
        with:
          name: claude-code-e2e-outputs
          path: |
            test_project/.deepwork/jobs/fruits/
            test_project/.claude/skills/deepwork/
            test_project/fruits/identified_fruits.md
            test_project/fruits/classified_fruits.md
          retention-days: 7