diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 15050096..14a8282b 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -7,7 +7,7 @@ This directory contains CI/CD workflows for the DeepWork project. We use GitHub' | Workflow | File | Purpose | |----------|------|---------| | **Validate** | `validate.yml` | Linting (ruff) and unit tests | -| **Integration Tests** | `claude-code-test.yml` | Command generation and e2e tests | +| **Integration Tests** | `claude-code-test.yml` | Skill generation and e2e tests | | **CLA Assistant** | `cla.yml` | Contributor License Agreement verification | | **Release** | `release.yml` | PyPI publishing on tags | @@ -85,7 +85,7 @@ All checks will pass in both PR and merge queue contexts (either by running or b - **Triggers**: `pull_request` (main), `merge_group` (main), `workflow_dispatch` - **Jobs**: - `pr-check`: Runs on PRs only, always passes (lightweight check) - - `validate-generation`: Tests command generation from fixtures (no API key needed) + - `validate-generation`: Tests skill generation from fixtures (no API key needed) - `claude-code-e2e`: Full end-to-end test with Claude Code CLI (requires `ANTHROPIC_API_KEY`) - `validate-generation` and `claude-code-e2e` skip on PRs, run in merge queue and manual dispatch diff --git a/.github/workflows/claude-code-test.yml b/.github/workflows/claude-code-test.yml index 0c1633a0..1a93cf4d 100644 --- a/.github/workflows/claude-code-test.yml +++ b/.github/workflows/claude-code-test.yml @@ -26,7 +26,7 @@ permissions: contents: read jobs: - # Job 1: Validate command generation from fixtures (no API key needed) + # Job 1: Validate skill generation from fixtures (no API key needed) # Runs on all events, but actual work only happens in merge_group/workflow_dispatch # This ensures the check name exists for PRs (needed for GitHub's merge queue) validate-generation: @@ -60,7 +60,7 @@ jobs: if: github.event_name != 'pull_request' run: uv run 
pytest tests/integration/test_fruits_workflow.py -v - - name: Generate commands and validate structure + - name: Generate skills and validate structure if: github.event_name != 'pull_request' run: | # Create a test environment @@ -80,27 +80,30 @@ jobs: # Run deepwork install to set up the project (this also runs sync) uv run deepwork install --platform claude --path test_project - # Validate generated commands exist - echo "Checking generated commands..." - ls -la test_project/.claude/commands/ + # Validate generated skills exist + echo "Checking generated skills..." + ls -la test_project/.claude/skills/ - # Verify command files exist - test -f test_project/.claude/commands/fruits.identify.md || (echo "Missing fruits.identify.md" && exit 1) - test -f test_project/.claude/commands/fruits.classify.md || (echo "Missing fruits.classify.md" && exit 1) + # Verify skill directories and SKILL.md files exist + # Meta-skill for the job itself + test -f test_project/.claude/skills/fruits/SKILL.md || (echo "Missing fruits meta-skill" && exit 1) + # Step skills + test -f test_project/.claude/skills/fruits.identify/SKILL.md || (echo "Missing fruits.identify skill" && exit 1) + test -f test_project/.claude/skills/fruits.classify/SKILL.md || (echo "Missing fruits.classify skill" && exit 1) - # Verify command content - grep -q "# fruits.identify" test_project/.claude/commands/fruits.identify.md - grep -q "raw_items" test_project/.claude/commands/fruits.identify.md - grep -q "identified_fruits.md" test_project/.claude/commands/fruits.identify.md + # Verify skill content + grep -q "# fruits.identify" test_project/.claude/skills/fruits.identify/SKILL.md + grep -q "raw_items" test_project/.claude/skills/fruits.identify/SKILL.md + grep -q "identified_fruits.md" test_project/.claude/skills/fruits.identify/SKILL.md - grep -q "# fruits.classify" test_project/.claude/commands/fruits.classify.md - grep -q "identified_fruits.md" test_project/.claude/commands/fruits.classify.md - grep -q 
"classified_fruits.md" test_project/.claude/commands/fruits.classify.md + grep -q "# fruits.classify" test_project/.claude/skills/fruits.classify/SKILL.md + grep -q "identified_fruits.md" test_project/.claude/skills/fruits.classify/SKILL.md + grep -q "classified_fruits.md" test_project/.claude/skills/fruits.classify/SKILL.md - echo "Command generation validated successfully!" + echo "Skill generation validated successfully!" # Job 2: Full end-to-end test with Claude Code - # Tests the COMPLETE workflow: define job -> implement -> execute + # Tests the COMPLETE workflow: create job (/deepwork_jobs) -> run workflow (/fruits) # Runs on all events, but actual work only happens in merge_group/workflow_dispatch # This ensures the check name exists for PRs (needed for GitHub's merge queue) claude-code-e2e: @@ -171,46 +174,53 @@ jobs: # Install deepwork (this sets up .deepwork/ with standard jobs only) uv run deepwork install --platform claude --path test_project + # Create permissive settings.json to allow file operations in CI + cat > test_project/.claude/settings.json << 'SETTINGS_EOF' + { + "permissions": { + "allow": [ + "Bash(*)", + "Read(./**)", + "Edit(./**)", + "Write(./**)", + "Skill(*)" + ] + } + } + SETTINGS_EOF + echo "Fresh test project setup complete" - echo "Available commands:" - ls -la test_project/.claude/commands/ + echo "Available skills:" + ls -la test_project/.claude/skills/ # STEP 1: Use /deepwork_jobs.define to CREATE the fruits job - - name: Create job with /deepwork_jobs.define + - name: Create job with /deepwork_jobs if: steps.check-key.outputs.has_key == 'true' working-directory: test_project timeout-minutes: 10 run: | - echo "=== Running /deepwork_jobs.define to create fruits job ===" + echo "=== Running /deepwork_jobs to create fruits job ===" # Provide detailed, deterministic instructions for creating the job - claude --yes --print "/deepwork_jobs.define" <<'PROMPT_EOF' - I want to create a simple job called "fruits" for identifying and classifying fruits. 
- - Here are the EXACT specifications - please create the job.yml with these exact details: - - Job name: fruits - Version: 1.0.0 - Summary: Identify and classify fruits from a mixed list of items + claude --print <<'PROMPT_EOF' + /deepwork_jobs I want to create a simple job called "fruits" for identifying and classifying fruits. - Description: A simple workflow that takes a list of mixed items, identifies which are fruits, then classifies them by category. Designed for CI testing. + Here are the EXACT specifications. + + Intent: A simple workflow that takes a list of mixed items, identifies which are fruits, then classifies them by category. Designed for CI testing. Steps: - 1. Step ID: identify + 1. Step: identify Name: Identify Fruits - Description: Filter a list of items to identify only the fruits - Input: raw_items (user parameter) - A comma-separated list of items - Output: identified_fruits.md - Dependencies: none + Description: Filter a list of items to include only the fruits - 2. Step ID: classify + 2. Step: classify Name: Classify Fruits Description: Organize identified fruits into categories (citrus, tropical, berries, etc.) Input: identified_fruits.md (file from step identify) Output: classified_fruits.md - Dependencies: identify - Please create this job definition now. Do not ask questions - use these exact specifications. + Please create this job now. Do not ask questions. PROMPT_EOF # Verify the job.yml was created @@ -225,30 +235,6 @@ jobs: exit 1 fi - # STEP 2: Use /deepwork_jobs.implement to generate step instructions - - name: Generate step instructions with /deepwork_jobs.implement - if: steps.check-key.outputs.has_key == 'true' - working-directory: test_project - timeout-minutes: 10 - run: | - echo "=== Running /deepwork_jobs.implement to generate step instructions ===" - - claude --yes --print "/deepwork_jobs.implement" <<'PROMPT_EOF' - Please implement the "fruits" job that was just defined. 
- - For the identify step, create instructions that: - - Parse the comma-separated raw_items input - - Identify which items are fruits (apple, banana, orange, mango, grape, etc.) - - Output a markdown file listing the identified fruits - - For the classify step, create instructions that: - - Read identified_fruits.md from the previous step - - Classify fruits into categories: Citrus (orange, lemon), Tropical (banana, mango), Pome (apple, pear), Berries, etc. - - Output a markdown file with fruits organized by category - - Generate the step instruction files now. - PROMPT_EOF - # Verify step files were created echo "=== Checking step files were created ===" if [ -f ".deepwork/jobs/fruits/steps/identify.md" ] && [ -f ".deepwork/jobs/fruits/steps/classify.md" ]; then @@ -264,34 +250,34 @@ jobs: exit 1 fi - # Run sync to generate the slash commands - echo "=== Running deepwork sync to generate commands ===" + # Run sync to generate the skills + echo "=== Running deepwork sync to generate skills ===" cd .. 
uv run deepwork sync --path test_project - echo "=== Checking generated commands ===" - ls -la test_project/.claude/commands/ + echo "=== Checking generated skills ===" + ls -la test_project/.claude/skills/ - if [ -f "test_project/.claude/commands/fruits.identify.md" ] && [ -f "test_project/.claude/commands/fruits.classify.md" ]; then - echo "SUCCESS: Slash commands generated" + if [ -f "test_project/.claude/skills/fruits.identify/SKILL.md" ] && [ -f "test_project/.claude/skills/fruits.classify/SKILL.md" ]; then + echo "SUCCESS: Skills generated" else - echo "ERROR: Slash commands were not generated" + echo "ERROR: Skills were not generated" exit 1 fi - # STEP 3: Execute the generated /fruits.identify command - - name: Run /fruits.identify + # STEP 2: Execute the /fruits workflow (runs all steps automatically) + - name: Run /fruits workflow if: steps.check-key.outputs.has_key == 'true' working-directory: test_project - timeout-minutes: 5 + timeout-minutes: 10 run: | - echo "=== Running /fruits.identify with test input ===" + echo "=== Running /fruits workflow with test input ===" - claude --yes --print "/fruits.identify" <<'PROMPT_EOF' + claude --print "/fruits" <<'PROMPT_EOF' raw_items: apple, car, banana, chair, orange, table, mango, laptop, grape, bicycle PROMPT_EOF - # Verify output was created + # Verify both outputs were created if [ -f "identified_fruits.md" ]; then echo "SUCCESS: identified_fruits.md created" echo "--- Output ---" @@ -301,17 +287,6 @@ jobs: exit 1 fi - # STEP 4: Execute the generated /fruits.classify command - - name: Run /fruits.classify - if: steps.check-key.outputs.has_key == 'true' - working-directory: test_project - timeout-minutes: 5 - run: | - echo "=== Running /fruits.classify ===" - - claude --yes --print "/fruits.classify" - - # Verify output was created if [ -f "classified_fruits.md" ]; then echo "SUCCESS: classified_fruits.md created" echo "--- Output ---" @@ -321,7 +296,7 @@ jobs: exit 1 fi - # STEP 5: Validate the complete 
workflow output + # STEP 3: Validate the complete workflow output - name: Validate complete workflow if: steps.check-key.outputs.has_key == 'true' working-directory: test_project @@ -355,10 +330,8 @@ jobs: echo "==========================================" echo "" echo "Workflow tested:" - echo " 1. /deepwork_jobs.define - Created job specification" - echo " 2. /deepwork_jobs.implement - Generated step instructions" - echo " 3. /fruits.identify - Executed identify step" - echo " 4. /fruits.classify - Executed classify step" + echo " 1. /deepwork_jobs - Created job" + echo " 2. /fruits - Executed full fruits workflow (identify + classify)" echo "" - name: Upload test artifacts @@ -368,7 +341,7 @@ jobs: name: claude-code-e2e-outputs path: | test_project/.deepwork/jobs/fruits/ - test_project/.claude/commands/fruits.*.md + test_project/.claude/skills/fruits*/ test_project/identified_fruits.md test_project/classified_fruits.md retention-days: 7 diff --git a/tests/e2e/test_claude_code_integration.py b/tests/e2e/test_claude_code_integration.py index bba2e8ce..b98fbc28 100644 --- a/tests/e2e/test_claude_code_integration.py +++ b/tests/e2e/test_claude_code_integration.py @@ -1,10 +1,10 @@ """End-to-end tests for DeepWork with Claude Code integration. -These tests validate that DeepWork-generated commands work correctly +These tests validate that DeepWork-generated skills work correctly with Claude Code. The tests can run in two modes: -1. **Generation-only mode** (default): Tests command generation and structure -2. **Full e2e mode**: Actually executes commands with Claude Code +1. **Generation-only mode** (default): Tests skill generation and structure +2. **Full e2e mode**: Actually executes skills with Claude Code Set ANTHROPIC_API_KEY and DEEPWORK_E2E_FULL=true to run full e2e tests. 
""" @@ -55,11 +55,11 @@ def run_full_e2e() -> bool: ) -class TestCommandGenerationE2E: - """End-to-end tests for command generation.""" +class TestSkillGenerationE2E: + """End-to-end tests for skill generation.""" - def test_generate_fruits_commands_in_temp_project(self) -> None: - """Test generating fruits commands in a realistic project structure.""" + def test_generate_fruits_skills_in_temp_project(self) -> None: + """Test generating fruits skills in a realistic project structure.""" with tempfile.TemporaryDirectory() as tmpdir: project_dir = Path(tmpdir) @@ -182,8 +182,8 @@ class TestClaudeCodeExecution: """ @pytest.fixture - def project_with_commands(self) -> Path: - """Create a test project with generated commands.""" + def project_with_skills(self) -> Path: + """Create a test project with generated skills.""" tmpdir = tempfile.mkdtemp() project_dir = Path(tmpdir) @@ -231,99 +231,44 @@ def project_with_commands(self) -> Path: # Cleanup shutil.rmtree(tmpdir, ignore_errors=True) - def test_identify_step_execution(self, project_with_commands: Path) -> None: - """Test executing the identify step with Claude Code.""" - # Run Claude Code with the identify command + def test_fruits_workflow_execution(self, project_with_skills: Path) -> None: + """Test executing the complete fruits workflow with Claude Code. + + Invokes /fruits once, which automatically runs all steps (identify + classify). 
+ """ + # Run Claude Code with the fruits skill - this executes the full workflow result = subprocess.run( - [ - "claude", - "--yes", - "--print", - f"/fruits.identify raw_items: {TEST_INPUT}", - ], - cwd=project_with_commands, + ["claude", "--print", "/fruits"], + input=f"raw_items: {TEST_INPUT}", + cwd=project_with_skills, capture_output=True, text=True, - timeout=120, + timeout=300, # 5 minutes for full workflow ) assert result.returncode == 0, f"Claude Code failed: {result.stderr}" - # Check output file was created - output_file = project_with_commands / "identified_fruits.md" - assert output_file.exists(), "identified_fruits.md was not created" + # Verify identify step output was created + identify_output = project_with_skills / "identified_fruits.md" + assert identify_output.exists(), "identified_fruits.md was not created" - # Validate content - content = output_file.read_text().lower() + # Validate identify output content + identify_content = identify_output.read_text().lower() for fruit in EXPECTED_FRUITS: - assert fruit in content, f"Expected fruit '{fruit}' not found in output" - - def test_classify_step_execution(self, project_with_commands: Path) -> None: - """Test executing the classify step with Claude Code.""" - # First, create the input file (simulate identify step output) - identify_output = project_with_commands / "identified_fruits.md" - identify_output.write_text( - "# Identified Fruits\n\n- apple\n- banana\n- orange\n- mango\n- grape\n" - ) - - # Run Claude Code with the classify command - result = subprocess.run( - ["claude", "--yes", "--print", "/fruits.classify"], - cwd=project_with_commands, - capture_output=True, - text=True, - timeout=120, - ) - - assert result.returncode == 0, f"Claude Code failed: {result.stderr}" + assert fruit in identify_content, ( + f"Expected fruit '{fruit}' not found in identified_fruits.md" + ) - # Check output file was created - output_file = project_with_commands / "classified_fruits.md" - assert 
output_file.exists(), "classified_fruits.md was not created" + # Verify classify step output was created + classify_output = project_with_skills / "classified_fruits.md" + assert classify_output.exists(), "classified_fruits.md was not created" - # Validate content has category structure - content = output_file.read_text().lower() - # Should have at least one category mentioned + # Validate classify output has category structure + classify_content = classify_output.read_text().lower() categories = ["citrus", "tropical", "pome", "berries", "grape"] - has_category = any(cat in content for cat in categories) - assert has_category, f"No fruit categories found in output: {content[:500]}" - - def test_full_workflow_execution(self, project_with_commands: Path) -> None: - """Test executing the complete fruits workflow with Claude Code.""" - # Run identify step - result1 = subprocess.run( - [ - "claude", - "--yes", - "--print", - f"/fruits.identify raw_items: {TEST_INPUT}", - ], - cwd=project_with_commands, - capture_output=True, - text=True, - timeout=120, - ) - assert result1.returncode == 0, f"Identify step failed: {result1.stderr}" - - # Verify identify output exists - identify_output = project_with_commands / "identified_fruits.md" - assert identify_output.exists(), "Identify step did not create output" - - # Run classify step - result2 = subprocess.run( - ["claude", "--yes", "--print", "/fruits.classify"], - cwd=project_with_commands, - capture_output=True, - text=True, - timeout=120, - ) - assert result2.returncode == 0, f"Classify step failed: {result2.stderr}" - - # Verify classify output exists - classify_output = project_with_commands / "classified_fruits.md" - assert classify_output.exists(), "Classify step did not create output" + has_category = any(cat in classify_content for cat in categories) + assert has_category, f"No fruit categories found in output: {classify_content[:500]}" # Validate final output quality - content = classify_output.read_text() - assert 
len(content) > 100, "Output seems too short" - assert "##" in content, "Output lacks markdown structure" + assert len(classify_content) > 100, "Output seems too short" + assert "##" in classify_output.read_text(), "Output lacks markdown structure"