diff --git a/JOURNAL.md b/JOURNAL.md index b59766d..b6dc868 100644 --- a/JOURNAL.md +++ b/JOURNAL.md @@ -1226,3 +1226,117 @@ The Ralph realignment is working! The CLI now: - `npm run typecheck` - `npm test` - `npm run build` + +## 2026-01-28 - CRITICAL FIX: Multi-task plan processing loop + +### Problem +The `ghcralph run --file PLAN.md` command only processed ONE task per invocation, then exited. After successfully completing the first task in a plan file, the CLI would terminate instead of continuing to the remaining tasks. This was a critical bug that broke the core functionality of the CLI. + +**Root Cause**: The `run` command in `src/commands/run.ts` only processed **one task per invocation**. There was no outer loop to continue processing the remaining pending tasks after the first task completed. + +### Fix +Implemented Option A from the remediation plan (`plans/LOOP_MAJOR_BUG_REMEDIATION_PLAN.md`): + +1. **Core Multi-Task Loop** (`src/commands/run.ts`): + - Added outer `while (currentTask)` loop that processes ALL pending tasks + - Creates **fresh AI agent instance** for each task (Ralph pattern core principle) + - Added task-level retry loop with configurable `maxRetriesPerTask` (default: 2) + - Prints final summary with total tasks processed/completed/failed + +2. **New CLI Flag**: + - Added `--pause-between-tasks` flag for strict Ralph mode (human review after each task) + +3. **New Configuration Options** (`src/core/config-schema.ts`): + - `maxRetriesPerTask: number` (default: 2) - retries per task before marking failed + - `autoPush: boolean` (default: false) - auto-push after each task completion + +4. **New CheckpointManager Methods** (`src/core/checkpoint-manager.ts`): + - `createTaskCheckpoint()` - commits after successful task completion + - `createFailureCheckpoint()` - commits after failed task attempt (preserves state for post-mortem) + +5. 
**New GitBranchManager Methods** (`src/core/git-branch-manager.ts`): + - `pushToRemote()` - pushes current branch to remote + - `hasRemote()` - checks if a remote exists + +6. **New ProgressTracker Methods** (`src/core/progress-tracker.ts`): + - `loadPreviousTaskResults()` - loads previous task results for context injection + - `appendTaskResult()` - appends task result to progress file for tracking + +7. **New PlanManager Interface Method** (`src/core/plan-manager.ts`): + - `reload?()` - optional method to reload plan from source (already implemented in LocalMarkdownPlan) + +8. **Prompt Engineering for Honesty** (`src/core/context-builder.ts`): + - Added `HONESTY_GUIDANCE` section to prompt template + - Encourages agents to be honest about failures + - Documents blockers instead of false completion claims + +9. **New STUCK Action** (`src/core/response-parser.ts`, `src/core/action-executor.ts`): + - Added `[ACTION:STUCK]` action type for graceful failure signaling + - Agents can report: attempted actions, blockers, and suggestions + - STUCK triggers retry with fresh agent (benefits from progress documentation) + +10. 
**Utility Function** (`src/utils/shell.ts`): + - Added `waitForKeypress()` for `--pause-between-tasks` mode + +### Files Modified +- `src/commands/run.ts` - Core fix with multi-task loop +- `src/core/config-schema.ts` - New config options +- `src/core/checkpoint-manager.ts` - Task-level checkpoints +- `src/core/git-branch-manager.ts` - Push to remote +- `src/core/progress-tracker.ts` - Multi-task progress tracking +- `src/core/plan-manager.ts` - Optional reload method +- `src/core/context-builder.ts` - Honesty guidance in prompt +- `src/core/response-parser.ts` - STUCK action type +- `src/core/action-executor.ts` - STUCK action handling +- `src/utils/shell.ts` - waitForKeypress utility +- `src/core/config-schema.test.ts` - Updated test for new config keys + +### Validation +- `npm run typecheck` ✅ +- `npm test` ✅ (285 tests passing) +- `npm run build` ✅ + +## 2026-01-28 - Model Compatibility Improvements + +### Context +Following the multi-task loop fix, analyzed the `MODEL_COMPAT_TEST_PLAN.md` to address model compatibility concerns: +1. The `ghcralph init` command had a hardcoded list of 5 models +2. GitHub Copilot CLI actually offers 14+ models +3. The SDK provides `client.listModels()` API for dynamic model discovery +4. No tests existed to validate parsing across different model output styles + +### Changes + +1. **Dynamic Model Listing** (`src/integrations/copilot-agent.ts`): + - Added `listAvailableModels()` instance method - fetches models from existing client + - Added static `fetchAvailableModels()` method - creates temporary client to fetch models + - Re-exported `ModelInfo` type from SDK for consumers + +2. **Dynamic Model Selection in Init** (`src/commands/init.ts`): + - Added `fetchModelOptions()` helper that calls `CopilotAgent.fetchAvailableModels()` + - Updated model selection prompt to use dynamically fetched models + - Falls back to hardcoded list if SDK fetch fails + - Maintains "Custom (enter manually)" option + +3. 
**Model Compatibility Tests** (`src/core/model-compatibility.test.ts`): + - Created parameterized test suite for response parsing across model variations + - Tests CREATE, EDIT, EXECUTE, COMPLETE, and STUCK action parsing + - Documents current parser behavior with different formatting styles + - Tests edge cases: Windows line endings, mixed case action types, malformed blocks + +4. **Updated CopilotAgent Tests** (`src/integrations/copilot-agent.test.ts`): + - Added `mockListModels` for SDK mock + - Added tests for `listAvailableModels()` and `fetchAvailableModels()` + - Tests error handling when SDK fetch fails + +### Files Modified +- `src/integrations/copilot-agent.ts` - listAvailableModels methods +- `src/integrations/index.ts` - Export ModelInfo type +- `src/commands/init.ts` - Dynamic model fetching +- `src/core/model-compatibility.test.ts` - New parameterized tests +- `src/integrations/copilot-agent.test.ts` - listModels tests + +### Validation +- `npm run typecheck` ✅ +- `npm test` ✅ (305 tests passing) +- `npm run build` ✅ diff --git a/README.md b/README.md index 714fbb4..b7096ee 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Run **autonomous, checkpointed coding loops** with GitHub Copilot—designed to - 🌿 **Branch isolation**: works on a `ghcralph/*` branch (never modifies `main`/`master` directly) - 💾 **Automatic checkpoints**: commits after each iteration for easy rollback +- 🔄 **Multi-task processing**: processes ALL tasks in plan files automatically - 🛡️ **Guardrails**: iteration limits, token budgets, timeouts, circuit breaker on repeated failures - 📋 **Flexible plan sources**: GitHub Issues or local Markdown task lists - 💻 **Cross-platform**: Windows, macOS, Linux @@ -76,14 +77,15 @@ This approach prioritizes **safety** (automatic checkpoints, git isolation) and ## Key Features -- 🔄 **Autonomous Loop**: Repeatedly invokes AI agent until task completion +- 🔄 **Multi-Task Loop**: Processes ALL tasks in a plan file automatically with fresh AI 
agent per task - 📋 **Flexible Plan Sources**: GitHub Issues or local Markdown task lists - 🛡️ **Safety First**: Git branch isolation, file deletion safeguards -- 💾 **Automatic Checkpoints**: Git commits after each iteration for easy rollback +- 💾 **Automatic Checkpoints**: Git commits after each iteration and each task completion for easy rollback - 📊 **Progress Tracking**: Real-time status, token usage, and session logs -- ⚡ **Guardrails**: Iteration limits, token budgets, timeout controls +- ⚡ **Guardrails**: Iteration limits, token budgets, timeout controls, task-level retries - 🔧 **Highly Configurable**: Customize behavior via CLI, env vars, or config files - 💻 **Cross-Platform**: Works on Windows, macOS, and Linux +- 🤖 **Dynamic Model Discovery**: Fetches available models from Copilot SDK ## Commands @@ -157,6 +159,9 @@ ghcralph run --github # Control iterations, tokens, and model via configuration # (set maxIterations / maxTokens / defaultModel in .ghcralph/config.json) +# Pause between tasks for human review (strict Ralph mode) +ghcralph run --file PLAN.md --pause-between-tasks + # Specify context files ghcralph run --task "Fix tests" --context "src/**/*.test.ts" @@ -184,19 +189,21 @@ GitHub Copilot Ralph uses a hierarchical configuration system: ### Configuration Options -| Option | Default | Description | -| --------------- | ----------- | ----------------------------------------------------- | -| `planSource` | `local` | Plan source: `github` or `local` | -| `maxIterations` | `10` | Maximum loop iterations | -| `maxTokens` | `100000` | Token budget | -| `defaultModel` | `gpt-4.1` | Copilot model to use | -| `autoCommit` | `true` | Auto-commit after iterations | -| `branchPrefix` | `ghcralph/` | Prefix for GitHub Copilot Ralph branches | -| `githubRepo` | - | GitHub repository (owner/repo) for GitHub plan source | -| `githubLabel` | - | Default GitHub issue label filter for GitHub plan | -| `githubMilestone` | - | Default GitHub issue milestone filter for GitHub plan | -| 
`githubAssignee` | - | Default GitHub issue assignee filter for GitHub plan | -| `localPlanFile` | - | Path to local plan file | +| Option | Default | Description | +| ------------------ | ----------- | ----------------------------------------------------- | +| `planSource` | `local` | Plan source: `github` or `local` | +| `maxIterations` | `10` | Maximum loop iterations per task | +| `maxTokens` | `100000` | Token budget per task | +| `defaultModel` | `gpt-4.1` | Copilot model to use (dynamically fetched from SDK) | +| `autoCommit` | `true` | Auto-commit after iterations | +| `branchPrefix` | `ghcralph/` | Prefix for GitHub Copilot Ralph branches | +| `maxRetriesPerTask`| `2` | Retries per task before marking as failed | +| `autoPush` | `false` | Auto-push to remote after each task completion | +| `githubRepo` | - | GitHub repository (owner/repo) for GitHub plan source | +| `githubLabel` | - | Default GitHub issue label filter for GitHub plan | +| `githubMilestone` | - | Default GitHub issue milestone filter for GitHub plan | +| `githubAssignee` | - | Default GitHub issue assignee filter for GitHub plan | +| `localPlanFile` | - | Path to local plan file | ### Environment Variables @@ -208,6 +215,8 @@ export GHCRALPH_MAX_TOKENS=50000 export GHCRALPH_DEFAULT_MODEL=gpt-4.1 export GHCRALPH_AUTO_COMMIT=true export GHCRALPH_BRANCH_PREFIX=ghcralph/ +export GHCRALPH_MAX_RETRIES_PER_TASK=3 +export GHCRALPH_AUTO_PUSH=true export GHCRALPH_PLAN_SOURCE=local export GHCRALPH_GITHUB_REPO=owner/repo export GHCRALPH_GITHUB_LABEL=ralph-ready @@ -225,6 +234,8 @@ export GHCRALPH_GITHUB_ASSIGNEE=octocat "defaultModel": "gpt-4.1", "autoCommit": true, "branchPrefix": "ghcralph/", + "maxRetriesPerTask": 2, + "autoPush": false, "githubRepo": "owner/repo", "githubLabel": "ralph-ready", "githubMilestone": "v1.0", diff --git a/docs/architecture.md b/docs/architecture.md index 191c1ec..8924417 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -457,6 +457,8 @@ graph LR | 
**Context accumulation** | Model drifts with long context | Conversation history accumulates | ✅ FIXED | | **Complex prompt template** | Meta-info confuses weaker models | Iteration/token counts in prompt | ✅ FIXED | | **Model sensitivity** | Weaker models perform poorly | Prompt relies on implicit understanding | ✅ FIXED | +| **Single task per run** | Only first task processed, then exits | No outer loop for multi-task iteration | ✅ FIXED v0.1.2 | +| **Hardcoded model list** | Init shows outdated model options | Model list not fetched from SDK | ✅ FIXED v0.1.2 | ### Current vs Expected Flow @@ -538,13 +540,14 @@ graph LR The action executor component has been implemented in `src/core/action-executor.ts`: **Supported Actions:** -| Action | Description | Example | -| ---------- | ------------------ | ----------------------------------------------- | -| `CREATE` | Create a new file | `[ACTION:CREATE] path: file.txt` | -| `EDIT` | Edit existing file | `[ACTION:EDIT] path: file.txt [OLD]...[NEW]...` | -| `DELETE` | Delete a file | `[ACTION:DELETE] path: file.txt` | -| `EXECUTE` | Run shell command | `[ACTION:EXECUTE] command: npm test` | -| `COMPLETE` | Mark task done | `[ACTION:COMPLETE] reason: Tests pass` | +| Action | Description | Example | +| ---------- | -------------------------- | ----------------------------------------------- | +| `CREATE` | Create a new file | `[ACTION:CREATE] path: file.txt` | +| `EDIT` | Edit existing file | `[ACTION:EDIT] path: file.txt [OLD]...[NEW]...` | +| `DELETE` | Delete a file | `[ACTION:DELETE] path: file.txt` | +| `EXECUTE` | Run shell command | `[ACTION:EXECUTE] command: npm test` | +| `COMPLETE` | Mark task done | `[ACTION:COMPLETE] reason: Tests pass` | +| `STUCK` | Signal blocked/unable | `[ACTION:STUCK] attempted:... blocker:...` | **Safety Features:** - Path validation (prevents escaping working directory) @@ -552,6 +555,23 @@ The action executor component has been implemented in `src/core/action-executor. 
- Command timeout (30 seconds default) - Dry run mode for testing +### 2.1.1 STUCK Action ✅ NEW in v0.1.2 + +The STUCK action allows the AI agent to signal when it cannot complete a task: + +``` +[ACTION:STUCK] +attempted: What the agent tried to do +blocker: What is preventing completion +suggestion: Optional suggestion for next steps +``` + +**Behavior:** +- STUCK triggers a task retry with a fresh AI agent +- The progress file documents the failed attempt for context +- After `maxRetriesPerTask` (default: 2) STUCKs, the task is marked failed +- Prevents false completion claims - encourages honest failure reporting + ### 2.2 Verification Hooks ✅ IMPLEMENTED The verification hooks component has been implemented in `src/core/verification-hooks.ts`: @@ -898,16 +918,19 @@ graph TB The current architecture successfully: - ✅ Authenticates with GitHub Copilot - ✅ Manages iteration loops with limits and guards +- ✅ **Processes ALL tasks in plan files** (multi-task loop) +- ✅ Creates **fresh AI agent per task** (Ralph pattern core) - ✅ Builds context-rich prompts - ✅ Sends/receives from Copilot SDK - ✅ Tracks progress and tokens - -The current architecture lacks: -- ❌ Structured output format specification -- ❌ Response parsing for file operations -- ❌ Action execution (file create/edit/delete) -- ❌ Command execution for verification -- ❌ Feedback loop to inform AI of results -- ❌ Clear task completion detection - -To work reliably with models like gpt-4.1, the CLI needs to move from a "chat wrapper" to a true "agent executor" that defines explicit action formats, parses responses, executes actions, and provides feedback. +- ✅ Parses structured ACTION responses +- ✅ Executes file and shell actions +- ✅ Supports graceful failure with STUCK action +- ✅ Dynamic model discovery from SDK + +The CLI has evolved from a "chat wrapper" to a true "agent executor" that: +1. Defines explicit action formats (CREATE, EDIT, DELETE, EXECUTE, COMPLETE, STUCK) +2. 
Parses AI responses for structured actions +3. Executes actions on the filesystem +4. Provides feedback to inform subsequent iterations +5. Processes multiple tasks with task-level retries and checkpoints diff --git a/docs/cookbook.md b/docs/cookbook.md index e8275ca..cd9780c 100644 --- a/docs/cookbook.md +++ b/docs/cookbook.md @@ -92,6 +92,32 @@ ghcralph run --task "Implement user authentication with JWT" \ - [ ] Add integration tests ``` +### Multi-Task Processing + +When you run `ghcralph run --file PLAN.md`, Ralph will: + +1. **Process ALL tasks** in the plan file automatically +2. **Create a fresh AI agent** for each task (prevents context pollution) +3. **Retry failed tasks** up to `maxRetriesPerTask` times (default: 2) +4. **Commit after each task** with `createTaskCheckpoint()` +5. **Print a final summary** showing tasks processed/completed/failed + +```bash +# Process all tasks in a plan file +ghcralph run --file TODO.md + +# Pause between tasks for human review (strict Ralph mode) +ghcralph run --file TODO.md --pause-between-tasks +``` + +**Configuration:** +```json +{ + "maxRetriesPerTask": 2, + "autoPush": false +} +``` + --- ## Pattern: Refactoring Session @@ -246,6 +272,28 @@ ghcralph rollback --list ghcralph rollback --iterations 1 ``` +### Task marked as STUCK + +If a task is marked as STUCK (agent signaled it cannot complete): + +```bash +# Check the progress file for details on what was attempted +cat .ghcralph/progress.md + +# The agent will retry with fresh context up to maxRetriesPerTask times +# If all retries fail, review the blocker and consider: +# 1. Breaking the task into smaller pieces +# 2. Providing more context with --context +# 3. 
Resolving the blocker manually and re-running +``` + +**Configure retry behavior:** +```json +{ + "maxRetriesPerTask": 3 +} +``` + ### Token budget exhausted ```bash diff --git a/package-lock.json b/package-lock.json index 58ef21c..6350fb7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "ghcralph", - "version": "0.1.1", + "version": "0.1.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "ghcralph", - "version": "0.1.1", + "version": "0.1.2", "license": "MIT", "dependencies": { "@github/copilot-sdk": "^0.1.17", diff --git a/package.json b/package.json index 93fbda2..2343ae3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "ghcralph", - "version": "0.1.1", + "version": "0.1.2", "description": "GitHub Copilot Ralph - A cross-platform CLI for running autonomous agentic coding loops using the Ralph Wiggum pattern with GitHub Copilot", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/plans/LOOP_MAJOR_BUG_REMEDIATION_PLAN.md b/plans/LOOP_MAJOR_BUG_REMEDIATION_PLAN.md new file mode 100644 index 0000000..52d62c9 --- /dev/null +++ b/plans/LOOP_MAJOR_BUG_REMEDIATION_PLAN.md @@ -0,0 +1,466 @@ +# GHC Ralph CLI - Remediation Plan + +**Date**: January 28, 2026 +**Issue**: CLI gets stuck after 2 iterations, processing only the first task of a multi-task plan +**Severity**: Critical (core functionality broken) +**Reported via**: `ghcralph run --file ./PLAN.md --verbose` in ghc-ralph-cli-demo + +--- + +## Executive Summary + +The `ghcralph` CLI fails to process all tasks in a Markdown plan file. After successfully completing the first task ("Create calculator.sh with basic structure"), the CLI terminates prematurely instead of continuing to the remaining 11 tasks in the plan. 
+ +--- + +## ⚠️ Critical Pattern Analysis (from Original Source) + +After reviewing the original Ralph Wiggum pattern documentation from Geoffrey Huntley: +- [ghuntley.com/ralph](https://ghuntley.com/ralph/) +- [ghuntley.com/loop](https://ghuntley.com/loop/) + +### Key Quotes from Original Pattern + +**From /ralph/**: +> "In its purest form, Ralph is a Bash loop: `while :; do cat PROMPT.md | claude-code ; done`" + +> "Ralph is monolithic. Ralph works autonomously in a single repository as a single process that **performs one task per loop**." + +**From /loop/**: +> "In practice this means doing the loop manually via prompting or via automation **with a pause that involves having to press CTRL+C to progress onto the next task**. This is still ralphing..." + +> "It's important to watch the loop as that is where your personal development and learning will come from." + +### Pattern Interpretation Options + +There are **two valid interpretations** of how the CLI should behave with multi-task plan files: + +| Aspect | Option 1: Strict Ralph | Option 2: Automated Multi-Task | +| -------------- | ---------------------------------------- | --------------------------------- | +| **Philosophy** | Human-in-the-loop per task | Fully automated plan execution | +| **Flow** | One task → Exit → Human reviews → Re-run | All tasks → Continuous until done | +| **Control** | Press CTRL+C to continue | `--continuous` flag opt-in | +| **Learning** | Forces human observation | Async background processing | + +**The original pattern leans toward Option 1** (human pause between tasks), but the CLI's README and `--file PLAN.md` UX suggests **Option 2** is the expected behavior for this tool. + +### Recommended Approach: Hybrid + +Support both modes: +- **Default**: Process all tasks automatically (user expectation for `--file PLAN.md`) +- **`--pause-between-tasks`**: Stop after each task for human review (strict Ralph mode) + +--- + +## Expected Process Flow (User's Requirement) + +``` +1. 
Parse plan file → Extract list of tasks (grouped by phases or not) +2. Get next non-completed task → Delegate to fresh AI agent instance +3. On agent result (success/failure) → Document in progress file, mark task status +4. Check for remaining tasks → If yes, go to step 2; If no, terminate +``` + +--- + +## 1. Investigation Findings + +### 1.1 Observed Behavior + +When running `ghcralph run --file ./PLAN.md`: + +1. ✅ CLI correctly parses PLAN.md and identifies 12 tasks +2. ✅ CLI selects the first pending task: "Create calculator.sh with basic structure" +3. ✅ Loop engine runs 2 iterations successfully +4. ✅ AI creates a basic `calculator.sh` file (incomplete implementation) +5. ✅ AI marks the task as complete with `[ACTION:COMPLETE]` +6. ✅ Task checkbox is updated in PLAN.md (`[x]`) +7. ❌ **CLI terminates instead of processing the next pending task** + +### 1.2 Evidence from Test Run + +**Progress file state** (`.ghcralph/progress.md`): +- Status: ✅ Completed +- Iterations: 2/10 +- Tokens Used: 2,285 +- Only first task processed + +**PLAN.md state**: +- Only 1 of 12 tasks marked complete: `[x] Create calculator.sh with basic structure` +- 11 remaining tasks still pending (unchecked) + +**calculator.sh output**: +- Basic skeleton with argument parsing +- No arithmetic operations implemented +- Does not meet expected outcomes from PLAN.md + +### 1.3 Root Cause Analysis + +**PRIMARY ROOT CAUSE: Missing Task Iteration Loop** + +The `run` command in [src/commands/run.ts](src/commands/run.ts) only processes **one task per invocation**. There is no outer loop to continue processing the remaining pending tasks after the first task completes. + +**Architecture Documentation Gap**: The [docs/architecture.md](docs/architecture.md) sequence diagram shows the flow ending at "Phase 5: Completion" with no loop back to process additional tasks. This suggests the multi-task iteration was **never implemented**, even though the README and user expectations imply it should work. 
+ +**Code Flow Analysis:** + +``` +run.ts (lines 270-280): + 1. planManager.getNextTask() ← Gets first pending task + 2. task = nextTask ← Assigns to single variable + 3. engine.start(task) ← Runs loop for ONE task + 4. planManager.completeTask() ← Marks task complete + 5. EXIT ← No loop to get next task! +``` + +**Key Issue Location**: [run.ts#L476-L480](src/commands/run.ts#L476-L480) + +```typescript +if (finalState.status === 'completed') { + // Mark task as complete in plan file if using a plan + if (planManager) { + await planManager.completeTask(task.id); + info(`Task marked as complete in plan file`); + } + success('Loop completed successfully'); // ← Exits here! +} +``` + +After marking the task complete, the CLI simply exits instead of: +1. Calling `planManager.getNextTask()` to get the next pending task +2. Creating a **fresh AI agent instance** for that task (per Ralph pattern) +3. Starting a new loop for that task +4. Repeating until all tasks are complete + +### 1.4 Secondary Issues Identified + +| Issue | Description | Severity | +| ---------------------------------------- | ----------------------------------------------------------------- | -------- | +| **Incomplete Implementation Quality** | AI marked task complete with only a skeleton implementation | Medium | +| **No Verification Hook Failure** | If tests existed, verification should have failed | Medium | +| **Progress Not Persisted Across Tasks** | Progress file only tracks current task, not overall plan progress | Low | +| **No Task-Level Retry with Fresh Agent** | Failed tasks are marked failed, no retry with learning | Medium | +| **No Honesty Prompt Guidance** | Prompt doesn't encourage agent to be honest about failures | Medium | + +--- + +## 2. Proposed Solutions + +### Option A: Hybrid Mode with `--pause-between-tasks` Flag (Recommended) + +**Description**: Add a multi-task iteration loop that runs by default, with an optional `--pause-between-tasks` flag for strict Ralph pattern adherence. 
Include task-level retry with fresh agent that benefits from progress documentation. + +**Behavior**: +- `ghcralph run --file PLAN.md` → Processes ALL tasks automatically (default) +- `ghcralph run --file PLAN.md --pause-between-tasks` → Stops after each task for human review +- Failed tasks can be retried with a fresh agent up to `maxRetriesPerTask` times (default: 2) +- Each retry creates a fresh agent but includes progress document learnings in context + +**Pros**: +- Meets user expectations for `--file PLAN.md` workflow +- Respects original Ralph pattern philosophy with opt-in pause mode +- Failed tasks get retried with fresh agent + learned context +- Encourages honest failure reporting through prompt engineering +- `PlanManager` interface already supports `getNextTask()` + +**Cons**: +- Additional flag to document +- Need to handle edge cases (interrupts, cumulative budgets) + +**Implementation Sketch**: +```typescript +// In run.ts - after initial task selection +let task: Task | null = await planManager.getNextTask(); +let taskNumber = 0; + +// CheckpointManager is already created earlier and handles per-iteration commits +// We'll add a final "task complete" checkpoint after each task + +while (task) { + taskNumber++; + let taskAttempt = 0; + let taskCompleted = false; + + while (!taskCompleted && taskAttempt < config.maxRetriesPerTask) { + taskAttempt++; + + if (taskAttempt === 1) { + info(`\n📋 Task ${taskNumber}: ${task.title}`); + } else { + info(`\n🔄 Retry ${taskAttempt}/${config.maxRetriesPerTask} for task: ${task.title}`); + } + + // Create FRESH agent instance for each attempt (Ralph pattern core principle) + const agent = new CopilotAgent({ model, maxTokensPerRequest: 4096 }); + + // Build context that includes learnings from progress document + const previousProgress = await progressTracker.loadPreviousTaskResults(); + const engineConfigWithContext = { + ...engineConfig, + contextConfig: { + ...engineConfig.contextConfig, + previousTaskProgress: 
previousProgress, // Inject learnings from prior attempts/tasks + } + }; + + const engine = new LoopEngine(agent, engineConfigWithContext); + + // NOTE: The engine's event handlers already create checkpoints (commits) + // after each successful ITERATION via checkpointManager.createCheckpoint() + + const finalState = await engine.start(task); + + // Document result in progress file (success OR failure - for learning) + await progressTracker.appendTaskResult(task, finalState, taskAttempt); + + if (finalState.status === 'completed') { + taskCompleted = true; + await planManager.completeTask(task.id); + + // Create a "task complete" checkpoint commit + await checkpointManager.createTaskCheckpoint(task, finalState); + + // Push changes to remote (opinionated addition) + if (config.autoPush) { + await gitManager.pushToRemote(); + } + + success(`✓ Task completed: ${task.title}`); + } else { + warn(`✗ Task attempt ${taskAttempt} failed: ${task.title}`); + // On failure, commit progress so far for documentation + await checkpointManager.createFailureCheckpoint(task, finalState, taskAttempt); + } + + // Cleanup agent before next attempt or next task + await agent.destroy(); + } + + if (!taskCompleted) { + // All retries exhausted + await planManager.failTask(task.id); + error(`❌ Task failed after ${config.maxRetriesPerTask} attempts: ${task.title}`); + } + + // Optional pause for human review (strict Ralph mode) + if (options.pauseBetweenTasks) { + info('Press Enter to continue to next task, or Ctrl+C to stop...'); + await waitForKeypress(); + } + + // Get next task + await planManager.reload?.(); + task = await planManager.getNextTask(); +} + +success(`\n🎉 All ${taskNumber} tasks in the plan are complete!`); +``` + +#### Key Behaviors Preserved/Enhanced: + +| Behavior | Current Implementation | After Fix | +| ------------------------------- | --------------------------------------------------------------- | -------------------------------------------------- | +| **Branch 
Isolation** | ✅ Creates `ghcralph/*` branch at run start | ✅ Preserved | +| **Per-Iteration Commits** | ✅ `iterationEnd` event → `checkpointManager.createCheckpoint()` | ✅ Preserved | +| **Per-Task Commits** | ❌ Not implemented | ✅ Add `createTaskCheckpoint()` | +| **Push to Remote** | ❌ Not implemented | ✅ Add `autoPush` config + `pushToRemote()` | +| **Task Retry with Fresh Agent** | ❌ Not implemented | ✅ Add retry loop with new agent per attempt | +| **Learn from Progress Doc** | ⚠️ Only within single agent session | ✅ Load previous task results for new agent context | + +#### Prompt Engineering for Honesty & Graceful Failure + +The prompt template in `src/core/context-builder.ts` should be enhanced to encourage honest reporting and graceful failure handling. Add the following to the prompt: + +```typescript +// Add to DEFAULT_PROMPT_TEMPLATE in context-builder.ts + +const HONESTY_GUIDANCE = ` +## Failure Handling & Honesty + +**IMPORTANT**: Be honest about your progress and limitations. + +- If you **cannot complete** the task, do NOT use [ACTION:COMPLETE] +- Instead, document what you tried and why it failed +- Use [ACTION:EXECUTE] to verify your work before claiming completion +- If tests fail or you encounter blocking issues, report them honestly + +**When you cannot proceed**, respond with: +\`\`\` +[ACTION:STUCK] +attempted: +blocker: +suggestion: +\`\`\` + +This honest reporting helps: +1. The next agent attempt learn from your experience +2. Humans understand what went wrong +3. The progress document serve as accurate documentation +`; +``` + +This guidance will be injected into the prompt, encouraging agents to: +1. **Be honest** about whether the task is truly complete +2. **Document failures** properly for learning +3. **Use verification** (tests) before claiming completion +4. 
**Report blockers** clearly instead of false completion + +--- + +### Option B: Recursive Re-invocation via Child Process + +**Description**: After completing a task, spawn a new `ghcralph run` process for the same plan file. + +**Pros**: +- Clean state between tasks (fresh agent per task automatically) +- Simpler implementation + +**Cons**: +- Overhead of process spawning +- Loss of session context +- Harder to manage cumulative token limits +- Not idiomatic + +--- + +### Option C: Create a Separate `ghcralph run-plan` Command + +**Description**: Create a new command specifically for running entire plan files, keeping `run` for single tasks. + +**Pros**: +- Clear separation of concerns +- Backward compatible + +**Cons**: +- User confusion (which command to use?) +- Code duplication +- Inconsistent with current `--file` behavior expectations + +--- + +## 3. Recommended Solution + +**Option A: Hybrid Mode — task iteration loop in `run.ts` with optional `--pause-between-tasks`** + +This is the most natural fix because: +1. The `PlanManager` interface already has `getNextTask()` and `completeTask()` methods designed for this pattern +2. Users expect `--file PLAN.md` to process all tasks in the plan +3. Minimal disruption to existing codebase +4. Easy to test + +--- + +## 4. 
Implementation Plan + +### Phase 1: Core Fix (Critical) + +| Task | File | Effort | +| ----------------------------------------------- | --------------------------------- | ------ | +| Add outer while loop for task iteration | `src/commands/run.ts` | 2h | +| Add task-level retry loop with fresh agent | `src/commands/run.ts` | 1.5h | +| Add `--pause-between-tasks` flag | `src/commands/run.ts` | 0.5h | +| Add `waitForKeypress()` utility | `src/utils/index.ts` | 0.5h | +| Reset engine state between tasks | `src/core/loop-engine.ts` | 1h | +| Add `reload()` method to LocalMarkdownPlan | `src/core/local-markdown-plan.ts` | 0.5h | +| Update progress tracker for multi-task sessions | `src/core/progress-tracker.ts` | 1h | +| Add `loadPreviousTaskResults()` method | `src/core/progress-tracker.ts` | 1h | +| Add `createTaskCheckpoint()` method | `src/core/checkpoint-manager.ts` | 0.5h | +| Add `createFailureCheckpoint()` method | `src/core/checkpoint-manager.ts` | 0.5h | +| Add `pushToRemote()` method | `src/core/git-branch-manager.ts` | 0.5h | +| Add `autoPush` and `maxRetriesPerTask` config | `src/core/config-schema.ts` | 0.5h | + +### Phase 1b: Prompt Engineering (Honesty & Failure Handling) + +| Task | File | Effort | +| ----------------------------------------- | ----------------------------- | ------ | +| Add `HONESTY_GUIDANCE` to prompt template | `src/core/context-builder.ts` | 0.5h | +| Add `[ACTION:STUCK]` action type | `src/core/response-parser.ts` | 1h | +| Handle STUCK action in action-executor | `src/core/action-executor.ts` | 0.5h | +| Include previous task results in context | `src/core/context-builder.ts` | 1h | + +### Phase 2: Token Budget Management + +| Task | File | Effort | +| ----------------------------------------- | --------------------- | ------ | +| Decide per-task vs cumulative token limit | Design decision | 0.5h | +| Implement token budget carry-over | `src/commands/run.ts` | 1h | +| Add `--per-task-budget` flag | `src/commands/run.ts` | 0.5h | 
+ +### Phase 3: Testing & Verification + +| Task | File | Effort | +| ---------------------------------------- | ------------------------------- | ------ | +| Add integration test for multi-task plan | `test/integration/` | 2h | +| Test task retry with fresh agent | `test/integration/` | 1h | +| Test pause-between-tasks mode | `test/integration/` | 1h | +| Test commit/push after task completion | `test/integration/` | 1h | +| Test progress document learning | `test/integration/` | 1h | +| Test STUCK action handling | `test/integration/` | 1h | +| Test interruption/resume behavior | `test/integration/` | 1h | +| Update documentation | `README.md`, `docs/cookbook.md` | 1h | + +--- + +## 5. Success Criteria + +After implementing this fix: + +### Core Functionality +1. ✅ Running `ghcralph run --file PLAN.md` processes ALL pending tasks sequentially (default) +2. ✅ Running `ghcralph run --file PLAN.md --pause-between-tasks` stops for human review after each task +3. ✅ Each task is delegated to a **fresh AI agent instance** (clean context per task) +4. ✅ Each task is marked complete/failed in PLAN.md after processing +5. ✅ Progress is documented in the progress file after each task (success or failure) +6. ✅ CLI only exits when all tasks are complete OR on error/interrupt +7. ✅ Token budget is properly managed (per-task or cumulative, based on design decision) + +### Task Retry with Fresh Agent & Progress Learning +8. ✅ Failed tasks are retried with a **brand new agent instance** (fresh context window) +9. ✅ Fresh agents **benefit from progress document learnings** (previous task results injected) +10. ✅ Each retry attempt is logged with attempt number in progress document +11. ✅ Failure checkpoints preserve state for post-mortem analysis + +### Prompt Engineering for Honest Failure Reporting +12. ✅ Agent prompt includes **honesty guidance** encouraging accurate result reporting +13. ✅ Agent can use `[ACTION:STUCK]` to gracefully signal inability to complete task +14. 
✅ STUCK action triggers: failure checkpoint, retry with fresh agent, helpful error message +15. ✅ After max retries, task marked as failed with diagnostic information preserved + +### Git Integration (Opinionated Additions - PRESERVED) +16. ✅ **Branch Isolation**: Run starts on isolated `ghcralph/*` branch (existing behavior preserved) +17. ✅ **Per-Iteration Commits**: Checkpoint commit after each successful iteration (existing behavior preserved) +18. ✅ **Per-Task Commits**: Additional checkpoint commit when task is marked complete (new) +19. ✅ **Auto-Push**: Push to remote after each successful task completion (new, via `autoPush` config) + +--- + +## 6. Rollback Plan + +If the fix introduces regressions: +1. Revert the commit +2. Document the issue for further investigation +3. Consider Option C (separate command) as a fallback + +--- + +## 7. References + +### Internal Code + +- [loop-engine.ts](src/core/loop-engine.ts) - Core loop implementation +- [run.ts](src/commands/run.ts) - Run command (fix location) +- [local-markdown-plan.ts](src/core/local-markdown-plan.ts) - Plan manager +- [plan-manager.ts](src/core/plan-manager.ts) - PlanManager interface +- [docs/architecture.md](docs/architecture.md) - Architecture documentation (needs update) + +### External References + +- [ghuntley.com/ralph](https://ghuntley.com/ralph/) - Original Ralph Wiggum pattern description: + > "In its purest form, Ralph is a Bash loop: `while :; do cat PROMPT.md | claude-code ; done`" + > "Ralph works autonomously in a single repository as a single process that performs one task per loop." + +- [ghuntley.com/loop](https://ghuntley.com/loop/) - Loop pattern details: + > "In practice this means doing the loop manually via prompting or via automation with a pause that involves having to press CTRL+C to progress onto the next task." + > "It's important to watch the loop as that is where your personal development and learning will come from." 
+ +- [README.md](README.md) - User-facing documentation of expected behavior diff --git a/plans/MODEL_COMPAT_TEST_PLAN.md b/plans/MODEL_COMPAT_TEST_PLAN.md new file mode 100644 index 0000000..cbf70fb --- /dev/null +++ b/plans/MODEL_COMPAT_TEST_PLAN.md @@ -0,0 +1,50 @@ +# Model Compatibility Test Plan + +Goal: validate that each advertised/default model works end-to-end with `ghcralph run` (no hangs, produces ACTION blocks, can complete the calculator scenario). + +## Scope +- Use the existing realistic scenario: `test/integration/calculator/PLAN.md` +- Run the same task with each model, capturing: + - time-to-first-response (iteration 1) + - whether iteration progresses beyond "Executing prompt" + - whether the loop can reach COMPLETE + - whether process exits cleanly (no external `timeout` kill) + +## Models to Test +- [ ] gpt-4.1 +- [ ] gpt-4 +- [ ] gpt-5 +- [ ] gpt-5.2-codex +- [ ] claude-sonnet-4.5 + +## Recommended Test Setup (per model) +1. Build CLI: + - [ ] `npm run -s build` +2. Reset the calculator directory state (optional but recommended): + - [ ] `cd test/integration/calculator && git checkout -- . && rm -f calculator.sh && rm -rf .ghcralph` +3. Configure model (local config preferred): + - [ ] `cd test/integration/calculator && node ../../../bin/ghcralph.js config set defaultModel <model>` +4. Run the plan with hard timeout and verbose logs: + - [ ] `cd test/integration/calculator && timeout 180s node ../../../bin/ghcralph.js run --file PLAN.md --force --verbose` +5. Validate output: + - [ ] `cd /workspaces/ghc-ralph-cli && npx vitest run --config vitest.integration.config.ts test/integration/calculator/calculator.test.ts` + +## Expected Observations +- Iteration 1 should progress past: + - `Executing prompt (...)` + within a reasonable time (suggested budget: <60s). +- If the loop completes but the process does not exit, record it as "exit hang". +- If the loop never progresses past `Executing prompt`, record it as "request hang".
+ +## Results Table (fill in) +| Model | Iteration 1 response (s) | Completes? | Exits cleanly? | Notes | +| ----------------- | ------------------------ | ---------- | -------------- | ----- | +| gpt-4.1 | | | | | +| gpt-4 | | | | | +| gpt-5 | | | | | +| gpt-5.2-codex | | | | | +| claude-sonnet-4.5 | | | | | + +## Follow-ups +- [ ] If a model hangs only with `sendAndWait`, prefer event-driven wait (session.idle). +- [ ] If a model hangs on exit, investigate Copilot SDK client shutdown (`stop()` vs `forceStop()`) and outstanding async tasks. diff --git a/src/commands/init.ts b/src/commands/init.ts index bfb8f94..f71d862 100644 --- a/src/commands/init.ts +++ b/src/commands/init.ts @@ -10,6 +10,15 @@ import type { Command } from 'commander'; import { info, success, error, warn, debug, heading, code, dim } from '../utils/index.js'; import { ConfigManager } from '../core/config-manager.js'; import type { PlanSource } from '../core/config-schema.js'; +import { CopilotAgent, type ModelInfo } from '../integrations/copilot-agent.js'; + +// Fallback models if SDK fetch fails +const FALLBACK_MODELS = [ + 'gpt-4.1', + 'claude-sonnet-4.5', + 'gpt-5', + 'gpt-5.2-codex', +] as const; /** * Check if we're in a git repository @@ -130,6 +139,37 @@ async function promptSelect( } } +/** + * Fetch available models from the Copilot SDK + * Falls back to hardcoded list if fetch fails + */ +async function fetchModelOptions(_currentDefault: string): Promise> { + debug('Fetching available models from Copilot SDK...'); + + try { + const models = await CopilotAgent.fetchAvailableModels(); + + if (models.length > 0) { + debug(`Found ${models.length} models from SDK`); + const options = models.map((m: ModelInfo) => ({ + label: m.name || m.id, + value: m.id, + })); + // Add custom option at the end + options.push({ label: 'Custom (enter manually)', value: '__custom__' }); + return options; + } + } catch (err) { + debug(`Failed to fetch models: ${err instanceof Error ? 
err.message : String(err)}`); + } + + // Fallback to hardcoded list + debug('Using fallback model list'); + const fallbackOptions: Array<{ label: string; value: string }> = FALLBACK_MODELS.map(m => ({ label: m, value: m })); + fallbackOptions.push({ label: 'Custom (enter manually)', value: '__custom__' }); + return fallbackOptions; +} + export interface InitOptions { force?: boolean; planSource?: PlanSource; @@ -255,21 +295,15 @@ See also: const maxTokens = await promptNumber(rl, 'Max tokens', current.maxTokens); configManager.set('maxTokens', maxTokens); - const modelOptions = [ - { label: 'gpt-4.1', value: 'gpt-4.1' }, - { label: 'gpt-4', value: 'gpt-4' }, - { label: 'gpt-5', value: 'gpt-5' }, - { label: 'gpt-5.2-codex', value: 'gpt-5.2-codex' }, - { label: 'claude-sonnet-4.5', value: 'claude-sonnet-4.5' }, - { label: 'Custom (enter manually)', value: '__custom__' }, - ] as const; + // Fetch available models dynamically from SDK + const modelOptions = await fetchModelOptions(current.defaultModel); const selectedModel = await promptSelect( rl, 'Model', modelOptions, (modelOptions.some(o => o.value === current.defaultModel) - ? (current.defaultModel as (typeof modelOptions)[number]['value']) + ? 
current.defaultModel : '__custom__') ); diff --git a/src/commands/run.ts b/src/commands/run.ts index fafb7c4..53e2777 100644 --- a/src/commands/run.ts +++ b/src/commands/run.ts @@ -9,6 +9,7 @@ import path from 'node:path'; import { execSync } from 'node:child_process'; import type { Command } from 'commander'; import { info, success, error, warn, debug, spinner, heading, code, dim, parseNonNegativeInt } from '../utils/index.js'; +import { waitForKeypress } from '../utils/shell.js'; import { CopilotAgent } from '../integrations/index.js'; import { LoopEngine, @@ -34,6 +35,7 @@ export interface RunOptions { timeout?: string; allowDelete?: boolean; dryRun?: boolean; + pauseBetweenTasks?: boolean; } /** @@ -143,6 +145,7 @@ export function registerRunCommand(program: Command): void { .option('--timeout ', 'Maximum duration in minutes') .option('--allow-delete', 'Allow deletion of pre-existing files') .option('--dry-run', 'Show what would happen without executing') + .option('--pause-between-tasks', 'Pause for human review after each task (strict Ralph mode)') .addHelpText('after', ` Config-backed settings (set via .ghcralph/config.json or GHCRALPH_* env vars): - maxIterations, maxTokens, defaultModel, autoCommit, branchPrefix @@ -382,12 +385,6 @@ See also: console.log(` ${dim('Branch:')} ${code(branchInfo.branchName)}`); } - // Create agent and engine with context configuration - const agent = new CopilotAgent({ - model, - maxTokensPerRequest: 4096, - }); - // Build context config, only including contextGlobs if provided const contextConfig: { contextGlobs?: string[]; @@ -403,14 +400,6 @@ See also: contextConfig.contextGlobs = options.context; } - const engine = new LoopEngine(agent, { - maxIterations, - maxTokens, - maxDurationMinutes, - allowUnlimited: options.unlimited === true, - contextConfig, - }); - // Create progress tracker const progressTracker = new ProgressTracker(undefined, maxIterations); @@ -425,106 +414,267 @@ See also: }); await 
fileSafeguard.initialize(); - // Setup signal handlers for graceful shutdown - const cleanupSignalHandlers = setupSignalHandlers(engine); - - // Setup event listeners - const events = engine.getEvents(); const startTime = new Date(); - - events.on('iterationStart', (iteration, state) => { - debug( - `Iteration ${iteration}/${maxIterations} - Tokens: ${state.tokensUsed.toLocaleString()}` - ); - }); - - events.on('iterationEnd', (record, state) => { - const status = record.success ? '✓' : '✗'; - info( - `Iteration ${record.iteration}: ${status} (${record.tokensUsed.toLocaleString()} tokens)` - ); - if (record.summary) { - console.log(` ${dim(record.summary)}`); - } + let exitCode = 0; + let totalTasksProcessed = 0; + let totalTasksCompleted = 0; + let totalTasksFailed = 0; + const maxRetriesPerTask = config.maxRetriesPerTask ?? 2; + const autoPush = config.autoPush ?? false; + + // ========== MULTI-TASK ITERATION LOOP ========== + // This is the core fix: process ALL tasks in the plan, not just the first one + + let currentTask: Task | null = task; + + while (currentTask) { + // Capture task in a const for this iteration (helps TypeScript narrowing) + const activeTask: Task = currentTask; - // Create checkpoint commit after successful iterations - if (record.success && checkpointManager.isAutoCommitEnabled()) { - checkpointManager - .createCheckpoint(record.iteration, record.summary ?? 
'iteration complete', record.tokensUsed) - .then((checkpoint) => { - if (checkpoint) { - debug(`Checkpoint created: ${checkpoint.commitHash.substring(0, 7)}`); - // Update progress with commit hash - state.lastCheckpoint = checkpoint.commitHash; - } - }) - .catch(() => { - // Ignore checkpoint errors + totalTasksProcessed++; + let taskAttempt = 0; + let taskCompleted = false; + let taskFailed = false; + + info(`\n📋 Task ${totalTasksProcessed}: ${activeTask.title}`); + + // Retry loop for the current task (with fresh agent per attempt) + while (!taskCompleted && !taskFailed && taskAttempt < maxRetriesPerTask) { + taskAttempt++; + + if (taskAttempt > 1) { + info(`\n🔄 Retry ${taskAttempt}/${maxRetriesPerTask} for task: ${activeTask.title}`); + } + + // Create FRESH agent instance for each attempt (Ralph pattern core principle) + const agent = new CopilotAgent({ + model, + maxTokensPerRequest: 4096, + }); + + const engine = new LoopEngine(agent, { + maxIterations, + maxTokens, + maxDurationMinutes, + allowUnlimited: options.unlimited === true, + contextConfig, + }); + + // Setup signal handlers for graceful shutdown + const cleanupSignalHandlers = setupSignalHandlers(engine); + + // Setup event listeners + const events = engine.getEvents(); + + events.on('iterationStart', (iteration, state) => { + debug( + `Iteration ${iteration}/${maxIterations} - Tokens: ${state.tokensUsed.toLocaleString()}` + ); + }); + + events.on('iterationEnd', (record, state) => { + const status = record.success ? '✓' : '✗'; + info( + `Iteration ${record.iteration}: ${status} (${record.tokensUsed.toLocaleString()} tokens)` + ); + if (record.summary) { + console.log(` ${dim(record.summary)}`); + } + + // Create checkpoint commit after successful iterations + if (record.success && checkpointManager.isAutoCommitEnabled()) { + checkpointManager + .createCheckpoint(record.iteration, record.summary ?? 
'iteration complete', record.tokensUsed) + .then((checkpoint) => { + if (checkpoint) { + debug(`Checkpoint created: ${checkpoint.commitHash.substring(0, 7)}`); + // Update progress with commit hash + state.lastCheckpoint = checkpoint.commitHash; + } + }) + .catch(() => { + // Ignore checkpoint errors + }); + } + + // Save progress after each iteration + progressTracker.save(state).catch(() => { + // Ignore save errors }); + }); + + events.on('error', (err) => { + error(`Loop error: ${err.message}`); + }); + + events.on('warning', (type, message) => { + warn(`Warning: ${message}`); + }); + + // Run the loop for this task + const loopSpinner = spinner('Running agentic loop...'); + loopSpinner.start(); + + try { + const finalState = await engine.start(activeTask); + + loopSpinner.stop(); + console.log(''); + + // Print iteration summary + console.log(` ${dim('Status:')} ${finalState.status}`); + console.log(` ${dim('Iterations:')} ${finalState.iteration}/${maxIterations}`); + console.log(` ${dim('Tokens used:')} ${finalState.tokensUsed.toLocaleString()}`); + + const successfulIterations = finalState.iterations.filter((i) => i.success).length; + console.log(` ${dim('Successful iterations:')} ${successfulIterations}`); + console.log(''); + + if (finalState.status === 'completed') { + taskCompleted = true; + + // Mark task as complete in plan file if using a plan + if (planManager) { + await planManager.completeTask(activeTask.id); + info(`Task marked as complete in plan file`); + } + + // Document result in progress file + await progressTracker.appendTaskResult( + activeTask, + 'completed', + taskAttempt, + `Completed in ${finalState.iteration} iterations` + ); + + // Create a task-level checkpoint commit + await checkpointManager.createTaskCheckpoint( + activeTask.title, + activeTask.id, + `Task completed in ${finalState.iteration} iterations` + ); + + // Push to remote if configured + if (autoPush && isGitRepo) { + info('Pushing changes to remote...'); + const pushed 
= await gitManager.pushToRemote(); + if (pushed) { + success('Changes pushed to remote'); + } + } + + success(`✓ Task completed: ${activeTask.title}`); + totalTasksCompleted++; + + } else if (finalState.status === 'stopped') { + // User requested stop - exit the entire run + warn('Loop was stopped by user'); + + // Document the stop + await progressTracker.appendTaskResult( + activeTask, + 'stuck', + taskAttempt, + 'Stopped by user' + ); + + // Exit the multi-task loop + currentTask = null; + continue; + + } else if (finalState.status === 'failed') { + warn(`✗ Task attempt ${taskAttempt} failed: ${activeTask.title}`); + + // Document failure for learning + await progressTracker.appendTaskResult( + activeTask, + 'failed', + taskAttempt, + 'Loop execution failed' + ); + + // Create failure checkpoint + await checkpointManager.createFailureCheckpoint( + activeTask.title, + activeTask.id, + taskAttempt, + 'Loop execution failed' + ); + } + } catch (err) { + loopSpinner.fail('Loop failed'); + const errMsg = err instanceof Error ? 
err.message : String(err); + error(errMsg); + + // Document failure + await progressTracker.appendTaskResult( + activeTask, + 'failed', + taskAttempt, + undefined, + errMsg + ); + + // Create failure checkpoint + await checkpointManager.createFailureCheckpoint( + activeTask.title, + activeTask.id, + taskAttempt, + errMsg + ); + } finally { + cleanupSignalHandlers(); + await agent.destroy(); + } } - // Save progress after each iteration - progressTracker.save(state).catch(() => { - // Ignore save errors - }); - }); - - events.on('error', (err) => { - error(`Loop error: ${err.message}`); - }); - - events.on('warning', (type, message) => { - warn(`Warning: ${message}`); - }); - - // Run the loop - const loopSpinner = spinner('Running agentic loop...'); - loopSpinner.start(); - - let exitCode = 0; - - try { - const finalState = await engine.start(task); - - loopSpinner.stop(); - console.log(''); - - // Print summary - console.log(heading('📊 Summary')); - console.log(''); - console.log(` ${dim('Status:')} ${finalState.status}`); - console.log(` ${dim('Iterations:')} ${finalState.iteration}/${maxIterations}`); - console.log(` ${dim('Tokens used:')} ${finalState.tokensUsed.toLocaleString()}`); - console.log(` ${dim('Elapsed time:')} ${formatElapsedTime(startTime)}`); - - const successfulIterations = finalState.iterations.filter((i) => i.success).length; - console.log(` ${dim('Successful iterations:')} ${successfulIterations}`); - - console.log(''); - - if (finalState.status === 'completed') { - // Mark task as complete in plan file if using a plan - if (planManager) { - await planManager.completeTask(task.id); - info(`Task marked as complete in plan file`); - } - success('Loop completed successfully'); - } else if (finalState.status === 'stopped') { - warn('Loop was stopped by user'); - } else if (finalState.status === 'failed') { + // Check if all retries exhausted without completion + if (!taskCompleted && currentTask) { + taskFailed = true; + totalTasksFailed++; + if 
(planManager) { - await planManager.failTask(task.id); + await planManager.failTask(activeTask.id); } - error('Loop failed'); + + error(`❌ Task failed after ${maxRetriesPerTask} attempts: ${activeTask.title}`); exitCode = 1; } - } catch (err) { - loopSpinner.fail('Loop failed'); - error(err instanceof Error ? err.message : String(err)); - exitCode = 1; - } finally { - cleanupSignalHandlers(); - await agent.destroy(); + + // Optional pause for human review (strict Ralph mode) + if (options.pauseBetweenTasks && planManager && !taskFailed) { + console.log(''); + info('Press any key to continue to next task, or Ctrl+C to stop...'); + await waitForKeypress(); + } + + // Get next task from the plan (if using a plan) + if (planManager) { + // Reload plan file to pick up any external changes + await planManager.reload?.(); + currentTask = await planManager.getNextTask(); + } else { + // Single task mode (--task flag) - exit after one task + currentTask = null; + } + } + + // ========== FINAL SUMMARY ========== + console.log(''); + console.log(heading('📊 Final Summary')); + console.log(''); + console.log(` ${dim('Total tasks processed:')} ${totalTasksProcessed}`); + console.log(` ${dim('Tasks completed:')} ${totalTasksCompleted}`); + console.log(` ${dim('Tasks failed:')} ${totalTasksFailed}`); + console.log(` ${dim('Elapsed time:')} ${formatElapsedTime(startTime)}`); + console.log(''); + + if (totalTasksFailed === 0 && totalTasksCompleted > 0) { + success(`🎉 All ${totalTasksCompleted} tasks completed successfully!`); + } else if (totalTasksCompleted > 0) { + warn(`Completed ${totalTasksCompleted} tasks, ${totalTasksFailed} failed`); + } else if (totalTasksProcessed === 0) { + success('No pending tasks found - all tasks are complete!'); } if (exitCode !== 0) { diff --git a/src/core/action-executor.ts b/src/core/action-executor.ts index 208d30e..ef2945e 100644 --- a/src/core/action-executor.ts +++ b/src/core/action-executor.ts @@ -17,6 +17,7 @@ import type { DeleteAction, 
ExecuteAction, CompleteAction, + StuckAction, ParseResult, } from './response-parser.js'; @@ -50,6 +51,14 @@ export interface ExecutionResult { taskComplete: boolean; /** Completion reason if task is complete */ completionReason?: string; + /** Whether a STUCK action was found */ + taskStuck: boolean; + /** Stuck details if task is stuck */ + stuckDetails?: { + attempted: string; + blocker: string; + suggestion?: string; + }; /** Summary of executed actions */ summary: string; } @@ -93,6 +102,8 @@ export class ActionExecutor { const results: ActionResult[] = []; let taskComplete = false; let completionReason: string | undefined; + let taskStuck = false; + let stuckDetails: { attempted: string; blocker: string; suggestion?: string } | undefined; for (const action of parseResult.actions) { const result = await this.executeAction(action); @@ -103,8 +114,20 @@ export class ActionExecutor { completionReason = (action as CompleteAction).reason; } - // Stop on first failure (except for COMPLETE which is informational) - if (!result.success && action.type !== 'COMPLETE') { + if (action.type === 'STUCK' && result.success) { + taskStuck = true; + const stuckAction = action as StuckAction; + stuckDetails = { + attempted: stuckAction.attempted, + blocker: stuckAction.blocker, + }; + if (stuckAction.suggestion) { + stuckDetails.suggestion = stuckAction.suggestion; + } + } + + // Stop on first failure (except for COMPLETE/STUCK which are informational) + if (!result.success && action.type !== 'COMPLETE' && action.type !== 'STUCK') { warn(`Action failed: ${result.error}`); // Continue with remaining actions? 
For now, we'll continue } @@ -117,6 +140,7 @@ export class ActionExecutor { results, allSucceeded, taskComplete, + taskStuck, summary, }; @@ -124,6 +148,10 @@ export class ActionExecutor { executionResult.completionReason = completionReason; } + if (stuckDetails) { + executionResult.stuckDetails = stuckDetails; + } + return executionResult; } @@ -153,6 +181,8 @@ export class ActionExecutor { return await this.executeCommand(action); case 'COMPLETE': return this.executeComplete(action); + case 'STUCK': + return this.executeStuck(action); default: return { action, @@ -353,6 +383,22 @@ export class ActionExecutor { }; } + /** + * Execute a STUCK action (signals inability to proceed) + */ + private executeStuck(action: StuckAction): ActionResult { + warn(`Agent stuck: ${action.blocker}`); + info(` Attempted: ${action.attempted}`); + if (action.suggestion) { + info(` Suggestion: ${action.suggestion}`); + } + return { + action, + success: true, // STUCK is a valid action, not a failure + message: `Agent stuck - blocker: ${action.blocker}`, + }; + } + /** * Build a human-readable summary of executed actions */ diff --git a/src/core/checkpoint-manager.ts b/src/core/checkpoint-manager.ts index 99c5564..409dd77 100644 --- a/src/core/checkpoint-manager.ts +++ b/src/core/checkpoint-manager.ts @@ -284,6 +284,136 @@ export class CheckpointManager { return this.hardRollbackTo(initialCommit); } + + /** + * Create a task completion checkpoint commit + */ + async createTaskCheckpoint( + taskTitle: string, + taskId: string, + summary: string + ): Promise { + if (!this.config.autoCommit) { + debug('Auto-commit disabled, skipping task checkpoint'); + return null; + } + + // Check if there are changes to commit + const hasChanges = await this.hasChangesToCommit(); + if (!hasChanges) { + debug('No changes to commit for task checkpoint'); + return null; + } + + // Get list of modified files before staging + const filesModified = await this.getModifiedFiles(); + + // Stage all changes + 
const staged = await this.stageAllChanges(); + if (!staged) { + return null; + } + + // Build commit message for task completion + const truncatedTitle = taskTitle.length > 40 + ? taskTitle.substring(0, 37) + '...' + : taskTitle; + + const message = `${this.config.messagePrefix} task complete - ${truncatedTitle}`; + const fullMessage = `${message}\n\nTask ID: ${taskId}\nSummary: ${summary}\nFiles modified: ${filesModified.length}`; + + try { + // Create commit + await execAsync(`git commit -m "${fullMessage.replace(/"/g, '\\"')}"`, { cwd: this.config.cwd }); + + // Get commit hash + const { stdout } = await execAsync('git rev-parse HEAD', { cwd: this.config.cwd }); + const commitHash = stdout.trim(); + + const checkpoint: Checkpoint = { + iteration: 0, // Task-level checkpoint, not iteration-level + commitHash, + message, + filesModified, + timestamp: new Date(), + tokensUsed: 0, + }; + + this.checkpoints.push(checkpoint); + debug(`Created task checkpoint: ${commitHash.substring(0, 7)} - ${message}`); + + return checkpoint; + } catch (err) { + warn('Failed to create task checkpoint commit: ' + (err instanceof Error ? 
err.message : String(err))); + return null; + } + } + + /** + * Create a failure checkpoint commit (preserves state for post-mortem) + */ + async createFailureCheckpoint( + taskTitle: string, + taskId: string, + attempt: number, + error?: string + ): Promise { + if (!this.config.autoCommit) { + debug('Auto-commit disabled, skipping failure checkpoint'); + return null; + } + + // Check if there are changes to commit + const hasChanges = await this.hasChangesToCommit(); + if (!hasChanges) { + debug('No changes to commit for failure checkpoint'); + return null; + } + + // Get list of modified files before staging + const filesModified = await this.getModifiedFiles(); + + // Stage all changes + const staged = await this.stageAllChanges(); + if (!staged) { + return null; + } + + // Build commit message for task failure + const truncatedTitle = taskTitle.length > 30 + ? taskTitle.substring(0, 27) + '...' + : taskTitle; + + const message = `${this.config.messagePrefix} task failed (attempt ${attempt}) - ${truncatedTitle}`; + const errorInfo = error ? `\nError: ${error.substring(0, 200)}` : ''; + const fullMessage = `${message}\n\nTask ID: ${taskId}\nAttempt: ${attempt}${errorInfo}\nFiles modified: ${filesModified.length}`; + + try { + // Create commit + await execAsync(`git commit -m "${fullMessage.replace(/"/g, '\\"')}"`, { cwd: this.config.cwd }); + + // Get commit hash + const { stdout } = await execAsync('git rev-parse HEAD', { cwd: this.config.cwd }); + const commitHash = stdout.trim(); + + const checkpoint: Checkpoint = { + iteration: 0, // Task-level checkpoint + commitHash, + message, + filesModified, + timestamp: new Date(), + tokensUsed: 0, + }; + + this.checkpoints.push(checkpoint); + debug(`Created failure checkpoint: ${commitHash.substring(0, 7)} - ${message}`); + + return checkpoint; + } catch (err) { + warn('Failed to create failure checkpoint commit: ' + (err instanceof Error ? 
err.message : String(err))); + return null; + } + } } /** diff --git a/src/core/config-schema.test.ts b/src/core/config-schema.test.ts index aaa0c3f..1790fa6 100644 --- a/src/core/config-schema.test.ts +++ b/src/core/config-schema.test.ts @@ -115,6 +115,8 @@ describe('Config Schema', () => { 'localPlanFile', 'promptTemplate', 'mcpServers', + 'maxRetriesPerTask', + 'autoPush', ]; expect(CONFIG_KEYS).toEqual(expectedKeys); }); diff --git a/src/core/config-schema.ts b/src/core/config-schema.ts index f17c48a..1fe8b6a 100644 --- a/src/core/config-schema.ts +++ b/src/core/config-schema.ts @@ -55,6 +55,10 @@ export interface RalphConfiguration { promptTemplate?: string; /** MCP servers for custom tools */ mcpServers?: MCPServerConfiguration[]; + /** Maximum retries per task before marking as failed (default: 2) */ + maxRetriesPerTask?: number; + /** Whether to auto-push after each task completion (default: false) */ + autoPush?: boolean; } /** @@ -67,6 +71,8 @@ export const DEFAULT_CONFIG: RalphConfiguration = { defaultModel: 'gpt-4.1', autoCommit: true, branchPrefix: 'ghcralph/', + maxRetriesPerTask: 2, + autoPush: false, }; /** @@ -86,6 +92,8 @@ export const CONFIG_KEYS = [ 'localPlanFile', 'promptTemplate', 'mcpServers', + 'maxRetriesPerTask', + 'autoPush', ] as const; export type ConfigKey = (typeof CONFIG_KEYS)[number]; @@ -117,8 +125,14 @@ export function validateConfigValue( } break; case 'autoCommit': + case 'autoPush': if (typeof value !== 'boolean') { - return { valid: false, error: 'autoCommit must be a boolean' }; + return { valid: false, error: `${key} must be a boolean` }; + } + break; + case 'maxRetriesPerTask': + if (typeof value !== 'number' || value < 1) { + return { valid: false, error: 'maxRetriesPerTask must be a positive number' }; } break; case 'defaultModel': @@ -143,8 +157,10 @@ export function parseConfigValue(key: ConfigKey, value: string): unknown { switch (key) { case 'maxIterations': case 'maxTokens': + case 'maxRetriesPerTask': return 
parseInt(value, 10); case 'autoCommit': + case 'autoPush': return value.toLowerCase() === 'true'; default: return value; diff --git a/src/core/context-builder.ts b/src/core/context-builder.ts index b170f1c..4278e99 100644 --- a/src/core/context-builder.ts +++ b/src/core/context-builder.ts @@ -33,11 +33,38 @@ const DEFAULT_PROMPT_TEMPLATE = `You are an expert software engineer. Your task {output_format} +{honesty_guidance} + ## Instructions - Make small, focused changes - Test your changes with [ACTION:EXECUTE] - Use [ACTION:COMPLETE] when tests pass and task is done`; +/** + * Honesty guidance section to encourage accurate reporting and graceful failure handling + */ +const HONESTY_GUIDANCE = `## Failure Handling & Honesty + +**IMPORTANT**: Be honest about your progress and limitations. + +- If you **cannot complete** the task, do NOT use [ACTION:COMPLETE] +- Instead, document what you tried and why it failed +- Use [ACTION:EXECUTE] to verify your work before claiming completion +- If tests fail or you encounter blocking issues, report them honestly + +**When you cannot proceed**, respond with: +\`\`\` +[ACTION:STUCK] +attempted: +blocker: +suggestion: +\`\`\` + +This honest reporting helps: +1. The next agent attempt learn from your experience +2. Humans understand what went wrong +3. The progress document serve as accurate documentation`; + /** * Legacy prompt template with meta info (for backwards compatibility) */ @@ -242,7 +269,8 @@ export class ContextBuilder { .replace('{context_section}', contextSection) .replace('{previous_progress}', previousProgress) .replace('{feedback_section}', feedbackSection ?? 
'') - .replace('{output_format}', outputFormat); + .replace('{output_format}', outputFormat) + .replace('{honesty_guidance}', HONESTY_GUIDANCE); // Clean up multiple consecutive newlines prompt = prompt.replace(/\n{3,}/g, '\n\n').trim(); diff --git a/src/core/git-branch-manager.ts b/src/core/git-branch-manager.ts index 9036e1d..0ac7e54 100644 --- a/src/core/git-branch-manager.ts +++ b/src/core/git-branch-manager.ts @@ -381,6 +381,34 @@ export class GitBranchManager { return 0; } } + + /** + * Push current branch to remote + */ + async pushToRemote(remote: string = 'origin', force: boolean = false): Promise { + try { + const currentBranch = await this.getCurrentBranch(); + const forceFlag = force ? '--force-with-lease' : ''; + await execAsync(`git push ${forceFlag} ${remote} ${currentBranch.name}`.trim(), { cwd: this.config.cwd }); + debug(`Pushed ${currentBranch.name} to ${remote}`); + return true; + } catch (err) { + warn('Failed to push to remote: ' + (err instanceof Error ? err.message : String(err))); + return false; + } + } + + /** + * Check if a remote exists + */ + async hasRemote(remote: string = 'origin'): Promise { + try { + await execAsync(`git remote get-url ${remote}`, { cwd: this.config.cwd }); + return true; + } catch { + return false; + } + } } /** diff --git a/src/core/model-compatibility.test.ts b/src/core/model-compatibility.test.ts new file mode 100644 index 0000000..15f859d --- /dev/null +++ b/src/core/model-compatibility.test.ts @@ -0,0 +1,327 @@ +/** + * Model Compatibility Tests + * + * Parameterized tests that validate response parsing across + * different model output styles. Different models may format + * ACTION blocks slightly differently, and these tests ensure + * our parser handles common variations. + * + * These tests document the CURRENT parser behavior. Any failing + * tests indicate the parser doesn't support that variation. 
+ */ + +import { describe, it, expect } from 'vitest'; +import { parseResponse, type ParsedAction } from './response-parser.js'; + +/** + * Simulated model output variations + * Each entry represents a different model's style of formatting ACTION blocks + */ +interface ModelOutputVariation { + /** Model name for test description */ + model: string; + /** The response content from the model */ + response: string; + /** Expected action type */ + expectedType: string; + /** Expected path (for CREATE/EDIT actions) */ + expectedPath?: string; + /** Whether parsing should succeed */ + shouldSucceed: boolean; +} + +const MODEL_CREATE_VARIATIONS: ModelOutputVariation[] = [ + { + model: 'gpt-4.1 (standard format)', + response: `[ACTION:CREATE] +path: src/hello.ts +\`\`\`typescript +export const hello = "world"; +\`\`\``, + expectedType: 'CREATE', + expectedPath: 'src/hello.ts', + shouldSucceed: true, + }, + { + model: 'claude-sonnet-4.5 (extra whitespace)', + response: `[ACTION:CREATE] +path: src/hello.ts +\`\`\`typescript +export const hello = "world"; +\`\`\``, + expectedType: 'CREATE', + expectedPath: 'src/hello.ts', + shouldSucceed: true, + }, + { + model: 'gpt-5 (lowercase action)', + response: `[action:create] +path: src/hello.ts +\`\`\`typescript +export const hello = "world"; +\`\`\``, + expectedType: 'CREATE', + expectedPath: 'src/hello.ts', + shouldSucceed: true, + }, + { + model: 'gemini (no language hint)', + response: `[ACTION:CREATE] +path: src/hello.ts +\`\`\` +export const hello = "world"; +\`\`\``, + expectedType: 'CREATE', + expectedPath: 'src/hello.ts', + shouldSucceed: true, + }, + { + model: 'gpt-4-turbo (with preamble)', + response: `Sure, I'll create that file for you. 
+ +[ACTION:CREATE] +path: src/hello.ts +\`\`\`typescript +export const hello = "world"; +\`\`\` + +This creates a simple TypeScript module.`, + expectedType: 'CREATE', + expectedPath: 'src/hello.ts', + shouldSucceed: true, + }, +]; + +const MODEL_EDIT_VARIATIONS: ModelOutputVariation[] = [ + { + model: 'gpt-4.1 (standard format)', + response: `[ACTION:EDIT] +path: src/index.ts +[OLD] +const x = 1; +[NEW] +const x = 42;`, + expectedType: 'EDIT', + expectedPath: 'src/index.ts', + shouldSucceed: true, + }, + { + model: 'gpt-5 (extra blank lines)', + response: `[ACTION:EDIT] +path: src/index.ts + +[OLD] +const x = 1; + +[NEW] +const x = 42;`, + expectedType: 'EDIT', + expectedPath: 'src/index.ts', + shouldSucceed: true, + }, +]; + +const MODEL_EXECUTE_VARIATIONS: ModelOutputVariation[] = [ + { + model: 'gpt-4.1 (command field format)', + response: `[ACTION:EXECUTE] +command: npm test`, + expectedType: 'EXECUTE', + shouldSucceed: true, + }, + { + model: 'claude-sonnet-4.5 (multiline command)', + response: `[ACTION:EXECUTE] +command: npm run build && npm test`, + expectedType: 'EXECUTE', + shouldSucceed: true, + }, +]; + +const MODEL_COMPLETE_VARIATIONS: ModelOutputVariation[] = [ + { + model: 'gpt-4.1 (with reason)', + response: `[ACTION:COMPLETE] +reason: All tests pass, task complete.`, + expectedType: 'COMPLETE', + shouldSucceed: true, + }, + { + model: 'claude-sonnet-4.5 (simple reason)', + response: `[ACTION:COMPLETE] +reason: Successfully implemented the feature.`, + expectedType: 'COMPLETE', + shouldSucceed: true, + }, +]; + +const MODEL_STUCK_VARIATIONS: ModelOutputVariation[] = [ + { + model: 'gpt-4.1 (full stuck format)', + response: `[ACTION:STUCK] +attempted: Tried to access external API +blocker: API credentials not available +suggestion: Please provide API credentials`, + expectedType: 'STUCK', + shouldSucceed: true, + }, + { + model: 'claude-sonnet-4.5 (no suggestion)', + response: `[ACTION:STUCK] +attempted: Tried to write files +blocker: File system is 
read-only`, + expectedType: 'STUCK', + shouldSucceed: true, + }, +]; + +describe('Model Compatibility - Response Parsing', () => { + describe('CREATE action variations', () => { + it.each(MODEL_CREATE_VARIATIONS)( + 'parses $model correctly', + ({ response, expectedType, expectedPath, shouldSucceed }) => { + const result = parseResponse(response); + + if (shouldSucceed) { + expect(result.hasActions).toBe(true); + expect(result.actions).toHaveLength(1); + expect(result.actions[0].type).toBe(expectedType); + if (expectedPath) { + expect((result.actions[0] as ParsedAction & { path: string }).path).toBe(expectedPath); + } + } else { + expect(result.hasActions).toBe(false); + } + } + ); + }); + + describe('EDIT action variations', () => { + it.each(MODEL_EDIT_VARIATIONS)( + 'parses $model correctly', + ({ response, expectedType, expectedPath, shouldSucceed }) => { + const result = parseResponse(response); + + if (shouldSucceed) { + expect(result.hasActions).toBe(true); + expect(result.actions).toHaveLength(1); + expect(result.actions[0].type).toBe(expectedType); + if (expectedPath) { + expect((result.actions[0] as ParsedAction & { path: string }).path).toBe(expectedPath); + } + } else { + expect(result.hasActions).toBe(false); + } + } + ); + }); + + describe('EXECUTE action variations', () => { + it.each(MODEL_EXECUTE_VARIATIONS)( + 'parses $model correctly', + ({ response, expectedType, shouldSucceed }) => { + const result = parseResponse(response); + + if (shouldSucceed) { + expect(result.hasActions).toBe(true); + expect(result.actions).toHaveLength(1); + expect(result.actions[0].type).toBe(expectedType); + } else { + expect(result.hasActions).toBe(false); + } + } + ); + }); + + describe('COMPLETE action variations', () => { + it.each(MODEL_COMPLETE_VARIATIONS)( + 'parses $model correctly', + ({ response, expectedType, shouldSucceed }) => { + const result = parseResponse(response); + + if (shouldSucceed) { + expect(result.hasActions).toBe(true); + 
expect(result.actions).toHaveLength(1); + expect(result.actions[0].type).toBe(expectedType); + } else { + expect(result.hasActions).toBe(false); + } + } + ); + }); + + describe('STUCK action variations', () => { + it.each(MODEL_STUCK_VARIATIONS)( + 'parses $model correctly', + ({ response, expectedType, shouldSucceed }) => { + const result = parseResponse(response); + + if (shouldSucceed) { + expect(result.hasActions).toBe(true); + expect(result.actions).toHaveLength(1); + expect(result.actions[0].type).toBe(expectedType); + } else { + expect(result.hasActions).toBe(false); + } + } + ); + }); + + describe('Multiple actions in single response', () => { + it('parses multiple actions from verbose model output', () => { + const response = `I'll implement this in two steps. + +First, let me create the file: + +[ACTION:CREATE] +path: src/feature.ts +\`\`\`typescript +export function feature() { return true; } +\`\`\` + +Now let me run the tests: + +[ACTION:EXECUTE] +command: npm test`; + + const result = parseResponse(response); + + expect(result.hasActions).toBe(true); + expect(result.actions).toHaveLength(2); + expect(result.actions[0].type).toBe('CREATE'); + expect(result.actions[1].type).toBe('EXECUTE'); + }); + }); + + describe('Edge cases across models', () => { + it('handles Windows-style line endings', () => { + const response = '[ACTION:CREATE]\r\npath: src/hello.ts\r\n```typescript\r\nconst x = 1;\r\n```'; + const result = parseResponse(response); + + expect(result.hasActions).toBe(true); + expect(result.actions[0].type).toBe('CREATE'); + }); + + it('handles mixed case action types', () => { + const response = `[Action:Create] +path: src/hello.ts +\`\`\`typescript +const x = 1; +\`\`\``; + const result = parseResponse(response); + + expect(result.hasActions).toBe(true); + expect(result.actions[0].type).toBe('CREATE'); + }); + + it('rejects malformed action blocks gracefully', () => { + const response = `[ACTION:UNKNOWN_TYPE] +some content`; + const result = 
parseResponse(response); + + // Should either have no valid actions or have errors + expect(result.errors.length).toBeGreaterThan(0); + }); + }); +}); + diff --git a/src/core/plan-manager.ts b/src/core/plan-manager.ts index cc11b93..d7283ea 100644 --- a/src/core/plan-manager.ts +++ b/src/core/plan-manager.ts @@ -69,4 +69,9 @@ export interface PlanManager { * Update task progress */ updateProgress(id: string, progress: string): Promise; + + /** + * Reload the plan from source (optional, for refreshing state) + */ + reload?(): Promise; } diff --git a/src/core/progress-tracker.ts b/src/core/progress-tracker.ts index 146291e..1b4658e 100644 --- a/src/core/progress-tracker.ts +++ b/src/core/progress-tracker.ts @@ -253,4 +253,94 @@ export class ProgressTracker { // File doesn't exist } } + + /** + * Load previous task results from progress file for context injection. + * Returns a formatted summary of previous task attempts. + */ + async loadPreviousTaskResults(): Promise { + const filePath = this.getProgressFilePath(); + + try { + const content = await fs.readFile(filePath, 'utf-8'); + + // Extract iteration log section if it exists + const iterationLogMatch = content.match(/### Iteration Log\n\n([\s\S]*?)(?=\n## |$)/); + const taskResultsMatch = content.match(/### Task Results\n\n([\s\S]*?)(?=\n## |$)/); + + const parts: string[] = []; + + if (iterationLogMatch?.[1]) { + parts.push('## Previous Iteration Progress\n' + iterationLogMatch[1].trim()); + } + + if (taskResultsMatch?.[1]) { + parts.push('## Previous Task Results\n' + taskResultsMatch[1].trim()); + } + + return parts.join('\n\n'); + } catch { + return ''; + } + } + + /** + * Append a task result to the progress file. + * Used to track multi-task execution progress. 
+ */ + async appendTaskResult( + task: { id: string; title: string }, + status: 'completed' | 'failed' | 'stuck', + attempt: number, + summary?: string, + error?: string + ): Promise { + const filePath = this.getProgressFilePath(); + const dir = path.dirname(filePath); + + await fs.mkdir(dir, { recursive: true }); + + const timestamp = new Date().toISOString(); + const statusEmoji = status === 'completed' ? '✅' : status === 'stuck' ? '🔄' : '❌'; + + let entry = `\n#### ${statusEmoji} Task: ${task.title}\n\n`; + entry += `- **ID**: ${task.id}\n`; + entry += `- **Status**: ${status}\n`; + entry += `- **Attempt**: ${attempt}\n`; + entry += `- **Timestamp**: ${timestamp}\n`; + + if (summary) { + entry += `- **Summary**: ${summary}\n`; + } + + if (error) { + entry += `- **Error**: ${error}\n`; + } + + entry += '\n'; + + try { + // Check if file exists + let content = ''; + try { + content = await fs.readFile(filePath, 'utf-8'); + } catch { + // File doesn't exist, create with header + content = '# Ralph Progress Log\n\n## Task Results\n'; + } + + // If Task Results section doesn't exist, add it + if (!content.includes('## Task Results')) { + content += '\n## Task Results\n'; + } + + // Append the entry at the end + content += entry; + + await fs.writeFile(filePath, content, 'utf-8'); + debug(`Appended task result for ${task.id} to ${filePath}`); + } catch (err) { + debug(`Failed to append task result: ${err instanceof Error ? 
err.message : String(err)}`); + } + } } diff --git a/src/core/response-parser.ts b/src/core/response-parser.ts index 2fa4d80..29e40e7 100644 --- a/src/core/response-parser.ts +++ b/src/core/response-parser.ts @@ -9,7 +9,7 @@ /** * Action types that the AI can request */ -export type ActionType = 'CREATE' | 'EDIT' | 'DELETE' | 'EXECUTE' | 'COMPLETE'; +export type ActionType = 'CREATE' | 'EDIT' | 'DELETE' | 'EXECUTE' | 'COMPLETE' | 'STUCK'; /** * Base action interface @@ -62,10 +62,20 @@ export interface CompleteAction extends BaseAction { reason: string; } +/** + * Signal that the agent is stuck and cannot proceed + */ +export interface StuckAction extends BaseAction { + type: 'STUCK'; + attempted: string; + blocker: string; + suggestion?: string; +} + /** * Union type of all actions */ -export type Action = CreateAction | EditAction | DeleteAction | ExecuteAction | CompleteAction; +export type Action = CreateAction | EditAction | DeleteAction | ExecuteAction | CompleteAction | StuckAction; /** * Result of parsing a response @@ -84,7 +94,7 @@ export interface ParseResult { /** * Action block pattern - matches [ACTION:TYPE] blocks */ -const ACTION_BLOCK_PATTERN = /\[ACTION:(CREATE|EDIT|DELETE|EXECUTE|COMPLETE)\]([\s\S]*?)(?=\[ACTION:|$)/gi; +const ACTION_BLOCK_PATTERN = /\[ACTION:(CREATE|EDIT|DELETE|EXECUTE|COMPLETE|STUCK)\]([\s\S]*?)(?=\[ACTION:|$)/gi; /** * Parse an AI response to extract structured actions @@ -152,6 +162,8 @@ function parseActionBody(type: ActionType, body: string, raw: string): Action | return parseExecuteAction(body, raw); case 'COMPLETE': return parseCompleteAction(body, raw); + case 'STUCK': + return parseStuckAction(body, raw); default: return null; } @@ -284,6 +296,43 @@ function parseCompleteAction(body: string, raw: string): CompleteAction { }; } +/** + * Parse a STUCK action + * Expected format: + * attempted: what was tried + * blocker: what is preventing completion + * suggestion: optional suggestion for next steps + */ +function 
parseStuckAction(body: string, raw: string): StuckAction { + const attemptedMatch = body.match(/^attempted:\s*(.+)$/m); + if (!attemptedMatch?.[1]) { + throw new Error('Missing attempted field'); + } + const attempted = attemptedMatch[1].trim(); + + const blockerMatch = body.match(/^blocker:\s*(.+)$/m); + if (!blockerMatch?.[1]) { + throw new Error('Missing blocker field'); + } + const blocker = blockerMatch[1].trim(); + + const suggestionMatch = body.match(/^suggestion:\s*(.+)$/m); + const suggestion = suggestionMatch?.[1]?.trim(); + + const result: StuckAction = { + type: 'STUCK', + attempted, + blocker, + raw, + }; + + if (suggestion) { + result.suggestion = suggestion; + } + + return result; +} + /** * Check if a response contains a COMPLETE action */ @@ -291,6 +340,13 @@ export function hasCompleteAction(result: ParseResult): boolean { return result.actions.some((a) => a.type === 'COMPLETE'); } +/** + * Check if a response contains a STUCK action + */ +export function hasStuckAction(result: ParseResult): boolean { + return result.actions.some((a) => a.type === 'STUCK'); +} + /** * Get the COMPLETE action if present */ @@ -299,6 +355,14 @@ export function getCompleteAction(result: ParseResult): CompleteAction | null { return action?.type === 'COMPLETE' ? action : null; } +/** + * Get the STUCK action if present + */ +export function getStuckAction(result: ParseResult): StuckAction | null { + const action = result.actions.find((a) => a.type === 'STUCK'); + return action?.type === 'STUCK' ? 
action : null; +} + /** * Filter actions by type */ diff --git a/src/integrations/copilot-agent.test.ts b/src/integrations/copilot-agent.test.ts index d3f5f00..ee569c1 100644 --- a/src/integrations/copilot-agent.test.ts +++ b/src/integrations/copilot-agent.test.ts @@ -5,12 +5,15 @@ let mockSendAndWait: | ((args: { prompt: string }) => Promise<{ type: string; data: { content?: string } } | undefined>) | null = null; +let mockListModels: (() => Promise>) | null = null; + vi.mock( '@github/copilot-sdk', (): { CopilotClient: new () => { start: () => Promise; stop: () => Promise; + listModels: () => Promise>; createSession: () => Promise<{ sendAndWait: ( args: { prompt: string }, @@ -24,6 +27,12 @@ vi.mock( CopilotClient: class { async start(): Promise {} async stop(): Promise {} + async listModels(): Promise> { + if (mockListModels) { + return await mockListModels(); + } + return []; + } async createSession(): Promise<{ sendAndWait: ( args: { prompt: string }, @@ -60,6 +69,7 @@ vi.mock( describe('CopilotAgent', () => { beforeEach((): void => { mockSendAndWait = null; + mockListModels = null; }); afterEach((): void => { @@ -97,4 +107,43 @@ describe('CopilotAgent', () => { expect(result.success).toBe(false); expect(result.error).toContain('Boom'); }); + + describe('listAvailableModels', () => { + it('returns models from SDK when available', async (): Promise => { + mockListModels = async (): Promise> => [ + { id: 'gpt-4.1', name: 'GPT-4.1', capabilities: { supports: { vision: false } } }, + { id: 'claude-sonnet-4.5', name: 'Claude Sonnet 4.5', capabilities: { supports: { vision: true } } }, + ]; + + const models = await CopilotAgent.fetchAvailableModels(); + + expect(models).toHaveLength(2); + expect(models[0].id).toBe('gpt-4.1'); + expect(models[1].id).toBe('claude-sonnet-4.5'); + }); + + it('returns empty array when SDK fetch fails', async (): Promise => { + mockListModels = async (): Promise> => { + throw new Error('Network error'); + }; + + const models = await 
CopilotAgent.fetchAvailableModels(); + + expect(models).toEqual([]); + }); + + it('instance method returns models from existing client', async (): Promise => { + const agent = new CopilotAgent({ model: 'gpt-4.1' }); + await agent.initialize(); + + mockListModels = async (): Promise> => [ + { id: 'gpt-5', name: 'GPT-5', capabilities: { supports: { vision: true } } }, + ]; + + const models = await agent.listAvailableModels(); + + expect(models).toHaveLength(1); + expect(models[0].id).toBe('gpt-5'); + }); + }); }); diff --git a/src/integrations/copilot-agent.ts b/src/integrations/copilot-agent.ts index 6beea23..a897b18 100644 --- a/src/integrations/copilot-agent.ts +++ b/src/integrations/copilot-agent.ts @@ -5,11 +5,14 @@ * Uses the @github/copilot-sdk for actual Copilot API access */ -import { CopilotClient, type CopilotSession } from '@github/copilot-sdk'; +import { CopilotClient, type CopilotSession, type ModelInfo } from '@github/copilot-sdk'; import { debug, error as logError, info, warn } from '../utils/index.js'; import { getGitHubAuth, type AuthResult } from './auth.js'; import { TokenTracker, estimateTokens, type TokenUsage } from './tokens.js'; +// Re-export ModelInfo for consumers +export type { ModelInfo } from '@github/copilot-sdk'; + /** * Available Copilot models */ @@ -274,4 +277,61 @@ export class CopilotAgent { getConfig(): CopilotAgentConfig { return { ...this.config }; } + + /** + * List available models from the Copilot API + * Returns models with their capabilities and metadata + */ + async listAvailableModels(): Promise { + // Create a temporary client if not initialized + if (!this.client) { + const tempClient = new CopilotClient({ + autoStart: true, + logLevel: 'error', + }); + + try { + await tempClient.start(); + const models = await tempClient.listModels(); + await tempClient.stop(); + return models; + } catch (err) { + const errorMsg = err instanceof Error ? 
err.message : String(err); + warn(`Failed to list models: ${errorMsg}`); + await tempClient.stop().catch(() => {}); + return []; + } + } + + // Use existing client + try { + return await this.client.listModels(); + } catch (err) { + const errorMsg = err instanceof Error ? err.message : String(err); + warn(`Failed to list models: ${errorMsg}`); + return []; + } + } + + /** + * Static method to list available models without requiring agent initialization + */ + static async fetchAvailableModels(): Promise { + const tempClient = new CopilotClient({ + autoStart: true, + logLevel: 'error', + }); + + try { + await tempClient.start(); + const models = await tempClient.listModels(); + await tempClient.stop(); + return models; + } catch (err) { + const errorMsg = err instanceof Error ? err.message : String(err); + warn(`Failed to fetch models: ${errorMsg}`); + await tempClient.stop().catch(() => {}); + return []; + } + } } diff --git a/src/integrations/index.ts b/src/integrations/index.ts index 0adfa0e..71884f0 100644 --- a/src/integrations/index.ts +++ b/src/integrations/index.ts @@ -9,7 +9,7 @@ */ export { CopilotAgent, CopilotError } from './copilot-agent.js'; -export type { CopilotModel, CopilotAgentConfig, ExecutionResult } from './copilot-agent.js'; +export type { CopilotModel, CopilotAgentConfig, ExecutionResult, ModelInfo } from './copilot-agent.js'; export { getGitHubAuth, isAuthenticated } from './auth.js'; export type { AuthResult } from './auth.js'; diff --git a/src/utils/shell.ts b/src/utils/shell.ts index 715939d..06055a9 100644 --- a/src/utils/shell.ts +++ b/src/utils/shell.ts @@ -20,6 +20,30 @@ export interface ShellInfo { isWindows: boolean; } +/** + * Wait for any keypress from stdin. + * Used for --pause-between-tasks mode (strict Ralph pattern). 
+ */ +export async function waitForKeypress(): Promise { + return new Promise((resolve) => { + // Check if stdin is a TTY + if (!process.stdin.isTTY) { + // Non-interactive mode - just continue + resolve(); + return; + } + + const wasRaw = process.stdin.isRaw; + process.stdin.setRawMode(true); + process.stdin.resume(); + process.stdin.once('data', () => { + process.stdin.setRawMode(wasRaw); + process.stdin.pause(); + resolve(); + }); + }); +} + /** * Detect the current shell environment */