From 807e5e4da666abd75b864cbb47e5049c7dc17286 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Thu, 12 Feb 2026 23:41:12 -0500 Subject: [PATCH 01/21] feat: add Docker containerization and parallel multi-agent execution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add parallel mode that runs N containerized Claude Code agents simultaneously against the same PRD, with network sandboxing, resource limits, and git-based story claiming. New directories: - docker/ — Dockerfile, container entrypoint, iptables firewall scripts - parallel/ — orchestrator, stop, status, parallel prompt, lib helpers Upstream ralph.sh and all existing files are untouched. Co-Authored-By: Claude Opus 4.6 --- .gitignore | 5 + AGENTS.md | 11 + README.md | 56 ++++ docker/Dockerfile | 41 +++ docker/agent-loop.sh | 326 +++++++++++++++++++++++ docker/init-firewall-builder.sh | 53 ++++ docker/init-firewall-researcher.sh | 9 + parallel/CLAUDE-parallel.md | 153 +++++++++++ parallel/README.md | 148 +++++++++++ parallel/lib/auth.sh | 77 ++++++ parallel/lib/docker-helpers.sh | 88 +++++++ parallel/lib/logging.sh | 16 ++ parallel/lib/network-setup.sh | 42 +++ parallel/ralph-parallel.sh | 409 +++++++++++++++++++++++++++++ parallel/status.sh | 137 ++++++++++ parallel/stop.sh | 60 +++++ 16 files changed, 1631 insertions(+) create mode 100644 docker/Dockerfile create mode 100755 docker/agent-loop.sh create mode 100755 docker/init-firewall-builder.sh create mode 100755 docker/init-firewall-researcher.sh create mode 100644 parallel/CLAUDE-parallel.md create mode 100644 parallel/README.md create mode 100644 parallel/lib/auth.sh create mode 100644 parallel/lib/docker-helpers.sh create mode 100644 parallel/lib/logging.sh create mode 100644 parallel/lib/network-setup.sh create mode 100755 parallel/ralph-parallel.sh create mode 100755 parallel/status.sh create mode 100755 parallel/stop.sh diff --git a/.gitignore b/.gitignore index f583b7e5..9e76d3f4 100644 --- a/.gitignore 
+++ b/.gitignore @@ -11,3 +11,8 @@ progress.txt #Claude .claude/ + +# Parallel mode state +.ralph/ +agent_logs/ +progress-agent-*.txt diff --git a/AGENTS.md b/AGENTS.md index 9da9ecd1..0f00dcea 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -45,3 +45,14 @@ npm run dev - Memory persists via git history, `progress.txt`, and `prd.json` - Stories should be small enough to complete in one context window - Always update AGENTS.md with discovered patterns for future iterations + +## Parallel Mode + +Ralph supports running multiple agents in parallel via Docker containers. See `parallel/README.md` for details. + +- Parallel scripts live in `parallel/` — orchestrator, status, stop +- Docker image and container entrypoint live in `docker/` +- Agents claim stories via `claimed_by` field in prd.json using git atomic push +- Each agent writes to its own `progress-.txt` to avoid merge conflicts +- Builder agents have restricted network access (Claude API + npm only) +- Researcher agents have full internet access diff --git a/README.md b/README.md index d79d8b62..dffac24d 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,8 @@ Ralph will: | `skills/ralph/` | Skill for converting PRDs to JSON (works with Amp and Claude Code) | | `.claude-plugin/` | Plugin manifest for Claude Code marketplace discovery | | `flowchart/` | Interactive visualization of how Ralph works | +| `docker/` | Dockerfile and container scripts for parallel mode | +| `parallel/` | Parallel mode orchestrator, status, and stop scripts | ## Flowchart @@ -232,6 +234,60 @@ After copying `prompt.md` (for Amp) or `CLAUDE.md` (for Claude Code) to your pro Ralph automatically archives previous runs when you start a new feature (different `branchName`). Archives are saved to `archive/YYYY-MM-DD-feature-name/`. +## Parallel Mode (Docker) + +Ralph includes a parallel mode that runs N containerized Claude Code agents simultaneously against the same PRD. 
Each agent runs in a Docker container with: + +- **Network restrictions** — builder agents can only reach Claude API and npm registry +- **Resource limits** — configurable memory and CPU caps per container +- **Story claiming** — agents claim stories via git atomic push to avoid duplicate work +- **Automatic recovery** — stale claims are cleared, crashed containers are restarted + +### Prerequisites (Parallel Mode) + +- Docker installed and running +- A Claude Code auth token (env var, file, or 1Password) +- `jq` installed + +### Quick Start (Parallel Mode) + +```bash +# Set your Claude auth token +export RALPH_CLAUDE_TOKEN='' + +# Run 3 agents in parallel +./parallel/ralph-parallel.sh --agents 3 + +# Check status +./parallel/status.sh + +# Graceful shutdown +./parallel/stop.sh +``` + +### Options + +```bash +./parallel/ralph-parallel.sh \ + --agents 3 \ # number of builder agents (default: 2) + --model claude-sonnet-4-5-20250929 \ # model (default: sonnet) + --memory 4g \ # per-container memory limit + --cpus 2 \ # per-container CPU limit + --researcher 1 \ # researcher agents with full internet access + [max_iterations] # per-agent iteration cap (default: 0 = until PRD complete) +``` + +### Auth Token + +Priority order (first wins): +1. `RALPH_CLAUDE_TOKEN` environment variable +2. `.ralph/token` file in the project directory +3. 1Password via `op read` (interactive, startup only) + +To refresh the token mid-run without stopping, write a new token to `.ralph/token_refresh`. The orchestrator picks it up within 30 seconds and restarts all containers. + +See [parallel/README.md](parallel/README.md) for full documentation. 
+
 ## References
 
 - [Geoffrey Huntley's Ralph article](https://ghuntley.com/ralph/)
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 00000000..0a25a72f
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,47 @@
+# syntax=docker/dockerfile:1
+FROM node:20-slim
+
+# Version of the Claude Code CLI to install. Defaults to "latest" (the
+# previous behaviour); pass --build-arg CLAUDE_CODE_VERSION=x.y.z for
+# reproducible images.
+ARG CLAUDE_CODE_VERSION=latest
+
+# System deps for networking, git, and general tooling
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    dnsutils \
+    git \
+    iproute2 \
+    ipset \
+    iptables \
+    jq \
+    sudo \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install claude-code globally; drop the npm cache in the same layer so it
+# does not end up in the image.
+RUN npm install -g "@anthropic-ai/claude-code@${CLAUDE_CODE_VERSION}" \
+    && npm cache clean --force
+
+# Create the non-root agent user with UID 1000. The official node images
+# already ship a "node" user with UID/GID 1000, so a plain
+# `useradd -u 1000` aborts with "UID 1000 is not unique"; remove that user
+# first. Also prepare the clone workspace and the Claude config directory.
+RUN if id node >/dev/null 2>&1; then userdel --remove node; fi \
+    && useradd -m -s /bin/bash -u 1000 agent \
+    && mkdir -p /workspace /home/agent/.claude \
+    && chown agent:agent /workspace /home/agent/.claude
+
+# Allow agent to run firewall init scripts via sudo (no password)
+RUN echo "agent ALL=(root) NOPASSWD: /opt/ralph/init-firewall-builder.sh, /opt/ralph/init-firewall-researcher.sh" \
+    > /etc/sudoers.d/agent-firewall && chmod 0440 /etc/sudoers.d/agent-firewall
+
+# Copy scripts (root-owned so the agent cannot edit what sudo runs) and mark
+# them executable in the same layer instead of a follow-up RUN chmod.
+COPY --chmod=0755 agent-loop.sh init-firewall-builder.sh init-firewall-researcher.sh /opt/ralph/
+
+USER agent
+WORKDIR /workspace
+
+ENTRYPOINT ["/opt/ralph/agent-loop.sh"]
diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh
new file mode 100755
index 00000000..edecb723
--- /dev/null
+++ b/docker/agent-loop.sh
@@ -0,0 +1,373 @@
+#!/usr/bin/env bash
+set -euo pipefail
+#
+# agent-loop.sh — Container entrypoint for Ralph parallel agents.
+#
+# Clones from a bind-mounted project directory, claims stories from prd.json
+# via git atomic push, runs Claude Code per iteration, and pushes results.
+#
+# Expected environment variables:
+#   AGENT_ID                - Unique agent identifier (e.g., "agent-1")
+#   AGENT_ROLE              - One of: builder, researcher
+#   CLAUDE_CODE_OAUTH_TOKEN - OAuth token JSON for Claude authentication
+#   MAX_ITERATIONS          - Max loop iterations (0 = infinite, default: 0)
+#   CLAUDE_MODEL            - Model to use (default: claude-sonnet-4-5-20250929)
+#
+
+AGENT_ID="${AGENT_ID:?AGENT_ID is required}"
+AGENT_ROLE="${AGENT_ROLE:?AGENT_ROLE is required}"
+CLAUDE_CODE_OAUTH_TOKEN="${CLAUDE_CODE_OAUTH_TOKEN:?CLAUDE_CODE_OAUTH_TOKEN is required}"
+MAX_ITERATIONS="${MAX_ITERATIONS:-0}"
+CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-5-20250929}"
+
+REPO_PATH="/repo.git"
+WORKSPACE="/workspace"
+PROMPT_DIR="/parallel-prompt"
+PROMPT_FILE="$PROMPT_DIR/CLAUDE-parallel.md"
+STOP_FILE="/harness-state/stop_requested"
+LOG_DIR="/agent-logs"
+ITERATION=0
+
+echo "[$AGENT_ID] Starting agent loop (role=$AGENT_ROLE, model=$CLAUDE_MODEL, max_iterations=$MAX_ITERATIONS)"
+
+# --- Step 1: Initialize firewall based on role ---
+echo "[$AGENT_ID] Initializing firewall for role: $AGENT_ROLE"
+case "$AGENT_ROLE" in
+  researcher)
+    sudo /opt/ralph/init-firewall-researcher.sh
+    ;;
+  *)
+    sudo /opt/ralph/init-firewall-builder.sh
+    ;;
+esac
+
+# --- Step 2: Write Claude auth credentials ---
+# Created under umask 077 so the token file is never readable by other users,
+# not even between creation and the chmod below.
+echo "[$AGENT_ID] Configuring Claude authentication"
+mkdir -p ~/.claude
+(umask 077 && printf '%s\n' "$CLAUDE_CODE_OAUTH_TOKEN" > ~/.claude/.credentials.json)
+chmod 600 ~/.claude/.credentials.json
+export CLAUDE_CODE_OAUTH_TOKEN
+
+# --- Step 3: Clone or update workspace from project mount ---
+setup_workspace() {
+  if [ -d "$WORKSPACE/.git" ]; then
+    echo "[$AGENT_ID] Fetching latest changes"
+    cd "$WORKSPACE"
+    git fetch origin
+    git reset --hard origin/main 2>/dev/null || git reset --hard origin/master 2>/dev/null || true
+  else
+    echo "[$AGENT_ID] Cloning bare repo into workspace"
+    git clone "$REPO_PATH" "$WORKSPACE"
+    cd "$WORKSPACE"
+  fi
+}
+
+# --- Step 4: Set git identity ---
+setup_git_identity() {
+  git config user.name "$AGENT_ID"
+  git config user.email "${AGENT_ID}@ralph-agent.local"
+  git config pull.rebase true
+}
+
+# --- Step 5: Check out the correct branch from prd.json ---
+# Returns 1 when the workspace has no prd.json.
+checkout_prd_branch() {
+  if [ ! -f "$WORKSPACE/prd.json" ]; then
+    echo "[$AGENT_ID] WARNING: No prd.json found in workspace"
+    return 1
+  fi
+
+  local branch_name
+  branch_name=$(jq -r '.branchName // empty' "$WORKSPACE/prd.json" 2>/dev/null || echo "")
+
+  if [ -z "$branch_name" ]; then
+    echo "[$AGENT_ID] No branchName in prd.json, staying on current branch"
+    return 0
+  fi
+
+  local current_branch
+  current_branch=$(git branch --show-current 2>/dev/null || echo "")
+
+  if [ "$current_branch" = "$branch_name" ]; then
+    echo "[$AGENT_ID] Already on branch: $branch_name"
+    return 0
+  fi
+
+  echo "[$AGENT_ID] Checking out branch: $branch_name"
+  if git show-ref --verify --quiet "refs/heads/$branch_name" 2>/dev/null; then
+    git checkout "$branch_name"
+  elif git show-ref --verify --quiet "refs/remotes/origin/$branch_name" 2>/dev/null; then
+    git checkout -b "$branch_name" "origin/$branch_name"
+  else
+    git checkout -b "$branch_name"
+  fi
+}
+
+# Bring the current branch up to date with origin (best effort).
+# Local commits are rebased onto the remote branch; if that fails, local state
+# is discarded in favour of the remote branch. A branch that does not exist
+# on origin yet is left alone. All git output goes to stderr so callers that
+# capture stdout (see claim_story) only receive what they print themselves.
+sync_with_remote() {
+  local branch
+  branch=$(git branch --show-current 2>/dev/null || echo "")
+  if [ -z "$branch" ]; then
+    return 0
+  fi
+
+  git fetch origin >&2 || return 1
+  if ! git show-ref --verify --quiet "refs/remotes/origin/$branch"; then
+    return 0
+  fi
+
+  if ! git pull --rebase origin "$branch" >&2; then
+    git rebase --abort >/dev/null 2>&1 || true
+    git reset --hard "origin/$branch" >&2
+  fi
+}
+
+# --- Step 6: Claim a story in prd.json ---
+# Returns 0 and prints story ID if claimed, returns 1 if no stories available.
+# stdout carries ONLY the story ID (the caller captures it), so every git
+# command in here sends its output to stderr.
+claim_story() {
+  cd "$WORKSPACE"
+
+  local branch
+  branch=$(git branch --show-current 2>/dev/null || echo "")
+  if [ -z "$branch" ]; then
+    echo "[$AGENT_ID] Not on a branch; cannot claim" >&2
+    return 1
+  fi
+
+  # Pull latest prd.json
+  sync_with_remote || true
+
+  if [ ! -f prd.json ]; then
+    echo "[$AGENT_ID] No prd.json found" >&2
+    return 1
+  fi
+
+  # Find highest-priority unclaimed story (passes: false AND no claimed_by)
+  local story_id
+  story_id=$(jq -r '
+    .userStories
+    | map(select(.passes == false and (.claimed_by == null or .claimed_by == "")))
+    | sort_by(.priority)
+    | first
+    | .id // empty
+  ' prd.json 2>/dev/null || echo "")
+
+  if [ -z "$story_id" ]; then
+    echo "[$AGENT_ID] No unclaimed stories available" >&2
+    return 1
+  fi
+
+  # Claim it by setting claimed_by and claimed_at
+  local timestamp
+  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+
+  jq --arg agent "$AGENT_ID" --arg ts "$timestamp" --arg sid "$story_id" '
+    .userStories |= map(
+      if .id == $sid then
+        .claimed_by = $agent | .claimed_at = $ts
+      else . end
+    )
+  ' prd.json > prd.json.tmp && mv prd.json.tmp prd.json
+
+  git add prd.json
+  git commit -m "[$AGENT_ID] Claim: $story_id" >&2 || {
+    echo "[$AGENT_ID] Failed to commit claim for $story_id" >&2
+    # Restore index and working tree; a plain `checkout -- prd.json` would
+    # re-read the already staged claim.
+    git checkout HEAD -- prd.json 2>/dev/null || true
+    return 1
+  }
+
+  # Atomic push — if this fails, another agent claimed something concurrently.
+  # The branch is named explicitly (and -u records the upstream) so this also
+  # works for a branch created locally that has no upstream yet.
+  if git push -u origin "$branch" >&2; then
+    echo "$story_id"
+    return 0
+  else
+    echo "[$AGENT_ID] Push failed (concurrent claim). Resetting and retrying..." >&2
+    git reset --hard HEAD~1 >&2
+    sync_with_remote || true
+    return 1
+  fi
+}
+
+# --- Step 7: Check if all stories are complete ---
+all_stories_complete() {
+  if [ ! -f "$WORKSPACE/prd.json" ]; then
+    return 1
+  fi
+
+  local incomplete
+  incomplete=$(jq '[.userStories[] | select(.passes == false)] | length' "$WORKSPACE/prd.json" 2>/dev/null || echo "1")
+  [ "$incomplete" -eq 0 ]
+}
+
+# --- Step 8: Push changes with retry ---
+# Returns 0 once a push succeeds. Returns 1 after three failed attempts, or as
+# soon as a rebase conflicts (local commits are then dropped by the reset).
+push_with_retry() {
+  local max_attempts=3
+  local attempt=0
+  local branch
+  branch=$(git branch --show-current 2>/dev/null || echo "main")
+
+  while [ $attempt -lt $max_attempts ]; do
+    if git push origin "$branch" 2>&1; then
+      echo "[$AGENT_ID] Push successful."
+      return 0
+    fi
+    attempt=$((attempt + 1))
+    echo "[$AGENT_ID] Push failed (attempt $attempt/$max_attempts). Rebasing..."
+    git pull --rebase origin "$branch" 2>&1 || {
+      echo "[$AGENT_ID] Rebase conflict. Aborting rebase and resetting."
+      git rebase --abort 2>/dev/null || true
+      git fetch origin
+      git reset --hard "origin/$branch"
+      return 1
+    }
+  done
+
+  echo "[$AGENT_ID] Failed to push after $max_attempts attempts."
+  return 1
+}
+
+# --- Step 9: Prepare the prompt with agent identity ---
+# Prints the prompt with every {{AGENT_ID}} placeholder replaced. Errors go to
+# stderr because the caller captures stdout as the prompt text.
+prepare_prompt() {
+  if [ ! -f "$PROMPT_FILE" ]; then
+    echo "[$AGENT_ID] ERROR: Prompt file not found at $PROMPT_FILE" >&2
+    return 1
+  fi
+
+  # Inject agent identity into prompt. Plain string substitution (instead of
+  # sed) so characters such as "/" or "&" in AGENT_ID cannot break it.
+  local placeholder='{{AGENT_ID}}'
+  local prompt
+  prompt=$(<"$PROMPT_FILE")
+  printf '%s\n' "${prompt//"$placeholder"/"$AGENT_ID"}"
+}
+
+# --- Main loop ---
+setup_workspace
+setup_git_identity
+checkout_prd_branch
+
+echo "[$AGENT_ID] Entering main loop"
+
+while true; do
+  # Check stop signal
+  if [ -f "$STOP_FILE" ]; then
+    echo "[$AGENT_ID] Stop requested. Exiting gracefully."
+    exit 0
+  fi
+
+  # Check iteration limit
+  if [ "$MAX_ITERATIONS" -gt 0 ] && [ "$ITERATION" -ge "$MAX_ITERATIONS" ]; then
+    echo "[$AGENT_ID] Reached max iterations ($MAX_ITERATIONS). Exiting."
+    exit 0
+  fi
+
+  # Check if all stories are done
+  if all_stories_complete; then
+    echo "[$AGENT_ID] All stories complete. Exiting."
+    exit 0
+  fi
+
+  ITERATION=$((ITERATION + 1))
+  COMMIT=$(git rev-parse --short=6 HEAD 2>/dev/null || echo "000000")
+  LOGFILE="${LOG_DIR}/${AGENT_ID}_iter${ITERATION}_${COMMIT}.log"
+  TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+
+  echo "[$AGENT_ID] === Iteration $ITERATION (commit: $COMMIT) at $TIMESTAMP ==="
+
+  # Attempt to claim a story (retry up to 3 times with different stories)
+  CLAIMED_STORY=""
+  CLAIM_ATTEMPTS=0
+  while [ $CLAIM_ATTEMPTS -lt 3 ] && [ -z "$CLAIMED_STORY" ]; do
+    CLAIMED_STORY=$(claim_story) && break || true
+    CLAIM_ATTEMPTS=$((CLAIM_ATTEMPTS + 1))
+    sleep 2
+  done
+
+  if [ -z "$CLAIMED_STORY" ]; then
+    echo "[$AGENT_ID] Could not claim any story. Checking if all complete..."
+    if all_stories_complete; then
+      echo "[$AGENT_ID] All stories complete. Exiting."
+      exit 0
+    fi
+    echo "[$AGENT_ID] Stories exist but couldn't claim. Waiting 30s..."
+    sleep 30
+    continue
+  fi
+
+  echo "[$AGENT_ID] Claimed story: $CLAIMED_STORY"
+
+  # Prepare prompt
+  PROMPT=$(prepare_prompt) || {
+    echo "[$AGENT_ID] Failed to prepare prompt. Sleeping 10s..."
+    sleep 10
+    continue
+  }
+
+  # The story was already claimed above; tell Claude which one, otherwise the
+  # prompt's claim protocol makes it claim a second story and the first claim
+  # is left without anyone working on it.
+  PROMPT+=$'\n\n'"## Assigned Story"$'\n\n'"The harness already claimed story $CLAIMED_STORY for you (claimed_by is $AGENT_ID in prd.json). Work on that story and do not claim another one."
+
+  # Run Claude
+  echo "[$AGENT_ID] Running Claude (model: $CLAUDE_MODEL) for story: $CLAIMED_STORY"
+  claude --dangerously-skip-permissions \
+    --print \
+    --model "$CLAUDE_MODEL" \
+    -p "$PROMPT" \
+    &> "$LOGFILE" || {
+    echo "[$AGENT_ID] Claude exited with error (code: $?). Check log: $LOGFILE"
+  }
+
+  echo "[$AGENT_ID] Claude session complete. Pushing changes..."
+
+  # Write per-agent progress before committing so that, if the file is
+  # tracked, the entry is part of this iteration's commit rather than an
+  # uncommitted change left behind in the working tree.
+  {
+    echo "## $TIMESTAMP - $CLAIMED_STORY (Iteration $ITERATION)"
+    echo "- Agent: $AGENT_ID"
+    echo "- Commit: $COMMIT"
+    echo "---"
+  } >> "$WORKSPACE/progress-${AGENT_ID}.txt"
+
+  # Stage and commit anything Claude left uncommitted. `git add -A` runs
+  # unconditionally: `git diff --quiet` does not see untracked files, so
+  # gating on it would skip commits that only add new files.
+  git add -A
+  if ! git diff --cached --quiet; then
+    git commit -m "[$AGENT_ID] Iteration $ITERATION: $CLAIMED_STORY" || true
+  fi
+
+  # Push with retry. A failed push must not abort the script (set -e), so the
+  # failure is reported and the loop moves on to the next iteration.
+  push_with_retry || echo "[$AGENT_ID] WARNING: could not push changes for $CLAIMED_STORY"
+
+  # Check for completion sentinel in output
+  if [ -f "$LOGFILE" ] && grep -q "COMPLETE" "$LOGFILE"; then
+    echo "[$AGENT_ID] Completion sentinel detected. Verifying all stories..."
+    sync_with_remote 2>/dev/null || true
+    if all_stories_complete; then
+      echo "[$AGENT_ID] All stories confirmed complete. Exiting."
+      exit 0
+    fi
+  fi
+
+  echo "[$AGENT_ID] Iteration $ITERATION complete. Sleeping 5s..."
+  sleep 5
+done
diff --git a/docker/init-firewall-builder.sh b/docker/init-firewall-builder.sh
new file mode 100755
index 00000000..27dd696b
--- /dev/null
+++ b/docker/init-firewall-builder.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+set -euo pipefail
+#
+# Builder firewall: whitelist only Claude API, Statsig, and npm registry.
+# Everything else is denied. Must run as root (called via sudo).
+#
+
+# Flush existing rules
+iptables -F OUTPUT 2>/dev/null || true
+iptables -F INPUT 2>/dev/null || true
+
+# Allow loopback
+iptables -A OUTPUT -o lo -j ACCEPT
+iptables -A INPUT -i lo -j ACCEPT
+
+# Allow established/related connections (responses to our outbound requests)
+iptables -A INPUT -m state --state ESTABLISHED,RELATED -j ACCEPT
+iptables -A OUTPUT -m state --state ESTABLISHED,RELATED -j ACCEPT
+
+# Allow DNS (needed to resolve hostnames before we can whitelist IPs)
+iptables -A OUTPUT -p udp --dport 53 -j ACCEPT
+iptables -A OUTPUT -p tcp --dport 53 -j ACCEPT
+
+# Resolve and allow each whitelisted domain
+ALLOWED_DOMAINS=(
+  "api.anthropic.com"
+  "statsig.anthropic.com"
+  "registry.npmjs.org"
+)
+
+# IPs that already have an ACCEPT rule, so repeated DNS answers do not add
+# duplicate rules.
+declare -A ALLOWED_IPS=()
+
+# allow_ip IP LABEL — permit outbound HTTPS to IP once; LABEL is only logged.
+allow_ip() {
+  local ip="$1" label="$2"
+  if [ -n "${ALLOWED_IPS[$ip]:-}" ]; then
+    return 0
+  fi
+  iptables -A OUTPUT -p tcp -d "$ip" --dport 443 -j ACCEPT
+  ALLOWED_IPS[$ip]=1
+  echo "[firewall] Allowed: $label -> $ip:443"
+}
+
+for domain in "${ALLOWED_DOMAINS[@]}"; do
+  # One lookup per domain; `dig +short` prints CNAME targets and addresses.
+  answers=$(dig +short "$domain" 2>/dev/null || true)
+  resolved=0
+  for ip in $(grep -E '^[0-9]+\.' <<<"$answers" || true); do
+    allow_ip "$ip" "$domain"
+    resolved=$((resolved + 1))
+  done
+
+  # Also resolve CNAME targets (CDNs etc)
+  for cname in $(grep -v -E '^[0-9]+\.' <<<"$answers" || true); do
+    for ip in $(dig +short "$cname" 2>/dev/null | grep -E '^[0-9]+\.' || true); do
+      allow_ip "$ip" "$domain (via $cname)"
+      resolved=$((resolved + 1))
+    done
+  done
+
+  if [ "$resolved" -eq 0 ]; then
+    echo "[firewall] WARNING: no IPv4 address resolved for $domain" >&2
+  fi
+done
+
+# The whitelist above is IPv4-only. Deny IPv6 egress too so it cannot be used
+# to bypass it. Best effort: ip6tables may not be usable in this container.
+if command -v ip6tables >/dev/null 2>&1; then
+  ip6tables -F OUTPUT 2>/dev/null || true
+  ip6tables -A OUTPUT -o lo -j ACCEPT 2>/dev/null || true
+  ip6tables -A OUTPUT -j DROP 2>/dev/null || true
+fi
+
+# Default deny all other outbound traffic
+iptables -A OUTPUT -j DROP
+
+echo "[firewall] Builder firewall initialized. Only API + npm registry allowed."
diff --git a/docker/init-firewall-researcher.sh b/docker/init-firewall-researcher.sh
new file mode 100755
index 00000000..a98855e3
--- /dev/null
+++ b/docker/init-firewall-researcher.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+#
+# Researcher firewall: no-op.
+# Docker default networking allows all outbound traffic.
+# Researcher agents need full internet access for web searches, docs, etc.
+# + +echo "[firewall] Researcher firewall: no restrictions (full internet access)." diff --git a/parallel/CLAUDE-parallel.md b/parallel/CLAUDE-parallel.md new file mode 100644 index 00000000..0b98cd18 --- /dev/null +++ b/parallel/CLAUDE-parallel.md @@ -0,0 +1,153 @@ +# Ralph Parallel Agent Instructions + +You are **{{AGENT_ID}}**, an autonomous coding agent running in parallel with other agents on this project. You are in a sandboxed Docker container with `--dangerously-skip-permissions`. + +## Your Task + +1. Read the PRD at `prd.json` +2. Read ALL progress files: `progress.txt` and any `progress-*.txt` files (check Codebase Patterns section first) +3. Check you're on the correct branch from PRD `branchName`. If not, check it out or create from main. +4. **Claim** the highest priority user story where `passes: false` AND `claimed_by` is empty (see Claim Protocol below) +5. Implement that single user story +6. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires) +7. Update AGENTS.md files if you discover reusable patterns (see below) +8. If checks pass, commit ALL changes with message: `feat: [Story ID] - [Story Title]` +9. Update the PRD to set `passes: true` for the completed story +10. Append your progress to `progress-{{AGENT_ID}}.txt` + +## Claim Protocol + +You are running alongside other agents. To avoid duplicate work, you must **claim** a story before working on it using git's atomic push as a lock: + +1. `git pull --rebase` to get latest prd.json +2. Find the highest-priority story where `passes: false` AND (`claimed_by` is null or empty) +3. Set `claimed_by: "{{AGENT_ID}}"` and `claimed_at: "[ISO-8601 UTC timestamp]"` in prd.json for that story +4. `git add prd.json && git commit -m "[{{AGENT_ID}}] Claim: [Story ID]"` +5. `git push` +6. **If push fails** — another agent claimed something concurrently. Run `git pull --rebase` and pick a different unclaimed story. Repeat up to 3 times. +7.
After completing work, set `passes: true` in prd.json, commit, and push. + +### Example claim in prd.json: +```json +{ + "id": "US-001", + "title": "Add priority field", + "passes": false, + "claimed_by": "{{AGENT_ID}}", + "claimed_at": "2025-01-15T10:30:00Z", + "priority": 1 +} +``` + +### 3-Strike Rule for Claims +If your `git push` fails 3 times in a row when trying to claim, document the situation in `progress-{{AGENT_ID}}.txt` and wait 30 seconds before retrying. Do not spin indefinitely. + +## Per-Agent Progress Files + +- **Write** your progress to: `progress-{{AGENT_ID}}.txt` +- **Read** all progress files before starting: `progress.txt` and all `progress-*.txt` + +This avoids merge conflicts on a shared progress file. Your per-agent progress file follows the same format: + +``` +## [Date/Time] - [Story ID] +- What was implemented +- Files changed +- **Learnings for future iterations:** + - Patterns discovered + - Gotchas encountered + - Useful context +--- +``` + +## Conflict Resolution + +Since multiple agents push to the same branch: + +1. **Always `git pull --rebase` before pushing.** Never merge. +2. If rebase has conflicts: + - Try to resolve them (prefer keeping both changes) + - If you can't resolve: `git rebase --abort`, `git fetch origin`, `git reset --hard origin/[branch]`, and redo your changes +3. **3-strike push rule**: If push fails 3 times after rebase, document the blocker in `progress-{{AGENT_ID}}.txt` under "## Blockers" and move on to a different story. + +## Push Protocol + +Always follow this sequence: +1. `git add -A` +2. `git commit -m "[{{AGENT_ID}}] [message]"` +3. `git pull --rebase origin [branch]` +4. `git push origin [branch]` +5.
If push fails, repeat from step 3 (max 3 retries) + +## Progress Report Format + +APPEND to `progress-{{AGENT_ID}}.txt` (never replace, always append): +``` +## [Date/Time] - [Story ID] +- What was implemented +- Files changed +- **Learnings for future iterations:** + - Patterns discovered (e.g., "this codebase uses X for Y") + - Gotchas encountered (e.g., "don't forget to update Z when changing W") + - Useful context (e.g., "the evaluation panel is in component X") +--- +``` + +## Consolidate Patterns + +If you discover a **reusable pattern** that future iterations should know, add it to the `## Codebase Patterns` section at the TOP of `progress-{{AGENT_ID}}.txt` (create it if it doesn't exist). Only add patterns that are **general and reusable**, not story-specific details. + +## Update AGENTS.md Files + +Before committing, check if any edited files have learnings worth preserving in nearby AGENTS.md files: + +1. **Identify directories with edited files** - Look at which directories you modified +2. **Check for existing AGENTS.md** - Look for AGENTS.md in those directories or parent directories +3. **Add valuable learnings** - If you discovered something future developers/agents should know: + - API patterns or conventions specific to that module + - Gotchas or non-obvious requirements + - Dependencies between files + - Testing approaches for that area + - Configuration or environment requirements + +**Do NOT add:** +- Story-specific implementation details +- Temporary debugging notes +- Information already in progress files + +## Quality Requirements + +- ALL commits must pass your project's quality checks (typecheck, lint, test) +- Do NOT commit broken code +- Keep changes focused and minimal +- Follow existing code patterns + +## Browser Testing (If Available) + +For any story that changes UI, verify it works in the browser if you have browser testing tools configured: + +1. Navigate to the relevant page +2. Verify the UI changes work as expected +3. 
Take a screenshot if helpful for the progress log + +If no browser tools are available, note in your progress report that manual browser verification is needed. + +## Stop Condition + +After completing a user story, check if ALL stories have `passes: true`. + +If ALL stories are complete and passing, reply with: +COMPLETE + +If there are still stories with `passes: false`, end your response normally (another iteration will pick up the next story). + +## Important + +- You are **{{AGENT_ID}}** — always use this in commit messages and progress files +- Work on ONE story per iteration +- Claim before working — never start without claiming +- Commit frequently with small, focused commits +- Keep CI green +- Read ALL progress files (yours and other agents') before starting +- Do not attempt to install system packages or modify system configuration +- Focus on making measurable progress each iteration — quality over quantity diff --git a/parallel/README.md b/parallel/README.md new file mode 100644 index 00000000..30d5d8e4 --- /dev/null +++ b/parallel/README.md @@ -0,0 +1,148 @@ +# Ralph Parallel Mode + +Run N containerized Claude Code agents simultaneously against the same PRD. Each agent is sandboxed in Docker with network restrictions, resource limits, and no host access. + +## How It Works + +1. **Orchestrator** (`ralph-parallel.sh`) builds a Docker image, creates networks, and launches N containers +2. Each container runs the **agent loop** (`docker/agent-loop.sh`) which: + - Clones the project from a bind-mounted directory + - Claims a story in `prd.json` via git atomic push + - Runs Claude Code with the parallel prompt + - Pushes results and picks the next story +3. The orchestrator monitors container health, recovers stale claims, and handles token refresh +4. 
When all stories have `passes: true`, everything shuts down + +## Prerequisites + +- Docker installed and running +- A Claude Code auth token +- `jq` installed (`brew install jq` on macOS) +- A `prd.json` in the ralph root directory + +## Quick Start + +```bash +# Set your Claude auth token +export RALPH_CLAUDE_TOKEN='' + +# Run with 3 builder agents +./parallel/ralph-parallel.sh --agents 3 + +# Check status +./parallel/status.sh + +# Graceful shutdown +./parallel/stop.sh +``` + +## CLI Options + +``` +./parallel/ralph-parallel.sh [options] [max_iterations] + +Options: + --agents N Number of builder agents (default: 2) + --researcher N Number of researcher agents with full internet (default: 0) + --model MODEL Claude model (default: claude-sonnet-4-5-20250929) + --memory SIZE Per-container memory limit (default: 4g) + --cpus N Per-container CPU limit (default: 2) + +Arguments: + max_iterations Per-agent iteration cap (default: 0 = until PRD complete) +``` + +## Authentication + +Token retrieval priority (first wins): + +1. **`RALPH_CLAUDE_TOKEN` env var** — set before running +2. **`.ralph/token` file** — write your token here +3. **1Password** via `op read` — interactive, startup only + +### Mid-Run Token Refresh + +Write a new token to `.ralph/token_refresh`. The orchestrator detects it within 30 seconds and restarts all containers with the new token. + +## Story Claiming + +Agents claim stories by modifying `prd.json` and using git's atomic push as a lock: + +1. Agent finds highest-priority unclaimed story (`passes: false`, `claimed_by` empty) +2. Sets `claimed_by` and `claimed_at` fields +3. Commits and pushes +4. If push fails (another agent pushed first), rebase and pick a different story + +### Stale Claim Recovery + +The orchestrator checks for claims older than 30 minutes where the agent's container is no longer running. Stale claims are automatically cleared so other agents can pick up the work. 
+ +## Agent Roles + +| Role | Network | Purpose | +|------|---------|---------| +| `builder` | API + npm only | Feature implementation, testing, code changes | +| `researcher` | Full internet | Web research, documentation lookup | + +Builder agents are restricted via iptables to only reach: +- `api.anthropic.com` (Claude API) +- `statsig.anthropic.com` (telemetry) +- `registry.npmjs.org` (npm packages) + +## File Layout + +``` +docker/ +├── Dockerfile # Container image: node:20-slim + claude-code +├── agent-loop.sh # Container entrypoint: firewall → auth → clone → loop +├── init-firewall-builder.sh # iptables: whitelist API + npm only +└── init-firewall-researcher.sh # No-op (full internet) + +parallel/ +├── ralph-parallel.sh # Host orchestrator: launch, monitor, restart +├── stop.sh # Graceful shutdown +├── status.sh # Container status + story board + logs +├── CLAUDE-parallel.md # Parallel-aware prompt for agents +├── README.md # This file +└── lib/ + ├── auth.sh # Token retrieval: env > file > 1Password + ├── network-setup.sh # Docker network create/teardown + ├── docker-helpers.sh # Container launch/stop/restart + └── logging.sh # Timestamped log helpers +``` + +## Per-Agent Progress Files + +Instead of all agents appending to one `progress.txt` (merge conflict risk), each agent writes to `progress-.txt`. The parallel prompt instructs agents to read ALL progress files for context and write only to their own. 
+
+## Differences from `ralph.sh`
+
+| | `ralph.sh` (original) | `ralph-parallel.sh` (new) |
+|---|---|---|
+| Runs on | Host, bare metal | Host, launches Docker containers |
+| Agents | 1, sequential | N, parallel |
+| Prompt | `CLAUDE.md` | `parallel/CLAUDE-parallel.md` |
+| Auth | Delegates to CLI | Env var / file / 1Password |
+| Network | Unrestricted | iptables firewall per container |
+| Progress | `progress.txt` | `progress-${AGENT_ID}.txt` per agent |
+| Config | CLI args | CLI args |
+
+## Debugging
+
+```bash
+# Check container status
+./parallel/status.sh
+
+# View container logs directly
+docker logs ralph-agent-1
+
+# View agent log files
+ls -lt agent_logs/
+
+# Check prd.json story status
+cat prd.json | jq '.userStories[] | {id, title, passes, claimed_by}'
+
+# Force rebuild the Docker image
+docker rmi ralph-agent:latest
+./parallel/ralph-parallel.sh --agents 1
+```
diff --git a/parallel/lib/auth.sh b/parallel/lib/auth.sh
new file mode 100644
index 00000000..e0301fef
--- /dev/null
+++ b/parallel/lib/auth.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+#
+# auth.sh — Claude auth token retrieval for Ralph parallel mode.
+#
+# Token retrieval priority:
+#   1. RALPH_CLAUDE_TOKEN env var
+#   2. .ralph/token file in project directory
+#   3. 1Password via `op read` (interactive — only at startup)
+#
+# Uses log_info / log_error, which this file does not define.
+# NOTE(review): both functions print the token on stdout, so those helpers
+# presumably log to stderr — confirm in logging.sh.
+#
+
+# Print the Claude auth token on stdout.
+#   $1 - project directory (optional); enables the .ralph/token lookup
+# Returns 0 on success, 1 if no source yields a non-empty token.
+fetch_claude_token() {
+  local project_dir="${1:-}"
+  # Token file path; with no project directory a placeholder is used so the
+  # help messages below still read sensibly.
+  local token_file="${project_dir:-PROJECT_DIR}/.ralph/token"
+
+  # Priority 1: Environment variable
+  if [ -n "${RALPH_CLAUDE_TOKEN:-}" ]; then
+    log_info "Using token from RALPH_CLAUDE_TOKEN env var"
+    echo "$RALPH_CLAUDE_TOKEN"
+    return 0
+  fi
+
+  # Priority 2: Token file
+  if [ -n "$project_dir" ] && [ -f "$token_file" ]; then
+    local file_token
+    file_token=$(cat "$token_file")
+    if [ -n "$file_token" ]; then
+      log_info "Using token from $token_file"
+      echo "$file_token"
+      return 0
+    fi
+  fi
+
+  # Priority 3: 1Password (interactive — will prompt for biometric/password)
+  log_info "Fetching token from 1Password (may prompt for auth)..."
+
+  if ! command -v op &> /dev/null; then
+    log_error "No token available."
+    log_error "Provide a token via one of:"
+    log_error "  1. RALPH_CLAUDE_TOKEN env var"
+    log_error "  2. File at $token_file"
+    log_error "  3. Install 1Password CLI: https://developer.1password.com/docs/cli/get-started/"
+    return 1
+  fi
+
+  local op_ref="${OP_ITEM_REF:-op://Private/Claude Code OAuth/credential}"
+  local token
+  # stderr is deliberately not captured: with 2>&1 anything `op` prints there
+  # on a successful read would become part of the token.
+  token=$(op read "$op_ref") || {
+    log_error "Failed to read token from 1Password."
+    log_error "Reference: $op_ref"
+    log_error "Fallback: set RALPH_CLAUDE_TOKEN env var or write token to $token_file"
+    return 1
+  }
+
+  if [ -z "$token" ]; then
+    log_error "1Password returned empty token."
+    return 1
+  fi
+
+  echo "$token"
+}
+
+# Check if a refreshed token has been dropped into the refresh file.
+# Returns 0 and prints the new token if found, 1 otherwise.
+# The consumed file is renamed (not deleted), so it is picked up only once.
+check_token_refresh_file() {
+  local project_dir="$1"
+  local refresh_file="$project_dir/.ralph/token_refresh"
+
+  if [ -f "$refresh_file" ]; then
+    local new_token
+    new_token=$(cat "$refresh_file")
+    if [ -n "$new_token" ]; then
+      mv "$refresh_file" "${refresh_file}.consumed.$(date +%s)"
+      echo "$new_token"
+      return 0
+    fi
+  fi
+  return 1
+}
diff --git a/parallel/lib/docker-helpers.sh b/parallel/lib/docker-helpers.sh
new file mode 100644
index 00000000..d51d1567
--- /dev/null
+++ b/parallel/lib/docker-helpers.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+#
+# docker-helpers.sh — Container launch and management helpers for Ralph parallel mode.
+#
+# Optional environment:
+#   RALPH_PLATFORM - value for docker's --platform flag (e.g. linux/arm64).
+#                    When unset the flag is omitted and Docker picks its
+#                    default platform for the host.
+#
+
+RALPH_IMAGE="ralph-agent:latest"
+
+# Build the agent image from the Dockerfile in directory $1.
+build_image() {
+  local docker_dir="$1"
+  local platform_args=()
+  if [ -n "${RALPH_PLATFORM:-}" ]; then
+    platform_args=(--platform "$RALPH_PLATFORM")
+  fi
+
+  log_info "Building Ralph agent image..."
+  # ${arr[@]+...} keeps an empty array safe under `set -u` on older bash.
+  docker build ${platform_args[@]+"${platform_args[@]}"} -t "$RALPH_IMAGE" "$docker_dir"
+  log_info "Image built: $RALPH_IMAGE"
+}
+
+# Start one detached agent container named "ralph-" followed by the agent id.
+#   $1 agent id            $2 role (researcher, anything else = builder)
+#   $3 project directory   $4 Claude token
+#   $5 Claude model        $6 max iterations
+#   $7 memory limit (default 4g)   $8 CPU limit (default 2)
+# Reads RESEARCHER_NETWORK / BUILDER_NETWORK, which are set outside this file.
+launch_agent() {
+  local agent_id="$1"
+  local agent_role="$2"
+  local project_dir="$3"
+  local claude_token="$4"
+  local claude_model="$5"
+  local max_iterations="$6"
+  local container_memory="${7:-4g}"
+  local container_cpus="${8:-2}"
+
+  # Determine network based on role
+  local network
+  case "$agent_role" in
+    researcher) network="$RESEARCHER_NETWORK" ;;
+    *) network="$BUILDER_NETWORK" ;;
+  esac
+
+  local container_name="ralph-${agent_id}"
+  local project_dir_abs
+  project_dir_abs="$(cd "$project_dir" && pwd)"
+
+  local platform_args=()
+  if [ -n "${RALPH_PLATFORM:-}" ]; then
+    platform_args=(--platform "$RALPH_PLATFORM")
+  fi
+
+  log_info "Launching container: $container_name (role=$agent_role, network=$network)"
+
+  # Ensure log and state directories exist
+  mkdir -p "$project_dir_abs/agent_logs" "$project_dir_abs/.ralph"
+
+  # The token is handed over through the docker client's environment
+  # (`-e NAME` with no value) so it does not appear on the command line.
+  CLAUDE_CODE_OAUTH_TOKEN="$claude_token" docker run -d \
+    --name "$container_name" \
+    --network "$network" \
+    ${platform_args[@]+"${platform_args[@]}"} \
+    --memory="$container_memory" \
+    --cpus="$container_cpus" \
+    --pids-limit=256 \
+    --cap-add=NET_ADMIN \
+    --cap-add=NET_RAW \
+    -e "AGENT_ID=$agent_id" \
+    -e "AGENT_ROLE=$agent_role" \
+    -e CLAUDE_CODE_OAUTH_TOKEN \
+    -e "CLAUDE_MODEL=$claude_model" \
+    -e "MAX_ITERATIONS=$max_iterations" \
+    -v "$project_dir_abs/.ralph/repo.git:/repo.git:rw" \
+    -v "$project_dir_abs/parallel/CLAUDE-parallel.md:/parallel-prompt/CLAUDE-parallel.md:ro" \
+    -v "$project_dir_abs/agent_logs:/agent-logs:rw" \
+    -v "$project_dir_abs/.ralph:/harness-state:ro" \
+    "$RALPH_IMAGE"
+
+  log_info "Container $container_name started"
+}
+
+# Stop and remove a container; $2 is the stop timeout in seconds (default 30).
+# Errors from either docker command are ignored.
+stop_agent() {
+  local container_name="$1"
+  local timeout="${2:-30}"
+
+  log_info "Stopping container: $container_name (timeout=${timeout}s)"
+  docker stop -t "$timeout" "$container_name" 2>/dev/null || true
+  docker rm "$container_name" 2>/dev/null || true
+}
+
+# Succeeds when `docker inspect` reports the container as running.
+is_agent_running() {
+  local container_name="$1"
+  docker inspect -f '{{.State.Running}}' "$container_name" 2>/dev/null | grep -q "true"
+}
+
+restart_agent() {
+ local container_name="$1" + + log_info "Restarting container: $container_name" + docker restart "$container_name" 2>/dev/null || { + log_error "Could not restart $container_name" + return 1 + } + log_info "Container $container_name restarted" +} diff --git a/parallel/lib/logging.sh b/parallel/lib/logging.sh new file mode 100644 index 00000000..8c8f6290 --- /dev/null +++ b/parallel/lib/logging.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# +# logging.sh — Timestamped log helpers for Ralph parallel mode. +# + +log_info() { + echo "[$(date -u +"%H:%M:%S")] INFO: $*" +} + +log_warn() { + echo "[$(date -u +"%H:%M:%S")] WARN: $*" >&2 +} + +log_error() { + echo "[$(date -u +"%H:%M:%S")] ERROR: $*" >&2 +} diff --git a/parallel/lib/network-setup.sh b/parallel/lib/network-setup.sh new file mode 100644 index 00000000..dfeca5a0 --- /dev/null +++ b/parallel/lib/network-setup.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# +# network-setup.sh — Docker network creation and teardown for Ralph parallel mode. +# + +BUILDER_NETWORK="ralph-builder" +RESEARCHER_NETWORK="ralph-researcher" + +create_networks() { + log_info "Creating Docker networks..." + + # Builder network: bridge with masquerade (firewall handles restrictions) + if ! docker network inspect "$BUILDER_NETWORK" &> /dev/null; then + docker network create "$BUILDER_NETWORK" \ + --driver bridge \ + --opt "com.docker.network.bridge.enable_ip_masquerade=true" + log_info "Created network: $BUILDER_NETWORK" + else + log_info "Network $BUILDER_NETWORK already exists" + fi + + # Researcher network: standard bridge with full internet + if ! docker network inspect "$RESEARCHER_NETWORK" &> /dev/null; then + docker network create "$RESEARCHER_NETWORK" \ + --driver bridge + log_info "Created network: $RESEARCHER_NETWORK" + else + log_info "Network $RESEARCHER_NETWORK already exists" + fi +} + +teardown_networks() { + log_info "Tearing down Docker networks..." 
+ + docker network rm "$BUILDER_NETWORK" 2>/dev/null && \ + log_info "Removed network: $BUILDER_NETWORK" || \ + log_info "Network $BUILDER_NETWORK not found or in use" + + docker network rm "$RESEARCHER_NETWORK" 2>/dev/null && \ + log_info "Removed network: $RESEARCHER_NETWORK" || \ + log_info "Network $RESEARCHER_NETWORK not found or in use" +} diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh new file mode 100755 index 00000000..9078b1f7 --- /dev/null +++ b/parallel/ralph-parallel.sh @@ -0,0 +1,409 @@ +#!/usr/bin/env bash +set -euo pipefail +# +# ralph-parallel.sh — Parallel mode orchestrator for Ralph. +# +# Launches N containerized Claude Code agents that work on prd.json stories +# simultaneously. Each agent runs in a Docker container with network restrictions, +# resource limits, and no host access. +# +# Usage: ./parallel/ralph-parallel.sh [options] [max_iterations] +# +# Options: +# --agents N Number of builder agents (default: 2) +# --researcher N Number of researcher agents with full internet (default: 0) +# --model MODEL Claude model to use (default: claude-sonnet-4-5-20250929) +# --memory SIZE Per-container memory limit (default: 4g) +# --cpus N Per-container CPU limit (default: 2) +# + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RALPH_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" + +# Source library scripts +source "$SCRIPT_DIR/lib/logging.sh" +source "$SCRIPT_DIR/lib/auth.sh" +source "$SCRIPT_DIR/lib/network-setup.sh" +source "$SCRIPT_DIR/lib/docker-helpers.sh" + +# --- Defaults --- +NUM_BUILDERS=2 +NUM_RESEARCHERS=0 +CLAUDE_MODEL="claude-sonnet-4-5-20250929" +CONTAINER_MEMORY="4g" +CONTAINER_CPUS="2" +MAX_ITERATIONS=0 +STALE_CLAIM_MINUTES=30 + +# --- Parse arguments --- +while [[ $# -gt 0 ]]; do + case $1 in + --agents) + NUM_BUILDERS="$2" + shift 2 + ;; + --agents=*) + NUM_BUILDERS="${1#*=}" + shift + ;; + --researcher) + NUM_RESEARCHERS="$2" + shift 2 + ;; + --researcher=*) + NUM_RESEARCHERS="${1#*=}" + shift + ;; + --model) + CLAUDE_MODEL="$2" + shift 2 + ;; + --model=*) + CLAUDE_MODEL="${1#*=}" + shift + ;; + --memory) + CONTAINER_MEMORY="$2" + shift 2 + ;; + --memory=*) + CONTAINER_MEMORY="${1#*=}" + shift + ;; + --cpus) + CONTAINER_CPUS="$2" + shift 2 + ;; + --cpus=*) + CONTAINER_CPUS="${1#*=}" + shift + ;; + -h|--help) + echo "Usage: $0 [options] [max_iterations]" + echo "" + echo "Options:" + echo " --agents N Number of builder agents (default: 2)" + echo " --researcher N Number of researcher agents (default: 0)" + echo " --model MODEL Claude model (default: claude-sonnet-4-5-20250929)" + echo " --memory SIZE Per-container memory limit (default: 4g)" + echo " --cpus N Per-container CPU limit (default: 2)" + echo "" + echo "Arguments:" + echo " max_iterations Per-agent iteration cap (default: 0 = until PRD complete)" + exit 0 + ;; + *) + if [[ "$1" =~ ^[0-9]+$ ]]; then + MAX_ITERATIONS="$1" + else + log_error "Unknown option: $1" + exit 1 + fi + shift + ;; + esac +done + +TOTAL_AGENTS=$((NUM_BUILDERS + NUM_RESEARCHERS)) + +if [ "$TOTAL_AGENTS" -eq 0 ]; then + log_error "No agents configured. Use --agents N and/or --researcher N." + exit 1 +fi + +# --- Validate project directory --- +# Ralph parallel runs from the project root (same as ralph.sh) +PROJECT_DIR="$RALPH_ROOT" +PRD_FILE="$PROJECT_DIR/prd.json" + +if [ ! 
-f "$PRD_FILE" ]; then + log_error "No prd.json found in $PROJECT_DIR" + log_error "Create a prd.json first (see prd.json.example)." + exit 1 +fi + +if [ ! -f "$SCRIPT_DIR/CLAUDE-parallel.md" ]; then + log_error "Missing parallel/CLAUDE-parallel.md prompt file" + exit 1 +fi + +# --- Display config --- +PROJECT_NAME=$(jq -r '.project // "unknown"' "$PRD_FILE" 2>/dev/null || echo "unknown") +BRANCH_NAME=$(jq -r '.branchName // empty' "$PRD_FILE" 2>/dev/null || echo "") +TOTAL_STORIES=$(jq '.userStories | length' "$PRD_FILE" 2>/dev/null || echo "?") +DONE_STORIES=$(jq '[.userStories[] | select(.passes == true)] | length' "$PRD_FILE" 2>/dev/null || echo "?") + +log_info "Ralph Parallel Mode" +log_info "====================" +log_info "Project: $PROJECT_NAME" +log_info "Branch: ${BRANCH_NAME:-}" +log_info "Stories: $DONE_STORIES/$TOTAL_STORIES complete" +log_info "Agents: $NUM_BUILDERS builders, $NUM_RESEARCHERS researchers ($TOTAL_AGENTS total)" +log_info "Model: $CLAUDE_MODEL" +log_info "Memory: $CONTAINER_MEMORY per container" +log_info "CPUs: $CONTAINER_CPUS per container" +log_info "Max iterations: $MAX_ITERATIONS (0=until PRD complete)" +echo "" + +# --- Step 1: Build Docker image if needed --- +log_info "Checking Docker image..." +if ! docker image inspect "$RALPH_IMAGE" &> /dev/null; then + build_image "$RALPH_ROOT/docker" +else + log_info "Image $RALPH_IMAGE already exists. Use 'docker rmi $RALPH_IMAGE' to force rebuild." +fi + +# --- Step 2: Create Docker networks --- +create_networks + +# --- Step 3: Fetch Claude auth token --- +log_info "Fetching Claude auth token..." 
+CLAUDE_TOKEN=$(fetch_claude_token "$PROJECT_DIR") + +if [ -z "$CLAUDE_TOKEN" ]; then + log_error "Failed to retrieve Claude auth token" + log_error "Options: set RALPH_CLAUDE_TOKEN env var, write to $PROJECT_DIR/.ralph/token, or configure 1Password" + exit 1 +fi +log_info "Auth token retrieved successfully" + +# --- Step 4: Create bare repo for agent coordination --- +# Agents need a shared bare repo to push to — you can't reliably push +# to a non-bare repo's checked-out branch. We create .ralph/repo.git +# as a bare clone of the project, and agents push/pull from this. +BARE_REPO="$PROJECT_DIR/.ralph/repo.git" +if [ ! -d "$BARE_REPO" ]; then + log_info "Creating bare repo for agent coordination..." + mkdir -p "$PROJECT_DIR/.ralph" + git clone --bare "$PROJECT_DIR" "$BARE_REPO" + log_info "Bare repo created at $BARE_REPO" +else + # Update the bare repo from the working directory + log_info "Updating bare repo from project..." + cd "$PROJECT_DIR" + git push "$BARE_REPO" --all 2>/dev/null || true + cd - > /dev/null +fi + +rm -f "$PROJECT_DIR/.ralph/stop_requested" + +# --- Step 5: Launch agent containers --- +AGENT_NUM=0 +declare -a CONTAINER_NAMES=() + +launch_agents_for_role() { + local role="$1" + local count="$2" + + for i in $(seq 1 "$count"); do + AGENT_NUM=$((AGENT_NUM + 1)) + local agent_id="agent-${AGENT_NUM}" + local container_name="ralph-${agent_id}" + + # Stop existing container with same name if present + if docker inspect "$container_name" &> /dev/null; then + log_warn "Container $container_name already exists. Removing." + stop_agent "$container_name" 10 + fi + + launch_agent \ + "$agent_id" \ + "$role" \ + "$PROJECT_DIR" \ + "$CLAUDE_TOKEN" \ + "$CLAUDE_MODEL" \ + "$MAX_ITERATIONS" \ + "$CONTAINER_MEMORY" \ + "$CONTAINER_CPUS" + + CONTAINER_NAMES+=("$container_name") + done +} + +log_info "Launching agents..."
+launch_agents_for_role "builder" "$NUM_BUILDERS" +launch_agents_for_role "researcher" "$NUM_RESEARCHERS" + +log_info "All $TOTAL_AGENTS agents launched." +echo "" + +# --- Step 6: Monitor loop --- +MONITOR_INTERVAL=30 +log_info "Entering monitor loop (checking every ${MONITOR_INTERVAL}s)." +log_info "Use ./parallel/stop.sh to stop." +log_info "Use ./parallel/status.sh to check status." +echo "" + +# Helper: read a file from the bare repo without a full checkout +read_from_bare_repo() { + local file="$1" + local branch="${2:-main}" + git --git-dir="$BARE_REPO" show "${branch}:${file}" 2>/dev/null \ + || git --git-dir="$BARE_REPO" show "master:${file}" 2>/dev/null \ + || echo "" +} + +# Helper: check if all stories are complete in the bare repo +check_all_stories_complete() { + local prd_content + prd_content=$(read_from_bare_repo "prd.json" "$BRANCH_NAME") + [ -z "$prd_content" ] && return 1 + + local incomplete + incomplete=$(echo "$prd_content" | jq '[.userStories[] | select(.passes == false)] | length' 2>/dev/null || echo "1") + [ "$incomplete" -eq 0 ] +} + +recover_stale_claims() { + # Read prd.json from the bare repo (agents push there, not to project dir) + local prd_content + prd_content=$(read_from_bare_repo "prd.json" "$BRANCH_NAME") + [ -z "$prd_content" ] && return + + local now_epoch + now_epoch=$(date +%s) + local stale_seconds=$((STALE_CLAIM_MINUTES * 60)) + + local claims + claims=$(echo "$prd_content" | jq -r ' + .userStories[] + | select(.passes == false and .claimed_by != null and .claimed_by != "") + | "\(.id)|\(.claimed_by)|\(.claimed_at // "")" + ' 2>/dev/null || echo "") + + [ -z "$claims" ] && return + + local cleared=false + local updated_prd="$prd_content" + while IFS='|' read -r story_id agent claimed_at; do + [ -z "$story_id" ] && continue + [ -z "$claimed_at" ] && continue + + # Parse claimed_at timestamp (macOS date -j, fallback to GNU date -d) + local claimed_epoch + claimed_epoch=$(date -j -f "%Y-%m-%dT%H:%M:%SZ" "$claimed_at" +%s 
2>/dev/null \ + || date -d "$claimed_at" +%s 2>/dev/null \ + || echo "0") + + if [ "$claimed_epoch" -eq 0 ]; then + continue + fi + + local age=$((now_epoch - claimed_epoch)) + if [ "$age" -gt "$stale_seconds" ]; then + local container_name="ralph-${agent}" + if ! is_agent_running "$container_name"; then + log_warn "Stale claim detected: $story_id by $agent (${age}s old, container not running). Clearing." + updated_prd=$(echo "$updated_prd" | jq --arg sid "$story_id" ' + .userStories |= map( + if .id == $sid then + del(.claimed_by) | del(.claimed_at) + else . end + ) + ') + cleared=true + fi + fi + done <<< "$claims" + + if $cleared; then + # Commit the cleared claims to the bare repo via a temp checkout + local temp_dir + temp_dir=$(mktemp -d) + git clone "$BARE_REPO" "$temp_dir/work" 2>/dev/null + cd "$temp_dir/work" + git config user.name "ralph-orchestrator" + git config user.email "orchestrator@ralph-agent.local" + if [ -n "$BRANCH_NAME" ]; then + git checkout "$BRANCH_NAME" 2>/dev/null || true + fi + echo "$updated_prd" | jq '.' > prd.json + git add prd.json + git commit -m "[orchestrator] Clear stale claims" 2>/dev/null || true + git push origin 2>/dev/null || true + cd - > /dev/null + rm -rf "$temp_dir" + fi +} + +while true; do + sleep "$MONITOR_INTERVAL" + + # Check if stop was requested + if [ -f "$PROJECT_DIR/.ralph/stop_requested" ]; then + log_info "Stop requested. Shutting down all agents..." + for name in "${CONTAINER_NAMES[@]}"; do + stop_agent "$name" 30 + done + teardown_networks + log_info "All agents stopped. Exiting." + exit 0 + fi + + # Check for token refresh + if NEW_TOKEN=$(check_token_refresh_file "$PROJECT_DIR"); then + log_info "New auth token detected. Restarting all agents with refreshed token..." 
+ CLAUDE_TOKEN="$NEW_TOKEN" + for name in "${CONTAINER_NAMES[@]}"; do + stop_agent "$name" 15 + done + CONTAINER_NAMES=() + AGENT_NUM=0 + launch_agents_for_role "builder" "$NUM_BUILDERS" + launch_agents_for_role "researcher" "$NUM_RESEARCHERS" + log_info "All agents restarted with new token." + continue + fi + + # Recover stale claims + recover_stale_claims + + # Check container health + ALL_STOPPED=true + for name in "${CONTAINER_NAMES[@]}"; do + if is_agent_running "$name"; then + ALL_STOPPED=false + else + EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' "$name" 2>/dev/null || echo "unknown") + + if [ "$EXIT_CODE" = "0" ]; then + log_info "Container $name exited cleanly (code 0)." + else + log_warn "Container $name stopped unexpectedly (exit code: $EXIT_CODE). Restarting..." + restart_agent "$name" + if is_agent_running "$name"; then + ALL_STOPPED=false + else + log_error "Failed to restart $name" + fi + fi + fi + done + + # Check if all stories are complete (read from bare repo) + if check_all_stories_complete; then + log_info "All PRD stories are complete!" + log_info "Shutting down agents..." + for name in "${CONTAINER_NAMES[@]}"; do + stop_agent "$name" 15 + done + # Sync bare repo back to project working directory + log_info "Syncing results back to project..." + cd "$PROJECT_DIR" + git fetch "$BARE_REPO" 2>/dev/null || true + git merge FETCH_HEAD 2>/dev/null || true + teardown_networks + log_info "Done. All stories passed." + exit 0 + fi + + if $ALL_STOPPED; then + log_info "All agents have exited. Cleaning up..." + for name in "${CONTAINER_NAMES[@]}"; do + docker rm "$name" 2>/dev/null || true + done + teardown_networks + log_info "Done." + exit 0 + fi +done diff --git a/parallel/status.sh b/parallel/status.sh new file mode 100755 index 00000000..45b9f3ed --- /dev/null +++ b/parallel/status.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +set -euo pipefail +# +# status.sh — Show status of Ralph parallel agents and PRD stories. 
+# +# Usage: ./parallel/status.sh +# + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RALPH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +source "$SCRIPT_DIR/lib/logging.sh" + +PROJECT_DIR="$RALPH_ROOT" +PRD_FILE="$PROJECT_DIR/prd.json" +BARE_REPO="$PROJECT_DIR/.ralph/repo.git" + +# Helper: read file from bare repo +read_from_bare_repo() { + local file="$1" + git --git-dir="$BARE_REPO" show "HEAD:${file}" 2>/dev/null || echo "" +} + +# Load project info — prefer bare repo (has latest agent pushes), fallback to working dir +PROJECT_NAME="unknown" +PRD_CONTENT="" +if [ -d "$BARE_REPO" ]; then + PRD_CONTENT=$(read_from_bare_repo "prd.json") +fi +if [ -z "$PRD_CONTENT" ] && [ -f "$PRD_FILE" ]; then + PRD_CONTENT=$(cat "$PRD_FILE") +fi +if [ -n "$PRD_CONTENT" ]; then + PROJECT_NAME=$(echo "$PRD_CONTENT" | jq -r '.project // "unknown"' 2>/dev/null || echo "unknown") +fi + +echo "========================================" +echo " Ralph Parallel Status: $PROJECT_NAME" +echo "========================================" +echo "" + +# --- Stop signal check --- +if [ -f "$PROJECT_DIR/.ralph/stop_requested" ]; then + echo "** STOP REQUESTED -- agents will exit after current iteration **" + echo "" +fi + +# --- Container Status --- +echo "--- Containers ---" +CONTAINERS=$(docker ps -a --filter "name=ralph-agent-" --format "table {{.Names}}\t{{.Status}}\t{{.RunningFor}}" 2>/dev/null || true) +if [ -n "$CONTAINERS" ]; then + echo "$CONTAINERS" +else + echo "No Ralph containers found." 
+fi +echo "" + +# --- Story Board from prd.json --- +echo "--- Story Board ---" +if [ -n "$PRD_CONTENT" ]; then + # Available stories (passes: false, no claim) + echo "Available:" + AVAILABLE=$(echo "$PRD_CONTENT" | jq -r ' + .userStories[] + | select(.passes == false and (.claimed_by == null or .claimed_by == "")) + | " [ ] \(.id): \(.title) (priority: \(.priority))" + ' 2>/dev/null || echo "") + if [ -n "$AVAILABLE" ]; then + echo "$AVAILABLE" + else + echo " (none)" + fi + + # Claimed stories (passes: false, has claim) + echo "Claimed:" + CLAIMED=$(echo "$PRD_CONTENT" | jq -r ' + .userStories[] + | select(.passes == false and .claimed_by != null and .claimed_by != "") + | " [~] \(.id): \(.title) (by \(.claimed_by) at \(.claimed_at // "?"))" + ' 2>/dev/null || echo "") + if [ -n "$CLAIMED" ]; then + echo "$CLAIMED" + else + echo " (none)" + fi + + # Complete stories (passes: true) + echo "Done:" + DONE=$(echo "$PRD_CONTENT" | jq -r ' + .userStories[] + | select(.passes == true) + | " [x] \(.id): \(.title)" + ' 2>/dev/null || echo "") + if [ -n "$DONE" ]; then + echo "$DONE" + else + echo " (none)" + fi + + # Summary + TOTAL=$(echo "$PRD_CONTENT" | jq '.userStories | length' 2>/dev/null || echo "?") + DONE_COUNT=$(echo "$PRD_CONTENT" | jq '[.userStories[] | select(.passes == true)] | length' 2>/dev/null || echo "?") + echo "" + echo "Progress: $DONE_COUNT/$TOTAL stories complete" +else + echo "No prd.json found." +fi +echo "" + +# --- Recent Logs --- +echo "--- Recent Logs (last 10 lines, 3 most recent) ---" +LOG_DIR="$PROJECT_DIR/agent_logs" +if [ -d "$LOG_DIR" ]; then + LATEST_LOGS=$(ls -t "$LOG_DIR"/*.log 2>/dev/null | head -3) + if [ -n "$LATEST_LOGS" ]; then + for logfile in $LATEST_LOGS; do + echo " $(basename "$logfile"):" + tail -n 10 "$logfile" | sed 's/^/ /' + echo "" + done + else + echo " No log files yet." + fi +else + echo " No log directory found." 
+fi + +# --- Git Log --- +echo "--- Recent Commits ---" +if [ -d "$BARE_REPO" ]; then + git --git-dir="$BARE_REPO" log --oneline -10 2>/dev/null || echo " No commits yet." +elif [ -d "$PROJECT_DIR/.git" ]; then + git -C "$PROJECT_DIR" log --oneline -10 2>/dev/null || echo " No commits yet." +else + echo " Not a git repository." +fi +echo "" diff --git a/parallel/stop.sh b/parallel/stop.sh new file mode 100755 index 00000000..38fb1709 --- /dev/null +++ b/parallel/stop.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +set -euo pipefail +# +# stop.sh — Graceful shutdown of Ralph parallel agents. +# +# Usage: ./parallel/stop.sh +# +# Creates a stop_requested file that agents check each iteration. +# Waits up to 120s for graceful exit, then force-kills remaining containers. +# + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RALPH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +source "$SCRIPT_DIR/lib/logging.sh" + +PROJECT_DIR="$RALPH_ROOT" + +# --- Signal stop --- +log_info "Requesting graceful stop for Ralph parallel agents..." +mkdir -p "$PROJECT_DIR/.ralph" +touch "$PROJECT_DIR/.ralph/stop_requested" + +# --- Wait for containers to stop --- +TIMEOUT=120 +ELAPSED=0 +CHECK_INTERVAL=5 + +log_info "Waiting up to ${TIMEOUT}s for agents to finish current iteration..." + +while [ $ELAPSED -lt $TIMEOUT ]; do + RUNNING=$(docker ps --filter "name=ralph-agent-" --format "{{.Names}}" 2>/dev/null || true) + + if [ -z "$RUNNING" ]; then + log_info "All agents stopped gracefully." + rm -f "$PROJECT_DIR/.ralph/stop_requested" + exit 0 + fi + + RUNNING_COUNT=$(echo "$RUNNING" | wc -l | tr -d ' ') + log_info "Still running: $RUNNING_COUNT containers ($ELAPSED/${TIMEOUT}s)" + + sleep "$CHECK_INTERVAL" + ELAPSED=$((ELAPSED + CHECK_INTERVAL)) +done + +# --- Force kill remaining containers --- +REMAINING=$(docker ps --filter "name=ralph-agent-" --format "{{.Names}}" 2>/dev/null || true) + +if [ -n "$REMAINING" ]; then + log_warn "Timeout reached. Force-stopping remaining containers..." 
+ for name in $REMAINING; do + log_warn "Force-stopping: $name" + docker kill "$name" 2>/dev/null || true + docker rm "$name" 2>/dev/null || true + done +fi + +rm -f "$PROJECT_DIR/.ralph/stop_requested" +log_info "Shutdown complete." From 75e9a1d3ad00c782de3d92df367164bdf643d3de Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 13 Feb 2026 10:09:43 -0500 Subject: [PATCH 02/21] fix: volume-based auth, --project flag, and smoke test fixes Switch from env-var token passing to Docker volume-based auth: - Mount ralph-claude-auth volume at /claude-auth:ro - agent-loop.sh copies credentials to writable ~/.claude/ - Add check_auth_volume() to verify volume before launch - Remove CLAUDE_CODE_OAUTH_TOKEN env var requirement Add --project DIR flag to orchestrator, status, and stop scripts so ralph can target external project directories. Bug fixes discovered during smoke test: - Fix UID 1000 conflict in Dockerfile (node:20-slim uses 1000) - Fix macOS seq counting down when count=0 (guard with -le 0) - Fix PARALLEL_PROMPT path resolution for external projects Co-Authored-By: Claude Opus 4.6 --- docker/Dockerfile | 4 +-- docker/agent-loop.sh | 18 +++++++---- parallel/lib/docker-helpers.sh | 43 +++++++++++++++++++++---- parallel/ralph-parallel.sh | 58 ++++++++++++++++++---------------- parallel/status.sh | 15 +++++++-- parallel/stop.sh | 15 +++++++-- 6 files changed, 106 insertions(+), 47 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0a25a72f..721d4b3a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,8 +16,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Install claude-code globally RUN npm install -g @anthropic-ai/claude-code -# Create non-root agent user -RUN useradd -m -s /bin/bash -u 1000 agent +# Create non-root agent user (UID 1001 since node:20-slim uses 1000 for 'node') +RUN useradd -m -s /bin/bash -u 1001 agent # Allow agent to run firewall init scripts via sudo (no password) RUN echo "agent ALL=(root) 
NOPASSWD: /opt/ralph/init-firewall-builder.sh, /opt/ralph/init-firewall-researcher.sh" \ diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index edecb723..a932220f 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -9,14 +9,14 @@ set -euo pipefail # Expected environment variables: # AGENT_ID - Unique agent identifier (e.g., "agent-1") # AGENT_ROLE - One of: builder, researcher -# CLAUDE_CODE_OAUTH_TOKEN - OAuth token JSON for Claude authentication # MAX_ITERATIONS - Max loop iterations (0 = infinite, default: 0) # CLAUDE_MODEL - Model to use (default: claude-sonnet-4-5-20250929) # +# Auth: Claude credentials are mounted read-only via Docker volume at /claude-auth and copied to ~/.claude at startup +# AGENT_ID="${AGENT_ID:?AGENT_ID is required}" AGENT_ROLE="${AGENT_ROLE:?AGENT_ROLE is required}" -CLAUDE_CODE_OAUTH_TOKEN="${CLAUDE_CODE_OAUTH_TOKEN:?CLAUDE_CODE_OAUTH_TOKEN is required}" MAX_ITERATIONS="${MAX_ITERATIONS:-0}" CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-5-20250929}" @@ -41,14 +41,18 @@ case "$AGENT_ROLE" in ;; esac -# --- Step 2: Write Claude auth credentials --- -echo "[$AGENT_ID] Configuring Claude authentication" +# --- Step 2: Copy Claude auth credentials from mounted volume --- +if [ ! -f /claude-auth/.credentials.json ]; then + echo "[$AGENT_ID] ERROR: No Claude credentials found at /claude-auth/.credentials.json" + echo "[$AGENT_ID] Ensure the ralph-claude-auth volume is mounted."
+ exit 1 +fi mkdir -p ~/.claude -echo "$CLAUDE_CODE_OAUTH_TOKEN" > ~/.claude/.credentials.json +cp /claude-auth/.credentials.json ~/.claude/.credentials.json chmod 600 ~/.claude/.credentials.json -export CLAUDE_CODE_OAUTH_TOKEN +echo "[$AGENT_ID] Claude credentials copied" -# --- Step 3: Clone or update workspace from project mount --- +# --- Step 3: Clone or update workspace from bare repo --- setup_workspace() { if [ -d "$WORKSPACE/.git" ]; then echo "[$AGENT_ID] Fetching latest changes" diff --git a/parallel/lib/docker-helpers.sh b/parallel/lib/docker-helpers.sh index d51d1567..c2e92f80 100644 --- a/parallel/lib/docker-helpers.sh +++ b/parallel/lib/docker-helpers.sh @@ -4,6 +4,7 @@ # RALPH_IMAGE="ralph-agent:latest" +CLAUDE_AUTH_VOLUME="ralph-claude-auth" build_image() { local docker_dir="$1" @@ -13,15 +14,40 @@ build_image() { log_info "Image built: $RALPH_IMAGE" } +# Verify the shared Claude auth volume exists and has credentials +check_auth_volume() { + if ! docker volume inspect "$CLAUDE_AUTH_VOLUME" &> /dev/null; then + log_error "Claude auth volume '$CLAUDE_AUTH_VOLUME' not found." + log_error "Run the auth setup first:" + log_error " docker run -it --platform linux/arm64 --entrypoint bash \\" + log_error " -v $CLAUDE_AUTH_VOLUME:/home/agent/.claude \\" + log_error " $RALPH_IMAGE" + log_error " Then inside: claude login" + return 1 + fi + + # Quick check that credentials exist on the volume + local has_creds + has_creds=$(docker run --rm --platform linux/arm64 --entrypoint test \ + -v "$CLAUDE_AUTH_VOLUME":/claude-auth:ro \ + "$RALPH_IMAGE" -f /claude-auth/.credentials.json && echo "yes" || echo "no") + + if [ "$has_creds" != "yes" ]; then + log_error "No credentials found on auth volume. Run 'claude login' in a container first." 
+ return 1 + fi + + return 0 +} + launch_agent() { local agent_id="$1" local agent_role="$2" local project_dir="$3" - local claude_token="$4" - local claude_model="$5" - local max_iterations="$6" - local container_memory="${7:-4g}" - local container_cpus="${8:-2}" + local claude_model="$4" + local max_iterations="$5" + local container_memory="${6:-4g}" + local container_cpus="${7:-2}" # Determine network based on role local network @@ -34,6 +60,9 @@ launch_agent() { local project_dir_abs project_dir_abs="$(cd "$project_dir" && pwd)" + # PARALLEL_PROMPT is set by ralph-parallel.sh (points to ralph repo's CLAUDE-parallel.md) + local prompt_path="${PARALLEL_PROMPT:-$project_dir_abs/parallel/CLAUDE-parallel.md}" + log_info "Launching container: $container_name (role=$agent_role, network=$network)" # Ensure log and state directories exist @@ -50,11 +79,11 @@ launch_agent() { --cap-add=NET_RAW \ -e "AGENT_ID=$agent_id" \ -e "AGENT_ROLE=$agent_role" \ - -e "CLAUDE_CODE_OAUTH_TOKEN=$claude_token" \ -e "CLAUDE_MODEL=$claude_model" \ -e "MAX_ITERATIONS=$max_iterations" \ + -v "$CLAUDE_AUTH_VOLUME:/claude-auth:ro" \ -v "$project_dir_abs/.ralph/repo.git:/repo.git:rw" \ - -v "$project_dir_abs/parallel/CLAUDE-parallel.md:/parallel-prompt/CLAUDE-parallel.md:ro" \ + -v "$prompt_path:/parallel-prompt/CLAUDE-parallel.md:ro" \ -v "$project_dir_abs/agent_logs:/agent-logs:rw" \ -v "$project_dir_abs/.ralph:/harness-state:ro" \ "$RALPH_IMAGE" diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index 9078b1f7..408e493d 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -10,6 +10,7 @@ set -euo pipefail # Usage: ./parallel/ralph-parallel.sh [options] [max_iterations] # # Options: +# --project DIR Project directory containing prd.json (default: current dir) # --agents N Number of builder agents (default: 2) # --researcher N Number of researcher agents with full internet (default: 0) # --model MODEL Claude model to use (default: 
claude-sonnet-4-5-20250929) @@ -34,10 +35,19 @@ CONTAINER_MEMORY="4g" CONTAINER_CPUS="2" MAX_ITERATIONS=0 STALE_CLAIM_MINUTES=30 +PROJECT_DIR="" # --- Parse arguments --- while [[ $# -gt 0 ]]; do case $1 in + --project) + PROJECT_DIR="$2" + shift 2 + ;; + --project=*) + PROJECT_DIR="${1#*=}" + shift + ;; --agents) NUM_BUILDERS="$2" shift 2 @@ -82,6 +92,7 @@ while [[ $# -gt 0 ]]; do echo "Usage: $0 [options] [max_iterations]" echo "" echo "Options:" + echo " --project DIR Project directory with prd.json (default: current dir)" echo " --agents N Number of builder agents (default: 2)" echo " --researcher N Number of researcher agents (default: 0)" echo " --model MODEL Claude model (default: claude-sonnet-4-5-20250929)" @@ -112,9 +123,15 @@ if [ "$TOTAL_AGENTS" -eq 0 ]; then fi # --- Validate project directory --- -# Ralph parallel runs from the project root (same as ralph.sh) -PROJECT_DIR="$RALPH_ROOT" +# Default to current working directory if --project not specified +if [ -z "$PROJECT_DIR" ]; then + PROJECT_DIR="$(pwd)" +fi +# Resolve to absolute path +PROJECT_DIR="$(cd "$PROJECT_DIR" && pwd)" PRD_FILE="$PROJECT_DIR/prd.json" +# CLAUDE-parallel.md lives in the ralph repo, not the project +PARALLEL_PROMPT="$SCRIPT_DIR/CLAUDE-parallel.md" if [ ! -f "$PRD_FILE" ]; then log_error "No prd.json found in $PROJECT_DIR" @@ -122,11 +139,16 @@ if [ ! -f "$PRD_FILE" ]; then exit 1 fi -if [ ! -f "$SCRIPT_DIR/CLAUDE-parallel.md" ]; then +if [ ! -f "$PARALLEL_PROMPT" ]; then log_error "Missing parallel/CLAUDE-parallel.md prompt file" exit 1 fi +if [ ! 
-d "$PROJECT_DIR/.git" ]; then + log_error "$PROJECT_DIR is not a git repository" + exit 1 +fi + # --- Display config --- PROJECT_NAME=$(jq -r '.project // "unknown"' "$PRD_FILE" 2>/dev/null || echo "unknown") BRANCH_NAME=$(jq -r '.branchName // empty' "$PRD_FILE" 2>/dev/null || echo "") @@ -156,16 +178,12 @@ fi # --- Step 2: Create Docker networks --- create_networks -# --- Step 3: Fetch Claude auth token --- -log_info "Fetching Claude auth token..." -CLAUDE_TOKEN=$(fetch_claude_token "$PROJECT_DIR") - -if [ -z "$CLAUDE_TOKEN" ]; then - log_error "Failed to retrieve Claude auth token" - log_error "Options: set RALPH_CLAUDE_TOKEN env var, write to $PROJECT_DIR/.ralph/token, or configure 1Password" +# --- Step 3: Verify Claude auth volume --- +log_info "Checking Claude auth volume..." +if ! check_auth_volume; then exit 1 fi -log_info "Auth token retrieved successfully" +log_info "Claude auth volume verified" # --- Step 4: Create bare repo for agent coordination --- # Agents need a shared bare repo to push to — you can't reliably push @@ -195,6 +213,8 @@ launch_agents_for_role() { local role="$1" local count="$2" + [ "$count" -le 0 ] && return + for i in $(seq 1 "$count"); do AGENT_NUM=$((AGENT_NUM + 1)) local agent_id="agent-${AGENT_NUM}" @@ -210,7 +230,6 @@ launch_agents_for_role() { "$agent_id" \ "$role" \ "$PROJECT_DIR" \ - "$CLAUDE_TOKEN" \ "$CLAUDE_MODEL" \ "$MAX_ITERATIONS" \ "$CONTAINER_MEMORY" \ @@ -340,21 +359,6 @@ while true; do exit 0 fi - # Check for token refresh - if NEW_TOKEN=$(check_token_refresh_file "$PROJECT_DIR"); then - log_info "New auth token detected. Restarting all agents with refreshed token..." - CLAUDE_TOKEN="$NEW_TOKEN" - for name in "${CONTAINER_NAMES[@]}"; do - stop_agent "$name" 15 - done - CONTAINER_NAMES=() - AGENT_NUM=0 - launch_agents_for_role "builder" "$NUM_BUILDERS" - launch_agents_for_role "researcher" "$NUM_RESEARCHERS" - log_info "All agents restarted with new token." 
- continue - fi - # Recover stale claims recover_stale_claims diff --git a/parallel/status.sh b/parallel/status.sh index 45b9f3ed..bb9a5755 100755 --- a/parallel/status.sh +++ b/parallel/status.sh @@ -3,7 +3,7 @@ set -euo pipefail # # status.sh — Show status of Ralph parallel agents and PRD stories. # -# Usage: ./parallel/status.sh +# Usage: ./parallel/status.sh [--project DIR] # SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -11,7 +11,18 @@ RALPH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" source "$SCRIPT_DIR/lib/logging.sh" -PROJECT_DIR="$RALPH_ROOT" +PROJECT_DIR="" +while [[ $# -gt 0 ]]; do + case $1 in + --project) PROJECT_DIR="$2"; shift 2 ;; + --project=*) PROJECT_DIR="${1#*=}"; shift ;; + *) shift ;; + esac +done +if [ -z "$PROJECT_DIR" ]; then + PROJECT_DIR="$(pwd)" +fi +PROJECT_DIR="$(cd "$PROJECT_DIR" && pwd)" PRD_FILE="$PROJECT_DIR/prd.json" BARE_REPO="$PROJECT_DIR/.ralph/repo.git" diff --git a/parallel/stop.sh b/parallel/stop.sh index 38fb1709..259a20b1 100755 --- a/parallel/stop.sh +++ b/parallel/stop.sh @@ -3,7 +3,7 @@ set -euo pipefail # # stop.sh — Graceful shutdown of Ralph parallel agents. # -# Usage: ./parallel/stop.sh +# Usage: ./parallel/stop.sh [--project DIR] # # Creates a stop_requested file that agents check each iteration. # Waits up to 120s for graceful exit, then force-kills remaining containers. @@ -14,7 +14,18 @@ RALPH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" source "$SCRIPT_DIR/lib/logging.sh" -PROJECT_DIR="$RALPH_ROOT" +PROJECT_DIR="" +while [[ $# -gt 0 ]]; do + case $1 in + --project) PROJECT_DIR="$2"; shift 2 ;; + --project=*) PROJECT_DIR="${1#*=}"; shift ;; + *) shift ;; + esac +done +if [ -z "$PROJECT_DIR" ]; then + PROJECT_DIR="$(pwd)" +fi +PROJECT_DIR="$(cd "$PROJECT_DIR" && pwd)" # --- Signal stop --- log_info "Requesting graceful stop for Ralph parallel agents..." 
From 0c4418bc463e314d42f37fa02c6a86551ce26eb5 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 13 Feb 2026 10:27:21 -0500 Subject: [PATCH 03/21] feat: add --image flag for custom Docker images Allow projects to specify a custom Docker image via --image flag, enabling project-specific tooling (e.g., Deno, Python) without modifying the base ralph-agent image. Co-Authored-By: Claude Opus 4.6 --- parallel/lib/docker-helpers.sh | 2 +- parallel/ralph-parallel.sh | 32 +++++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/parallel/lib/docker-helpers.sh b/parallel/lib/docker-helpers.sh index c2e92f80..cbc6c747 100644 --- a/parallel/lib/docker-helpers.sh +++ b/parallel/lib/docker-helpers.sh @@ -3,7 +3,7 @@ # docker-helpers.sh — Container launch and management helpers for Ralph parallel mode. # -RALPH_IMAGE="ralph-agent:latest" +RALPH_IMAGE="${RALPH_IMAGE:-ralph-agent:latest}" CLAUDE_AUTH_VOLUME="ralph-claude-auth" build_image() { diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index 408e493d..9489745b 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -11,6 +11,7 @@ set -euo pipefail # # Options: # --project DIR Project directory containing prd.json (default: current dir) +# --image IMAGE Custom Docker image (default: ralph-agent:latest, auto-built) # --agents N Number of builder agents (default: 2) # --researcher N Number of researcher agents with full internet (default: 0) # --model MODEL Claude model to use (default: claude-sonnet-4-5-20250929) @@ -36,6 +37,7 @@ CONTAINER_CPUS="2" MAX_ITERATIONS=0 STALE_CLAIM_MINUTES=30 PROJECT_DIR="" +CUSTOM_IMAGE="" # --- Parse arguments --- while [[ $# -gt 0 ]]; do @@ -48,6 +50,14 @@ while [[ $# -gt 0 ]]; do PROJECT_DIR="${1#*=}" shift ;; + --image) + CUSTOM_IMAGE="$2" + shift 2 + ;; + --image=*) + CUSTOM_IMAGE="${1#*=}" + shift + ;; --agents) NUM_BUILDERS="$2" shift 2 @@ -93,6 +103,7 @@ while [[ $# -gt 0 ]]; do echo "" echo "Options:" echo " 
--project DIR Project directory with prd.json (default: current dir)" + echo " --image IMAGE Custom Docker image (default: ralph-agent:latest)" echo " --agents N Number of builder agents (default: 2)" echo " --researcher N Number of researcher agents (default: 0)" echo " --model MODEL Claude model (default: claude-sonnet-4-5-20250929)" @@ -161,18 +172,29 @@ log_info "Project: $PROJECT_NAME" log_info "Branch: ${BRANCH_NAME:-}" log_info "Stories: $DONE_STORIES/$TOTAL_STORIES complete" log_info "Agents: $NUM_BUILDERS builders, $NUM_RESEARCHERS researchers ($TOTAL_AGENTS total)" +log_info "Image: ${CUSTOM_IMAGE:-$RALPH_IMAGE (default)}" log_info "Model: $CLAUDE_MODEL" log_info "Memory: $CONTAINER_MEMORY per container" log_info "CPUs: $CONTAINER_CPUS per container" log_info "Max iterations: $MAX_ITERATIONS (0=until PRD complete)" echo "" -# --- Step 1: Build Docker image if needed --- -log_info "Checking Docker image..." -if ! docker image inspect "$RALPH_IMAGE" &> /dev/null; then - build_image "$RALPH_ROOT/docker" +# --- Step 1: Build or verify Docker image --- +if [ -n "$CUSTOM_IMAGE" ]; then + # User specified a custom image — use it, don't auto-build + export RALPH_IMAGE="$CUSTOM_IMAGE" + log_info "Using custom image: $RALPH_IMAGE" + if ! docker image inspect "$RALPH_IMAGE" &> /dev/null; then + log_error "Custom image '$RALPH_IMAGE' not found. Build it first." + exit 1 + fi else - log_info "Image $RALPH_IMAGE already exists. Use 'docker rmi $RALPH_IMAGE' to force rebuild." + log_info "Checking Docker image..." + if ! docker image inspect "$RALPH_IMAGE" &> /dev/null; then + build_image "$RALPH_ROOT/docker" + else + log_info "Image $RALPH_IMAGE already exists. Use 'docker rmi $RALPH_IMAGE' to force rebuild." 
+ fi fi # --- Step 2: Create Docker networks --- From cee3b3d63a787f967045cc6e73fd63a4042f88b5 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 13 Feb 2026 10:38:41 -0500 Subject: [PATCH 04/21] fix: claim_story stdout pollution and set -e crash on retry - Redirect all git output in claim_story() to stderr so only the story ID goes to stdout (prevents garbage in CLAIMED_STORY) - Wrap claim_story call in if-statement to prevent set -e from killing the script when claim returns non-zero - Fix setup_workspace to reset to current branch on restart, not hard-coded origin/main (preserves feature branches) Co-Authored-By: Claude Opus 4.6 --- docker/agent-loop.sh | 48 +++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index a932220f..1b9b02c2 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -58,7 +58,14 @@ setup_workspace() { echo "[$AGENT_ID] Fetching latest changes" cd "$WORKSPACE" git fetch origin - git reset --hard origin/main 2>/dev/null || git reset --hard origin/master 2>/dev/null || true + # Reset to current branch's remote tracking, not hard-coded main + local current_branch + current_branch=$(git branch --show-current 2>/dev/null || echo "") + if [ -n "$current_branch" ]; then + git reset --hard "origin/$current_branch" 2>/dev/null || true + else + git reset --hard origin/main 2>/dev/null || git reset --hard origin/master 2>/dev/null || true + fi else echo "[$AGENT_ID] Cloning bare repo into workspace" git clone "$REPO_PATH" "$WORKSPACE" @@ -111,11 +118,11 @@ checkout_prd_branch() { claim_story() { cd "$WORKSPACE" - # Pull latest prd.json - git pull --rebase 2>&1 || { - git rebase --abort 2>/dev/null || true - git fetch origin - git reset --hard "origin/$(git branch --show-current)" + # Pull latest prd.json (all git output to stderr to keep stdout clean for return value) + git pull --rebase >&2 2>&1 || { + git rebase --abort >/dev/null 2>&1 || 
true + git fetch origin >&2 2>&1 + git reset --hard "origin/$(git branch --show-current)" >&2 2>&1 } if [ ! -f prd.json ]; then @@ -150,24 +157,24 @@ claim_story() { ) ' prd.json > prd.json.tmp && mv prd.json.tmp prd.json - git add prd.json - git commit -m "[$AGENT_ID] Claim: $story_id" 2>&1 || { + git add prd.json >&2 2>&1 + git commit -m "[$AGENT_ID] Claim: $story_id" >&2 2>&1 || { echo "[$AGENT_ID] Failed to commit claim for $story_id" >&2 - git checkout -- prd.json 2>/dev/null || true + git checkout -- prd.json >/dev/null 2>&1 || true return 1 } # Atomic push — if this fails, another agent claimed something concurrently - if git push 2>&1; then + if git push >&2 2>&1; then echo "$story_id" return 0 else echo "[$AGENT_ID] Push failed (concurrent claim). Resetting and retrying..." >&2 - git reset --hard HEAD~1 - git pull --rebase 2>&1 || { - git rebase --abort 2>/dev/null || true - git fetch origin - git reset --hard "origin/$(git branch --show-current)" + git reset --hard HEAD~1 >&2 2>&1 + git pull --rebase >&2 2>&1 || { + git rebase --abort >/dev/null 2>&1 || true + git fetch origin >&2 2>&1 + git reset --hard "origin/$(git branch --show-current)" >&2 2>&1 } return 1 fi @@ -259,9 +266,14 @@ while true; do CLAIMED_STORY="" CLAIM_ATTEMPTS=0 while [ $CLAIM_ATTEMPTS -lt 3 ] && [ -z "$CLAIMED_STORY" ]; do - CLAIMED_STORY=$(claim_story) && break || true - CLAIM_ATTEMPTS=$((CLAIM_ATTEMPTS + 1)) - sleep 2 + # Use if to prevent set -e from killing the script on claim failure + if CLAIMED_STORY=$(claim_story); then + break + else + CLAIMED_STORY="" + CLAIM_ATTEMPTS=$((CLAIM_ATTEMPTS + 1)) + sleep 2 + fi done if [ -z "$CLAIMED_STORY" ]; then From f7f879dd19f96b9c3dcf7fd0c86dd6cbb9a93830 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 13 Feb 2026 12:18:06 -0500 Subject: [PATCH 05/21] fix: add jsr.io and deno.land to builder firewall whitelist Deno projects need access to jsr.io (Deno's package registry) for dependency resolution and type checking. 
Without this, agents can't run `deno task check` inside builder containers. Co-Authored-By: Claude Opus 4.6 --- docker/init-firewall-builder.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/init-firewall-builder.sh b/docker/init-firewall-builder.sh index 27dd696b..e2894325 100755 --- a/docker/init-firewall-builder.sh +++ b/docker/init-firewall-builder.sh @@ -26,6 +26,8 @@ ALLOWED_DOMAINS=( "api.anthropic.com" "statsig.anthropic.com" "registry.npmjs.org" + "jsr.io" + "deno.land" ) for domain in "${ALLOWED_DOMAINS[@]}"; do From b36bef381cc66fdd932142a2ba982e27c2e62196 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 13 Feb 2026 13:18:00 -0500 Subject: [PATCH 06/21] security: fix token exposure, hardcoded platform, and firewall genericity - Mount only stop_requested file instead of entire .ralph/ directory, preventing agents from reading plaintext auth tokens - Switch stop signal from file-existence to file-content (-s not -f) since the file must exist for Docker bind-mount - Remove hardcoded --platform linux/arm64 so builds work on any arch - Replace hardcoded npm/jsr/deno firewall whitelist with --allow-domain flag, making the firewall language-agnostic - Use treeless bare clone (--filter=blob:none) to avoid exposing old file content that may contain secrets - Add SETENV to sudoers so RALPH_EXTRA_DOMAINS passes through sudo - Document custom image contract and extension pattern in README Co-Authored-By: Claude Opus 4.6 --- docker/Dockerfile | 2 +- docker/agent-loop.sh | 4 +- docker/init-firewall-builder.sh | 22 +++++--- parallel/README.md | 94 +++++++++++++++++++++++++++++---- parallel/lib/docker-helpers.sh | 14 ++--- parallel/ralph-parallel.sh | 24 +++++++-- parallel/status.sh | 2 +- parallel/stop.sh | 2 +- 8 files changed, 134 insertions(+), 30 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 721d4b3a..40434b44 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,7 +20,7 @@ RUN npm install -g @anthropic-ai/claude-code 
RUN useradd -m -s /bin/bash -u 1001 agent # Allow agent to run firewall init scripts via sudo (no password) -RUN echo "agent ALL=(root) NOPASSWD: /opt/ralph/init-firewall-builder.sh, /opt/ralph/init-firewall-researcher.sh" \ +RUN echo "agent ALL=(root) NOPASSWD:SETENV: /opt/ralph/init-firewall-builder.sh, /opt/ralph/init-firewall-researcher.sh" \ > /etc/sudoers.d/agent-firewall && chmod 0440 /etc/sudoers.d/agent-firewall # Copy scripts diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index 1b9b02c2..e400a4c9 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -37,7 +37,7 @@ case "$AGENT_ROLE" in sudo /opt/ralph/init-firewall-researcher.sh ;; *) - sudo /opt/ralph/init-firewall-builder.sh + sudo RALPH_EXTRA_DOMAINS="${RALPH_EXTRA_DOMAINS:-}" /opt/ralph/init-firewall-builder.sh ;; esac @@ -238,7 +238,7 @@ echo "[$AGENT_ID] Entering main loop" while true; do # Check stop signal - if [ -f "$STOP_FILE" ]; then + if [ -s "$STOP_FILE" ]; then echo "[$AGENT_ID] Stop requested. Exiting gracefully." exit 0 fi diff --git a/docker/init-firewall-builder.sh b/docker/init-firewall-builder.sh index e2894325..5da643fe 100755 --- a/docker/init-firewall-builder.sh +++ b/docker/init-firewall-builder.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash set -euo pipefail # -# Builder firewall: whitelist only Claude API, Statsig, and npm registry. +# Builder firewall: whitelist only Claude API and user-specified domains. # Everything else is denied. Must run as root (called via sudo). # +# Extra domains are passed via the RALPH_EXTRA_DOMAINS env var (comma-separated). 
+# # Flush existing rules iptables -F OUTPUT 2>/dev/null || true @@ -21,15 +23,23 @@ iptables -A OUTPUT -m state --state ESTABLISHED,RELATED -j ACCEPT iptables -A OUTPUT -p udp --dport 53 -j ACCEPT iptables -A OUTPUT -p tcp --dport 53 -j ACCEPT -# Resolve and allow each whitelisted domain +# Always-allowed domains (Claude Code needs these) ALLOWED_DOMAINS=( "api.anthropic.com" "statsig.anthropic.com" - "registry.npmjs.org" - "jsr.io" - "deno.land" ) +# Append user-specified domains from RALPH_EXTRA_DOMAINS env var +if [ -n "${RALPH_EXTRA_DOMAINS:-}" ]; then + IFS=',' read -ra extra <<< "$RALPH_EXTRA_DOMAINS" + for domain in "${extra[@]}"; do + # Trim whitespace + domain=$(echo "$domain" | xargs) + [ -n "$domain" ] && ALLOWED_DOMAINS+=("$domain") + done +fi + +# Resolve and allow each whitelisted domain for domain in "${ALLOWED_DOMAINS[@]}"; do # Resolve all IPs for the domain ips=$(dig +short "$domain" 2>/dev/null | grep -E '^[0-9]+\.' || true) @@ -52,4 +62,4 @@ done # Default deny all other outbound traffic iptables -A OUTPUT -j DROP -echo "[firewall] Builder firewall initialized. Only API + npm registry allowed." +echo "[firewall] Builder firewall initialized. 
Allowed domains: ${ALLOWED_DOMAINS[*]}" diff --git a/parallel/README.md b/parallel/README.md index 30d5d8e4..72746c59 100644 --- a/parallel/README.md +++ b/parallel/README.md @@ -42,11 +42,14 @@ export RALPH_CLAUDE_TOKEN='' ./parallel/ralph-parallel.sh [options] [max_iterations] Options: - --agents N Number of builder agents (default: 2) - --researcher N Number of researcher agents with full internet (default: 0) - --model MODEL Claude model (default: claude-sonnet-4-5-20250929) - --memory SIZE Per-container memory limit (default: 4g) - --cpus N Per-container CPU limit (default: 2) + --project DIR Project directory with prd.json (default: current dir) + --image IMAGE Custom Docker image (default: ralph-agent:latest) + --agents N Number of builder agents (default: 2) + --researcher N Number of researcher agents with full internet (default: 0) + --model MODEL Claude model (default: claude-sonnet-4-5-20250929) + --memory SIZE Per-container memory limit (default: 4g) + --cpus N Per-container CPU limit (default: 2) + --allow-domain D Extra domain to whitelist in firewall (repeatable) Arguments: max_iterations Per-agent iteration cap (default: 0 = until PRD complete) @@ -81,13 +84,84 @@ The orchestrator checks for claims older than 30 minutes where the agent's conta | Role | Network | Purpose | |------|---------|---------| -| `builder` | API + npm only | Feature implementation, testing, code changes | +| `builder` | API + allowed domains only | Feature implementation, testing, code changes | | `researcher` | Full internet | Web research, documentation lookup | Builder agents are restricted via iptables to only reach: -- `api.anthropic.com` (Claude API) -- `statsig.anthropic.com` (telemetry) -- `registry.npmjs.org` (npm packages) +- `api.anthropic.com` (Claude API) — always allowed +- `statsig.anthropic.com` (telemetry) — always allowed +- Any domains passed via `--allow-domain` + +Use `--allow-domain` to whitelist package registries your project needs: + +```bash +# 
Node.js +./parallel/ralph-parallel.sh --allow-domain registry.npmjs.org + +# Python +./parallel/ralph-parallel.sh \ + --allow-domain pypi.org \ + --allow-domain files.pythonhosted.org + +# Go +./parallel/ralph-parallel.sh \ + --allow-domain proxy.golang.org \ + --allow-domain sum.golang.org + +# Rust +./parallel/ralph-parallel.sh \ + --allow-domain crates.io \ + --allow-domain static.crates.io +``` + +## Custom Images + +The default `ralph-agent:latest` image is based on `node:20-slim` (Node.js is required for Claude Code). If your project needs additional runtimes (Python, Go, Rust, etc.), extend the base image. + +### Image Contract + +When extending `ralph-agent:latest`, you **must**: + +- Preserve the `agent` user (UID 1001) — do not delete or change its UID +- Keep the `/opt/ralph/` scripts intact — do not remove or modify them +- Keep the default `ENTRYPOINT` (`/opt/ralph/agent-loop.sh`) +- Switch back to `USER agent` after installing system packages + +### Example: Adding Python + +```dockerfile +FROM ralph-agent:latest +USER root +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip python3-venv \ + && rm -rf /var/lib/apt/lists/* +USER agent +``` + +### Example: Adding Go + +```dockerfile +FROM ralph-agent:latest +USER root +RUN curl -fsSL https://go.dev/dl/go1.22.0.linux-$(dpkg --print-architecture).tar.gz \ + | tar -C /usr/local -xz +ENV PATH="/usr/local/go/bin:${PATH}" +USER agent +``` + +### Using a Custom Image + +Build your extended image, then pass it with `--image`: + +```bash +docker build -t my-project-agent:latest -f Dockerfile.myproject . 
+ +./parallel/ralph-parallel.sh \ + --project /path/to/project \ + --image my-project-agent:latest \ + --allow-domain pypi.org \ + --agents 3 +``` ## File Layout @@ -95,7 +169,7 @@ Builder agents are restricted via iptables to only reach: docker/ ├── Dockerfile # Container image: node:20-slim + claude-code ├── agent-loop.sh # Container entrypoint: firewall → auth → clone → loop -├── init-firewall-builder.sh # iptables: whitelist API + npm only +├── init-firewall-builder.sh # iptables: whitelist API + allowed domains └── init-firewall-researcher.sh # No-op (full internet) parallel/ diff --git a/parallel/lib/docker-helpers.sh b/parallel/lib/docker-helpers.sh index cbc6c747..0d329cf7 100644 --- a/parallel/lib/docker-helpers.sh +++ b/parallel/lib/docker-helpers.sh @@ -10,7 +10,7 @@ build_image() { local docker_dir="$1" log_info "Building Ralph agent image..." - docker build --platform linux/arm64 -t "$RALPH_IMAGE" "$docker_dir" + docker build -t "$RALPH_IMAGE" "$docker_dir" log_info "Image built: $RALPH_IMAGE" } @@ -19,7 +19,7 @@ check_auth_volume() { if ! docker volume inspect "$CLAUDE_AUTH_VOLUME" &> /dev/null; then log_error "Claude auth volume '$CLAUDE_AUTH_VOLUME' not found." 
log_error "Run the auth setup first:" - log_error " docker run -it --platform linux/arm64 --entrypoint bash \\" + log_error " docker run -it --entrypoint bash \\" log_error " -v $CLAUDE_AUTH_VOLUME:/home/agent/.claude \\" log_error " $RALPH_IMAGE" log_error " Then inside: claude login" @@ -28,7 +28,7 @@ check_auth_volume() { # Quick check that credentials exist on the volume local has_creds - has_creds=$(docker run --rm --platform linux/arm64 --entrypoint test \ + has_creds=$(docker run --rm --entrypoint test \ -v "$CLAUDE_AUTH_VOLUME":/claude-auth:ro \ "$RALPH_IMAGE" -f /claude-auth/.credentials.json && echo "yes" || echo "no") @@ -65,13 +65,14 @@ launch_agent() { log_info "Launching container: $container_name (role=$agent_role, network=$network)" - # Ensure log and state directories exist + # Ensure log and state directories exist, and stop signal file is present + # (Docker errors on bind-mounting a nonexistent file) mkdir -p "$project_dir_abs/agent_logs" "$project_dir_abs/.ralph" + touch "$project_dir_abs/.ralph/stop_requested" docker run -d \ --name "$container_name" \ --network "$network" \ - --platform linux/arm64 \ --memory="$container_memory" \ --cpus="$container_cpus" \ --pids-limit=256 \ @@ -81,11 +82,12 @@ launch_agent() { -e "AGENT_ROLE=$agent_role" \ -e "CLAUDE_MODEL=$claude_model" \ -e "MAX_ITERATIONS=$max_iterations" \ + -e "RALPH_EXTRA_DOMAINS=${RALPH_EXTRA_DOMAINS:-}" \ -v "$CLAUDE_AUTH_VOLUME:/claude-auth:ro" \ -v "$project_dir_abs/.ralph/repo.git:/repo.git:rw" \ -v "$prompt_path:/parallel-prompt/CLAUDE-parallel.md:ro" \ -v "$project_dir_abs/agent_logs:/agent-logs:rw" \ - -v "$project_dir_abs/.ralph:/harness-state:ro" \ + -v "$project_dir_abs/.ralph/stop_requested:/harness-state/stop_requested:ro" \ "$RALPH_IMAGE" log_info "Container $container_name started" diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index 9489745b..a0e9ba05 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -38,6 +38,7 @@ 
MAX_ITERATIONS=0 STALE_CLAIM_MINUTES=30 PROJECT_DIR="" CUSTOM_IMAGE="" +declare -a EXTRA_DOMAINS=() # --- Parse arguments --- while [[ $# -gt 0 ]]; do @@ -98,6 +99,14 @@ while [[ $# -gt 0 ]]; do CONTAINER_CPUS="${1#*=}" shift ;; + --allow-domain) + EXTRA_DOMAINS+=("$2") + shift 2 + ;; + --allow-domain=*) + EXTRA_DOMAINS+=("${1#*=}") + shift + ;; -h|--help) echo "Usage: $0 [options] [max_iterations]" echo "" @@ -109,6 +118,7 @@ while [[ $# -gt 0 ]]; do echo " --model MODEL Claude model (default: claude-sonnet-4-5-20250929)" echo " --memory SIZE Per-container memory limit (default: 4g)" echo " --cpus N Per-container CPU limit (default: 2)" + echo " --allow-domain D Extra domain to whitelist in firewall (repeatable)" echo "" echo "Arguments:" echo " max_iterations Per-agent iteration cap (default: 0 = until PRD complete)" @@ -177,6 +187,13 @@ log_info "Model: $CLAUDE_MODEL" log_info "Memory: $CONTAINER_MEMORY per container" log_info "CPUs: $CONTAINER_CPUS per container" log_info "Max iterations: $MAX_ITERATIONS (0=until PRD complete)" +if [ ${#EXTRA_DOMAINS[@]} -gt 0 ]; then + RALPH_EXTRA_DOMAINS=$(IFS=,; echo "${EXTRA_DOMAINS[*]}") + export RALPH_EXTRA_DOMAINS + log_info "Extra domains: $RALPH_EXTRA_DOMAINS" +else + RALPH_EXTRA_DOMAINS="" +fi echo "" # --- Step 1: Build or verify Docker image --- @@ -215,7 +232,7 @@ BARE_REPO="$PROJECT_DIR/.ralph/repo.git" if [ ! -d "$BARE_REPO" ]; then log_info "Creating bare repo for agent coordination..." 
mkdir -p "$PROJECT_DIR/.ralph" - git clone --bare "$PROJECT_DIR" "$BARE_REPO" + git clone --bare --filter=blob:none "$PROJECT_DIR" "$BARE_REPO" log_info "Bare repo created at $BARE_REPO" else # Update the bare repo from the working directory @@ -225,7 +242,8 @@ else cd - > /dev/null fi -rm -f "$PROJECT_DIR/.ralph/stop_requested" +# Clear any previous stop signal (truncate to empty; file is kept for Docker bind-mount) +: > "$PROJECT_DIR/.ralph/stop_requested" # --- Step 6: Launch agent containers --- AGENT_NUM=0 @@ -371,7 +389,7 @@ while true; do sleep "$MONITOR_INTERVAL" # Check if stop was requested - if [ -f "$PROJECT_DIR/.ralph/stop_requested" ]; then + if [ -s "$PROJECT_DIR/.ralph/stop_requested" ]; then log_info "Stop requested. Shutting down all agents..." for name in "${CONTAINER_NAMES[@]}"; do stop_agent "$name" 30 diff --git a/parallel/status.sh b/parallel/status.sh index bb9a5755..924832b3 100755 --- a/parallel/status.sh +++ b/parallel/status.sh @@ -51,7 +51,7 @@ echo "========================================" echo "" # --- Stop signal check --- -if [ -f "$PROJECT_DIR/.ralph/stop_requested" ]; then +if [ -s "$PROJECT_DIR/.ralph/stop_requested" ]; then echo "** STOP REQUESTED -- agents will exit after current iteration **" echo "" fi diff --git a/parallel/stop.sh b/parallel/stop.sh index 259a20b1..bfe23084 100755 --- a/parallel/stop.sh +++ b/parallel/stop.sh @@ -30,7 +30,7 @@ PROJECT_DIR="$(cd "$PROJECT_DIR" && pwd)" # --- Signal stop --- log_info "Requesting graceful stop for Ralph parallel agents..." 
mkdir -p "$PROJECT_DIR/.ralph" -touch "$PROJECT_DIR/.ralph/stop_requested" +echo "stop" > "$PROJECT_DIR/.ralph/stop_requested" # --- Wait for containers to stop --- TIMEOUT=120 From 538c478fe37d3e39c24e20eaffbd1a14a4fa0f61 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 13 Feb 2026 13:45:51 -0500 Subject: [PATCH 07/21] fix: handle new branch push and pull in agent-loop - Add push.autoSetupRemote to git config so first push on a new branch automatically sets up tracking - Skip git pull --rebase when remote branch doesn't exist yet (new branch from prd.json branchName) - Use file:// prefix for bare clone so --filter=blob:none takes effect (git ignores filters on local path clones) Co-Authored-By: Claude Opus 4.6 --- docker/agent-loop.sh | 30 ++++++++++++++++++++---------- parallel/ralph-parallel.sh | 2 +- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index e400a4c9..59949cbb 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -78,6 +78,7 @@ setup_git_identity() { git config user.name "$AGENT_ID" git config user.email "${AGENT_ID}@ralph-agent.local" git config pull.rebase true + git config push.autoSetupRemote true } # --- Step 5: Check out the correct branch from prd.json --- @@ -119,11 +120,16 @@ claim_story() { cd "$WORKSPACE" # Pull latest prd.json (all git output to stderr to keep stdout clean for return value) - git pull --rebase >&2 2>&1 || { - git rebase --abort >/dev/null 2>&1 || true - git fetch origin >&2 2>&1 - git reset --hard "origin/$(git branch --show-current)" >&2 2>&1 - } + # On a new branch with no remote tracking yet, pull will fail — that's fine, we continue + local current_branch + current_branch=$(git branch --show-current 2>/dev/null || echo "") + if git rev-parse --verify "origin/$current_branch" >/dev/null 2>&1; then + git pull --rebase >&2 2>&1 || { + git rebase --abort >/dev/null 2>&1 || true + git fetch origin >&2 2>&1 + git reset --hard "origin/$current_branch" 
>&2 2>&1 + } + fi if [ ! -f prd.json ]; then echo "[$AGENT_ID] No prd.json found" >&2 @@ -171,11 +177,15 @@ claim_story() { else echo "[$AGENT_ID] Push failed (concurrent claim). Resetting and retrying..." >&2 git reset --hard HEAD~1 >&2 2>&1 - git pull --rebase >&2 2>&1 || { - git rebase --abort >/dev/null 2>&1 || true - git fetch origin >&2 2>&1 - git reset --hard "origin/$(git branch --show-current)" >&2 2>&1 - } + local retry_branch + retry_branch=$(git branch --show-current 2>/dev/null || echo "") + if git rev-parse --verify "origin/$retry_branch" >/dev/null 2>&1; then + git pull --rebase >&2 2>&1 || { + git rebase --abort >/dev/null 2>&1 || true + git fetch origin >&2 2>&1 + git reset --hard "origin/$retry_branch" >&2 2>&1 + } + fi return 1 fi } diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index a0e9ba05..17465c14 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -232,7 +232,7 @@ BARE_REPO="$PROJECT_DIR/.ralph/repo.git" if [ ! -d "$BARE_REPO" ]; then log_info "Creating bare repo for agent coordination..." mkdir -p "$PROJECT_DIR/.ralph" - git clone --bare --filter=blob:none "$PROJECT_DIR" "$BARE_REPO" + git clone --bare --filter=blob:none "file://$PROJECT_DIR" "$BARE_REPO" log_info "Bare repo created at $BARE_REPO" else # Update the bare repo from the working directory From 2c82c083b6b6a7f4b55b242e20880f5dce5fde5c Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 13 Feb 2026 13:57:19 -0500 Subject: [PATCH 08/21] feat: auto-detect Dockerfile.ralph in project directory Projects can now be "ralph-ready" by adding a Dockerfile.ralph to their root. When detected, ralph automatically builds a project-specific image (tagged ralph-agent-:latest) without needing --image. Resolution order: --image flag > Dockerfile.ralph > default base image. 
Co-Authored-By: Claude Opus 4.6 --- parallel/README.md | 43 ++++++++++++++++++++++------------ parallel/lib/docker-helpers.sh | 7 +++++- parallel/ralph-parallel.sh | 18 +++++++++++++- 3 files changed, 51 insertions(+), 17 deletions(-) diff --git a/parallel/README.md b/parallel/README.md index 72746c59..9dc9bfe2 100644 --- a/parallel/README.md +++ b/parallel/README.md @@ -118,18 +118,38 @@ Use `--allow-domain` to whitelist package registries your project needs: The default `ralph-agent:latest` image is based on `node:20-slim` (Node.js is required for Claude Code). If your project needs additional runtimes (Python, Go, Rust, etc.), extend the base image. +### `Dockerfile.ralph` Convention + +The easiest way to make a project "ralph-ready" is to add a `Dockerfile.ralph` to the project root. When ralph detects this file, it automatically builds a project-specific image — no `--image` flag needed. + +``` +my-project/ +├── Dockerfile.ralph # <-- ralph auto-detects this +├── prd.json +├── src/ +└── ... +``` + +```bash +# Ralph sees Dockerfile.ralph and builds automatically +./parallel/ralph-parallel.sh --project /path/to/my-project --agents 3 +``` + +The image is tagged `ralph-agent-<project>:latest` (derived from `prd.json`'s `project` field) so multiple projects don't collide. + ### Image Contract -When extending `ralph-agent:latest`, you **must**: +`Dockerfile.ralph` should extend `ralph-agent:latest`. 
When doing so, you **must**: - Preserve the `agent` user (UID 1001) — do not delete or change its UID - Keep the `/opt/ralph/` scripts intact — do not remove or modify them - Keep the default `ENTRYPOINT` (`/opt/ralph/agent-loop.sh`) - Switch back to `USER agent` after installing system packages -### Example: Adding Python +### Example: Python Project ```dockerfile +# Dockerfile.ralph FROM ralph-agent:latest USER root RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -138,9 +158,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ USER agent ``` -### Example: Adding Go +### Example: Go Project ```dockerfile +# Dockerfile.ralph FROM ralph-agent:latest USER root RUN curl -fsSL https://go.dev/dl/go1.22.0.linux-$(dpkg --print-architecture).tar.gz \ @@ -149,19 +170,11 @@ ENV PATH="/usr/local/go/bin:${PATH}" USER agent ``` -### Using a Custom Image - -Build your extended image, then pass it with `--image`: - -```bash -docker build -t my-project-agent:latest -f Dockerfile.myproject . +### Image Resolution Order -./parallel/ralph-parallel.sh \ - --project /path/to/project \ - --image my-project-agent:latest \ - --allow-domain pypi.org \ - --agents 3 -``` +1. `--image IMAGE` flag — explicit override, used as-is +2. `Dockerfile.ralph` in the project directory — auto-built +3. Default `ralph-agent:latest` — base image with Node.js only ## File Layout diff --git a/parallel/lib/docker-helpers.sh b/parallel/lib/docker-helpers.sh index 0d329cf7..b691c948 100644 --- a/parallel/lib/docker-helpers.sh +++ b/parallel/lib/docker-helpers.sh @@ -8,9 +8,14 @@ CLAUDE_AUTH_VOLUME="ralph-claude-auth" build_image() { local docker_dir="$1" + local dockerfile="${2:-}" log_info "Building Ralph agent image..." 
- docker build -t "$RALPH_IMAGE" "$docker_dir" + if [ -n "$dockerfile" ]; then + docker build -t "$RALPH_IMAGE" -f "$dockerfile" "$docker_dir" + else + docker build -t "$RALPH_IMAGE" "$docker_dir" + fi log_info "Image built: $RALPH_IMAGE" } diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index 17465c14..d1a06cb2 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -197,14 +197,30 @@ fi echo "" # --- Step 1: Build or verify Docker image --- +PROJECT_DOCKERFILE="$PROJECT_DIR/Dockerfile.ralph" + if [ -n "$CUSTOM_IMAGE" ]; then - # User specified a custom image — use it, don't auto-build + # Explicit --image flag takes priority export RALPH_IMAGE="$CUSTOM_IMAGE" log_info "Using custom image: $RALPH_IMAGE" if ! docker image inspect "$RALPH_IMAGE" &> /dev/null; then log_error "Custom image '$RALPH_IMAGE' not found. Build it first." exit 1 fi +elif [ -f "$PROJECT_DOCKERFILE" ]; then + # Project has a Dockerfile.ralph — build a project-specific image + # Tag includes project name to avoid collisions between projects + PROJECT_IMAGE_TAG="ralph-agent-${PROJECT_NAME}:latest" + export RALPH_IMAGE="$PROJECT_IMAGE_TAG" + log_info "Found Dockerfile.ralph — building project image: $RALPH_IMAGE" + + # Always ensure the base image exists first + if ! docker image inspect "ralph-agent:latest" &> /dev/null; then + log_info "Building base image first..." + build_image "$RALPH_ROOT/docker" + fi + + build_image "$PROJECT_DIR" "$PROJECT_DOCKERFILE" else log_info "Checking Docker image..." if ! 
docker image inspect "$RALPH_IMAGE" &> /dev/null; then From 991e2b9c31710dd6f3d22ef121b4e2c9bf32c784 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 13 Feb 2026 14:18:27 -0500 Subject: [PATCH 09/21] fix: address PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Swap date parsing order to try GNU date -d first, macOS date -j as fallback (orchestrator runs on the host which could be either OS) - Remove dead token_refresh docs and unused check_token_refresh_file() function — auth model is now volume-based, not token-file-based Co-Authored-By: Claude Opus 4.6 --- README.md | 2 -- parallel/README.md | 6 +----- parallel/lib/auth.sh | 18 ------------------ parallel/ralph-parallel.sh | 6 +++--- 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index dffac24d..ccf2864a 100644 --- a/README.md +++ b/README.md @@ -284,8 +284,6 @@ Priority order (first wins): 2. `.ralph/token` file in the project directory 3. 1Password via `op read` (interactive, startup only) -To refresh the token mid-run without stopping, write a new token to `.ralph/token_refresh`. The orchestrator picks it up within 30 seconds and restarts all containers. - See [parallel/README.md](parallel/README.md) for full documentation. ## References diff --git a/parallel/README.md b/parallel/README.md index 9dc9bfe2..5da339ea 100644 --- a/parallel/README.md +++ b/parallel/README.md @@ -10,7 +10,7 @@ Run N containerized Claude Code agents simultaneously against the same PRD. Each - Claims a story in `prd.json` via git atomic push - Runs Claude Code with the parallel prompt - Pushes results and picks the next story -3. The orchestrator monitors container health, recovers stale claims, and handles token refresh +3. The orchestrator monitors container health and recovers stale claims 4. When all stories have `passes: true`, everything shuts down ## Prerequisites @@ -63,10 +63,6 @@ Token retrieval priority (first wins): 2. 
**`.ralph/token` file** — write your token here 3. **1Password** via `op read` — interactive, startup only -### Mid-Run Token Refresh - -Write a new token to `.ralph/token_refresh`. The orchestrator detects it within 30 seconds and restarts all containers with the new token. - ## Story Claiming Agents claim stories by modifying `prd.json` and using git's atomic push as a lock: diff --git a/parallel/lib/auth.sh b/parallel/lib/auth.sh index e0301fef..5115ac20 100644 --- a/parallel/lib/auth.sh +++ b/parallel/lib/auth.sh @@ -57,21 +57,3 @@ fetch_claude_token() { echo "$token" } - -# Check if a refreshed token has been dropped into the refresh file. -# Returns 0 and prints the new token if found, 1 otherwise. -check_token_refresh_file() { - local project_dir="$1" - local refresh_file="$project_dir/.ralph/token_refresh" - - if [ -f "$refresh_file" ]; then - local new_token - new_token=$(cat "$refresh_file") - if [ -n "$new_token" ]; then - mv "$refresh_file" "${refresh_file}.consumed.$(date +%s)" - echo "$new_token" - return 0 - fi - fi - return 1 -} diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index d1a06cb2..048b2769 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -354,10 +354,10 @@ recover_stale_claims() { [ -z "$story_id" ] && continue [ -z "$claimed_at" ] && continue - # Parse claimed_at timestamp (macOS date -j, fallback to GNU date -d) + # Parse claimed_at timestamp (GNU date -d first, macOS date -j fallback) local claimed_epoch - claimed_epoch=$(date -j -f "%Y-%m-%dT%H:%M:%SZ" "$claimed_at" +%s 2>/dev/null \ - || date -d "$claimed_at" +%s 2>/dev/null \ + claimed_epoch=$(date -d "$claimed_at" +%s 2>/dev/null \ + || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$claimed_at" +%s 2>/dev/null \ || echo "0") if [ "$claimed_epoch" -eq 0 ]; then From bd549042556d9f2134d590747007f874fae5dbe9 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Sat, 14 Feb 2026 14:41:54 -0500 Subject: [PATCH 10/21] Fix double-claiming: script claims story, 
injects ID into prompt Previously both agent-loop.sh AND Claude claimed stories. The script would claim US-001, then Claude would read CLAUDE-parallel.md's claim protocol and grab US-002 and US-003 before doing any work. Now: - agent-loop.sh injects {{CLAIMED_STORY}} into the prompt via sed - CLAUDE-parallel.md tells Claude which story is pre-assigned - Claude is explicitly told not to claim additional stories - Claiming is solely the script's responsibility Co-Authored-By: Claude Opus 4.6 --- docker/agent-loop.sh | 4 ++-- parallel/CLAUDE-parallel.md | 37 +++++++------------------------------ 2 files changed, 9 insertions(+), 32 deletions(-) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index 59949cbb..744228af 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -235,8 +235,8 @@ prepare_prompt() { return 1 fi - # Inject agent identity into prompt - sed "s/{{AGENT_ID}}/$AGENT_ID/g" "$PROMPT_FILE" + # Inject agent identity and claimed story into prompt + sed -e "s/{{AGENT_ID}}/$AGENT_ID/g" -e "s/{{CLAIMED_STORY}}/$CLAIMED_STORY/g" "$PROMPT_FILE" } # --- Main loop --- diff --git a/parallel/CLAUDE-parallel.md b/parallel/CLAUDE-parallel.md index 0b98cd18..63e02db8 100644 --- a/parallel/CLAUDE-parallel.md +++ b/parallel/CLAUDE-parallel.md @@ -4,43 +4,20 @@ You are **{{AGENT_ID}}**, an autonomous coding agent running in parallel with ot ## Your Task +Your assigned story is **{{CLAIMED_STORY}}** — it has already been claimed for you in prd.json. Do NOT claim or work on any other story. + 1. Read the PRD at `prd.json` 2. Read ALL progress files: `progress.txt` and any `progress-*.txt` files (check Codebase Patterns section first) 3. Check you're on the correct branch from PRD `branchName`. If not, check it out or create from main. -4. **Claim** the highest priority user story where `passes: false` AND `claimed_by` is empty (see Claim Protocol below) -5. Implement that single user story +4. `git pull --rebase` to get the latest code +5. 
Implement **only** story **{{CLAIMED_STORY}}** 6. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires) 7. Update AGENTS.md files if you discover reusable patterns (see below) -8. If checks pass, commit ALL changes with message: `feat: [Story ID] - [Story Title]` -9. Update the PRD to set `passes: true` for the completed story +8. If checks pass, commit ALL changes with message: `feat: {{CLAIMED_STORY}} - [Story Title]` +9. Update the PRD to set `passes: true` for **{{CLAIMED_STORY}}** only 10. Append your progress to `progress-{{AGENT_ID}}.txt` -## Claim Protocol - -You are running alongside other agents. To avoid duplicate work, you must **claim** a story before working on it using git's atomic push as a lock: - -1. `git pull --rebase` to get latest prd.json -2. Find the highest-priority story where `passes: false` AND (`claimed_by` is null or empty) -3. Set `claimed_by: "{{AGENT_ID}}"` and `claimed_at: ""` in prd.json for that story -4. `git add prd.json && git commit -m "[{{AGENT_ID}}] Claim: "` -5. `git push` -6. **If push fails** — another agent claimed something concurrently. Run `git pull --rebase` and pick a different unclaimed story. Repeat up to 3 times. -7. After completing work, set `passes: true` in prd.json, commit, and push. - -### Example claim in prd.json: -```json -{ - "id": "US-001", - "title": "Add priority field", - "passes": false, - "claimed_by": "{{AGENT_ID}}", - "claimed_at": "2025-01-15T10:30:00Z", - "priority": 1 -} -``` - -### 3-Strike Rule for Claims -If your `git push` fails 3 times in a row when trying to claim, document the situation in `progress-{{AGENT_ID}}.txt` and wait 30 seconds before retrying. Do not spin indefinitely. +**Important**: Do NOT modify `claimed_by` fields or claim additional stories. The harness script manages claiming. You implement the one story assigned to you. 
## Per-Agent Progress Files From d651eff4e37654aa9f4c923244eb4af81e102723 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Tue, 17 Feb 2026 09:50:39 -0500 Subject: [PATCH 11/21] fix: prevent story hoarding and release claims on auth failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agents could claim every story in rapid succession without completing any — especially when Claude failed (e.g. expired OAuth token). Two new mechanisms prevent this: - check_existing_claim(): before claiming a new story, checks if the agent already owns an incomplete one and reuses that claim - release_claim(): on auth/hard failure, releases the claim back to the pool so other agents can pick it up - Auth failures trigger a 60s backoff before retrying Co-Authored-By: Claude Opus 4.6 --- docker/agent-loop.sh | 213 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 198 insertions(+), 15 deletions(-) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index 744228af..f4776b9d 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -114,6 +114,74 @@ checkout_prd_branch() { fi } +# --- Step 5b: Check if this agent already owns an incomplete story --- +# Returns 0 and prints story ID if found, returns 1 if no active claim +check_existing_claim() { + cd "$WORKSPACE" + + # Pull latest prd.json + local current_branch + current_branch=$(git branch --show-current 2>/dev/null || echo "") + if git rev-parse --verify "origin/$current_branch" >/dev/null 2>&1; then + git pull --rebase >&2 2>&1 || { + git rebase --abort >/dev/null 2>&1 || true + git fetch origin >&2 2>&1 + git reset --hard "origin/$current_branch" >&2 2>&1 + } + fi + + if [ ! 
-f prd.json ]; then + return 1 + fi + + local story_id + story_id=$(jq -r --arg agent "$AGENT_ID" ' + .userStories + | map(select(.passes == false and .claimed_by == $agent)) + | first + | .id // empty + ' prd.json 2>/dev/null || echo "") + + if [ -n "$story_id" ]; then + echo "[$AGENT_ID] Already owns incomplete story: $story_id" >&2 + echo "$story_id" + return 0 + fi + return 1 +} + +# --- Step 5c: Release a claim after Claude failure --- +release_claim() { + local story_id="$1" + cd "$WORKSPACE" + + if [ ! -f prd.json ]; then + return 1 + fi + + echo "[$AGENT_ID] Releasing claim on $story_id" >&2 + + jq --arg sid "$story_id" ' + .userStories |= map( + if .id == $sid then + .claimed_by = null | .claimed_at = null + else . end + ) + ' prd.json > prd.json.tmp && mv prd.json.tmp prd.json + + git add prd.json >&2 2>&1 + git commit -m "[$AGENT_ID] Release: $story_id (Claude failure)" >&2 2>&1 || { + git checkout -- prd.json >/dev/null 2>&1 || true + return 1 + } + git push >&2 2>&1 || { + echo "[$AGENT_ID] Failed to push release for $story_id" >&2 + git reset --hard HEAD~1 >&2 2>&1 + return 1 + } + return 0 +} + # --- Step 6: Claim a story in prd.json --- # Returns 0 and prints story ID if claimed, returns 1 if no stories available claim_story() { @@ -190,15 +258,96 @@ claim_story() { fi } +# --- Step 6b: Claim a story for verification --- +# Returns 0 and prints story ID if claimed, returns 1 if no stories ready for verification +claim_verification() { + cd "$WORKSPACE" + + # Pull latest prd.json + local current_branch + current_branch=$(git branch --show-current 2>/dev/null || echo "") + if git rev-parse --verify "origin/$current_branch" >/dev/null 2>&1; then + git pull --rebase >&2 2>&1 || { + git rebase --abort >/dev/null 2>&1 || true + git fetch origin >&2 2>&1 + git reset --hard "origin/$current_branch" >&2 2>&1 + } + fi + + if [ ! 
-f prd.json ]; then + echo "[$AGENT_ID] No prd.json found" >&2 + return 1 + fi + + # Find stories with passes=true, verified!=true, and no verified_by claim + local story_id + story_id=$(jq -r ' + .userStories + | map(select(.passes == true and .verified != true and (.verified_by == null or .verified_by == ""))) + | first + | .id // empty + ' prd.json 2>/dev/null || echo "") + + if [ -z "$story_id" ]; then + echo "[$AGENT_ID] No stories ready for verification" >&2 + return 1 + fi + + # Claim it by setting verified_by and verified_at + local timestamp + timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + + jq --arg agent "$AGENT_ID" --arg ts "$timestamp" --arg sid "$story_id" ' + .userStories |= map( + if .id == $sid then + .verified_by = $agent | .verified_at = $ts + else . end + ) + ' prd.json > prd.json.tmp && mv prd.json.tmp prd.json + + git add prd.json >&2 2>&1 + git commit -m "[$AGENT_ID] Verify claim: $story_id" >&2 2>&1 || { + echo "[$AGENT_ID] Failed to commit verification claim for $story_id" >&2 + git checkout -- prd.json >/dev/null 2>&1 || true + return 1 + } + + # Atomic push + if git push >&2 2>&1; then + echo "$story_id" + return 0 + else + echo "[$AGENT_ID] Push failed (concurrent claim). Resetting and retrying..." >&2 + git reset --hard HEAD~1 >&2 2>&1 + local retry_branch + retry_branch=$(git branch --show-current 2>/dev/null || echo "") + if git rev-parse --verify "origin/$retry_branch" >/dev/null 2>&1; then + git pull --rebase >&2 2>&1 || { + git rebase --abort >/dev/null 2>&1 || true + git fetch origin >&2 2>&1 + git reset --hard "origin/$retry_branch" >&2 2>&1 + } + fi + return 1 + fi +} + # --- Step 7: Check if all stories are complete --- all_stories_complete() { if [ ! 
-f "$WORKSPACE/prd.json" ]; then return 1 fi - local incomplete - incomplete=$(jq '[.userStories[] | select(.passes == false)] | length' "$WORKSPACE/prd.json" 2>/dev/null || echo "1") - [ "$incomplete" -eq 0 ] + if [ "$AGENT_ROLE" = "verifier" ]; then + # Verifiers check that all stories are both passing AND verified + local incomplete + incomplete=$(jq '[.userStories[] | select(.passes == false or .verified != true)] | length' "$WORKSPACE/prd.json" 2>/dev/null || echo "1") + [ "$incomplete" -eq 0 ] + else + local incomplete + incomplete=$(jq '[.userStories[] | select(.passes == false)] | length' "$WORKSPACE/prd.json" 2>/dev/null || echo "1") + [ "$incomplete" -eq 0 ] + fi } # --- Step 8: Push changes with retry --- @@ -272,12 +421,24 @@ while true; do echo "[$AGENT_ID] === Iteration $ITERATION (commit: $COMMIT) at $TIMESTAMP ===" - # Attempt to claim a story (retry up to 3 times with different stories) + # First check if this agent already owns an incomplete story (from a failed iteration) CLAIMED_STORY="" + + if [ "$AGENT_ROLE" = "verifier" ]; then + CLAIM_FUNC="claim_verification" + else + # Check for existing claim before trying to grab a new one + if CLAIMED_STORY=$(check_existing_claim); then + echo "[$AGENT_ID] Resuming existing claim: $CLAIMED_STORY" + fi + CLAIM_FUNC="claim_story" + fi + + # If no existing claim, attempt to claim a new story (retry up to 3 times) CLAIM_ATTEMPTS=0 while [ $CLAIM_ATTEMPTS -lt 3 ] && [ -z "$CLAIMED_STORY" ]; do # Use if to prevent set -e from killing the script on claim failure - if CLAIMED_STORY=$(claim_story); then + if CLAIMED_STORY=$($CLAIM_FUNC); then break else CLAIMED_STORY="" @@ -287,14 +448,25 @@ while true; do done if [ -z "$CLAIMED_STORY" ]; then - echo "[$AGENT_ID] Could not claim any story. Checking if all complete..." - if all_stories_complete; then - echo "[$AGENT_ID] All stories complete. Exiting." - exit 0 + if [ "$AGENT_ROLE" = "verifier" ]; then + echo "[$AGENT_ID] No stories ready for verification. 
Checking if all complete..." + if all_stories_complete; then + echo "[$AGENT_ID] All stories verified. Exiting." + exit 0 + fi + echo "[$AGENT_ID] Builders still working. Waiting 30s..." + sleep 30 + continue + else + echo "[$AGENT_ID] Could not claim any story. Checking if all complete..." + if all_stories_complete; then + echo "[$AGENT_ID] All stories complete. Exiting." + exit 0 + fi + echo "[$AGENT_ID] Stories exist but couldn't claim. Waiting 30s..." + sleep 30 + continue fi - echo "[$AGENT_ID] Stories exist but couldn't claim. Waiting 30s..." - sleep 30 - continue fi echo "[$AGENT_ID] Claimed story: $CLAIMED_STORY" @@ -308,13 +480,24 @@ while true; do # Run Claude echo "[$AGENT_ID] Running Claude (model: $CLAUDE_MODEL) for story: $CLAIMED_STORY" + CLAUDE_EXIT=0 claude --dangerously-skip-permissions \ --print \ --model "$CLAUDE_MODEL" \ -p "$PROMPT" \ - &> "$LOGFILE" || { - echo "[$AGENT_ID] Claude exited with error (code: $?). Check log: $LOGFILE" - } + &> "$LOGFILE" || CLAUDE_EXIT=$? + + # Detect hard failures (auth errors, crashes) — release claim so other agents can take it + if [ $CLAUDE_EXIT -ne 0 ] && [ -f "$LOGFILE" ]; then + if grep -q "authentication_error\|OAuth token has expired\|Failed to authenticate" "$LOGFILE"; then + echo "[$AGENT_ID] Claude auth failure detected. Releasing claim on $CLAIMED_STORY." + release_claim "$CLAIMED_STORY" || true + echo "[$AGENT_ID] Waiting 60s before retrying (auth may need refresh)..." + sleep 60 + continue + fi + echo "[$AGENT_ID] Claude exited with error (code: $CLAUDE_EXIT). Check log: $LOGFILE" + fi echo "[$AGENT_ID] Claude session complete. Pushing changes..." 
From 904afc1e6d69bad1834d17c5dc953277bcb44259 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Tue, 17 Feb 2026 16:11:07 -0500 Subject: [PATCH 12/21] feat: verifier agents and docker labels for container tracking - Add --verifier flag to ralph-parallel.sh for launching verifier agents - Add ralph.role, ralph.agent_id, ralph.project_dir docker labels to containers for external tooling (voice TUI panel filtering) - Verifiers use separate CLAUDE-verifier.md prompt - Orchestrator checks verified field when verifiers are in use - Add stale verification claim recovery Co-Authored-By: Claude Opus 4.6 --- parallel/CLAUDE-verifier.md | 101 ++++++++++++++++++++++++++++++ parallel/lib/docker-helpers.sh | 14 ++++- parallel/ralph-parallel.sh | 108 +++++++++++++++++++++++++++++++-- 3 files changed, 215 insertions(+), 8 deletions(-) create mode 100644 parallel/CLAUDE-verifier.md diff --git a/parallel/CLAUDE-verifier.md b/parallel/CLAUDE-verifier.md new file mode 100644 index 00000000..b18549b1 --- /dev/null +++ b/parallel/CLAUDE-verifier.md @@ -0,0 +1,101 @@ +# Ralph Verifier Agent Instructions + +You are **{{AGENT_ID}}** (role: verifier), an autonomous verification agent running in parallel with builder agents. You are in a sandboxed Docker container with `--dangerously-skip-permissions`. + +## Your Task + +Your assigned story is **{{CLAIMED_STORY}}** — it has `passes: true` set by a builder agent. Your job is to **independently verify** that the implementation actually works by running the project's tests and inspecting results against the acceptance criteria. + +1. Read the PRD at `prd.json` +2. Find story **{{CLAIMED_STORY}}** and read its acceptance criteria +3. Read ALL progress files: `progress.txt` and any `progress-*.txt` files for context on what was built +4. `git pull --rebase` to get the latest code +5. Auto-detect the test framework and run tests +6. Evaluate results against acceptance criteria +7. 
Update prd.json based on your findings (see Verification Outcomes below) +8. Commit and push your changes + +## Auto-Detect Test Framework + +Inspect the project root for build/test configuration: + +| File | Command | +|------|---------| +| `package.json` (with `scripts.test`) | `npm test` | +| `pyproject.toml` or `setup.py` | `pytest` | +| `Cargo.toml` | `cargo test` | +| `go.mod` | `go test ./...` | +| `Makefile` (with `test` target) | `make test` | +| `build.gradle` or `build.gradle.kts` | `./gradlew test` | +| `pom.xml` | `mvn test` | + +If multiple are present, prefer the one most relevant to the story's changes. If no test framework is found, note this in `verification_notes` and mark as verified (no tests to fail). + +## Verification Outcomes + +### If tests PASS and acceptance criteria are met: + +Update prd.json for **{{CLAIMED_STORY}}**: +```json +{ + "verified": true, + "verified_by": "{{AGENT_ID}}", + "verified_at": "", + "verification_notes": "All tests pass. " +} +``` + +### If tests FAIL or acceptance criteria are NOT met: + +**Bounce the story back to builders** by updating prd.json for **{{CLAIMED_STORY}}**: +```json +{ + "passes": false, + "claimed_by": null, + "claimed_at": null, + "verified": false, + "verified_by": null, + "verified_at": null, + "verification_notes": "FAILED: " +} +``` + +This clears the builder's claim so another builder can pick it up and fix the issues. + +## Push Protocol + +Always follow this sequence: +1. `git add prd.json` +2. `git commit -m "[{{AGENT_ID}}] Verify: {{CLAIMED_STORY}} — "` +3. `git pull --rebase origin ` +4. `git push origin ` +5. 
If push fails, repeat from step 3 (max 3 retries) + +## Critical Rules + +- **Do NOT modify source code** — you only modify `prd.json` +- **Do NOT claim additional stories** — the harness assigns stories to you +- **One story per iteration** — verify the assigned story and exit +- Run the full test suite, not just targeted tests, to catch regressions +- Be specific in `verification_notes` — builders need actionable feedback to fix failures +- If the test command itself fails to run (missing dependencies, build errors), that counts as a failure + +## Progress Report + +APPEND to `progress-{{AGENT_ID}}.txt`: +``` +## [Date/Time] - Verify {{CLAIMED_STORY}} +- Result: PASS/FAIL +- Tests run: +- Details: +--- +``` + +## Stop Condition + +After verifying a story, check if ALL stories have `passes: true` AND `verified: true`. + +If ALL stories are verified, reply with: +COMPLETE + +If there are still unverified stories, end your response normally (another iteration will pick up the next story). 
diff --git a/parallel/lib/docker-helpers.sh b/parallel/lib/docker-helpers.sh index b691c948..7d2ed155 100644 --- a/parallel/lib/docker-helpers.sh +++ b/parallel/lib/docker-helpers.sh @@ -54,7 +54,7 @@ launch_agent() { local container_memory="${6:-4g}" local container_cpus="${7:-2}" - # Determine network based on role + # Determine network based on role (verifiers use builder network — no internet needed) local network case "$agent_role" in researcher) network="$RESEARCHER_NETWORK" ;; @@ -65,8 +65,13 @@ launch_agent() { local project_dir_abs project_dir_abs="$(cd "$project_dir" && pwd)" - # PARALLEL_PROMPT is set by ralph-parallel.sh (points to ralph repo's CLAUDE-parallel.md) - local prompt_path="${PARALLEL_PROMPT:-$project_dir_abs/parallel/CLAUDE-parallel.md}" + # Select prompt based on role: verifiers get CLAUDE-verifier.md, others get CLAUDE-parallel.md + local prompt_path + if [ "$agent_role" = "verifier" ]; then + prompt_path="${VERIFIER_PROMPT:-$project_dir_abs/parallel/CLAUDE-verifier.md}" + else + prompt_path="${PARALLEL_PROMPT:-$project_dir_abs/parallel/CLAUDE-parallel.md}" + fi log_info "Launching container: $container_name (role=$agent_role, network=$network)" @@ -83,6 +88,9 @@ launch_agent() { --pids-limit=256 \ --cap-add=NET_ADMIN \ --cap-add=NET_RAW \ + --label "ralph.role=$agent_role" \ + --label "ralph.agent_id=$agent_id" \ + --label "ralph.project_dir=$project_dir_abs" \ -e "AGENT_ID=$agent_id" \ -e "AGENT_ROLE=$agent_role" \ -e "CLAUDE_MODEL=$claude_model" \ diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index 048b2769..cecb7b54 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -31,6 +31,7 @@ source "$SCRIPT_DIR/lib/docker-helpers.sh" # --- Defaults --- NUM_BUILDERS=2 NUM_RESEARCHERS=0 +NUM_VERIFIERS=0 CLAUDE_MODEL="claude-sonnet-4-5-20250929" CONTAINER_MEMORY="4g" CONTAINER_CPUS="2" @@ -75,6 +76,14 @@ while [[ $# -gt 0 ]]; do NUM_RESEARCHERS="${1#*=}" shift ;; + --verifier) + NUM_VERIFIERS="$2" + 
shift 2 + ;; + --verifier=*) + NUM_VERIFIERS="${1#*=}" + shift + ;; --model) CLAUDE_MODEL="$2" shift 2 @@ -115,6 +124,7 @@ while [[ $# -gt 0 ]]; do echo " --image IMAGE Custom Docker image (default: ralph-agent:latest)" echo " --agents N Number of builder agents (default: 2)" echo " --researcher N Number of researcher agents (default: 0)" + echo " --verifier N Number of verifier agents (default: 0)" echo " --model MODEL Claude model (default: claude-sonnet-4-5-20250929)" echo " --memory SIZE Per-container memory limit (default: 4g)" echo " --cpus N Per-container CPU limit (default: 2)" @@ -136,7 +146,7 @@ while [[ $# -gt 0 ]]; do esac done -TOTAL_AGENTS=$((NUM_BUILDERS + NUM_RESEARCHERS)) +TOTAL_AGENTS=$((NUM_BUILDERS + NUM_RESEARCHERS + NUM_VERIFIERS)) if [ "$TOTAL_AGENTS" -eq 0 ]; then log_error "No agents configured. Use --agents N and/or --researcher N." @@ -153,6 +163,7 @@ PROJECT_DIR="$(cd "$PROJECT_DIR" && pwd)" PRD_FILE="$PROJECT_DIR/prd.json" # CLAUDE-parallel.md lives in the ralph repo, not the project PARALLEL_PROMPT="$SCRIPT_DIR/CLAUDE-parallel.md" +VERIFIER_PROMPT="$SCRIPT_DIR/CLAUDE-verifier.md" if [ ! -f "$PRD_FILE" ]; then log_error "No prd.json found in $PROJECT_DIR" @@ -165,6 +176,11 @@ if [ ! -f "$PARALLEL_PROMPT" ]; then exit 1 fi +if [ "$NUM_VERIFIERS" -gt 0 ] && [ ! -f "$VERIFIER_PROMPT" ]; then + log_error "Missing parallel/CLAUDE-verifier.md prompt file (required when --verifier > 0)" + exit 1 +fi + if [ ! 
-d "$PROJECT_DIR/.git" ]; then log_error "$PROJECT_DIR is not a git repository" exit 1 @@ -181,7 +197,7 @@ log_info "====================" log_info "Project: $PROJECT_NAME" log_info "Branch: ${BRANCH_NAME:-}" log_info "Stories: $DONE_STORIES/$TOTAL_STORIES complete" -log_info "Agents: $NUM_BUILDERS builders, $NUM_RESEARCHERS researchers ($TOTAL_AGENTS total)" +log_info "Agents: $NUM_BUILDERS builders, $NUM_RESEARCHERS researchers, $NUM_VERIFIERS verifiers ($TOTAL_AGENTS total)" log_info "Image: ${CUSTOM_IMAGE:-$RALPH_IMAGE (default)}" log_info "Model: $CLAUDE_MODEL" log_info "Memory: $CONTAINER_MEMORY per container" @@ -298,6 +314,7 @@ launch_agents_for_role() { log_info "Launching agents..." launch_agents_for_role "builder" "$NUM_BUILDERS" launch_agents_for_role "researcher" "$NUM_RESEARCHERS" +launch_agents_for_role "verifier" "$NUM_VERIFIERS" log_info "All $TOTAL_AGENTS agents launched." echo "" @@ -319,14 +336,23 @@ read_from_bare_repo() { } # Helper: check if all stories are complete in the bare repo +# When verifiers are in use, requires passes AND verified for all stories. 
check_all_stories_complete() { local prd_content prd_content=$(read_from_bare_repo "prd.json" "$BRANCH_NAME") [ -z "$prd_content" ] && return 1 - local incomplete - incomplete=$(echo "$prd_content" | jq '[.userStories[] | select(.passes == false)] | length' 2>/dev/null || echo "1") - [ "$incomplete" -eq 0 ] + if [ "$NUM_VERIFIERS" -gt 0 ]; then + # Require both passes and verified + local incomplete + incomplete=$(echo "$prd_content" | jq '[.userStories[] | select(.passes == false or .verified != true)] | length' 2>/dev/null || echo "1") + [ "$incomplete" -eq 0 ] + else + # Passes-only (backward compat) + local incomplete + incomplete=$(echo "$prd_content" | jq '[.userStories[] | select(.passes == false)] | length' 2>/dev/null || echo "1") + [ "$incomplete" -eq 0 ] + fi } recover_stale_claims() { @@ -401,6 +427,77 @@ recover_stale_claims() { fi } +recover_stale_verification_claims() { + [ "$NUM_VERIFIERS" -eq 0 ] && return + + local prd_content + prd_content=$(read_from_bare_repo "prd.json" "$BRANCH_NAME") + [ -z "$prd_content" ] && return + + local now_epoch + now_epoch=$(date +%s) + local stale_seconds=$((STALE_CLAIM_MINUTES * 60)) + + local vclaims + vclaims=$(echo "$prd_content" | jq -r ' + .userStories[] + | select(.passes == true and .verified != true and .verified_by != null and .verified_by != "") + | "\(.id)|\(.verified_by)|\(.verified_at // "")" + ' 2>/dev/null || echo "") + + [ -z "$vclaims" ] && return + + local cleared=false + local updated_prd="$prd_content" + while IFS='|' read -r story_id agent verified_at; do + [ -z "$story_id" ] && continue + [ -z "$verified_at" ] && continue + + local claimed_epoch + claimed_epoch=$(date -d "$verified_at" +%s 2>/dev/null \ + || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$verified_at" +%s 2>/dev/null \ + || echo "0") + + if [ "$claimed_epoch" -eq 0 ]; then + continue + fi + + local age=$((now_epoch - claimed_epoch)) + if [ "$age" -gt "$stale_seconds" ]; then + local container_name="ralph-${agent}" + if ! 
is_agent_running "$container_name"; then + log_warn "Stale verification claim: $story_id by $agent (${age}s old, container not running). Clearing." + updated_prd=$(echo "$updated_prd" | jq --arg sid "$story_id" ' + .userStories |= map( + if .id == $sid then + .verified_by = null | .verified_at = null + else . end + ) + ') + cleared=true + fi + fi + done <<< "$vclaims" + + if $cleared; then + local temp_dir + temp_dir=$(mktemp -d) + git clone "$BARE_REPO" "$temp_dir/work" 2>/dev/null + cd "$temp_dir/work" + git config user.name "ralph-orchestrator" + git config user.email "orchestrator@ralph-agent.local" + if [ -n "$BRANCH_NAME" ]; then + git checkout "$BRANCH_NAME" 2>/dev/null || true + fi + echo "$updated_prd" | jq '.' > prd.json + git add prd.json + git commit -m "[orchestrator] Clear stale verification claims" 2>/dev/null || true + git push origin 2>/dev/null || true + cd - > /dev/null + rm -rf "$temp_dir" + fi +} + while true; do sleep "$MONITOR_INTERVAL" @@ -417,6 +514,7 @@ while true; do # Recover stale claims recover_stale_claims + recover_stale_verification_claims # Check container health ALL_STOPPED=true From b3dcc25f8e3655aae6c93e3307cd8fbd572ddae3 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 20 Feb 2026 10:50:40 -0500 Subject: [PATCH 13/21] fix: complete fetch-before-verify and clean dirty trees between iterations Bug 1: Add git fetch origin before git rev-parse --verify in check_existing_claim() and claim_verification() to match the pattern already used in claim_story(). Without the fetch, rev-parse checks stale remote refs and skips the pull, causing agents to work on outdated prd.json state. Bug 5: Add git checkout/clean at the top of each iteration to remove any unstaged changes left by the previous Claude session, preventing rebase failures on the next pull. 
Co-Authored-By: Claude Opus 4.6 --- docker/agent-loop.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index f4776b9d..e5087b17 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -122,6 +122,8 @@ check_existing_claim() { # Pull latest prd.json local current_branch current_branch=$(git branch --show-current 2>/dev/null || echo "") + # Always fetch first so we discover remote branches created by other agents + git fetch origin >&2 2>&1 || true if git rev-parse --verify "origin/$current_branch" >/dev/null 2>&1; then git pull --rebase >&2 2>&1 || { git rebase --abort >/dev/null 2>&1 || true @@ -191,6 +193,8 @@ claim_story() { # On a new branch with no remote tracking yet, pull will fail — that's fine, we continue local current_branch current_branch=$(git branch --show-current 2>/dev/null || echo "") + # Always fetch first so we discover remote branches created by other agents + git fetch origin >&2 2>&1 || true if git rev-parse --verify "origin/$current_branch" >/dev/null 2>&1; then git pull --rebase >&2 2>&1 || { git rebase --abort >/dev/null 2>&1 || true @@ -247,6 +251,8 @@ claim_story() { git reset --hard HEAD~1 >&2 2>&1 local retry_branch retry_branch=$(git branch --show-current 2>/dev/null || echo "") + # Fetch first to discover remote branches created by other agents + git fetch origin >&2 2>&1 || true if git rev-parse --verify "origin/$retry_branch" >/dev/null 2>&1; then git pull --rebase >&2 2>&1 || { git rebase --abort >/dev/null 2>&1 || true @@ -266,6 +272,8 @@ claim_verification() { # Pull latest prd.json local current_branch current_branch=$(git branch --show-current 2>/dev/null || echo "") + # Always fetch first so we discover remote branches created by other agents + git fetch origin >&2 2>&1 || true if git rev-parse --verify "origin/$current_branch" >/dev/null 2>&1; then git pull --rebase >&2 2>&1 || { git rebase --abort >/dev/null 2>&1 || true @@ -321,6 +329,8 @@ 
claim_verification() { git reset --hard HEAD~1 >&2 2>&1 local retry_branch retry_branch=$(git branch --show-current 2>/dev/null || echo "") + # Fetch first to discover remote branches created by other agents + git fetch origin >&2 2>&1 || true if git rev-parse --verify "origin/$retry_branch" >/dev/null 2>&1; then git pull --rebase >&2 2>&1 || { git rebase --abort >/dev/null 2>&1 || true @@ -414,6 +424,11 @@ while true; do exit 0 fi + # Clean any unstaged changes from previous iteration to prevent rebase failures + cd "$WORKSPACE" + git checkout -- . 2>/dev/null || true + git clean -fd 2>/dev/null || true + ITERATION=$((ITERATION + 1)) COMMIT=$(git rev-parse --short=6 HEAD 2>/dev/null || echo "000000") LOGFILE="${LOG_DIR}/${AGENT_ID}_iter${ITERATION}_${COMMIT}.log" From be73a933bab63901f346567caf27ffa520118e0d Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 20 Feb 2026 10:51:07 -0500 Subject: [PATCH 14/21] fix: exponential backoff and max retry limit for auth failures Previously, auth failures caused a fixed 60s sleep and immediate retry, which could loop indefinitely when tokens were revoked or rate-limited. Now tracks consecutive AUTH_FAILURES with a counter that: - Resets to 0 on successful Claude invocation - Logs actual error lines from the logfile for debugging - Exits after MAX_AUTH_FAILURES (5) consecutive failures - Uses exponential backoff: 60s, 120s, 240s, 480s (capped) Co-Authored-By: Claude Opus 4.6 --- docker/agent-loop.sh | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index e5087b17..1940fb00 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -405,6 +405,9 @@ checkout_prd_branch echo "[$AGENT_ID] Entering main loop" +AUTH_FAILURES=0 +MAX_AUTH_FAILURES=5 + while true; do # Check stop signal if [ -s "$STOP_FILE" ]; then @@ -502,13 +505,28 @@ while true; do -p "$PROMPT" \ &> "$LOGFILE" || CLAUDE_EXIT=$? 
+ # Reset auth failure counter on successful invocation + if [ $CLAUDE_EXIT -eq 0 ]; then + AUTH_FAILURES=0 + fi + # Detect hard failures (auth errors, crashes) — release claim so other agents can take it if [ $CLAUDE_EXIT -ne 0 ] && [ -f "$LOGFILE" ]; then if grep -q "authentication_error\|OAuth token has expired\|Failed to authenticate" "$LOGFILE"; then - echo "[$AGENT_ID] Claude auth failure detected. Releasing claim on $CLAIMED_STORY." + AUTH_FAILURES=$((AUTH_FAILURES + 1)) + echo "[$AGENT_ID] Claude auth failure #$AUTH_FAILURES/$MAX_AUTH_FAILURES. Releasing claim on $CLAIMED_STORY." + echo "[$AGENT_ID] Error details from log:" + grep -i "error\|rate\|limit\|auth" "$LOGFILE" | tail -5 || true release_claim "$CLAIMED_STORY" || true - echo "[$AGENT_ID] Waiting 60s before retrying (auth may need refresh)..." - sleep 60 + if [ "$AUTH_FAILURES" -ge "$MAX_AUTH_FAILURES" ]; then + echo "[$AGENT_ID] Reached max auth failures ($MAX_AUTH_FAILURES). Exiting to avoid infinite loop." + exit 1 + fi + # Exponential backoff: 60, 120, 240, 480, 480 (capped) + BACKOFF=$((60 * (1 << (AUTH_FAILURES - 1)))) + [ "$BACKOFF" -gt 480 ] && BACKOFF=480 + echo "[$AGENT_ID] Waiting ${BACKOFF}s before retrying (exponential backoff)..." + sleep "$BACKOFF" continue fi echo "[$AGENT_ID] Claude exited with error (code: $CLAUDE_EXIT). Check log: $LOGFILE" From 21abddbe7dae807133f954363455ee36c46b3e30 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Fri, 20 Feb 2026 10:53:01 -0500 Subject: [PATCH 15/21] fix: force-push bare repo sync and clean stale branches on launch Bug 2: Replace silent-fail bare repo push (2>/dev/null || true) with force-push that surfaces errors. If the project has rewritten history (e.g., squash-merge from a PR), a non-force push silently fails and agents start from stale code. Bug 3: After syncing the bare repo, delete all non-main/master branches. 
Previous runs leave behind feature branches that agents may accidentally discover and work on, causing confusion and wasted iterations. Co-Authored-By: Claude Opus 4.6 --- parallel/ralph-parallel.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index cecb7b54..4aa4ada0 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -270,7 +270,15 @@ else # Update the bare repo from the working directory log_info "Updating bare repo from project..." cd "$PROJECT_DIR" - git push "$BARE_REPO" --all 2>/dev/null || true + git push --force "$BARE_REPO" --all 2>&1 || { + log_error "Failed to sync bare repo from project. Manual intervention required." + exit 1 + } + # Clean stale feature branches from bare repo to prevent agents from seeing old state + log_info "Cleaning stale branches from bare repo..." + git --git-dir="$BARE_REPO" for-each-ref --format='%(refname:short)' refs/heads/ | \ + grep -v '^main$\|^master$' | \ + xargs -I{} git --git-dir="$BARE_REPO" branch -D {} 2>/dev/null || true cd - > /dev/null fi From cd0fecd36c3764727f4f56f712fe780162cdab7a Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Sat, 21 Feb 2026 09:54:29 -0500 Subject: [PATCH 16/21] feat: configurable git author identities via .ralph/friends.json Agents can now commit as custom identities loaded from a gitignored config file, with overflow agents falling back to default bot names. 
Co-Authored-By: Claude Opus 4.6 --- docker/agent-loop.sh | 10 ++++++++-- parallel/lib/docker-helpers.sh | 4 ++++ parallel/ralph-parallel.sh | 28 +++++++++++++++++++++++++++- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index 1940fb00..52282b12 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -75,8 +75,14 @@ setup_workspace() { # --- Step 4: Set git identity --- setup_git_identity() { - git config user.name "$AGENT_ID" - git config user.email "${AGENT_ID}@ralph-agent.local" + if [ -n "${GIT_AUTHOR_NAME_OVERRIDE:-}" ] && [ -n "${GIT_AUTHOR_EMAIL_OVERRIDE:-}" ]; then + git config user.name "$GIT_AUTHOR_NAME_OVERRIDE" + git config user.email "$GIT_AUTHOR_EMAIL_OVERRIDE" + echo "[$AGENT_ID] Committing as: $GIT_AUTHOR_NAME_OVERRIDE <$GIT_AUTHOR_EMAIL_OVERRIDE>" + else + git config user.name "$AGENT_ID" + git config user.email "${AGENT_ID}@ralph-agent.local" + fi git config pull.rebase true git config push.autoSetupRemote true } diff --git a/parallel/lib/docker-helpers.sh b/parallel/lib/docker-helpers.sh index 7d2ed155..f37c1737 100644 --- a/parallel/lib/docker-helpers.sh +++ b/parallel/lib/docker-helpers.sh @@ -53,6 +53,8 @@ launch_agent() { local max_iterations="$5" local container_memory="${6:-4g}" local container_cpus="${7:-2}" + local git_author_name="${8:-}" + local git_author_email="${9:-}" # Determine network based on role (verifiers use builder network — no internet needed) local network @@ -95,6 +97,8 @@ launch_agent() { -e "AGENT_ROLE=$agent_role" \ -e "CLAUDE_MODEL=$claude_model" \ -e "MAX_ITERATIONS=$max_iterations" \ + -e "GIT_AUTHOR_NAME_OVERRIDE=${git_author_name}" \ + -e "GIT_AUTHOR_EMAIL_OVERRIDE=${git_author_email}" \ -e "RALPH_EXTRA_DOMAINS=${RALPH_EXTRA_DOMAINS:-}" \ -v "$CLAUDE_AUTH_VOLUME:/claude-auth:ro" \ -v "$project_dir_abs/.ralph/repo.git:/repo.git:rw" \ diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index 4aa4ada0..d1d73037 100755 --- 
a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -289,6 +289,20 @@ fi AGENT_NUM=0 declare -a CONTAINER_NAMES=() +# Load friend identities for git author spoofing (optional) +FRIENDS_FILE="$PROJECT_DIR/.ralph/friends.json" +declare -a FRIEND_NAMES=() +declare -a FRIEND_EMAILS=() +if [ -f "$FRIENDS_FILE" ]; then + while IFS= read -r name; do + FRIEND_NAMES+=("$name") + done < <(jq -r '.[].name' "$FRIENDS_FILE") + while IFS= read -r email; do + FRIEND_EMAILS+=("$email") + done < <(jq -r '.[].email' "$FRIENDS_FILE") + log_info "Loaded ${#FRIEND_NAMES[@]} friend identities from friends.json" +fi + launch_agents_for_role() { local role="$1" local count="$2" @@ -300,6 +314,16 @@ launch_agents_for_role() { local agent_id="agent-${AGENT_NUM}" local container_name="ralph-${agent_id}" + # Assign friend identity if available, otherwise use default agent identity + local git_author_name="" + local git_author_email="" + local idx=$((AGENT_NUM - 1)) + if [ ${#FRIEND_NAMES[@]} -gt 0 ] && [ "$idx" -lt ${#FRIEND_NAMES[@]} ]; then + git_author_name="${FRIEND_NAMES[$idx]}" + git_author_email="${FRIEND_EMAILS[$idx]}" + log_info "Agent $agent_id will commit as: $git_author_name <$git_author_email>" + fi + # Stop existing container with same name if present if docker inspect "$container_name" &> /dev/null; then log_warn "Container $container_name already exists. Removing." @@ -313,7 +337,9 @@ launch_agents_for_role() { "$CLAUDE_MODEL" \ "$MAX_ITERATIONS" \ "$CONTAINER_MEMORY" \ - "$CONTAINER_CPUS" + "$CONTAINER_CPUS" \ + "$git_author_name" \ + "$git_author_email" CONTAINER_NAMES+=("$container_name") done From 06c578a44195aab49222b3e52cefa4f41beacee3 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Sat, 21 Feb 2026 10:16:44 -0500 Subject: [PATCH 17/21] feat: story dependencies via dependsOn field for parallel execution control Add dependsOn support so stories can declare prerequisite story IDs that must pass before they become claimable. 
The claim_story() jq query now builds a set of passing IDs and filters accordingly. Existing PRDs without dependsOn work unchanged (defaults to empty array). Also creates ~/taskflow test project with 8-story diamond dependency DAG for testing 3-agent parallel runs. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 2 +- docker/agent-loop.sh | 12 +++++++++--- parallel/CLAUDE-parallel.md | 2 ++ prd.json.example | 3 +++ skills/ralph/SKILL.md | 21 ++++++++++++++++----- 5 files changed, 31 insertions(+), 9 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index f95bb927..121fb5ef 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,7 +7,7 @@ You are an autonomous coding agent working on a software project. 1. Read the PRD at `prd.json` (in the same directory as this file) 2. Read the progress log at `progress.txt` (check Codebase Patterns section first) 3. Check you're on the correct branch from PRD `branchName`. If not, check it out or create from main. -4. Pick the **highest priority** user story where `passes: false` +4. Pick the **highest priority** user story where `passes: false` and all `dependsOn` story IDs (if any) have `passes: true` 5. Implement that single user story 6. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires) 7. Update CLAUDE.md files if you discover reusable patterns (see below) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index 52282b12..92ca976d 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -214,11 +214,17 @@ claim_story() { return 1 fi - # Find highest-priority unclaimed story (passes: false AND no claimed_by) + # Find highest-priority unclaimed story whose dependencies are all satisfied local story_id story_id=$(jq -r ' - .userStories - | map(select(.passes == false and (.claimed_by == null or .claimed_by == ""))) + . 
as $prd | + ($prd.userStories | map(select(.passes == true)) | map(.id)) as $passed | + $prd.userStories + | map(select( + .passes == false + and (.claimed_by == null or .claimed_by == "") + and ((.dependsOn // []) | all(. as $dep | $passed | any(. == $dep))) + )) | sort_by(.priority) | first | .id // empty diff --git a/parallel/CLAUDE-parallel.md b/parallel/CLAUDE-parallel.md index 63e02db8..5ab0adf6 100644 --- a/parallel/CLAUDE-parallel.md +++ b/parallel/CLAUDE-parallel.md @@ -6,6 +6,8 @@ You are **{{AGENT_ID}}**, an autonomous coding agent running in parallel with ot Your assigned story is **{{CLAIMED_STORY}}** — it has already been claimed for you in prd.json. Do NOT claim or work on any other story. +> **Note:** Stories may have a `dependsOn` field listing prerequisite story IDs. The harness only assigns stories whose dependencies are already complete. You don't need to check this yourself. + 1. Read the PRD at `prd.json` 2. Read ALL progress files: `progress.txt` and any `progress-*.txt` files (check Codebase Patterns section first) 3. Check you're on the correct branch from PRD `branchName`. If not, check it out or create from main. 
diff --git a/prd.json.example b/prd.json.example index fbc40668..41a0326a 100644 --- a/prd.json.example +++ b/prd.json.example @@ -26,6 +26,7 @@ "Typecheck passes", "Verify in browser using dev-browser skill" ], + "dependsOn": ["US-001"], "priority": 2, "passes": false, "notes": "" @@ -41,6 +42,7 @@ "Typecheck passes", "Verify in browser using dev-browser skill" ], + "dependsOn": ["US-001"], "priority": 3, "passes": false, "notes": "" @@ -56,6 +58,7 @@ "Typecheck passes", "Verify in browser using dev-browser skill" ], + "dependsOn": ["US-002", "US-003"], "priority": 4, "passes": false, "notes": "" diff --git a/skills/ralph/SKILL.md b/skills/ralph/SKILL.md index e402ab8d..5f90c9d9 100644 --- a/skills/ralph/SKILL.md +++ b/skills/ralph/SKILL.md @@ -33,6 +33,7 @@ Take a PRD (markdown file or text) and convert it to `prd.json` in your ralph di "Criterion 2", "Typecheck passes" ], + "dependsOn": [], "priority": 1, "passes": false, "notes": "" @@ -68,11 +69,15 @@ Ralph spawns a fresh Amp instance per iteration with no memory of previous work. Stories execute in priority order. Earlier stories must not depend on later ones. +For explicit dependency control, use the `dependsOn` field — an array of story IDs that must have `passes: true` before this story can be claimed. This complements priority ordering by enforcing hard prerequisites, which is especially useful for parallel agents where multiple stories run concurrently. + +**If a story requires another story to be completed first, list the prerequisite IDs in `dependsOn`.** Stories without dependencies use an empty array `[]`. + **Correct order:** 1. Schema/database changes (migrations) 2. Server actions / backend logic -3. UI components that use the backend -4. Dashboard/summary views that aggregate data +3. UI components that use the backend — `dependsOn: ["US-001"]` +4. Dashboard/summary views that aggregate data — `dependsOn: ["US-002", "US-003"]` **Wrong order:** 1. 
UI component (depends on schema that does not exist yet) @@ -121,9 +126,10 @@ Frontend stories are NOT complete until visually verified. Ralph will use the de 1. **Each user story becomes one JSON entry** 2. **IDs**: Sequential (US-001, US-002, etc.) 3. **Priority**: Based on dependency order, then document order -4. **All stories**: `passes: false` and empty `notes` -5. **branchName**: Derive from feature name, kebab-case, prefixed with `ralph/` -6. **Always add**: "Typecheck passes" to every story's acceptance criteria +4. **dependsOn**: If a story requires another story first, list prerequisite IDs in `dependsOn`. Use `[]` for stories with no prerequisites. +5. **All stories**: `passes: false` and empty `notes` +6. **branchName**: Derive from feature name, kebab-case, prefixed with `ralph/` +7. **Always add**: "Typecheck passes" to every story's acceptance criteria --- @@ -177,6 +183,7 @@ Add ability to mark tasks with different statuses. "Generate and run migration successfully", "Typecheck passes" ], + "dependsOn": [], "priority": 1, "passes": false, "notes": "" @@ -191,6 +198,7 @@ Add ability to mark tasks with different statuses. "Typecheck passes", "Verify in browser using dev-browser skill" ], + "dependsOn": ["US-001"], "priority": 2, "passes": false, "notes": "" @@ -206,6 +214,7 @@ Add ability to mark tasks with different statuses. "Typecheck passes", "Verify in browser using dev-browser skill" ], + "dependsOn": ["US-001"], "priority": 3, "passes": false, "notes": "" @@ -220,6 +229,7 @@ Add ability to mark tasks with different statuses. 
"Typecheck passes", "Verify in browser using dev-browser skill" ], + "dependsOn": ["US-002", "US-003"], "priority": 4, "passes": false, "notes": "" @@ -252,6 +262,7 @@ Before writing prd.json, verify: - [ ] **Previous run archived** (if prd.json exists with different branchName, archive it first) - [ ] Each story is completable in one iteration (small enough) - [ ] Stories are ordered by dependency (schema to backend to UI) +- [ ] Stories with prerequisites have correct `dependsOn` arrays - [ ] Every story has "Typecheck passes" as criterion - [ ] UI stories have "Verify in browser using dev-browser skill" as criterion - [ ] Acceptance criteria are verifiable (not vague) From fb037c6532b6c64bef0a1dec67ddc52930345777 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Sat, 21 Feb 2026 10:24:27 -0500 Subject: [PATCH 18/21] feat: halt orchestrator on agent auth failure (exit code 2) Agents now exit with code 2 (instead of 1) when auth failures are exhausted. The orchestrator detects this and immediately stops all agents rather than restarting them in an infinite loop, since all agents share the same credentials. Co-Authored-By: Claude Opus 4.6 --- docker/agent-loop.sh | 8 +++++++- parallel/README.md | 12 ++++++++++++ parallel/ralph-parallel.sh | 11 +++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index 92ca976d..d72eb8ee 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -14,6 +14,12 @@ set -euo pipefail # # Auth: Claude credentials are mounted via Docker volume at /home/agent/.claude # +# Exit codes: +# 0 - Clean exit (all stories complete, stop requested, or iteration limit) +# 1 - General failure (missing credentials, prompt error, etc.) +# 2 - Auth failure (expired/invalid credentials after MAX_AUTH_FAILURES retries) +# The orchestrator treats exit 2 as a signal to halt all agents. 
+# AGENT_ID="${AGENT_ID:?AGENT_ID is required}" AGENT_ROLE="${AGENT_ROLE:?AGENT_ROLE is required}" @@ -532,7 +538,7 @@ while true; do release_claim "$CLAIMED_STORY" || true if [ "$AUTH_FAILURES" -ge "$MAX_AUTH_FAILURES" ]; then echo "[$AGENT_ID] Reached max auth failures ($MAX_AUTH_FAILURES). Exiting to avoid infinite loop." - exit 1 + exit 2 fi # Exponential backoff: 60, 120, 240, 480, 480 (capped) BACKOFF=$((60 * (1 << (AUTH_FAILURES - 1)))) diff --git a/parallel/README.md b/parallel/README.md index 5da339ea..1a5ede0e 100644 --- a/parallel/README.md +++ b/parallel/README.md @@ -194,6 +194,18 @@ parallel/ └── logging.sh # Timestamped log helpers ``` +## Exit Codes & Auth Failure Halt + +Agent containers use distinct exit codes so the orchestrator can respond appropriately: + +| Exit Code | Meaning | Orchestrator Action | +|-----------|---------|---------------------| +| 0 | Clean exit (stories complete, stop requested, iteration limit) | No action | +| 2 | Auth failure (credentials expired after 5 retries) | **Halt all agents** | +| Other non-zero | Crash, OOM, or unexpected error | Restart the container | + +Since all agents share the same credential volume, a single auth failure means none can authenticate. When exit code 2 is detected, the orchestrator immediately stops all remaining containers, tears down networks, and exits with a message to refresh credentials. No restart loop occurs. + ## Per-Agent Progress Files Instead of all agents appending to one `progress.txt` (merge conflict risk), each agent writes to `progress-<agent-id>.txt`. The parallel prompt instructs agents to read ALL progress files for context and write only to their own. 
diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index d1d73037..99bc157b 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -560,6 +560,17 @@ while true; do if [ "$EXIT_CODE" = "0" ]; then log_info "Container $name exited cleanly (code 0)." 
+ elif [ "$EXIT_CODE" = "2" ]; then + log_error "Container $name exited due to auth failure (code 2). Credentials may be expired." + log_error "Halting all agents." + echo "auth_failure" > "$PROJECT_DIR/.ralph/stop_requested" + for stop_name in "${CONTAINER_NAMES[@]}"; do + [ "$stop_name" = "$name" ] && continue + stop_agent "$stop_name" 10 + done + teardown_networks + log_error "All agents stopped. Refresh your Claude credentials and re-run." + exit 1 else log_warn "Container $name stopped unexpectedly (exit code: $EXIT_CODE). Restarting..." restart_agent "$name" From 28566bcfd9563dc62a52d7dcb01b5c9da68be618 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Sat, 21 Feb 2026 10:38:03 -0500 Subject: [PATCH 19/21] fix: use extended regex for bare repo branch cleanup on macOS macOS grep doesn't support \| alternation in basic regex, causing the stale branch cleanup to delete main/master from the bare repo. Switch to grep -E with (main|master) pattern. Co-Authored-By: Claude Opus 4.6 --- parallel/ralph-parallel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index 99bc157b..370db890 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -277,7 +277,7 @@ else # Clean stale feature branches from bare repo to prevent agents from seeing old state log_info "Cleaning stale branches from bare repo..." 
git --git-dir="$BARE_REPO" for-each-ref --format='%(refname:short)' refs/heads/ | \ - grep -v '^main$\|^master$' | \ + grep -Ev '^(main|master)$' | \ xargs -I{} git --git-dir="$BARE_REPO" branch -D {} 2>/dev/null || true cd - > /dev/null fi From 54c840244950a0b21df8899b77ab0cb976d3e8be Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Wed, 25 Feb 2026 16:27:03 -0500 Subject: [PATCH 20/21] feat: evolve PRD format toward PRP standard with enriched context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enrich the spec format so agents have the context they need, with acceptance criteria as the primary quality gate. Support both prp.json (new) and prd.json (legacy) filenames across all scripts and prompts. Key additions: - Project-level: constraints, nonGoals, glossary, version, previousVersion - Story-level: verificationCommands, context (relevantFiles, hints, examples) - PRP versioning system for incremental feature evolution - Dual filename detection (prp.json preferred, prd.json fallback) - Versioned archiving (archive/{feature}/v{N}.prp.json) All new fields are optional — existing prd.json files work unchanged. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 40 +++++++-- docker/agent-loop.sh | 89 +++++++++++++------ parallel/CLAUDE-parallel.md | 40 +++++++-- parallel/CLAUDE-verifier.md | 41 ++++++--- parallel/ralph-parallel.sh | 69 +++++++++----- parallel/status.sh | 25 ++++-- prp.json.example | 152 +++++++++++++++++++++++++++++++ ralph.sh | 37 ++++---- skills/prd/SKILL.md | 92 ++++++++++++++++--- skills/ralph/SKILL.md | 173 +++++++++++++++++++++++++++++++----- 10 files changed, 624 insertions(+), 134 deletions(-) create mode 100644 prp.json.example diff --git a/CLAUDE.md b/CLAUDE.md index 121fb5ef..a728728d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,16 +4,36 @@ You are an autonomous coding agent working on a software project. ## Your Task -1. Read the PRD at `prd.json` (in the same directory as this file) +1. 
Read the spec file: check for `prp.json` first, fall back to `prd.json` (in the same directory as this file) 2. Read the progress log at `progress.txt` (check Codebase Patterns section first) -3. Check you're on the correct branch from PRD `branchName`. If not, check it out or create from main. -4. Pick the **highest priority** user story where `passes: false` and all `dependsOn` story IDs (if any) have `passes: true` -5. Implement that single user story -6. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires) -7. Update CLAUDE.md files if you discover reusable patterns (see below) -8. If checks pass, commit ALL changes with message: `feat: [Story ID] - [Story Title]` -9. Update the PRD to set `passes: true` for the completed story -10. Append your progress to `progress.txt` +3. Check you're on the correct branch from the spec's `branchName`. If not, check it out or create from main. +4. If the spec has a `previousVersion` field (non-null), read the prior version for context on what's already built +5. Pick the **highest priority** user story where `passes: false` and all `dependsOn` story IDs (if any) have `passes: true` +6. Before implementing, read the story's `context` block if present (see PRP Context below) +7. Implement that single user story +8. Run quality checks — if the story has `verificationCommands`, run those; otherwise use project defaults (typecheck, lint, test) +9. Update CLAUDE.md files if you discover reusable patterns (see below) +10. If checks pass, commit ALL changes with message: `feat: [Story ID] - [Story Title]` +11. Update the spec to set `passes: true` for the completed story +12. Append your progress to `progress.txt` + +## PRP Context + +The spec file may contain enriched fields that help you work more effectively. Check for and use these if present: + +### Project-Level Fields +- **`constraints`**: Architectural decisions you MUST follow (e.g., "Use drizzle ORM", "Use server actions for mutations"). 
Treat these as hard requirements — do not deviate. +- **`nonGoals`**: Explicit scope boundaries. Before completing a story, verify your implementation doesn't accidentally build something listed as a non-goal. +- **`glossary`**: Domain term definitions. Use these when you encounter unfamiliar terms in the spec. + +### Story-Level Fields +- **`context.relevantFiles`**: Read these files before starting implementation — they contain the code you'll be modifying or the patterns you should follow. +- **`context.hints`**: Implementation guidance — what to reuse, what approach to take. +- **`context.examples`**: Code snippets showing patterns your new code should match. +- **`verificationCommands`**: Specific shell commands to validate your work. Run these instead of (or in addition to) generic quality checks. + +### Version Context +- **`previousVersion`**: If non-null, points to an archived prior version of this spec. Read it to understand what's already been built — stories with `passes: true` carried over from the previous version are already implemented. 
## Progress Report Format @@ -73,9 +93,11 @@ Only update CLAUDE.md if you have **genuinely reusable knowledge** that would he ## Quality Requirements - ALL commits must pass your project's quality checks (typecheck, lint, test) +- If the story has `verificationCommands`, run those as part of your quality checks - Do NOT commit broken code - Keep changes focused and minimal - Follow existing code patterns +- If `constraints` exist in the spec, verify your implementation follows them ## Browser Testing (If Available) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index d72eb8ee..bc51858d 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -34,6 +34,18 @@ STOP_FILE="/harness-state/stop_requested" LOG_DIR="/agent-logs" ITERATION=0 +# Detect spec file: prefer prp.json, fall back to prd.json +detect_spec_file() { + local dir="${1:-.}" + if [ -f "$dir/prp.json" ]; then + echo "prp.json" + elif [ -f "$dir/prd.json" ]; then + echo "prd.json" + else + echo "prp.json" # default for new projects + fi +} + echo "[$AGENT_ID] Starting agent loop (role=$AGENT_ROLE, model=$CLAUDE_MODEL, max_iterations=$MAX_ITERATIONS)" # --- Step 1: Initialize firewall based on role --- @@ -93,15 +105,19 @@ setup_git_identity() { git config push.autoSetupRemote true } -# --- Step 5: Check out the correct branch from prd.json --- +# --- Step 5: Check out the correct branch from spec file --- checkout_prd_branch() { - if [ ! -f "$WORKSPACE/prd.json" ]; then - echo "[$AGENT_ID] WARNING: No prd.json found in workspace" + local spec_file + spec_file=$(detect_spec_file "$WORKSPACE") + + if [ ! 
-f "$WORKSPACE/$spec_file" ]; then + echo "[$AGENT_ID] WARNING: No prp.json or prd.json found in workspace" return 1 fi + echo "[$AGENT_ID] Using spec file: $spec_file" local branch_name - branch_name=$(jq -r '.branchName // empty' "$WORKSPACE/prd.json" 2>/dev/null || echo "") + branch_name=$(jq -r '.branchName // empty' "$WORKSPACE/$spec_file" 2>/dev/null || echo "") if [ -z "$branch_name" ]; then echo "[$AGENT_ID] No branchName in prd.json, staying on current branch" @@ -131,7 +147,7 @@ checkout_prd_branch() { check_existing_claim() { cd "$WORKSPACE" - # Pull latest prd.json + # Pull latest spec local current_branch current_branch=$(git branch --show-current 2>/dev/null || echo "") # Always fetch first so we discover remote branches created by other agents @@ -144,7 +160,10 @@ check_existing_claim() { } fi - if [ ! -f prd.json ]; then + local spec_file + spec_file=$(detect_spec_file "$WORKSPACE") + + if [ ! -f "$spec_file" ]; then return 1 fi @@ -154,7 +173,7 @@ check_existing_claim() { | map(select(.passes == false and .claimed_by == $agent)) | first | .id // empty - ' prd.json 2>/dev/null || echo "") + ' "$spec_file" 2>/dev/null || echo "") if [ -n "$story_id" ]; then echo "[$AGENT_ID] Already owns incomplete story: $story_id" >&2 @@ -169,7 +188,10 @@ release_claim() { local story_id="$1" cd "$WORKSPACE" - if [ ! -f prd.json ]; then + local spec_file + spec_file=$(detect_spec_file "$WORKSPACE") + + if [ ! -f "$spec_file" ]; then return 1 fi @@ -181,11 +203,11 @@ release_claim() { .claimed_by = null | .claimed_at = null else . 
end ) - ' prd.json > prd.json.tmp && mv prd.json.tmp prd.json + ' "$spec_file" > "${spec_file}.tmp" && mv "${spec_file}.tmp" "$spec_file" - git add prd.json >&2 2>&1 + git add "$spec_file" >&2 2>&1 git commit -m "[$AGENT_ID] Release: $story_id (Claude failure)" >&2 2>&1 || { - git checkout -- prd.json >/dev/null 2>&1 || true + git checkout -- "$spec_file" >/dev/null 2>&1 || true return 1 } git push >&2 2>&1 || { @@ -196,12 +218,12 @@ release_claim() { return 0 } -# --- Step 6: Claim a story in prd.json --- +# --- Step 6: Claim a story in spec file --- # Returns 0 and prints story ID if claimed, returns 1 if no stories available claim_story() { cd "$WORKSPACE" - # Pull latest prd.json (all git output to stderr to keep stdout clean for return value) + # Pull latest spec (all git output to stderr to keep stdout clean for return value) # On a new branch with no remote tracking yet, pull will fail — that's fine, we continue local current_branch current_branch=$(git branch --show-current 2>/dev/null || echo "") @@ -215,8 +237,11 @@ claim_story() { } fi - if [ ! -f prd.json ]; then - echo "[$AGENT_ID] No prd.json found" >&2 + local spec_file + spec_file=$(detect_spec_file "$WORKSPACE") + + if [ ! -f "$spec_file" ]; then + echo "[$AGENT_ID] No prp.json or prd.json found" >&2 return 1 fi @@ -234,7 +259,7 @@ claim_story() { | sort_by(.priority) | first | .id // empty - ' prd.json 2>/dev/null || echo "") + ' "$spec_file" 2>/dev/null || echo "") if [ -z "$story_id" ]; then echo "[$AGENT_ID] No unclaimed stories available" >&2 @@ -251,12 +276,12 @@ claim_story() { .claimed_by = $agent | .claimed_at = $ts else . 
end ) - ' prd.json > prd.json.tmp && mv prd.json.tmp prd.json + ' "$spec_file" > "${spec_file}.tmp" && mv "${spec_file}.tmp" "$spec_file" - git add prd.json >&2 2>&1 + git add "$spec_file" >&2 2>&1 git commit -m "[$AGENT_ID] Claim: $story_id" >&2 2>&1 || { echo "[$AGENT_ID] Failed to commit claim for $story_id" >&2 - git checkout -- prd.json >/dev/null 2>&1 || true + git checkout -- "$spec_file" >/dev/null 2>&1 || true return 1 } @@ -287,7 +312,7 @@ claim_story() { claim_verification() { cd "$WORKSPACE" - # Pull latest prd.json + # Pull latest spec local current_branch current_branch=$(git branch --show-current 2>/dev/null || echo "") # Always fetch first so we discover remote branches created by other agents @@ -300,8 +325,11 @@ claim_verification() { } fi - if [ ! -f prd.json ]; then - echo "[$AGENT_ID] No prd.json found" >&2 + local spec_file + spec_file=$(detect_spec_file "$WORKSPACE") + + if [ ! -f "$spec_file" ]; then + echo "[$AGENT_ID] No prp.json or prd.json found" >&2 return 1 fi @@ -312,7 +340,7 @@ claim_verification() { | map(select(.passes == true and .verified != true and (.verified_by == null or .verified_by == ""))) | first | .id // empty - ' prd.json 2>/dev/null || echo "") + ' "$spec_file" 2>/dev/null || echo "") if [ -z "$story_id" ]; then echo "[$AGENT_ID] No stories ready for verification" >&2 @@ -329,12 +357,12 @@ claim_verification() { .verified_by = $agent | .verified_at = $ts else . 
end ) - ' prd.json > prd.json.tmp && mv prd.json.tmp prd.json + ' "$spec_file" > "${spec_file}.tmp" && mv "${spec_file}.tmp" "$spec_file" - git add prd.json >&2 2>&1 + git add "$spec_file" >&2 2>&1 git commit -m "[$AGENT_ID] Verify claim: $story_id" >&2 2>&1 || { echo "[$AGENT_ID] Failed to commit verification claim for $story_id" >&2 - git checkout -- prd.json >/dev/null 2>&1 || true + git checkout -- "$spec_file" >/dev/null 2>&1 || true return 1 } @@ -362,18 +390,21 @@ claim_verification() { # --- Step 7: Check if all stories are complete --- all_stories_complete() { - if [ ! -f "$WORKSPACE/prd.json" ]; then + local spec_file + spec_file=$(detect_spec_file "$WORKSPACE") + + if [ ! -f "$WORKSPACE/$spec_file" ]; then return 1 fi if [ "$AGENT_ROLE" = "verifier" ]; then # Verifiers check that all stories are both passing AND verified local incomplete - incomplete=$(jq '[.userStories[] | select(.passes == false or .verified != true)] | length' "$WORKSPACE/prd.json" 2>/dev/null || echo "1") + incomplete=$(jq '[.userStories[] | select(.passes == false or .verified != true)] | length' "$WORKSPACE/$spec_file" 2>/dev/null || echo "1") [ "$incomplete" -eq 0 ] else local incomplete - incomplete=$(jq '[.userStories[] | select(.passes == false)] | length' "$WORKSPACE/prd.json" 2>/dev/null || echo "1") + incomplete=$(jq '[.userStories[] | select(.passes == false)] | length' "$WORKSPACE/$spec_file" 2>/dev/null || echo "1") [ "$incomplete" -eq 0 ] fi } diff --git a/parallel/CLAUDE-parallel.md b/parallel/CLAUDE-parallel.md index 5ab0adf6..642177fa 100644 --- a/parallel/CLAUDE-parallel.md +++ b/parallel/CLAUDE-parallel.md @@ -4,23 +4,43 @@ You are **{{AGENT_ID}}**, an autonomous coding agent running in parallel with ot ## Your Task -Your assigned story is **{{CLAIMED_STORY}}** — it has already been claimed for you in prd.json. Do NOT claim or work on any other story. +Your assigned story is **{{CLAIMED_STORY}}** — it has already been claimed for you in the spec file. 
Do NOT claim or work on any other story. > **Note:** Stories may have a `dependsOn` field listing prerequisite story IDs. The harness only assigns stories whose dependencies are already complete. You don't need to check this yourself. -1. Read the PRD at `prd.json` +1. Read the spec file: check for `prp.json` first, fall back to `prd.json` 2. Read ALL progress files: `progress.txt` and any `progress-*.txt` files (check Codebase Patterns section first) -3. Check you're on the correct branch from PRD `branchName`. If not, check it out or create from main. +3. Check you're on the correct branch from the spec's `branchName`. If not, check it out or create from main. 4. `git pull --rebase` to get the latest code -5. Implement **only** story **{{CLAIMED_STORY}}** -6. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires) -7. Update AGENTS.md files if you discover reusable patterns (see below) -8. If checks pass, commit ALL changes with message: `feat: {{CLAIMED_STORY}} - [Story Title]` -9. Update the PRD to set `passes: true` for **{{CLAIMED_STORY}}** only -10. Append your progress to `progress-{{AGENT_ID}}.txt` +5. If the spec has a `previousVersion` field (non-null), read the prior version for context on what's already built +6. Before implementing, read the story's `context` block if present (see PRP Context below) +7. Implement **only** story **{{CLAIMED_STORY}}** +8. Run quality checks — if the story has `verificationCommands`, run those; otherwise use project defaults +9. Update AGENTS.md files if you discover reusable patterns (see below) +10. If checks pass, commit ALL changes with message: `feat: {{CLAIMED_STORY}} - [Story Title]` +11. Update the spec to set `passes: true` for **{{CLAIMED_STORY}}** only +12. Append your progress to `progress-{{AGENT_ID}}.txt` **Important**: Do NOT modify `claimed_by` fields or claim additional stories. The harness script manages claiming. You implement the one story assigned to you. 
+## PRP Context + +The spec file may contain enriched fields that help you work more effectively. Check for and use these if present: + +### Project-Level Fields +- **`constraints`**: Architectural decisions you MUST follow (e.g., "Use drizzle ORM", "Use server actions for mutations"). Treat these as hard requirements — do not deviate. +- **`nonGoals`**: Explicit scope boundaries. Before completing a story, verify your implementation doesn't accidentally build something listed as a non-goal. +- **`glossary`**: Domain term definitions. Use these when you encounter unfamiliar terms in the spec. + +### Story-Level Fields +- **`context.relevantFiles`**: Read these files before starting implementation — they contain the code you'll be modifying or the patterns you should follow. +- **`context.hints`**: Implementation guidance — what to reuse, what approach to take. +- **`context.examples`**: Code snippets showing patterns your new code should match. +- **`verificationCommands`**: Specific shell commands to validate your work. Run these instead of (or in addition to) generic quality checks. + +### Version Context +- **`previousVersion`**: If non-null, points to an archived prior version of this spec. Read it to understand what's already been built — stories with `passes: true` carried over from the previous version are already implemented. 
+ ## Per-Agent Progress Files - **Write** your progress to: `progress-{{AGENT_ID}}.txt` @@ -97,9 +117,11 @@ Before committing, check if any edited files have learnings worth preserving in ## Quality Requirements - ALL commits must pass your project's quality checks (typecheck, lint, test) +- If the story has `verificationCommands`, run those as part of your quality checks - Do NOT commit broken code - Keep changes focused and minimal - Follow existing code patterns +- If `constraints` exist in the spec, verify your implementation follows them ## Browser Testing (If Available) diff --git a/parallel/CLAUDE-verifier.md b/parallel/CLAUDE-verifier.md index b18549b1..7e119f52 100644 --- a/parallel/CLAUDE-verifier.md +++ b/parallel/CLAUDE-verifier.md @@ -6,18 +6,26 @@ You are **{{AGENT_ID}}** (role: verifier), an autonomous verification agent runn Your assigned story is **{{CLAIMED_STORY}}** — it has `passes: true` set by a builder agent. Your job is to **independently verify** that the implementation actually works by running the project's tests and inspecting results against the acceptance criteria. -1. Read the PRD at `prd.json` +1. Read the spec file: check for `prp.json` first, fall back to `prd.json` 2. Find story **{{CLAIMED_STORY}}** and read its acceptance criteria 3. Read ALL progress files: `progress.txt` and any `progress-*.txt` files for context on what was built 4. `git pull --rebase` to get the latest code -5. Auto-detect the test framework and run tests -6. Evaluate results against acceptance criteria -7. Update prd.json based on your findings (see Verification Outcomes below) -8. Commit and push your changes +5. Run verification (see Verification Strategy below) +6. If the spec has `constraints`, verify the implementation follows them +7. If the spec has `nonGoals`, verify the implementation doesn't violate scope boundaries +8. Evaluate results against acceptance criteria +9. 
Update the spec file based on your findings (see Verification Outcomes below) +10. Commit and push your changes -## Auto-Detect Test Framework +## Verification Strategy -Inspect the project root for build/test configuration: +### Story-Level `verificationCommands` (Preferred) + +If the story has a `verificationCommands` array, run those specific commands first. These are the most targeted verification for that story. + +### Auto-Detect Test Framework (Fallback) + +If no `verificationCommands` exist, inspect the project root for build/test configuration: | File | Command | |------|---------| @@ -31,11 +39,22 @@ Inspect the project root for build/test configuration: If multiple are present, prefer the one most relevant to the story's changes. If no test framework is found, note this in `verification_notes` and mark as verified (no tests to fail). +### Constraints Verification + +If the spec has a `constraints` array, check that the implementation follows them. For example: +- If a constraint says "Use drizzle ORM", verify the story doesn't use raw SQL or a different ORM +- If a constraint says "Use server actions for mutations", verify no API routes were added for mutations + +### Non-Goals Verification + +If the spec has a `nonGoals` array, check that the implementation doesn't accidentally build something out of scope. 
For example: +- If a non-goal says "No priority-based notifications", verify no notification code was added + ## Verification Outcomes ### If tests PASS and acceptance criteria are met: -Update prd.json for **{{CLAIMED_STORY}}**: +Update the spec file for **{{CLAIMED_STORY}}**: ```json { "verified": true, @@ -47,7 +66,7 @@ Update prd.json for **{{CLAIMED_STORY}}**: ### If tests FAIL or acceptance criteria are NOT met: -**Bounce the story back to builders** by updating prd.json for **{{CLAIMED_STORY}}**: +**Bounce the story back to builders** by updating the spec file for **{{CLAIMED_STORY}}**: ```json { "passes": false, @@ -65,7 +84,7 @@ This clears the builder's claim so another builder can pick it up and fix the is ## Push Protocol Always follow this sequence: -1. `git add prd.json` +1. `git add prp.json` (or `prd.json` — whichever exists) 2. `git commit -m "[{{AGENT_ID}}] Verify: {{CLAIMED_STORY}} — "` 3. `git pull --rebase origin ` 4. `git push origin ` @@ -73,7 +92,7 @@ Always follow this sequence: ## Critical Rules -- **Do NOT modify source code** — you only modify `prd.json` +- **Do NOT modify source code** — you only modify the spec file - **Do NOT claim additional stories** — the harness assigns stories to you - **One story per iteration** — verify the assigned story and exit - Run the full test suite, not just targeted tests, to catch regressions diff --git a/parallel/ralph-parallel.sh b/parallel/ralph-parallel.sh index 370db890..f8080937 100755 --- a/parallel/ralph-parallel.sh +++ b/parallel/ralph-parallel.sh @@ -3,14 +3,14 @@ set -euo pipefail # # ralph-parallel.sh — Parallel mode orchestrator for Ralph. # -# Launches N containerized Claude Code agents that work on prd.json stories +# Launches N containerized Claude Code agents that work on prp.json/prd.json stories # simultaneously. Each agent runs in a Docker container with network restrictions, # resource limits, and no host access. 
# # Usage: ./parallel/ralph-parallel.sh [options] [max_iterations] # # Options: -# --project DIR Project directory containing prd.json (default: current dir) +# --project DIR Project directory containing prp.json/prd.json (default: current dir) # --image IMAGE Custom Docker image (default: ralph-agent:latest, auto-built) # --agents N Number of builder agents (default: 2) # --researcher N Number of researcher agents with full internet (default: 0) @@ -120,7 +120,7 @@ while [[ $# -gt 0 ]]; do echo "Usage: $0 [options] [max_iterations]" echo "" echo "Options:" - echo " --project DIR Project directory with prd.json (default: current dir)" + echo " --project DIR Project directory with prp.json/prd.json (default: current dir)" echo " --image IMAGE Custom Docker image (default: ralph-agent:latest)" echo " --agents N Number of builder agents (default: 2)" echo " --researcher N Number of researcher agents (default: 0)" @@ -160,16 +160,24 @@ if [ -z "$PROJECT_DIR" ]; then fi # Resolve to absolute path PROJECT_DIR="$(cd "$PROJECT_DIR" && pwd)" -PRD_FILE="$PROJECT_DIR/prd.json" +# Prefer prp.json, fall back to prd.json +if [ -f "$PROJECT_DIR/prp.json" ]; then + SPEC_FILE="$PROJECT_DIR/prp.json" +elif [ -f "$PROJECT_DIR/prd.json" ]; then + SPEC_FILE="$PROJECT_DIR/prd.json" +else + SPEC_FILE="" +fi # CLAUDE-parallel.md lives in the ralph repo, not the project PARALLEL_PROMPT="$SCRIPT_DIR/CLAUDE-parallel.md" VERIFIER_PROMPT="$SCRIPT_DIR/CLAUDE-verifier.md" -if [ ! -f "$PRD_FILE" ]; then - log_error "No prd.json found in $PROJECT_DIR" - log_error "Create a prd.json first (see prd.json.example)." +if [ -z "$SPEC_FILE" ]; then + log_error "No prp.json or prd.json found in $PROJECT_DIR" + log_error "Create a prp.json first (see prp.json.example)." exit 1 fi +SPEC_BASENAME=$(basename "$SPEC_FILE") if [ ! -f "$PARALLEL_PROMPT" ]; then log_error "Missing parallel/CLAUDE-parallel.md prompt file" @@ -187,14 +195,16 @@ if [ ! 
-d "$PROJECT_DIR/.git" ]; then fi # --- Display config --- -PROJECT_NAME=$(jq -r '.project // "unknown"' "$PRD_FILE" 2>/dev/null || echo "unknown") -BRANCH_NAME=$(jq -r '.branchName // empty' "$PRD_FILE" 2>/dev/null || echo "") -TOTAL_STORIES=$(jq '.userStories | length' "$PRD_FILE" 2>/dev/null || echo "?") -DONE_STORIES=$(jq '[.userStories[] | select(.passes == true)] | length' "$PRD_FILE" 2>/dev/null || echo "?") +PROJECT_NAME=$(jq -r '.project // "unknown"' "$SPEC_FILE" 2>/dev/null || echo "unknown") +BRANCH_NAME=$(jq -r '.branchName // empty' "$SPEC_FILE" 2>/dev/null || echo "") +SPEC_VERSION=$(jq -r '.version // 1' "$SPEC_FILE" 2>/dev/null || echo "1") +TOTAL_STORIES=$(jq '.userStories | length' "$SPEC_FILE" 2>/dev/null || echo "?") +DONE_STORIES=$(jq '[.userStories[] | select(.passes == true)] | length' "$SPEC_FILE" 2>/dev/null || echo "?") log_info "Ralph Parallel Mode" log_info "====================" -log_info "Project: $PROJECT_NAME" +log_info "Project: $PROJECT_NAME (v$SPEC_VERSION)" +log_info "Spec: $SPEC_BASENAME" log_info "Branch: ${BRANCH_NAME:-}" log_info "Stories: $DONE_STORIES/$TOTAL_STORIES complete" log_info "Agents: $NUM_BUILDERS builders, $NUM_RESEARCHERS researchers, $NUM_VERIFIERS verifiers ($TOTAL_AGENTS total)" @@ -369,11 +379,22 @@ read_from_bare_repo() { || echo "" } +# Helper: read spec from bare repo (try prp.json first, fall back to prd.json) +read_spec_from_bare_repo() { + local branch="${1:-$BRANCH_NAME}" + local content + content=$(read_from_bare_repo "prp.json" "$branch") + if [ -z "$content" ]; then + content=$(read_from_bare_repo "prd.json" "$branch") + fi + echo "$content" +} + # Helper: check if all stories are complete in the bare repo # When verifiers are in use, requires passes AND verified for all stories. 
check_all_stories_complete() { local prd_content - prd_content=$(read_from_bare_repo "prd.json" "$BRANCH_NAME") + prd_content=$(read_spec_from_bare_repo "$BRANCH_NAME") [ -z "$prd_content" ] && return 1 if [ "$NUM_VERIFIERS" -gt 0 ]; then @@ -390,9 +411,9 @@ check_all_stories_complete() { } recover_stale_claims() { - # Read prd.json from the bare repo (agents push there, not to project dir) + # Read spec from the bare repo (agents push there, not to project dir) local prd_content - prd_content=$(read_from_bare_repo "prd.json" "$BRANCH_NAME") + prd_content=$(read_spec_from_bare_repo "$BRANCH_NAME") [ -z "$prd_content" ] && return local now_epoch @@ -452,8 +473,11 @@ recover_stale_claims() { if [ -n "$BRANCH_NAME" ]; then git checkout "$BRANCH_NAME" 2>/dev/null || true fi - echo "$updated_prd" | jq '.' > prd.json - git add prd.json + # Write to whichever spec file exists in the checkout + local target_spec="prp.json" + [ -f "prd.json" ] && [ ! -f "prp.json" ] && target_spec="prd.json" + echo "$updated_prd" | jq '.' > "$target_spec" + git add "$target_spec" git commit -m "[orchestrator] Clear stale claims" 2>/dev/null || true git push origin 2>/dev/null || true cd - > /dev/null @@ -465,7 +489,7 @@ recover_stale_verification_claims() { [ "$NUM_VERIFIERS" -eq 0 ] && return local prd_content - prd_content=$(read_from_bare_repo "prd.json" "$BRANCH_NAME") + prd_content=$(read_spec_from_bare_repo "$BRANCH_NAME") [ -z "$prd_content" ] && return local now_epoch @@ -523,8 +547,11 @@ recover_stale_verification_claims() { if [ -n "$BRANCH_NAME" ]; then git checkout "$BRANCH_NAME" 2>/dev/null || true fi - echo "$updated_prd" | jq '.' > prd.json - git add prd.json + # Write to whichever spec file exists in the checkout + local target_spec="prp.json" + [ -f "prd.json" ] && [ ! -f "prp.json" ] && target_spec="prd.json" + echo "$updated_prd" | jq '.' 
> "$target_spec" + git add "$target_spec" git commit -m "[orchestrator] Clear stale verification claims" 2>/dev/null || true git push origin 2>/dev/null || true cd - > /dev/null @@ -585,7 +612,7 @@ while true; do # Check if all stories are complete (read from bare repo) if check_all_stories_complete; then - log_info "All PRD stories are complete!" + log_info "All stories are complete!" log_info "Shutting down agents..." for name in "${CONTAINER_NAMES[@]}"; do stop_agent "$name" 15 diff --git a/parallel/status.sh b/parallel/status.sh index 924832b3..13dd4b5f 100755 --- a/parallel/status.sh +++ b/parallel/status.sh @@ -23,7 +23,6 @@ if [ -z "$PROJECT_DIR" ]; then PROJECT_DIR="$(pwd)" fi PROJECT_DIR="$(cd "$PROJECT_DIR" && pwd)" -PRD_FILE="$PROJECT_DIR/prd.json" BARE_REPO="$PROJECT_DIR/.ralph/repo.git" # Helper: read file from bare repo @@ -33,20 +32,36 @@ read_from_bare_repo() { } # Load project info — prefer bare repo (has latest agent pushes), fallback to working dir +# Try prp.json first, then prd.json PROJECT_NAME="unknown" +SPEC_VERSION="1" +SPEC_BASENAME="" PRD_CONTENT="" if [ -d "$BARE_REPO" ]; then - PRD_CONTENT=$(read_from_bare_repo "prd.json") + PRD_CONTENT=$(read_from_bare_repo "prp.json") + [ -n "$PRD_CONTENT" ] && SPEC_BASENAME="prp.json" + if [ -z "$PRD_CONTENT" ]; then + PRD_CONTENT=$(read_from_bare_repo "prd.json") + [ -n "$PRD_CONTENT" ] && SPEC_BASENAME="prd.json" + fi fi -if [ -z "$PRD_CONTENT" ] && [ -f "$PRD_FILE" ]; then - PRD_CONTENT=$(cat "$PRD_FILE") +if [ -z "$PRD_CONTENT" ]; then + if [ -f "$PROJECT_DIR/prp.json" ]; then + PRD_CONTENT=$(cat "$PROJECT_DIR/prp.json") + SPEC_BASENAME="prp.json" + elif [ -f "$PROJECT_DIR/prd.json" ]; then + PRD_CONTENT=$(cat "$PROJECT_DIR/prd.json") + SPEC_BASENAME="prd.json" + fi fi if [ -n "$PRD_CONTENT" ]; then PROJECT_NAME=$(echo "$PRD_CONTENT" | jq -r '.project // "unknown"' 2>/dev/null || echo "unknown") + SPEC_VERSION=$(echo "$PRD_CONTENT" | jq -r '.version // 1' 2>/dev/null || echo "1") fi echo 
"========================================" -echo " Ralph Parallel Status: $PROJECT_NAME" +echo " Ralph Parallel Status: $PROJECT_NAME (v$SPEC_VERSION)" +[ -n "$SPEC_BASENAME" ] && echo " Spec: $SPEC_BASENAME" echo "========================================" echo "" diff --git a/prp.json.example b/prp.json.example new file mode 100644 index 00000000..33d2470b --- /dev/null +++ b/prp.json.example @@ -0,0 +1,152 @@ +{ + "project": "MyApp", + "branchName": "ralph/task-priority", + "description": "Task Priority System - Add priority levels to tasks with visual indicators and filtering", + "version": 1, + "previousVersion": null, + + "constraints": [ + "Use drizzle ORM for all database operations", + "Use server actions in src/actions/ for mutations, not API routes", + "Use shadcn/ui for all new UI components", + "Follow the existing pattern in src/actions/ for new server actions" + ], + + "nonGoals": [ + "No priority-based notifications or reminders", + "No automatic priority assignment based on due date", + "No priority inheritance for subtasks" + ], + + "glossary": { + "priority": "Task urgency level: high, medium, or low", + "badge": "Colored label component showing task metadata" + }, + + "userStories": [ + { + "id": "US-001", + "title": "Add priority field to database", + "description": "As a developer, I need to store task priority so it persists across sessions.", + "acceptanceCriteria": [ + "Add priority column to tasks table: 'high' | 'medium' | 'low' (default 'medium')", + "Generate and run migration successfully", + "Typecheck passes" + ], + "verificationCommands": [ + "npx tsc --noEmit", + "npx drizzle-kit push" + ], + "dependsOn": [], + "priority": 1, + "passes": false, + "notes": "", + "context": { + "relevantFiles": [ + "src/db/schema.ts", + "drizzle.config.ts" + ], + "hints": [ + "Follow existing column patterns in schema.ts", + "Use pgEnum for the priority type" + ], + "examples": [ + "// Existing enum pattern in schema.ts:\nexport const statusEnum = 
pgEnum('status', ['pending', 'in_progress', 'done']);" + ] + } + }, + { + "id": "US-002", + "title": "Display priority indicator on task cards", + "description": "As a user, I want to see task priority at a glance so I know what needs attention first.", + "acceptanceCriteria": [ + "TaskCard component renders a Badge with color prop: red for high, yellow for medium, gray for low", + "Priority badge is visible without hover or click interaction", + "Typecheck passes", + "Verify in browser using dev-browser skill" + ], + "verificationCommands": [ + "npx tsc --noEmit" + ], + "dependsOn": ["US-001"], + "priority": 2, + "passes": false, + "notes": "", + "context": { + "relevantFiles": [ + "src/components/TaskCard.tsx", + "src/components/ui/Badge.tsx" + ], + "hints": [ + "Reuse existing Badge component — it already supports color variants", + "Priority data is already included in the task query response after US-001" + ], + "examples": [ + "// Existing Badge usage in StatusBadge.tsx:\n{task.status}" + ] + } + }, + { + "id": "US-003", + "title": "Add priority selector to task edit", + "description": "As a user, I want to change a task's priority when editing it.", + "acceptanceCriteria": [ + "Priority dropdown in task edit modal with options: High, Medium, Low", + "Shows current priority as selected value", + "Saves immediately on selection change via server action", + "Typecheck passes", + "Verify in browser using dev-browser skill" + ], + "verificationCommands": [ + "npx tsc --noEmit" + ], + "dependsOn": ["US-001"], + "priority": 3, + "passes": false, + "notes": "", + "context": { + "relevantFiles": [ + "src/components/TaskEditModal.tsx", + "src/actions/tasks.ts" + ], + "hints": [ + "Add updatePriority server action following existing patterns in tasks.ts", + "Use shadcn/ui Select component for the dropdown" + ], + "examples": [ + "// Existing server action pattern in tasks.ts:\nexport async function updateStatus(taskId: string, status: Status) {\n return 
db.update(tasks).set({ status }).where(eq(tasks.id, taskId));\n}" + ] + } + }, + { + "id": "US-004", + "title": "Filter tasks by priority", + "description": "As a user, I want to filter the task list to see only high-priority items when I'm focused.", + "acceptanceCriteria": [ + "Filter dropdown with options: All | High | Medium | Low", + "Filter persists in URL search params", + "Empty state message when no tasks match filter", + "Typecheck passes", + "Verify in browser using dev-browser skill" + ], + "verificationCommands": [ + "npx tsc --noEmit" + ], + "dependsOn": ["US-002", "US-003"], + "priority": 4, + "passes": false, + "notes": "", + "context": { + "relevantFiles": [ + "src/components/TaskList.tsx", + "src/components/FilterBar.tsx" + ], + "hints": [ + "Follow the existing status filter pattern in FilterBar.tsx", + "Use useSearchParams for URL param persistence" + ], + "examples": [] + } + } + ] +} diff --git a/ralph.sh b/ralph.sh index baff052a..af449202 100755 --- a/ralph.sh +++ b/ralph.sh @@ -34,29 +34,36 @@ if [[ "$TOOL" != "amp" && "$TOOL" != "claude" ]]; then exit 1 fi SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PRD_FILE="$SCRIPT_DIR/prd.json" +# Prefer prp.json, fall back to prd.json +if [ -f "$SCRIPT_DIR/prp.json" ]; then + SPEC_FILE="$SCRIPT_DIR/prp.json" +elif [ -f "$SCRIPT_DIR/prd.json" ]; then + SPEC_FILE="$SCRIPT_DIR/prd.json" +else + SPEC_FILE="$SCRIPT_DIR/prp.json" # default for new projects +fi PROGRESS_FILE="$SCRIPT_DIR/progress.txt" ARCHIVE_DIR="$SCRIPT_DIR/archive" LAST_BRANCH_FILE="$SCRIPT_DIR/.last-branch" # Archive previous run if branch changed -if [ -f "$PRD_FILE" ] && [ -f "$LAST_BRANCH_FILE" ]; then - CURRENT_BRANCH=$(jq -r '.branchName // empty' "$PRD_FILE" 2>/dev/null || echo "") +if [ -f "$SPEC_FILE" ] && [ -f "$LAST_BRANCH_FILE" ]; then + CURRENT_BRANCH=$(jq -r '.branchName // empty' "$SPEC_FILE" 2>/dev/null || echo "") LAST_BRANCH=$(cat "$LAST_BRANCH_FILE" 2>/dev/null || echo "") - + if [ -n "$CURRENT_BRANCH" ] 
&& [ -n "$LAST_BRANCH" ] && [ "$CURRENT_BRANCH" != "$LAST_BRANCH" ]; then - # Archive the previous run - DATE=$(date +%Y-%m-%d) + # Archive the previous run — use versioned naming + SPEC_VERSION=$(jq -r '.version // 1' "$SPEC_FILE" 2>/dev/null || echo "1") # Strip "ralph/" prefix from branch name for folder FOLDER_NAME=$(echo "$LAST_BRANCH" | sed 's|^ralph/||') - ARCHIVE_FOLDER="$ARCHIVE_DIR/$DATE-$FOLDER_NAME" - - echo "Archiving previous run: $LAST_BRANCH" + ARCHIVE_FOLDER="$ARCHIVE_DIR/$FOLDER_NAME" + + echo "Archiving previous run: $LAST_BRANCH (v$SPEC_VERSION)" mkdir -p "$ARCHIVE_FOLDER" - [ -f "$PRD_FILE" ] && cp "$PRD_FILE" "$ARCHIVE_FOLDER/" - [ -f "$PROGRESS_FILE" ] && cp "$PROGRESS_FILE" "$ARCHIVE_FOLDER/" - echo " Archived to: $ARCHIVE_FOLDER" - + [ -f "$SPEC_FILE" ] && cp "$SPEC_FILE" "$ARCHIVE_FOLDER/v${SPEC_VERSION}.prp.json" + [ -f "$PROGRESS_FILE" ] && cp "$PROGRESS_FILE" "$ARCHIVE_FOLDER/v${SPEC_VERSION}.progress.txt" + echo " Archived to: $ARCHIVE_FOLDER/v${SPEC_VERSION}.*" + # Reset progress file for new run echo "# Ralph Progress Log" > "$PROGRESS_FILE" echo "Started: $(date)" >> "$PROGRESS_FILE" @@ -65,8 +72,8 @@ if [ -f "$PRD_FILE" ] && [ -f "$LAST_BRANCH_FILE" ]; then fi # Track current branch -if [ -f "$PRD_FILE" ]; then - CURRENT_BRANCH=$(jq -r '.branchName // empty' "$PRD_FILE" 2>/dev/null || echo "") +if [ -f "$SPEC_FILE" ]; then + CURRENT_BRANCH=$(jq -r '.branchName // empty' "$SPEC_FILE" 2>/dev/null || echo "") if [ -n "$CURRENT_BRANCH" ]; then echo "$CURRENT_BRANCH" > "$LAST_BRANCH_FILE" fi diff --git a/skills/prd/SKILL.md b/skills/prd/SKILL.md index 3c2f7acf..dd4ff437 100644 --- a/skills/prd/SKILL.md +++ b/skills/prd/SKILL.md @@ -71,6 +71,8 @@ Each story needs: - **Title:** Short descriptive name - **Description:** "As a [user], I want [feature] so that [benefit]" - **Acceptance Criteria:** Verifiable checklist of what "done" means +- **Implementation Context** (optional): Relevant files, hints, and code pattern examples +- 
**Verification Commands** (optional): Shell commands to validate the story Each story should be small enough to implement in one focused session. @@ -80,15 +82,38 @@ Each story should be small enough to implement in one focused session. **Description:** As a [user], I want [feature] so that [benefit]. **Acceptance Criteria:** -- [ ] Specific verifiable criterion -- [ ] Another criterion +- [ ] Specific testable assertion (not a description) +- [ ] Another testable assertion - [ ] Typecheck/lint passes - [ ] **[UI stories only]** Verify in browser using dev-browser skill + +**Verification Commands:** (optional) +- `npx tsc --noEmit` +- `npm test -- --grep "priority"` + +**Implementation Context:** (optional) +- **Relevant files:** `src/components/TaskCard.tsx`, `src/components/ui/Badge.tsx` +- **Hints:** Reuse existing Badge component — it already supports color variants +- **Pattern to follow:** + ```tsx + {task.status} + ``` ``` -**Important:** -- Acceptance criteria must be verifiable, not vague. "Works correctly" is bad. "Button shows confirmation dialog before deleting" is good. -- **For any story with UI changes:** Always include "Verify in browser using dev-browser skill" as acceptance criteria. This ensures visual verification of frontend work. +**Acceptance Criteria Rules:** +- Each criterion must be a **testable assertion**, not a description. An agent must be able to check pass/fail. +- Good: "TaskCard renders a Badge with color prop: red for high, yellow for medium, gray for low" +- Bad: "Priority is displayed nicely" or "Works correctly" +- **For any story with UI changes:** Always include "Verify in browser using dev-browser skill" as acceptance criteria. 
+ +**Verification Commands:** +- Include explicit shell commands that validate the story (e.g., `npx tsc --noEmit`, `npm test -- --grep "feature"`) +- These give agents a concrete way to verify their work beyond manual inspection + +**Implementation Context:** +- **Relevant files**: List specific files the implementor should read or modify +- **Hints**: Brief guidance on approach — what to reuse, what pattern to follow +- **Code examples**: Show existing patterns from the codebase that the new code should match ### 4. Functional Requirements Numbered list of specific functionalities: @@ -98,24 +123,39 @@ Numbered list of specific functionalities: Be explicit and unambiguous. ### 5. Non-Goals (Out of Scope) -What this feature will NOT include. Critical for managing scope. +What this feature will NOT include. **This section is REQUIRED** — explicit scope boundaries prevent agents from gold-plating or building unrequested features. + +### 6. Constraints +Architectural decisions and technology requirements that agents MUST follow. This captures intent, not just current state. + +Examples: +- "Use drizzle ORM for all database operations" +- "Use server actions for mutations, not API routes" +- "All new components must use shadcn/ui" +- "Follow existing naming conventions in src/actions/" -### 6. Design Considerations (Optional) +### 7. Glossary (Optional) +Define domain-specific terms used in the PRD. Helps agents (and junior developers) understand the vocabulary without guessing. + +Format: +- **term**: Definition + +### 8. Design Considerations (Optional) - UI/UX requirements - Link to mockups if available - Relevant existing components to reuse -### 7. Technical Considerations (Optional) +### 9. Technical Considerations (Optional) - Known constraints or dependencies - Integration points with existing systems - Performance requirements -### 8. Success Metrics +### 10. Success Metrics How will success be measured? 
- "Reduce time to complete X by 50%" - "Increase conversion rate by 10%" -### 9. Open Questions +### 11. Open Questions Remaining questions or areas needing clarification. --- @@ -209,6 +249,17 @@ Add priority levels to tasks so users can focus on what matters most. Tasks can - No automatic priority assignment based on due date - No priority inheritance for subtasks +## Constraints + +- Use drizzle ORM for all database operations +- Use server actions in src/actions/ for mutations, not API routes +- Use shadcn/ui for all new UI components + +## Glossary + +- **priority**: Task urgency level — high, medium, or low +- **badge**: Colored label component showing task metadata + ## Technical Considerations - Reuse existing badge component with color variants @@ -229,6 +280,20 @@ Add priority levels to tasks so users can focus on what matters most. Tasks can --- +## Revision Mode + +When updating an existing feature (the user references a prior PRD or says "add X to feature Y"): + +1. Read the existing PRD for that feature +2. Note which user stories are unchanged vs new/modified +3. Generate the updated PRD as a **complete spec** (not a diff) +4. Mark unchanged stories clearly so the `/ralph` converter can carry over their status +5. Include a brief "Changes from v{N}" section at the top listing what's new + +This supports PRP versioning — the `/ralph` converter will handle archiving and version numbering. 
+ +--- + ## Checklist Before saving the PRD: @@ -236,6 +301,11 @@ Before saving the PRD: - [ ] Asked clarifying questions with lettered options - [ ] Incorporated user's answers - [ ] User stories are small and specific +- [ ] Acceptance criteria are **testable assertions** (not descriptions) - [ ] Functional requirements are numbered and unambiguous -- [ ] Non-goals section defines clear boundaries +- [ ] Non-goals section defines clear boundaries (**required**) +- [ ] Constraints section captures architectural decisions +- [ ] UI stories have "Verify in browser using dev-browser skill" as criterion +- [ ] Verification commands included where applicable +- [ ] Implementation context included for non-trivial stories - [ ] Saved to `tasks/prd-[feature-name].md` diff --git a/skills/ralph/SKILL.md b/skills/ralph/SKILL.md index 5f90c9d9..d334309d 100644 --- a/skills/ralph/SKILL.md +++ b/skills/ralph/SKILL.md @@ -1,18 +1,20 @@ --- name: ralph -description: "Convert PRDs to prd.json format for the Ralph autonomous agent system. Use when you have an existing PRD and need to convert it to Ralph's JSON format. Triggers on: convert this prd, turn this into ralph format, create prd.json from this, ralph json." +description: "Convert PRDs to prp.json (or prd.json) format for the Ralph autonomous agent system. Use when you have an existing PRD and need to convert it to Ralph's JSON format. Triggers on: convert this prd, turn this into ralph format, create prp.json from this, ralph json." user-invocable: true --- -# Ralph PRD Converter +# Ralph PRP Converter -Converts existing PRDs to the prd.json format that Ralph uses for autonomous execution. +Converts existing PRDs to the `prp.json` format that Ralph uses for autonomous execution. + +> **Filename:** Output to `prp.json` (preferred). Ralph also supports `prd.json` as a legacy fallback — agents and scripts check for `prp.json` first, then `prd.json`. 
--- ## The Job -Take a PRD (markdown file or text) and convert it to `prd.json` in your ralph directory. +Take a PRD (markdown file or text) and convert it to `prp.json` in your ralph directory. --- @@ -23,20 +25,45 @@ Take a PRD (markdown file or text) and convert it to `prd.json` in your ralph di "project": "[Project Name]", "branchName": "ralph/[feature-name-kebab-case]", "description": "[Feature description from PRD title/intro]", + "version": 1, + "previousVersion": null, + + "constraints": [ + "Constraint 1 from PRD", + "Constraint 2 from PRD" + ], + + "nonGoals": [ + "Non-goal 1 from PRD", + "Non-goal 2 from PRD" + ], + + "glossary": { + "term": "Definition from PRD glossary" + }, + "userStories": [ { "id": "US-001", "title": "[Story title]", "description": "As a [user], I want [feature] so that [benefit]", "acceptanceCriteria": [ - "Criterion 1", - "Criterion 2", + "Testable assertion 1", + "Testable assertion 2", "Typecheck passes" ], + "verificationCommands": [ + "npx tsc --noEmit" + ], "dependsOn": [], "priority": 1, "passes": false, - "notes": "" + "notes": "", + "context": { + "relevantFiles": ["src/path/to/file.ts"], + "hints": ["Implementation guidance"], + "examples": ["// Code snippet showing pattern to follow"] + } } ] } @@ -48,7 +75,7 @@ Take a PRD (markdown file or text) and convert it to `prd.json` in your ralph di **Each story must be completable in ONE Ralph iteration (one context window).** -Ralph spawns a fresh Amp instance per iteration with no memory of previous work. If a story is too big, the LLM runs out of context before finishing and produces broken code. +Ralph spawns a fresh agent instance per iteration with no memory of previous work. If a story is too big, the LLM runs out of context before finishing and produces broken code. 
### Right-sized stories: - Add a database column and migration @@ -85,18 +112,19 @@ For explicit dependency control, use the `dependsOn` field — an array of story --- -## Acceptance Criteria: Must Be Verifiable +## Acceptance Criteria: Must Be Testable Assertions -Each criterion must be something Ralph can CHECK, not something vague. +Each criterion must be a **testable assertion** that an agent can check pass/fail, not a description or aspiration. -### Good criteria (verifiable): +### Good criteria (testable assertions): - "Add `status` column to tasks table with default 'pending'" - "Filter dropdown has options: All, Active, Completed" - "Clicking delete shows confirmation dialog" +- "TaskCard renders Badge with color: red for high, yellow for medium" - "Typecheck passes" - "Tests pass" -### Bad criteria (vague): +### Bad criteria (vague or descriptive): - "Works correctly" - "User can do X easily" - "Good UX" @@ -123,13 +151,64 @@ Frontend stories are NOT complete until visually verified. Ralph will use the de ## Conversion Rules +### Project-Level Fields + +1. **project**: Project name from PRD title +2. **branchName**: Derive from feature name, kebab-case, prefixed with `ralph/` +3. **description**: Feature description from PRD intro +4. **version**: Integer starting at 1 (see Version Evolution below) +5. **previousVersion**: `null` for new specs (see Version Evolution below) +6. **constraints**: Extract from Constraints section of the PRD. If no explicit Constraints section, extract architectural decisions from Technical Considerations (e.g., "Use X for Y", "Follow pattern Z"). Array of strings. +7. **nonGoals**: Extract from Non-Goals section. Array of strings. +8. **glossary**: Extract from Glossary section. If no Glossary section but domain terms are used, generate definitions from context. Object with string keys/values. Omit if empty. + +### Story-Level Fields + 1. **Each user story becomes one JSON entry** 2. **IDs**: Sequential (US-001, US-002, etc.) 
3. **Priority**: Based on dependency order, then document order 4. **dependsOn**: If a story requires another story first, list prerequisite IDs in `dependsOn`. Use `[]` for stories with no prerequisites. 5. **All stories**: `passes: false` and empty `notes` -6. **branchName**: Derive from feature name, kebab-case, prefixed with `ralph/` -7. **Always add**: "Typecheck passes" to every story's acceptance criteria +6. **Always add**: "Typecheck passes" to every story's acceptance criteria +7. **verificationCommands**: Extract from acceptance criteria that reference commands. Always include at least the project's typecheck command (e.g., `npx tsc --noEmit`). If the PRD mentions specific test commands, include those. Array of strings (contains just the typecheck command when the PRD names no others). +8. **context.relevantFiles**: Extract file references from the PRD's Design/Technical Considerations sections, or from story descriptions mentioning specific files. Array of strings. +9. **context.hints**: Extract implementation notes, approach guidance, or "reuse X" suggestions from the PRD. Array of strings. +10. **context.examples**: Extract code snippets from the PRD that show patterns to follow. Array of strings (each string is a code snippet). +11. **context**: Omit the entire context object if all three sub-fields would be empty. + +--- + +## Version Evolution + +### New Feature (version 1) + +For a brand new feature with no existing spec: +- Set `version: 1` +- Set `previousVersion: null` +- Standard conversion — all stories `passes: false` + +### Feature Revision (version 2+) + +When converting a PRD that revises an existing feature: + +1. **Check for existing spec**: Look for `prp.json` (or `prd.json`) in the ralph directory +2. **Compare branch names**: If the existing spec has the same `branchName` (same feature), this is a revision +3.
**Archive the current spec**: + - Read the `version` from the existing spec (default 1 if absent) + - Create archive: `archive/{feature-name}/v{N}.prp.json` + - Archive progress file: `archive/{feature-name}/v{N}.progress.txt` +4. **Write the new spec**: + - Set `version` to previous version + 1 + - Set `previousVersion` to the archive path (e.g., `"archive/task-priority/v1.prp.json"`) +5. **Diff stories** between old and new: + - Story ID exists in new but not old → new story, `passes: false` + - Story ID exists in both, `acceptanceCriteria` unchanged → carry over `passes: true` from old + - Story ID exists in both, `acceptanceCriteria` changed → modified, `passes: false` + - Story ID exists in old but not new → removed (not in new spec, no action needed) + +### Different Feature + +If the existing spec has a **different** `branchName`, this is a new feature — archive the old spec using the standard archive flow (date-based), then write a fresh v1 spec. --- @@ -160,6 +239,14 @@ Each is one focused change that can be completed and verified independently. Add ability to mark tasks with different statuses. +## Constraints +- Use drizzle ORM for database changes +- Use shadcn/ui for all new components + +## Non-Goals +- No status-based notifications +- No automatic status transitions + ## Requirements - Toggle between pending/in-progress/done on task list - Filter list by status @@ -167,12 +254,22 @@ Add ability to mark tasks with different statuses. 
- Persist status in database ``` -**Output prd.json:** +**Output prp.json:** ```json { "project": "TaskApp", "branchName": "ralph/task-status", "description": "Task Status Feature - Track task progress with status indicators", + "version": 1, + "previousVersion": null, + "constraints": [ + "Use drizzle ORM for database changes", + "Use shadcn/ui for all new components" + ], + "nonGoals": [ + "No status-based notifications", + "No automatic status transitions" + ], "userStories": [ { "id": "US-001", @@ -183,10 +280,22 @@ Add ability to mark tasks with different statuses. "Generate and run migration successfully", "Typecheck passes" ], + "verificationCommands": [ + "npx tsc --noEmit" + ], "dependsOn": [], "priority": 1, "passes": false, - "notes": "" + "notes": "", + "context": { + "relevantFiles": [ + "src/db/schema.ts" + ], + "hints": [ + "Follow existing column patterns in schema.ts" + ], + "examples": [] + } }, { "id": "US-002", @@ -242,28 +351,44 @@ Add ability to mark tasks with different statuses. ## Archiving Previous Runs -**Before writing a new prd.json, check if there is an existing one from a different feature:** +**Before writing a new prp.json, check if there is an existing one from a different feature:** -1. Read the current `prd.json` if it exists +1. Read the current `prp.json` (or `prd.json`) if it exists 2. Check if `branchName` differs from the new feature's branch name -3. If different AND `progress.txt` has content beyond the header: +3. If **same feature** (same branchName): This is a version evolution — see Version Evolution above +4. 
If **different feature** AND `progress.txt` has content beyond the header: - Create archive folder: `archive/YYYY-MM-DD-feature-name/` - - Copy current `prd.json` and `progress.txt` to archive + - Copy current spec and `progress.txt` to archive - Reset `progress.txt` with fresh header -**The ralph.sh script handles this automatically** when you run it, but if you are manually updating prd.json between runs, archive first. +**The ralph.sh script handles this automatically** when you run it, but if you are manually updating the spec between runs, archive first. + +--- + +## Backward Compatibility + +All new fields are **optional**. The contract: +- If `prp.json` exists, use it. If not, fall back to `prd.json`. Both formats are identical. +- If new fields (`constraints`, `nonGoals`, `glossary`, `version`, `context`, `verificationCommands`) exist, agents use them. If not, current behavior is unchanged. +- The jq story-selection queries are UNCHANGED — they only touch `passes`, `claimed_by`, `dependsOn`, `priority`. +- Existing `prd.json` files from previous runs continue to work. +- `version` defaults to 1 if absent. `previousVersion` defaults to null. 
--- ## Checklist Before Saving -Before writing prd.json, verify: +Before writing prp.json, verify: -- [ ] **Previous run archived** (if prd.json exists with different branchName, archive it first) +- [ ] **Previous run archived** (if spec exists with different branchName, archive it first) +- [ ] **Version evolution handled** (if same branchName, archive and increment version) - [ ] Each story is completable in one iteration (small enough) - [ ] Stories are ordered by dependency (schema to backend to UI) - [ ] Stories with prerequisites have correct `dependsOn` arrays - [ ] Every story has "Typecheck passes" as criterion - [ ] UI stories have "Verify in browser using dev-browser skill" as criterion -- [ ] Acceptance criteria are verifiable (not vague) +- [ ] Acceptance criteria are **testable assertions** (not vague) - [ ] No story depends on a later story +- [ ] `constraints` captures architectural decisions from the PRD +- [ ] `nonGoals` captures scope boundaries from the PRD +- [ ] `context` blocks included for stories with relevant files/hints in the PRD From cc4c98b7baa45ac71b5ebe9138ac1567f541a637 Mon Sep 17 00:00:00 2001 From: Matt Gibbs Date: Thu, 26 Feb 2026 08:50:16 -0500 Subject: [PATCH 21/21] feat: add multi-PRP mode for parallel independent feature branches Add launch-multi-prp.sh orchestrator that runs 1 agent per PRP file, each on its own feature branch with no story contention. RALPH_BRANCH env var in agent-loop.sh bypasses spec-file branch discovery. Tested with 6 simultaneous agents against blackduck-polaris-mcp (PRs #15-#20). 
Co-Authored-By: Claude Opus 4.6 --- AGENTS.md | 18 ++ README.md | 52 ++++ docker/agent-loop.sh | 21 ++ parallel/README.md | 91 +++++- parallel/launch-multi-prp.sh | 546 +++++++++++++++++++++++++++++++++ parallel/lib/docker-helpers.sh | 2 + 6 files changed, 728 insertions(+), 2 deletions(-) create mode 100755 parallel/launch-multi-prp.sh diff --git a/AGENTS.md b/AGENTS.md index 0f00dcea..0bd31b29 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -56,3 +56,21 @@ Ralph supports running multiple agents in parallel via Docker containers. See `p - Each agent writes to its own `progress-<agent-id>.txt` to avoid merge conflicts - Builder agents have restricted network access (Claude API + npm only) - Researcher agents have full internet access + +### Multi-PRP Mode + +`launch-multi-prp.sh` runs independent feature branches simultaneously (1 agent per PRP): + +- Each PRP file specifies its own `branchName`; the orchestrator pre-creates branches +- `RALPH_BRANCH` env var tells each agent which branch to check out +- No story contention — each agent has its own story pool +- On completion, branches are fetched back and `gh pr create` commands are printed +- Use `--prp FILE` (repeatable) to specify which PRPs to run + +### Lessons Learned + +- **macOS bash 3.x**: Don't use `declare -A` (associative arrays). Use indexed parallel arrays. +- **Agent containers are independent**: The monitor loop crashing doesn't affect running agents. +- **Foundation stories take longer**: Types + API client stories are slower than incremental tool stories. +- **Token expiry planning**: All agents share one OAuth token. If it expires mid-run, agents that haven't finished enter a retry loop. Plan token validity for the full session. +- **PRP independence**: When two PRPs need the same helper, inline it in both as separate stories with "skip if already exists" guidance.
diff --git a/README.md b/README.md index ccf2864a..b8b95447 100644 --- a/README.md +++ b/README.md @@ -286,6 +286,58 @@ Priority order (first wins): See [parallel/README.md](parallel/README.md) for full documentation. +## Multi-PRP Mode (Docker) + +Multi-PRP mode extends parallel mode for running **independent feature branches simultaneously**. Instead of N agents competing for stories on one PRD, each PRP file gets its own feature branch and dedicated agent. This is ideal for batching multiple independent features in a single launch. + +### How It Works + +1. You provide multiple PRP JSON files, each with its own `branchName` +2. The orchestrator pre-creates a feature branch per PRP in the bare repo +3. One container launches per PRP, targeted to its branch via `RALPH_BRANCH` +4. Agents work independently — no competition, no cross-branch coordination +5. On completion, branches are fetched back and PR creation commands are printed + +### Quick Start (Multi-PRP) + +```bash +# Launch 5 agents, one per PRP file +./parallel/launch-multi-prp.sh \ + --project /path/to/my-repo \ + --prp prps/prp-03.json \ + --prp prps/prp-04.json \ + --prp prps/prp-05.json \ + --prp prps/prp-06.json \ + --prp prps/prp-07.json \ + --model claude-sonnet-4-5-20250929 +``` + +### Options (Multi-PRP) + +```bash +./parallel/launch-multi-prp.sh \ + --project DIR \ # project git repo (required) + --prp FILE \ # PRP JSON file, relative to project (repeatable, required) + --model MODEL \ # Claude model (default: claude-sonnet-4-5-20250929) + --memory SIZE \ # per-container memory (default: 4g) + --cpus N \ # per-container CPUs (default: 2) + --max-iterations N \ # per-agent iteration cap (default: 0 = until done) + --allow-domain DOMAIN # extra firewall whitelist (repeatable) +``` + +### Single-PRP vs Multi-PRP + +| | `ralph-parallel.sh` | `launch-multi-prp.sh` | +|---|---|---| +| Branches | One shared branch | One branch per PRP | +| Agents | N agents compete for stories | 1 agent per PRP, no
competition | +| Use case | Parallelize within a feature | Parallelize across features | +| Story claiming | Git atomic push (contention possible) | No contention (isolated branches) | +| Completion | All stories done → exit | All branches done → exit | +| Output | Stories marked `passes: true` | Branches synced + PR commands printed | + +See [parallel/README.md](parallel/README.md) for full documentation. + ## References - [Geoffrey Huntley's Ralph article](https://ghuntley.com/ralph/) diff --git a/docker/agent-loop.sh b/docker/agent-loop.sh index bc51858d..de86a771 100755 --- a/docker/agent-loop.sh +++ b/docker/agent-loop.sh @@ -25,6 +25,7 @@ AGENT_ID="${AGENT_ID:?AGENT_ID is required}" AGENT_ROLE="${AGENT_ROLE:?AGENT_ROLE is required}" MAX_ITERATIONS="${MAX_ITERATIONS:-0}" CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-5-20250929}" +RALPH_BRANCH="${RALPH_BRANCH:-}" REPO_PATH="/repo.git" WORKSPACE="/workspace" @@ -107,6 +108,26 @@ setup_git_identity() { # --- Step 5: Check out the correct branch from spec file --- checkout_prd_branch() { + # RALPH_BRANCH override: if set, skip spec file discovery and use directly + if [ -n "$RALPH_BRANCH" ]; then + echo "[$AGENT_ID] RALPH_BRANCH override: $RALPH_BRANCH" + local current_branch + current_branch=$(git branch --show-current 2>/dev/null || echo "") + if [ "$current_branch" = "$RALPH_BRANCH" ]; then + echo "[$AGENT_ID] Already on branch: $RALPH_BRANCH" + return 0 + fi + echo "[$AGENT_ID] Checking out branch: $RALPH_BRANCH" + if git show-ref --verify --quiet "refs/heads/$RALPH_BRANCH" 2>/dev/null; then + git checkout "$RALPH_BRANCH" + elif git show-ref --verify --quiet "refs/remotes/origin/$RALPH_BRANCH" 2>/dev/null; then + git checkout -b "$RALPH_BRANCH" "origin/$RALPH_BRANCH" + else + git checkout -b "$RALPH_BRANCH" + fi + return 0 + fi + local spec_file spec_file=$(detect_spec_file "$WORKSPACE") diff --git a/parallel/README.md b/parallel/README.md index 1a5ede0e..9c439b17 100644 --- a/parallel/README.md +++ 
b/parallel/README.md @@ -172,6 +172,92 @@ USER agent 2. `Dockerfile.ralph` in the project directory — auto-built 3. Default `ralph-agent:latest` — base image with Node.js only +## Multi-PRP Mode + +Multi-PRP mode runs independent feature branches simultaneously. Instead of N agents sharing one PRD, each PRP gets its own branch and dedicated agent. Use this when you have multiple independent features to build in parallel. + +### How It Works + +1. **Branch setup**: The orchestrator reads `branchName` from each PRP JSON file, creates a feature branch per PRP in the bare repo, and commits the PRP file to its branch +2. **Targeted launch**: Each container receives a `RALPH_BRANCH` env var telling it which branch to check out, bypassing the normal prp.json branch discovery +3. **Independent execution**: Agents work on their own branch with no cross-branch coordination or story contention +4. **Completion tracking**: The monitor loop checks each branch independently; when all branches report all stories passing, it fetches branches back and prints `gh pr create` commands + +### Quick Start (Multi-PRP) + +```bash +./parallel/launch-multi-prp.sh \ + --project /path/to/repo \ + --prp prps/feature-a.json \ + --prp prps/feature-b.json \ + --prp prps/feature-c.json \ + --model claude-sonnet-4-5-20250929 +``` + +### CLI Options (Multi-PRP) + +``` +./parallel/launch-multi-prp.sh [options] + +Required: + --project DIR Project git repository + --prp FILE PRP JSON file, relative to project dir (repeatable) + +Options: + --model MODEL Claude model (default: claude-sonnet-4-5-20250929) + --memory SIZE Per-container memory limit (default: 4g) + --cpus N Per-container CPU limit (default: 2) + --max-iterations N Per-agent iteration cap (default: 0 = until done) + --allow-domain D Extra domain to whitelist in firewall (repeatable) +``` + +### PRP File Requirements + +Each PRP JSON file must include: +- `branchName`: Feature branch name (e.g., `"ralph/prd-03-summarization"`) +- 
`userStories`: Array of stories with `id`, `passes`, `dependsOn`, etc. +- `project`: Project identifier (used for logging) + +### RALPH_BRANCH Override + +The `RALPH_BRANCH` env var is the key mechanism. When set in a container: +- `agent-loop.sh` skips its normal spec-file-based branch discovery +- Goes directly to the specified branch (checkout or create from remote) +- Backward compatible: empty string falls through to existing behavior + +This is passed as the 10th parameter to `launch_agent()` in `docker-helpers.sh`. + +### PRP Independence Pattern + +When multiple PRPs share helper code (e.g., the same utility function), each PRP should include the helper creation as its own story with a note like "skip if file already exists from another branch merge." This makes PRPs fully independent — each agent can complete its work without depending on another branch being merged first. + +### Differences from Single-PRP Mode + +| | `ralph-parallel.sh` | `launch-multi-prp.sh` | +|---|---|---| +| Branches | One shared branch | One branch per PRP | +| Agents | N agents compete for stories | 1 agent per PRP, no competition | +| Use case | Parallelize within a feature | Parallelize across features | +| Completion | All stories in one PRP done | All branches done independently | +| Output | Stories marked `passes: true` | Branches synced + PR commands | +| Agent targeting | Branch from prp.json | `RALPH_BRANCH` env var override | + +### Resource Planning + +Each container uses the configured memory and CPU limits. 
For N PRPs: +- Memory: N × `--memory` (e.g., 6 PRPs × 4GB = 24GB) +- CPUs: N × `--cpus` (e.g., 6 PRPs × 2 = 12 CPUs) +- Token budget: all agents share one OAuth token; plan for the total session duration + +### Monitoring + +The monitor loop runs every 30 seconds and: +- Checks each branch independently for story completion +- Recovers stale claims on incomplete branches +- Detects auth failures (exit code 2) and halts all agents +- Restarts crashed containers for incomplete branches +- Prints PR creation commands when all branches complete + ## File Layout ``` @@ -182,7 +268,8 @@ docker/ └── init-firewall-researcher.sh # No-op (full internet) parallel/ -├── ralph-parallel.sh # Host orchestrator: launch, monitor, restart +├── ralph-parallel.sh # Single-PRP: N agents, one branch +├── launch-multi-prp.sh # Multi-PRP: 1 agent per branch ├── stop.sh # Graceful shutdown ├── status.sh # Container status + story board + logs ├── CLAUDE-parallel.md # Parallel-aware prompt for agents @@ -190,7 +277,7 @@ parallel/ └── lib/ ├── auth.sh # Token retrieval: env > file > 1Password ├── network-setup.sh # Docker network create/teardown - ├── docker-helpers.sh # Container launch/stop/restart + ├── docker-helpers.sh # Container launch/stop/restart (10-param) └── logging.sh # Timestamped log helpers ``` diff --git a/parallel/launch-multi-prp.sh b/parallel/launch-multi-prp.sh new file mode 100755 index 00000000..c60339c8 --- /dev/null +++ b/parallel/launch-multi-prp.sh @@ -0,0 +1,546 @@ +#!/usr/bin/env bash +set -euo pipefail +# +# launch-multi-prp.sh — Launch one containerized agent per PRP file. +# +# Each PRP gets its own feature branch and agent container. Agents work +# independently on their branches. On completion, branches are fetched +# back to the project and gh pr create commands are printed. 
+# +# Usage: +# ./parallel/launch-multi-prp.sh \ +# --project /path/to/repo \ +# --prp prps/prp-03.json \ +# --prp prps/prp-04.json \ +# [--model claude-sonnet-4-5-20250929] \ +# [--memory 4g] [--cpus 2] [--allow-domain example.com] +# + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RALPH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +# Source library scripts +source "$SCRIPT_DIR/lib/logging.sh" +source "$SCRIPT_DIR/lib/auth.sh" +source "$SCRIPT_DIR/lib/network-setup.sh" +source "$SCRIPT_DIR/lib/docker-helpers.sh" + +# --- Defaults --- +CLAUDE_MODEL="claude-sonnet-4-5-20250929" +CONTAINER_MEMORY="4g" +CONTAINER_CPUS="2" +MAX_ITERATIONS=0 +STALE_CLAIM_MINUTES=30 +PROJECT_DIR="" +declare -a PRP_FILES=() +declare -a EXTRA_DOMAINS=() + +# --- Parse arguments --- +while [[ $# -gt 0 ]]; do + case $1 in + --project) + PROJECT_DIR="$2" + shift 2 + ;; + --project=*) + PROJECT_DIR="${1#*=}" + shift + ;; + --prp) + PRP_FILES+=("$2") + shift 2 + ;; + --prp=*) + PRP_FILES+=("${1#*=}") + shift + ;; + --model) + CLAUDE_MODEL="$2" + shift 2 + ;; + --model=*) + CLAUDE_MODEL="${1#*=}" + shift + ;; + --memory) + CONTAINER_MEMORY="$2" + shift 2 + ;; + --memory=*) + CONTAINER_MEMORY="${1#*=}" + shift + ;; + --cpus) + CONTAINER_CPUS="$2" + shift 2 + ;; + --cpus=*) + CONTAINER_CPUS="${1#*=}" + shift + ;; + --max-iterations) + MAX_ITERATIONS="$2" + shift 2 + ;; + --max-iterations=*) + MAX_ITERATIONS="${1#*=}" + shift + ;; + --allow-domain) + EXTRA_DOMAINS+=("$2") + shift 2 + ;; + --allow-domain=*) + EXTRA_DOMAINS+=("${1#*=}") + shift + ;; + -h|--help) + echo "Usage: $0 --project DIR --prp FILE [--prp FILE ...] [options]" + echo "" + echo "Launch one containerized agent per PRP file. Each gets its own branch." 
+ echo "" + echo "Required:" + echo " --project DIR Project git repository" + echo " --prp FILE PRP JSON file (relative to project dir, repeatable)" + echo "" + echo "Options:" + echo " --model MODEL Claude model (default: claude-sonnet-4-5-20250929)" + echo " --memory SIZE Per-container memory (default: 4g)" + echo " --cpus N Per-container CPUs (default: 2)" + echo " --max-iterations N Per-agent iteration cap (default: 0 = until done)" + echo " --allow-domain D Extra domain to whitelist (repeatable)" + exit 0 + ;; + *) + log_error "Unknown option: $1" + exit 1 + ;; + esac +done + +# --- Validate inputs --- +if [ -z "$PROJECT_DIR" ]; then + log_error "Missing --project DIR" + exit 1 +fi +PROJECT_DIR="$(cd "$PROJECT_DIR" && pwd)" + +if [ ${#PRP_FILES[@]} -eq 0 ]; then + log_error "No --prp files specified" + exit 1 +fi + +if [ ! -d "$PROJECT_DIR/.git" ]; then + log_error "$PROJECT_DIR is not a git repository" + exit 1 +fi + +# Resolve extra domains +if [ ${#EXTRA_DOMAINS[@]} -gt 0 ]; then + RALPH_EXTRA_DOMAINS=$(IFS=,; echo "${EXTRA_DOMAINS[*]}") + export RALPH_EXTRA_DOMAINS +fi + +# CLAUDE-parallel.md prompt lives in the ralph repo +PARALLEL_PROMPT="$SCRIPT_DIR/CLAUDE-parallel.md" +if [ ! -f "$PARALLEL_PROMPT" ]; then + log_error "Missing parallel/CLAUDE-parallel.md prompt file" + exit 1 +fi +export PARALLEL_PROMPT + +# --- Validate all PRP files and extract branch names --- +declare -a BRANCH_NAMES=() +declare -a PRP_BASENAMES=() +TOTAL_PRPS=${#PRP_FILES[@]} + +log_info "Ralph Multi-PRP Launcher" +log_info "========================" +log_info "Project: $PROJECT_DIR" +log_info "PRPs: $TOTAL_PRPS" +log_info "Model: $CLAUDE_MODEL" +log_info "Memory: $CONTAINER_MEMORY per container" +log_info "CPUs: $CONTAINER_CPUS per container" +echo "" + +for prp_file in "${PRP_FILES[@]}"; do + local_path="$PROJECT_DIR/$prp_file" + if [ ! 
-f "$local_path" ]; then + log_error "PRP file not found: $local_path" + exit 1 + fi + + branch=$(jq -r '.branchName // empty' "$local_path" 2>/dev/null || echo "") + if [ -z "$branch" ]; then + log_error "PRP file missing branchName: $prp_file" + exit 1 + fi + + project_name=$(jq -r '.project // "unknown"' "$local_path" 2>/dev/null || echo "unknown") + total_stories=$(jq '.userStories | length' "$local_path" 2>/dev/null || echo "?") + + BRANCH_NAMES+=("$branch") + PRP_BASENAMES+=("$(basename "$prp_file")") + + log_info " PRP: $prp_file -> branch: $branch ($total_stories stories)" +done +echo "" + +# --- Step 1: Build or verify Docker image --- +log_info "Checking Docker image..." +if ! docker image inspect "$RALPH_IMAGE" &> /dev/null; then + build_image "$RALPH_ROOT/docker" +else + log_info "Image $RALPH_IMAGE already exists." +fi + +# --- Step 2: Create Docker networks --- +create_networks + +# --- Step 3: Verify Claude auth volume --- +log_info "Checking Claude auth volume..." +if ! check_auth_volume; then + exit 1 +fi +log_info "Claude auth volume verified" + +# --- Step 4: Create/update bare repo and set up feature branches --- +BARE_REPO="$PROJECT_DIR/.ralph/repo.git" +if [ ! -d "$BARE_REPO" ]; then + log_info "Creating bare repo for agent coordination..." + mkdir -p "$PROJECT_DIR/.ralph" + git clone --bare --filter=blob:none "file://$PROJECT_DIR" "$BARE_REPO" + log_info "Bare repo created at $BARE_REPO" +else + log_info "Updating bare repo from project..." + cd "$PROJECT_DIR" + git push --force "$BARE_REPO" --all 2>&1 || { + log_error "Failed to sync bare repo from project." 
+ exit 1 + } + cd - > /dev/null +fi + +# Clear stop signal +mkdir -p "$PROJECT_DIR/.ralph" +touch "$PROJECT_DIR/.ralph/stop_requested" +: > "$PROJECT_DIR/.ralph/stop_requested" + +# Create feature branches and commit each PRP to its branch +TEMP_CLONE=$(mktemp -d) +git clone "$BARE_REPO" "$TEMP_CLONE/work" 2>/dev/null +cd "$TEMP_CLONE/work" +git config user.name "ralph-orchestrator" +git config user.email "orchestrator@ralph-agent.local" + +for i in $(seq 0 $((TOTAL_PRPS - 1))); do + branch="${BRANCH_NAMES[$i]}" + prp_file="${PRP_FILES[$i]}" + local_path="$PROJECT_DIR/$prp_file" + prp_dir=$(dirname "$prp_file") + + log_info "Setting up branch: $branch" + + # Create or checkout the branch + if git show-ref --verify --quiet "refs/heads/$branch" 2>/dev/null; then + git checkout "$branch" + elif git show-ref --verify --quiet "refs/remotes/origin/$branch" 2>/dev/null; then + git checkout -b "$branch" "origin/$branch" + else + # Create from main/master + git checkout main 2>/dev/null || git checkout master 2>/dev/null || true + git checkout -b "$branch" + fi + + # Copy PRP file into the branch + mkdir -p "$prp_dir" + cp "$local_path" "$prp_file" + + # Also place as prp.json at root for agent-loop.sh detection + cp "$local_path" "prp.json" + + git add "$prp_file" "prp.json" + git commit -m "[orchestrator] Add PRP: $(basename "$prp_file") on $branch" 2>/dev/null || true + git push origin "$branch" 2>/dev/null || git push --set-upstream origin "$branch" 2>/dev/null + log_info " Branch $branch ready with PRP" +done + +cd - > /dev/null +rm -rf "$TEMP_CLONE" + +# --- Step 5: Load friend identities (optional) --- +FRIENDS_FILE="$PROJECT_DIR/.ralph/friends.json" +declare -a FRIEND_NAMES=() +declare -a FRIEND_EMAILS=() +if [ -f "$FRIENDS_FILE" ]; then + while IFS= read -r name; do + FRIEND_NAMES+=("$name") + done < <(jq -r '.[].name' "$FRIENDS_FILE") + while IFS= read -r email; do + FRIEND_EMAILS+=("$email") + done < <(jq -r '.[].email' "$FRIENDS_FILE") + log_info "Loaded 
${#FRIEND_NAMES[@]} friend identities from friends.json" +fi + +# --- Step 6: Launch one container per PRP --- +declare -a CONTAINER_NAMES=() +declare -a AGENT_IDS=() +mkdir -p "$PROJECT_DIR/agent_logs" + +for i in $(seq 0 $((TOTAL_PRPS - 1))); do + agent_num=$((i + 1)) + agent_id="agent-${agent_num}" + container_name="ralph-${agent_id}" + branch="${BRANCH_NAMES[$i]}" + prp_basename="${PRP_BASENAMES[$i]}" + + # Assign friend identity if available + git_author_name="" + git_author_email="" + if [ ${#FRIEND_NAMES[@]} -gt 0 ] && [ "$i" -lt ${#FRIEND_NAMES[@]} ]; then + git_author_name="${FRIEND_NAMES[$i]}" + git_author_email="${FRIEND_EMAILS[$i]}" + log_info "Agent $agent_id ($prp_basename) will commit as: $git_author_name <$git_author_email>" + fi + + # Stop existing container if present + if docker inspect "$container_name" &> /dev/null; then + log_warn "Container $container_name already exists. Removing." + stop_agent "$container_name" 10 + fi + + log_info "Launching $agent_id for $prp_basename on branch $branch" + + launch_agent \ + "$agent_id" \ + "builder" \ + "$PROJECT_DIR" \ + "$CLAUDE_MODEL" \ + "$MAX_ITERATIONS" \ + "$CONTAINER_MEMORY" \ + "$CONTAINER_CPUS" \ + "$git_author_name" \ + "$git_author_email" \ + "$branch" + + CONTAINER_NAMES+=("$container_name") + AGENT_IDS+=("$agent_id") +done + +log_info "All $TOTAL_PRPS agents launched." +echo "" + +# --- Step 7: Monitor loop --- +MONITOR_INTERVAL=30 +log_info "Entering monitor loop (checking every ${MONITOR_INTERVAL}s)." 
+log_info "Stop: echo stop > $PROJECT_DIR/.ralph/stop_requested" +echo "" + +# Track per-branch completion (use parallel arrays instead of associative array for bash 3.x compat) +declare -a BRANCH_COMPLETE_FLAGS=() +for _i in $(seq 0 $((TOTAL_PRPS - 1))); do + BRANCH_COMPLETE_FLAGS+=("false") +done + +check_branch_complete() { # $1 = branch; succeeds only when <branch>:prp.json in the bare repo has no story with passes == false + local branch="$1" + local prd_content + prd_content=$(git --git-dir="$BARE_REPO" show "${branch}:prp.json" 2>/dev/null || echo "") + [ -z "$prd_content" ] && return 1 + + local incomplete + incomplete=$(echo "$prd_content" | jq '[.userStories[] | select(.passes == false)] | length' 2>/dev/null || echo "1") # a jq failure counts as incomplete + [ "$incomplete" -eq 0 ] +} + +recover_stale_claims_for_branch() { # $1 = branch; removes claimed_by/claimed_at from unfinished stories claimed more than STALE_CLAIM_MINUTES ago whose claiming container is_agent_running reports as not running, then commits and pushes prp.json + local branch="$1" + local prd_content + prd_content=$(git --git-dir="$BARE_REPO" show "${branch}:prp.json" 2>/dev/null || echo "") + [ -z "$prd_content" ] && return + + local now_epoch + now_epoch=$(date +%s) + local stale_seconds=$((STALE_CLAIM_MINUTES * 60)) + + local claims + claims=$(echo "$prd_content" | jq -r ' + .userStories[] + | select(.passes == false and .claimed_by != null and .claimed_by != "") + | "\(.id)|\(.claimed_by)|\(.claimed_at // "")" + ' 2>/dev/null || echo "") + [ -z "$claims" ] && return + + local cleared=false + local updated_prd="$prd_content" + while IFS='|' read -r story_id agent claimed_at; do + [ -z "$story_id" ] && continue + [ -z "$claimed_at" ] && continue + + local claimed_epoch # claimed_at ends in a literal Z (UTC): GNU date -d honors it, BSD date -j needs -u or it reads the fields as local time + claimed_epoch=$(date -d "$claimed_at" +%s 2>/dev/null \ + || date -j -u -f "%Y-%m-%dT%H:%M:%SZ" "$claimed_at" +%s 2>/dev/null \ + || echo "0") + [ "$claimed_epoch" -eq 0 ] && continue + + local age=$((now_epoch - claimed_epoch)) + if [ "$age" -gt "$stale_seconds" ]; then + local container_name="ralph-${agent}" + if ! is_agent_running "$container_name"; then + log_warn "Stale claim on $branch: $story_id by $agent (${age}s). Clearing."
+ updated_prd=$(echo "$updated_prd" | jq --arg sid "$story_id" ' + .userStories |= map( + if .id == $sid then del(.claimed_by) | del(.claimed_at) else . end + ) + ') + cleared=true + fi + fi + done <<< "$claims" + + if $cleared; then + local temp_dir + temp_dir=$(mktemp -d) + git clone "$BARE_REPO" "$temp_dir/work" 2>/dev/null + cd "$temp_dir/work" + git config user.name "ralph-orchestrator" + git config user.email "orchestrator@ralph-agent.local" + git checkout "$branch" 2>/dev/null || true + echo "$updated_prd" | jq '.' > "prp.json" + git add "prp.json" + git commit -m "[orchestrator] Clear stale claims on $branch" 2>/dev/null || true + git push origin "$branch" 2>/dev/null || true # explicit refspec: never push the clone's default branch if the checkout above failed + cd - > /dev/null + rm -rf "$temp_dir" + fi +} + +while true; do + sleep "$MONITOR_INTERVAL" + + # Check stop signal + if [ -s "$PROJECT_DIR/.ralph/stop_requested" ]; then + log_info "Stop requested. Shutting down all agents..." + for name in "${CONTAINER_NAMES[@]}"; do + stop_agent "$name" 30 + done + teardown_networks + log_info "All agents stopped." + exit 0 + fi + + # Per-branch status check and stale claim recovery + ALL_BRANCHES_DONE=true + for i in $(seq 0 $((TOTAL_PRPS - 1))); do + branch="${BRANCH_NAMES[$i]}" + if [ "${BRANCH_COMPLETE_FLAGS[$i]}" = "true" ]; then + continue + fi + if check_branch_complete "$branch"; then + log_info "Branch $branch: ALL STORIES COMPLETE" + BRANCH_COMPLETE_FLAGS[$i]="true" + else + ALL_BRANCHES_DONE=false + recover_stale_claims_for_branch "$branch" + fi + done + + # Check container health + ALL_STOPPED=true + for i in $(seq 0 $((TOTAL_PRPS - 1))); do + name="${CONTAINER_NAMES[$i]}" + branch="${BRANCH_NAMES[$i]}" + + if is_agent_running "$name"; then + ALL_STOPPED=false + continue + fi + + EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' "$name" 2>/dev/null || echo "unknown") + + if [ "$EXIT_CODE" = "0" ]; then + log_info "Container $name exited cleanly (branch: $branch)."
+ elif [ "$EXIT_CODE" = "2" ]; then + log_error "Container $name: AUTH FAILURE (code 2). Halting all agents." + echo "auth_failure" > "$PROJECT_DIR/.ralph/stop_requested" + for stop_name in "${CONTAINER_NAMES[@]}"; do + [ "$stop_name" = "$name" ] && continue + stop_agent "$stop_name" 10 + done + teardown_networks + log_error "All agents stopped. Refresh credentials and re-run." + exit 1 + else + # Only restart if the branch isn't complete + if [ "${BRANCH_COMPLETE_FLAGS[$i]}" != "true" ]; then + log_warn "Container $name stopped (exit $EXIT_CODE). Restarting..." + restart_agent "$name" + if is_agent_running "$name"; then + ALL_STOPPED=false + else + log_error "Failed to restart $name" + fi + fi + fi + done + + # All branches complete? + if $ALL_BRANCHES_DONE; then + log_info "ALL BRANCHES COMPLETE!" + log_info "Shutting down remaining agents..." + for name in "${CONTAINER_NAMES[@]}"; do + stop_agent "$name" 15 + done + + # Fetch all branches back to project + log_info "Syncing branches back to project..." + cd "$PROJECT_DIR" + for branch in "${BRANCH_NAMES[@]}"; do + git fetch "$BARE_REPO" "$branch:$branch" 2>/dev/null || { + log_warn "Could not fetch branch $branch" + } + done + cd - > /dev/null + + teardown_networks + + echo "" + log_info "Done! Create PRs with:" + echo "" + for i in $(seq 0 $((TOTAL_PRPS - 1))); do + branch="${BRANCH_NAMES[$i]}" + prp_file="${PRP_FILES[$i]}" + desc=$(jq -r '.description // ""' "$PROJECT_DIR/$prp_file" 2>/dev/null || echo "") + echo " git push origin $branch && gh pr create --base main --head $branch --title \"${desc:0:70}\" --fill" + done + echo "" + exit 0 + fi + + if $ALL_STOPPED; then + log_info "All containers exited. Fetching branches..." 
+ cd "$PROJECT_DIR" + for branch in "${BRANCH_NAMES[@]}"; do + git fetch "$BARE_REPO" "$branch:$branch" 2>/dev/null || true + done + cd - > /dev/null + + for name in "${CONTAINER_NAMES[@]}"; do + docker rm "$name" 2>/dev/null || true + done + teardown_networks + + echo "" + log_info "Status per branch:" + for i in $(seq 0 $((TOTAL_PRPS - 1))); do + branch="${BRANCH_NAMES[$i]}" + if [ "${BRANCH_COMPLETE_FLAGS[$i]}" = "true" ]; then + echo " $branch: COMPLETE" + else + echo " $branch: INCOMPLETE" + fi + done + echo "" + exit 0 + fi +done diff --git a/parallel/lib/docker-helpers.sh b/parallel/lib/docker-helpers.sh index f37c1737..bd3a4df0 100644 --- a/parallel/lib/docker-helpers.sh +++ b/parallel/lib/docker-helpers.sh @@ -55,6 +55,7 @@ launch_agent() { local container_cpus="${7:-2}" local git_author_name="${8:-}" local git_author_email="${9:-}" + local ralph_branch="${10:-}" # Determine network based on role (verifiers use builder network — no internet needed) local network @@ -100,6 +101,7 @@ launch_agent() { -e "GIT_AUTHOR_NAME_OVERRIDE=${git_author_name}" \ -e "GIT_AUTHOR_EMAIL_OVERRIDE=${git_author_email}" \ -e "RALPH_EXTRA_DOMAINS=${RALPH_EXTRA_DOMAINS:-}" \ + -e "RALPH_BRANCH=${ralph_branch}" \ -v "$CLAUDE_AUTH_VOLUME:/claude-auth:ro" \ -v "$project_dir_abs/.ralph/repo.git:/repo.git:rw" \ -v "$prompt_path:/parallel-prompt/CLAUDE-parallel.md:ro" \