From 6b2aebc65d2569c0cf9ea655492b5f7d09e419a3 Mon Sep 17 00:00:00 2001 From: Claudio <257189482+claudio-pi@users.noreply.github.com> Date: Tue, 10 Feb 2026 12:29:40 -0600 Subject: [PATCH 1/3] Remove orphan process detection from health check No longer needed: subagents run as native Claude Task tools (not background processes), CLAUDE_CODE_DISABLE_BACKGROUND_TASKS=1 prevents background spawning, and KillMode=mixed in systemd kills child processes when the service stops. --- CHANGELOG.md | 6 ++++ README.md | 2 +- lib/health-check.sh | 78 ----------------------------------------- tests/health-check.bats | 9 ----- 4 files changed, 7 insertions(+), 88 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fcb48c1..ccfd4f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [Unreleased] + +### Removed + +- Orphan process detection from health check — no longer needed; subagents run as native Task tools with `CLAUDE_CODE_DISABLE_BACKGROUND_TASKS=1` + ## [1.2.2] - 2026-02-08 ### Fixed diff --git a/README.md b/README.md index 13e4ab1..c60cdef 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ The setup wizard will confirm when it receives the message and finish. Once done > A cron job runs every 5 minutes to monitor the webhook endpoint. It verifies the webhook is registered and re-registers it if needed. If the server is unreachable, it auto-restarts the service (throttled to once per 3 minutes, max 3 attempts). After exhausting restart attempts without recovery, it sends a Telegram alert and stops retrying until the server responds with HTTP 200. The restart counter auto-clears when the health endpoint returns HTTP 200. You can also reset it manually by deleting `$HOME/.claudio/.last_restart_attempt` and `$HOME/.claudio/.restart_fail_count`. > -> The health check also monitors: orphan `claude`/`node` processes (kills them after 30 minutes), disk usage (alerts above 90%), log file sizes (rotates files over 10MB), and backup freshness (alerts if the last backup is older than 2 hours). These thresholds are configurable via environment variables. +> The health check also monitors: disk usage (alerts above 90%), log file sizes (rotates files over 10MB), and backup freshness (alerts if the last backup is older than 2 hours). These thresholds are configurable via environment variables. ### Status diff --git a/lib/health-check.sh b/lib/health-check.sh index 1bad6ae..b4d41b4 100755 --- a/lib/health-check.sh +++ b/lib/health-check.sh @@ -6,7 +6,6 @@ # Sends a Telegram alert after 3 restart attempts if the service never recovers # # Additional checks (run when service is healthy): -# - Orphan claude/node processes not belonging to the active service # - Disk usage alerts (configurable threshold, default 90%) # - Log rotation (configurable max size, default 10MB) # - Backup freshness (alerts if last backup is older than threshold) @@ -120,77 +119,6 @@ _clear_fail_state() { rm -f "$RESTART_STAMP" "$FAIL_COUNT_FILE" } -# --- Orphan process detection --- -# Finds claude/node processes that are NOT children of the active claudio service. -# Orphans are logged and killed (SIGTERM). Returns the count found. -_check_orphan_processes() { - local service_pid="" - if [[ "$(uname)" == "Darwin" ]]; then - service_pid=$(launchctl list | awk '/com\.claudio\.server/{print $1}') - else - service_pid=$(systemctl --user show claudio --property=MainPID --value 2>/dev/null || echo "") - fi - # Normalize: "0" or empty means no active service PID - [[ "$service_pid" == "0" || -z "$service_pid" ]] && service_pid="" - - local orphan_count=0 - local pids - # Find claude and node processes owned by this user - pids=$(pgrep -u "$(id -u)" -f '(claude|node)' 2>/dev/null || true) - [[ -z "$pids" ]] && echo 0 && return - - for pid in $pids; do - # Skip our own process tree - [[ "$pid" == "$$" ]] && continue - - # Skip if it's a child of the service main PID - if [[ -n "$service_pid" ]]; then - local ancestor="$pid" - local is_child=false - # Walk up the process tree (max 20 levels to avoid loops) - for (( depth=0; depth<20; depth++ )); do - local ppid - ppid=$(ps -o ppid= -p "$ancestor" 2>/dev/null | tr -d ' ') || break - [[ -z "$ppid" || "$ppid" == "0" || "$ppid" == "1" ]] && break - if [[ "$ppid" == "$service_pid" ]]; then - is_child=true - break - fi - ancestor="$ppid" - done - [[ "$is_child" == true ]] && continue - fi - - # Skip processes started less than 30 minutes ago (could be active handlers) - local elapsed - if [[ "$(uname)" == "Darwin" ]]; then - # macOS: ps -o etime= gives [[dd-]hh:]mm:ss, convert to seconds - local etime - etime=$(ps -o etime= -p "$pid" 2>/dev/null | tr -d ' ') || continue - [[ -z "$etime" ]] && continue - elapsed=0 - local parts - IFS=: read -ra parts <<< "${etime//-/:}" - for part in "${parts[@]}"; do - elapsed=$(( elapsed * 60 + 10#$part )) - done - else - elapsed=$(ps -o etimes= -p "$pid" 2>/dev/null | tr -d ' ') || continue - fi - [[ -z "$elapsed" ]] && continue - (( elapsed < 1800 )) && continue - - # This looks like an orphan - local cmdline - cmdline=$(ps -o args= -p "$pid" 2>/dev/null | head -c 120) || cmdline="unknown" - log_warn "health-check" "Orphan process found: PID=$pid age=${elapsed}s cmd=$cmdline" - kill "$pid" 2>/dev/null || true - orphan_count=$((orphan_count + 1)) - done - - echo "$orphan_count" -} - # --- Disk usage check --- # Checks usage of all mounted partitions relevant to Claudio. # Returns 0 if all OK, 1 if any partition exceeds threshold. @@ -310,12 +238,6 @@ if [ "$http_code" = "200" ]; then # --- Additional system checks (only when service is healthy) --- alerts="" - # Orphan processes - orphans=$(_check_orphan_processes) - if (( orphans > 0 )); then - alerts="${alerts}Killed $orphans orphan process(es). " - fi - # Disk usage if ! _check_disk_usage; then alerts="${alerts}Disk usage above ${DISK_USAGE_THRESHOLD}%. " diff --git a/tests/health-check.bats b/tests/health-check.bats index 449eb3b..a975493 100644 --- a/tests/health-check.bats +++ b/tests/health-check.bats @@ -311,15 +311,6 @@ EOF ! grep -q "Backup stale" "$CLAUDIO_PATH/claudio.log" 2>/dev/null } -@test "orphan check runs without errors when no processes found" { - create_env_file - create_mock_curl_healthy - - run "$BATS_TEST_DIRNAME/../lib/health-check.sh" - - [ "$status" -eq 0 ] - ! grep -q "Orphan process" "$CLAUDIO_PATH/claudio.log" 2>/dev/null -} @test "cron_install adds cron entry" { source "$BATS_TEST_DIRNAME/../lib/service.sh" From 20975774ab643d7b2e93eb320073863f6c9538df Mon Sep 17 00:00:00 2001 From: Claudio <257189482+claudio-pi@users.noreply.github.com> Date: Tue, 10 Feb 2026 12:35:44 -0600 Subject: [PATCH 2/3] Add EXPERIMENTS.md to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3ca0ffc..16e2d54 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ logs/ *.db __pycache__/ .pytest_cache/ +EXPERIMENTS.md From 08163d09c5f1457e18886ed61d929ecee4d10471 Mon Sep 17 00:00:00 2001 From: Claudio <257189482+claudio-pi@users.noreply.github.com> Date: Tue, 10 Feb 2026 12:36:16 -0600 Subject: [PATCH 3/3] Remove CHANGELOG entry from this PR --- CHANGELOG.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ccfd4f3..fcb48c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,5 @@ # Changelog -## [Unreleased] - -### Removed - -- Orphan process detection from health check — no longer needed; subagents run as native Task tools with `CLAUDE_CODE_DISABLE_BACKGROUND_TASKS=1` - ## [1.2.2] - 2026-02-08 ### Fixed