Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ logs/
*.db
__pycache__/
.pytest_cache/
EXPERIMENTS.md
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ The setup wizard will confirm when it receives the message and finish. Once done

> A cron job runs every 5 minutes to monitor the webhook endpoint. It verifies the webhook is registered and re-registers it if needed. If the server is unreachable, it auto-restarts the service (throttled to once per 3 minutes, max 3 attempts). After exhausting restart attempts without recovery, it sends a Telegram alert and stops retrying until the server responds with HTTP 200. The restart counter auto-clears when the health endpoint returns HTTP 200. You can also reset it manually by deleting `$HOME/.claudio/.last_restart_attempt` and `$HOME/.claudio/.restart_fail_count`.
>
> The health check also monitors: orphan `claude`/`node` processes (kills them after 30 minutes), disk usage (alerts above 90%), log file sizes (rotates files over 10MB), and backup freshness (alerts if the last backup is older than 2 hours). These thresholds are configurable via environment variables.
> The health check also monitors: disk usage (alerts above 90%), log file sizes (rotates files over 10MB), and backup freshness (alerts if the last backup is older than 2 hours). These thresholds are configurable via environment variables.

### Status

Expand Down
78 changes: 0 additions & 78 deletions lib/health-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
# Sends a Telegram alert after 3 restart attempts if the service never recovers
#
# Additional checks (run when service is healthy):
# - Orphan claude/node processes not belonging to the active service
# - Disk usage alerts (configurable threshold, default 90%)
# - Log rotation (configurable max size, default 10MB)
# - Backup freshness (alerts if last backup is older than threshold)
Expand Down Expand Up @@ -120,77 +119,6 @@ _clear_fail_state() {
rm -f "$RESTART_STAMP" "$FAIL_COUNT_FILE"
}

# --- Orphan process detection ---
# Finds claude/node processes that are NOT descendants of the active claudio
# service, logs each one via log_warn, and terminates it with SIGTERM.
# Outputs: the number of orphans killed, on stdout.
# Returns: 0 always (kill failures are tolerated).

# Convert a ps(1) etime string ([[dd-]hh:]mm:ss) to whole seconds.
# Arguments: $1 - etime string (may be empty)
# Outputs:   seconds on stdout
# Returns:   1 if the input is empty
_etime_to_seconds() {
  local etime=$1
  [[ -z "$etime" ]] && return 1
  local parts
  IFS=: read -ra parts <<< "${etime//-/:}"
  # 10# forces base 10 so zero-padded fields like "08" don't parse as octal.
  local secs=0
  case ${#parts[@]} in
    4) secs=$(( 10#${parts[0]} * 86400 + 10#${parts[1]} * 3600 + 10#${parts[2]} * 60 + 10#${parts[3]} )) ;;
    3) secs=$(( 10#${parts[0]} * 3600 + 10#${parts[1]} * 60 + 10#${parts[2]} )) ;;
    *) secs=$(( 10#${parts[0]} * 60 + 10#${parts[1]} )) ;;
  esac
  echo "$secs"
}

_check_orphan_processes() {
  # Resolve the main PID of the running claudio service, if any.
  local service_pid=""
  if [[ "$(uname)" == "Darwin" ]]; then
    service_pid=$(launchctl list | awk '/com\.claudio\.server/{print $1}')
  else
    service_pid=$(systemctl --user show claudio --property=MainPID --value 2>/dev/null || echo "")
  fi
  # Normalize: "0", "-" or empty all mean "no active service PID".
  # launchctl prints "-" in the PID column for a loaded-but-not-running job,
  # which previously slipped past this check.
  [[ "$service_pid" == "0" || "$service_pid" == "-" || -z "$service_pid" ]] && service_pid=""

  local orphan_count=0
  local pids
  # Find claude and node processes owned by this user. NB: pgrep -f matches
  # the full command line, so this is deliberately broad (matches "nodejs",
  # paths containing "claude", etc.) — same behavior as before.
  pids=$(pgrep -u "$(id -u)" -f '(claude|node)' 2>/dev/null || true)
  [[ -z "$pids" ]] && echo 0 && return

  for pid in $pids; do
    # Skip our own process AND our parent (cron / invoking shell) — the
    # broad pgrep pattern could match either, and killing them would abort
    # the health check itself.
    [[ "$pid" == "$$" || "$pid" == "$PPID" ]] && continue

    # Skip if it's a descendant of the service main PID.
    if [[ -n "$service_pid" ]]; then
      local ancestor="$pid"
      local is_child=false
      # Walk up the process tree (max 20 levels to avoid loops).
      for (( depth=0; depth<20; depth++ )); do
        local ppid
        ppid=$(ps -o ppid= -p "$ancestor" 2>/dev/null | tr -d ' ') || break
        [[ -z "$ppid" || "$ppid" == "0" || "$ppid" == "1" ]] && break
        if [[ "$ppid" == "$service_pid" ]]; then
          is_child=true
          break
        fi
        ancestor="$ppid"
      done
      [[ "$is_child" == true ]] && continue
    fi

    # Skip processes started less than 30 minutes ago (could be active handlers).
    local elapsed
    if [[ "$(uname)" == "Darwin" ]]; then
      # macOS ps has no etimes keyword; parse the etime form instead.
      local etime
      etime=$(ps -o etime= -p "$pid" 2>/dev/null | tr -d ' ') || continue
      elapsed=$(_etime_to_seconds "$etime") || continue
    else
      elapsed=$(ps -o etimes= -p "$pid" 2>/dev/null | tr -d ' ') || continue
    fi
    [[ -z "$elapsed" ]] && continue
    (( elapsed < 1800 )) && continue

    # This looks like an orphan: log it and send SIGTERM (not KILL — give it
    # a chance to shut down cleanly; the next cron run catches survivors).
    local cmdline
    cmdline=$(ps -o args= -p "$pid" 2>/dev/null | head -c 120) || cmdline="unknown"
    log_warn "health-check" "Orphan process found: PID=$pid age=${elapsed}s cmd=$cmdline"
    kill "$pid" 2>/dev/null || true
    orphan_count=$((orphan_count + 1))
  done

  echo "$orphan_count"
}

# --- Disk usage check ---
# Checks usage of all mounted partitions relevant to Claudio.
# Returns 0 if all OK, 1 if any partition exceeds threshold.
Expand Down Expand Up @@ -310,12 +238,6 @@ if [ "$http_code" = "200" ]; then
# --- Additional system checks (only when service is healthy) ---
alerts=""

# Orphan processes
orphans=$(_check_orphan_processes)
if (( orphans > 0 )); then
alerts="${alerts}Killed $orphans orphan process(es). "
fi

# Disk usage
if ! _check_disk_usage; then
alerts="${alerts}Disk usage above ${DISK_USAGE_THRESHOLD}%. "
Expand Down
9 changes: 0 additions & 9 deletions tests/health-check.bats
Original file line number Diff line number Diff line change
Expand Up @@ -311,15 +311,6 @@ EOF
! grep -q "Backup stale" "$CLAUDIO_PATH/claudio.log" 2>/dev/null
}

# Regression guard for the orphan-process sweep: with a healthy service
# (mocked curl returns 200) and no stray claude/node processes, the
# health-check script must exit 0 and log no "Orphan process" warnings.
@test "orphan check runs without errors when no processes found" {
create_env_file
create_mock_curl_healthy

run "$BATS_TEST_DIRNAME/../lib/health-check.sh"

# bats `run` captures exit code in $status; the sweep must be a no-op.
[ "$status" -eq 0 ]
! grep -q "Orphan process" "$CLAUDIO_PATH/claudio.log" 2>/dev/null
}

@test "cron_install adds cron entry" {
source "$BATS_TEST_DIRNAME/../lib/service.sh"
Expand Down
Loading