From 6f6b571b95f3ec2bd660f67797404da24895096c Mon Sep 17 00:00:00 2001 From: Claudio <257189482+claudio-pi@users.noreply.github.com> Date: Tue, 10 Feb 2026 09:01:02 -0600 Subject: [PATCH 1/3] Fix silent backup failures: retry alerts, detect unmounted drives - _send_alert() now retries 3 times with exponential backoff and logs failures instead of silently discarding them (was `> /dev/null || true`) - backup_run() validates that /mnt/* and /media/* destinations are actually mounted before attempting rsync - _check_backup_freshness() detects unmounted backup destinations and triggers a specific "drive not mounted" alert --- lib/backup.sh | 10 ++++++++ lib/health-check.sh | 59 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/lib/backup.sh b/lib/backup.sh index b8d0973..cdd3a1a 100644 --- a/lib/backup.sh +++ b/lib/backup.sh @@ -23,6 +23,16 @@ backup_run() { return 1 fi + # Verify the destination is a mount point when it looks like an external + # drive path (/mnt/*, /media/*). Catches disconnected drives that leave + # an empty mount point directory behind. + if [[ "$dest" == /mnt/* || "$dest" == /media/* ]]; then + if command -v mountpoint >/dev/null 2>&1 && ! mountpoint -q "$dest" 2>/dev/null; then + echo "Error: '$dest' is not a mounted filesystem. Is the drive connected?" >&2 + return 1 + fi + fi + # Resolve to absolute path (important for cron context) dest="$(cd "$dest" && pwd)" diff --git a/lib/health-check.sh b/lib/health-check.sh index 0982d57..a948d03 100755 --- a/lib/health-check.sh +++ b/lib/health-check.sh @@ -68,17 +68,40 @@ if [[ "$(uname)" != "Darwin" ]]; then fi # Send a Telegram alert message (standalone, no dependency on telegram.sh) +# Retries up to 3 times with exponential backoff. Logs failures instead of +# silently swallowing them, so we never lose alerts without knowing. _send_alert() { local message="$1" if [ -z "${TELEGRAM_BOT_TOKEN:-}" ] || [ -z "${TELEGRAM_CHAT_ID:-}" ]; then log_error "health-check" "Cannot send alert: TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID not set" return 1 fi - curl -s --connect-timeout 5 --max-time 10 \ - --config <(printf 'url = "https://api.telegram.org/bot%s/sendMessage"\n' "$TELEGRAM_BOT_TOKEN") \ - -d "chat_id=${TELEGRAM_CHAT_ID}" \ - --data-urlencode "text=${message}" \ - > /dev/null 2>&1 || true + + local attempt=0 + local max_retries=3 + while [ $attempt -le $max_retries ]; do + local response http_code + response=$(curl -s --connect-timeout 5 --max-time 10 -w "\n%{http_code}" \ + --config <(printf 'url = "https://api.telegram.org/bot%s/sendMessage"\n' "$TELEGRAM_BOT_TOKEN") \ + -d "chat_id=${TELEGRAM_CHAT_ID}" \ + --data-urlencode "text=${message}" 2>&1) || true + http_code=$(echo "$response" | tail -n1 | tr -d '\n') + [[ -z "$http_code" ]] && http_code="000" + + if [[ "$http_code" =~ ^2 ]]; then + return 0 + fi + + if [ $attempt -lt $max_retries ]; then + local delay=$(( 2 ** attempt )) # 1, 2, 4 + log_warn "health-check" "Alert send failed (HTTP $http_code), retrying in ${delay}s..." + sleep "$delay" + fi + ((attempt++)) || true + done + + log_error "health-check" "Failed to send alert after $((max_retries + 1)) attempts (HTTP $http_code): ${message:0:100}" + return 1 } # Read current attempt count (0 if file doesn't exist or invalid) @@ -233,8 +256,19 @@ _rotate_logs() { # --- Backup freshness check --- # Checks if the most recent backup is within BACKUP_MAX_AGE seconds. -# Returns 0 if fresh (or no backup dest configured), 1 if stale. +# Returns 0 if fresh (or no backup dest configured), 1 if stale or unmounted. _check_backup_freshness() { + # Fail loudly if the backup destination looks like an external drive + # path but isn't mounted (e.g., SSD disconnected via USB error — + # the dir stays as an empty mount point) + if [[ "$BACKUP_DEST" == /mnt/* || "$BACKUP_DEST" == /media/* ]]; then + if [[ -d "$BACKUP_DEST" ]] && command -v mountpoint >/dev/null 2>&1 \ + && ! mountpoint -q "$BACKUP_DEST" 2>/dev/null; then + log_warn "health-check" "Backup destination $BACKUP_DEST is not mounted" + return 1 + fi + fi + local backup_dir="$BACKUP_DEST/claudio-backups/hourly" [[ -d "$backup_dir" ]] || return 0 # no backups configured yet @@ -304,12 +338,20 @@ if [ "$http_code" = "200" ]; then # Log rotation rotated=$(_rotate_logs) - # Backup freshness + # Backup freshness (also checks mount for /mnt/* and /media/* destinations) if ! _check_backup_freshness; then - alerts="${alerts}Backups are stale. " + if [[ "$BACKUP_DEST" == /mnt/* || "$BACKUP_DEST" == /media/* ]] \ + && [[ -d "$BACKUP_DEST" ]] && command -v mountpoint >/dev/null 2>&1 \ + && ! mountpoint -q "$BACKUP_DEST" 2>/dev/null; then + alerts="${alerts}Backup drive not mounted ($BACKUP_DEST). " + else + alerts="${alerts}Backups are stale. " + fi fi # Send combined alert if anything needs attention + # || true: don't let alert delivery failure abort the health check (set -e) + # _send_alert already logs on failure internally if [[ -n "$alerts" ]]; then _send_alert "⚠️ Health check warnings: ${alerts}" || true fi @@ -390,6 +432,7 @@ elif [ "$http_code" = "000" ]; then if (( fail_count >= MAX_RESTART_ATTEMPTS )); then log_error "health-check" "Max restart attempts reached, sending alert" + # || true: don't abort script; _send_alert logs on failure internally _send_alert "⚠️ Claudio server is down after $MAX_RESTART_ATTEMPTS restart attempts. Please check the server manually." || true fi exit 1 From 113582aa4adbada5e8e922e6a33d1f94d08fde16 Mon Sep 17 00:00:00 2001 From: Claudio <257189482+claudio-pi@users.noreply.github.com> Date: Tue, 10 Feb 2026 09:44:15 -0600 Subject: [PATCH 2/3] Fix mount check for subdirectories and deduplicate unmount detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use findmnt --target instead of mountpoint -q to correctly handle subdirectories under mount points (e.g. /mnt/ssd/backups when /mnt/ssd is the actual mount). Falls back to mountpoint on the root component when findmnt is unavailable. - _check_backup_freshness now returns distinct exit codes (0=fresh, 1=stale, 2=unmounted) so the caller uses the exit code instead of re-running the mount check — eliminates the duplicated logic. --- lib/backup.sh | 23 ++++++++++++++++++----- lib/health-check.sh | 38 +++++++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/lib/backup.sh b/lib/backup.sh index cdd3a1a..8053cca 100644 --- a/lib/backup.sh +++ b/lib/backup.sh @@ -23,12 +23,25 @@ backup_run() { return 1 fi - # Verify the destination is a mount point when it looks like an external - # drive path (/mnt/*, /media/*). Catches disconnected drives that leave - # an empty mount point directory behind. + # Verify the destination sits on a mounted drive when it looks like an + # external drive path (/mnt/*, /media/*). Catches disconnected drives + # that leave an empty mount point directory behind. + # Uses findmnt --target which resolves subdirectories (e.g. /mnt/ssd/backups + # correctly finds /mnt/ssd). Falls back to mountpoint for the root component. if [[ "$dest" == /mnt/* || "$dest" == /media/* ]]; then - if command -v mountpoint >/dev/null 2>&1 && ! mountpoint -q "$dest" 2>/dev/null; then - echo "Error: '$dest' is not a mounted filesystem. Is the drive connected?" >&2 + local _not_mounted=false + if command -v findmnt >/dev/null 2>&1; then + local _mount_target + _mount_target=$(findmnt --target "$dest" -n -o TARGET 2>/dev/null) || _mount_target="" + [[ "$_mount_target" == "/" || -z "$_mount_target" ]] && _not_mounted=true + elif command -v mountpoint >/dev/null 2>&1; then + # Fallback: check the first two components (e.g. /mnt/ssd) + local _mount_root + _mount_root=$(echo "$dest" | cut -d/ -f1-3) + mountpoint -q "$_mount_root" 2>/dev/null || _not_mounted=true + fi + if [[ "$_not_mounted" == true ]]; then + echo "Error: '$dest' is not on a mounted filesystem. Is the drive connected?" >&2 return 1 fi fi diff --git a/lib/health-check.sh b/lib/health-check.sh index a948d03..92db8ee 100755 --- a/lib/health-check.sh +++ b/lib/health-check.sh @@ -256,16 +256,26 @@ _rotate_logs() { # --- Backup freshness check --- # Checks if the most recent backup is within BACKUP_MAX_AGE seconds. -# Returns 0 if fresh (or no backup dest configured), 1 if stale or unmounted. +# Returns 0 if fresh (or no backup dest configured), 1 if stale, 2 if unmounted. _check_backup_freshness() { # Fail loudly if the backup destination looks like an external drive # path but isn't mounted (e.g., SSD disconnected via USB error — - # the dir stays as an empty mount point) - if [[ "$BACKUP_DEST" == /mnt/* || "$BACKUP_DEST" == /media/* ]]; then - if [[ -d "$BACKUP_DEST" ]] && command -v mountpoint >/dev/null 2>&1 \ - && ! mountpoint -q "$BACKUP_DEST" 2>/dev/null; then + # the dir stays as an empty mount point). + # Uses findmnt --target which resolves subdirectories correctly. + if [[ "$BACKUP_DEST" == /mnt/* || "$BACKUP_DEST" == /media/* ]] && [[ -d "$BACKUP_DEST" ]]; then + local _not_mounted=false + if command -v findmnt >/dev/null 2>&1; then + local _mount_target + _mount_target=$(findmnt --target "$BACKUP_DEST" -n -o TARGET 2>/dev/null) || _mount_target="" + [[ "$_mount_target" == "/" || -z "$_mount_target" ]] && _not_mounted=true + elif command -v mountpoint >/dev/null 2>&1; then + local _mount_root + _mount_root=$(echo "$BACKUP_DEST" | cut -d/ -f1-3) + mountpoint -q "$_mount_root" 2>/dev/null || _not_mounted=true + fi + if [[ "$_not_mounted" == true ]]; then log_warn "health-check" "Backup destination $BACKUP_DEST is not mounted" - return 1 + return 2 fi fi @@ -338,15 +348,13 @@ if [ "$http_code" = "200" ]; then # Log rotation rotated=$(_rotate_logs) - # Backup freshness (also checks mount for /mnt/* and /media/* destinations) - if ! _check_backup_freshness; then - if [[ "$BACKUP_DEST" == /mnt/* || "$BACKUP_DEST" == /media/* ]] \ - && [[ -d "$BACKUP_DEST" ]] && command -v mountpoint >/dev/null 2>&1 \ - && ! mountpoint -q "$BACKUP_DEST" 2>/dev/null; then - alerts="${alerts}Backup drive not mounted ($BACKUP_DEST). " - else - alerts="${alerts}Backups are stale. " - fi + # Backup freshness (returns 0=fresh, 1=stale, 2=unmounted) + backup_rc=0 + _check_backup_freshness || backup_rc=$? + if (( backup_rc == 2 )); then + alerts="${alerts}Backup drive not mounted ($BACKUP_DEST). " + elif (( backup_rc == 1 )); then + alerts="${alerts}Backups are stale. " fi # Send combined alert if anything needs attention From 4ab34ab08f63303c5b3b44dbb2d8ffe936226063 Mon Sep 17 00:00:00 2001 From: Claudio <257189482+claudio-pi@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:04:17 -0600 Subject: [PATCH 3/3] Replace _send_alert with telegram_send_message Source telegram.sh in health-check.sh and delegate alert sending to telegram_send_message, which already handles retries via telegram_api, message chunking for >4096 chars, and parse-mode fallback. Removes ~30 lines of duplicated curl/retry logic from _send_alert. --- lib/health-check.sh | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/lib/health-check.sh b/lib/health-check.sh index 92db8ee..1bad6ae 100755 --- a/lib/health-check.sh +++ b/lib/health-check.sh @@ -15,6 +15,8 @@ set -euo pipefail # shellcheck source=lib/log.sh source "$(dirname "${BASH_SOURCE[0]}")/log.sh" +# shellcheck source=lib/telegram.sh +source "$(dirname "${BASH_SOURCE[0]}")/telegram.sh" CLAUDIO_PATH="$HOME/.claudio" CLAUDIO_ENV_FILE="$CLAUDIO_PATH/service.env" @@ -67,41 +69,15 @@ if [[ "$(uname)" != "Darwin" ]]; then export XDG_RUNTIME_DIR="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}" fi -# Send a Telegram alert message (standalone, no dependency on telegram.sh) -# Retries up to 3 times with exponential backoff. Logs failures instead of -# silently swallowing them, so we never lose alerts without knowing. +# Send a Telegram alert message via telegram_send_message (which handles +# retries, chunking, and parse-mode fallback). _send_alert() { local message="$1" if [ -z "${TELEGRAM_BOT_TOKEN:-}" ] || [ -z "${TELEGRAM_CHAT_ID:-}" ]; then log_error "health-check" "Cannot send alert: TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID not set" return 1 fi - - local attempt=0 - local max_retries=3 - while [ $attempt -le $max_retries ]; do - local response http_code - response=$(curl -s --connect-timeout 5 --max-time 10 -w "\n%{http_code}" \ - --config <(printf 'url = "https://api.telegram.org/bot%s/sendMessage"\n' "$TELEGRAM_BOT_TOKEN") \ - -d "chat_id=${TELEGRAM_CHAT_ID}" \ - --data-urlencode "text=${message}" 2>&1) || true - http_code=$(echo "$response" | tail -n1 | tr -d '\n') - [[ -z "$http_code" ]] && http_code="000" - - if [[ "$http_code" =~ ^2 ]]; then - return 0 - fi - - if [ $attempt -lt $max_retries ]; then - local delay=$(( 2 ** attempt )) # 1, 2, 4 - log_warn "health-check" "Alert send failed (HTTP $http_code), retrying in ${delay}s..." - sleep "$delay" - fi - ((attempt++)) || true - done - - log_error "health-check" "Failed to send alert after $((max_retries + 1)) attempts (HTTP $http_code): ${message:0:100}" - return 1 + telegram_send_message "$TELEGRAM_CHAT_ID" "$message" } # Read current attempt count (0 if file doesn't exist or invalid)