From 276a5e3fcb812d9f6ded4bd662794ce7e6a673a1 Mon Sep 17 00:00:00 2001
From: Gemini Agent <gemini@example.com>
Date: Thu, 12 Feb 2026 11:21:03 +0100
Subject: [PATCH] Phase 2: Nightly prod-to-staging data refresh

Add bash script and Terraform resources for automated nightly
production-to-staging Neo4j data refresh via GCP disk snapshot restore.

- sync-prod-to-staging.sh: finds latest prod snapshot, stops staging VM,
  swaps disk (delete old + create from snapshot with same name), starts VM,
  validates Neo4j startup via serial port output
- Cloud Run Job (cloud-sdk:slim) to execute the script
- Cloud Scheduler at 0 3 * * * UTC (1 hour after daily snapshot)
---
 deploy/scripts/sync-prod-to-staging.sh | 206 +++++++++++++++++++++++++
 deploy/terraform/backup.tf             |  60 +++++++
 2 files changed, 266 insertions(+)
 create mode 100755 deploy/scripts/sync-prod-to-staging.sh

diff --git a/deploy/scripts/sync-prod-to-staging.sh b/deploy/scripts/sync-prod-to-staging.sh
new file mode 100755
index 0000000..0e2ca51
--- /dev/null
+++ b/deploy/scripts/sync-prod-to-staging.sh
@@ -0,0 +1,206 @@
+#!/bin/bash
+# =============================================================================
+# Sync Production Neo4j Data to Staging
+# =============================================================================
+# This script refreshes staging Neo4j with production data by:
+#   1. Finding the latest READY snapshot of the production data disk
+#   2. Stopping the staging VM
+#   3. Detaching the old staging data disk
+#   4. Deleting the old staging data disk
+#   5. Creating a new disk from the production snapshot (same name)
+#   6. Attaching the new disk and starting the staging VM
+#   7. Validating the VM is RUNNING and Neo4j starts successfully
+# All operations use gcloud compute API calls (no SSH required).
+# Designed to run as a Cloud Run Job on a nightly schedule.
+#
+# Usage: ./sync-prod-to-staging.sh
+# =============================================================================
+
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+PROJECT="${GCP_PROJECT_ID:-ai-knowledge-base-42}"
+ZONE="${GCP_ZONE:-us-central1-a}"
+PROD_DISK="neo4j-prod-data-disk"
+STAGING_VM="neo4j-staging"
+DISK_NAME="neo4j-staging-data-disk"
+DEVICE_NAME="neo4j-staging-data"
+MAX_BOOT_ATTEMPTS=18
+BOOT_POLL_INTERVAL=10
+
+echo "=========================================="
+echo "Production -> Staging Neo4j Data Refresh"
+echo "=========================================="
+echo "Project:     ${PROJECT}"
+echo "Zone:        ${ZONE}"
+echo "Prod Disk:   ${PROD_DISK}"
+echo "Staging VM:  ${STAGING_VM}"
+echo "Target Disk: ${DISK_NAME}"
+echo "=========================================="
+echo ""
+
+# ---------------------------------------------------------------------------
+# Step 1: Find the latest READY snapshot of the production data disk
+# ---------------------------------------------------------------------------
+echo "[1/7] Finding latest READY snapshot of ${PROD_DISK}..."
+
+SNAPSHOT_NAME=$(gcloud compute snapshots list \
+    --project="${PROJECT}" \
+    --filter="sourceDisk~${PROD_DISK} AND status=READY" \
+    --sort-by="~creationTimestamp" \
+    --limit=1 \
+    --format="value(name)")
+
+if [[ -z "${SNAPSHOT_NAME}" ]]; then
+    echo "ERROR: No READY snapshot found for disk ${PROD_DISK}"
+    echo "Ensure the snapshot schedule policy is active and has run at least once."
+    exit 1
+fi
+
+echo "  Found snapshot: ${SNAPSHOT_NAME}"
+
+# ---------------------------------------------------------------------------
+# Step 2: Stop the staging VM
+# ---------------------------------------------------------------------------
+echo "[2/7] Stopping staging VM ${STAGING_VM}..."
+
+gcloud compute instances stop "${STAGING_VM}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --quiet
+
+echo "  VM stopped."
+
+# ---------------------------------------------------------------------------
+# Step 3: Detach the old data disk (may not exist on first run)
+# ---------------------------------------------------------------------------
+echo "[3/7] Detaching old data disk ${DISK_NAME} from ${STAGING_VM}..."
+
+# Best-effort: `|| true` stops `set -e` from aborting when the detach fails.
+gcloud compute instances detach-disk "${STAGING_VM}" \
+    --disk="${DISK_NAME}" \
+    --zone="${ZONE}" \
+    --project="${PROJECT}" --quiet || true
+
+echo "  Disk detached (or was not attached)."
+
+# ---------------------------------------------------------------------------
+# Step 4: Delete the old data disk (may not exist on first run)
+# ---------------------------------------------------------------------------
+echo "[4/7] Deleting old data disk ${DISK_NAME}..."
+
+# Best-effort as well. NOTE(review): every delete failure is ignored here, not
+# just "disk not found"; a leftover disk would presumably fail step 5's create.
+gcloud compute disks delete "${DISK_NAME}" \
+    --zone="${ZONE}" --project="${PROJECT}" --quiet || true
+
+echo "  Disk deleted (or did not exist)."
+
+# ---------------------------------------------------------------------------
+# Step 5: Create new disk from the production snapshot
+# ---------------------------------------------------------------------------
+echo "[5/7] Creating new disk ${DISK_NAME} from snapshot ${SNAPSHOT_NAME}..."
+
+gcloud compute disks create "${DISK_NAME}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --source-snapshot="${SNAPSHOT_NAME}" \
+    --type="pd-ssd" \
+    --quiet
+
+echo "  Disk created."
+
+# ---------------------------------------------------------------------------
+# Step 6: Attach the new disk and start the staging VM
+# ---------------------------------------------------------------------------
+echo "[6/7] Attaching disk ${DISK_NAME} to ${STAGING_VM}..."
+
+gcloud compute instances attach-disk "${STAGING_VM}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --disk="${DISK_NAME}" \
+    --device-name="${DEVICE_NAME}" \
+    --mode=rw \
+    --quiet
+
+echo "  Disk attached."
+
+echo "  Starting staging VM ${STAGING_VM}..."
+
+gcloud compute instances start "${STAGING_VM}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --quiet
+
+echo "  VM start command issued."
+
+# ---------------------------------------------------------------------------
+# Step 7: Validate VM is running and Neo4j starts successfully
+# ---------------------------------------------------------------------------
+echo "[7/7] Waiting for VM to reach RUNNING status and Neo4j to start..."
+
+# Wait for VM to reach RUNNING status
+for i in $(seq 1 "${MAX_BOOT_ATTEMPTS}"); do
+    VM_STATUS=$(gcloud compute instances describe "${STAGING_VM}" \
+        --project="${PROJECT}" \
+        --zone="${ZONE}" \
+        --format="value(status)")
+
+    if [[ "${VM_STATUS}" == "RUNNING" ]]; then
+        echo "  VM is RUNNING (attempt ${i}/${MAX_BOOT_ATTEMPTS})"
+        break
+    fi
+
+    if [[ "${i}" -eq "${MAX_BOOT_ATTEMPTS}" ]]; then
+        echo "ERROR: VM did not reach RUNNING status within $((MAX_BOOT_ATTEMPTS * BOOT_POLL_INTERVAL)) seconds"
+        echo "  Current status: ${VM_STATUS}"
+        exit 1
+    fi
+
+    echo "  VM status: ${VM_STATUS} - waiting ${BOOT_POLL_INTERVAL}s (attempt ${i}/${MAX_BOOT_ATTEMPTS})..."
+    sleep "${BOOT_POLL_INTERVAL}"
+done
+
+# Check serial port output for Neo4j startup confirmation
+echo "  Checking serial port for Neo4j startup confirmation..."
+NEO4J_STARTED=false
+
+for i in $(seq 1 "${MAX_BOOT_ATTEMPTS}"); do
+    SERIAL_OUTPUT=$(gcloud compute instances get-serial-port-output "${STAGING_VM}" \
+        --project="${PROJECT}" \
+        --zone="${ZONE}" \
+        --port=1 2>/dev/null || echo "")
+
+    if echo "${SERIAL_OUTPUT}" | grep -q "Neo4j staging server started successfully"; then
+        NEO4J_STARTED=true
+        echo "  Neo4j startup confirmed via serial port (attempt ${i}/${MAX_BOOT_ATTEMPTS})"
+        break
+    fi
+
+    if [[ "${i}" -eq "${MAX_BOOT_ATTEMPTS}" ]]; then
+        echo "WARNING: Neo4j startup message not found in serial port output after $((MAX_BOOT_ATTEMPTS * BOOT_POLL_INTERVAL)) seconds"
+        echo "  The VM is running but Neo4j may still be starting."
+        echo "  Check manually: gcloud compute instances get-serial-port-output ${STAGING_VM} --zone=${ZONE} --project=${PROJECT}"
+        break
+    fi
+
+    echo "  Neo4j not yet started - waiting ${BOOT_POLL_INTERVAL}s (attempt ${i}/${MAX_BOOT_ATTEMPTS})..."
+    sleep "${BOOT_POLL_INTERVAL}"
+done
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+echo ""
+echo "=========================================="
+echo "Production -> Staging Refresh Complete"
+echo "=========================================="
+echo "Snapshot used: ${SNAPSHOT_NAME}"
+echo "Disk created:  ${DISK_NAME}"
+echo "VM status:     ${VM_STATUS}"
+echo "Neo4j started: ${NEO4J_STARTED}"
+echo "=========================================="
+
+exit 0
diff --git a/deploy/terraform/backup.tf b/deploy/terraform/backup.tf
index 9750816..df2b14a 100644
--- a/deploy/terraform/backup.tf
+++ b/deploy/terraform/backup.tf
@@ -66,3 +66,63 @@ resource "google_project_iam_member" "backup_ops_run_invoker" {
   role    = "roles/run.invoker"
   member  = "serviceAccount:${google_service_account.backup_ops.email}"
 }
+
+# =============================================================================
+# Nightly Production-to-Staging Data Refresh
+# =============================================================================
+# Restores staging Neo4j from the latest production disk snapshot.
+# Runs as a Cloud Run Job using gcloud CLI (no SSH, no VPC access needed).
+# =============================================================================
+
+# -----------------------------------------------------------------------------
+# Cloud Run Job — Staging Data Refresh
+# -----------------------------------------------------------------------------
+resource "google_cloud_run_v2_job" "staging_data_refresh" {
+  location = var.region
+  name     = "staging-data-refresh"
+
+  template {
+    template {
+      # Runs as the backup_ops service account; 1800s timeout, one retry.
+      service_account = google_service_account.backup_ops.email
+      max_retries     = 1
+      timeout         = "1800s"
+
+      containers {
+        image = "gcr.io/google.com/cloudsdktool/cloud-sdk:slim"
+
+        # The script file's contents are inlined via file() and passed to `bash -c`.
+        command = ["bash", "-c"]
+        args    = [file("${path.module}/../scripts/sync-prod-to-staging.sh")]
+
+        resources {
+          limits = { cpu = "1", memory = "512Mi" }
+        }
+      }
+    }
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Cloud Scheduler — Nightly at 3 AM UTC (1 hour after snapshot)
+# -----------------------------------------------------------------------------
+resource "google_cloud_scheduler_job" "staging_data_refresh" {
+  name        = "staging-data-refresh-nightly"
+  region      = var.region
+  description = "Nightly production-to-staging Neo4j data refresh via disk snapshot restore"
+  time_zone   = "UTC"
+  schedule    = "0 3 * * *"
+
+  retry_config {
+    retry_count = 1
+  }
+
+  # POSTs to the Cloud Run job's ":run" URL with an OAuth token for the scheduler SA.
+  http_target {
+    uri         = "https://${var.region}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${var.project_id}/jobs/${google_cloud_run_v2_job.staging_data_refresh.name}:run"
+    http_method = "POST"
+    oauth_token {
+      service_account_email = google_service_account.scheduler.email
+    }
+  }
+}