From 276a5e3fcb812d9f6ded4bd662794ce7e6a673a1 Mon Sep 17 00:00:00 2001 From: Gemini Agent Date: Thu, 12 Feb 2026 11:21:03 +0100 Subject: [PATCH] Phase 2: Nightly prod-to-staging data refresh Add bash script and Terraform resources for automated nightly production-to-staging Neo4j data refresh via GCP disk snapshot restore. - sync-prod-to-staging.sh: finds latest prod snapshot, stops staging VM, swaps disk (delete old + create from snapshot with same name), starts VM, validates Neo4j startup via serial port output - Cloud Run Job (cloud-sdk:slim) to execute the script - Cloud Scheduler at 0 3 * * * UTC (1 hour after daily snapshot) --- deploy/scripts/sync-prod-to-staging.sh | 206 +++++++++++++++++++++++++ deploy/terraform/backup.tf | 60 +++++++ 2 files changed, 266 insertions(+) create mode 100755 deploy/scripts/sync-prod-to-staging.sh diff --git a/deploy/scripts/sync-prod-to-staging.sh b/deploy/scripts/sync-prod-to-staging.sh new file mode 100755 index 0000000..0e2ca51 --- /dev/null +++ b/deploy/scripts/sync-prod-to-staging.sh @@ -0,0 +1,206 @@ +#!/bin/bash +# ============================================================================= +# Sync Production Neo4j Data to Staging +# ============================================================================= +# This script refreshes staging Neo4j with production data by: +# 1. Finding the latest READY snapshot of the production data disk +# 2. Stopping the staging VM +# 3. Detaching and deleting the old staging data disk +# 4. Creating a new disk from the production snapshot (same name) +# 5. Attaching the new disk and starting the staging VM +# 6. Validating Neo4j starts successfully +# +# All operations use gcloud compute API calls (no SSH required). +# Designed to run as a Cloud Run Job on a nightly schedule. 
+#
+# Usage: ./sync-prod-to-staging.sh
+# =============================================================================
+
+# Abort on any failed command, unset variable, or failed pipeline stage.
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+# GCP_PROJECT_ID and GCP_ZONE may be supplied by the environment; the values
+# after ":-" are the fallbacks used when they are unset or empty.
+PROJECT="${GCP_PROJECT_ID:-ai-knowledge-base-42}"
+ZONE="${GCP_ZONE:-us-central1-a}"
+PROD_DISK="neo4j-prod-data-disk"
+STAGING_VM="neo4j-staging"
+DISK_NAME="neo4j-staging-data-disk"
+DEVICE_NAME="neo4j-staging-data"
+# Polling budget shared by the boot checks: attempts x seconds between them.
+MAX_BOOT_ATTEMPTS=18
+BOOT_POLL_INTERVAL=10
+
+echo "=========================================="
+echo "Production -> Staging Neo4j Data Refresh"
+echo "=========================================="
+echo "Project: ${PROJECT}"
+echo "Zone: ${ZONE}"
+echo "Prod Disk: ${PROD_DISK}"
+echo "Staging VM: ${STAGING_VM}"
+echo "Target Disk: ${DISK_NAME}"
+echo "=========================================="
+echo ""
+
+# ---------------------------------------------------------------------------
+# Step 1: Find the latest READY snapshot of the production data disk
+# ---------------------------------------------------------------------------
+echo "[1/7] Finding latest READY snapshot of ${PROD_DISK}..."
+
+# A snapshot's sourceDisk is the disk's resource URL (.../disks/<name>), so the
+# regex is anchored on "/disks/<name>$". An unanchored match would also accept
+# snapshots of any other disk whose name merely contains ${PROD_DISK}.
+# Newest first, one result, name only.
+SNAPSHOT_NAME=$(gcloud compute snapshots list \
+    --project="${PROJECT}" \
+    --filter="sourceDisk~'/disks/${PROD_DISK}\$' AND status=READY" \
+    --sort-by="~creationTimestamp" \
+    --limit=1 \
+    --format="value(name)")
+
+# An empty result means nothing matched; stop before touching staging.
+if [[ -z "${SNAPSHOT_NAME}" ]]; then
+  echo "ERROR: No READY snapshot found for disk ${PROD_DISK}"
+  echo "Ensure the snapshot schedule policy is active and has run at least once."
+  exit 1
+fi
+
+echo " Found snapshot: ${SNAPSHOT_NAME}"
+
+# ---------------------------------------------------------------------------
+# Step 2: Stop the staging VM
+# ---------------------------------------------------------------------------
+echo "[2/7] Stopping staging VM ${STAGING_VM}..."
+
+gcloud compute instances stop "${STAGING_VM}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --quiet
+
+echo " VM stopped."
+
+# ---------------------------------------------------------------------------
+# Step 3: Detach the old data disk (may not exist on first run)
+# ---------------------------------------------------------------------------
+echo "[3/7] Detaching old data disk ${DISK_NAME} from ${STAGING_VM}..."
+
+# Ask the API which disks the VM has attached instead of ignoring every
+# detach error with "|| true": "not attached" remains a no-op, while a real
+# failure (permissions, API error) now aborts before the delete/create steps.
+ATTACHED_DISK_SOURCES=$(gcloud compute instances describe "${STAGING_VM}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --format="value(disks[].source)")
+
+# Attached disks are reported as URLs ending in /disks/<name>. The trailing
+# group stops a disk whose name only starts with ${DISK_NAME} from matching.
+ATTACHED_DISK_PATTERN="/disks/${DISK_NAME}([^a-z0-9-]|\$)"
+
+if [[ "${ATTACHED_DISK_SOURCES}" =~ ${ATTACHED_DISK_PATTERN} ]]; then
+  gcloud compute instances detach-disk "${STAGING_VM}" \
+      --project="${PROJECT}" \
+      --zone="${ZONE}" \
+      --disk="${DISK_NAME}" \
+      --quiet
+fi
+
+echo " Disk detached (or was not attached)."
+
+# ---------------------------------------------------------------------------
+# Step 4: Delete the old data disk (may not exist on first run)
+# ---------------------------------------------------------------------------
+echo "[4/7] Deleting old data disk ${DISK_NAME}..."
+
+# Only delete when the disk can be described. A failed delete of an existing
+# disk is then a hard error rather than being swallowed, which would otherwise
+# surface later as a confusing "already exists" failure in step 5.
+if gcloud compute disks describe "${DISK_NAME}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --format="value(name)" >/dev/null 2>&1; then
+  gcloud compute disks delete "${DISK_NAME}" \
+      --project="${PROJECT}" \
+      --zone="${ZONE}" \
+      --quiet
+fi
+
+echo " Disk deleted (or did not exist)."
+
+# ---------------------------------------------------------------------------
+# Step 5: Create new disk from the production snapshot
+# ---------------------------------------------------------------------------
+echo "[5/7] Creating new disk ${DISK_NAME} from snapshot ${SNAPSHOT_NAME}..."
+
+# Reuses the old disk's name so the attach step and anything else that refers
+# to ${DISK_NAME} keeps working.
+gcloud compute disks create "${DISK_NAME}" \
+    --project="${PROJECT}" \
+    --zone="${ZONE}" \
+    --source-snapshot="${SNAPSHOT_NAME}" \
+    --type="pd-ssd" \
+    --quiet
+
+echo " Disk created."
+
+# ---------------------------------------------------------------------------
+# Step 6: Attach the new disk and start the staging VM
+# ---------------------------------------------------------------------------
+echo "[6/7] Attaching disk ${DISK_NAME} to ${STAGING_VM}..."
+
+# Project/zone flags shared by every instance command in this section.
+vm_scope_flags=(--project="${PROJECT}" --zone="${ZONE}")
+
+gcloud compute instances attach-disk "${STAGING_VM}" "${vm_scope_flags[@]}" \
+    --disk="${DISK_NAME}" \
+    --device-name="${DEVICE_NAME}" \
+    --mode=rw \
+    --quiet
+
+echo " Disk attached."
+
+echo " Starting staging VM ${STAGING_VM}..."
+
+gcloud compute instances start "${STAGING_VM}" "${vm_scope_flags[@]}" --quiet
+
+echo " VM start command issued."
+
+# ---------------------------------------------------------------------------
+# Step 7: Validate VM is running and Neo4j starts successfully
+# ---------------------------------------------------------------------------
+echo "[7/7] Waiting for VM to reach RUNNING status and Neo4j to start..."
+
+# Poll the instance status until it reports RUNNING. The last attempt exits
+# with an error instead of sleeping again.
+for ((attempt = 1; attempt <= MAX_BOOT_ATTEMPTS; attempt++)); do
+  attempt_label="attempt ${attempt}/${MAX_BOOT_ATTEMPTS}"
+
+  VM_STATUS=$(gcloud compute instances describe "${STAGING_VM}" \
+      "${vm_scope_flags[@]}" \
+      --format="value(status)")
+
+  if [[ "${VM_STATUS}" == "RUNNING" ]]; then
+    echo " VM is RUNNING (${attempt_label})"
+    break
+  fi
+
+  if ((attempt == MAX_BOOT_ATTEMPTS)); then
+    echo "ERROR: VM did not reach RUNNING status within $((MAX_BOOT_ATTEMPTS * BOOT_POLL_INTERVAL)) seconds"
+    echo " Current status: ${VM_STATUS}"
+    exit 1
+  fi
+
+  echo " VM status: ${VM_STATUS} - waiting ${BOOT_POLL_INTERVAL}s (${attempt_label})..."
+  sleep "${BOOT_POLL_INTERVAL}"
+done
+
+# Next: look for the Neo4j startup message in the serial console output.
+echo " Checking serial port for Neo4j startup confirmation..."
+NEO4J_STARTED=false
+
+# Poll serial port 1 for the startup marker. This check is best-effort: running
+# out of attempts prints a warning and the script still finishes with exit 0.
+for ((i = 1; i <= MAX_BOOT_ATTEMPTS; i++)); do
+  # A failed fetch is treated as "no output yet" rather than a fatal error.
+  SERIAL_OUTPUT=$(gcloud compute instances get-serial-port-output "${STAGING_VM}" \
+      --project="${PROJECT}" \
+      --zone="${ZONE}" \
+      --port=1 2>/dev/null || echo "")
+
+  # Feed grep from a here-string instead of "echo ... | grep -q". Under
+  # "set -o pipefail", grep -q exiting on an early match can kill echo with
+  # SIGPIPE on large output, which makes the pipeline fail and hides the match.
+  if grep -qF -- "Neo4j staging server started successfully" <<<"${SERIAL_OUTPUT}"; then
+    NEO4J_STARTED=true
+    echo " Neo4j startup confirmed via serial port (attempt ${i}/${MAX_BOOT_ATTEMPTS})"
+    break
+  fi
+
+  if ((i == MAX_BOOT_ATTEMPTS)); then
+    echo "WARNING: Neo4j startup message not found in serial port output after $((MAX_BOOT_ATTEMPTS * BOOT_POLL_INTERVAL)) seconds"
+    echo " The VM is running but Neo4j may still be starting."
+    echo " Check manually: gcloud compute instances get-serial-port-output ${STAGING_VM} --zone=${ZONE} --project=${PROJECT}"
+    break
+  fi
+
+  echo " Neo4j not yet started - waiting ${BOOT_POLL_INTERVAL}s (attempt ${i}/${MAX_BOOT_ATTEMPTS})..."
+  sleep "${BOOT_POLL_INTERVAL}"
+done
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+echo ""
+echo "=========================================="
+echo "Production -> Staging Refresh Complete"
+echo "=========================================="
+echo "Snapshot used: ${SNAPSHOT_NAME}"
+echo "Disk created: ${DISK_NAME}"
+echo "VM status: ${VM_STATUS}"
+echo "Neo4j started: ${NEO4J_STARTED}"
+echo "=========================================="
+
+exit 0
diff --git a/deploy/terraform/backup.tf b/deploy/terraform/backup.tf
index 9750816..df2b14a 100644
--- a/deploy/terraform/backup.tf
+++ b/deploy/terraform/backup.tf
@@ -66,3 +66,63 @@ resource "google_project_iam_member" "backup_ops_run_invoker" {
   role   = "roles/run.invoker"
   member = "serviceAccount:${google_service_account.backup_ops.email}"
 }
+
+# =============================================================================
+# Nightly Production-to-Staging Data Refresh
+#
+# =============================================================================
+# Restores staging Neo4j from the latest production disk snapshot.
+# Runs as a Cloud Run Job using gcloud CLI (no SSH, no VPC access needed).
+# =============================================================================
+
+# -----------------------------------------------------------------------------
+# Cloud Run Job — Staging Data Refresh
+# -----------------------------------------------------------------------------
+# Runs sync-prod-to-staging.sh inside the cloud-sdk image. The script text is
+# read at plan time with file() and passed to `bash -c` as a single argument.
+resource "google_cloud_run_v2_job" "staging_data_refresh" {
+  name     = "staging-data-refresh"
+  location = var.region
+
+  template {
+    template {
+      containers {
+        image   = "gcr.io/google.com/cloudsdktool/cloud-sdk:slim"
+        command = ["bash", "-c"]
+        args    = [file("${path.module}/../scripts/sync-prod-to-staging.sh")]
+
+        # The script reads GCP_PROJECT_ID and otherwise falls back to a
+        # hard-coded project. Pass the configured project so the job works on
+        # the same project that the scheduler URI below is built from.
+        env {
+          name  = "GCP_PROJECT_ID"
+          value = var.project_id
+        }
+
+        resources {
+          limits = {
+            cpu    = "1"
+            memory = "512Mi"
+          }
+        }
+      }
+
+      # 30-minute execution limit, one retry, run as the backup-ops account.
+      timeout         = "1800s"
+      max_retries     = 1
+      service_account = google_service_account.backup_ops.email
+    }
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Cloud Scheduler — Nightly at 3 AM UTC (1 hour after snapshot)
+# -----------------------------------------------------------------------------
+# Triggers the job above by POSTing to its ":run" endpoint with an OAuth token
+# for the scheduler service account.
+resource "google_cloud_scheduler_job" "staging_data_refresh" {
+  name        = "staging-data-refresh-nightly"
+  description = "Nightly production-to-staging Neo4j data refresh via disk snapshot restore"
+  schedule    = "0 3 * * *"
+  time_zone   = "UTC"
+  region      = var.region
+
+  http_target {
+    http_method = "POST"
+    uri         = "https://${var.region}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${var.project_id}/jobs/${google_cloud_run_v2_job.staging_data_refresh.name}:run"
+
+    oauth_token {
+      service_account_email = google_service_account.scheduler.email
+    }
+  }
+
+  retry_config {
+    retry_count = 1
+  }
+}