#!/bin/bash
# (patch: deploy/scripts/dr-recovery-test.sh — new file, mode 100755)
# =============================================================================
# DR Recovery Test
# =============================================================================
# Monthly automated disaster recovery test that:
# 1. Finds the latest READY snapshot of the production Neo4j data disk
# 2. Creates a temporary disk from that snapshot
# 3. Creates a temporary VM with the DR startup script
# 4. Waits for Neo4j to become available
# 5. Validates data integrity via Neo4j HTTP REST API
# 6. Reports PASS/FAIL with all counts
# 7. Cleans up all temporary resources (VM + disk) on exit
#
# Designed to run as a Cloud Run Job on a monthly schedule.
#
# Usage: ./dr-recovery-test.sh
# =============================================================================

set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration (overridable via environment variables)
# ---------------------------------------------------------------------------
PROJECT="${GCP_PROJECT_ID:-ai-knowledge-base-42}"
ZONE="${GCP_ZONE:-us-central1-a}"
REGION="${GCP_REGION:-us-central1}"
PROD_DISK="neo4j-prod-data-disk"
DR_VM="neo4j-dr-test"
DR_DISK="neo4j-dr-test-disk"
# NOTE(review): date-derived password is predictable; presumably acceptable
# because the DR VM has no external IP and lives only minutes — confirm.
DR_PASSWORD="dr-test-$(date +%Y%m%d)"
NETWORK="knowledge-base-vpc"
SUBNET="knowledge-base-subnet"

# Polling configuration
MAX_POLL_ATTEMPTS=30
POLL_INTERVAL=10

# Validation thresholds (checks below are strictly greater-than)
MIN_NODES=1000
MIN_RELATIONSHIPS=5000
MIN_ENTITIES=100

# ---------------------------------------------------------------------------
# Cleanup trap — always delete temporary resources on exit (pass or fail).
# The VM is deleted first so the disk is detached before its own delete;
# failures are tolerated so cleanup never masks the test's exit status.
# ---------------------------------------------------------------------------
cleanup() {
  echo ""
  echo "Cleaning up DR test resources..."
  gcloud compute instances delete "${DR_VM}" --project="${PROJECT}" --zone="${ZONE}" --quiet 2>/dev/null || true
  gcloud compute disks delete "${DR_DISK}" --project="${PROJECT}" --zone="${ZONE}" --quiet 2>/dev/null || true
  echo "Cleanup complete."
}
trap cleanup EXIT

# ---------------------------------------------------------------------------
# extract_count RESPONSE_JSON
#   Prints the single scalar from a Neo4j tx/commit response.
#   Fix: a response carrying a non-empty "errors" array (e.g. auth failure
#   against restored data) now exits non-zero with a readable message instead
#   of dying with an opaque KeyError/IndexError traceback.
# ---------------------------------------------------------------------------
extract_count() {
  printf '%s' "$1" | python3 -c '
import json, sys
d = json.load(sys.stdin)
if d.get("errors"):
    sys.exit("Neo4j query error: %s" % d["errors"])
print(d["results"][0]["data"][0]["row"][0])
'
}

# ---------------------------------------------------------------------------
# run_cypher QUERY
#   Runs a Cypher statement via the Neo4j HTTP REST API on the DR VM.
#   Relies on VM_IP being assigned before first use (step 4).
#   NOTE: QUERY is interpolated into a JSON literal, so it must not contain
#   double quotes or backslashes — all call sites use fixed queries.
# ---------------------------------------------------------------------------
run_cypher() {
  local query="$1"
  curl -s -u "neo4j:${DR_PASSWORD}" \
    -H "Content-Type: application/json" \
    -d "{\"statements\": [{\"statement\": \"${query}\"}]}" \
    "http://${VM_IP}:7474/db/neo4j/tx/commit"
}

echo "=========================================="
echo "DR Recovery Test"
echo "=========================================="
echo "Project: ${PROJECT}"
echo "Zone: ${ZONE}"
echo "Prod Disk: ${PROD_DISK}"
echo "DR VM: ${DR_VM}"
echo "DR Disk: ${DR_DISK}"
echo "=========================================="
echo ""

# ---------------------------------------------------------------------------
# Step 1: Find the latest READY snapshot of the production data disk
# ---------------------------------------------------------------------------
echo "[1/6] Finding latest READY snapshot of ${PROD_DISK}..."
# Fix: the filter regex is anchored with "$" — the original unanchored
# "sourceDisk~${PROD_DISK}" could also match e.g. "neo4j-prod-data-disk-old"
# and restore a snapshot of the wrong disk.
SNAPSHOT_NAME=$(gcloud compute snapshots list \
  --project="${PROJECT}" \
  --filter="sourceDisk~${PROD_DISK}\$ AND status=READY" \
  --sort-by="~creationTimestamp" \
  --limit=1 \
  --format="value(name)")

if [[ -z "${SNAPSHOT_NAME}" ]]; then
  echo "ERROR: No READY snapshot found for disk ${PROD_DISK}"
  echo "Ensure the snapshot schedule policy is active and has run at least once."
  exit 1
fi

echo " Found snapshot: ${SNAPSHOT_NAME}"

# ---------------------------------------------------------------------------
# Step 2: Create a disk from the snapshot
# ---------------------------------------------------------------------------
echo "[2/6] Creating disk ${DR_DISK} from snapshot ${SNAPSHOT_NAME}..."

gcloud compute disks create "${DR_DISK}" \
  --project="${PROJECT}" \
  --zone="${ZONE}" \
  --source-snapshot="${SNAPSHOT_NAME}" \
  --type="pd-ssd" \
  --quiet

echo " Disk created."

# ---------------------------------------------------------------------------
# Step 3: Create a temporary VM with the DR startup script
# ---------------------------------------------------------------------------
echo "[3/6] Creating temporary VM ${DR_VM}..."

# Write the DR startup script to a temp file (since this script runs inlined
# in a Cloud Run Job container, we cannot reference external files).
# The heredoc delimiter is quoted, so nothing below expands here — the script
# runs verbatim on the VM.
DR_STARTUP_SCRIPT=$(mktemp)
cat > "${DR_STARTUP_SCRIPT}" << 'STARTUP_EOF'
#!/bin/bash
set -euo pipefail

DATA_DISK="/dev/disk/by-id/google-neo4j-dr-data"
MOUNT_POINT="/data"

mkdir -p "${MOUNT_POINT}"

# Format only when the device carries no filesystem signature; a disk
# restored from the production snapshot already has one.
if ! blkid "${DATA_DISK}"; then
  echo "Formatting data disk..."
  mkfs.ext4 -F "${DATA_DISK}"
fi

if ! mountpoint -q "${MOUNT_POINT}"; then
  mount "${DATA_DISK}" "${MOUNT_POINT}"
else
  echo "Data disk already mounted at ${MOUNT_POINT}"
fi

if ! grep -q "${DATA_DISK}" /etc/fstab; then
  echo "${DATA_DISK} ${MOUNT_POINT} ext4 defaults 0 2" >> /etc/fstab
fi

mkdir -p "${MOUNT_POINT}/neo4j/data" "${MOUNT_POINT}/neo4j/logs" "${MOUNT_POINT}/neo4j/plugins"

# Reset auth files so the new NEO4J_AUTH takes effect on restored data.
# NOTE(review): Neo4j 5 keeps credentials in the system database rather than
# only in data/dbms/auth* — verify this reset actually applies DR_PASSWORD
# on the 5.26 image used below.
rm -f "${MOUNT_POINT}/neo4j/data/dbms/auth.ini" "${MOUNT_POINT}/neo4j/data/dbms/auth" 2>/dev/null || true

# NOTE(review): the VM is created with --no-address, so apt-get and
# docker pull below need Cloud NAT on the subnet — confirm it exists.
apt-get update
apt-get install -y docker.io
systemctl enable docker
systemctl start docker

# Password comes from instance metadata; fall back to a fixed value so the
# container still starts (an auth mismatch then surfaces in validation).
NEO4J_PASSWORD=$(curl -sf "http://metadata.google.internal/computeMetadata/v1/instance/attributes/neo4j-password" -H "Metadata-Flavor: Google" 2>/dev/null || echo "dr-test-password")

docker pull neo4j:5.26-community
docker stop neo4j-dr 2>/dev/null || true
docker rm neo4j-dr 2>/dev/null || true

# Same JVM memory and APOC settings as production.
docker run -d \
  --name neo4j-dr \
  --restart always \
  -p 7687:7687 \
  -p 7474:7474 \
  -v "${MOUNT_POINT}/neo4j/data":/data \
  -v "${MOUNT_POINT}/neo4j/logs":/logs \
  -e NEO4J_AUTH="neo4j/${NEO4J_PASSWORD}" \
  -e NEO4J_server_memory_heap_initial__size=2G \
  -e NEO4J_server_memory_heap_max__size=4G \
  -e NEO4J_server_memory_pagecache_size=1G \
  -e NEO4J_PLUGINS='["apoc"]' \
  -e NEO4J_dbms_security_procedures_unrestricted='apoc.*' \
  neo4j:5.26-community 2>&1 | tee /tmp/docker-run.log

echo "Neo4j DR test server started successfully"
sleep 5
docker logs neo4j-dr > "${MOUNT_POINT}/neo4j/docker-startup.log" 2>&1 || true
STARTUP_EOF

# Look up the backup-ops service account email.
BACKUP_SA=$(gcloud iam service-accounts list \
  --project="${PROJECT}" \
  --filter="email~backup-ops" \
  --format="value(email)" \
  --limit=1)

# Fix: use an array instead of an unquoted ${SA_FLAG} string so the optional
# flag is passed as exactly zero or one argument (no word-splitting surprises;
# empty-array expansion is safe under set -u on bash >= 4.4).
SA_ARGS=()
if [[ -z "${BACKUP_SA}" ]]; then
  echo "WARNING: backup-ops service account not found, creating VM without explicit SA"
else
  SA_ARGS=(--service-account="${BACKUP_SA}")
fi

gcloud compute instances create "${DR_VM}" \
  --project="${PROJECT}" \
  --zone="${ZONE}" \
  --machine-type="e2-standard-2" \
  --image-family="debian-12" \
  --image-project="debian-cloud" \
  --no-address \
  --network="${NETWORK}" \
  --subnet="${SUBNET}" \
  --tags="neo4j-dr-test" \
  --disk="name=${DR_DISK},device-name=neo4j-dr-data,mode=rw,auto-delete=no" \
  --metadata="neo4j-password=${DR_PASSWORD}" \
  --metadata-from-file="startup-script=${DR_STARTUP_SCRIPT}" \
  --no-restart-on-failure \
  "${SA_ARGS[@]}" \
  --quiet

rm -f "${DR_STARTUP_SCRIPT}"
echo " VM created."

# ---------------------------------------------------------------------------
# Step 4: Get the VM internal IP and wait for Neo4j
# ---------------------------------------------------------------------------
echo "[4/6] Getting VM internal IP and waiting for Neo4j to start..."

VM_IP=$(gcloud compute instances describe "${DR_VM}" \
  --project="${PROJECT}" \
  --zone="${ZONE}" \
  --format="value(networkInterfaces[0].networkIP)")

echo " VM internal IP: ${VM_IP}"
echo " Waiting for Neo4j HTTP API on port 7474 (up to $((MAX_POLL_ATTEMPTS * POLL_INTERVAL))s)..."

# Poll the HTTP discovery endpoint until it answers 200 or attempts run out.
# NOTE(review): GET / answers 200 regardless of credentials, so this proves
# liveness, not auth — an auth problem surfaces in step 5 instead.
for (( attempt = 1; attempt <= MAX_POLL_ATTEMPTS; attempt++ )); do
  HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
    -u "neo4j:${DR_PASSWORD}" \
    "http://${VM_IP}:7474/" 2>/dev/null || echo "000")

  if [[ "${HTTP_STATUS}" == "200" ]]; then
    echo " Neo4j is ready (attempt ${attempt}/${MAX_POLL_ATTEMPTS})"
    break
  fi

  if (( attempt == MAX_POLL_ATTEMPTS )); then
    echo "ERROR: Neo4j did not become available within $((MAX_POLL_ATTEMPTS * POLL_INTERVAL)) seconds"
    echo " Last HTTP status: ${HTTP_STATUS}"
    exit 1
  fi

  echo " Neo4j not ready (HTTP ${HTTP_STATUS}) - waiting ${POLL_INTERVAL}s (attempt ${attempt}/${MAX_POLL_ATTEMPTS})..."
  sleep "${POLL_INTERVAL}"
done

# ---------------------------------------------------------------------------
# Step 5: Run validation queries via Neo4j HTTP REST API
# ---------------------------------------------------------------------------
echo "[5/6] Running validation queries..."
# Query: total node count
echo " Querying total node count..."
RESULT_NODES=$(run_cypher "MATCH (n) RETURN count(n)")
NODE_COUNT=$(extract_count "${RESULT_NODES}")
echo " Total nodes: ${NODE_COUNT}"

# Query: total relationship count
echo " Querying total relationship count..."
RESULT_RELS=$(run_cypher "MATCH ()-[r]->() RETURN count(r)")
REL_COUNT=$(extract_count "${RESULT_RELS}")
echo " Total relationships: ${REL_COUNT}"

# Query: Entity node count
echo " Querying Entity node count..."
RESULT_ENTITIES=$(run_cypher "MATCH (n:Entity) RETURN count(n)")
ENTITY_COUNT=$(extract_count "${RESULT_ENTITIES}")
echo " Entity nodes: ${ENTITY_COUNT}"

# Query: Episodic node count (informational only — no threshold check)
echo " Querying Episodic node count..."
RESULT_EPISODIC=$(run_cypher "MATCH (n:Episodic) RETURN count(n)")
EPISODIC_COUNT=$(extract_count "${RESULT_EPISODIC}")
echo " Episodic nodes: ${EPISODIC_COUNT}"

# ---------------------------------------------------------------------------
# Step 6: Report PASS/FAIL
# ---------------------------------------------------------------------------
echo ""
echo "[6/6] Validation Results"
echo "=========================================="

PASS=true

# check_threshold LABEL VALUE MIN
#   Emits one PASS/FAIL line (strictly greater-than check) and flips the
#   global PASS flag on failure. Replaces three copy-pasted if/else blocks;
#   the output format matches the originals byte-for-byte.
check_threshold() {
  local label="$1" value="$2" min="$3"
  if [[ "${value}" -gt "${min}" ]]; then
    echo " PASS: ${label} (${value}) > ${min}"
  else
    echo " FAIL: ${label} (${value}) <= ${min}"
    PASS=false
  fi
}

check_threshold "Total nodes" "${NODE_COUNT}" "${MIN_NODES}"
check_threshold "Total relationships" "${REL_COUNT}" "${MIN_RELATIONSHIPS}"
check_threshold "Entity nodes" "${ENTITY_COUNT}" "${MIN_ENTITIES}"

echo " INFO: Episodic nodes: ${EPISODIC_COUNT}"
echo "=========================================="

if [[ "${PASS}" == "true" ]]; then
  echo "DR RECOVERY TEST: PASS"
  echo "=========================================="
  exit 0
else
  echo "DR RECOVERY TEST: FAIL"
  echo "=========================================="
  exit 1
fi

# ===========================================================================
# (patch boundary) deploy/terraform/backup.tf — resources appended after the
# existing staging_data_refresh scheduler job (hunk context: retry_count = 1).
# ===========================================================================

# =============================================================================
# Monthly DR Recovery Test
# =============================================================================
# Creates a temporary VM from the latest production disk snapshot, validates
# Neo4j data integrity via HTTP REST API, and reports PASS/FAIL.
# All temporary resources (VM + disk) are cleaned up automatically.
# =============================================================================

# -----------------------------------------------------------------------------
# Cloud Run Job — DR Recovery Test
# -----------------------------------------------------------------------------
resource "google_cloud_run_v2_job" "dr_recovery_test" {
  name     = "dr-recovery-test"
  location = var.region

  template {
    template {
      containers {
        image = "gcr.io/google.com/cloudsdktool/cloud-sdk:slim"
        # The whole shell script is inlined as the single argument to
        # `bash -c`, so no custom container image build is required.
        command = ["bash", "-c"]
        args    = [file("${path.module}/../scripts/dr-recovery-test.sh")]

        resources {
          limits = {
            cpu    = "1"
            memory = "512Mi"
          }
        }
      }

      # One-hour budget covers VM boot + docker pull + Neo4j start + queries.
      timeout         = "3600s"
      max_retries     = 0
      service_account = google_service_account.backup_ops.email

      # PRIVATE_RANGES_ONLY: only private-range traffic (curl to the DR VM's
      # internal IP) rides the connector; gcloud's Google API calls keep the
      # default public egress path.
      vpc_access {
        connector = google_vpc_access_connector.connector.id
        egress    = "PRIVATE_RANGES_ONLY"
      }
    }
  }
}

# -----------------------------------------------------------------------------
# Cloud Scheduler — Monthly on the 1st at 4 AM UTC
# -----------------------------------------------------------------------------
resource "google_cloud_scheduler_job" "dr_recovery_test" {
  name        = "dr-recovery-test-monthly"
  description = "Monthly automated DR recovery test — creates temp VM from prod snapshot and validates Neo4j data"
  schedule    = "0 4 1 * *"
  time_zone   = "UTC"
  region      = var.region

  http_target {
    http_method = "POST"
    # v1 namespaces-style run endpoint for Cloud Run jobs; authenticated with
    # an OAuth token because the target is a googleapis.com URL.
    uri = "https://${var.region}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${var.project_id}/jobs/${google_cloud_run_v2_job.dr_recovery_test.name}:run"

    oauth_token {
      service_account_email = google_service_account.scheduler.email
    }
  }

  retry_config {
    retry_count = 0
  }
}

# ===========================================================================
# (patch boundary) deploy/terraform/scripts/neo4j-dr-startup.sh — new file,
# mode 100755.
# ===========================================================================

#!/bin/bash
# =============================================================================
# Neo4j DR Test Server Startup Script
# =============================================================================
# Based on the production startup script, adapted for disaster recovery testing.
# Key differences from production:
# - Device name: neo4j-dr-data (not neo4j-prod-data)
# - Auth reset: removes existing auth files so DR_PASSWORD takes effect
# - Container name: neo4j-dr (not neo4j-prod)
# =============================================================================

set -euo pipefail

# Mount the data disk (restored from production snapshot)
DATA_DISK="/dev/disk/by-id/google-neo4j-dr-data"
MOUNT_POINT="/data"

# Create mount point
mkdir -p "${MOUNT_POINT}"

# Format only when the device has no filesystem signature; a snapshot-restored
# disk already carries one, so this is a safety net for blank disks.
if ! blkid "${DATA_DISK}"; then
  echo "Formatting data disk..."
  mkfs.ext4 -F "${DATA_DISK}"
fi

# Mount disk only if not already mounted
if ! mountpoint -q "${MOUNT_POINT}"; then
  mount "${DATA_DISK}" "${MOUNT_POINT}"
else
  echo "Data disk already mounted at ${MOUNT_POINT}"
fi

# Add to fstab for persistence across reboots
if ! grep -q "${DATA_DISK}" /etc/fstab; then
  echo "${DATA_DISK} ${MOUNT_POINT} ext4 defaults 0 2" >> /etc/fstab
fi

# Create Neo4j directories
mkdir -p "${MOUNT_POINT}/neo4j/data" "${MOUNT_POINT}/neo4j/logs" "${MOUNT_POINT}/neo4j/plugins"

# Reset auth files so the new NEO4J_AUTH takes effect on restored data
# (production snapshot has its own auth credentials baked in).
# NOTE(review): Neo4j 5 keeps credentials in the system database rather than
# only in data/dbms/auth* — verify this reset works on the 5.26 image.
rm -f "${MOUNT_POINT}/neo4j/data/dbms/auth.ini" "${MOUNT_POINT}/neo4j/data/dbms/auth" 2>/dev/null || true

# Install Docker
apt-get update
apt-get install -y docker.io
systemctl enable docker
systemctl start docker

# Retrieve Neo4j password from metadata server, falling back to a fixed value
# so the container still starts (an auth mismatch then shows up in the test).
NEO4J_PASSWORD=$(curl -sf "http://metadata.google.internal/computeMetadata/v1/instance/attributes/neo4j-password" -H "Metadata-Flavor: Google" 2>/dev/null || echo "dr-test-password")
# NOTE(review): this logs the first 8 password characters to the serial
# console; harmless for the date-based DR password ("dr-test-") but remove it
# if this script is ever reused with a real secret.
echo "NEO4J_PASSWORD set to: ${NEO4J_PASSWORD:0:8}***"

# Pull and run Neo4j
docker pull neo4j:5.26-community

# Stop any existing container
docker stop neo4j-dr 2>/dev/null || true
docker rm neo4j-dr 2>/dev/null || true

# Run Neo4j container - DR test configuration.
# Same JVM memory and APOC settings as production.
docker run -d \
  --name neo4j-dr \
  --restart always \
  -p 7687:7687 \
  -p 7474:7474 \
  -v "${MOUNT_POINT}/neo4j/data":/data \
  -v "${MOUNT_POINT}/neo4j/logs":/logs \
  -e NEO4J_AUTH="neo4j/${NEO4J_PASSWORD}" \
  -e NEO4J_server_memory_heap_initial__size=2G \
  -e NEO4J_server_memory_heap_max__size=4G \
  -e NEO4J_server_memory_pagecache_size=1G \
  -e NEO4J_PLUGINS='["apoc"]' \
  -e NEO4J_dbms_security_procedures_unrestricted='apoc.*' \
  neo4j:5.26-community 2>&1 | tee /tmp/docker-run.log

echo "Neo4j DR test server started successfully"

# Log docker container status for debugging
sleep 5
docker logs neo4j-dr > "${MOUNT_POINT}/neo4j/docker-startup.log" 2>&1 || true
docker inspect neo4j-dr >> "${MOUNT_POINT}/neo4j/docker-startup.log" 2>&1 || true