diff --git a/.gitignore b/.gitignore index f018e29..964eb88 100644 --- a/.gitignore +++ b/.gitignore @@ -72,12 +72,10 @@ Thumbs.db # mypy .mypy_cache/ -# Terraform -terraform/.terraform/ -terraform/terraform.tfstate -terraform/terraform.tfstate.backup -terraform/terraform.tfvars -terraform/tfplan +# Monitoring volumes +docker/monitoring/grafana/data/ +docker/monitoring/prometheus/data/ +docker/monitoring/loki/data/ # Claude Code .claude/ diff --git a/Makefile b/Makefile deleted file mode 100644 index 9ad7c10..0000000 --- a/Makefile +++ /dev/null @@ -1,202 +0,0 @@ -# ============================================================================= -# Speedcam MSA - Build & Operations Makefile -# ============================================================================= - -# Configuration -GCP_PROJECT_ID ?= $(shell gcloud config get-value project 2>/dev/null) -GCP_REGION ?= asia-northeast3 -GCP_ZONE ?= asia-northeast3-a -REGISTRY ?= $(GCP_REGION)-docker.pkg.dev/$(GCP_PROJECT_ID)/speedcam - -# Image Tags -TAG ?= latest -MAIN_IMAGE = $(REGISTRY)/main:$(TAG) -OCR_IMAGE = $(REGISTRY)/ocr:$(TAG) -ALERT_IMAGE = $(REGISTRY)/alert:$(TAG) - -# Colors for output -GREEN = \033[0;32m -YELLOW = \033[0;33m -RED = \033[0;31m -NC = \033[0m - -.PHONY: help build push clean tf-init tf-plan tf-apply tf-destroy tf-output restart-services restart-main restart-ocr restart-alert status health dev-up dev-down dev-logs dev-build - -# ============================================================================= -# Help -# ============================================================================= - -help: ## Show this help message - @echo "Usage: make [target]" - @echo "" - @echo "Targets:" - @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-20s\033[0m %s\n", $$1, $$2}' - @echo "" - @echo "Examples:" - @echo " make build push restart-services # Build, push, and restart (image update)" - @echo " make tf-plan # Preview infrastructure 
changes" - @echo " make tf-apply # Apply infrastructure changes" - @echo " make TAG=v1.0.0 build push # Build and push with specific tag" - -# ============================================================================= -# Build -# ============================================================================= - -build: build-main build-ocr build-alert ## Build all Docker images - -build-main: ## Build main service image - @echo "$(GREEN)Building main image...$(NC)" - docker build --platform linux/amd64 \ - -t $(MAIN_IMAGE) \ - -f docker/Dockerfile.main . - -build-ocr: ## Build OCR worker image - @echo "$(GREEN)Building OCR image...$(NC)" - docker build --platform linux/amd64 \ - -t $(OCR_IMAGE) \ - -f docker/Dockerfile.ocr . - -build-alert: ## Build alert worker image - @echo "$(GREEN)Building alert image...$(NC)" - docker build --platform linux/amd64 \ - -t $(ALERT_IMAGE) \ - -f docker/Dockerfile.alert . - -# ============================================================================= -# Push -# ============================================================================= - -push: push-main push-ocr push-alert ## Push all Docker images - -push-main: ## Push main service image - @echo "$(GREEN)Pushing main image...$(NC)" - docker push $(MAIN_IMAGE) - -push-ocr: ## Push OCR worker image - @echo "$(GREEN)Pushing OCR image...$(NC)" - docker push $(OCR_IMAGE) - -push-alert: ## Push alert worker image - @echo "$(GREEN)Pushing alert image...$(NC)" - docker push $(ALERT_IMAGE) - -# ============================================================================= -# Terraform (Infrastructure Management) -# ============================================================================= - -TF_DIR = terraform - -tf-init: ## Initialize Terraform - cd $(TF_DIR) && terraform init - -tf-plan: ## Preview infrastructure changes - cd $(TF_DIR) && terraform plan - -tf-apply: ## Apply infrastructure changes - cd $(TF_DIR) && terraform apply - -tf-destroy: ## Destroy all infrastructure - cd 
$(TF_DIR) && terraform destroy - -tf-output: ## Show Terraform outputs - cd $(TF_DIR) && terraform output - -# ============================================================================= -# Operations -# ============================================================================= - -restart-services: ## Restart all service instances - @echo "$(GREEN)Restarting services...$(NC)" - gcloud compute instances reset speedcam-main speedcam-ocr speedcam-alert --zone=$(GCP_ZONE) - -restart-main: ## Restart main service - gcloud compute instances reset speedcam-main --zone=$(GCP_ZONE) - -restart-ocr: ## Restart OCR worker - gcloud compute instances reset speedcam-ocr --zone=$(GCP_ZONE) - -restart-alert: ## Restart alert worker - gcloud compute instances reset speedcam-alert --zone=$(GCP_ZONE) - -status: ## Show deployment status - @echo "$(GREEN)Instance Status:$(NC)" - @gcloud compute instances list --filter="name~speedcam" \ - --format="table(name,zone,machineType,status,networkInterfaces[0].networkIP,networkInterfaces[0].accessConfigs[0].natIP)" - @echo "" - @echo "$(GREEN)Service URLs:$(NC)" - @MAIN_IP=$$(gcloud compute instances describe speedcam-main --zone=$(GCP_ZONE) --format='get(networkInterfaces[0].accessConfigs[0].natIP)' 2>/dev/null); \ - RMQ_IP=$$(gcloud compute instances describe speedcam-rabbitmq --zone=$(GCP_ZONE) --format='get(networkInterfaces[0].accessConfigs[0].natIP)' 2>/dev/null); \ - echo " API: http://$$MAIN_IP:8000/"; \ - echo " Swagger: http://$$MAIN_IP:8000/swagger/"; \ - echo " Health: http://$$MAIN_IP:8000/health/"; \ - echo " RabbitMQ: http://$$RMQ_IP:15672/" - -health: ## Check health of all services - @echo "$(GREEN)Checking health...$(NC)" - @MAIN_IP=$$(gcloud compute instances describe speedcam-main --zone=$(GCP_ZONE) --format='get(networkInterfaces[0].accessConfigs[0].natIP)' 2>/dev/null); \ - curl -s http://$$MAIN_IP:8000/health/ && echo "" - -logs-main: ## Show main service logs - gcloud compute ssh speedcam-main --zone=$(GCP_ZONE) \ 
- --command="docker logs \$$(docker ps -q) 2>&1 | tail -50" - -logs-ocr: ## Show OCR worker logs - gcloud compute ssh speedcam-ocr --zone=$(GCP_ZONE) \ - --command="docker logs \$$(docker ps -q) 2>&1 | tail -50" - -logs-alert: ## Show alert worker logs - gcloud compute ssh speedcam-alert --zone=$(GCP_ZONE) \ - --command="docker logs \$$(docker ps -q) 2>&1 | tail -50" - -ssh-main: ## SSH into main instance - gcloud compute ssh speedcam-main --zone=$(GCP_ZONE) - -ssh-ocr: ## SSH into OCR instance - gcloud compute ssh speedcam-ocr --zone=$(GCP_ZONE) - -ssh-alert: ## SSH into alert instance - gcloud compute ssh speedcam-alert --zone=$(GCP_ZONE) - -# ============================================================================= -# Cleanup -# ============================================================================= - -clean: ## Destroy infrastructure (use 'make tf-destroy' instead) - @echo "$(YELLOW)Infrastructure is now managed by Terraform.$(NC)" - @echo "$(YELLOW)Please use 'make tf-destroy' to destroy all resources.$(NC)" - -clean-services: ## Note: Services are managed by Terraform - @echo "$(YELLOW)Services are now managed by Terraform.$(NC)" - @echo "$(YELLOW)Please use 'make tf-destroy' to destroy all resources.$(NC)" - -clean-infra: ## Note: Infrastructure is managed by Terraform - @echo "$(YELLOW)Infrastructure is now managed by Terraform.$(NC)" - @echo "$(YELLOW)Please use 'make tf-destroy' to destroy all resources.$(NC)" - -clean-firewall: ## Note: Firewall rules are managed by Terraform - @echo "$(YELLOW)Firewall rules are now managed by Terraform.$(NC)" - @echo "$(YELLOW)Please use 'make tf-destroy' to destroy all resources.$(NC)" - -clean-registry: ## Delete Artifact Registry (not managed by Terraform) - @echo "$(RED)Deleting Artifact Registry...$(NC)" - -gcloud artifacts repositories delete speedcam --location=$(GCP_REGION) --quiet 2>/dev/null || true - -clean-all: clean clean-registry ## Note: Infrastructure is managed by Terraform - @echo 
"$(YELLOW)Infrastructure is now managed by Terraform.$(NC)" - @echo "$(YELLOW)Use 'make tf-destroy' to destroy infrastructure, then 'make clean-registry' for registry.$(NC)" - -# ============================================================================= -# Local Development -# ============================================================================= - -dev-up: ## Start local development environment - docker-compose -f docker/docker-compose.yml up -d - -dev-down: ## Stop local development environment - docker-compose -f docker/docker-compose.yml down - -dev-logs: ## Show local development logs - docker-compose -f docker/docker-compose.yml logs -f - -dev-build: ## Build local development images - docker-compose -f docker/docker-compose.yml build diff --git a/backend.env.example b/backend.env.example index 171facf..bc51cb1 100644 --- a/backend.env.example +++ b/backend.env.example @@ -83,16 +83,11 @@ LOG_LEVEL=info CORS_ALLOWED_ORIGINS=http://localhost:5173,http://localhost:3000 # =========================================== -# DataDog 설정 (Optional) -# =========================================== -# DataDog 모니터링을 사용하려면 아래 주석을 해제하고 설정 -# DD_API_KEY=your-datadog-api-key -# DD_SITE=ap1.datadoghq.com -# DD_AGENT_HOST=datadog-agent -# DD_TRACE_AGENT_PORT=8126 -# DD_ENV=dev -# DD_SERVICE=speedcam -# DD_LOGS_INJECTION=true -# DD_TRACE_SAMPLE_RATE=1 -# DD_PROFILING_ENABLED=true -# _DD_TRACE_WRITER_NATIVE=false # gunicorn prefork 호환 (ddtrace v4+) +# OpenTelemetry 설정 +# =========================================== +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc +OTEL_RESOURCE_ATTRIBUTES=service.namespace=speedcam,deployment.environment=dev +# Valid values: always_on, always_off, traceidratio, parentbased_always_on, parentbased_always_off, parentbased_traceidratio +OTEL_TRACES_SAMPLER=parentbased_always_on +OTEL_PYTHON_LOG_CORRELATION=true diff --git a/config/settings/base.py b/config/settings/base.py index f54208b..7c325de 100644 
--- a/config/settings/base.py +++ b/config/settings/base.py @@ -29,6 +29,7 @@ "django_filters", "drf_yasg", "django_celery_results", + "django_prometheus", # Apps "apps.vehicles", "apps.detections", @@ -36,6 +37,7 @@ ] MIDDLEWARE = [ + "django_prometheus.middleware.PrometheusBeforeMiddleware", "corsheaders.middleware.CorsMiddleware", "django.middleware.security.SecurityMiddleware", "django.contrib.sessions.middleware.SessionMiddleware", @@ -44,6 +46,7 @@ "django.contrib.auth.middleware.AuthenticationMiddleware", "django.contrib.messages.middleware.MessageMiddleware", "django.middleware.clickjacking.XFrameOptionsMiddleware", + "django_prometheus.middleware.PrometheusAfterMiddleware", ] ROOT_URLCONF = "config.urls" @@ -155,8 +158,9 @@ "disable_existing_loggers": False, "formatters": { "verbose": { - "format": "{levelname} {asctime} {module} {message}", + "format": "{levelname} {asctime} {module} [trace_id={otelTraceID} span_id={otelSpanID}] {message}", "style": "{", + "defaults": {"otelTraceID": "0", "otelSpanID": "0"}, }, }, "handlers": { diff --git a/config/urls.py b/config/urls.py index 5117ec7..bb61067 100644 --- a/config/urls.py +++ b/config/urls.py @@ -92,6 +92,8 @@ def health(request): re_path( r"^redoc/$", schema_view.with_ui("redoc", cache_timeout=0), name="schema-redoc" ), + # Prometheus Metrics + path("", include("django_prometheus.urls")), # API v1 path("api/v1/", include("apps.vehicles.urls")), path("api/v1/", include("apps.detections.urls")), diff --git a/datadog.env.example b/datadog.env.example deleted file mode 100644 index cc663a0..0000000 --- a/datadog.env.example +++ /dev/null @@ -1,13 +0,0 @@ -# =========================================== -# DataDog Agent Environment Variables -# =========================================== -# 사용법: 이 파일을 datadog.env로 복사하여 사용 -# cp datadog.env.example datadog.env - -DD_API_KEY=your-datadog-api-key -DD_SITE=ap1.datadoghq.com -DD_APM_ENABLED=true -DD_APM_NON_LOCAL_TRAFFIC=true -DD_DOGSTATSD_NON_LOCAL_TRAFFIC=true 
-DD_LOGS_ENABLED=true -DD_ENV=dev diff --git a/docker/docker-compose.monitoring.yml b/docker/docker-compose.monitoring.yml new file mode 100644 index 0000000..03d74d7 --- /dev/null +++ b/docker/docker-compose.monitoring.yml @@ -0,0 +1,170 @@ +services: + # =========================================== + # OpenTelemetry Collector + # =========================================== + otel-collector: + image: otel/opentelemetry-collector-contrib:0.98.0 + container_name: speedcam-otel-collector + command: ["--config", "/etc/otel-collector-config.yml"] + volumes: + - ./monitoring/otel-collector/otel-collector-config.yml:/etc/otel-collector-config.yml:ro + ports: + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + - "8889:8889" # Prometheus exporter + networks: + - speedcam-network + + # =========================================== + # Jaeger (Distributed Tracing) + # =========================================== + jaeger: + image: jaegertracing/all-in-one:1.57 + container_name: speedcam-jaeger + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "16686:16686" # Jaeger UI + - "14250:14250" # gRPC (collector) + networks: + - speedcam-network + + # =========================================== + # Prometheus (Metrics) + # =========================================== + prometheus: + image: prom/prometheus:v2.51.2 + container_name: speedcam-prometheus + volumes: + - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./monitoring/prometheus/data:/prometheus + ports: + - "9090:9090" + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time=15d" + - "--web.enable-remote-write-receiver" + networks: + - speedcam-network + + # =========================================== + # Grafana (Dashboards) + # =========================================== + grafana: + image: grafana/grafana:10.4.2 + container_name: speedcam-grafana + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - 
GF_USERS_ALLOW_SIGN_UP=false + volumes: + - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - ./monitoring/grafana/data:/var/lib/grafana + ports: + - "3000:3000" + depends_on: + - prometheus + - jaeger + - loki + networks: + - speedcam-network + + # =========================================== + # Loki (Log Aggregation) + # =========================================== + loki: + image: grafana/loki:2.9.6 + container_name: speedcam-loki + volumes: + - ./monitoring/loki/loki-config.yml:/etc/loki/local-config.yaml:ro + - ./monitoring/loki/data:/loki + ports: + - "3100:3100" + command: -config.file=/etc/loki/local-config.yaml + networks: + - speedcam-network + + # =========================================== + # Promtail (Log Shipper -> Loki) + # =========================================== + promtail: + image: grafana/promtail:2.9.6 + container_name: speedcam-promtail + volumes: + - ./monitoring/promtail/promtail-config.yml:/etc/promtail/config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + command: -config.file=/etc/promtail/config.yml + depends_on: + - loki + networks: + - speedcam-network + + # =========================================== + # MySQL Exporter + # =========================================== + mysqld-exporter: + image: prom/mysqld-exporter:v0.15.1 + container_name: speedcam-mysqld-exporter + volumes: + - ./monitoring/mysqld-exporter/.my.cnf:/cfg/.my.cnf:ro + command: + - "--config.my-cnf=/cfg/.my.cnf" + ports: + - "9104:9104" + restart: unless-stopped + networks: + - speedcam-network + + # =========================================== + # Celery Exporter (Queue/Task Metrics) + # =========================================== + celery-exporter: + image: danihodovic/celery-exporter:0.10.3 + container_name: speedcam-celery-exporter + environment: + CE_BROKER_URL: "amqp://sa:1234@rabbitmq:5672//" + ports: + - "9808:9808" + restart: unless-stopped + networks: + - speedcam-network 
+ + # =========================================== + # cAdvisor (Container Metrics) + # =========================================== + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.49.1 + container_name: speedcam-cadvisor + profiles: + - linux + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + ports: + - "8080:8080" + networks: + - speedcam-network + + # =========================================== + # K6 Load Test Runner (on-demand) + # =========================================== + k6: + image: grafana/k6:latest + container_name: speedcam-k6 + profiles: + - loadtest + volumes: + - ./k6:/scripts + environment: + K6_PROMETHEUS_RW_SERVER_URL: http://prometheus:9090/api/v1/write + K6_PROMETHEUS_RW_TREND_AS_NATIVE_HISTOGRAM: "true" + MAIN_SERVICE_URL: http://main:8000 + networks: + - speedcam-network + +networks: + speedcam-network: + external: true diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 8d9870a..94270e4 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -29,10 +29,11 @@ services: - "5672:5672" - "1883:1883" - "15672:15672" + - "15692:15692" volumes: - rabbitmq_data:/var/lib/rabbitmq command: > - bash -c "rabbitmq-plugins enable --offline rabbitmq_mqtt && + bash -c "rabbitmq-plugins enable --offline rabbitmq_mqtt rabbitmq_prometheus && rabbitmq-server" healthcheck: test: ["CMD", "rabbitmq-diagnostics", "check_running"] @@ -109,26 +110,11 @@ services: networks: - speedcam-network - # =========================================== - # Monitoring (Optional) - # =========================================== - datadog-agent: - image: gcr.io/datadoghq/agent:7 - container_name: speedcam-datadog - profiles: - - monitoring - env_file: - - ../datadog.env - ports: - - "8125:8125/udp" - - "8126:8126" - networks: - - speedcam-network - volumes: mysql_data: rabbitmq_data: networks: speedcam-network: + name: speedcam-network driver: bridge diff --git a/docker/k6/load-test.js 
b/docker/k6/load-test.js new file mode 100644 index 0000000..999909b --- /dev/null +++ b/docker/k6/load-test.js @@ -0,0 +1,115 @@ +import http from 'k6/http'; +import { check, group, sleep } from 'k6'; +import { Rate, Trend } from 'k6/metrics'; + +// Custom metrics +const errorRate = new Rate('errors'); +const vehicleCreateDuration = new Trend('vehicle_create_duration', true); + +const BASE_URL = __ENV.MAIN_SERVICE_URL || 'http://main:8000'; + +export const options = { + scenarios: { + // Scenario 1: Smoke test (basic connectivity) + smoke: { + executor: 'constant-vus', + vus: 1, + duration: '10s', + startTime: '0s', + tags: { scenario: 'smoke' }, + }, + // Scenario 2: Average load + average_load: { + executor: 'ramping-vus', + startVUs: 0, + stages: [ + { duration: '30s', target: 10 }, // ramp up + { duration: '1m', target: 10 }, // steady + { duration: '10s', target: 0 }, // ramp down + ], + startTime: '15s', + tags: { scenario: 'average_load' }, + }, + // Scenario 3: Spike test + spike: { + executor: 'ramping-vus', + startVUs: 0, + stages: [ + { duration: '5s', target: 30 }, // spike up + { duration: '15s', target: 30 }, // hold spike + { duration: '5s', target: 0 }, // recover + ], + startTime: '2m', + tags: { scenario: 'spike' }, + }, + }, + thresholds: { + http_req_duration: ['p(95)<500'], // 95% of requests under 500ms + errors: ['rate<0.1'], // error rate under 10% + }, +}; + +// Helper to generate random Korean plate number +function randomPlate() { + const nums1 = Math.floor(Math.random() * 900) + 100; + const chars = '가나다라마바사아자차카타파하'; + const char = chars.charAt(Math.floor(Math.random() * chars.length)); + const nums2 = Math.floor(Math.random() * 9000) + 1000; + return `${nums1}${char}${nums2}`; +} + +export default function () { + group('Health Check', function () { + const res = http.get(`${BASE_URL}/health/`); + check(res, { + 'health status 200': (r) => r.status === 200, + 'health is healthy': (r) => r.json('status') === 'healthy', + }); + 
errorRate.add(res.status !== 200); + }); + + group('Vehicle CRUD', function () { + // Create + const plate = randomPlate(); + const createPayload = JSON.stringify({ + plate_number: plate, + owner_name: `테스트유저_${__VU}`, + owner_phone: `010-${Math.floor(Math.random() * 9000) + 1000}-${Math.floor(Math.random() * 9000) + 1000}`, + }); + + const createRes = http.post(`${BASE_URL}/api/v1/vehicles/`, createPayload, { + headers: { 'Content-Type': 'application/json' }, + }); + + check(createRes, { + 'vehicle created 201': (r) => r.status === 201, + }); + errorRate.add(createRes.status !== 201); + vehicleCreateDuration.add(createRes.timings.duration); + + // List + const listRes = http.get(`${BASE_URL}/api/v1/vehicles/`); + check(listRes, { + 'vehicle list 200': (r) => r.status === 200, + }); + errorRate.add(listRes.status !== 200); + }); + + group('Detections Read', function () { + const res = http.get(`${BASE_URL}/api/v1/detections/`); + check(res, { + 'detections list 200': (r) => r.status === 200, + }); + errorRate.add(res.status !== 200); + }); + + group('Notifications Read', function () { + const res = http.get(`${BASE_URL}/api/v1/notifications/`); + check(res, { + 'notifications list 200': (r) => r.status === 200, + }); + errorRate.add(res.status !== 200); + }); + + sleep(1); +} diff --git a/docker/k6/mqtt-load-test.py b/docker/k6/mqtt-load-test.py new file mode 100644 index 0000000..89b836e --- /dev/null +++ b/docker/k6/mqtt-load-test.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +MQTT Load Test - IoT Device Simulation + +Simulates Raspberry Pi cameras sending detection messages via MQTT. 
+Full pipeline: MQTT → Detection (pending) → OCR Worker → Alert Worker +""" + +import argparse +import json +import os +import random +import threading +import time +from datetime import datetime, timedelta, timezone + +import paho.mqtt.client as mqtt + +# Config from environment +MQTT_HOST = os.getenv("MQTT_HOST", "rabbitmq") +MQTT_PORT = int(os.getenv("MQTT_PORT", "1883")) +MQTT_USER = os.getenv("MQTT_USER", "sa") +MQTT_PASS = os.getenv("MQTT_PASS", "1234") +TOPIC = "detections/new" + +# Locations for realistic simulation +LOCATIONS = [ + "서울시 강남구 테헤란로", + "서울시 서초구 반포대로", + "서울시 송파구 올림픽로", + "경기도 성남시 분당구 판교역로", + "인천시 연수구 송도대로", + "서울시 마포구 월드컵북로", + "서울시 영등포구 여의대방로", + "부산시 해운대구 해운대로", +] + +CAMERA_IDS = [f"CAM-{str(i).zfill(3)}" for i in range(1, 21)] + +# Stats +stats = { + "published": 0, + "failed": 0, + "total_latency_ms": 0, + "start_time": None, +} +stats_lock = threading.Lock() + + +def generate_message(): + """Generate a realistic detection message.""" + kst = timezone(timedelta(hours=9)) + speed_limit = random.choice([60.0, 80.0, 100.0, 110.0]) + detected_speed = speed_limit + random.uniform(5, 50) + + return json.dumps( + { + "camera_id": random.choice(CAMERA_IDS), + "location": random.choice(LOCATIONS), + "detected_speed": round(detected_speed, 1), + "speed_limit": speed_limit, + "detected_at": datetime.now(kst).isoformat(), + "image_gcs_uri": ( + f"gs://speedcam-bucket/detections/" + f"{int(time.time() * 1000)}-{random.randint(1000, 9999)}.jpg" + ), + } + ) + + +def publish_worker(worker_id, rate_per_sec, duration_sec): + """Single worker thread that publishes MQTT messages.""" + client = mqtt.Client( + callback_api_version=mqtt.CallbackAPIVersion.VERSION2, + protocol=mqtt.MQTTv311, + client_id=f"loadtest-{worker_id}-{os.getpid()}", + ) + client.username_pw_set(MQTT_USER, MQTT_PASS) + + try: + client.connect(MQTT_HOST, MQTT_PORT, keepalive=60) + client.loop_start() + except Exception as e: + print(f"[Worker-{worker_id}] Connection failed: {e}") + 
with stats_lock: + stats["failed"] += 1 + return + + interval = 1.0 / rate_per_sec if rate_per_sec > 0 else 1.0 + end_time = time.time() + duration_sec + + while time.time() < end_time: + msg = generate_message() + start = time.time() + result = client.publish(TOPIC, msg, qos=1) + + if result.rc == mqtt.MQTT_ERR_SUCCESS: + latency_ms = (time.time() - start) * 1000 + with stats_lock: + stats["published"] += 1 + stats["total_latency_ms"] += latency_ms + else: + with stats_lock: + stats["failed"] += 1 + + elapsed = time.time() - start + sleep_time = max(0, interval - elapsed) + if sleep_time > 0: + time.sleep(sleep_time) + + client.loop_stop() + client.disconnect() + + +def print_stats(): + """Print periodic stats.""" + elapsed = time.time() - stats["start_time"] + published = stats["published"] + failed = stats["failed"] + total = published + failed + rate = published / elapsed if elapsed > 0 else 0 + avg_latency = stats["total_latency_ms"] / published if published > 0 else 0 + + print(f"\n{'='*60}") + print(f" Elapsed: {elapsed:.1f}s | Published: {published} | Failed: {failed}") + print(f" Rate: {rate:.1f} msg/s | Avg Latency: {avg_latency:.2f}ms") + print(f" Error Rate: {(failed/total*100) if total > 0 else 0:.2f}%") + print(f"{'='*60}") + + +def run_load_test(workers, rate_per_worker, duration): + """Run the load test with multiple workers.""" + print("\n MQTT Load Test Starting") + print(f" Host: {MQTT_HOST}:{MQTT_PORT}") + print(f" Workers: {workers}") + print( + f" Rate: {rate_per_worker}/s per worker ({workers * rate_per_worker}/s total)" + ) + print(f" Duration: {duration}s") + print(f" Topic: {TOPIC}") + print() + + stats["start_time"] = time.time() + threads = [] + + for i in range(workers): + t = threading.Thread( + target=publish_worker, + args=(i, rate_per_worker, duration), + ) + t.start() + threads.append(t) + + # Print stats periodically + monitor_end = time.time() + duration + while time.time() < monitor_end: + time.sleep(5) + print_stats() + + for t 
in threads: + t.join(timeout=10) + + print("\n FINAL RESULTS") + print_stats() + + +def main(): + parser = argparse.ArgumentParser(description="MQTT Load Test") + parser.add_argument( + "--workers", type=int, default=5, help="Number of concurrent workers" + ) + parser.add_argument( + "--rate", type=int, default=2, help="Messages per second per worker" + ) + parser.add_argument( + "--duration", type=int, default=60, help="Test duration in seconds" + ) + args = parser.parse_args() + + run_load_test(args.workers, args.rate, args.duration) + + +if __name__ == "__main__": + main() diff --git a/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml b/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..3063794 --- /dev/null +++ b/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: "default" + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: /etc/grafana/provisioning/dashboards + foldersFromFilesStructure: false diff --git a/docker/monitoring/grafana/provisioning/datasources/datasources.yml b/docker/monitoring/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000..19e94ac --- /dev/null +++ b/docker/monitoring/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,29 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + + - name: Jaeger + type: jaeger + uid: jaeger + access: proxy + url: http://jaeger:16686 + editable: false + + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + editable: false + jsonData: + derivedFields: + - datasourceUid: jaeger + matcherRegex: "trace_id=(\\w+)" + name: TraceID + url: "$${__value.raw}" + datasourceName: Jaeger diff --git a/docker/monitoring/loki/loki-config.yml b/docker/monitoring/loki/loki-config.yml new file 
mode 100644 index 0000000..39c9a79 --- /dev/null +++ b/docker/monitoring/loki/loki-config.yml @@ -0,0 +1,37 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 168h # 7 days + max_global_streams_per_user: 10000 + ingestion_burst_size_mb: 16 + ingestion_rate_mb: 8 + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h diff --git a/docker/monitoring/mysqld-exporter/.my.cnf b/docker/monitoring/mysqld-exporter/.my.cnf new file mode 100644 index 0000000..d314180 --- /dev/null +++ b/docker/monitoring/mysqld-exporter/.my.cnf @@ -0,0 +1,5 @@ +[client] +user=sa +password=1234 +host=mysql +port=3306 diff --git a/docker/monitoring/otel-collector/otel-collector-config.yml b/docker/monitoring/otel-collector/otel-collector-config.yml new file mode 100644 index 0000000..56d2256 --- /dev/null +++ b/docker/monitoring/otel-collector/otel-collector-config.yml @@ -0,0 +1,39 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 5s + send_batch_size: 1024 + resource: + attributes: + - key: service.namespace + value: speedcam + action: upsert + +exporters: + otlp/jaeger: + endpoint: jaeger:4317 + tls: + insecure: true + prometheus: + endpoint: 0.0.0.0:8889 + namespace: speedcam + resource_to_telemetry_conversion: + enabled: true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlp/jaeger] + metrics: + receivers: [otlp] + processors: [batch, resource] + exporters: [prometheus] diff --git 
a/docker/monitoring/prometheus/prometheus.yml b/docker/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..39a18f9 --- /dev/null +++ b/docker/monitoring/prometheus/prometheus.yml @@ -0,0 +1,34 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + # --- Application --- + - job_name: "django" + metrics_path: /metrics + static_configs: + - targets: ["main:8000"] + + # --- OpenTelemetry Collector --- + - job_name: "otel-collector" + static_configs: + - targets: ["otel-collector:8889"] + + # --- Infrastructure --- + - job_name: "rabbitmq" + static_configs: + - targets: ["rabbitmq:15692"] + + - job_name: "mysql" + static_configs: + - targets: ["mysqld-exporter:9104"] + + # --- Workers --- + - job_name: "celery" + static_configs: + - targets: ["celery-exporter:9808"] + + # --- Container Resources --- + - job_name: "cadvisor" + static_configs: + - targets: ["cadvisor:8080"] diff --git a/docker/monitoring/promtail/promtail-config.yml b/docker/monitoring/promtail/promtail-config.yml new file mode 100644 index 0000000..4a64917 --- /dev/null +++ b/docker/monitoring/promtail/promtail-config.yml @@ -0,0 +1,31 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: name + values: + - "speedcam-.*" + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + - source_labels: ["__meta_docker_container_name"] + regex: "/speedcam-(.*)" + target_label: "service" + pipeline_stages: + - regex: + expression: ".*trace_id=(?P<trace_id>[a-fA-F0-9]+).*" + - labels: + trace_id: diff --git a/docker/rabbitmq/enabled_plugins b/docker/rabbitmq/enabled_plugins deleted file mode 100644 index 5358cb0..0000000 --- a/docker/rabbitmq/enabled_plugins +++ /dev/null 
@@ -1 +0,0 @@ -[rabbitmq_management,rabbitmq_mqtt]. diff --git a/docker/rabbitmq/rabbitmq.conf b/docker/rabbitmq/rabbitmq.conf deleted file mode 100644 index 8192f11..0000000 --- a/docker/rabbitmq/rabbitmq.conf +++ /dev/null @@ -1,13 +0,0 @@ -# MQTT Plugin 설정 -mqtt.listeners.tcp.default = 1883 -mqtt.allow_anonymous = false -mqtt.default_user = sa -mqtt.default_pass = 1234 -mqtt.vhost = / -mqtt.exchange = amq.topic -mqtt.subscription_ttl = 86400000 -mqtt.prefetch = 10 - -# Management Plugin -management.tcp.port = 15672 - diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index e1af5e8..18cbdc5 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -1,12 +1,19 @@ -# GCP 배포 가이드 +# GCP 멀티 인스턴스 배포 가이드 ## 목차 + 1. [사전 요구사항](#1-사전-요구사항) 2. [시스템 아키텍처](#2-시스템-아키텍처) -3. [배포 단계](#3-배포-단계) -4. [배포 검증](#4-배포-검증) -5. [운영 가이드](#5-운영-가이드) -6. [트러블슈팅](#6-트러블슈팅) +3. [GCP 인프라 설정](#3-gcp-인프라-설정) +4. [배포 디렉토리 구조](#4-배포-디렉토리-구조) +5. [Docker Compose 파일](#5-docker-compose-파일) +6. [설정 파일](#6-설정-파일) +7. [Docker 이미지 빌드 및 배포](#7-docker-이미지-빌드-및-배포) +8. [배포 순서](#8-배포-순서) +9. [배포 검증](#9-배포-검증) +10. [운영 가이드](#10-운영-가이드) +11. [트러블슈팅](#11-트러블슈팅) +12. 
[리소스 정리](#12-리소스-정리) --- @@ -18,8 +25,7 @@ |------|------|----------| | Google Cloud SDK | 최신 | https://cloud.google.com/sdk/docs/install | | Docker | 20.10+ | https://docs.docker.com/get-docker/ | -| Terraform | 1.5+ | https://developer.hashicorp.com/terraform/downloads | -| Make | 3.81+ | 기본 설치됨 (macOS/Linux) | +| Docker Compose | 2.0+ | Docker Desktop 포함 또는 별도 설치 | ### 1.2 GCP 프로젝트 설정 @@ -39,12 +45,12 @@ gcloud services enable artifactregistry.googleapis.com ```bash # 프로젝트 설정 -export GCP_PROJECT_ID=your-project-id +export GCP_PROJECT_ID= export GCP_REGION=asia-northeast3 export GCP_ZONE=asia-northeast3-a -# Docker Registry -export REGISTRY=${GCP_REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/speedcam +# Docker Registry (Artifact Registry 사용) +export ARTIFACT_REGISTRY=${GCP_REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/speedcam ``` --- @@ -61,13 +67,14 @@ graph TB end subgraph GCP["Google Cloud Platform (asia-northeast3)"] - subgraph VPC["VPC Network (default)"] + subgraph VPC["VPC Network"] subgraph Compute["GCE Instances"] - RMQ[speedcam-rabbitmq
e2-small<br/>10.178.0.2]
- MySQL[speedcam-mysql<br/>e2-small<br/>10.178.0.3]
- Main[speedcam-main<br/>e2-medium<br/>10.178.0.4]
- OCR[speedcam-ocr<br/>e2-medium<br/>10.178.0.5]
- Alert[speedcam-alert<br/>e2-small<br/>10.178.0.6]
+ DB[speedcam-db<br/>e2-highmem-2<br/>MySQL + Exporters]
+ MQ[speedcam-mq<br/>e2-medium<br/>RabbitMQ]
+ App[speedcam-app<br/>e2-medium<br/>Django API + Flower]
+ OCR[speedcam-ocr<br/>e2-standard-2<br/>OCR Worker]
+ Alert[speedcam-alert<br/>e2-small<br/>Alert Worker]
+ Mon[speedcam-mon<br/>e2-standard-2<br/>
Monitoring Stack] end end @@ -79,22 +86,33 @@ graph TB FCM[Firebase FCM] end - Client -->|HTTP :8000| Main - Pi -->|MQTT :1883| RMQ + Client -->|HTTP :8000| App + Pi -->|MQTT :1883| MQ Pi -->|Upload| GCS - Main -->|AMQP :5672| RMQ - Main -->|MySQL :3306| MySQL + App -->|AMQP :5672| MQ + App -->|MySQL :3306| DB - OCR -->|AMQP| RMQ - OCR -->|MySQL| MySQL + OCR -->|AMQP| MQ + OCR -->|MySQL| DB OCR -->|Download| GCS - Alert -->|AMQP| RMQ - Alert -->|MySQL| MySQL + Alert -->|AMQP| MQ + Alert -->|MySQL| DB Alert -->|Push| FCM - AR -.->|Pull Image| Main + Mon -->|Scrape Metrics| DB + Mon -->|Scrape Metrics| MQ + Mon -->|Scrape Metrics| App + Mon -->|Scrape Metrics| OCR + Mon -->|Scrape Metrics| Alert + Mon -->|Collect Logs| DB + Mon -->|Collect Logs| MQ + Mon -->|Collect Logs| App + Mon -->|Collect Logs| OCR + Mon -->|Collect Logs| Alert + + AR -.->|Pull Image| App AR -.->|Pull Image| OCR AR -.->|Pull Image| Alert ``` @@ -182,351 +200,1262 @@ erDiagram ### 2.4 인스턴스 사양 -| 인스턴스 | 역할 | Machine Type | vCPU | Memory | 포트 | -|----------|------|--------------|------|--------|------| -| speedcam-rabbitmq | Message Broker | e2-small | 0.5-2 | 2GB | 5672, 1883, 15672 | -| speedcam-mysql | Database | e2-small | 0.5-2 | 2GB | 3306 | -| speedcam-main | Django API + MQTT | e2-medium | 1-2 | 4GB | 8000 | -| speedcam-ocr | OCR Worker (prefork) | e2-medium | 1-2 | 4GB | - | -| speedcam-alert | Alert Worker (gevent) | e2-small | 0.5-2 | 2GB | - | +| 인스턴스 이름 | 역할 | 머신 타입 | vCPU | Memory | 열어야 할 포트 (내부) | +|--------------|------|----------|------|--------|---------------------| +| `speedcam-db` | MySQL + Exporters | e2-highmem-2 | 2 | 16GB | 3306, 9104, 8080 | +| `speedcam-mq` | RabbitMQ | e2-medium | 2 | 4GB | 5672, 1883, 15672, 15692, 8080 | +| `speedcam-app` | Django API + Flower | e2-medium | 2 | 4GB | 8000, 5555, 8080 | +| `speedcam-ocr` | OCR Celery Worker | e2-standard-2 | 2 | 8GB | 8080 | +| `speedcam-alert` | Alert Celery Worker | e2-small | 2 | 2GB | 8080 | +| `speedcam-mon` | 모니터링 
전체 스택 | e2-standard-2 | 2 | 8GB | 3000, 9090, 16686, 3100, 4317, 4318, 8889, 9808, 8080 | --- -## 3. 배포 단계 +## 3. GCP 인프라 설정 + +### 3.1 VPC 네트워크 생성 + +```bash +# VPC 네트워크 생성 (커스텀 모드) +gcloud compute networks create speedcam-vpc \ + --subnet-mode=custom \ + --bgp-routing-mode=regional + +# 서브넷 생성 (asia-northeast3) +gcloud compute networks subnets create speedcam-subnet \ + --network=speedcam-vpc \ + --region=${GCP_REGION} \ + --range=10.178.0.0/20 +``` + +### 3.2 방화벽 규칙 설정 -### 3.1 Step 1: Artifact Registry 설정 +#### 3.2.1 내부 통신 허용 + +```bash +# 내부 통신 허용 (VPC 내부에서만) +gcloud compute firewall-rules create speedcam-internal \ + --network=speedcam-vpc \ + --allow=tcp:3306,tcp:5672,tcp:1883,tcp:15672,tcp:15692,tcp:8000,tcp:5555,tcp:4317,tcp:4318,tcp:8889,tcp:9090,tcp:3000,tcp:16686,tcp:3100,tcp:9104,tcp:9808,tcp:8080,tcp:9080 \ + --source-ranges=10.178.0.0/20 \ + --target-tags=speedcam \ + --description="SpeedCam internal communication" +``` + +#### 3.2.2 외부 접근 허용 (필요한 서비스만) + +```bash +# Django API 외부 접근 (프론트엔드) +gcloud compute firewall-rules create speedcam-api-external \ + --network=speedcam-vpc \ + --allow=tcp:8000 \ + --source-ranges=0.0.0.0/0 \ + --target-tags=speedcam-app \ + --description="Django API external access" + +# MQTT 외부 접근 (Edge Device) +gcloud compute firewall-rules create speedcam-mqtt-external \ + --network=speedcam-vpc \ + --allow=tcp:1883 \ + --source-ranges=0.0.0.0/0 \ + --target-tags=speedcam-mq \ + --description="MQTT external access for edge devices" + +# Grafana UI 외부 접근 (운영자만) +gcloud compute firewall-rules create speedcam-grafana-external \ + --network=speedcam-vpc \ + --allow=tcp:3000 \ + --source-ranges=/32 \ + --target-tags=speedcam-mon \ + --description="Grafana external access (admin only)" +``` + +### 3.3 Artifact Registry 설정 ```bash # 저장소 생성 gcloud artifacts repositories create speedcam \ - --repository-format=docker \ - --location=${GCP_REGION} \ - --description="Speedcam MSA Docker images" + --repository-format=docker \ + 
--location=${GCP_REGION} \ + --description="Speedcam MSA Docker images" # Docker 인증 설정 gcloud auth configure-docker ${GCP_REGION}-docker.pkg.dev ``` -### 3.2 Step 2: Docker 이미지 빌드 +### 3.4 GCE 인스턴스 생성 ```bash -# linux/amd64 플랫폼으로 빌드 (GCE용) -docker build --platform linux/amd64 \ - -t ${REGISTRY}/main:latest \ - -f docker/Dockerfile.main . +# 1. speedcam-db 인스턴스 +gcloud compute instances create speedcam-db \ + --zone=${GCP_ZONE} \ + --machine-type=e2-highmem-2 \ + --network-interface=subnet=speedcam-subnet,no-address \ + --tags=speedcam \ + --metadata=startup-script='#!/bin/bash +apt-get update +apt-get install -y docker.io docker-compose +systemctl start docker +systemctl enable docker' + +# 2. speedcam-mq 인스턴스 +gcloud compute instances create speedcam-mq \ + --zone=${GCP_ZONE} \ + --machine-type=e2-medium \ + --network-interface=subnet=speedcam-subnet,no-address \ + --tags=speedcam,speedcam-mq \ + --metadata=startup-script='#!/bin/bash +apt-get update +apt-get install -y docker.io docker-compose +systemctl start docker +systemctl enable docker' + +# 3. speedcam-app 인스턴스 +gcloud compute instances create speedcam-app \ + --zone=${GCP_ZONE} \ + --machine-type=e2-medium \ + --network-interface=subnet=speedcam-subnet,no-address \ + --tags=speedcam,speedcam-app \ + --scopes=cloud-platform \ + --metadata=startup-script='#!/bin/bash +apt-get update +apt-get install -y docker.io docker-compose +systemctl start docker +systemctl enable docker' + +# 4. speedcam-ocr 인스턴스 +gcloud compute instances create speedcam-ocr \ + --zone=${GCP_ZONE} \ + --machine-type=e2-standard-2 \ + --network-interface=subnet=speedcam-subnet,no-address \ + --tags=speedcam \ + --scopes=cloud-platform \ + --metadata=startup-script='#!/bin/bash +apt-get update +apt-get install -y docker.io docker-compose +systemctl start docker +systemctl enable docker' + +# 5. 
speedcam-alert 인스턴스 +gcloud compute instances create speedcam-alert \ + --zone=${GCP_ZONE} \ + --machine-type=e2-small \ + --network-interface=subnet=speedcam-subnet,no-address \ + --tags=speedcam \ + --scopes=cloud-platform \ + --metadata=startup-script='#!/bin/bash +apt-get update +apt-get install -y docker.io docker-compose +systemctl start docker +systemctl enable docker' + +# 6. speedcam-mon 인스턴스 +gcloud compute instances create speedcam-mon \ + --zone=${GCP_ZONE} \ + --machine-type=e2-standard-2 \ + --network-interface=subnet=speedcam-subnet,no-address \ + --tags=speedcam,speedcam-mon \ + --metadata=startup-script='#!/bin/bash +apt-get update +apt-get install -y docker.io docker-compose +systemctl start docker +systemctl enable docker' +``` -docker build --platform linux/amd64 \ - -t ${REGISTRY}/ocr:latest \ - -f docker/Dockerfile.ocr . +--- -docker build --platform linux/amd64 \ - -t ${REGISTRY}/alert:latest \ - -f docker/Dockerfile.alert . +## 4. 배포 디렉토리 구조 + +각 인스턴스에 배포할 파일 구조: + +``` +deploy/ +├── env/ +│ ├── backend.env # Django/Celery 공통 환경변수 +│ ├── mysql.env # MySQL 전용 (DB 인스턴스만) +│ └── rabbitmq.env # RabbitMQ 전용 (MQ 인스턴스만) +├── compose/ +│ ├── docker-compose.db.yml +│ ├── docker-compose.mq.yml +│ ├── docker-compose.app.yml +│ ├── docker-compose.ocr.yml +│ ├── docker-compose.alert.yml +│ └── docker-compose.mon.yml +├── config/ +│ ├── mysql/ +│ │ └── init.sql +│ ├── monitoring/ +│ │ ├── otel-collector/ +│ │ │ └── otel-collector-config.yml +│ │ ├── prometheus/ +│ │ │ └── prometheus.yml +│ │ ├── loki/ +│ │ │ └── loki-config.yml +│ │ ├── promtail/ +│ │ │ ├── promtail-config.yml # 각 인스턴스용 +│ │ │ └── promtail-config.mon.yml # 모니터링 인스턴스용 +│ │ ├── grafana/ +│ │ │ └── provisioning/ +│ │ │ ├── datasources/ +│ │ │ │ └── datasources.yml +│ │ │ └── dashboards/ +│ │ │ └── dashboards.yml +│ │ └── mysqld-exporter/ +│ │ └── .my.cnf +│ └── credentials/ +│ └── (GCP, Firebase 인증 파일) +└── images/ + └── (빌드된 이미지는 Artifact Registry 사용) +``` + +--- + +## 5. 
Docker Compose 파일 + +### 5.1 로컬 → 멀티 인스턴스 주요 변경점 + +| 항목 | 로컬 (현재) | 멀티 인스턴스 | +|------|------------|--------------| +| 네트워크 | `networks: speedcam-network` (bridge) | `network_mode: host` | +| 서비스 디스커버리 | 컨테이너명 (`mysql`, `rabbitmq`) | GCP 내부 IP | +| 포트 매핑 | `ports: "3306:3306"` | 불필요 (host 모드에서 직접 바인딩) | +| depends_on | 서비스 간 의존성 | 제거 (다른 인스턴스에 있으므로) | +| 이미지 | `build: context` | Artifact Registry 이미지 | +| cAdvisor | 모니터링에 1개 | 모든 인스턴스에 1개씩 | +| Promtail | 모니터링에 1개 | 모든 인스턴스에 1개씩 | + +### 5.2 docker-compose.db.yml + +speedcam-db 인스턴스에 배포 + +```yaml +services: + mysql: + image: mysql:8.0 + container_name: speedcam-mysql + restart: always + network_mode: host + env_file: + - ../env/mysql.env + volumes: + - mysql_data:/var/lib/mysql + - ../config/mysql/init.sql:/docker-entrypoint-initdb.d/init.sql:ro + healthcheck: + test: ["CMD", "mysqladmin", "ping", "-h", "localhost"] + interval: 10s + timeout: 5s + retries: 5 + + mysqld-exporter: + image: prom/mysqld-exporter:v0.15.1 + container_name: speedcam-mysqld-exporter + restart: always + network_mode: host + volumes: + - ../config/monitoring/mysqld-exporter/.my.cnf:/cfg/.my.cnf:ro + command: + - "--config.my-cnf=/cfg/.my.cnf" + depends_on: + mysql: + condition: service_healthy + + promtail: + image: grafana/promtail:2.9.6 + container_name: speedcam-promtail + restart: always + network_mode: host + volumes: + - ../config/monitoring/promtail/promtail-config.yml:/etc/promtail/config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + command: -config.file=/etc/promtail/config.yml + + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.49.1 + container_name: speedcam-cadvisor + restart: always + network_mode: host + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + +volumes: + mysql_data: ``` -### 3.3 Step 3: Docker 이미지 푸시 +### 5.3 docker-compose.mq.yml + +speedcam-mq 인스턴스에 배포 + +```yaml +services: + 
rabbitmq: + image: rabbitmq:3.13-management + container_name: speedcam-rabbitmq + restart: always + network_mode: host + env_file: + - ../env/rabbitmq.env + volumes: + - rabbitmq_data:/var/lib/rabbitmq + command: > + bash -c "rabbitmq-plugins enable --offline rabbitmq_mqtt rabbitmq_prometheus && + rabbitmq-server" + healthcheck: + test: ["CMD", "rabbitmq-diagnostics", "check_running"] + interval: 10s + timeout: 5s + retries: 5 + + promtail: + image: grafana/promtail:2.9.6 + container_name: speedcam-promtail + restart: always + network_mode: host + volumes: + - ../config/monitoring/promtail/promtail-config.yml:/etc/promtail/config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + command: -config.file=/etc/promtail/config.yml + + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.49.1 + container_name: speedcam-cadvisor + restart: always + network_mode: host + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + +volumes: + rabbitmq_data: +``` -```bash -docker push ${REGISTRY}/main:latest -docker push ${REGISTRY}/ocr:latest -docker push ${REGISTRY}/alert:latest +### 5.4 docker-compose.app.yml + +speedcam-app 인스턴스에 배포 (이미지는 Artifact Registry에서 pull) + +```yaml +services: + main: + image: ${ARTIFACT_REGISTRY}/speedcam-main:latest + container_name: speedcam-main + restart: always + network_mode: host + env_file: + - ../env/backend.env + volumes: + - ../config/credentials:/app/credentials:ro + + flower: + image: ${ARTIFACT_REGISTRY}/speedcam-main:latest + container_name: speedcam-flower + restart: always + network_mode: host + env_file: + - ../env/backend.env + command: celery -A config flower --port=5555 + + promtail: + image: grafana/promtail:2.9.6 + container_name: speedcam-promtail + restart: always + network_mode: host + volumes: + - ../config/monitoring/promtail/promtail-config.yml:/etc/promtail/config.yml:ro + - 
/var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + command: -config.file=/etc/promtail/config.yml + + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.49.1 + container_name: speedcam-cadvisor + restart: always + network_mode: host + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro ``` -### 3.4 Step 4: 방화벽 규칙 생성 +### 5.5 docker-compose.ocr.yml + +speedcam-ocr 인스턴스에 배포 + +```yaml +services: + ocr-worker: + image: ${ARTIFACT_REGISTRY}/speedcam-ocr:latest + container_name: speedcam-ocr + restart: always + network_mode: host + env_file: + - ../env/backend.env + volumes: + - ../config/credentials:/app/credentials:ro + + promtail: + image: grafana/promtail:2.9.6 + container_name: speedcam-promtail + restart: always + network_mode: host + volumes: + - ../config/monitoring/promtail/promtail-config.yml:/etc/promtail/config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + command: -config.file=/etc/promtail/config.yml + + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.49.1 + container_name: speedcam-cadvisor + restart: always + network_mode: host + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro +``` -```bash -# 내부 통신용 -gcloud compute firewall-rules create speedcam-internal \ - --network=default \ - --allow=tcp:3306,tcp:5672,tcp:1883,tcp:15672,tcp:8000 \ - --source-ranges=10.0.0.0/8 \ - --target-tags=speedcam +### 5.6 docker-compose.alert.yml + +speedcam-alert 인스턴스에 배포 + +```yaml +services: + alert-worker: + image: ${ARTIFACT_REGISTRY}/speedcam-alert:latest + container_name: speedcam-alert + restart: always + network_mode: host + env_file: + - ../env/backend.env + volumes: + - ../config/credentials:/app/credentials:ro + + promtail: + image: grafana/promtail:2.9.6 + container_name: speedcam-promtail + restart: always + network_mode: host 
+ volumes: + - ../config/monitoring/promtail/promtail-config.yml:/etc/promtail/config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + command: -config.file=/etc/promtail/config.yml + + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.49.1 + container_name: speedcam-cadvisor + restart: always + network_mode: host + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro +``` -# 외부 접근용 -gcloud compute firewall-rules create speedcam-external \ - --network=default \ - --allow=tcp:8000,tcp:15672 \ - --source-ranges=0.0.0.0/0 \ - --target-tags=speedcam-web +### 5.7 docker-compose.mon.yml + +speedcam-mon 인스턴스에 배포 + +```yaml +services: + otel-collector: + image: otel/opentelemetry-collector-contrib:0.98.0 + container_name: speedcam-otel-collector + restart: always + network_mode: host + volumes: + - ../config/monitoring/otel-collector/otel-collector-config.yml:/etc/otel-collector-config.yml:ro + command: ["--config", "/etc/otel-collector-config.yml"] + + jaeger: + image: jaegertracing/all-in-one:1.57 + container_name: speedcam-jaeger + restart: always + network_mode: host + environment: + - COLLECTOR_OTLP_ENABLED=true + + prometheus: + image: prom/prometheus:v2.51.2 + container_name: speedcam-prometheus + restart: always + network_mode: host + volumes: + - ../config/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data:/prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time=15d" + - "--web.enable-remote-write-receiver" + + grafana: + image: grafana/grafana:10.4.2 + container_name: speedcam-grafana + restart: always + network_mode: host + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin} + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - ../config/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - 
grafana_data:/var/lib/grafana + + loki: + image: grafana/loki:2.9.6 + container_name: speedcam-loki + restart: always + network_mode: host + volumes: + - ../config/monitoring/loki/loki-config.yml:/etc/loki/local-config.yaml:ro + - loki_data:/loki + command: -config.file=/etc/loki/local-config.yaml + + promtail: + image: grafana/promtail:2.9.6 + container_name: speedcam-promtail + restart: always + network_mode: host + volumes: + - ../config/monitoring/promtail/promtail-config.mon.yml:/etc/promtail/config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + command: -config.file=/etc/promtail/config.yml + + celery-exporter: + image: danihodovic/celery-exporter:0.10.3 + container_name: speedcam-celery-exporter + restart: always + network_mode: host + environment: + CE_BROKER_URL: "amqp://sa:@${MQ_HOST}:5672//" + + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.49.1 + container_name: speedcam-cadvisor + restart: always + network_mode: host + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + +volumes: + prometheus_data: + grafana_data: + loki_data: ``` -### 3.5 Step 5: 인프라 인스턴스 생성 +--- + +## 6. 설정 파일 + +### 6.1 환경 변수 파일 -#### RabbitMQ +#### 6.1.1 backend.env + +모든 앱/워커 인스턴스에 배포. 컨테이너명을 IP로 교체. 
```bash -gcloud compute instances create-with-container speedcam-rabbitmq \ - --zone=${GCP_ZONE} \ - --machine-type=e2-small \ - --tags=speedcam,speedcam-web \ - --container-image=rabbitmq:3.13-management \ - --container-env="RABBITMQ_DEFAULT_USER=sa,RABBITMQ_DEFAULT_PASS=1234" +# Django +SECRET_KEY= +DJANGO_SETTINGS_MODULE=config.settings.prod +DEBUG=False + +# Database — ${DB_HOST}를 실제 IP로 교체 +DB_HOST=${DB_HOST} +DB_PORT=3306 +DB_USER=sa +DB_PASSWORD= +DB_NAME=speedcam +DB_NAME_VEHICLES=speedcam_vehicles +DB_NAME_DETECTIONS=speedcam_detections +DB_NAME_NOTIFICATIONS=speedcam_notifications + +# RabbitMQ — ${MQ_HOST}를 실제 IP로 교체 +CELERY_BROKER_URL=amqp://sa:@${MQ_HOST}:5672// +RABBITMQ_HOST=${MQ_HOST} +MQTT_PORT=1883 +MQTT_USER=sa +MQTT_PASS= + +# GCS / Firebase +GOOGLE_APPLICATION_CREDENTIALS=/app/credentials/gcp-cloud-storage.json +FIREBASE_CREDENTIALS=/app/credentials/firebase-service-account.json + +# Workers +OCR_CONCURRENCY=4 +ALERT_CONCURRENCY=100 +OCR_MOCK=false +FCM_MOCK=false + +# Gunicorn +GUNICORN_WORKERS=4 +GUNICORN_THREADS=2 + +# Logging +LOG_LEVEL=info + +# CORS — 프론트엔드 도메인으로 교체 +CORS_ALLOWED_ORIGINS=https://your-frontend-domain.com + +# OpenTelemetry — ${MON_HOST}를 실제 IP로 교체 +OTEL_EXPORTER_OTLP_ENDPOINT=http://${MON_HOST}:4317 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc +OTEL_RESOURCE_ATTRIBUTES=service.namespace=speedcam,deployment.environment=prod +OTEL_TRACES_SAMPLER=parentbased_always_on +OTEL_PYTHON_LOG_CORRELATION=true ``` -#### MySQL +#### 6.1.2 mysql.env + +DB 인스턴스에 배포. ```bash -gcloud compute instances create-with-container speedcam-mysql \ - --zone=${GCP_ZONE} \ - --machine-type=e2-small \ - --tags=speedcam \ - --container-image=mysql:8.0 \ - --container-env="MYSQL_ROOT_PASSWORD=root,MYSQL_USER=sa,MYSQL_PASSWORD=1234,MYSQL_DATABASE=speedcam" +MYSQL_ROOT_PASSWORD= +MYSQL_USER=sa +MYSQL_PASSWORD= +MYSQL_DATABASE=speedcam ``` -### 3.6 Step 6: 인프라 초기화 +#### 6.1.3 rabbitmq.env + +MQ 인스턴스에 배포. 
```bash -# RabbitMQ MQTT 플러그인 활성화 -gcloud compute ssh speedcam-rabbitmq --zone=${GCP_ZONE} \ - --command="docker exec \$(docker ps -q) rabbitmq-plugins enable rabbitmq_mqtt" +RABBITMQ_DEFAULT_USER=sa +RABBITMQ_DEFAULT_PASS= +``` -# MySQL 추가 데이터베이스 생성 -gcloud compute ssh speedcam-mysql --zone=${GCP_ZONE} \ - --command="docker exec \$(docker ps -q) mysql -u root -proot -e \" - CREATE DATABASE IF NOT EXISTS speedcam_vehicles; - CREATE DATABASE IF NOT EXISTS speedcam_detections; - CREATE DATABASE IF NOT EXISTS speedcam_notifications; - GRANT ALL PRIVILEGES ON speedcam_vehicles.* TO 'sa'@'%'; - GRANT ALL PRIVILEGES ON speedcam_detections.* TO 'sa'@'%'; - GRANT ALL PRIVILEGES ON speedcam_notifications.* TO 'sa'@'%'; - FLUSH PRIVILEGES;\"" +### 6.2 모니터링 설정 + +#### 6.2.1 prometheus.yml + +모니터링 인스턴스에 배포. 모든 타겟을 실제 IP로 지정. + +```yaml +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + # --- Application --- + - job_name: "django" + metrics_path: /metrics + static_configs: + - targets: ["${APP_HOST}:8000"] + + # --- OpenTelemetry Collector --- + - job_name: "otel-collector" + static_configs: + - targets: ["localhost:8889"] + + # --- Infrastructure --- + - job_name: "rabbitmq" + static_configs: + - targets: ["${MQ_HOST}:15692"] + + - job_name: "mysql" + static_configs: + - targets: ["${DB_HOST}:9104"] + + # --- Workers --- + - job_name: "celery" + static_configs: + - targets: ["localhost:9808"] + + # --- Container Resources (모든 인스턴스) --- + - job_name: "cadvisor" + static_configs: + - targets: + - "${DB_HOST}:8080" + - "${MQ_HOST}:8080" + - "${APP_HOST}:8080" + - "${OCR_HOST}:8080" + - "${ALERT_HOST}:8080" + - "localhost:8080" + relabel_configs: + - source_labels: [__address__] + target_label: instance ``` -### 3.7 Step 7: Internal IP 확인 +#### 6.2.2 otel-collector-config.yml + +모니터링 인스턴스에 배포. 
+ +```yaml +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 5s + send_batch_size: 1024 + resource: + attributes: + - key: service.namespace + value: speedcam + action: upsert + +exporters: + otlp/jaeger: + endpoint: localhost:4317 + tls: + insecure: true + prometheus: + endpoint: 0.0.0.0:8889 + namespace: speedcam + resource_to_telemetry_conversion: + enabled: true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlp/jaeger] + metrics: + receivers: [otlp] + processors: [batch, resource] + exporters: [prometheus] +``` -```bash -# 인스턴스 IP 확인 -gcloud compute instances list --filter="name~speedcam" \ - --format="table(name,networkInterfaces[0].networkIP)" +#### 6.2.3 promtail-config.yml (앱/워커/DB/MQ 인스턴스 공통) + +각 인스턴스에 배포. Loki 주소를 모니터링 인스턴스 IP로 지정. + +```yaml +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://${MON_HOST}:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: name + values: + - "speedcam-.*" + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + - source_labels: ["__meta_docker_container_name"] + regex: "/speedcam-(.*)" + target_label: "service" + pipeline_stages: + - regex: + expression: ".*trace_id=(?P[a-f0-9]+).*" + - labels: + trace_id: +``` -# 예시 출력: -# speedcam-rabbitmq 10.178.0.2 -# speedcam-mysql 10.178.0.3 +#### 6.2.4 promtail-config.mon.yml (모니터링 인스턴스 전용) + +Loki가 같은 인스턴스이므로 localhost. 
+ +```yaml +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://localhost:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: name + values: + - "speedcam-.*" + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + - source_labels: ["__meta_docker_container_name"] + regex: "/speedcam-(.*)" + target_label: "service" + pipeline_stages: + - regex: + expression: ".*trace_id=(?P[a-f0-9]+).*" + - labels: + trace_id: ``` -### 3.8 Step 8: 서비스 인스턴스 생성 +#### 6.2.5 grafana/provisioning/datasources/datasources.yml + +모니터링 인스턴스에 배포. + +```yaml +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://localhost:9090 + isDefault: true + editable: false + + - name: Jaeger + type: jaeger + access: proxy + url: http://localhost:16686 + editable: false + + - name: Loki + type: loki + access: proxy + url: http://localhost:3100 + editable: false + jsonData: + derivedFields: + - datasourceUid: jaeger + matcherRegex: "trace_id=(\\w+)" + name: TraceID + url: "$${__value.raw}" + datasourceName: Jaeger +``` + +#### 6.2.6 mysqld-exporter/.my.cnf + +DB 인스턴스에 배포. + +```ini +[client] +user=sa +password= +host=localhost +port=3306 +``` + +### 6.3 envsubst를 활용한 IP 자동 주입 + +수동으로 IP를 교체하는 대신 환경변수 파일 하나로 관리할 수 있다. 
```bash -# 환경 변수 (Internal IP로 대체) -RABBITMQ_IP=10.178.0.2 -MYSQL_IP=10.178.0.3 +# deploy/env/hosts.env — IP 정의 (이것만 수정) +export DB_HOST=10.178.0.11 +export MQ_HOST=10.178.0.12 +export APP_HOST=10.178.0.13 +export OCR_HOST=10.178.0.14 +export ALERT_HOST=10.178.0.15 +export MON_HOST=10.178.0.20 +``` -# Main Service -gcloud compute instances create-with-container speedcam-main \ - --zone=${GCP_ZONE} \ - --machine-type=e2-medium \ - --tags=speedcam,speedcam-web \ - --scopes=cloud-platform \ - --container-image=${REGISTRY}/main:latest \ - --container-env="DJANGO_SETTINGS_MODULE=config.settings.dev,\ -DB_HOST=${MYSQL_IP},DB_PORT=3306,\ -DB_NAME=speedcam,DB_NAME_VEHICLES=speedcam_vehicles,\ -DB_NAME_DETECTIONS=speedcam_detections,DB_NAME_NOTIFICATIONS=speedcam_notifications,\ -DB_USER=sa,DB_PASSWORD=1234,\ -CELERY_BROKER_URL=amqp://sa:1234@${RABBITMQ_IP}:5672//,\ -RABBITMQ_HOST=${RABBITMQ_IP},MQTT_PORT=1883,MQTT_USER=sa,MQTT_PASS=1234,\ -OCR_MOCK=true,FCM_MOCK=true" +```bash +# 배포 스크립트 예시 +source env/hosts.env -# OCR Worker -gcloud compute instances create-with-container speedcam-ocr \ - --zone=${GCP_ZONE} \ - --machine-type=e2-medium \ - --tags=speedcam \ - --scopes=cloud-platform \ - --container-image=${REGISTRY}/ocr:latest \ - --container-env="DJANGO_SETTINGS_MODULE=config.settings.dev,\ -DB_HOST=${MYSQL_IP},DB_PORT=3306,\ -DB_NAME=speedcam,DB_NAME_VEHICLES=speedcam_vehicles,\ -DB_NAME_DETECTIONS=speedcam_detections,DB_NAME_NOTIFICATIONS=speedcam_notifications,\ -DB_USER=sa,DB_PASSWORD=1234,\ -CELERY_BROKER_URL=amqp://sa:1234@${RABBITMQ_IP}:5672//,\ -OCR_CONCURRENCY=2,OCR_MOCK=true" +# 템플릿에서 실제 설정 파일 생성 +envsubst < config/monitoring/prometheus/prometheus.yml.template \ + > config/monitoring/prometheus/prometheus.yml -# Alert Worker -gcloud compute instances create-with-container speedcam-alert \ - --zone=${GCP_ZONE} \ - --machine-type=e2-small \ - --tags=speedcam \ - --scopes=cloud-platform \ - --container-image=${REGISTRY}/alert:latest \ - 
--container-env="DJANGO_SETTINGS_MODULE=config.settings.dev,\ -DB_HOST=${MYSQL_IP},DB_PORT=3306,\ -DB_NAME=speedcam,DB_NAME_VEHICLES=speedcam_vehicles,\ -DB_NAME_DETECTIONS=speedcam_detections,DB_NAME_NOTIFICATIONS=speedcam_notifications,\ -DB_USER=sa,DB_PASSWORD=1234,\ -CELERY_BROKER_URL=amqp://sa:1234@${RABBITMQ_IP}:5672//,\ -ALERT_CONCURRENCY=50,FCM_MOCK=true" +envsubst < env/backend.env.template \ + > env/backend.env + +envsubst < config/monitoring/promtail/promtail-config.yml.template \ + > config/monitoring/promtail/promtail-config.yml + +envsubst < compose/docker-compose.mon.yml.template \ + > compose/docker-compose.mon.yml ``` -### 3.9 Step 9: Django 마이그레이션 +이렇게 하면 GCP 계정이 바뀌어도 `hosts.env`만 수정하고 envsubst를 다시 실행하면 된다. + +--- + +## 7. Docker 이미지 빌드 및 배포 + +### 7.1 Docker 이미지 빌드 ```bash -gcloud compute ssh speedcam-main --zone=${GCP_ZONE} --command="\ -docker exec \$(docker ps -q) python manage.py makemigrations vehicles detections notifications && \ -docker exec \$(docker ps -q) python manage.py migrate --database=default --noinput && \ -docker exec \$(docker ps -q) python manage.py migrate vehicles --database=vehicles_db --noinput && \ -docker exec \$(docker ps -q) python manage.py migrate detections --database=detections_db --noinput && \ -docker exec \$(docker ps -q) python manage.py migrate notifications --database=notifications_db --noinput" +# linux/amd64 플랫폼으로 빌드 (GCE용) +docker build --platform linux/amd64 \ + -t ${ARTIFACT_REGISTRY}/speedcam-main:latest \ + -f docker/Dockerfile.main . + +docker build --platform linux/amd64 \ + -t ${ARTIFACT_REGISTRY}/speedcam-ocr:latest \ + -f docker/Dockerfile.ocr . + +docker build --platform linux/amd64 \ + -t ${ARTIFACT_REGISTRY}/speedcam-alert:latest \ + -f docker/Dockerfile.alert . +``` + +### 7.2 Docker 이미지 푸시 + +```bash +docker push ${ARTIFACT_REGISTRY}/speedcam-main:latest +docker push ${ARTIFACT_REGISTRY}/speedcam-ocr:latest +docker push ${ARTIFACT_REGISTRY}/speedcam-alert:latest ``` --- -## 4. 배포 검증 +## 8. 
배포 순서 -### 4.1 인스턴스 상태 확인 +인프라 → 앱 → 모니터링 순서로 배포한다. + +### Step 1: IP 확인 ```bash -gcloud compute instances list --filter="name~speedcam" +# GCP 콘솔 또는 CLI에서 각 인스턴스 내부 IP 확인 +gcloud compute instances list --filter="name~speedcam" \ + --format="table(name, networkInterfaces[0].networkIP)" + +# 예시 출력: +# NAME INTERNAL_IP +# speedcam-db 10.178.0.11 +# speedcam-mq 10.178.0.12 +# speedcam-app 10.178.0.13 +# speedcam-ocr 10.178.0.14 +# speedcam-alert 10.178.0.15 +# speedcam-mon 10.178.0.20 ``` -### 4.2 서비스 헬스체크 +### Step 2: 설정 파일에 IP 주입 ```bash -# Main API External IP 확인 -MAIN_IP=$(gcloud compute instances describe speedcam-main \ - --zone=${GCP_ZONE} \ - --format='get(networkInterfaces[0].accessConfigs[0].natIP)') +# backend.env, prometheus.yml, promtail-config.yml 등에서 +# ${DB_HOST}, ${MQ_HOST} 등을 실제 IP로 교체 +# (sed 또는 envsubst 사용 가능) + +# envsubst 예시 +source deploy/env/hosts.env +envsubst < deploy/env/backend.env.template > deploy/env/backend.env +envsubst < deploy/config/monitoring/prometheus/prometheus.yml.template > deploy/config/monitoring/prometheus/prometheus.yml +``` -# Health Check -curl http://${MAIN_IP}:8000/health/ -# Expected: {"status": "healthy"} +### Step 3: DB 인스턴스 배포 (먼저) -# Swagger UI -echo "Swagger: http://${MAIN_IP}:8000/swagger/" +```bash +gcloud compute ssh speedcam-db --zone=${GCP_ZONE} + +cd deploy/compose +docker compose -f docker-compose.db.yml up -d + +# MySQL healthy 확인 +docker compose -f docker-compose.db.yml ps +docker logs speedcam-mysql ``` -### 4.3 RabbitMQ 확인 +### Step 4: MQ 인스턴스 배포 ```bash -RABBITMQ_IP=$(gcloud compute instances describe speedcam-rabbitmq \ - --zone=${GCP_ZONE} \ - --format='get(networkInterfaces[0].accessConfigs[0].natIP)') +gcloud compute ssh speedcam-mq --zone=${GCP_ZONE} + +cd deploy/compose +docker compose -f docker-compose.mq.yml up -d + +# RabbitMQ healthy 확인 +docker compose -f docker-compose.mq.yml ps +docker logs speedcam-rabbitmq +``` + +### Step 5: App 인스턴스 배포 + +```bash +gcloud compute ssh speedcam-app 
--zone=${GCP_ZONE} + +cd deploy/compose +docker compose -f docker-compose.app.yml up -d -echo "RabbitMQ Management: http://${RABBITMQ_IP}:15672/" -# Credentials: sa / 1234 +# curl localhost:8000/health/ 로 확인 +curl localhost:8000/health/ ``` -### 4.4 Worker 로그 확인 +### Step 6: Django 마이그레이션 + +```bash +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} + +docker exec speedcam-main python manage.py makemigrations vehicles detections notifications +docker exec speedcam-main python manage.py migrate --database=default --noinput +docker exec speedcam-main python manage.py migrate vehicles --database=vehicles_db --noinput +docker exec speedcam-main python manage.py migrate detections --database=detections_db --noinput +docker exec speedcam-main python manage.py migrate notifications --database=notifications_db --noinput +``` + +### Step 7: Worker 인스턴스 배포 ```bash # OCR Worker -gcloud compute ssh speedcam-ocr --zone=${GCP_ZONE} \ - --command="docker logs \$(docker ps -q) 2>&1 | tail -20" +gcloud compute ssh speedcam-ocr --zone=${GCP_ZONE} +cd deploy/compose +docker compose -f docker-compose.ocr.yml up -d # Alert Worker -gcloud compute ssh speedcam-alert --zone=${GCP_ZONE} \ - --command="docker logs \$(docker ps -q) 2>&1 | tail -20" +gcloud compute ssh speedcam-alert --zone=${GCP_ZONE} +cd deploy/compose +docker compose -f docker-compose.alert.yml up -d ``` +### Step 8: 모니터링 인스턴스 배포 (마지막) + +```bash +gcloud compute ssh speedcam-mon --zone=${GCP_ZONE} + +cd deploy/compose +docker compose -f docker-compose.mon.yml up -d + +# Prometheus targets 확인 +curl -s localhost:9090/api/v1/targets | python3 -c " +import json, sys +data = json.load(sys.stdin) +for t in data['data']['activeTargets']: + print(f\"{t['labels']['job']:20s} {t['labels']['instance']:30s} {t['health']}\") +" +``` + +--- + +## 9. 배포 검증 + +배포 후 아래 항목을 순서대로 확인한다. 
+ +### 9.1 인프라 + +- [ ] MySQL 접속: `mysql -h ${DB_HOST} -u sa -p` +- [ ] RabbitMQ Management UI: `http://${MQ_HOST}:15672` (sa/) +- [ ] RabbitMQ Prometheus metrics: `curl http://${MQ_HOST}:15692/metrics | head` + +### 9.2 애플리케이션 + +- [ ] Django health: `curl http://${APP_HOST}:8000/health/` +- [ ] Django metrics: `curl http://${APP_HOST}:8000/metrics | head` +- [ ] Swagger UI: `http://${APP_HOST}:8000/swagger/` +- [ ] Flower: `http://${APP_HOST}:5555` + +### 9.3 워커 + +- [ ] OCR Worker 로그: `gcloud compute ssh speedcam-ocr --zone=${GCP_ZONE} --command="docker logs speedcam-ocr"` +- [ ] Alert Worker 로그: `gcloud compute ssh speedcam-alert --zone=${GCP_ZONE} --command="docker logs speedcam-alert"` +- [ ] RabbitMQ에서 consumer 확인: Management UI → Queues → ocr_queue, fcm_queue + +### 9.4 모니터링 + +- [ ] Prometheus targets 전부 UP: `http://${MON_HOST}:9090/targets` +- [ ] Grafana 접속: `http://${MON_HOST}:3000` (admin/admin) +- [ ] Grafana 데이터소스 3개 연결: Prometheus, Jaeger, Loki +- [ ] Jaeger에서 서비스 트레이스 확인: `http://${MON_HOST}:16686` +- [ ] Loki에서 로그 확인: Grafana → Explore → Loki → `{service="main"}` +- [ ] cAdvisor: 6개 인스턴스 모두 cadvisor:8080 → Prometheus에서 수집 확인 + --- -## 5. 운영 가이드 +## 10. 
운영 가이드 -### 5.1 인스턴스 재시작 +### 10.1 서비스 재시작 ```bash -# 개별 재시작 -gcloud compute instances reset speedcam-main --zone=${GCP_ZONE} +# 개별 인스턴스 재시작 +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} +cd deploy/compose +docker compose -f docker-compose.app.yml restart # 전체 서비스 재시작 -gcloud compute instances reset speedcam-main speedcam-ocr speedcam-alert --zone=${GCP_ZONE} +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} --command="cd deploy/compose && docker compose -f docker-compose.app.yml restart" +gcloud compute ssh speedcam-ocr --zone=${GCP_ZONE} --command="cd deploy/compose && docker compose -f docker-compose.ocr.yml restart" +gcloud compute ssh speedcam-alert --zone=${GCP_ZONE} --command="cd deploy/compose && docker compose -f docker-compose.alert.yml restart" ``` -### 5.2 이미지 업데이트 배포 +### 10.2 이미지 업데이트 배포 ```bash # 1. 새 이미지 빌드 & 푸시 -docker build --platform linux/amd64 -t ${REGISTRY}/main:latest -f docker/Dockerfile.main . -docker push ${REGISTRY}/main:latest - -# 2. 인스턴스 재시작 (새 이미지 pull) -gcloud compute instances reset speedcam-main --zone=${GCP_ZONE} +docker build --platform linux/amd64 -t ${ARTIFACT_REGISTRY}/speedcam-main:latest -f docker/Dockerfile.main . +docker push ${ARTIFACT_REGISTRY}/speedcam-main:latest + +# 2. 인스턴스에서 새 이미지 pull 및 재시작 +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} +cd deploy/compose +docker compose -f docker-compose.app.yml pull +docker compose -f docker-compose.app.yml up -d ``` -### 5.3 스케일링 +### 10.3 스케일링 ```bash # OCR Worker 추가 인스턴스 -gcloud compute instances create-with-container speedcam-ocr-2 \ +gcloud compute instances create speedcam-ocr-2 \ --zone=${GCP_ZONE} \ - --machine-type=e2-medium \ + --machine-type=e2-standard-2 \ + --network-interface=subnet=speedcam-subnet,no-address \ --tags=speedcam \ - --scopes=cloud-platform \ - --container-image=${REGISTRY}/ocr:latest \ - --container-env="..." 
# 동일한 환경변수 + --scopes=cloud-platform + +# 동일한 설정 파일로 배포 +gcloud compute scp --recurse deploy/ speedcam-ocr-2:~ --zone=${GCP_ZONE} +gcloud compute ssh speedcam-ocr-2 --zone=${GCP_ZONE} +cd deploy/compose +docker compose -f docker-compose.ocr.yml up -d ``` -### 5.4 로그 모니터링 +### 10.4 로그 모니터링 ```bash # 실시간 로그 -gcloud compute ssh speedcam-main --zone=${GCP_ZONE} \ - --command="docker logs -f \$(docker ps -q)" +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} \ + --command="docker logs -f speedcam-main" + +# 최근 로그 +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} \ + --command="docker logs --tail 100 speedcam-main" ``` +### 10.5 IP 변경 시 수정 대상 + +GCP 계정/프로젝트 변경으로 IP가 바뀌면 아래 파일만 교체하면 된다. + +| 파일 | 교체 대상 | 배포 위치 | +|------|-----------|-----------| +| `env/backend.env` | `DB_HOST`, `RABBITMQ_HOST`, `CELERY_BROKER_URL`, `OTEL_EXPORTER_OTLP_ENDPOINT` | app, ocr, alert | +| `config/monitoring/prometheus/prometheus.yml` | 모든 targets IP | mon | +| `config/monitoring/promtail/promtail-config.yml` | Loki URL (`${MON_HOST}`) | db, mq, app, ocr, alert | +| `docker-compose.mon.yml` | `CE_BROKER_URL` 의 MQ IP | mon | + +Grafana datasources, OTel Collector config, mysqld-exporter .my.cnf는 같은 인스턴스 내 통신(localhost)이므로 IP 변경 영향 없음. + --- -## 6. 트러블슈팅 +## 11. 
트러블슈팅 -### 6.1 컨테이너 시작 실패 +### 11.1 컨테이너 시작 실패 ```bash # 컨테이너 상태 확인 -gcloud compute ssh speedcam-main --zone=${GCP_ZONE} \ +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} \ --command="docker ps -a" # 종료된 컨테이너 로그 확인 -gcloud compute ssh speedcam-main --zone=${GCP_ZONE} \ - --command="docker logs \$(docker ps -aq | head -1)" +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} \ + --command="docker logs speedcam-main" ``` -### 6.2 DB 연결 실패 +### 11.2 DB 연결 실패 ```bash # MySQL 연결 테스트 -gcloud compute ssh speedcam-main --zone=${GCP_ZONE} \ - --command="docker exec \$(docker ps -q) python -c \" +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} \ + --command="docker exec speedcam-main python -c \" import pymysql -conn = pymysql.connect(host='10.178.0.3', user='sa', password='1234', database='speedcam') +conn = pymysql.connect(host='${DB_HOST}', user='sa', password='', database='speedcam') print('Connected!') conn.close()\"" + +# 방화벽 규칙 확인 +gcloud compute firewall-rules list --filter="name~speedcam" ``` -### 6.3 MQTT 연결 실패 +### 11.3 MQTT 연결 실패 ```bash # RabbitMQ MQTT 플러그인 상태 확인 -gcloud compute ssh speedcam-rabbitmq --zone=${GCP_ZONE} \ - --command="docker exec \$(docker ps -q) rabbitmq-plugins list | grep mqtt" +gcloud compute ssh speedcam-mq --zone=${GCP_ZONE} \ + --command="docker exec speedcam-rabbitmq rabbitmq-plugins list | grep mqtt" + +# MQTT 포트 listening 확인 +gcloud compute ssh speedcam-mq --zone=${GCP_ZONE} \ + --command="netstat -tlnp | grep 1883" ``` -### 6.4 이미지 Pull 실패 +### 11.4 이미지 Pull 실패 ```bash # 서비스 계정 권한 확인 -gcloud compute instances describe speedcam-main --zone=${GCP_ZONE} \ +gcloud compute instances describe speedcam-app --zone=${GCP_ZONE} \ --format='get(serviceAccounts[0].scopes)' # cloud-platform 스코프 필요 +# 없으면 인스턴스 재생성 또는 scope 추가 +``` + +### 11.5 host 모드 네트워크 문제 + +```bash +# 포트 사용 확인 +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} \ + --command="netstat -tlnp | grep 8000" + +# 컨테이너 네트워크 모드 확인 +gcloud compute ssh speedcam-app 
--zone=${GCP_ZONE} \ + --command="docker inspect speedcam-main | grep NetworkMode" +``` + +### 11.6 모니터링 메트릭 수집 실패 + +```bash +# Prometheus targets 상태 확인 +curl http://${MON_HOST}:9090/api/v1/targets | python3 -m json.tool + +# cAdvisor 접근 확인 +curl http://${APP_HOST}:8080/metrics | head + +# Promtail 로그 확인 +gcloud compute ssh speedcam-app --zone=${GCP_ZONE} \ + --command="docker logs speedcam-promtail" ``` --- -## 7. 리소스 정리 +## 12. 리소스 정리 ```bash # 모든 인스턴스 삭제 gcloud compute instances delete \ - speedcam-rabbitmq speedcam-mysql \ - speedcam-main speedcam-ocr speedcam-alert \ + speedcam-db speedcam-mq speedcam-app \ + speedcam-ocr speedcam-alert speedcam-mon \ --zone=${GCP_ZONE} --quiet # 방화벽 규칙 삭제 -gcloud compute firewall-rules delete speedcam-internal speedcam-external --quiet +gcloud compute firewall-rules delete \ + speedcam-internal \ + speedcam-api-external \ + speedcam-mqtt-external \ + speedcam-grafana-external \ + --quiet # Artifact Registry 삭제 -gcloud artifacts repositories delete speedcam --location=${GCP_REGION} --quiet +gcloud artifacts repositories delete speedcam \ + --location=${GCP_REGION} --quiet + +# VPC 서브넷 삭제 +gcloud compute networks subnets delete speedcam-subnet \ + --region=${GCP_REGION} --quiet + +# VPC 네트워크 삭제 +gcloud compute networks delete speedcam-vpc --quiet ``` --- @@ -536,3 +1465,4 @@ gcloud artifacts repositories delete speedcam --location=${GCP_REGION} --quiet | 날짜 | 버전 | 변경 내용 | |------|------|----------| | 2026-01-23 | 1.0 | 초기 문서 작성 | +| 2026-02-06 | 2.0 | 멀티 인스턴스 배포 방식으로 전면 재작성 (6개 인스턴스, docker-compose + host mode, 모니터링 스택 추가) | diff --git a/docs/MONITORING.md b/docs/MONITORING.md new file mode 100644 index 0000000..65251c4 --- /dev/null +++ b/docs/MONITORING.md @@ -0,0 +1,452 @@ +# 모니터링 스택 가이드 + +## 1. 
아키텍처 개요 + +``` +App Services (main, ocr-worker, alert-worker) + │ OTLP gRPC (:4317) + ▼ +OTel Collector ──traces──► Jaeger (:16686) ──► Grafana (:3000) + │ ▲ + └──metrics──► Prometheus (:9090) ──────────────┘ + ▲ │ +cAdvisor ─────────────┤ Loki (:3100) ◄── Promtail +Django /metrics ──────┤ ▲ +RabbitMQ :15692 ──────┤ Docker logs +mysqld-exporter ──────┤ +celery-exporter ──────┘ + +K6 (부하테스트) ──prometheus remote write──► Prometheus +``` + +--- + +## 2. 서비스 구성 + +### 2.1 전체 서비스 목록 + +| 서비스 | 이미지 | 포트 | 역할 | +|--------|--------|------|------| +| **otel-collector** | `otel/opentelemetry-collector-contrib:0.98.0` | 4317 (gRPC), 4318 (HTTP), 8889 | 트레이스/메트릭 수집 허브 | +| **jaeger** | `jaegertracing/all-in-one:1.57` | 16686 (UI), 14250 | 분산 트레이싱 저장/UI | +| **prometheus** | `prom/prometheus:v2.51.2` | 9090 | 메트릭 수집/저장/쿼리 | +| **grafana** | `grafana/grafana:10.4.2` | 3000 | 통합 대시보드 | +| **loki** | `grafana/loki:2.9.6` | 3100 | 로그 집계/저장 | +| **promtail** | `grafana/promtail:2.9.6` | - | Docker 로그 → Loki 전송 | +| **cadvisor** | `gcr.io/cadvisor/cadvisor:v0.49.1` | 8080 | 컨테이너 리소스 메트릭 | +| **mysqld-exporter** | `prom/mysqld-exporter:v0.15.1` | 9104 | MySQL 메트릭 노출 | +| **celery-exporter** | `danihodovic/celery-exporter:0.10.3` | 9808 | Celery 큐/태스크 메트릭 | +| **k6** | `grafana/k6:latest` | - | 부하 테스트 (on-demand) | + +### 2.2 Prometheus Scrape Targets + +| Job | Target | 수집 항목 | +|-----|--------|-----------| +| `django` | `main:8000/metrics` | HTTP 요청 수, 응답 시간, DB 쿼리 수 | +| `otel-collector` | `otel-collector:8889` | OTel에서 변환된 앱 메트릭 | +| `rabbitmq` | `rabbitmq:15692` | 큐 깊이, 메시지 rate, 커넥션, 채널 | +| `mysql` | `mysqld-exporter:9104` | 쿼리 수, 커넥션, InnoDB 버퍼, 슬로우 쿼리 | +| `celery` | `celery-exporter:9808` | 태스크 성공/실패, 실행 시간, 큐 길이 | +| `cadvisor` | `cadvisor:8080` | 컨테이너 CPU, 메모리, 네트워크 I/O | + +--- + +## 3. 실행 방법 + +### 3.1 기본 서비스만 (모니터링 없이) + +```bash +cd docker +docker compose up -d +``` + +앱은 모니터링 스택 없이도 정상 동작함. OTel Collector에 연결 실패해도 앱은 죽지 않음 (graceful fallback). 
+ +### 3.2 모니터링 포함 + +```bash +cd docker +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d +``` + +### 3.3 부하 테스트 포함 (k6) + +```bash +# k6 서비스는 profiles: [loadtest] 이므로 명시적 실행 필요 +cd docker +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml \ + run k6 run --out experimental-prometheus-rw /scripts/load-test.js +``` + +### 3.4 모니터링만 재시작 (앱 유지) + +```bash +cd docker +docker compose -f docker-compose.monitoring.yml restart prometheus grafana +``` + +--- + +## 4. 접속 정보 + +| 서비스 | URL | 인증 | +|--------|-----|------| +| **Grafana** | http://localhost:3000 | admin / admin | +| **Prometheus** | http://localhost:9090 | 없음 | +| **Jaeger** | http://localhost:16686 | 없음 | +| **RabbitMQ Management** | http://localhost:15672 | sa / 1234 | +| **Flower** | http://localhost:5555 | 없음 | +| **cAdvisor** | http://localhost:8080 | 없음 | +| **Django /metrics** | http://localhost:8000/metrics | 없음 | + +--- + +## 5. Prometheus 타겟 확인 + +### 5.1 UI에서 확인 + +``` +http://localhost:9090/targets +``` + +6개 job이 모두 **UP** (초록색)이면 정상. + +### 5.2 API로 확인 + +```bash +curl -s http://localhost:9090/api/v1/targets | python3 -c " +import json, sys +data = json.load(sys.stdin) +for t in data['data']['activeTargets']: + print(f\"{t['labels']['job']:20s} {t['labels']['instance']:30s} {t['health']}\") +" +``` + +### 5.3 타겟이 DOWN일 때 + +| 증상 | 원인 | 해결 | +|------|------|------| +| django DOWN | main 컨테이너 미기동 또는 django-prometheus 미설치 | `docker logs speedcam-main` 확인 | +| rabbitmq DOWN | rabbitmq_prometheus 플러그인 미활성화 | docker-compose.yml의 command에 `rabbitmq_prometheus` 포함 확인 | +| mysql DOWN | mysqld-exporter 인증 실패 | `.my.cnf` 파일의 user/password 확인 | +| celery DOWN | celery-exporter가 broker 연결 실패 | RabbitMQ 기동 여부 확인 | + +--- + +## 6. 
설정 파일 구조 + +``` +docker/monitoring/ +├── otel-collector/ +│ └── otel-collector-config.yml # OTLP 수신 → Jaeger/Prometheus 내보내기 +├── prometheus/ +│ └── prometheus.yml # scrape targets 정의 +├── loki/ +│ └── loki-config.yml # 로그 저장 (7일 보존) +├── promtail/ +│ └── promtail-config.yml # Docker 로그 수집 → Loki 전송 +├── grafana/ +│ └── provisioning/ +│ ├── datasources/ +│ │ └── datasources.yml # Prometheus, Jaeger, Loki 자동 등록 +│ └── dashboards/ +│ └── dashboards.yml # 대시보드 프로비저닝 +└── mysqld-exporter/ + └── .my.cnf # MySQL 접속 정보 +``` + +--- + +## 7. OpenTelemetry 계측 + +### 7.1 앱 계측 방식 + +`opentelemetry-instrument` CLI로 자동 계측 (코드 수정 없음): + +```bash +# start_main.sh +opentelemetry-instrument \ + --service_name speedcam-api \ + gunicorn config.wsgi:application ... + +# start_ocr_worker.sh +opentelemetry-instrument \ + --service_name speedcam-ocr \ + celery -A config worker ... + +# start_alert_worker.sh +opentelemetry-instrument \ + --service_name speedcam-alert \ + celery -A config worker ... +``` + +### 7.2 자동 계측 대상 + +| 패키지 | 계측 대상 | +|--------|-----------| +| `opentelemetry-instrumentation-django` | HTTP 요청/응답, 미들웨어 | +| `opentelemetry-instrumentation-celery` | 태스크 실행, 큐 대기 시간 | +| `opentelemetry-instrumentation-pymysql` | DB 쿼리, 커넥션 | +| `opentelemetry-instrumentation-requests` | 외부 HTTP 호출 (GCS, FCM) | +| `opentelemetry-instrumentation-logging` | 로그에 trace_id/span_id 주입 | + +### 7.3 환경변수 (backend.env) + +```bash +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc +OTEL_RESOURCE_ATTRIBUTES=service.namespace=speedcam,deployment.environment=dev +OTEL_TRACES_SAMPLER=parentbased_always_on +OTEL_PYTHON_LOG_CORRELATION=true +``` + +### 7.4 데이터 흐름 + +``` +Django/Celery → (OTLP gRPC) → OTel Collector + ├── traces → Jaeger + └── metrics → Prometheus (:8889) + +Django /metrics → (HTTP scrape) → Prometheus (django-prometheus 메트릭) +``` + +--- + +## 8. 로그 → 트레이스 연동 (Loki ↔ Jaeger) + +### 8.1 동작 원리 + +1. 
`opentelemetry-instrumentation-logging`이 로그에 `trace_id`, `span_id` 주입 +2. Django LOGGING 포맷: + ``` + INFO 2024-01-01 12:00:00 views [trace_id=abc123 span_id=def456] Request processed + ``` +3. Promtail이 로그에서 `trace_id` 추출 → Loki 라벨로 저장 +4. Grafana Loki 데이터소스의 `derivedFields`가 trace_id → Jaeger 링크 자동 생성 + +### 8.2 확인 방법 + +1. Grafana → Explore → Loki 데이터소스 선택 +2. `{service="main"}` 쿼리 실행 +3. 로그 라인의 `trace_id=` 부분 클릭 → Jaeger 트레이스로 이동 + +--- + +## 9. 유용한 PromQL 쿼리 + +### 9.1 Django + +```promql +# 초당 요청 수 (RPS) +rate(django_http_requests_total_by_method_total[5m]) + +# 응답 시간 p95 +histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket[5m])) + +# HTTP 5xx 에러율 +rate(django_http_responses_total_by_status_total{status=~"5.."}[5m]) +/ rate(django_http_responses_total_by_status_total[5m]) + +# DB 쿼리 수 +rate(django_db_execute_total[5m]) +``` + +### 9.2 RabbitMQ + +```promql +# 큐별 대기 메시지 수 +rabbitmq_queue_messages{queue=~"ocr_queue|fcm_queue"} + +# 초당 메시지 발행율 +rate(rabbitmq_queue_messages_published_total[5m]) + +# Consumer 수 +rabbitmq_queue_consumers{queue=~"ocr_queue|fcm_queue"} +``` + +### 9.3 MySQL + +```promql +# 활성 커넥션 수 +mysql_global_status_threads_connected + +# 초당 쿼리 수 +rate(mysql_global_status_questions[5m]) + +# 슬로우 쿼리 수 +rate(mysql_global_status_slow_queries[5m]) +``` + +### 9.4 Celery + +```promql +# 태스크 성공/실패 수 +celery_tasks_total{state="SUCCESS"} +celery_tasks_total{state="FAILURE"} + +# 태스크 실행 시간 +celery_tasks_runtime_seconds{quantile="0.95"} + +# 큐 길이 +celery_queue_length +``` + +### 9.5 컨테이너 리소스 + +```promql +# 컨테이너별 CPU 사용률 +rate(container_cpu_usage_seconds_total{name=~"speedcam-.*"}[5m]) * 100 + +# 컨테이너별 메모리 사용량 (MB) +container_memory_usage_bytes{name=~"speedcam-.*"} / 1024 / 1024 + +# 컨테이너별 네트워크 I/O (bytes/sec) +rate(container_network_receive_bytes_total{name=~"speedcam-.*"}[5m]) +``` + +--- + +## 10. 
K6 부하 테스트 + 모니터링 + +### 10.1 실행 + +```bash +cd docker + +# 부하 테스트 실행 (Prometheus에 결과 기록) +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml \ + run k6 run --out experimental-prometheus-rw /scripts/load-test.js +``` + +### 10.2 K6 → Prometheus 메트릭 + +k6는 `--out experimental-prometheus-rw`로 결과를 Prometheus에 직접 기록. `--web.enable-remote-write-receiver` 플래그가 Prometheus에 설정되어 있음. + +| k6 메트릭 | PromQL | 의미 | +|-----------|--------|------| +| `k6_http_req_duration_seconds` | `histogram_quantile(0.95, rate(k6_http_req_duration_seconds_bucket[1m]))` | HTTP p95 응답 시간 | +| `k6_http_reqs_total` | `rate(k6_http_reqs_total[1m])` | 초당 HTTP 요청 수 | +| `k6_vus` | `k6_vus` | 현재 VU 수 | +| `k6_http_req_failed_total` | `rate(k6_http_req_failed_total[1m])` | 실패율 | + +### 10.3 부하 테스트 중 모니터링 체크리스트 + +부하 테스트 중 Grafana에서 아래 항목을 실시간 확인: + +| 확인 항목 | 보는 곳 | 정상 기준 | +|-----------|---------|-----------| +| API 응답 시간 | Prometheus - django 메트릭 | p95 < 500ms | +| 에러율 | Prometheus - django 5xx rate | < 1% | +| RabbitMQ 큐 깊이 | Prometheus - rabbitmq 메트릭 | 지속 증가 없음 | +| Celery 태스크 처리율 | Prometheus - celery 메트릭 | 발행율 ≈ 소비율 | +| MySQL 커넥션 | Prometheus - mysql 메트릭 | < pool size 80% | +| 컨테이너 CPU/메모리 | Prometheus - cadvisor 메트릭 | CPU < 80%, Memory < 85% | +| 분산 트레이스 | Jaeger | 에러 트레이스 없음 | +| 로그 에러 | Loki | ERROR 로그 급증 없음 | + +--- + +## 11. GCP 멀티 인스턴스 배포 시 고려사항 + +현재 Docker Compose는 단일 호스트 내 가상 네트워크. 
인스턴스를 분리할 경우: + +### 11.1 인스턴스 분리 구성 예시 + +| 인스턴스 | 서비스 | GCP 머신 타입 | +|----------|--------|---------------| +| app | main, flower | e2-medium | +| ocr-worker | ocr-worker | e2-standard-2 (CPU) | +| alert-worker | alert-worker | e2-small | +| db | mysql | e2-highmem-2 | +| mq | rabbitmq | e2-medium | +| monitoring | prometheus, grafana, jaeger, loki, promtail, otel-collector, cadvisor, exporters | e2-standard-2 | + +### 11.2 네트워크 연결 방법 + +**방법 A: GCP 내부 IP 직접 지정** + +```bash +# 각 인스턴스의 backend.env에서 컨테이너명 대신 내부 IP 사용 +DB_HOST=10.178.0.11 # db 인스턴스 +CELERY_BROKER_URL=amqp://sa:1234@10.178.0.12:5672// # mq 인스턴스 +OTEL_EXPORTER_OTLP_ENDPOINT=http://10.178.0.15:4317 # monitoring 인스턴스 +``` + +**방법 B: GCP 내부 DNS (같은 VPC)** + +```bash +DB_HOST=db-instance.asia-northeast3-a.c.PROJECT_ID.internal +``` + +**방법 C: GKE (Kubernetes) — 서비스 분리가 목적이면 추천** + +- Service DNS 자동 부여: `mysql.default.svc.cluster.local` +- IP 관리 불필요 +- HPA로 worker auto-scaling 가능 +- `kompose convert`로 docker-compose → k8s 변환 가능 + +### 11.3 Prometheus 멀티 인스턴스 설정 + +인스턴스가 분리되면 `prometheus.yml`에서 내부 IP 사용: + +```yaml +scrape_configs: + - job_name: "django" + static_configs: + - targets: ["10.178.0.10:8000"] # app 인스턴스 + + - job_name: "rabbitmq" + static_configs: + - targets: ["10.178.0.12:15692"] # mq 인스턴스 + + - job_name: "mysql" + static_configs: + - targets: ["10.178.0.11:9104"] # db 인스턴스 (mysqld-exporter 같이 띄움) + + - job_name: "celery" + static_configs: + - targets: ["10.178.0.13:9808"] # celery-exporter를 어디서 띄울지 결정 필요 +``` + +### 11.4 주의사항 + +- GCP 방화벽 규칙에서 모니터링 포트 (9090, 4317, 15692, 9104, 9808 등) 내부 허용 필요 +- 외부 노출하면 안 되는 포트: Prometheus (9090), Grafana (3000) → VPN 또는 IAP 터널 사용 +- 각 인스턴스에서 cAdvisor를 로컬로 띄우고, 모니터링 인스턴스의 Prometheus가 모든 cAdvisor를 scrape + +--- + +## 12. 
트러블슈팅 + +### 12.1 OTel Collector 연결 실패 + +```bash +docker logs speedcam-otel-collector +# "connection refused" → Jaeger 미기동 확인 +# "context deadline exceeded" → 네트워크 문제 +``` + +### 12.2 Grafana 데이터소스 연결 실패 + +```bash +# Grafana 컨테이너에서 직접 확인 +docker exec speedcam-grafana curl -s http://prometheus:9090/-/healthy +docker exec speedcam-grafana curl -s http://loki:3100/ready +docker exec speedcam-grafana curl -s http://jaeger:16686/ +``` + +### 12.3 Promtail 로그 수집 안됨 + +```bash +docker logs speedcam-promtail +# Docker socket 접근 권한 확인 +# container name이 speedcam-* 패턴인지 확인 +``` + +### 12.4 mysqld-exporter 인증 실패 + +```bash +docker logs speedcam-mysqld-exporter +# "Access denied" → .my.cnf의 user/password 확인 +# "no configuration found" → config.my-cnf 마운트 경로 확인 +``` diff --git a/docs/PERFORMANCE_TEST.md b/docs/PERFORMANCE_TEST.md deleted file mode 100644 index d6be556..0000000 --- a/docs/PERFORMANCE_TEST.md +++ /dev/null @@ -1,1094 +0,0 @@ -# 성능 테스트 가이드 - -## 1. 개요 - -### 1.1 목적 -Edge Device(Raspberry Pi) 없이 시스템의 성능과 안정성을 검증하기 위한 부하 테스트 수행 - -### 1.2 테스트 도구 -- **K6**: 부하 테스트 도구 -- **xk6-mqtt**: K6 MQTT 확장 (Edge Device 시뮬레이션) -- **Docker**: 테스트 환경 구성 - -### 1.3 테스트 대상 -| 구간 | 설명 | -|------|------| -| MQTT → Main Service | Edge Device 메시지 수신 처리 | -| Main Service → RabbitMQ | Task 발행 성능 | -| OCR Worker | 이미지 처리 처리량 | -| Alert Worker | FCM 전송 처리량 | -| REST API | API 응답 시간 | - ---- - -## 2. 
테스트 환경 구성 - -### 2.1 아키텍처 - -``` -┌─────────────────────────────────────────────────────────────────────┐ -│ Performance Test Environment │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────────┐ ┌─────────────────────────────────┐ │ -│ │ K6 Runner │ │ Application Stack │ │ -│ │ │ │ │ │ -│ │ - MQTT Publish │ ──────▶ │ RabbitMQ (MQTT + AMQP) │ │ -│ │ - HTTP Request │ │ Main Service (Django) │ │ -│ │ - Metrics │ ──────▶ │ OCR Worker (Mock) │ │ -│ │ │ │ Alert Worker (Mock) │ │ -│ └─────────────────┘ │ MySQL │ │ -│ └─────────────────────────────────┘ │ -│ │ -│ ┌─────────────────┐ ┌─────────────────────────────────┐ │ -│ │ Monitoring │ │ Mock Services │ │ -│ │ │ │ │ │ -│ │ - Grafana │◀────────│ - GCS Mock (MinIO) │ │ -│ │ - InfluxDB │ │ - FCM Mock (WireMock) │ │ -│ │ - DataDog │ │ │ │ -│ └─────────────────┘ └─────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────────┘ -``` - -### 2.2 Docker Compose (테스트 환경) - -```yaml -# docker-compose.test.yml -version: '3.8' - -services: - # 애플리케이션 스택 - mysql: - image: mysql:8.0 - environment: - MYSQL_ROOT_PASSWORD: root - MYSQL_DATABASE: speedcam_test - MYSQL_USER: sa - MYSQL_PASSWORD: "1234" - ports: - - "3306:3306" - networks: - - test-network - - rabbitmq: - image: rabbitmq:3.13-management - environment: - RABBITMQ_DEFAULT_USER: sa - RABBITMQ_DEFAULT_PASS: "1234" - ports: - - "5672:5672" - - "1883:1883" - - "15672:15672" - volumes: - - ./rabbitmq/enabled_plugins:/etc/rabbitmq/enabled_plugins - - ./rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf - networks: - - test-network - - main: - build: - context: .. 
- dockerfile: docker/Dockerfile.main - environment: - - DJANGO_SETTINGS_MODULE=config.settings.dev - - DB_HOST=mysql - - DB_USER=sa - - DB_PASSWORD=1234 - - CELERY_BROKER_URL=amqp://sa:1234@rabbitmq:5672// - ports: - - "8000:8000" - depends_on: - - mysql - - rabbitmq - networks: - - test-network - - ocr-worker: - build: - context: .. - dockerfile: docker/Dockerfile.ocr - environment: - - DJANGO_SETTINGS_MODULE=config.settings.dev - - DB_HOST=mysql - - DB_USER=sa - - DB_PASSWORD=1234 - - CELERY_BROKER_URL=amqp://sa:1234@rabbitmq:5672// - - GCS_MOCK_URL=http://minio:9000 - - OCR_MOCK=true # OCR Mock 모드 - depends_on: - - rabbitmq - - minio - networks: - - test-network - - alert-worker: - build: - context: .. - dockerfile: docker/Dockerfile.alert - environment: - - DJANGO_SETTINGS_MODULE=config.settings.dev - - DB_HOST=mysql - - DB_USER=sa - - DB_PASSWORD=1234 - - CELERY_BROKER_URL=amqp://sa:1234@rabbitmq:5672// - - FCM_MOCK_URL=http://wiremock:8080 - depends_on: - - rabbitmq - - wiremock - networks: - - test-network - - # Mock Services - minio: - image: minio/minio - command: server /data --console-address ":9001" - environment: - MINIO_ROOT_USER: minioadmin - MINIO_ROOT_PASSWORD: minioadmin - ports: - - "9000:9000" - - "9001:9001" - networks: - - test-network - - wiremock: - image: wiremock/wiremock:3.3.1 - ports: - - "8080:8080" - volumes: - - ./wiremock:/home/wiremock - networks: - - test-network - - # Monitoring - influxdb: - image: influxdb:1.8 - environment: - INFLUXDB_DB: k6 - ports: - - "8086:8086" - networks: - - test-network - - grafana: - image: grafana/grafana:10.2.0 - environment: - - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - - GF_AUTH_ANONYMOUS_ENABLED=true - - GF_AUTH_BASIC_ENABLED=false - ports: - - "3000:3000" - volumes: - - ./grafana/provisioning:/etc/grafana/provisioning - - ./grafana/dashboards:/var/lib/grafana/dashboards - depends_on: - - influxdb - networks: - - test-network - - # K6 Runner - k6: - image: grafana/k6:latest - volumes: - - ./k6:/scripts - 
environment: - - K6_OUT=influxdb=http://influxdb:8086/k6 - networks: - - test-network - depends_on: - - influxdb - - main - -networks: - test-network: - driver: bridge -``` - -### 2.3 Mock 서비스 설정 - -#### WireMock (FCM Mock) - -**wiremock/mappings/fcm-send.json** -```json -{ - "request": { - "method": "POST", - "urlPattern": "/v1/projects/.*/messages:send" - }, - "response": { - "status": 200, - "headers": { - "Content-Type": "application/json" - }, - "jsonBody": { - "name": "projects/test/messages/{{randomValue type='UUID'}}" - }, - "transformers": ["response-template"], - "fixedDelayMilliseconds": 50 - } -} -``` - -#### OCR Mock 모드 - -```python -# tasks/ocr_tasks.py (테스트 모드) -import os -import random -import string - -OCR_MOCK = os.getenv('OCR_MOCK', 'false').lower() == 'true' - -def mock_ocr_result(): - """테스트용 가짜 OCR 결과 생성""" - num1 = random.randint(10, 999) - char = random.choice('가나다라마바사아자차카타파하') - num2 = random.randint(1000, 9999) - plate = f"{num1}{char}{num2}" - confidence = random.uniform(0.85, 0.99) - return plate, confidence - -@shared_task(bind=True, max_retries=3, acks_late=True) -def process_ocr(self, detection_id: int, gcs_uri: str): - try: - Detection.objects.filter(id=detection_id).update( - status='processing' - ) - - if OCR_MOCK: - # Mock 모드: 실제 OCR 없이 가짜 결과 반환 - import time - time.sleep(random.uniform(0.1, 0.5)) # 처리 시간 시뮬레이션 - plate_number, confidence = mock_ocr_result() - else: - # 실제 OCR 처리 - # ... 기존 코드 ... - pass - - # 이하 동일 -``` - ---- - -## 3. K6 설치 및 MQTT 확장 - -### 3.1 xk6-mqtt 빌드 - -MQTT 테스트를 위해 K6에 xk6-mqtt 확장을 추가합니다. 
- -```bash -# xk6 설치 -go install go.k6.io/xk6/cmd/xk6@latest - -# xk6-mqtt 확장 포함하여 K6 빌드 -xk6 build --with github.com/pmalhaire/xk6-mqtt@latest - -# 빌드된 바이너리 확인 -./k6 version -``` - -### 3.2 Docker 이미지 빌드 - -**k6/Dockerfile** -```dockerfile -FROM golang:1.21 as builder - -RUN go install go.k6.io/xk6/cmd/xk6@latest - -RUN xk6 build \ - --with github.com/pmalhaire/xk6-mqtt@latest \ - --output /k6 - -FROM grafana/k6:latest -COPY --from=builder /k6 /usr/bin/k6 -``` - ---- - -## 4. 테스트 시나리오 - -### 4.1 테스트 유형 - -| 테스트 | 목적 | VU | Duration | 특징 | -|--------|------|-----|----------|------| -| **Smoke** | 기본 동작 확인 | 1-5 | 1분 | 최소 부하로 시스템 정상 동작 확인 | -| **Load** | 예상 부하 검증 | 50-100 | 10분 | 일반적인 운영 환경 시뮬레이션 | -| **Stress** | 시스템 한계 확인 | 100-500 | 30분 | 점진적 부하 증가로 Breaking Point 탐색 | -| **Spike** | 급증 대응력 확인 | 10→500→10 | 10분 | 급격한 트래픽 변화 대응 | -| **Soak** | 장시간 안정성 | 100 | 2-4시간 | 메모리 누수, 리소스 고갈 확인 | - -### 4.2 예상 트래픽 기준 - -| 항목 | 값 | 설명 | -|------|-----|------| -| Edge Device 수 | 100대 | 동시 연결 카메라 수 | -| 이벤트/분/디바이스 | 10건 | 분당 과속 감지 이벤트 | -| 총 이벤트/분 | 1,000건 | 피크 시간대 | -| 총 이벤트/초 | ~17건 | 평균 TPS | - ---- - -## 5. 
K6 테스트 스크립트 - -### 5.1 공통 설정 - -**k6/common/config.js** -```javascript -// 환경 변수 또는 기본값 -export const CONFIG = { - // 서비스 URL - MAIN_SERVICE_URL: __ENV.MAIN_SERVICE_URL || 'http://main:8000', - - // MQTT 설정 - MQTT_BROKER: __ENV.MQTT_BROKER || 'tcp://rabbitmq:1883', - MQTT_USER: __ENV.MQTT_USER || 'sa', - MQTT_PASS: __ENV.MQTT_PASS || '1234', - MQTT_TOPIC: 'detections/new', - - // 테스트 데이터 - CAMERAS: ['cam_001', 'cam_002', 'cam_003', 'cam_004', 'cam_005'], - LOCATIONS: [ - '서울시 강남구 테헤란로', - '서울시 서초구 반포대로', - '서울시 송파구 올림픽로', - '경기도 성남시 분당구', - '인천시 연수구 센트럴로', - ], -}; - -// 테스트용 Detection 메시지 생성 -export function generateDetectionMessage() { - const camera = CONFIG.CAMERAS[Math.floor(Math.random() * CONFIG.CAMERAS.length)]; - const location = CONFIG.LOCATIONS[Math.floor(Math.random() * CONFIG.LOCATIONS.length)]; - const speedLimit = [50, 60, 80, 100][Math.floor(Math.random() * 4)]; - const detectedSpeed = speedLimit + Math.random() * 40 + 10; // 제한속도 + 10~50 - - return JSON.stringify({ - camera_id: camera, - location: location, - detected_speed: Math.round(detectedSpeed * 10) / 10, - speed_limit: speedLimit, - detected_at: new Date().toISOString(), - image_gcs_uri: `gs://test-bucket/${camera}/${Date.now()}.jpg`, - }); -} - -// 성능 임계값 (Thresholds) -export const THRESHOLDS = { - // HTTP 요청 - http_req_duration: ['p(95)<500', 'p(99)<1000'], - http_req_failed: ['rate<0.01'], - - // MQTT 발행 - mqtt_publish_duration: ['p(95)<100'], - mqtt_publish_failed: ['rate<0.01'], - - // 커스텀 메트릭 - detection_e2e_duration: ['p(95)<30000'], // End-to-End 30초 이내 -}; -``` - -### 5.2 Smoke Test - -**k6/tests/smoke.js** -```javascript -import mqtt from 'k6/x/mqtt'; -import http from 'k6/http'; -import { check, sleep } from 'k6'; -import { Counter, Trend } from 'k6/metrics'; -import { CONFIG, generateDetectionMessage, THRESHOLDS } from '../common/config.js'; - -// 커스텀 메트릭 -const mqttPublishDuration = new Trend('mqtt_publish_duration'); -const mqttPublishFailed = new 
Counter('mqtt_publish_failed'); - -export const options = { - vus: 3, - duration: '1m', - thresholds: THRESHOLDS, -}; - -// MQTT 클라이언트 (VU당 1개) -const client = new mqtt.Client( - CONFIG.MQTT_BROKER, - `k6-smoke-${__VU}-${Date.now()}` -); - -export function setup() { - // API 헬스체크 - const res = http.get(`${CONFIG.MAIN_SERVICE_URL}/health/`); - check(res, { - 'API is healthy': (r) => r.status === 200, - }); - - console.log('Smoke Test 시작: 기본 동작 확인'); -} - -export default function () { - // MQTT 연결 - client.connect({ - username: CONFIG.MQTT_USER, - password: CONFIG.MQTT_PASS, - }); - - // Detection 메시지 발행 - const message = generateDetectionMessage(); - const startTime = Date.now(); - - try { - client.publish(CONFIG.MQTT_TOPIC, message, 1, false); - mqttPublishDuration.add(Date.now() - startTime); - - check(null, { - 'MQTT publish successful': () => true, - }); - } catch (e) { - mqttPublishFailed.add(1); - console.error(`MQTT publish failed: ${e}`); - } - - client.disconnect(); - - // API 조회 테스트 - const apiRes = http.get(`${CONFIG.MAIN_SERVICE_URL}/api/v1/detections/pending/`); - check(apiRes, { - 'API status 200': (r) => r.status === 200, - 'API response time < 500ms': (r) => r.timings.duration < 500, - }); - - sleep(1); -} - -export function teardown() { - console.log('Smoke Test 완료'); -} -``` - -### 5.3 Load Test - -**k6/tests/load.js** -```javascript -import mqtt from 'k6/x/mqtt'; -import http from 'k6/http'; -import { check, sleep } from 'k6'; -import { Counter, Trend, Rate } from 'k6/metrics'; -import { CONFIG, generateDetectionMessage, THRESHOLDS } from '../common/config.js'; - -// 커스텀 메트릭 -const mqttPublishDuration = new Trend('mqtt_publish_duration'); -const mqttPublishFailed = new Counter('mqtt_publish_failed'); -const detectionCreated = new Counter('detection_created'); - -export const options = { - stages: [ - { duration: '1m', target: 50 }, // Ramp-up - { duration: '8m', target: 50 }, // Steady state - { duration: '1m', target: 0 }, // Ramp-down - ], - 
thresholds: { - ...THRESHOLDS, - 'detection_created': ['count>400'], // 10분간 최소 400건 - }, -}; - -let client; - -export function setup() { - console.log('Load Test 시작: 예상 부하 검증'); - console.log(`Target: 50 VUs, 예상 TPS: ~17/s`); -} - -export default function () { - // VU별 MQTT 클라이언트 생성 - if (!client) { - client = new mqtt.Client( - CONFIG.MQTT_BROKER, - `k6-load-${__VU}-${Date.now()}` - ); - } - - client.connect({ - username: CONFIG.MQTT_USER, - password: CONFIG.MQTT_PASS, - }); - - // 1. MQTT 메시지 발행 (Edge Device 시뮬레이션) - const message = generateDetectionMessage(); - const startTime = Date.now(); - - try { - client.publish(CONFIG.MQTT_TOPIC, message, 1, false); - mqttPublishDuration.add(Date.now() - startTime); - detectionCreated.add(1); - - check(null, { 'MQTT publish OK': () => true }); - } catch (e) { - mqttPublishFailed.add(1); - } - - client.disconnect(); - - // 2. API 부하 (프론트엔드 시뮬레이션) - const endpoints = [ - '/api/v1/detections/', - '/api/v1/detections/pending/', - '/api/v1/detections/statistics/', - ]; - - const endpoint = endpoints[Math.floor(Math.random() * endpoints.length)]; - const apiRes = http.get(`${CONFIG.MAIN_SERVICE_URL}${endpoint}`); - - check(apiRes, { - 'API status 200': (r) => r.status === 200, - 'API response < 500ms': (r) => r.timings.duration < 500, - }); - - // 초당 약 17건 (50 VU * 0.33건/VU/초) - sleep(3); -} - -export function teardown() { - console.log('Load Test 완료'); -} -``` - -### 5.4 Stress Test - -**k6/tests/stress.js** -```javascript -import mqtt from 'k6/x/mqtt'; -import http from 'k6/http'; -import { check, sleep } from 'k6'; -import { Counter, Trend, Gauge } from 'k6/metrics'; -import { CONFIG, generateDetectionMessage, THRESHOLDS } from '../common/config.js'; - -// 커스텀 메트릭 -const mqttPublishDuration = new Trend('mqtt_publish_duration'); -const mqttPublishFailed = new Counter('mqtt_publish_failed'); -const activeVUs = new Gauge('active_vus'); -const queueDepth = new Gauge('estimated_queue_depth'); - -export const options = { - stages: 
[ - // 점진적 부하 증가 - { duration: '2m', target: 50 }, - { duration: '5m', target: 50 }, - { duration: '2m', target: 100 }, - { duration: '5m', target: 100 }, - { duration: '2m', target: 200 }, - { duration: '5m', target: 200 }, - { duration: '2m', target: 300 }, - { duration: '5m', target: 300 }, - // Breaking point 탐색 - { duration: '2m', target: 500 }, - { duration: '5m', target: 500 }, - // Ramp-down - { duration: '2m', target: 0 }, - ], - thresholds: { - http_req_duration: ['p(95)<2000'], // Stress 시 완화 - http_req_failed: ['rate<0.1'], // 10% 미만 실패 허용 - mqtt_publish_failed: ['rate<0.05'], // 5% 미만 실패 허용 - }, -}; - -let client; - -export function setup() { - console.log('Stress Test 시작: 시스템 한계 확인'); - console.log('단계: 50 → 100 → 200 → 300 → 500 VUs'); -} - -export default function () { - activeVUs.add(__VU); - - if (!client) { - client = new mqtt.Client( - CONFIG.MQTT_BROKER, - `k6-stress-${__VU}-${Date.now()}` - ); - } - - try { - client.connect({ - username: CONFIG.MQTT_USER, - password: CONFIG.MQTT_PASS, - }); - - const message = generateDetectionMessage(); - const startTime = Date.now(); - - client.publish(CONFIG.MQTT_TOPIC, message, 1, false); - mqttPublishDuration.add(Date.now() - startTime); - - client.disconnect(); - } catch (e) { - mqttPublishFailed.add(1); - console.error(`Stress error at VU ${__VU}: ${e}`); - } - - // API 부하 - const apiRes = http.get(`${CONFIG.MAIN_SERVICE_URL}/api/v1/detections/`); - check(apiRes, { - 'API responds': (r) => r.status === 200 || r.status === 503, - }); - - // RabbitMQ Queue 깊이 확인 (추정) - try { - const rmqRes = http.get( - 'http://rabbitmq:15672/api/queues/%2F/ocr_queue', - { auth: 'sa:1234' } - ); - if (rmqRes.status === 200) { - const queue = JSON.parse(rmqRes.body); - queueDepth.add(queue.messages || 0); - } - } catch (e) { - // Queue 모니터링 실패 무시 - } - - sleep(1); -} - -export function teardown() { - console.log('Stress Test 완료'); - console.log('Breaking Point 분석 필요'); -} -``` - -### 5.5 Spike Test - -**k6/tests/spike.js** 
-```javascript -import mqtt from 'k6/x/mqtt'; -import http from 'k6/http'; -import { check, sleep } from 'k6'; -import { Counter, Trend } from 'k6/metrics'; -import { CONFIG, generateDetectionMessage } from '../common/config.js'; - -const mqttPublishDuration = new Trend('mqtt_publish_duration'); -const mqttPublishFailed = new Counter('mqtt_publish_failed'); -const recoveryTime = new Trend('recovery_time'); - -export const options = { - stages: [ - // 정상 상태 - { duration: '1m', target: 10 }, - // 급격한 스파이크 - { duration: '10s', target: 500 }, - // 스파이크 유지 - { duration: '1m', target: 500 }, - // 급격한 감소 - { duration: '10s', target: 10 }, - // 정상 상태 복귀 - { duration: '2m', target: 10 }, - // 두 번째 스파이크 - { duration: '10s', target: 300 }, - { duration: '1m', target: 300 }, - { duration: '10s', target: 10 }, - // 복구 확인 - { duration: '2m', target: 10 }, - // 종료 - { duration: '30s', target: 0 }, - ], - thresholds: { - http_req_duration: ['p(95)<3000'], // Spike 시 완화된 임계값 - mqtt_publish_failed: ['rate<0.1'], // 10% 미만 실패 허용 - }, -}; - -let client; -let preSpikeDuration = null; - -export function setup() { - console.log('Spike Test 시작: 급격한 트래픽 변화 대응력 확인'); - console.log('시나리오: 10 → 500 → 10 → 300 → 10 VUs'); -} - -export default function () { - if (!client) { - client = new mqtt.Client( - CONFIG.MQTT_BROKER, - `k6-spike-${__VU}-${Date.now()}` - ); - } - - try { - client.connect({ - username: CONFIG.MQTT_USER, - password: CONFIG.MQTT_PASS, - }); - - const message = generateDetectionMessage(); - const startTime = Date.now(); - - client.publish(CONFIG.MQTT_TOPIC, message, 1, false); - const duration = Date.now() - startTime; - mqttPublishDuration.add(duration); - - // 스파이크 전 기준 응답 시간 저장 - if (__ITER < 60 && !preSpikeDuration) { - preSpikeDuration = duration; - } - - // 복구 시간 측정 (스파이크 후 정상 응답으로 돌아오는 시간) - if (__ITER > 200 && preSpikeDuration) { - if (duration <= preSpikeDuration * 1.5) { - recoveryTime.add(__ITER); - } - } - - client.disconnect(); - } catch (e) { - 
mqttPublishFailed.add(1); - } - - // API 응답 확인 - const apiRes = http.get(`${CONFIG.MAIN_SERVICE_URL}/api/v1/detections/pending/`); - check(apiRes, { - 'API responds during spike': (r) => r.status === 200 || r.status === 503, - }); - - sleep(0.5); -} - -export function teardown() { - console.log('Spike Test 완료'); - console.log('복구 시간 분석 필요'); -} -``` - -### 5.6 Soak Test - -**k6/tests/soak.js** -```javascript -import mqtt from 'k6/x/mqtt'; -import http from 'k6/http'; -import { check, sleep } from 'k6'; -import { Counter, Trend, Gauge } from 'k6/metrics'; -import { CONFIG, generateDetectionMessage } from '../common/config.js'; - -const mqttPublishDuration = new Trend('mqtt_publish_duration'); -const mqttPublishFailed = new Counter('mqtt_publish_failed'); -const memoryUsage = new Gauge('memory_usage_estimate'); -const dbConnections = new Gauge('db_connections_estimate'); - -export const options = { - stages: [ - { duration: '5m', target: 100 }, // Ramp-up - { duration: '4h', target: 100 }, // 4시간 유지 (조정 가능: 2h, 8h) - { duration: '5m', target: 0 }, // Ramp-down - ], - thresholds: { - http_req_duration: ['p(95)<500', 'p(99)<1000'], - http_req_failed: ['rate<0.01'], - mqtt_publish_failed: ['rate<0.01'], - }, -}; - -let client; -let iterationCount = 0; - -export function setup() { - console.log('Soak Test 시작: 장시간 안정성 확인'); - console.log('Duration: 4시간, VUs: 100'); - console.log('확인 항목: 메모리 누수, DB 커넥션 풀 고갈, 성능 저하'); -} - -export default function () { - iterationCount++; - - if (!client) { - client = new mqtt.Client( - CONFIG.MQTT_BROKER, - `k6-soak-${__VU}-${Date.now()}` - ); - } - - try { - client.connect({ - username: CONFIG.MQTT_USER, - password: CONFIG.MQTT_PASS, - }); - - const message = generateDetectionMessage(); - const startTime = Date.now(); - - client.publish(CONFIG.MQTT_TOPIC, message, 1, false); - mqttPublishDuration.add(Date.now() - startTime); - - client.disconnect(); - } catch (e) { - mqttPublishFailed.add(1); - } - - // API 호출 - const apiRes = 
http.get(`${CONFIG.MAIN_SERVICE_URL}/api/v1/detections/`); - check(apiRes, { - 'API status 200': (r) => r.status === 200, - 'API response < 500ms': (r) => r.timings.duration < 500, - }); - - // 주기적으로 시스템 상태 확인 (10분마다) - if (iterationCount % 600 === 0) { - console.log(`Checkpoint at iteration ${iterationCount}`); - - // Health check - const healthRes = http.get(`${CONFIG.MAIN_SERVICE_URL}/health/`); - if (healthRes.status !== 200) { - console.error('Health check failed!'); - } - - // RabbitMQ Queue 상태 - try { - const rmqRes = http.get( - 'http://rabbitmq:15672/api/overview', - { auth: 'sa:1234' } - ); - if (rmqRes.status === 200) { - const overview = JSON.parse(rmqRes.body); - console.log(`RabbitMQ Messages: ${overview.queue_totals?.messages || 0}`); - } - } catch (e) { - // 무시 - } - } - - sleep(1); -} - -export function teardown() { - console.log('Soak Test 완료'); - console.log(`Total iterations: ${iterationCount}`); - console.log('메모리 사용량 그래프 및 성능 추이 분석 필요'); -} -``` - ---- - -## 6. 테스트 실행 방법 - -### 6.1 테스트 환경 시작 - -```bash -# 1. 테스트 환경 시작 -cd docs -docker compose -f docker-compose.test.yml up -d - -# 2. 서비스 준비 대기 -sleep 30 - -# 3. 헬스체크 -curl http://localhost:8000/health/ -``` - -### 6.2 K6 테스트 실행 - -```bash -# Smoke Test (1분) -docker compose -f docker-compose.test.yml run k6 run /scripts/tests/smoke.js - -# Load Test (10분) -docker compose -f docker-compose.test.yml run k6 run /scripts/tests/load.js - -# Stress Test (37분) -docker compose -f docker-compose.test.yml run k6 run /scripts/tests/stress.js - -# Spike Test (9분) -docker compose -f docker-compose.test.yml run k6 run /scripts/tests/spike.js - -# Soak Test (4시간+) -docker compose -f docker-compose.test.yml run k6 run /scripts/tests/soak.js -``` - -### 6.3 결과 확인 - -- **Grafana Dashboard**: http://localhost:3000 -- **RabbitMQ Management**: http://localhost:15672 (sa/1234) -- **Flower (Celery)**: http://localhost:5555 - ---- - -## 7. 
메트릭 및 분석 - -### 7.1 핵심 메트릭 - -| 메트릭 | 설명 | 목표값 | -|--------|------|--------| -| `http_req_duration` (p95) | API 응답 시간 | < 500ms | -| `mqtt_publish_duration` (p95) | MQTT 발행 시간 | < 100ms | -| `http_req_failed` | API 실패율 | < 1% | -| `mqtt_publish_failed` | MQTT 발행 실패율 | < 1% | -| `detection_e2e_duration` | 감지→알림 전체 시간 | < 30초 | - -### 7.2 RabbitMQ 메트릭 - -| 메트릭 | 설명 | 경고 임계값 | -|--------|------|-------------| -| Queue Depth (ocr_queue) | OCR 대기 메시지 수 | > 1000 | -| Queue Depth (fcm_queue) | FCM 대기 메시지 수 | > 500 | -| Consumer Count | 활성 Consumer 수 | = 0 (장애) | -| Message Rate | 초당 메시지 처리량 | 감소 추세 | - -### 7.3 시스템 메트릭 - -| 메트릭 | 설명 | 경고 임계값 | -|--------|------|-------------| -| CPU Usage | CPU 사용률 | > 80% | -| Memory Usage | 메모리 사용률 | > 85% | -| DB Connections | DB 커넥션 수 | > Pool Size 80% | -| Network I/O | 네트워크 트래픽 | 급격한 변화 | - -### 7.4 Grafana Dashboard JSON - -**grafana/dashboards/k6-performance.json** -```json -{ - "dashboard": { - "title": "K6 Performance Test Dashboard", - "panels": [ - { - "title": "Virtual Users", - "type": "graph", - "targets": [ - { - "query": "SELECT mean(\"value\") FROM \"k6_vus\" WHERE $timeFilter GROUP BY time(10s)", - "alias": "VUs" - } - ] - }, - { - "title": "HTTP Request Duration (p95)", - "type": "graph", - "targets": [ - { - "query": "SELECT percentile(\"value\", 95) FROM \"k6_http_req_duration\" WHERE $timeFilter GROUP BY time(10s)", - "alias": "p95" - } - ] - }, - { - "title": "MQTT Publish Duration", - "type": "graph", - "targets": [ - { - "query": "SELECT mean(\"value\") FROM \"k6_mqtt_publish_duration\" WHERE $timeFilter GROUP BY time(10s)", - "alias": "mean" - } - ] - }, - { - "title": "Error Rate", - "type": "graph", - "targets": [ - { - "query": "SELECT sum(\"value\") FROM \"k6_http_req_failed\" WHERE $timeFilter GROUP BY time(10s)", - "alias": "HTTP Errors" - }, - { - "query": "SELECT sum(\"value\") FROM \"k6_mqtt_publish_failed\" WHERE $timeFilter GROUP BY time(10s)", - "alias": "MQTT Errors" - } - ] - }, - { - "title": 
"Requests per Second", - "type": "graph", - "targets": [ - { - "query": "SELECT count(\"value\") FROM \"k6_http_reqs\" WHERE $timeFilter GROUP BY time(1s)", - "alias": "RPS" - } - ] - } - ] - } -} -``` - ---- - -## 8. 테스트 결과 분석 체크리스트 - -### 8.1 Smoke Test -- [ ] 모든 컴포넌트 정상 동작 -- [ ] MQTT → Django → RabbitMQ 흐름 확인 -- [ ] API 응답 정상 - -### 8.2 Load Test -- [ ] 목표 TPS 달성 (17건/초) -- [ ] p95 응답 시간 < 500ms -- [ ] 에러율 < 1% -- [ ] Queue 백로그 축적 없음 - -### 8.3 Stress Test -- [ ] Breaking Point 식별 (VU 수, TPS) -- [ ] 장애 발생 지점 확인 -- [ ] 리소스 병목 구간 확인 (CPU/Memory/DB/Queue) -- [ ] 장애 시 Graceful Degradation 여부 - -### 8.4 Spike Test -- [ ] 스파이크 시 시스템 다운 없음 -- [ ] 복구 시간 측정 -- [ ] 메시지 유실 여부 확인 -- [ ] Auto-scaling 동작 확인 (적용 시) - -### 8.5 Soak Test -- [ ] 메모리 누수 없음 (일정한 메모리 사용량) -- [ ] DB 커넥션 풀 안정 -- [ ] 성능 저하 없음 (시간 경과에 따른 응답 시간) -- [ ] 로그 파일 사이즈 관리 - ---- - -## 9. 트러블슈팅 - -### 9.1 일반적인 문제 - -| 문제 | 원인 | 해결 | -|------|------|------| -| MQTT 연결 실패 | RabbitMQ MQTT Plugin 미활성화 | `rabbitmq-plugins enable rabbitmq_mqtt` | -| Queue 백로그 증가 | Worker 처리량 부족 | Worker concurrency 증가 | -| DB 커넥션 고갈 | Pool Size 부족 | `CONN_MAX_AGE`, `pool_size` 증가 | -| OOM Kill | 메모리 부족 | Container 메모리 제한 증가 | - -### 9.2 성능 병목 해결 - -```bash -# RabbitMQ Queue 상태 확인 -curl -u sa:1234 http://localhost:15672/api/queues/%2F/ocr_queue - -# MySQL 커넥션 상태 확인 -mysql -u sa -p1234 -e "SHOW PROCESSLIST;" - -# Celery Worker 상태 확인 -celery -A config inspect active - -# Docker 리소스 사용량 -docker stats -``` - ---- - -## 10. 권장 테스트 순서 - -``` -1. Smoke Test (1분) - └─ 기본 동작 확인 - -2. Load Test (10분) - └─ 예상 부하 검증 - -3. Stress Test (37분) - └─ 시스템 한계 확인 - -4. Spike Test (9분) - └─ 급증 대응력 확인 - -5. Soak Test (4시간) - └─ 장시간 안정성 확인 -``` - -각 테스트 후 결과를 분석하고, 발견된 문제를 해결한 후 다음 테스트를 진행합니다. - diff --git a/docs/PERFORMANCE_TEST_GUIDE.md b/docs/PERFORMANCE_TEST_GUIDE.md new file mode 100644 index 0000000..12d20ff --- /dev/null +++ b/docs/PERFORMANCE_TEST_GUIDE.md @@ -0,0 +1,864 @@ +# 성능 테스트 가이드 + +SpeedCam IoT 백엔드의 성능, 안정성, 파이프라인 처리 능력을 검증하기 위한 종합 가이드입니다. 
HTTP API 부하 테스트(k6)와 IoT 파이프라인 부하 테스트(MQTT)를 다룹니다. + +--- + +## 목차 + +1. [사전 준비](#1-사전-준비) +2. [모니터링 대시보드](#2-모니터링-대시보드) +3. [HTTP API 부하 테스트 (k6)](#3-http-api-부하-테스트-k6) +4. [IoT 파이프라인 부하 테스트 (MQTT)](#4-iot-파이프라인-부하-테스트-mqtt) +5. [End-to-End 검증 체크리스트](#5-end-to-end-검증-체크리스트) +6. [트러블슈팅](#6-트러블슈팅) +7. [정리 및 종료](#7-정리-및-종료) + +--- + +## 1. 사전 준비 + +### 1.1 필요 도구 + +| 도구 | 용도 | 설치 방법 | +|------|------|----------| +| Docker | 컨테이너 실행 | https://docs.docker.com/get-docker/ | +| Docker Compose | 다중 컨테이너 관리 | Docker Desktop 포함 | +| Python 3.x | MQTT 파이프라인 테스트 | 기본 설치됨 | +| paho-mqtt | MQTT 클라이언트 | `pip install paho-mqtt` | +| curl | API 요청 테스트 | 기본 설치됨 | + +### 1.2 환경 시작 + +**중요**: `docker-compose.yml`이 `speedcam-network`를 생성하고, `docker-compose.monitoring.yml`은 이를 `external: true`로 참조합니다. 반드시 순서대로 또는 `-f` 플래그로 함께 시작하세요. + +```bash +# 방법 1: 앱 + 모니터링 함께 시작 (권장) +cd docker +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d +``` + +```bash +# 방법 2: 순차 시작 +cd docker +docker compose -f docker-compose.yml up -d +docker compose -f docker-compose.monitoring.yml up -d +``` + +### 1.3 macOS 참고사항 + +- **cAdvisor는 Linux 전용**: `docker-compose.monitoring.yml`에 `profiles: [linux]`가 설정되어 있으므로 macOS에서는 자동 제외됩니다 +- **Linux에서 cAdvisor 포함**: + ```bash + docker compose -f docker-compose.yml -f docker-compose.monitoring.yml --profile linux up -d + ``` +- **권장 Docker Desktop 메모리**: 8GB 이상 + +### 1.4 서비스 상태 확인 + +```bash +# 전체 컨테이너 상태 확인 +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml ps +``` + +**예상 상태**: 모든 컨테이너가 `Up` 상태 + +```bash +# Prometheus 타겟 상태 확인 +curl -s http://localhost:9090/api/v1/targets | python3 -c " +import json, sys +data = json.load(sys.stdin) +print('Prometheus Scrape Targets:') +for t in data['data']['activeTargets']: + status = '✓ UP' if t['health'] == 'up' else '✗ DOWN' + print(f\" {t['labels']['job']:20s} {t['health']:5s} {t['labels']['instance']}\") +" +``` + +**예상 출력**: +``` +Prometheus Scrape Targets: + django up 
main:8000 + otel-collector up otel-collector:8889 + rabbitmq up rabbitmq:15692 + mysql up mysqld-exporter:9104 + celery up celery-exporter:9808 + cadvisor up cadvisor:8080 (Linux only) +``` + +--- + +## 2. 모니터링 대시보드 + +### 2.1 접속 정보 + +| 서비스 | URL | 인증 | 용도 | +|--------|-----|------|------| +| **Grafana** | http://localhost:3000 | admin / admin | 통합 대시보드 (메트릭, 로그, 트레이스) | +| **Prometheus** | http://localhost:9090 | 없음 | 메트릭 저장소 및 PromQL 쿼리 | +| **Jaeger** | http://localhost:16686 | 없음 | 분산 트레이싱 UI | +| **RabbitMQ** | http://localhost:15672 | sa / 1234 | 큐 모니터링 | +| **Flower** | http://localhost:5555 | 없음 | Celery 태스크 모니터링 | + +### 2.2 Grafana 대시보드 Import + +시작 시 자동으로 대시보드가 프로비저닝되지만, 추가 대시보드는 수동 import: + +1. Grafana 접속: http://localhost:3000 +2. Dashboards → New → Import +3. Dashboard ID 입력: + +| 대시보드 | ID | 데이터소스 | 용도 | +|---------|-----|-----------|------| +| Django Prometheus | 17658 | Prometheus | HTTP 요청, 응답시간, 에러율 | +| Celery Monitoring | 17509 | Prometheus | 태스크 성공/실패, 큐 깊이 | +| RabbitMQ Overview | 10991 | Prometheus | 메시지 rate, 큐 깊이 | +| MySQL Overview | 14057 | Prometheus | 쿼리 수, 커넥션, 슬로우 쿼리 | +| K6 Load Testing | 19665 | Prometheus | k6 부하 테스트 결과 (실시간) | + +### 2.3 주요 메트릭 보기 + +**Django HTTP 메트릭** (자동 수집): +``` +http://localhost:3000/d/ +``` + +**Jaeger 트레이스** (요청 플로우 추적): +``` +http://localhost:16686 → Services → speedcam-api → 최근 트레이스 보기 +``` + +**Loki 로그** (구조화된 로그): +``` +Grafana → Explore → Data source: Loki +쿼리: {service="main"} +``` + +--- + +## 3. HTTP API 부하 테스트 (k6) + +### 3.1 테스트 대상 + +REST API 엔드포인트 검증 (IoT 파이프라인 제외): + +| 엔드포인트 | 메서드 | 용도 | +|-----------|--------|------| +| `/health/` | GET | 헬스 체크 | +| `/api/v1/vehicles/` | GET, POST, PUT, DELETE | 차량 CRUD | +| `/api/v1/detections/` | GET | 검출 목록 조회 | +| `/api/v1/notifications/` | GET | 알림 목록 조회 | + +### 3.2 설치 + +paho-mqtt는 MQTT 테스트에만 필요합니다. k6 테스트는 Docker 컨테이너에서 실행되므로 호스트 설치 불필요합니다. 
+ +### 3.3 실행 방법 + +```bash +cd docker + +# 기본 실행: Prometheus에 결과 기록 +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml \ + run k6 run --out experimental-prometheus-rw /scripts/load-test.js +``` + +**선택 사항**: 환경 변수 오버라이드 + +```bash +# 커스텀 대상 서버 지정 +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml \ + run -e MAIN_SERVICE_URL=http://localhost:8000 k6 \ + run --out experimental-prometheus-rw /scripts/load-test.js +``` + +### 3.4 시나리오별 테스트 + +`load-test.js`에 3가지 시나리오가 정의되어 있습니다. 각 시나리오는 startTime이 다르므로 한 번의 실행으로 모두 테스트됩니다. + +| 시나리오 | VU 범위 | 시간 | 시작 시간 | 용도 | 기대 결과 | +|----------|---------|------|----------|------|----------| +| **smoke** | 1 | 10초 | 0s | 기본 동작 확인 | 에러 0%, 응답 <100ms | +| **average_load** | 0→10→0 | ~100초 | 15s | 평균 부하 검증 | p95 <500ms, 에러 <1% | +| **spike** | 0→30→0 | ~25초 | 120s | 스파이크 처리 능력 | p99 <1000ms, 에러 <5% | + +**총 실행 시간**: ~2분 30초 + +### 3.5 실행 중 모니터링 + +실시간으로 다른 터미널에서 메트릭 확인: + +```bash +# Prometheus UI에서 확인 +open http://localhost:9090/graph +# 쿼리: rate(k6_http_reqs_total[1m]) +``` + +```bash +# Grafana K6 대시보드 (ID: 19665) 보기 +open http://localhost:3000/d/K6-dashboard +``` + +### 3.6 결과 해석 + +k6 실행 완료 후 stdout에 요약이 표시됩니다: + +``` + checks.........................: 100.00% ✓ 5000 ✗ 0 + data_received..................: 1.2 MB ✓ + data_sent.......................: 850 kB ✓ + http_req_blocked...............: avg=1.2ms min=100µs max=50ms p(90)=2.1ms p(95)=3.5ms + http_req_connecting............: avg=0.8ms min=0µs max=40ms p(90)=1.5ms p(95)=2.2ms + http_req_duration..............: avg=125ms min=50ms max=2s p(90)=350ms p(95)=450ms + http_req_failed................: 0.00% ✓ 0 ✗ 5000 + http_req_receiving.............: avg=2.5ms min=0.5ms max=20ms p(90)=4ms p(95)=5ms + http_req_sending...............: avg=0.5ms min=0.1ms max=5ms p(90)=1ms p(95)=1ms + http_req_tls_handshaking.......: avg=0ms min=0µs max=0s p(90)=0s p(95)=0s + http_req_waiting...............: avg=122ms min=48ms max=1.9s p(90)=348ms p(95)=448ms + 
http_reqs.......................: 5000 199.31/s + iteration_duration.............: avg=2.5s min=2s max=30s p(90)=2.8s p(95)=3.1s + iterations......................: 5000 199.31/s +``` + +**주요 메트릭**: +- `checks`: 테스트 검증 통과율 (100%이어야 함) +- `http_req_duration` (p95): 95% 요청의 응답시간 (목표 <500ms) +- `http_req_failed`: 실패율 (0%이어야 함) +- `http_reqs`: 초당 처리한 요청 수 (RPS) + +### 3.7 맞춤형 시나리오 작성 + +`load-test.js`를 수정하여 커스텀 시나리오를 추가할 수 있습니다. 자세한 내용은 [k6 공식 문서](https://k6.io/docs/get-started/running-k6/)를 참고하세요. + +--- + +## 4. IoT 파이프라인 부하 테스트 (MQTT) + +### 4.1 테스트 대상 + +실제 IoT 카메라 동작을 시뮬레이션하여 전체 파이프라인을 검증합니다: + +``` +MQTT 메시지 발행 (Raspberry Pi 시뮬) + ↓ +RabbitMQ 큐에 저장 + ↓ +Detection 생성 (pending) + ↓ +OCR Worker (이미지 처리) + ↓ +Alert Worker (FCM 알림) + ↓ +완료 (completed) +``` + +### 4.2 사전 준비 + +**호스트에서 실행하는 경우**: +```bash +pip install paho-mqtt +``` + +### 4.3 실행 방법 + +**기본 실행** (호스트): +```bash +python docker/k6/mqtt-load-test.py \ + --workers 5 \ + --rate 2 \ + --duration 60 +``` + +**환경 변수 오버라이드**: +```bash +MQTT_HOST=localhost MQTT_PORT=1883 python docker/k6/mqtt-load-test.py \ + --workers 5 --rate 2 --duration 60 +``` + +### 4.4 테스트 단계별 파라미터 + +| 단계 | Workers | Rate(/s) | Duration | 총 메시지 | 용도 | 예상 처리 시간 | +|------|---------|----------|----------|-----------|------|----------------| +| **Smoke** | 1 | 1 | 10s | ~10 | 기본 동작 확인 | ~30초 | +| **Load** | 5 | 2 | 60s | ~600 | 일반 부하 검증 | ~5분 | +| **Stress** | 20 | 5 | 120s | ~12,000 | 시스템 한계 확인 | ~30분 | +| **Soak** | 5 | 2 | 3600s | ~36,000 | 장시간 안정성 | ~2시간 | + +**추천 시작 순서**: +1. Smoke 테스트로 연결성 확인 +2. Load 테스트로 정상 동작 확인 +3. 
Stress 테스트로 한계 확인 + +### 4.5 메시지 형식 + +MQTT 메시지는 다음 JSON 형식으로 발행됩니다: + +```json +{ + "camera_id": "CAM-001", + "location": "서울시 강남구 테헤란로", + "detected_speed": 95.3, + "speed_limit": 60.0, + "detected_at": "2024-01-01T12:00:00+09:00", + "image_gcs_uri": "gs://speedcam-bucket/detections/1704067200000-1234.jpg" +} +``` + +**필드 설명**: +- `camera_id`: 카메라 ID (CAM-001 ~ CAM-020) +- `location`: 카메라 위치 (실제 한국 도로명) +- `detected_speed`: 감지된 속도 (제한속도 + 5~50km/h 초과) +- `speed_limit`: 해당 구간 제한속도 (60, 80, 100, 110 중 선택) +- `detected_at`: ISO 8601 형식의 감지 시간 (한국 표준시) +- `image_gcs_uri`: GCS에 저장된 이미지 경로 (시뮬레이션용 경로) + +### 4.6 실행 중 모니터링 + +테스트 실행 중 다른 터미널에서 진행 상황을 모니터링합니다: + +**RabbitMQ 큐 상태**: +```bash +curl -s -u sa:1234 http://localhost:15672/api/queues/%2F | python3 -c " +import json, sys +queues = json.load(sys.stdin) +print('RabbitMQ Queue Status:') +for q in queues: + if q['name'] in ('detections_queue', 'ocr_queue', 'fcm_queue'): + print(f\" {q['name']:20s} messages={q.get('messages', 0):6d} consumers={q.get('consumers', 0)}\") +" +``` + +**Celery 태스크 상태**: +```bash +curl -s http://localhost:5555/api/workers | python3 -c " +import json, sys +data = json.load(sys.stdin) +print('Celery Workers:') +for worker, info in data.items(): + print(f\" {worker:30s} {info.get('status', 'unknown')}\") +" +``` + +**Jaeger 트레이스 (선택)**: +```bash +open http://localhost:16686 +# Services → speedcam-api → Detection 또는 OCR 작업 선택 +``` + +### 4.7 결과 확인 + +MQTT 테스트 완료 후 stdout에 통계가 표시됩니다: + +``` +=== MQTT Load Test Complete === +Total Published: 600 +Failed: 0 +Success Rate: 100.00% +Avg Latency: 245ms +Min Latency: 50ms +Max Latency: 1200ms +Total Duration: 65 seconds +Messages/sec: 9.23 +``` + +**해석**: +- **Success Rate**: 100%이어야 함 (메시지 발행 성공) +- **Avg Latency**: MQTT 발행 시간 (네트워크 지연) +- **Total Duration**: 부하 테스트 총 소요 시간 + +--- + +## 5. End-to-End 검증 체크리스트 + +MQTT 부하 테스트 실행 후 다음 항목들을 확인하여 파이프라인이 정상 동작하는지 검증합니다. 
+ +### 5.1 Detection 처리 상태 + +```bash +# 전체 Detection 조회 +curl -s http://localhost:8000/api/v1/detections/ | python3 -c " +import json, sys +data = json.load(sys.stdin) +print(f\"Total Detections: {data['count']}\") +print() + +# 상태별 카운트 추출 +results = data['results'] +status_counts = {} +for r in results: + status = r.get('ocr_status', 'unknown') + status_counts[status] = status_counts.get(status, 0) + 1 + +print('Status Distribution:') +for status, count in sorted(status_counts.items()): + print(f\" {status:15s}: {count:4d}\") +" +``` + +**정상 상태**: +- 대부분이 `completed` 상태 +- 일부 `processing` 또는 `pending` (최근 생성된 건) +- `failed` 건이 있으면 OCR Worker 로그 확인: `docker logs speedcam-ocr` + +```bash +# 최근 생성된 Detection 확인 +curl -s "http://localhost:8000/api/v1/detections/?ordering=-detected_at&limit=5" | \ + python3 -m json.tool | head -50 +``` + +### 5.2 Jaeger 분산 트레이스 확인 + +트레이스를 통해 요청이 전체 시스템을 거치는 과정을 추적합니다. + +```bash +# 사용 가능한 서비스 확인 +curl -s http://localhost:16686/api/services | python3 -c " +import json, sys +data = json.load(sys.stdin) +print('Jaeger Services:') +for service in data['data']: + print(f\" - {service}\") +" +``` + +**예상 서비스**: +- `speedcam-api`: Django 메인 애플리케이션 +- `speedcam-ocr`: OCR Worker (Celery) +- `speedcam-alert`: Alert Worker (Celery) + +```bash +# 최근 트레이스 조회 (speedcam-api) +curl -s "http://localhost:16686/api/traces?service=speedcam-api&limit=3" | python3 -c " +import json, sys +data = json.load(sys.stdin) +print('Recent Traces (speedcam-api):') +for trace in data['data'][:3]: + trace_id = trace['traceID'][:16] + num_spans = len(trace['spans']) + operation = trace['spans'][0]['operationName'] + duration_ms = (trace['spans'][0]['endTime'] - trace['spans'][0]['startTime']) / 1000 + print(f\" {trace_id}... 
| Spans: {num_spans:2d} | {operation:30s} | {duration_ms:6.1f}ms\") +" +``` + +**정상 구성**: +- Health Check: 1-2 spans (빠름) +- Vehicle Create: 3-5 spans (DB 쿼리 포함) +- Detection Create: 5-10 spans (MQTT, RabbitMQ, DB) +- OCR Task: 7-15 spans (GCS, API, DB) + +### 5.3 Loki 로그 확인 + +구조화된 로그를 통해 각 컴포넌트의 동작을 확인합니다. + +```bash +# Loki에서 수집된 로그 스트림 확인 +curl -sG http://localhost:3100/loki/api/v1/labels | python3 -c " +import json, sys +data = json.load(sys.stdin) +print('Loki Labels:') +print(f\" Available labels: {', '.join(data['data'][:5])}...\") +" +``` + +```bash +# speedcam 컨테이너의 최근 로그 (Loki) +curl -sG "http://localhost:3100/loki/api/v1/query" \ + --data-urlencode 'query={container=~"speedcam.*"}' \ + --data-urlencode 'limit=10' | python3 -c " +import json, sys +data = json.load(sys.stdin) +streams = data['data']['result'] +print(f'Log Streams Found: {len(streams)}') +for stream in streams[:3]: + container = stream['stream'].get('container', 'unknown') + num_entries = len(stream['values']) + print(f\" {container:30s}: {num_entries} log entries\") +" +``` + +**Grafana UI에서 로그 보기**: +1. Grafana → Explore → Loki +2. 쿼리: `{container=~"speedcam.*"}` +3. 각 로그 라인의 `trace_id=` 클릭 → Jaeger 트레이스 자동 이동 + +### 5.4 RabbitMQ 큐 상태 + +MQTT 메시지 처리 파이프라인의 큐 상태를 확인합니다. 
+ +```bash +# 큐별 메시지 수 확인 +curl -s -u sa:1234 http://localhost:15672/api/queues/%2F | python3 -c " +import json, sys +queues = json.load(sys.stdin) +print('RabbitMQ Queue Status:') +print(f\"{'Queue Name':<20} {'Messages':>10} {'Consumers':>10} {'Ready':>10} {'Unacked':>10}\") +print('-' * 60) +for q in queues: + if q['name'] in ('detections_queue', 'ocr_queue', 'fcm_queue', 'dlq_queue'): + name = q['name'] + msgs = q.get('messages', 0) + consumers = q.get('consumers', 0) + ready = q.get('messages_ready', 0) + unacked = q.get('messages_unacknowledged', 0) + print(f'{name:<20} {msgs:>10} {consumers:>10} {ready:>10} {unacked:>10}') +" +``` + +**정상 상태**: +- **detections_queue**: 0 (Detection 생성 후 즉시 처리) +- **ocr_queue**: 0-10 (처리 중) +- **fcm_queue**: 0-5 (처리 중) +- **dlq_queue**: 0 (에러 없음) +- **consumers**: 각 큐당 1 이상 (worker가 리스닝 중) + +### 5.5 Prometheus 메트릭 확인 + +시스템 성능 메트릭을 Prometheus PromQL로 확인합니다. + +```bash +# Django HTTP 요청 메트릭 +curl -s http://localhost:9090/api/v1/query --data-urlencode \ + 'query=rate(django_http_requests_total[5m])' | python3 -c " +import json, sys +data = json.load(sys.stdin) +result = data['data']['result'] +if result: + print(f'Django HTTP Request Rate: {len(result)} series found') + print(f' Current RPS: {float(result[0][\"value\"][1]):.1f}') +else: + print('No Django metrics found') +" +``` + +```bash +# Celery 태스크 메트릭 +curl -s http://localhost:9090/api/v1/query --data-urlencode \ + 'query=rate(celery_tasks_total[5m])' | python3 -c " +import json, sys +data = json.load(sys.stdin) +result = data['data']['result'] +if result: + print(f'Celery Task Rate: {len(result)} series found') + for r in result[:3]: + state = r['metric'].get('state', 'unknown') + rate = float(r['value'][1]) + print(f\" {state:10s}: {rate:.1f} tasks/sec\") +else: + print('No Celery metrics found') +" +``` + +### 5.6 DB 성능 메트릭 + +```bash +# MySQL 활성 커넥션 수 +curl -s http://localhost:9090/api/v1/query --data-urlencode \ + 'query=mysql_global_status_threads_connected' | 
python3 -c " +import json, sys +data = json.load(sys.stdin) +result = data['data']['result'] +if result: + value = float(result[0]['value'][1]) + print(f'Active MySQL Connections: {int(value)}') +else: + print('No MySQL metrics found') +" +``` + +### 5.7 컨테이너 리소스 사용률 + +```bash +# 각 컨테이너 CPU 사용률 (%) - cAdvisor 필요 +curl -s http://localhost:9090/api/v1/query --data-urlencode \ + 'query=rate(container_cpu_usage_seconds_total{name=~"speedcam-.*"}[5m])*100' | python3 -c " +import json, sys +data = json.load(sys.stdin) +result = data['data']['result'] +if result: + print('Container CPU Usage (%):') + for r in result[:5]: + name = r['metric'].get('name', 'unknown') + cpu_usage = float(r['value'][1]) + print(f\" {name:30s}: {cpu_usage:6.2f}%\") +else: + print('No cAdvisor metrics found (Linux only)') +" +``` + +```bash +# 각 컨테이너 메모리 사용량 (MB) - cAdvisor 필요 +curl -s http://localhost:9090/api/v1/query --data-urlencode \ + 'query=container_memory_usage_bytes{name=~"speedcam-.*"}/1024/1024' | python3 -c " +import json, sys +data = json.load(sys.stdin) +result = data['data']['result'] +if result: + print('Container Memory Usage (MB):') + for r in result[:5]: + name = r['metric'].get('name', 'unknown') + memory_mb = float(r['value'][1]) + print(f\" {name:30s}: {memory_mb:7.1f} MB\") +else: + print('No cAdvisor metrics found (Linux only)') +" +``` + +--- + +## 6. 트러블슈팅 + +### 6.1 docker-compose 실행 오류 + +**오류**: `network speedcam-network not found` + +**원인**: 모니터링 스택만 단독으로 시작함 + +**해결**: +```bash +# 앱 스택을 먼저 시작 +docker compose -f docker-compose.yml up -d + +# 그 다음 모니터링 추가 +docker compose -f docker-compose.monitoring.yml up -d + +# 또는 함께 시작 +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d +``` + +### 6.2 Prometheus 타겟이 DOWN + +**오류**: Prometheus 대시보드에서 일부 타겟이 DOWN 상태 + +**celery-exporter가 재시작되는 경우**: +- **원인**: RabbitMQ보다 먼저 시작되어 broker 연결 실패 +- **해결**: 자동 복구됨 (`restart: unless-stopped`). 
30초 기다린 후 확인 + +**mysqld-exporter가 DOWN**: +- **원인**: MySQL보다 먼저 시작됨 +- **해결**: 자동 복구됨. 30초 기다린 후 확인 + +**django가 DOWN**: +- **원인**: 앱 시작 실패 +- **해결**: + ```bash + docker logs speedcam-main + ``` + +### 6.3 cAdvisor 시작 실패 (macOS) + +**오류**: `cadvisor: error setting oom score: open /proc/.../oom_score_adj: no such file or directory` + +**원인**: cAdvisor는 Linux 전용이며 /proc 파일시스템 필요 + +**해결**: 예상 동작. macOS에서는 자동으로 제외됨 (`profiles: [linux]`). Linux에서만 실행하세요. + +### 6.4 Jaeger에 트레이스가 없음 + +**오류**: Jaeger UI에서 데이터가 보이지 않음 + +**원인**: OTEL_EXPORTER_OTLP_ENDPOINT 미설정 + +**해결**: +```bash +# backend.env 확인 +cat docker/backend.env | grep OTEL_EXPORTER_OTLP_ENDPOINT + +# 없으면 추가 +echo 'OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317' >> docker/backend.env + +# 앱 재시작 +docker compose -f docker-compose.yml up -d --force-recreate speedcam-main +``` + +### 6.5 Loki 429 Too Many Requests + +**오류**: Loki 쿼리 실패 with 429 status + +**원인**: 로그 스트림이 너무 많음 (한계 초과) + +**해결**: +```bash +# loki-config.yml에서 한계 증가 +# docker/monitoring/loki/loki-config.yml 수정 +# limits_config: +# max_global_streams_per_user: 20000 # 기본값 10000에서 증가 +``` + +### 6.6 환경 변수 변경 후 반영 안됨 + +**오류**: backend.env 변경 후 앱에 반영 안됨 + +**원인**: Docker restart는 env_file을 다시 읽지 않음 + +**해결**: +```bash +# --force-recreate 사용 +docker compose -f docker-compose.yml up -d --force-recreate speedcam-main +``` + +### 6.7 MQTT 연결 실패 + +**오류**: `python mqtt-load-test.py` 실행 시 연결 실패 + +**원인**: RabbitMQ MQTT 플러그인 미활성화 + +**확인**: +```bash +docker logs speedcam-rabbitmq | grep -i mqtt +# "MQTT plugin loaded" 메시지가 있어야 함 +``` + +**해결** (이미 자동 활성화됨): +docker-compose.yml의 rabbitmq command에 `rabbitmq_mqtt` 플러그인이 포함되어 있는지 확인 + +### 6.8 k6 테스트 타임아웃 + +**오류**: k6 테스트 중 `dial tcp: lookup main: no such host` + +**원인**: k6 컨테이너가 speedcam-network에 연결되지 않음 + +**해결**: docker-compose.monitoring.yml에서 k6 서비스가 올바른 네트워크 설정이 있는지 확인 + +```yaml +networks: + - speedcam-network # speedcam-network 참조 +``` + +--- + +## 7. 
정리 및 종료 + +### 7.1 전체 종료 및 데이터 제거 + +```bash +cd docker + +# 컨테이너 + 볼륨 완전 제거 +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml down -v +``` + +### 7.2 모니터링 데이터만 삭제 + +런타임 데이터(Prometheus, Grafana, Loki)를 초기화합니다: + +```bash +rm -rf docker/monitoring/prometheus/data \ + docker/monitoring/loki/data \ + docker/monitoring/grafana/data +``` + +다시 시작하면 초기 상태로 복구됩니다. + +### 7.3 모니터링 스택만 종료 (앱 유지) + +앱은 계속 실행하고 모니터링만 종료: + +```bash +cd docker + +docker compose -f docker-compose.monitoring.yml down +``` + +나중에 모니터링을 다시 시작: + +```bash +docker compose -f docker-compose.monitoring.yml up -d +``` + +### 7.4 특정 컨테이너만 종료 + +```bash +# 개별 서비스 종료 +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml \ + stop speedcam-main speedcam-ocr + +# 개별 서비스 재시작 +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml \ + restart speedcam-main +``` + +--- + +## 8. 추가 자료 + +### 8.1 관련 문서 + +- [모니터링 스택 가이드](./MONITORING.md): 아키텍처, 설정, 메트릭 상세 설명 +- [배포 가이드](./DEPLOYMENT.md): GCP 멀티 인스턴스 배포 방법 +- [아키텍처 비교](./ARCHITECTURE_COMPARISON.md): 시스템 설계 이유 + +### 8.2 외부 자료 + +- [k6 공식 문서](https://k6.io/docs/) +- [Prometheus PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/) +- [Grafana 대시보드](https://grafana.com/grafana/dashboards/) +- [Jaeger 분산 트레이싱](https://www.jaegertracing.io/docs/) +- [OpenTelemetry Python](https://opentelemetry.io/docs/instrumentation/python/) + +### 8.3 자주 사용하는 명령어 + +```bash +# 모니터링 스택 전체 확인 +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml ps + +# 로그 실시간 추적 +docker logs -f speedcam-main +docker logs -f speedcam-ocr +docker logs -f speedcam-alert + +# 모니터링 데이터 초기화 후 재시작 +rm -rf docker/monitoring/*/data +docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d + +# Prometheus 메트릭 직접 조회 +curl -s http://localhost:9090/api/v1/query --data-urlencode 'query=' + +# RabbitMQ 큐 확인 +curl -s -u sa:1234 http://localhost:15672/api/queues/%2F + +# Jaeger 서비스 확인 +curl -s 
http://localhost:16686/api/services + +# Docker 디스크 정리 (주의: 사용하지 않는 모든 이미지/볼륨 제거) +docker system prune -a --volumes +``` + +--- + +## 9. FAQ + +**Q: k6과 MQTT 테스트 중 어느 것을 먼저 실행해야 하나요?** + +A: k6을 먼저 실행하세요. k6은 REST API만 테스트하므로 (순수 읽기 작업) 데이터베이스 상태에 영향을 주지 않습니다. MQTT 테스트는 실제 Detection을 생성하므로 나중에 실행하는 것이 좋습니다. + +**Q: 부하 테스트 중 시스템이 느려집니다. 어떻게 해야 하나요?** + +A: 정상입니다. 먼저 메트릭을 확인하세요: +1. Prometheus에서 CPU/메모리 사용률 확인 +2. RabbitMQ 큐 깊이 확인 (메시지 밀림) +3. MySQL 커넥션 풀 상태 확인 +4. 필요하면 docker-compose.yml의 리소스 제한(`resources`) 조정 + +**Q: 테스트 결과를 저장하고 싶습니다.** + +A: k6은 자동으로 Prometheus에 메트릭을 기록합니다. Prometheus → Export로 데이터를 JSON/CSV로 내보낼 수 있습니다. MQTT 테스트의 경우 stdout을 파일로 리다이렉트합니다: +```bash +python docker/k6/mqtt-load-test.py ... > test_results.txt +``` + +**Q: 모니터링 없이 성능 테스트를 실행할 수 있나요?** + +A: 가능합니다. k6 또는 MQTT 테스트 스크립트는 독립적으로 실행할 수 있습니다. 하지만 모니터링 없으면 결과를 측정하고 분석하기 어렵습니다. + +**Q: 프로덕션 환경에서 어떻게 테스트하나요?** + +A: 이 가이드는 로컬/개발 환경 기준입니다. 프로덕션 배포는 [GCP 멀티 인스턴스 배포 가이드](./DEPLOYMENT.md)를 참고하세요. 프로덕션에서는: +1. 전용 모니터링 인스턴스 사용 +2. Prometheus 보안 설정 (인증, TLS) +3. 백그라운드에서 정기적인 스모크 테스트 실행 +4. 알림 규칙(Alert) 설정 + +--- + +마지막 업데이트: 2024년 1월 diff --git a/requirements/base.txt b/requirements/base.txt index 58478e2..0a0c47b 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -24,6 +24,14 @@ pytz==2025.1 drf-yasg==1.21.10 PyYAML==6.0.2 -# APM -ddtrace +# Observability (OpenTelemetry) +opentelemetry-distro +opentelemetry-exporter-otlp +opentelemetry-instrumentation-django +opentelemetry-instrumentation-celery +opentelemetry-instrumentation-pymysql +opentelemetry-instrumentation-requests +opentelemetry-instrumentation-logging +# Prometheus Metrics +django-prometheus diff --git a/scripts/start_alert_worker.sh b/scripts/start_alert_worker.sh index 6c424f7..24bda15 100644 --- a/scripts/start_alert_worker.sh +++ b/scripts/start_alert_worker.sh @@ -4,7 +4,9 @@ set -e echo "Starting Alert Worker (Celery)..." 
# Celery Worker 시작 (gevent pool - I/O 집약적) -ddtrace-run celery -A config worker \ +opentelemetry-instrument \ + --service_name speedcam-alert \ + celery -A config worker \ --pool=gevent \ --concurrency=${ALERT_CONCURRENCY:-100} \ --queues=fcm_queue \ diff --git a/scripts/start_main.sh b/scripts/start_main.sh index a7a4499..187a61d 100644 --- a/scripts/start_main.sh +++ b/scripts/start_main.sh @@ -45,7 +45,9 @@ start_mqtt_subscriber() # Gunicorn 시작 echo "Starting Gunicorn..." -ddtrace-run gunicorn config.wsgi:application \ +opentelemetry-instrument \ + --service_name speedcam-api \ + gunicorn config.wsgi:application \ --bind 0.0.0.0:8000 \ --workers ${GUNICORN_WORKERS:-4} \ --threads ${GUNICORN_THREADS:-2} \ diff --git a/scripts/start_ocr_worker.sh b/scripts/start_ocr_worker.sh index c884272..40557a6 100644 --- a/scripts/start_ocr_worker.sh +++ b/scripts/start_ocr_worker.sh @@ -4,7 +4,9 @@ set -e echo "Starting OCR Worker (Celery)..." # Celery Worker 시작 (prefork pool - CPU 집약적) -ddtrace-run celery -A config worker \ +opentelemetry-instrument \ + --service_name speedcam-ocr \ + celery -A config worker \ --pool=prefork \ --concurrency=${OCR_CONCURRENCY:-4} \ --queues=ocr_queue \ diff --git a/terraform/.terraform.lock.hcl b/terraform/.terraform.lock.hcl deleted file mode 100644 index 8289be8..0000000 --- a/terraform/.terraform.lock.hcl +++ /dev/null @@ -1,60 +0,0 @@ -# This file is maintained automatically by "terraform init". -# Manual edits may be lost in future updates. 
- -provider "registry.terraform.io/hashicorp/google" { - version = "5.45.2" - constraints = "~> 5.0" - hashes = [ - "h1:iy2Q9VcnMu4z/bH3v/NmI/nEpgYY7bXgJmT/hVTAUS4=", - "zh:0d09c8f20b556305192cdbe0efa6d333ceebba963a8ba91f9f1714b5a20c4b7a", - "zh:117143fc91be407874568df416b938a6896f94cb873f26bba279cedab646a804", - "zh:16ccf77d18dd2c5ef9c0625f9cf546ebdf3213c0a452f432204c69feed55081e", - "zh:3e555cf22a570a4bd247964671f421ed7517970cd9765ceb46f335edc2c6f392", - "zh:688bd5b05a75124da7ae6e885b2b92bd29f4261808b2b78bd5f51f525c1052ca", - "zh:6db3ef37a05010d82900bfffb3261c59a0c247e0692049cb3eb8c2ef16c9d7bf", - "zh:70316fde75f6a15d72749f66d994ccbdde5f5ed4311b6d06b99850f698c9bbf9", - "zh:84b8e583771a4f2bd514e519d98ed7fd28dce5efe0634e973170e1cfb5556fb4", - "zh:9d4b8ef0a9b6677935c604d94495042e68ff5489932cfd1ec41052e094a279d3", - "zh:a2089dd9bd825c107b148dd12d6b286f71aa37dfd4ca9c35157f2dcba7bc19d8", - "zh:f03d795c0fd9721e59839255ee7ba7414173017dc530b4ce566daf3802a0d6dd", - "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", - ] -} - -provider "registry.terraform.io/hashicorp/null" { - version = "3.2.4" - hashes = [ - "h1:L5V05xwp/Gto1leRryuesxjMfgZwjb7oool4WS1UEFQ=", - "zh:59f6b52ab4ff35739647f9509ee6d93d7c032985d9f8c6237d1f8a59471bbbe2", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:795c897119ff082133150121d39ff26cb5f89a730a2c8c26f3a9c1abf81a9c43", - "zh:7b9c7b16f118fbc2b05a983817b8ce2f86df125857966ad356353baf4bff5c0a", - "zh:85e33ab43e0e1726e5f97a874b8e24820b6565ff8076523cc2922ba671492991", - "zh:9d32ac3619cfc93eb3c4f423492a8e0f79db05fec58e449dee9b2d5873d5f69f", - "zh:9e15c3c9dd8e0d1e3731841d44c34571b6c97f5b95e8296a45318b94e5287a6e", - "zh:b4c2ab35d1b7696c30b64bf2c0f3a62329107bd1a9121ce70683dec58af19615", - "zh:c43723e8cc65bcdf5e0c92581dcbbdcbdcf18b8d2037406a5f2033b1e22de442", - "zh:ceb5495d9c31bfb299d246ab333f08c7fb0d67a4f82681fbf47f2a21c3e11ab5", - "zh:e171026b3659305c558d9804062762d168f50ba02b88b231d20ec99578a6233f", - 
"zh:ed0fe2acdb61330b01841fa790be00ec6beaac91d41f311fb8254f74eb6a711f", - ] -} - -provider "registry.terraform.io/hashicorp/time" { - version = "0.13.1" - hashes = [ - "h1:ZT5ppCNIModqk3iOkVt5my8b8yBHmDpl663JtXAIRqM=", - "zh:02cb9aab1002f0f2a94a4f85acec8893297dc75915f7404c165983f720a54b74", - "zh:04429b2b31a492d19e5ecf999b116d396dac0b24bba0d0fb19ecaefe193fdb8f", - "zh:26f8e51bb7c275c404ba6028c1b530312066009194db721a8427a7bc5cdbc83a", - "zh:772ff8dbdbef968651ab3ae76d04afd355c32f8a868d03244db3f8496e462690", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:898db5d2b6bd6ca5457dccb52eedbc7c5b1a71e4a4658381bcbb38cedbbda328", - "zh:8de913bf09a3fa7bedc29fec18c47c571d0c7a3d0644322c46f3aa648cf30cd8", - "zh:9402102c86a87bdfe7e501ffbb9c685c32bbcefcfcf897fd7d53df414c36877b", - "zh:b18b9bb1726bb8cfbefc0a29cf3657c82578001f514bcf4c079839b6776c47f0", - "zh:b9d31fdc4faecb909d7c5ce41d2479dd0536862a963df434be4b16e8e4edc94d", - "zh:c951e9f39cca3446c060bd63933ebb89cedde9523904813973fbc3d11863ba75", - "zh:e5b773c0d07e962291be0e9b413c7a22c044b8c7b58c76e8aa91d1659990dfb5", - ] -} diff --git a/terraform/README.md b/terraform/README.md deleted file mode 100644 index 99ccc8e..0000000 --- a/terraform/README.md +++ /dev/null @@ -1,385 +0,0 @@ -# Speedcam MSA - Terraform 배포 가이드 - -GCP(Google Cloud Platform)에 Speedcam MSA 인프라를 자동으로 배포하기 위한 Terraform 구성입니다. 
- -## 아키텍처 - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Google Cloud Platform │ -│ │ -│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -│ │ Main │ │ OCR │ │ Alert │ │ -│ │ (Django) │ │ (Celery) │ │ (Celery) │ │ -│ │ e2-medium │ │ e2-medium │ │ e2-small │ │ -│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ -│ │ │ │ │ -│ └────────────────┼────────────────┘ │ -│ │ │ -│ ┌───────────┴───────────┐ │ -│ │ │ │ -│ ┌──────┴──────┐ ┌──────┴──────┐ │ -│ │ RabbitMQ │ │ MySQL │ │ -│ │ e2-small │ │ e2-small │ │ -│ │ MQTT/AMQP │ │ 4 DBs │ │ -│ └─────────────┘ └─────────────┘ │ -│ │ -│ ┌─────────────────────────────────────────────────────────────┐ │ -│ │ Artifact Registry │ │ -│ │ speedcam/{main,ocr,alert} │ │ -│ └─────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -## 파일 구조 - -``` -terraform/ -├── main.tf # Provider 설정 및 로컬 변수 -├── variables.tf # 입력 변수 정의 -├── network.tf # 방화벽 규칙 -├── artifact_registry.tf # 컨테이너 레지스트리 -├── instances_infra.tf # RabbitMQ, MySQL 인스턴스 -├── instances_services.tf # Main, OCR, Alert 인스턴스 -├── outputs.tf # 출력 변수 -├── terraform.tfvars.example # 변수 예제 파일 -└── README.md # 이 문서 -``` - -## 사전 요구사항 - -1. **Terraform 설치** (v1.0 이상) - ```bash - # macOS - brew install terraform - - # Linux - wget https://releases.hashicorp.com/terraform/1.7.0/terraform_1.7.0_linux_amd64.zip - unzip terraform_1.7.0_linux_amd64.zip - sudo mv terraform /usr/local/bin/ - ``` - -2. **Google Cloud SDK 설치 및 인증** - ```bash - # 인증 - gcloud auth login - gcloud auth application-default login - - # 프로젝트 설정 - gcloud config set project YOUR_PROJECT_ID - ``` - -3. **필요한 API 활성화** - ```bash - gcloud services enable compute.googleapis.com - gcloud services enable artifactregistry.googleapis.com - ``` - -4. **Docker 이미지 빌드 및 푸시** - ```bash - # 프로젝트 루트에서 - make build - make push - ``` - -## 빠른 시작 - -### 1. 
변수 파일 생성 - -```bash -cd terraform -cp terraform.tfvars.example terraform.tfvars -``` - -### 2. 변수 파일 수정 - -```hcl -# terraform.tfvars -project_id = "your-actual-project-id" - -# 보안을 위해 강력한 비밀번호 설정 -db_password = "your-secure-db-password" -db_root_password = "your-secure-root-password" -rabbitmq_password = "your-secure-rabbitmq-password" - -# 환경 설정 (dev, staging, prod) -environment = "dev" -``` - -### 3. Terraform 초기화 - -```bash -terraform init -``` - -### 4. 배포 계획 확인 - -```bash -terraform plan -``` - -### 5. 인프라 배포 - -```bash -terraform apply -``` - -## 변수 설명 - -### 필수 변수 - -| 변수 | 설명 | 예시 | -|------|------|------| -| `project_id` | GCP 프로젝트 ID | `my-project-123` | - -### 선택 변수 (기본값 제공) - -| 변수 | 기본값 | 설명 | -|------|--------|------| -| `region` | `asia-northeast3` | GCP 리전 (서울) | -| `zone` | `asia-northeast3-a` | GCP 존 | -| `environment` | `dev` | 환경 (dev/staging/prod) | -| `db_name` | `speedcam` | 기본 데이터베이스 이름 | -| `db_user` | `sa` | 데이터베이스 사용자 | -| `db_password` | `sa` | 데이터베이스 비밀번호 | -| `rabbitmq_user` | `sa` | RabbitMQ 사용자 | -| `rabbitmq_password` | `sa` | RabbitMQ 비밀번호 | -| `machine_type_small` | `e2-small` | 작은 인스턴스 타입 | -| `machine_type_medium` | `e2-medium` | 중간 인스턴스 타입 | -| `ocr_concurrency` | `2` | OCR 워커 동시성 | -| `alert_concurrency` | `50` | Alert 워커 동시성 | -| `ocr_mock` | `true` | OCR 모킹 여부 | -| `fcm_mock` | `true` | FCM 모킹 여부 | - -## 출력 값 - -배포 완료 후 다음 정보를 확인할 수 있습니다: - -```bash -# 모든 출력 확인 -terraform output - -# 특정 출력 확인 -terraform output api_url -terraform output swagger_url -terraform output deployment_summary -``` - -### 주요 출력 - -- `api_url` - API 기본 URL -- `swagger_url` - Swagger UI URL -- `health_url` - 헬스 체크 URL -- `rabbitmq_management_url` - RabbitMQ 관리 UI URL -- `registry_url` - Artifact Registry URL -- `deployment_summary` - 전체 배포 요약 - -## 환경별 배포 - -### 개발 환경 - -```hcl -# terraform.tfvars -environment = "dev" -ocr_mock = true -fcm_mock = true -machine_type_small = "e2-small" -machine_type_medium = "e2-medium" -``` - -### 스테이징 환경 - -```hcl -# 
terraform.tfvars -environment = "staging" -ocr_mock = false -fcm_mock = true -machine_type_small = "e2-small" -machine_type_medium = "e2-medium" -``` - -### 프로덕션 환경 - -```hcl -# terraform.tfvars -environment = "prod" -ocr_mock = false -fcm_mock = false -machine_type_small = "e2-medium" -machine_type_medium = "e2-standard-2" -ocr_concurrency = 4 -alert_concurrency = 100 -``` - -## Workspace 활용 - -여러 환경을 관리하려면 Terraform Workspace를 사용할 수 있습니다: - -```bash -# Workspace 생성 -terraform workspace new dev -terraform workspace new staging -terraform workspace new prod - -# Workspace 전환 -terraform workspace select dev - -# 현재 Workspace 확인 -terraform workspace show - -# Workspace 목록 -terraform workspace list -``` - -## 상태 관리 - -### 로컬 상태 (기본) - -기본적으로 상태 파일은 로컬에 저장됩니다: -- `terraform.tfstate` -- `terraform.tfstate.backup` - -### 원격 상태 (권장) - -팀 협업을 위해 GCS 백엔드 사용을 권장합니다: - -```hcl -# main.tf에 추가 -terraform { - backend "gcs" { - bucket = "your-terraform-state-bucket" - prefix = "speedcam/terraform/state" - } -} -``` - -백엔드 설정 후: -```bash -terraform init -migrate-state -``` - -## 인프라 업데이트 - -### 이미지 태그 변경 - -```bash -terraform apply -var="image_tag=v1.2.0" -``` - -### 인스턴스 타입 변경 - -```bash -terraform apply -var="machine_type_medium=e2-standard-2" -``` - -### 특정 리소스만 재생성 - -```bash -# Main 서비스만 재생성 -terraform taint google_compute_instance.main -terraform apply - -# 마이그레이션만 재실행 -terraform taint null_resource.run_migrations -terraform apply -``` - -## 인프라 삭제 - -```bash -# 전체 삭제 (확인 필요) -terraform destroy - -# 자동 승인으로 삭제 -terraform destroy -auto-approve -``` - -## 문제 해결 - -### 1. 인스턴스 시작 실패 - -```bash -# 인스턴스 로그 확인 -gcloud compute instances get-serial-port-output speedcam-main --zone=asia-northeast3-a - -# 컨테이너 로그 확인 -gcloud compute ssh speedcam-main --zone=asia-northeast3-a \ - --command="docker logs \$(docker ps -q)" -``` - -### 2. 
데이터베이스 연결 실패 - -```bash -# MySQL 상태 확인 -gcloud compute ssh speedcam-mysql --zone=asia-northeast3-a \ - --command="docker exec \$(docker ps -q) mysqladmin -u root -p status" -``` - -### 3. RabbitMQ 연결 실패 - -```bash -# RabbitMQ 상태 확인 -gcloud compute ssh speedcam-rabbitmq --zone=asia-northeast3-a \ - --command="docker exec \$(docker ps -q) rabbitmqctl status" -``` - -### 4. Terraform 상태 문제 - -```bash -# 상태 새로고침 -terraform refresh - -# 상태에서 리소스 제거 (실제 리소스는 유지) -terraform state rm google_compute_instance.main - -# 기존 리소스 가져오기 -terraform import google_compute_instance.main speedcam-main -``` - -## 비용 최적화 - -### 예상 월간 비용 (asia-northeast3 기준) - -| 리소스 | 타입 | 예상 비용 | -|--------|------|----------| -| Main | e2-medium | ~$25 | -| OCR | e2-medium | ~$25 | -| Alert | e2-small | ~$13 | -| RabbitMQ | e2-small | ~$13 | -| MySQL | e2-small + SSD | ~$18 | -| **총계** | | **~$94/월** | - -### 비용 절감 팁 - -1. **Preemptible VM 사용** (개발 환경) - ```hcl - scheduling { - preemptible = true - } - ``` - -2. **자동 시작/중지 스케줄링** - - Cloud Scheduler로 업무 시간 외 인스턴스 중지 - -3. **Committed Use Discounts** - - 1년/3년 약정으로 최대 57% 할인 - -## 보안 고려사항 - -1. **비밀번호 관리** - - Secret Manager 사용 권장 - - terraform.tfvars를 .gitignore에 추가 - -2. **네트워크 보안** - - 프로덕션에서는 외부 IP 제거 - - VPN 또는 IAP 터널 사용 - -3. 
**서비스 계정** - - 최소 권한 원칙 적용 - - 전용 서비스 계정 생성 - -## 참고 자료 - -- [Terraform GCP Provider 문서](https://registry.terraform.io/providers/hashicorp/google/latest/docs) -- [GCP Container-Optimized OS](https://cloud.google.com/container-optimized-os/docs) -- [GCP 가격 계산기](https://cloud.google.com/products/calculator) diff --git a/terraform/artifact_registry.tf b/terraform/artifact_registry.tf deleted file mode 100644 index bef100a..0000000 --- a/terraform/artifact_registry.tf +++ /dev/null @@ -1,50 +0,0 @@ -# ============================================================================= -# Speedcam MSA - Artifact Registry Configuration -# ============================================================================= - -# ============================================================================= -# Artifact Registry Repository -# ============================================================================= - -resource "google_artifact_registry_repository" "speedcam" { - location = var.region - repository_id = var.registry_name - description = "Speedcam MSA Docker images" - format = "DOCKER" - project = var.project_id - - labels = local.common_labels - - # Cleanup policy (optional) - cleanup_policies { - id = "delete-old-images" - action = "DELETE" - - condition { - tag_state = "UNTAGGED" - older_than = "2592000s" # 30 days - } - } - - cleanup_policies { - id = "keep-recent-tagged" - action = "KEEP" - - most_recent_versions { - keep_count = 10 - package_name_prefixes = ["main", "ocr", "alert"] - } - } -} - -# ============================================================================= -# IAM Policy for Compute Engine to pull images -# ============================================================================= - -resource "google_artifact_registry_repository_iam_member" "compute_reader" { - project = var.project_id - location = var.region - repository = google_artifact_registry_repository.speedcam.name - role = "roles/artifactregistry.reader" - member = 
"serviceAccount:${data.google_compute_default_service_account.default.email}" -} diff --git a/terraform/instances_infra.tf b/terraform/instances_infra.tf deleted file mode 100644 index 1aeb872..0000000 --- a/terraform/instances_infra.tf +++ /dev/null @@ -1,204 +0,0 @@ -# ============================================================================= -# Speedcam MSA - Infrastructure Instances (RabbitMQ, MySQL) -# ============================================================================= - -# ============================================================================= -# RabbitMQ Instance -# ============================================================================= - -resource "google_compute_instance" "rabbitmq" { - name = "speedcam-rabbitmq" - machine_type = var.machine_type_small - zone = var.zone - project = var.project_id - - tags = ["speedcam", "speedcam-web"] - - labels = merge(local.common_labels, { - service = "rabbitmq" - }) - - boot_disk { - initialize_params { - image = "cos-cloud/cos-stable" - size = 20 - type = "pd-standard" - } - } - - network_interface { - network = var.network_name - - access_config { - // Ephemeral public IP - } - } - - metadata = { - gce-container-declaration = yamlencode({ - spec = { - containers = [{ - name = "rabbitmq" - image = var.rabbitmq_image - env = [ - { name = "RABBITMQ_DEFAULT_USER", value = var.rabbitmq_user }, - { name = "RABBITMQ_DEFAULT_PASS", value = var.rabbitmq_password }, - ] - volumeMounts = [] - }] - volumes = [] - restartPolicy = "Always" - } - }) - } - - service_account { - email = data.google_compute_default_service_account.default.email - scopes = ["cloud-platform"] - } - - scheduling { - automatic_restart = true - on_host_maintenance = "MIGRATE" - preemptible = false - } - - allow_stopping_for_update = true - - lifecycle { - create_before_destroy = true - } -} - -# ============================================================================= -# MySQL Instance -# 
============================================================================= - -resource "google_compute_instance" "mysql" { - name = "speedcam-mysql" - machine_type = var.machine_type_small - zone = var.zone - project = var.project_id - - tags = ["speedcam"] - - labels = merge(local.common_labels, { - service = "mysql" - }) - - boot_disk { - initialize_params { - image = "cos-cloud/cos-stable" - size = 50 - type = "pd-ssd" - } - } - - network_interface { - network = var.network_name - - access_config { - // Ephemeral public IP - } - } - - metadata = { - gce-container-declaration = yamlencode({ - spec = { - containers = [{ - name = "mysql" - image = var.mysql_image - env = [ - { name = "MYSQL_ROOT_PASSWORD", value = var.db_root_password }, - { name = "MYSQL_DATABASE", value = var.db_name }, - { name = "MYSQL_USER", value = var.db_user }, - { name = "MYSQL_PASSWORD", value = var.db_password }, - ] - volumeMounts = [{ - name = "mysql-data" - mountPath = "/var/lib/mysql" - }] - }] - volumes = [{ - name = "mysql-data" - hostPath = { - path = "/var/lib/mysql" - } - }] - restartPolicy = "Always" - } - }) - } - - service_account { - email = data.google_compute_default_service_account.default.email - scopes = ["cloud-platform"] - } - - scheduling { - automatic_restart = true - on_host_maintenance = "MIGRATE" - preemptible = false - } - - allow_stopping_for_update = true - - lifecycle { - create_before_destroy = true - } -} - -# ============================================================================= -# Null Resource for Infrastructure Initialization -# ============================================================================= - -# Wait for instances to be ready -resource "time_sleep" "wait_for_infra" { - depends_on = [ - google_compute_instance.rabbitmq, - google_compute_instance.mysql - ] - - create_duration = "90s" -} - -# Initialize RabbitMQ MQTT plugin -resource "null_resource" "init_rabbitmq" { - depends_on = [time_sleep.wait_for_infra] - - provisioner 
"local-exec" { - command = <<-EOT - gcloud compute ssh speedcam-rabbitmq --zone=${var.zone} --project=${var.project_id} \ - --command="docker exec \$(docker ps -q) rabbitmq-plugins enable rabbitmq_mqtt" \ - || echo "MQTT plugin may already be enabled" - EOT - } - - triggers = { - instance_id = google_compute_instance.rabbitmq.instance_id - } -} - -# Initialize MySQL databases -resource "null_resource" "init_mysql" { - depends_on = [time_sleep.wait_for_infra] - - provisioner "local-exec" { - command = <<-EOT - gcloud compute ssh speedcam-mysql --zone=${var.zone} --project=${var.project_id} \ - --command="docker exec \$(docker ps -q) mysql -u root -p${var.db_root_password} -e \"\ - CREATE DATABASE IF NOT EXISTS ${var.db_name}_vehicles; \ - CREATE DATABASE IF NOT EXISTS ${var.db_name}_detections; \ - CREATE DATABASE IF NOT EXISTS ${var.db_name}_notifications; \ - GRANT ALL PRIVILEGES ON ${var.db_name}_vehicles.* TO '${var.db_user}'@'%'; \ - GRANT ALL PRIVILEGES ON ${var.db_name}_detections.* TO '${var.db_user}'@'%'; \ - GRANT ALL PRIVILEGES ON ${var.db_name}_notifications.* TO '${var.db_user}'@'%'; \ - FLUSH PRIVILEGES;\"" \ - || echo "Databases may already exist" - EOT - } - - triggers = { - instance_id = google_compute_instance.mysql.instance_id - } -} diff --git a/terraform/instances_monitoring.tf b/terraform/instances_monitoring.tf deleted file mode 100644 index f3988b1..0000000 --- a/terraform/instances_monitoring.tf +++ /dev/null @@ -1,109 +0,0 @@ -# ============================================================================= -# DataDog Agent Instance -# ============================================================================= - -resource "google_compute_instance" "datadog_agent" { - name = "speedcam-datadog" - machine_type = var.machine_type_small - zone = var.zone - - depends_on = [ - google_compute_instance.rabbitmq, - google_compute_instance.mysql - ] - - tags = ["speedcam"] - - boot_disk { - initialize_params { - image = "cos-cloud/cos-stable" - size = 
20 - } - } - - network_interface { - network = "default" - access_config {} - } - - metadata = { - # cloud-init: 컨테이너 시작 전에 Integration 설정 파일 생성 - user-data = <<-CLOUDINIT - #cloud-config - write_files: - - path: /tmp/dd-confd/mysql.d/conf.yaml - permissions: '0644' - content: | - init_config: - instances: - - host: ${google_compute_instance.mysql.network_interface[0].network_ip} - port: 3306 - username: ${var.db_user} - password: ${var.db_password} - reported_hostname: speedcam-mysql - tags: - - env:${var.environment} - - service:speedcam-mysql - - path: /tmp/dd-confd/rabbitmq.d/conf.yaml - permissions: '0644' - content: | - init_config: - instances: - - rabbitmq_api_url: http://${google_compute_instance.rabbitmq.network_interface[0].network_ip}:15672/api/ - rabbitmq_user: ${var.rabbitmq_user} - rabbitmq_pass: ${var.rabbitmq_password} - tag_families: true - collect_node_metrics: true - reported_hostname: speedcam-rabbitmq - tags: - - env:${var.environment} - - service:speedcam-rabbitmq - CLOUDINIT - - gce-container-declaration = yamlencode({ - spec = { - containers = [{ - name = "datadog-agent" - image = "gcr.io/datadoghq/agent:7" - env = [ - { name = "DD_API_KEY", value = var.dd_api_key }, - { name = "DD_SITE", value = var.dd_site }, - { name = "DD_APM_ENABLED", value = "true" }, - { name = "DD_APM_NON_LOCAL_TRAFFIC", value = "true" }, - { name = "DD_DOGSTATSD_NON_LOCAL_TRAFFIC", value = "true" }, - { name = "DD_LOGS_ENABLED", value = "true" }, - { name = "DD_ENV", value = var.environment }, - ] - volumeMounts = [ - { name = "mysql-confd", mountPath = "/etc/datadog-agent/conf.d/mysql.d", readOnly = true }, - { name = "rabbitmq-confd", mountPath = "/etc/datadog-agent/conf.d/rabbitmq.d", readOnly = true }, - ] - }] - volumes = [ - { name = "mysql-confd", hostPath = { path = "/tmp/dd-confd/mysql.d" } }, - { name = "rabbitmq-confd", hostPath = { path = "/tmp/dd-confd/rabbitmq.d" } }, - ] - restartPolicy = "Always" - } - }) - } - - labels = { - project = "speedcam" - 
environment = var.environment - service = "datadog" - managed_by = "terraform" - } - - scheduling { - automatic_restart = true - on_host_maintenance = "MIGRATE" - preemptible = false - } - - service_account { - scopes = ["cloud-platform"] - } - - allow_stopping_for_update = true -} diff --git a/terraform/instances_services.tf b/terraform/instances_services.tf deleted file mode 100644 index c6f2cba..0000000 --- a/terraform/instances_services.tf +++ /dev/null @@ -1,259 +0,0 @@ -# ============================================================================= -# Speedcam MSA - Service Instances (Main, OCR, Alert) -# ============================================================================= - -# ============================================================================= -# Main Service Instance -# ============================================================================= - -resource "google_compute_instance" "main" { - name = "speedcam-main" - machine_type = var.machine_type_medium - zone = var.zone - project = var.project_id - - depends_on = [ - null_resource.init_rabbitmq, - null_resource.init_mysql, - google_compute_instance.datadog_agent - ] - - tags = ["speedcam", "speedcam-web"] - - labels = merge(local.common_labels, { - service = "main" - }) - - boot_disk { - initialize_params { - image = "cos-cloud/cos-stable" - size = 20 - type = "pd-standard" - } - } - - network_interface { - network = var.network_name - - access_config { - // Ephemeral public IP - } - } - - metadata = { - gce-container-declaration = yamlencode({ - spec = { - containers = [{ - name = "main" - image = "${local.registry}/main:${var.image_tag}" - env = concat([ - for k, v in local.common_env : { name = k, value = v } - ], [ - { name = "DD_SERVICE", value = "speedcam-api" }, - { name = "RABBITMQ_HOST", value = google_compute_instance.rabbitmq.network_interface[0].network_ip }, - { name = "MQTT_PORT", value = "1883" }, - { name = "MQTT_USER", value = var.rabbitmq_user }, - { name = 
"MQTT_PASS", value = var.rabbitmq_password }, - { name = "OCR_MOCK", value = tostring(var.ocr_mock) }, - { name = "FCM_MOCK", value = tostring(var.fcm_mock) }, - ]) - }] - restartPolicy = "Always" - } - }) - } - - service_account { - email = data.google_compute_default_service_account.default.email - scopes = ["cloud-platform"] - } - - scheduling { - automatic_restart = true - on_host_maintenance = "MIGRATE" - preemptible = false - } - - allow_stopping_for_update = true -} - -# ============================================================================= -# OCR Worker Instance -# ============================================================================= - -resource "google_compute_instance" "ocr" { - name = "speedcam-ocr" - machine_type = var.machine_type_medium - zone = var.zone - project = var.project_id - - depends_on = [ - null_resource.init_rabbitmq, - null_resource.init_mysql, - google_compute_instance.datadog_agent - ] - - tags = ["speedcam"] - - labels = merge(local.common_labels, { - service = "ocr" - }) - - boot_disk { - initialize_params { - image = "cos-cloud/cos-stable" - size = 30 - type = "pd-standard" - } - } - - network_interface { - network = var.network_name - - access_config { - // Ephemeral public IP - } - } - - metadata = { - gce-container-declaration = yamlencode({ - spec = { - containers = [{ - name = "ocr" - image = "${local.registry}/ocr:${var.image_tag}" - env = concat([ - for k, v in local.common_env : { name = k, value = v } - ], [ - { name = "DD_SERVICE", value = "speedcam-ocr" }, - { name = "OCR_CONCURRENCY", value = tostring(var.ocr_concurrency) }, - { name = "OCR_MOCK", value = tostring(var.ocr_mock) }, - ]) - }] - restartPolicy = "Always" - } - }) - } - - service_account { - email = data.google_compute_default_service_account.default.email - scopes = ["cloud-platform"] - } - - scheduling { - automatic_restart = true - on_host_maintenance = "MIGRATE" - preemptible = false - } - - allow_stopping_for_update = true -} - -# 
============================================================================= -# Alert Worker Instance -# ============================================================================= - -resource "google_compute_instance" "alert" { - name = "speedcam-alert" - machine_type = var.machine_type_small - zone = var.zone - project = var.project_id - - depends_on = [ - null_resource.init_rabbitmq, - null_resource.init_mysql, - google_compute_instance.datadog_agent - ] - - tags = ["speedcam"] - - labels = merge(local.common_labels, { - service = "alert" - }) - - boot_disk { - initialize_params { - image = "cos-cloud/cos-stable" - size = 20 - type = "pd-standard" - } - } - - network_interface { - network = var.network_name - - access_config { - // Ephemeral public IP - } - } - - metadata = { - gce-container-declaration = yamlencode({ - spec = { - containers = [{ - name = "alert" - image = "${local.registry}/alert:${var.image_tag}" - env = concat([ - for k, v in local.common_env : { name = k, value = v } - ], [ - { name = "DD_SERVICE", value = "speedcam-alert" }, - { name = "ALERT_CONCURRENCY", value = tostring(var.alert_concurrency) }, - { name = "FCM_MOCK", value = tostring(var.fcm_mock) }, - ]) - }] - restartPolicy = "Always" - } - }) - } - - service_account { - email = data.google_compute_default_service_account.default.email - scopes = ["cloud-platform"] - } - - scheduling { - automatic_restart = true - on_host_maintenance = "MIGRATE" - preemptible = false - } - - allow_stopping_for_update = true -} - -# ============================================================================= -# Django Migrations -# ============================================================================= - -resource "time_sleep" "wait_for_main" { - depends_on = [google_compute_instance.main] - - create_duration = "60s" -} - -resource "null_resource" "run_migrations" { - depends_on = [time_sleep.wait_for_main] - - provisioner "local-exec" { - command = <<-EOT - # Create migrations - gcloud 
compute ssh speedcam-main --zone=${var.zone} --project=${var.project_id} \ - --command="docker exec \$(docker ps -q) python manage.py makemigrations vehicles detections notifications 2>/dev/null || true" - - # Run migrations - gcloud compute ssh speedcam-main --zone=${var.zone} --project=${var.project_id} \ - --command="docker exec \$(docker ps -q) python manage.py migrate --database=default --noinput" - - gcloud compute ssh speedcam-main --zone=${var.zone} --project=${var.project_id} \ - --command="docker exec \$(docker ps -q) python manage.py migrate vehicles --database=vehicles_db --noinput" - - gcloud compute ssh speedcam-main --zone=${var.zone} --project=${var.project_id} \ - --command="docker exec \$(docker ps -q) python manage.py migrate detections --database=detections_db --noinput" - - gcloud compute ssh speedcam-main --zone=${var.zone} --project=${var.project_id} \ - --command="docker exec \$(docker ps -q) python manage.py migrate notifications --database=notifications_db --noinput" - EOT - } - - triggers = { - main_instance_id = google_compute_instance.main.instance_id - } -} diff --git a/terraform/main.tf b/terraform/main.tf deleted file mode 100644 index d527cee..0000000 --- a/terraform/main.tf +++ /dev/null @@ -1,79 +0,0 @@ -# ============================================================================= -# Speedcam MSA - Terraform Main Configuration -# ============================================================================= - -terraform { - required_version = ">= 1.5.0" - - required_providers { - google = { - source = "hashicorp/google" - version = "~> 5.0" - } - } - - # Optional: Configure backend for state management - # backend "gcs" { - # bucket = "your-terraform-state-bucket" - # prefix = "speedcam/state" - # } -} - -# ============================================================================= -# Provider Configuration -# ============================================================================= - -provider "google" { - project = 
var.project_id - region = var.region - zone = var.zone -} - -# ============================================================================= -# Data Sources -# ============================================================================= - -data "google_project" "current" { - project_id = var.project_id -} - -data "google_compute_default_service_account" "default" { - project = var.project_id -} - -# ============================================================================= -# Local Values -# ============================================================================= - -locals { - # Common labels for all resources - common_labels = { - project = "speedcam" - environment = var.environment - managed_by = "terraform" - } - - # Container registry path - registry = "${var.region}-docker.pkg.dev/${var.project_id}/${var.registry_name}" - - # Service environment variables (common) - common_env = { - DJANGO_SETTINGS_MODULE = "config.settings.${var.environment}" - DB_HOST = google_compute_instance.mysql.network_interface[0].network_ip - DB_PORT = "3306" - DB_NAME = var.db_name - DB_NAME_VEHICLES = "${var.db_name}_vehicles" - DB_NAME_DETECTIONS = "${var.db_name}_detections" - DB_NAME_NOTIFICATIONS = "${var.db_name}_notifications" - DB_USER = var.db_user - DB_PASSWORD = var.db_password - CELERY_BROKER_URL = "amqp://${var.rabbitmq_user}:${var.rabbitmq_password}@${google_compute_instance.rabbitmq.network_interface[0].network_ip}:5672//" - DD_AGENT_HOST = google_compute_instance.datadog_agent.network_interface[0].network_ip - DD_TRACE_AGENT_PORT = "8126" - DD_ENV = var.environment - DD_LOGS_INJECTION = "true" - DD_TRACE_SAMPLE_RATE = "1" - DD_PROFILING_ENABLED = "true" - _DD_TRACE_WRITER_NATIVE = "false" - } -} diff --git a/terraform/network.tf b/terraform/network.tf deleted file mode 100644 index 102ccbb..0000000 --- a/terraform/network.tf +++ /dev/null @@ -1,69 +0,0 @@ -# ============================================================================= -# Speedcam MSA - 
Network Configuration -# ============================================================================= - -# ============================================================================= -# Firewall Rules -# ============================================================================= - -# Internal communication between services -resource "google_compute_firewall" "speedcam_internal" { - name = "speedcam-internal" - network = var.network_name - project = var.project_id - - description = "Allow internal communication for Speedcam MSA" - - allow { - protocol = "tcp" - ports = ["3306", "5672", "1883", "15672", "8000", "8126"] - } - - allow { - protocol = "udp" - ports = ["8125"] - } - - source_ranges = ["10.0.0.0/8"] - target_tags = ["speedcam"] - - priority = 1000 -} - -# External access for API and RabbitMQ Management -resource "google_compute_firewall" "speedcam_external" { - name = "speedcam-external" - network = var.network_name - project = var.project_id - - description = "Allow external access for Speedcam API and RabbitMQ Management" - - allow { - protocol = "tcp" - ports = ["8000", "15672"] - } - - source_ranges = ["0.0.0.0/0"] - target_tags = ["speedcam-web"] - - priority = 1000 -} - -# SSH access (optional - for debugging) -resource "google_compute_firewall" "speedcam_ssh" { - name = "speedcam-ssh" - network = var.network_name - project = var.project_id - - description = "Allow SSH access to Speedcam instances" - - allow { - protocol = "tcp" - ports = ["22"] - } - - source_ranges = ["0.0.0.0/0"] - target_tags = ["speedcam"] - - priority = 1000 -} diff --git a/terraform/outputs.tf b/terraform/outputs.tf deleted file mode 100644 index 8c1f65c..0000000 --- a/terraform/outputs.tf +++ /dev/null @@ -1,160 +0,0 @@ -# ============================================================================= -# Speedcam MSA - Terraform Outputs -# ============================================================================= - -# 
============================================================================= -# Instance IPs -# ============================================================================= - -output "rabbitmq_internal_ip" { - description = "RabbitMQ internal IP" - value = google_compute_instance.rabbitmq.network_interface[0].network_ip -} - -output "rabbitmq_external_ip" { - description = "RabbitMQ external IP" - value = google_compute_instance.rabbitmq.network_interface[0].access_config[0].nat_ip -} - -output "mysql_internal_ip" { - description = "MySQL internal IP" - value = google_compute_instance.mysql.network_interface[0].network_ip -} - -output "mysql_external_ip" { - description = "MySQL external IP" - value = google_compute_instance.mysql.network_interface[0].access_config[0].nat_ip -} - -output "main_internal_ip" { - description = "Main service internal IP" - value = google_compute_instance.main.network_interface[0].network_ip -} - -output "main_external_ip" { - description = "Main service external IP" - value = google_compute_instance.main.network_interface[0].access_config[0].nat_ip -} - -output "ocr_internal_ip" { - description = "OCR worker internal IP" - value = google_compute_instance.ocr.network_interface[0].network_ip -} - -output "ocr_external_ip" { - description = "OCR worker external IP" - value = google_compute_instance.ocr.network_interface[0].access_config[0].nat_ip -} - -output "alert_internal_ip" { - description = "Alert worker internal IP" - value = google_compute_instance.alert.network_interface[0].network_ip -} - -output "alert_external_ip" { - description = "Alert worker external IP" - value = google_compute_instance.alert.network_interface[0].access_config[0].nat_ip -} - -output "datadog_internal_ip" { - description = "DataDog Agent internal IP" - value = google_compute_instance.datadog_agent.network_interface[0].network_ip -} - -output "datadog_external_ip" { - description = "DataDog Agent external IP" - value = 
google_compute_instance.datadog_agent.network_interface[0].access_config[0].nat_ip -} - -# ============================================================================= -# Service URLs -# ============================================================================= - -output "api_url" { - description = "API base URL" - value = "http://${google_compute_instance.main.network_interface[0].access_config[0].nat_ip}:8000" -} - -output "swagger_url" { - description = "Swagger UI URL" - value = "http://${google_compute_instance.main.network_interface[0].access_config[0].nat_ip}:8000/swagger/" -} - -output "health_url" { - description = "Health check URL" - value = "http://${google_compute_instance.main.network_interface[0].access_config[0].nat_ip}:8000/health/" -} - -output "rabbitmq_management_url" { - description = "RabbitMQ Management UI URL" - value = "http://${google_compute_instance.rabbitmq.network_interface[0].access_config[0].nat_ip}:15672" -} - -# ============================================================================= -# Registry -# ============================================================================= - -output "registry_url" { - description = "Artifact Registry URL" - value = "${var.region}-docker.pkg.dev/${var.project_id}/${var.registry_name}" -} - -# ============================================================================= -# Connection Strings -# ============================================================================= - -output "celery_broker_url" { - description = "Celery broker URL (internal)" - value = "amqp://${var.rabbitmq_user}:****@${google_compute_instance.rabbitmq.network_interface[0].network_ip}:5672//" - sensitive = false -} - -output "mysql_connection_string" { - description = "MySQL connection string (internal)" - value = "mysql://${var.db_user}:****@${google_compute_instance.mysql.network_interface[0].network_ip}:3306/${var.db_name}" - sensitive = false -} - -# 
============================================================================= -# Summary -# ============================================================================= - -output "deployment_summary" { - description = "Deployment summary" - value = <<-EOT - - =========================================== - Speedcam MSA Deployment Summary - =========================================== - - Project: ${var.project_id} - Region: ${var.region} - Zone: ${var.zone} - Environment: ${var.environment} - - ------------------------------------------- - Infrastructure - ------------------------------------------- - RabbitMQ: ${google_compute_instance.rabbitmq.network_interface[0].network_ip} (${google_compute_instance.rabbitmq.network_interface[0].access_config[0].nat_ip}) - MySQL: ${google_compute_instance.mysql.network_interface[0].network_ip} (${google_compute_instance.mysql.network_interface[0].access_config[0].nat_ip}) - DataDog: ${google_compute_instance.datadog_agent.network_interface[0].network_ip} (${google_compute_instance.datadog_agent.network_interface[0].access_config[0].nat_ip}) - - ------------------------------------------- - Services - ------------------------------------------- - Main: ${google_compute_instance.main.network_interface[0].network_ip} (${google_compute_instance.main.network_interface[0].access_config[0].nat_ip}) - OCR: ${google_compute_instance.ocr.network_interface[0].network_ip} (${google_compute_instance.ocr.network_interface[0].access_config[0].nat_ip}) - Alert: ${google_compute_instance.alert.network_interface[0].network_ip} (${google_compute_instance.alert.network_interface[0].access_config[0].nat_ip}) - - ------------------------------------------- - URLs - ------------------------------------------- - API: http://${google_compute_instance.main.network_interface[0].access_config[0].nat_ip}:8000/ - Swagger: http://${google_compute_instance.main.network_interface[0].access_config[0].nat_ip}:8000/swagger/ - Health: 
http://${google_compute_instance.main.network_interface[0].access_config[0].nat_ip}:8000/health/ - RabbitMQ: http://${google_compute_instance.rabbitmq.network_interface[0].access_config[0].nat_ip}:15672/ - - =========================================== - - EOT -} diff --git a/terraform/terraform.tfvars.example b/terraform/terraform.tfvars.example deleted file mode 100644 index 1c062d5..0000000 --- a/terraform/terraform.tfvars.example +++ /dev/null @@ -1,54 +0,0 @@ -# ============================================================================= -# Speedcam MSA - Terraform Variables Example -# ============================================================================= -# Copy this file to terraform.tfvars and customize the values -# ============================================================================= - -# ============================================================================= -# Required Variables -# ============================================================================= - -# GCP Project ID (required) -project_id = "your-project-id" - -# ============================================================================= -# Optional Variables (defaults provided) -# ============================================================================= - -# Region and Zone -region = "asia-northeast3" -zone = "asia-northeast3-a" - -# Environment -environment = "dev" # dev, staging, prod - -# Database Configuration -db_name = "speedcam" -db_user = "sa" -db_password = "your-secure-password" -db_root_password = "your-secure-root-password" - -# RabbitMQ Configuration -rabbitmq_user = "sa" -rabbitmq_password = "your-secure-password" - -# Instance Types -machine_type_small = "e2-small" -machine_type_medium = "e2-medium" - -# Container Images -image_tag = "latest" -mysql_image = "mysql:8.0" -rabbitmq_image = "rabbitmq:3.13-management" - -# Application Configuration -ocr_concurrency = 2 -alert_concurrency = 50 -ocr_mock = true -fcm_mock = true - -# 
============================================================================= -# DataDog -# ============================================================================= -dd_api_key = "your-datadog-api-key" -# dd_site = "ap1.datadoghq.com" # default diff --git a/terraform/variables.tf b/terraform/variables.tf deleted file mode 100644 index e63e018..0000000 --- a/terraform/variables.tf +++ /dev/null @@ -1,184 +0,0 @@ -# ============================================================================= -# Speedcam MSA - Terraform Variables -# ============================================================================= - -# ============================================================================= -# Project Configuration -# ============================================================================= - -variable "project_id" { - description = "GCP Project ID" - type = string -} - -variable "region" { - description = "GCP Region" - type = string - default = "asia-northeast3" -} - -variable "zone" { - description = "GCP Zone" - type = string - default = "asia-northeast3-a" -} - -variable "environment" { - description = "Environment (dev, staging, prod)" - type = string - default = "dev" - - validation { - condition = contains(["dev", "staging", "prod"], var.environment) - error_message = "Environment must be one of: dev, staging, prod." 
- } -} - -# ============================================================================= -# Network Configuration -# ============================================================================= - -variable "network_name" { - description = "VPC Network name" - type = string - default = "default" -} - -# ============================================================================= -# Artifact Registry Configuration -# ============================================================================= - -variable "registry_name" { - description = "Artifact Registry repository name" - type = string - default = "speedcam" -} - -# ============================================================================= -# Database Configuration -# ============================================================================= - -variable "db_name" { - description = "Base database name" - type = string - default = "speedcam" -} - -variable "db_user" { - description = "Database user" - type = string - default = "sa" -} - -variable "db_password" { - description = "Database password" - type = string - sensitive = true - default = "1234" -} - -variable "db_root_password" { - description = "Database root password" - type = string - sensitive = true - default = "root" -} - -# ============================================================================= -# RabbitMQ Configuration -# ============================================================================= - -variable "rabbitmq_user" { - description = "RabbitMQ user" - type = string - default = "sa" -} - -variable "rabbitmq_password" { - description = "RabbitMQ password" - type = string - sensitive = true - default = "1234" -} - -# ============================================================================= -# Instance Configuration -# ============================================================================= - -variable "machine_type_small" { - description = "Machine type for small instances" - type = string - default = "e2-small" -} - 
-variable "machine_type_medium" { - description = "Machine type for medium instances" - type = string - default = "e2-medium" -} - -# ============================================================================= -# Container Images -# ============================================================================= - -variable "image_tag" { - description = "Docker image tag" - type = string - default = "latest" -} - -variable "mysql_image" { - description = "MySQL Docker image" - type = string - default = "mysql:8.0" -} - -variable "rabbitmq_image" { - description = "RabbitMQ Docker image" - type = string - default = "rabbitmq:3.13-management" -} - -# ============================================================================= -# Application Configuration -# ============================================================================= - -variable "ocr_concurrency" { - description = "OCR worker concurrency" - type = number - default = 2 -} - -variable "alert_concurrency" { - description = "Alert worker concurrency" - type = number - default = 50 -} - -variable "ocr_mock" { - description = "Enable OCR mock mode" - type = bool - default = true -} - -variable "fcm_mock" { - description = "Enable FCM mock mode" - type = bool - default = true -} - -# ============================================================================= -# DataDog -# ============================================================================= - -variable "dd_api_key" { - description = "DataDog API Key" - type = string - sensitive = true -} - -variable "dd_site" { - description = "DataDog site (e.g., ap1.datadoghq.com)" - type = string - default = "ap1.datadoghq.com" -}