diff --git a/config/common.yml b/config/common.yml index 8ba5de70..60251345 100644 --- a/config/common.yml +++ b/config/common.yml @@ -17,7 +17,7 @@ spring: redis: host: ${SPRING_DATA_REDIS_HOST:localhost} port: ${SPRING_DATA_REDIS_PORT:6379} - + temporal: connection: target: ${SPRING_TEMPORAL_CONNECTION_TARGET:temporal:7233} @@ -79,7 +79,7 @@ toss: customerKey: ${spring.toss.customerKey:${TOSS_CUSTOMER_KEY}} secretKey: ${spring.toss.secretKey:${TOSS_SECRET_KEY}} timeout: 10 - + management: endpoints: web: diff --git a/config/kafka-topics.yml b/config/kafka-topics.yml index a8b0bad5..db94bf36 100644 --- a/config/kafka-topics.yml +++ b/config/kafka-topics.yml @@ -1,7 +1,7 @@ spring: kafka: bootstrap-servers: ${KAFKA_BOOTSTRAP_SERVERS:kafka1:29092,kafka2:29092,kafka3:29092} - + listener: ack-mode: MANUAL @@ -25,9 +25,9 @@ spring: enable-auto-commit: false auto-offset-reset: earliest max-poll-records: 50 - max-poll-interval-ms: 300000 - session-timeout-ms: 15000 - heartbeat-interval-ms: 3000 + max.poll.interval.ms: 300000 + session.timeout.ms: 15000 + heartbeat.interval.ms: 3000 properties: partition.assignment.strategy: org.apache.kafka.clients.consumer.CooperativeStickyAssignor group: diff --git a/config/spot-gateway-docker.yml b/config/spot-gateway-docker.yml deleted file mode 100644 index 99d89aa7..00000000 --- a/config/spot-gateway-docker.yml +++ /dev/null @@ -1,55 +0,0 @@ -spring: - cloud: - server: - webflux: - gateway: - routes: - - id: user-auth - uri: ${SPOT_USER_URI:http://spot-user:8081} - predicates: - - Path=/api/login,/api/join,/api/auth/refresh - - - id: user-service - uri: ${SPOT_USER_URI:http://spot-user:8081} - predicates: - - Path=/api/users/**,/api/admin/** - - - id: store-service - uri: ${SPOT_STORE_URI:http://spot-store:8083} - predicates: - - Path=/api/stores/**,/api/categories/**,/api/reviews/** - - - id: order-service - uri: ${SPOT_ORDER_URI:http://spot-order:8082} - predicates: - - Path=/api/orders/** - - - id: payment-service - uri: ${SPOT_PAYMENT_URI:http://spot-payment:8084} - predicates: - - Path=/api/payments/** - - - id: block-internal - uri: http://localhost:9999 - predicates: - - Path=/internal/** - filters: - - SetStatus=403 - - - -logging: - level: - org.springframework.boot.context.config: DEBUG - org.springframework.cloud.gateway: DEBUG - org.springframework.cloud.gateway.route.RouteDefinitionRouteLocator: DEBUG - org.springframework.cloud.gateway.handler.RoutePredicateHandlerMapping: TRACE - -management: - endpoints: - web: - exposure: - include: health, info, metrics, gateway - endpoint: - gateway: - access: unrestricted \ No newline at end of file diff --git a/config/spot-order.yml b/config/spot-order.yml index 80ac090d..da49d617 100644 --- a/config/spot-order.yml +++ b/config/spot-order.yml @@ -14,7 +14,7 @@ spring: threads: virtual: enabled: true - + feign: user: url: ${FEIGN_USER_URL:http://spot-user:8081} @@ -28,7 +28,7 @@ feign: connectTimeout: 100 readTimeout: 500 loggerLevel: BASIC - + resilience4j: @@ -101,4 +101,4 @@ logging: feign: INFO io.github.resilience4j.circuitbreaker: INFO io.github.resilience4j.retry: INFO - io.github.resilience4j.bulkhead: INFO + io.github.resilience4j.bulkhead: INFO \ No newline at end of file diff --git a/config/spot-payment.yml b/config/spot-payment.yml index 27cd4ac0..c08e7ba3 100644 --- a/config/spot-payment.yml +++ b/config/spot-payment.yml @@ -14,7 +14,7 @@ spring: threads: virtual: enabled: true - + feign: user: url: ${FEIGN_USER_URL:http://spot-user:8081} diff --git a/infra/argo/application.yaml b/infra/argo/application.yaml index 460a0aea..6f59c3e5 100644 --- a/infra/argo/application.yaml +++ b/infra/argo/application.yaml @@ -38,4 +38,4 @@ spec: prune: true selfHeal: true syncOptions: - - CreateNamespace=true + - CreateNamespace=true \ No newline at end of file diff --git a/infra/argo/argocd-ingress.yaml b/infra/argo/argocd-ingress.yaml index de35e984..55c7752c 100644 --- a/infra/argo/argocd-ingress.yaml +++ b/infra/argo/argocd-ingress.yaml @@ -15,4 +15,4 @@ spec: - name: https port: 443 targetPort: 8080 - nodePort: 30091 + nodePort: 30091 \ No newline at end of file diff --git a/infra/argo/blue-green.yaml b/infra/argo/blue-green.yaml index 52cab865..e3db4227 100644 --- a/infra/argo/blue-green.yaml +++ b/infra/argo/blue-green.yaml @@ -13,10 +13,10 @@ spec: app: test-rollout spec: containers: - - name: nginx - image: nginx:stable-alpine + - name: nginx + image: nginx:stable-alpine strategy: canary: steps: - - setWeight: 20 - - pause: {} \ No newline at end of file + - setWeight: 20 + - pause: {} \ No newline at end of file diff --git a/infra/argo/test-rollout.yaml b/infra/argo/test-rollout.yaml index d8b596e2..e3db4227 100644 --- a/infra/argo/test-rollout.yaml +++ b/infra/argo/test-rollout.yaml @@ -13,12 +13,10 @@ spec: app: test-rollout spec: containers: - - name: nginx - image: nginx:stable-alpine + - name: nginx + image: nginx:stable-alpine strategy: canary: steps: - - setWeight: 20 - - pause: {} - - + - setWeight: 20 + - pause: {} \ No newline at end of file diff --git a/infra/k8s/apps/spot-gateway.yaml b/infra/k8s/apps/spot-gateway.yaml index 5ee338f3..5815d09f 100644 --- a/infra/k8s/apps/spot-gateway.yaml +++ b/infra/k8s/apps/spot-gateway.yaml @@ -17,11 +17,14 @@ spec: spec: containers: - name: spot-gateway - image: spot-registry.localhost:5111/spot-gateway:latest +# image: spot-registry.localhost:5111/spot-gateway:latest + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-gateway:latest imagePullPolicy: Always ports: - containerPort: 8080 env: + - name: SERVER_PORT + value: "8080" - name: SPRING_PROFILES_ACTIVE value: "k8s" - name: LOGGING_CONFIG @@ -35,11 +38,11 @@ spec: readOnly: true resources: requests: + cpu: "250m" memory: "512Mi" - cpu: "500m" limits: - memory: "1Gi" cpu: "800m" + memory: "1Gi" readinessProbe: httpGet: path: /actuator/health diff --git a/infra/k8s/apps/spot-ingress.yaml b/infra/k8s/apps/spot-ingress.yaml index 625eb8da..d0b4be12 100644 --- a/infra/k8s/apps/spot-ingress.yaml +++ b/infra/k8s/apps/spot-ingress.yaml @@ -1,14 +1,19 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: - name: spot-ingress + name: aws-ingress namespace: spot annotations: - kubernetes.io/ingress.class: nginx + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip # IP 모드 + alb.ingress.kubernetes.io/healthcheck-path: /actuator/health + alb.ingress.kubernetes.io/success-codes: "200" # 헬스 체크 성공 코드 + alb.ingress.kubernetes.io/load-balancer-name: spot-dev-alb # tf위한 이름 고정 spec: - ingressClassName: nginx + ingressClassName: alb rules: - - host: spot.localhost + # spot + - host: spotorder.org http: paths: - path: / @@ -20,7 +25,7 @@ spec: number: 80 # kafka - - host: kafka.localhost + - host: kafka.spotorder.org http: paths: - path: / diff --git a/infra/k8s/apps/spot-order.yaml b/infra/k8s/apps/spot-order.yaml index 353cabb8..159055d3 100644 --- a/infra/k8s/apps/spot-order.yaml +++ b/infra/k8s/apps/spot-order.yaml @@ -17,7 +17,7 @@ spec: spec: containers: - name: spot-order - image: spot-registry.localhost:5111/spot-order:latest + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-order imagePullPolicy: Always ports: - containerPort: 8082 @@ -35,8 +35,8 @@ spec: readOnly: true resources: requests: - memory: "512Mi" - cpu: "500m" + memory: "256Mi" + cpu: "75m" limits: memory: "1Gi" cpu: "800m" diff --git a/infra/k8s/apps/spot-payment.yaml b/infra/k8s/apps/spot-payment.yaml index 3ec9b6d2..2d21e9b4 100644 --- a/infra/k8s/apps/spot-payment.yaml +++ b/infra/k8s/apps/spot-payment.yaml @@ -17,7 +17,7 @@ spec: spec: containers: - name: spot-payment - image: spot-registry.localhost:5111/spot-payment:latest + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-payment imagePullPolicy: Always ports: - containerPort: 8084 @@ -35,8 +35,8 @@ spec: readOnly: true resources: requests: - memory: "512Mi" - cpu: "500m" + memory: "256Mi" + cpu: "75m" limits: memory: "1Gi" cpu: "800m" diff --git a/infra/k8s/apps/spot-store.yaml b/infra/k8s/apps/spot-store.yaml index b4dc2b8e..c9dbc22f 100644 --- a/infra/k8s/apps/spot-store.yaml +++ b/infra/k8s/apps/spot-store.yaml @@ -17,7 +17,7 @@ spec: spec: containers: - name: spot-store - image: spot-registry.localhost:5111/spot-store:latest + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-store imagePullPolicy: Always ports: - containerPort: 8083 @@ -35,8 +35,8 @@ spec: readOnly: true resources: requests: - memory: "512Mi" - cpu: "500m" + memory: "256Mi" + cpu: "75m" limits: memory: "1Gi" cpu: "800m" diff --git a/infra/k8s/apps/spot-user.yaml b/infra/k8s/apps/spot-user.yaml index 7b1f75a6..b72aaf00 100644 --- a/infra/k8s/apps/spot-user.yaml +++ b/infra/k8s/apps/spot-user.yaml @@ -17,7 +17,7 @@ spec: spec: containers: - name: spot-user - image: spot-registry.localhost:5111/spot-user:latest + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-user imagePullPolicy: Always ports: - containerPort: 8081 @@ -35,8 +35,8 @@ spec: readOnly: true resources: requests: - memory: "512Mi" - cpu: "500m" + memory: "256Mi" + cpu: "75m" limits: memory: "1Gi" cpu: "800m" diff --git a/infra/k8s/base/configmap.yaml b/infra/k8s/base/configmap.yaml deleted file mode 100644 index c36d287e..00000000 --- a/infra/k8s/base/configmap.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: spot-common-config - namespace: spot -data: - SPRING_DATASOURCE_URL: "jdbc:postgresql://postgres:5432/myapp_db" - SPRING_DATASOURCE_USERNAME: "admin" - SPRING_DATA_REDIS_HOST: "redis" - SPRING_DATA_REDIS_PORT: "6379" - KAFKA_BOOTSTRAP_SERVERS: "kafka:9092" ---- -apiVersion: v1 -kind: Secret -metadata: - name: monitoring-secrets - namespace: monitoring -type: Opaque -stringData: - GF_SECURITY_ADMIN_USER: "spot" - GF_SECURITY_ADMIN_PASSWORD: "spot-grafana" ---- diff --git a/infra/k8s/base/kafka/allow-kafka-ui-netpol.yaml b/infra/k8s/base/kafka/allow-kafka-ui-netpol.yaml index 06c25c01..a8c5cb55 100644 --- a/infra/k8s/base/kafka/allow-kafka-ui-netpol.yaml +++ b/infra/k8s/base/kafka/allow-kafka-ui-netpol.yaml @@ -2,7 +2,7 @@ apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: name: allow-kafka-ui-to-connect - namespace: spot + namespace: infra spec: podSelector: matchLabels: diff --git a/infra/k8s/base/kafka/connectors.yaml b/infra/k8s/base/kafka/connectors.yaml index 4db633d7..930d40e0 100644 --- a/infra/k8s/base/kafka/connectors.yaml +++ b/infra/k8s/base/kafka/connectors.yaml @@ -2,7 +2,7 @@ apiVersion: kafka.strimzi.io/v1 kind: KafkaConnector metadata: name: order-outbox-connector - namespace: spot + namespace: infra labels: # 이전에 만든 KafkaConnect 리소스 이름(spot-connect)과 반드시 일치해야 함 strimzi.io/cluster: spot-connect @@ -40,14 +40,12 @@ spec: producer.compression.type: "lz4" producer.linger.ms: "20" producer.batch.size: "65536" - --- - apiVersion: kafka.strimzi.io/v1 kind: KafkaConnector metadata: name: payment-outbox-connector - namespace: spot + namespace: infra labels: strimzi.io/cluster: spot-connect spec: diff --git a/infra/k8s/base/kafka/kafka-connect.yaml b/infra/k8s/base/kafka/kafka-connect.yaml index d22663d7..6de4c91e 100644 --- a/infra/k8s/base/kafka/kafka-connect.yaml +++ b/infra/k8s/base/kafka/kafka-connect.yaml @@ -2,24 +2,24 @@ apiVersion: kafka.strimzi.io/v1 kind: KafkaConnect metadata: name: spot-connect - namespace: spot + namespace: infra annotations: strimzi.io/use-connector-resources: "true" spec: version: 4.0.0 - replicas: 2 - bootstrapServers: spot-cluster-kafka-bootstrap:9092 + replicas: 1 # 2 + bootstrapServers: kafka-cluster-kafka-bootstrap.infra.svc.cluster.local:9092 image: spot-registry.localhost:5111/spot-connect-custom:latest - + groupId: spot-connect-group configStorageTopic: connect_configs offsetStorageTopic: connect_offsets statusStorageTopic: connect_status config: - config.storage.replication.factor: 3 - offset.storage.replication.factor: 3 - status.storage.replication.factor: 3 + config.storage.replication.factor: 1 # 3 + offset.storage.replication.factor: 1 # 3 + status.storage.replication.factor: 1 # 3 key.converter: org.apache.kafka.connect.json.JsonConverter value.converter: org.apache.kafka.connect.json.JsonConverter @@ -57,8 +57,8 @@ spec: resources: requests: - cpu: "500m" + cpu: "250m" memory: "512Mi" limits: - cpu: "1000m" + cpu: "750m" memory: "1Gi" \ No newline at end of file diff --git a/infra/k8s/base/kafka/kafka-nodepool.yaml b/infra/k8s/base/kafka/kafka-nodepool.yaml new file mode 100644 index 00000000..0b2ac5b8 --- /dev/null +++ b/infra/k8s/base/kafka/kafka-nodepool.yaml @@ -0,0 +1,39 @@ +apiVersion: kafka.strimzi.io/v1 +kind: KafkaNodePool +metadata: + name: kafka-nodes + namespace: infra + labels: + strimzi.io/cluster: kafka-cluster +spec: + replicas: 1 # 3 + +# # 같은 노드에 kafka가 띄워지지 않게 하는 설정 +# template: +# pod: +# affinity: +# podAntiAffinity: +# preferredDuringSchedulingIgnoredDuringExecution: +# - weight: 100 +# podAffinityTerm: +# labelSelector: +# matchLabels: +# strimzi.io/cluster: kafka-cluster +# topologyKey: "kubernetes.io/hostname" # 노드 기준 + roles: + - broker + - controller + storage: + type: jbod + volumes: + - id: 0 + type: persistent-claim + size: 1Gi + deleteClaim: true + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 500m + memory: 1Gi diff --git a/infra/k8s/base/kafka/kafka-nodepool.yml b/infra/k8s/base/kafka/kafka-nodepool.yml deleted file mode 100644 index 195b7696..00000000 --- a/infra/k8s/base/kafka/kafka-nodepool.yml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: kafka.strimzi.io/v1 -kind: KafkaNodePool -metadata: - name: kafka-nodes - namespace: spot - labels: - strimzi.io/cluster: spot-cluster -spec: - replicas: 3 - roles: - - broker - - controller - storage: - type: jbod - volumes: - - id: 0 - type: persistent-claim - size: 1Gi - deleteClaim: true - resources: - requests: - memory: 512Mi - cpu: 250m - limits: - memory: 1Gi - cpu: 500m \ No newline at end of file diff --git a/infra/k8s/base/kafka/kafka-ui.yaml b/infra/k8s/base/kafka/kafka-ui.yaml index 5ab3dab2..780e53e9 100644 --- a/infra/k8s/base/kafka/kafka-ui.yaml +++ b/infra/k8s/base/kafka/kafka-ui.yaml @@ -2,7 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: kafka-ui - namespace: spot + namespace: infra spec: replicas: 1 selector: @@ -18,20 +18,16 @@ spec: image: provectuslabs/kafka-ui:latest ports: - containerPort: 8080 - resources: - requests: - memory: "384Mi" - cpu: "100m" env: - name: KAFKA_CLUSTERS_0_NAME - value: spot-cluster + value: kafka-cluster - name: KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS - value: 'spot-cluster-kafka-bootstrap:9092' + value: 'kafka-cluster-kafka-bootstrap.infra.svc.cluster.local:9092' - name: KAFKA_CLUSTERS_0_KAFKACONNECT_0_NAME value: "spot-connect" - name: KAFKA_CLUSTERS_0_KAFKACONNECT_0_ADDRESS - value: "http://spot-connect-connect-api.spot.svc:8083" + value: "http://spot-connect-connect-api.infra.svc:8083" - name: KAFKA_CLUSTERS_0_KAFKACONNECT_0_TYPE value: "kafka-connect" @@ -39,12 +35,19 @@ spec: value: WARN - name: LOGGING_LEVEL_COM_PROVECTUS value: WARN + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" --- apiVersion: v1 kind: Service metadata: name: kafka-ui-svc - namespace: spot + namespace: infra spec: type: ClusterIP selector: diff --git a/infra/k8s/base/kafka/kafka.yaml b/infra/k8s/base/kafka/kafka.yaml index 6a082c0f..798b447b 100644 --- a/infra/k8s/base/kafka/kafka.yaml +++ b/infra/k8s/base/kafka/kafka.yaml @@ -1,12 +1,21 @@ apiVersion: kafka.strimzi.io/v1 kind: Kafka metadata: - name: spot-cluster - namespace: spot + name: kafka-cluster + namespace: infra spec: kafka: version: 4.0.0 metadataVersion: "3.7-IV4" + template: + pod: + enableServiceLinks: false + metricsConfig: + type: jmxPrometheusExporter + valueFrom: + configMapKeyRef: + name: kafka-metrics-config + key: kafka-metrics-config.yaml listeners: - name: plain port: 9092 @@ -18,13 +27,23 @@ spec: tls: false config: process.roles: "broker,controller" - offsets.topic.replication.factor: 3 - transaction.state.log.replication.factor: 3 - transaction.state.log.min.isr: 2 - default.replication.factor: 3 - min.insync.replicas: 2 + offsets.topic.replication.factor: 1 # 3 + transaction.state.log.replication.factor: 1 # 3 + transaction.state.log.min.isr: 1 # 2 + default.replication.factor: 1 # 3 + min.insync.replicas: 1 # 3 auto.create.topics.enable: "true" num.partitions: 3 entityOperator: topicOperator: {} userOperator: {} + kafkaExporter: + topicRegex: ".*" + groupRegex: ".*" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi \ No newline at end of file diff --git a/infra/k8s/base/monitoring/grafana/dashboards/kafka-broker.json b/infra/k8s/base/monitoring/grafana/dashboards/kafka-broker.json new file mode 100644 index 00000000..d53b77a6 --- /dev/null +++ b/infra/k8s/base/monitoring/grafana/dashboards/kafka-broker.json @@ -0,0 +1,641 @@ +{ + "uid": "kafka-broker-jmx", + "title": "Kafka Broker (JMX)", + "tags": ["kafka", "jmx"], + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "schemaVersion": 39, + "refresh": "10s", + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "links": [], + "annotations": { "list": [] }, + "templating": { "list": [] }, + "panels": [ + { + "id": 1, + "title": "Active Controller", + "description": "1이면 정상. 0이면 컨트롤러 없음 (장애)", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "kafka_controller_kafkacontroller_activecontrollercount", + "legendFormat": "Active Controller", + "refId": "A" + } + ] + }, + { + "id": 2, + "title": "Active Brokers", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 2 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "kafka_controller_kafkacontroller_activebrokercount", + "legendFormat": "Active Brokers", + "refId": "A" + } + ] + }, + { + "id": 3, + "title": "Offline Partitions", + "description": "0이어야 정상. 1 이상이면 장애", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "kafka_controller_kafkacontroller_offlinepartitionscount", + "legendFormat": "Offline Partitions", + "refId": "A" + } + ] + }, + { + "id": 4, + "title": "Under Replicated Partitions", + "description": "0이어야 정상. 부하테스트 중 증가하면 브로커 과부하", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "kafka_server_replicamanager_underreplicatedpartitions", + "legendFormat": "Under Replicated", + "refId": "A" + } + ] + }, + { + "id": 5, + "title": "Global Topic Count", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "kafka_controller_kafkacontroller_globaltopiccount", + "legendFormat": "Topics", + "refId": "A" + } + ] + }, + { + "id": 6, + "title": "Global Partition Count", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "kafka_controller_kafkacontroller_globalpartitioncount", + "legendFormat": "Partitions", + "refId": "A" + } + ] + }, + + { + "id": 10, + "title": "Broker State", + "description": "3 = Running 정상. 부하테스트 중 변동 시 브로커 재시작 의심", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "axisLabel": "state" + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "kafka_server_kafkaserver_brokerstate", + "legendFormat": "Broker State", + "refId": "A" + } + ] + }, + { + "id": 11, + "title": "Leader Count", + "description": "브로커가 리더인 파티션 수. 부하테스트 중 급감하면 리밸런싱 발생", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "axisLabel": "count" + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "kafka_server_replicamanager_leadercount", + "legendFormat": "Leader Partitions", + "refId": "A" + } + ] + }, + { + "id": 12, + "title": "Partition Count", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "axisLabel": "count" + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "kafka_server_replicamanager_partitioncount", + "legendFormat": "Total Partitions", + "refId": "A" + }, + { + "expr": "kafka_server_replicamanager_atminisrpartitioncount", + "legendFormat": "At MinISR", + "refId": "B" + }, + { + "expr": "kafka_server_replicamanager_underminisrpartitioncount", + "legendFormat": "Under MinISR", + "refId": "C" + } + ] + }, + + { + "id": 20, + "title": "Network Processor Idle %", + "description": "낮을수록 네트워크 처리 바쁨. 부하테스트 중 0%에 가까워지면 네트워크 병목", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "axisLabel": "%" + }, + "unit": "percentunit", + "min": 0, + "max": 1 + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "min"] }, + "tooltip": { "mode": "multi", "sort": "asc" } + }, + "targets": [ + { + "expr": "kafka_network_socketserver_networkprocessoravgidlepercent", + "legendFormat": "Avg Idle %", + "refId": "A" + } + ] + }, + { + "id": 21, + "title": "Request Queue Size", + "description": "요청 큐가 쌓이면 브로커가 처리를 못 따라가는 것", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "axisLabel": "queue size" + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "kafka_network_requestchannel_requestqueuesize", + "legendFormat": "Request Queue", + "refId": "A" + }, + { + "expr": "kafka_network_requestchannel_responsequeuesize", + "legendFormat": "Response Queue", + "refId": "B" + } + ] + }, + + { + "id": 30, + "title": "Disk Read/Write (bytes/s)", + "description": "부하테스트 중 디스크 I/O 병목 확인", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "axisLabel": "bytes/s" + }, + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "rate(kafka_server_kafkaserver_linux_disk_read_bytes[1m])", + "legendFormat": "Disk Read", + "refId": "A" + }, + { + "expr": "rate(kafka_server_kafkaserver_linux_disk_write_bytes[1m])", + "legendFormat": "Disk Write", + "refId": "B" + } + ] + }, + { + "id": 31, + "title": "Log Size per Topic", + "description": "토픽별 로그 크기. 부하테스트 중 빠르게 증가하면 메시지 유입 많은 것", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "axisLabel": "bytes" + }, + "unit": "bytes" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "kafka_log_log_size", + "legendFormat": "{{topic}} / p{{partition}}", + "refId": "A" + } + ] + }, + + { + "id": 40, + "title": "Delayed Produce Operations", + "description": "Produce 요청이 지연되는 수. 부하테스트 중 증가하면 브로커 과부하", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "axisLabel": "count" + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "kafka_server_delayedoperationpurgatory_numdelayedoperations_delayedoperation_produce", + "legendFormat": "Delayed Produce", + "refId": "A" + }, + { + "expr": "kafka_server_delayedoperationpurgatory_numdelayedoperations_delayedoperation_fetch", + "legendFormat": "Delayed Fetch", + "refId": "B" + } + ] + }, + { + "id": 41, + "title": "Consumer Group Status", + "description": "Stable이 정상. Rebalancing 상태가 길어지면 컨슈머 문제", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "axisLabel": "count" + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull"] }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "kafka_coordinator_group_groupmetadatamanager_numgroupsstable", + "legendFormat": "Stable", + "refId": "A" + }, + { + "expr": "kafka_coordinator_group_groupmetadatamanager_numgroupspreparingrebalance", + "legendFormat": "Preparing Rebalance", + "refId": "B" + }, + { + "expr": "kafka_coordinator_group_groupmetadatamanager_numgroupscompletingrebalance", + "legendFormat": "Completing Rebalance", + "refId": "C" + }, + { + "expr": "kafka_coordinator_group_groupmetadatamanager_numgroupsdead", + "legendFormat": "Dead", + "refId": "D" + } + ] + }, + { + "id": 42, + "title": "JVM Heap Memory Usage", + "description": "JVM 힙 메모리 사용량. 90% 이상 지속 시 OOM 위험", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10 }, + "unit": "bytes" + } + }, + "targets": [ + { + "expr": "jvm_memory_bytes_used{area=\"heap\"}", + "legendFormat": "Used: {{instance}}", + "refId": "A" + }, + { + "expr": "jvm_memory_bytes_max{area=\"heap\"}", + "legendFormat": "Max: {{instance}}", + "refId": "B" + } + ] + }, + { + "id": 43, + "title": "JVM GC Time (Rate)", + "description": "초당 GC에 소요되는 시간. 급증 시 브로커 STW(Stop-The-World) 발생", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10 }, + "unit": "s" + } + }, + "targets": [ + { + "expr": "rate(jvm_gc_collection_seconds_sum[1m])", + "legendFormat": "GC Time: {{gc}}", + "refId": "A" + } + ] + } + ] +} \ No newline at end of file diff --git a/infra/k8s/base/monitoring/grafana/dashboards/kafka-exporter.json b/infra/k8s/base/monitoring/grafana/dashboards/kafka-exporter.json new file mode 100644 index 00000000..98218728 --- /dev/null +++ b/infra/k8s/base/monitoring/grafana/dashboards/kafka-exporter.json @@ -0,0 +1,449 @@ +{ + "uid": "kafka", + "title": "Kafka 모니터링", + "tags": ["kafka"], + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "schemaVersion": 39, + "refresh": "10s", + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "links": [], + "annotations": { "list": [] }, + "templating": { + "list": [ + { + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(kafka_consumergroup_lag, consumergroup)", + "includeAll": true, + "multi": true, + "name": "consumergroup", + "query": "label_values(kafka_consumergroup_lag, consumergroup)", + "refresh": 2, + "type": "query", + "label": "Consumer Group" + }, + { + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(kafka_consumergroup_lag{consumergroup=~\"$consumergroup\"}, topic)", + "includeAll": true, + "multi": true, + "name": "topic", + "query": "label_values(kafka_consumergroup_lag{consumergroup=~\"$consumergroup\"}, topic)", + "refresh": 2, + "type": "query", + "label": "Topic" + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Brokers Online", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 2 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "kafka_brokers", + "legendFormat": "Brokers", + "refId": "A" + } + ] + }, + { + "id": 2, + "title": "Total Topics", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "count(count by (topic) (kafka_topic_partitions))", + "legendFormat": "Topics", + "refId": "A" + } + ] + }, + { + "id": 3, + "title": "Under Replicated Partitions", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "sum(kafka_topic_partition_under_replicated_partition)", + "legendFormat": "Under Replicated", + "refId": "A" + } + ] + }, + { + "id": 4, + "title": "Consumer Groups", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "count(count by (consumergroup) (kafka_consumergroup_lag))", + "legendFormat": "Groups", + "refId": "A" + } + ] + }, + + { + "id": 10, + "title": "Consumer Group Lag (그룹별 합계)", + "type": "timeseries", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "gradientMode": "scheme", + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "mode": "none" }, + "axisBorderShow": false, + "axisCenteredZero": false, + "axisLabel": "Lag", + "scaleDistribution": { "type": "linear" } + }, + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 100 }, + { "color": "red", "value": 1000 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "sum(kafka_consumergroup_lag{consumergroup=~\"$consumergroup\", topic=~\"$topic\"} >= 0) by (consumergroup, topic)", + "legendFormat": "{{consumergroup}} / {{topic}}", + "refId": "A" + } + ] + }, + + { + "id": 11, + "title": "Consumer Group Lag (파티션별)", + "type": "timeseries", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 14 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "mode": "none" }, + "axisLabel": "Lag", + "scaleDistribution": { "type": "linear" } + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "kafka_consumergroup_lag{consumergroup=~\"$consumergroup\", topic=~\"$topic\"} >= 0", + "legendFormat": "{{consumergroup}} / {{topic}} / p{{partition}}", + "refId": "A" + } + ] + }, + + { + "id": 20, + "title": "현재 Lag 현황", + "type": "table", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 100 }, + { "color": "red", "value": 1000 } + ] + }, + "color": { "mode": "thresholds" } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "sortBy": [{ "displayName": "Value", "desc": true }] + }, + "transformations": [ + { "id": "reduce", "options": { "reducers": ["lastNotNull"] } } + ], + "targets": [ + { + "expr": "sum(kafka_consumergroup_lag{consumergroup=~\"$consumergroup\", topic=~\"$topic\"} >= 0) by (consumergroup, topic)", + "legendFormat": "{{consumergroup}} / {{topic}}", + "refId": "A", + "instant": true, + "format": "table" + } + ] + }, + + { + "id": 21, + "title": "소비 속도 (Messages Consumed/s)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "mode": "none" }, + "axisLabel": "msg/s", + "scaleDistribution": { "type": "linear" } + }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "sum(rate(kafka_consumergroup_current_offset{consumergroup=~\"$consumergroup\", topic=~\"$topic\"}[5m])) by (consumergroup, topic)", + "legendFormat": "{{consumergroup}} / {{topic}}", + "refId": "A" + } + ] + }, + + { + "id": 30, + "title": "Topic 유입 메시지/s", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 32 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "mode": "none" }, + "axisLabel": "msg/s", + "scaleDistribution": { "type": "linear" } + }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "max"] }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "sum(rate(kafka_topic_partition_current_offset{topic=~\"$topic\"}[5m])) by (topic)", + "legendFormat": "{{topic}}", + "refId": "A" + } + ] + }, + + { + "id": 40, + "title": "In-Sync Replicas", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 15, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "mode": "none" }, + "axisLabel": "count", + "scaleDistribution": { "type": "linear" } + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "table", "placement": "right", "calcs": ["mean", "last"] }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "sum(kafka_topic_partition_in_sync_replica{topic=~\"$topic\"}) by (topic)", + "legendFormat": "ISR: {{topic}}", + "refId": "A" + } + ] + }, + + { + "id": 41, + "title": "Consumer Group Members", + "type": "table", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "color": { "mode": "thresholds" } + }, + "overrides": [] + }, + "options": { "showHeader": true }, + "targets": [ + { + "expr": "kafka_consumergroup_members{consumergroup=~\"$consumergroup\"}", + "legendFormat": "{{consumergroup}}", + "refId": "A", + "instant": true, + "format": "table" + } + ] + } + ] +} \ No newline at end of file diff --git a/infra/k8s/base/monitoring/grafana/dashboards/temporal.json b/infra/k8s/base/monitoring/grafana/dashboards/temporal.json new file mode 100644 index 00000000..69254699 --- /dev/null +++ b/infra/k8s/base/monitoring/grafana/dashboards/temporal.json @@ -0,0 +1,1223 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "expr": "sum(temporal_workflow_active_thread_count{namespace=~\"$namespace\"}) or vector(0)", + "legendFormat": "Active", + "refId": "A" + } + ], + "title": "Active Workflows", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "expr": "sum(rate(temporal_workflow_completed_total{namespace=~\"$namespace\"}[5m])) or vector(0)", + "legendFormat": "Completed/s", + "refId": "A" + } + ], + "title": "Workflow Completed (5m rate)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.01 + }, + { + "color": "red", + "value": 0.1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(temporal_workflow_failed_total{namespace=~\"$namespace\"}[$__range]))", + "instant": true, + "legendFormat": "Failed", + "range": false, + "refId": "A" + } + ], + "title": "Workflow Failed", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "expr": "sum(temporal_worker_task_slots_used{namespace=~\"$namespace\"}) or vector(0)", + "legendFormat": "Backlog", + "refId": "A" + } + ], + "title": "Task Queue Backlog", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "count", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum(rate(temporal_workflow_completed_total{namespace=~\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "Completed", + "refId": "A" + }, + { + "expr": "sum(rate(temporal_workflow_task_execution_failed_total{namespace=~\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "Failed", + "refId": "B" + }, + { + "expr": "sum(rate(temporal_workflow_task_execution_failed_total{namespace=~\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "Timeout", + "refId": "C" + }, + { + "expr": "sum(rate(temporal_workflow_completed_total{namespace=~\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "Cancelled", + "refId": "D" + } + ], + "title": "Workflow Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "rate", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum(rate(temporal_activity_execution_latency_seconds_count{namespace=~\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "Executed", + "refId": "A" + }, + { + "expr": "sum(rate(temporal_workflow_task_execution_failed_total{namespace=~\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "Failed", + "refId": "B" + }, + { + "expr": "sum(rate(temporal_workflow_task_queue_poll_succeed_total{namespace=~\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "Poll Success", + "refId": "C" + } + ], + "title": "Activity Execution Rate & Failures", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "req/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum(rate(temporal_request_total{namespace=~\"$namespace\"}[$__rate_interval])) by (operation)", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "title": "Service Request Rate by Operation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "errors/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(temporal_request_failure_total{namespace=~\"$namespace\"}[$__rate_interval])) by (operation, service_name) or vector(0)", + "legendFormat": "{{operation}} ({{error_type}})", + "range": true, + "refId": "A" + } + ], + "title": "Service Error Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "ms", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 30, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(temporal_request_latency_bucket[15m])) by (le, operation))", + "legendFormat": "p95 {{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Service Latency P95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "ms", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 31, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(persistence_latency_bucket[$__rate_interval])) by (le, operation))", + "legendFormat": "p95 {{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Persistence Latency P95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "ms", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le, taskqueue, task_type) (rate(task_schedule_to_start_latency_bucket[5m])))", + "legendFormat": "p95 {{task_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Schedule-to-Start Latency P95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "tasks", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 40, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "temporal_worker_task_slots_used{namespace=~\"$namespace\"}", + "legendFormat": "{{task_queue}} ({{task_type}})", + "refId": "A" + } + ], + "title": "Task Queue Backlog by Queue", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "polls/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 41, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum(rate(poll_success{namespace=~\"$namespace\"}[$__rate_interval])) by (task_type)", + "legendFormat": "success ({{task_type}})", + "refId": "A" + }, + { + "expr": "sum(rate(poll_success_sync{namespace=~\"$namespace\"}[$__rate_interval])) by (task_type)", + "legendFormat": "sync ({{task_type}})", + "refId": "B" + }, + { + "expr": "sum(rate(temporal_workflow_task_queue_poll_empty_total{namespace=~\"$namespace\"}[$__rate_interval])) by (task_type)", + "legendFormat": "timeout ({{task_type}})", + "refId": "C" + } + ], + "title": "Task Poll Rate", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "temporal" + ], + "templating": { + "list": [ + { + "hide": 2, + "label": "Namespace", + "name": "namespace", + "query": "spot", + "skipUrlSync": false, + "type": "constant" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(temporal_request_total{namespace=~\"$namespace\"}, operation)", + "hide": 0, + "includeAll": true, + "label": "Operation", + "multi": true, + "name": "operation", + "options": [], + "query": "label_values(temporal_request_total{namespace=~\"$namespace\"}, operation)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Temporal Server", + "uid": "temporal-server", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/infra/k8s/base/monitoring/grafana/grafana.yaml b/infra/k8s/base/monitoring/grafana/grafana.yaml index bdc441ba..53805805 100644 --- a/infra/k8s/base/monitoring/grafana/grafana.yaml +++ b/infra/k8s/base/monitoring/grafana/grafana.yaml @@ -9,6 +9,8 @@ spec: resources: requests: storage: 1Gi +# # AWS용 +# storageClassName: gp3 --- apiVersion: apps/v1 kind: Deployment @@ -42,19 +44,35 @@ spec: - name: grafana-dashboards-spot configMap: name: grafana-dashboards-spot + - name: grafana-dashboards-9578 configMap: name: grafana-dashboards-9578 + - name: grafana-dashboards-15661 configMap: name: grafana-dashboards-15661 + - name: grafana-dashboards-15757 configMap: name: grafana-dashboards-15757 + - name: grafana-dashboards-15760 configMap: name: grafana-dashboards-15760 + - name: grafana-dashboards-kafka-broker + configMap: + name: grafana-dashboards-kafka-broker + + - name: grafana-dashboards-kafka-exporter + configMap: + name: grafana-dashboards-kafka-exporter + + - name: grafana-dashboards-temporal + configMap: + name: grafana-dashboards-temporal + containers: - name: grafana @@ -116,6 +134,20 @@ spec: subPath: 15661.json readOnly: true + - name: grafana-dashboards-kafka-broker + mountPath: /var/lib/grafana/dashboards/kafka-broker.json + subPath: kafka-broker.json + readOnly: true + + - name: grafana-dashboards-kafka-exporter + mountPath: /var/lib/grafana/dashboards/kafka-exporter.json + subPath: kafka-exporter.json + readOnly: true + + - name: grafana-dashboards-temporal + mountPath: /var/lib/grafana/dashboards/temporal.json + subPath: temporal.json + readOnly: true --- apiVersion: v1 diff --git a/infra/k8s/base/monitoring/loki/loki.yaml b/infra/k8s/base/monitoring/loki/loki.yaml index 3d3f1326..06313e67 100644 --- a/infra/k8s/base/monitoring/loki/loki.yaml +++ b/infra/k8s/base/monitoring/loki/loki.yaml @@ -9,6 +9,8 @@ spec: resources: requests: storage: 2Gi +# # AWS용 +# storageClassName: gp3 --- apiVersion: apps/v1 kind: Deployment diff --git a/infra/k8s/base/monitoring/monitoring-ingress.yaml b/infra/k8s/base/monitoring/monitoring-ingress.yaml deleted file mode 100644 index 22b9bd06..00000000 --- a/infra/k8s/base/monitoring/monitoring-ingress.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: grafana-ingress - namespace: monitoring - annotations: - kubernetes.io/ingress.class: nginx -# nginx.ingress.kubernetes.io/rewrite-target: /$1 # 첫 번째 prefix 제거 -spec: - ingressClassName: nginx - rules: - - host: grafana.localhost - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: grafana-service - port: - number: 80 \ No newline at end of file diff --git a/infra/k8s/base/monitoring/prometheus/kafka-metrics-config.yaml b/infra/k8s/base/monitoring/prometheus/kafka-metrics-config.yaml new file mode 100644 index 00000000..337300f0 --- /dev/null +++ b/infra/k8s/base/monitoring/prometheus/kafka-metrics-config.yaml @@ -0,0 +1,72 @@ +lowercaseOutputName: true +rules: + # Broker info + - pattern: kafka.server<>Value + name: kafka_server_$1_$2 + type: GAUGE + labels: + clientId: "$3" + topic: "$4" + partition: "$5" + - pattern: kafka.server<>Value + name: kafka_server_$1_$2 + type: GAUGE + labels: + clientId: "$3" + broker: "$4:$5" + - pattern: kafka.server<>Value + name: kafka_server_$1_$2 + type: GAUGE + + # Controller + - pattern: kafka.controller<>Value + name: kafka_controller_$1_$2 + type: GAUGE + + # Network + - pattern: kafka.network<>Value + name: kafka_network_$1_$2 + type: GAUGE + labels: + request: "$3" + version: "$4" + - pattern: kafka.network<>Value + name: kafka_network_$1_$2 + type: GAUGE + labels: + request: "$3" + - pattern: kafka.network<>Value + name: kafka_network_$1_$2 + type: GAUGE + + # Log + - pattern: kafka.log<>Value + name: kafka_log_$1_$2 + type: GAUGE + labels: + topic: "$3" + partition: "$4" + + # Catch-all for remaining kafka metrics + - pattern: kafka.(.+)<>Value + name: kafka_$1_$2_$3 + type: GAUGE + + - pattern: java.lang(\w+) + name: jvm_memory_bytes_$1 + type: GAUGE + attrNameSnakeCase: true + labels: + area: heap + + - pattern: java.lang<>CollectionTime + name: jvm_gc_collection_seconds_sum + type: COUNTER + labels: + gc: "$1" + + - pattern: java.lang<>CollectionCount + name: jvm_gc_collection_count + type: COUNTER + labels: + gc: "$1" \ No newline at end of file diff --git a/infra/k8s/base/monitoring/prometheus/value.yaml b/infra/k8s/base/monitoring/prometheus/value.yaml index 8ffc20fa..67b70a7e 100644 --- a/infra/k8s/base/monitoring/prometheus/value.yaml +++ b/infra/k8s/base/monitoring/prometheus/value.yaml @@ -8,6 +8,7 @@ alertmanager: prometheus: prometheusSpec: serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false # retention 설정 (튜닝 가능) retention: 10d @@ -24,4 +25,4 @@ kubeStateMetrics: enabled: true nodeExporter: - enabled: true + enabled: true \ No newline at end of file diff --git a/infra/k8s/base/monitoring/servicemonitors/kafka-broker-podmonitor.yaml b/infra/k8s/base/monitoring/servicemonitors/kafka-broker-podmonitor.yaml new file mode 100644 index 00000000..509cf698 --- /dev/null +++ b/infra/k8s/base/monitoring/servicemonitors/kafka-broker-podmonitor.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: kafka-broker + namespace: monitoring + labels: + release: prom +spec: + namespaceSelector: + matchNames: + - infra + selector: + matchLabels: + strimzi.io/component-type: kafka + podMetricsEndpoints: + - port: tcp-prometheus + path: /metrics + interval: 15s \ No newline at end of file diff --git a/infra/k8s/base/monitoring/servicemonitors/kafka-exporter-servicemonitor.yaml b/infra/k8s/base/monitoring/servicemonitors/kafka-exporter-servicemonitor.yaml new file mode 100644 index 00000000..7778911f --- /dev/null +++ b/infra/k8s/base/monitoring/servicemonitors/kafka-exporter-servicemonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: kafka-exporter + namespace: monitoring + labels: + release: prom +spec: + namespaceSelector: + matchNames: + - infra + selector: + matchLabels: + strimzi.io/cluster: kafka-cluster + strimzi.io/name: kafka-cluster-kafka-exporter + podMetricsEndpoints: + - port: tcp-prometheus + path: /metrics + interval: 15s \ No newline at end of file diff --git a/infra/k8s/base/monitoring/servicemonitors/temporal-servicemonitor.yaml b/infra/k8s/base/monitoring/servicemonitors/temporal-servicemonitor.yaml new file mode 100644 index 00000000..3efa7cf9 --- /dev/null +++ b/infra/k8s/base/monitoring/servicemonitors/temporal-servicemonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: temporal + namespace: monitoring + labels: + release: prom +spec: + namespaceSelector: + matchNames: + - infra + selector: + matchExpressions: + - {key: app, operator: In, values: [temporal, temporal-frontend, temporal-history, temporal-matching]} + endpoints: + - targetPort: 9090 # 이름(metrics) 대신 번호를 직접 써서 실수를 방지합니다. + path: /metrics + interval: 15s + honorLabels: true \ No newline at end of file diff --git a/infra/k8s/base/namespace.yaml b/infra/k8s/base/namespace.yaml index 8d3fbd50..1193e8d7 100644 --- a/infra/k8s/base/namespace.yaml +++ b/infra/k8s/base/namespace.yaml @@ -12,3 +12,8 @@ apiVersion: v1 kind: Namespace metadata: name: monitoring +--- +apiVersion: v1 +kind: Namespace +metadata: + name: infra diff --git a/infra/k8s/base/postgres.yaml b/infra/k8s/base/postgres.yaml index 58885ff8..60014156 100644 --- a/infra/k8s/base/postgres.yaml +++ b/infra/k8s/base/postgres.yaml @@ -9,6 +9,7 @@ spec: resources: requests: storage: 1Gi + storageClassName: gp3 --- apiVersion: apps/v1 kind: Deployment @@ -33,28 +34,37 @@ spec: - containerPort: 5432 env: - name: POSTGRES_DB - value: "myapp_db" + valueFrom: + secretKeyRef: + name: spot-secrets + key: DB_NAME - name: POSTGRES_USER - value: "admin" + valueFrom: + secretKeyRef: + name: spot-secrets + key: SPRING_DATASOURCE_USERNAME - name: POSTGRES_PASSWORD valueFrom: secretKeyRef: name: spot-secrets key: SPRING_DATASOURCE_PASSWORD + - name: PGDATA + value: /var/lib/postgresql/data/pgdata volumeMounts: - name: postgres-storage mountPath: /var/lib/postgresql/data resources: requests: - memory: "256Mi" - cpu: "250m" + memory: 256Mi + cpu: 100m limits: - memory: "512Mi" - cpu: "500m" + memory: 512Mi + cpu: 500m volumes: - name: postgres-storage persistentVolumeClaim: claimName: postgres-pvc + --- apiVersion: v1 kind: Service diff --git a/infra/k8s/base/rds-relay.yaml b/infra/k8s/base/rds-relay.yaml new file mode 100644 index 00000000..e66bf53d --- /dev/null +++ b/infra/k8s/base/rds-relay.yaml @@ -0,0 +1,31 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: rds-relay + namespace: spot +spec: + replicas: 1 + selector: + matchLabels: + app: rds-relay + template: + metadata: + labels: + app: rds-relay + spec: + containers: + - name: relay + image: alpine/socat + command: ["socat", "TCP-LISTEN:5432,fork,reuseaddr", "TCP:spot-dev-db.cj6iq2cow597.ap-northeast-2.rds.amazonaws.com:5432"] +--- +apiVersion: v1 +kind: Service +metadata: + name: rds-tunnel + namespace: spot +spec: + ports: + - port: 5432 + targetPort: 5432 + selector: + app: rds-relay \ No newline at end of file diff --git a/infra/k8s/base/secrets.yaml b/infra/k8s/base/secrets.yaml new file mode 100644 index 00000000..85331daf --- /dev/null +++ b/infra/k8s/base/secrets.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Secret +metadata: + name: monitoring-secrets + namespace: monitoring +type: Opaque +stringData: + GF_SECURITY_ADMIN_USER: "spot" + GF_SECURITY_ADMIN_PASSWORD: "spot-grafana" \ No newline at end of file diff --git a/infra/k8s/base/storageclass-gp3.yaml b/infra/k8s/base/storageclass-gp3.yaml new file mode 100644 index 00000000..91917b9d --- /dev/null +++ b/infra/k8s/base/storageclass-gp3.yaml @@ -0,0 +1,10 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: gp3 +provisioner: ebs.csi.aws.com +volumeBindingMode: WaitForFirstConsumer +allowVolumeExpansion: true +parameters: + type: gp3 + fsType: ext4 \ No newline at end of file diff --git a/infra/k8s/base/temporal/temporal-ui.yaml b/infra/k8s/base/temporal/temporal-ui.yaml index fc642a9a..4f7f7278 100644 --- a/infra/k8s/base/temporal/temporal-ui.yaml +++ b/infra/k8s/base/temporal/temporal-ui.yaml @@ -2,7 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: temporal-ui - namespace: spot + namespace: infra spec: replicas: 1 selector: @@ -20,15 +20,27 @@ spec: - containerPort: 8080 env: - name: TEMPORAL_ADDRESS - value: "temporal:7233" + valueFrom: + secretKeyRef: + name: spot-secrets + key: SPRING_TEMPORAL_CONNECTION_TARGET - name: TEMPORAL_CORS_ALLOW_ORIGINS value: "*" + + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" --- +# Service apiVersion: v1 kind: Service metadata: name: temporal-ui-svc - namespace: spot + namespace: infra spec: type: ClusterIP selector: diff --git a/infra/k8s/base/temporal/temporal.yaml b/infra/k8s/base/temporal/temporal.yaml index 638b8ae8..ece6952e 100644 --- a/infra/k8s/base/temporal/temporal.yaml +++ b/infra/k8s/base/temporal/temporal.yaml @@ -2,7 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: temporal - namespace: spot + namespace: infra spec: replicas: 1 selector: @@ -19,7 +19,14 @@ spec: imagePullPolicy: IfNotPresent ports: - containerPort: 7233 + - containerPort: 9090 + name: metrics env: + # 모니터링 연결 정보 + - name: PROMETHEUS_ENDPOINT + value: "0.0.0.0:9090" + + # DB 연결 정보 - name: DB value: postgres12 - name: POSTGRES_SEEDS @@ -39,6 +46,7 @@ spec: secretKeyRef: name: spot-secrets key: SPRING_DATASOURCE_PASSWORD + - name: DBNAME value: "spot_temporal" - name: VISIBILITY_DBNAME @@ -47,38 +55,46 @@ spec: value: "false" - name: SKIP_SCHEMA_SETUP value: "false" - - name: POSTGRES_TLS_ENABLED - value: "true" - - name: POSTGRES_TLS_DISABLE_HOST_VERIFICATION - value: "true" - - name: SQL_TLS_ENABLED - value: "true" - - name: SQL_HOST_VERIFICATION - value: "false" - name: ENABLE_ES value: "false" - name: SKIP_DEFAULT_NAMESPACE_CREATION value: "false" - name: LOG_LEVEL value: "warn" + +# # RDS 연결 +# - name: POSTGRES_TLS_ENABLED +# value: "true" +# - name: POSTGRES_TLS_DISABLE_HOST_VERIFICATION +# value: "true" +# - name: SQL_TLS_ENABLED +# value: "true" +# - name: SQL_HOST_VERIFICATION +# value: "false" + resources: requests: memory: "256Mi" cpu: "250m" limits: - memory: "512Mi" - cpu: "500m" + memory: "1Gi" + cpu: "750m" --- apiVersion: v1 kind: Service metadata: name: temporal - namespace: spot + namespace: infra + labels: + app: temporal spec: type: ClusterIP selector: app: temporal ports: - - port: 7233 + - name: grpc + port: 7233 targetPort: 7233 - + - name: metrics + port: 9090 + targetPort: 9090 \ No newline at end of file diff --git a/infra/k8s/config/common.yml b/infra/k8s/config/common.yml new file mode 100644 index 00000000..8ba5de70 --- /dev/null +++ b/infra/k8s/config/common.yml @@ -0,0 +1,100 @@ +spring: + jackson: + time-zone: Asia/Seoul + application: + name: spot-common + + datasource: + url: ${SPRING_DATASOURCE_URL:jdbc:postgresql://${DB_HOST:localhost}:5432/myapp_db} + username: ${SPRING_DATASOURCE_USERNAME:admin} + password: ${SPRING_DATASOURCE_PASSWORD:secret} + driver-class-name: org.postgresql.Driver + hikari: + maximum-pool-size: 3 + minimum-idle: 1 + + data: + redis: + host: ${SPRING_DATA_REDIS_HOST:localhost} + port: ${SPRING_DATA_REDIS_PORT:6379} + + temporal: + connection: + target: ${SPRING_TEMPORAL_CONNECTION_TARGET:temporal:7233} + namespace: default + # + jpa: + open-in-view: false + hibernate: + ddl-auto: update # 개발 단계에서는 update, 운영 단계에서는 none 또는 validate 권장 + show-sql: false # 실행되는 SQL을 콘솔에 출력 + properties: + hibernate: + format_sql: true + dialect: org.hibernate.dialect.PostgreSQLDialect + jwt: + secret: ${SPRING_JWT_SECRET:MyVeryStrongSecretKeyForJWT2024!!ThisIsAtLeast32BytesLongForHS256Algorithm} + expire-ms: ${SPRING_JWT_EXPIRE_MS:3600000} + + security: + refresh-token: + expire-days: 14 + + mvc: + cors: + mappings: + '[/**]': + allowedOriginPatterns: + - '*' + allowedMethods: + - GET + - POST + - PATCH + - DELETE + - OPTIONS + allowedHeaders: + - '*' + exposedHeaders: + - Authorization + allowCredentials: true + + mail: + host: smtp.gmail.com + port: 587 + username: ${spring.mail.username:${EMAIL_USERNAME}} + password: ${spring.mail.password:${EMAIL_PASSWORD}} + properties: + mail.debug: true + mail.connectiontimeout: 5000 + mail.smtp.auth: true + mail.smtp.starttls.enable: true + mail.smtp.starttls.required: true + +service: + active-regions: 종로구 + +toss: + payments: + base-url: https://api.tosspayments.com + customerKey: ${spring.toss.customerKey:${TOSS_CUSTOMER_KEY}} + secretKey: ${spring.toss.secretKey:${TOSS_SECRET_KEY}} + timeout: 10 + +management: + endpoints: + web: + exposure: + include: + - health + - info + - metrics + - prometheus + + endpoint: + health: + probes: + enabled: true + + metrics: + tags: + application: ${spring.application.name} \ No newline at end of file diff --git a/infra/k8s/config/kafka-topics.yml b/infra/k8s/config/kafka-topics.yml new file mode 100644 index 00000000..a8b0bad5 --- /dev/null +++ b/infra/k8s/config/kafka-topics.yml @@ -0,0 +1,50 @@ +spring: + kafka: + bootstrap-servers: ${KAFKA_BOOTSTRAP_SERVERS:kafka1:29092,kafka2:29092,kafka3:29092} + + listener: + ack-mode: MANUAL + + producer: + compression.type: lz4 + key-serializer: org.apache.kafka.common.serialization.StringSerializer + value-serializer: org.apache.kafka.common.serialization.StringSerializer + acks: all + retries: 5 + properties: + retry.backoff.ms: 2000 + reconnect.backoff.ms: 5000 + reconnect.backoff.max.ms: 30000 + delivery.timeout.ms: 30000 + enable.idempotence: true + max.in.flight.requests.per.connection: 5 + + consumer: + key-deserializer: org.apache.kafka.common.serialization.StringDeserializer + value-deserializer: org.apache.kafka.common.serialization.StringDeserializer + enable-auto-commit: false + auto-offset-reset: earliest + max-poll-records: 50 + max-poll-interval-ms: 300000 + session-timeout-ms: 15000 + heartbeat-interval-ms: 3000 + properties: + partition.assignment.strategy: org.apache.kafka.clients.consumer.CooperativeStickyAssignor + group: + order: order-group + payment: payment-group + customer: customer-group + chef: chef-group + owner: owner-group + + topic: + order: + created: spot.order.created + pending: spot.order.pending + accepted: spot.order.accepted + cancelled: spot.order.cancelled + payment: + succeeded: spot.payment.succeeded + refunded: spot.payment.refunded + payment-auth: + required: spot.payment_auth.required \ No newline at end of file diff --git a/infra/k8s/config/spot-gateway.yml b/infra/k8s/config/spot-gateway.yml new file mode 100644 index 00000000..ec48d0d5 --- /dev/null +++ b/infra/k8s/config/spot-gateway.yml @@ -0,0 +1,112 @@ +spring: + cloud: + gateway: + server: + webflux: + routes: + - id: user-auth + uri: ${SPOT_USER_URI:http://spot-user:8081} + predicates: + - Path=/api/login,/api/join,/api/auth/refresh + + - id: user-service + uri: ${SPOT_USER_URI:http://spot-user:8081} + predicates: + - Path=/api/users/**,/api/admin/** + + - id: store-service + uri: ${SPOT_STORE_URI:http://spot-store:8083} + predicates: + - Path=/api/stores/**,/api/categories/**,/api/reviews/** + + - id: order-service + uri: ${SPOT_ORDER_URI:http://spot-order:8082} + predicates: + - Path=/api/orders/** + + - id: payment-service + uri: ${SPOT_PAYMENT_URI:http://spot-payment:8084} + predicates: + - Path=/api/payments/** + + - id: block-internal + uri: http://localhost:9999 + predicates: + - Path=/internal/** + filters: + - SetStatus=403 + # ==================== OpenAPI Routes ==================== + - id: user-api-docs + uri: http://spot-user:8081 + predicates: + - Path=/v3/api-docs/user + filters: + - RewritePath=/v3/api-docs/user, /v3/api-docs + + - id: order-api-docs + uri: http://spot-order:8082 + predicates: + - Path=/v3/api-docs/order + filters: + - RewritePath=/v3/api-docs/order, /v3/api-docs + + - id: store-api-docs + uri: http://spot-store:8083 + predicates: + - Path=/v3/api-docs/store + filters: + - RewritePath=/v3/api-docs/store, /v3/api-docs + + - id: payment-api-docs + uri: http://spot-payment:8084 + predicates: + - Path=/v3/api-docs/payment + filters: + - RewritePath=/v3/api-docs/payment, /v3/api-docs + metrics: + enabled: true + httpclient: + wiretap: false + response-timeout: 5s + httpserver: + wiretap: false + + + +# ==================== SpringDoc OpenAPI (통합 Swagger) ==================== +springdoc: + swagger-ui: + urls: + - name: User Service + url: /v3/api-docs/user + - name: Store Service + url: /v3/api-docs/store + - name: Order Service + url: /v3/api-docs/order + - name: Payment Service + url: /v3/api-docs/payment + urls-primary-name: User Service + +test: + marker: spot-gateway-yml-loaded + +logging: + level: + org.springframework.boot.context.config: DEBUG + org.springframework.cloud.gateway: DEBUG + org.springframework.cloud.gateway.route.RouteDefinitionRouteLocator: DEBUG + org.springframework.cloud.gateway.handler.RoutePredicateHandlerMapping: TRACE + +management: + endpoints: + web: + exposure: + include: + - health + - info + - metrics + - gateway + - prometheus + endpoint: + gateway: + access: unrestricted \ No newline at end of file diff --git a/infra/k8s/config/spot-order.yml b/infra/k8s/config/spot-order.yml new file mode 100644 index 00000000..80ac090d --- /dev/null +++ b/infra/k8s/config/spot-order.yml @@ -0,0 +1,104 @@ +server: + port: 8082 + +spring: + application: + name: spot-order + temporal: + workers-auto-discovery: + packages: + - "com.example.Spot" + workers: + - task-queue: ORDER_TASK_QUEUE + name: order-worker + threads: + virtual: + enabled: true + +feign: + user: + url: ${FEIGN_USER_URL:http://spot-user:8081} + store: + url: ${FEIGN_STORE_URL:http://spot-store:8083} + payment: + url: ${FEIGN_PAYMENT_URL:http://spot-payment:8084} + client: + config: + spot-user: + connectTimeout: 100 + readTimeout: 500 + loggerLevel: BASIC + + + +resilience4j: + timelimiter: + configs: + default: + cancelRunningFuture: true + instances: + user_validate_activeUser: + timeoutDuration: 250ms + + bulkhead: + instances: + user_validate_activeUser: + maxConcurrentCalls: 40 + maxWaitDuration: 0 + + retry: + instances: + user_validate_activeUser: + maxAttempts: 1 + waitDuration: 0ms + + circuitbreaker: + configs: + cb_slowCall_user: + slidingWindowType: COUNT_BASED + slidingWindowSize: 20 + minimumNumberOfCalls: 10 + slowCallRateThreshold: 50 + slowCallDurationThreshold: 250ms + failureRateThreshold: 50 + waitDurationInOpenState: 10s + permittedNumberOfCallsInHalfOpenState: 5 + automaticTransitionFromOpenToHalfOpenEnabled: true + recordExceptions: + - java.io.IOException + - java.net.SocketTimeoutException + - feign.RetryableException + - feign.FeignException + instances: + user_validate_activeUser: + baseConfig: cb_slowCall_user + +management: + endpoints: + web: + exposure: + include: + - health + - info + - metrics + - beans + - mappings + - env + - configprops + - prometheus + + endpoint: + health: + show-details: always + metrics: + tags: + application: ${spring.application.name} + +logging: + level: + root: INFO + com.example.Spot: INFO + feign: INFO + io.github.resilience4j.circuitbreaker: INFO + io.github.resilience4j.retry: INFO + io.github.resilience4j.bulkhead: INFO diff --git a/infra/k8s/config/spot-payment.yml b/infra/k8s/config/spot-payment.yml new file mode 100644 index 00000000..27cd4ac0 --- /dev/null +++ b/infra/k8s/config/spot-payment.yml @@ -0,0 +1,121 @@ +server: + port: 8084 + +spring: + application: + name: spot-payment + temporal: + workers-auto-discovery: + packages: + - "com.example.Spot" + workers: + - task-queue: PAYMENT_TASK_QUEUE + name: payment-worker + threads: + virtual: + enabled: true + +feign: + user: + url: ${FEIGN_USER_URL:http://spot-user:8081} + order: + url: ${FEIGN_ORDER_URL:http://spot-order:8082} + store: + url: ${FEIGN_STORE_URL:http://spot-store:8083} + payment: + url: ${FEIGN_PAYMENT_URL:http://spot-payment:8084} + client: + config: + spot-store: + connectTimeout: 100 + readTimeout: 600 + loggerLevel: BASIC + spot-payment: + connectTimeout: 100 + readTimeout: 700 + loggerLevel: BASIC + +resilience4j: + retry: + instances: + store_me_ownership: + maxAttempts: 1 + waitDuration: 0ms + store_menus_validation: + maxAttempts: 2 + waitDuration: 50ms + retryExceptions: + - java.io.IOException + - java.net.SocketTimeoutException + - feign.RetryableException + payment_ready_create: + maxAttempts: 1 + waitDuration: 0ms + + circuitbreaker: + configs: + cb_failureRate_short: + slidingWindowType: COUNT_BASED + slidingWindowSize: 20 + minimumNumberOfCalls: 10 + failureRateThreshold: 50 + waitDurationInOpenState: 10s + permittedNumberOfCallsInHalfOpenState: 5 + automaticTransitionFromOpenToHalfOpenEnabled: true + recordExceptions: + - java.io.IOException + - java.net.SocketTimeoutException + - feign.RetryableException + - feign.FeignException + + cb_slowCall_short: + slidingWindowType: COUNT_BASED + slidingWindowSize: 20 + minimumNumberOfCalls: 10 + slowCallRateThreshold: 50 + slowCallDurationThreshold: 300ms + failureRateThreshold: 50 + waitDurationInOpenState: 10s + permittedNumberOfCallsInHalfOpenState: 5 + automaticTransitionFromOpenToHalfOpenEnabled: true + recordExceptions: + - java.io.IOException + - java.net.SocketTimeoutException + - feign.RetryableException + - feign.FeignException + + instances: + store_me_ownership: + baseConfig: cb_failureRate_short + store_menus_validation: + baseConfig: cb_slowCall_short + payment_ready_create: + baseConfig: cb_slowCall_short + +management: + endpoints: + web: + exposure: + include: + - health + - info + - metrics + - prometheus + + endpoint: + health: + probes: + enabled: true + + metrics: + tags: + application: ${spring.application.name} + +logging: + level: + root: INFO + com.example.Spot: INFO + feign: INFO + io.github.resilience4j.circuitbreaker: INFO + io.github.resilience4j.retry: INFO + io.github.resilience4j.bulkhead: INFO \ No newline at end of file diff --git a/infra/k8s/config/spot-store.yml b/infra/k8s/config/spot-store.yml new file mode 100644 index 00000000..3646e300 --- /dev/null +++ b/infra/k8s/config/spot-store.yml @@ -0,0 +1,73 @@ +server: + port: 8083 + +feign: + user: + url: ${FEIGN_USER_URL:http://spot-user:8081} + order: + url: ${FEIGN_ORDER_URL:http://spot-order:8082} + client: + config: + spot-user: + connectTimeout: 100 + readTimeout: 400 + loggerLevel: BASIC + spot-order: + connectTimeout: 100 + readTimeout: 400 + loggerLevel: BASIC + + +resilience4j: + retry: + instances: + user_validate_activeUser: + maxAttempts: 1 + waitDuration: 0ms + + circuitbreaker: + configs: + cb_failureRate_short: + slidingWindowType: COUNT_BASED + slidingWindowSize: 20 + minimumNumberOfCalls: 10 + failureRateThreshold: 50 + waitDurationInOpenState: 10s + permittedNumberOfCallsInHalfOpenState: 5 + automaticTransitionFromOpenToHalfOpenEnabled: true + recordExceptions: + - java.io.IOException + - java.net.SocketTimeoutException + - feign.RetryableException + - feign.FeignException + instances: + user_validate_activeUser: + baseConfig: cb_failureRate_short + +management: + endpoints: + web: + exposure: + include: + - health + - info + - metrics + - prometheus + + endpoint: + health: + probes: + enabled: true + + metrics: + tags: + application: ${spring.application.name} + +logging: + level: + root: INFO + com.example.Spot: INFO + feign.Logger: DEBUG + io.github.resilience4j.circuitbreaker: DEBUG + io.github.resilience4j.retry: DEBUG + io.github.resilience4j.bulkhead: DEBUG \ No newline at end of file diff --git a/infra/k8s/config/spot-user.yml b/infra/k8s/config/spot-user.yml new file mode 100644 index 00000000..4a8f3d67 --- /dev/null +++ b/infra/k8s/config/spot-user.yml @@ -0,0 +1,43 @@ +server: + port: 8081 + +feign: + order: + url: ${FEIGN_ORDER_URL:http://spot-order:8082} + store: + url: ${FEIGN_STORE_URL:http://spot-store:8083} + payment: + url: ${FEIGN_PAYMENT_URL:http://spot-payment:8084} + client: + config: + default: + loggerLevel: FULL + + +management: + endpoints: + web: + exposure: + include: + - health + - info + - metrics + - prometheus + + endpoint: + health: + probes: + enabled: true + + metrics: + tags: + application: ${spring.application.name} + +logging: + level: + feign: DEBUG + com.example.Spot: DEBUG + com.example.Spot.global.feign: DEBUG + io.github.resilience4j.circuitbreaker: INFO + io.github.resilience4j.retry: INFO + io.github.resilience4j.bulkhead: INFO \ No newline at end of file diff --git a/infra/k8s/connectors/order-outbox.json b/infra/k8s/connectors/order-outbox.json new file mode 100644 index 00000000..1d40ef9c --- /dev/null +++ b/infra/k8s/connectors/order-outbox.json @@ -0,0 +1,41 @@ +{ + "name": "order-outbox-connector", + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "tasks.max": "1", + "database.hostname": "${env:DB_HOST}", + "database.port": "5432", + "database.user": "${env:SPRING_DATASOURCE_USERNAME}", + "database.password": "${env:SPRING_DATASOURCE_PASSWORD}", + "database.dbname": "${env:DB_NAME}", + "topic.prefix": "order_outbox_cdc", + "plugin.name": "pgoutput", + "slot.name": "order_outbox_slot", + "snapshot.mode": "no_data", + "snapshot.locking.mode": "none", + "table.include.list": "public.p_order_outbox", + "tombstones.on.delete": "false", + "transforms": "outbox", + "transforms.outbox.type": "io.debezium.transforms.outbox.EventRouter", + "transforms.outbox.table.field.event.id": "id", + "transforms.outbox.table.field.event.key": "aggregate_id", + "transforms.outbox.table.field.event.type": "event_type", + "transforms.outbox.table.field.event.payload": "payload", + "transforms.outbox.route.by.field": "event_type", + "transforms.outbox.route.topic.replacement": "${routedByValue}", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "key.converter.schemas.enable": "false", + "value.converter.schemas.enable": "false", + "transforms.outbox.table.expand.json.payload": "true", + "producer.acks": "all", + "producer.enable.idempotence": "true", + "producer.max.in.flight.requests.per.connection": "5", + "producer.retries": "100", + "producer.delivery.timeout.ms": "120000", + "producer.retry.backoff.ms": "500", + "producer.compression.type": "lz4", + "producer.linger.ms": "20", + "producer.batch.size": "65536" + } +} \ No newline at end of file diff --git a/infra/k8s/connectors/payment-outbox.json b/infra/k8s/connectors/payment-outbox.json new file mode 100644 index 00000000..0675d4a9 --- /dev/null +++ b/infra/k8s/connectors/payment-outbox.json @@ -0,0 +1,41 @@ +{ + "name": "payment-outbox-connector", + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "tasks.max": "1", + "database.hostname": "${env:DB_HOST}", + "database.port": "5432", + "database.user": "${env:SPRING_DATASOURCE_USERNAME}", + "database.password": "${env:SPRING_DATASOURCE_PASSWORD}", + "database.dbname": "${env:DB_NAME}", + "topic.prefix": "payment_outbox_cdc", + "plugin.name": "pgoutput", + "slot.name": "payment_outbox_slot", + "snapshot.mode": "no_data", + "snapshot.locking.mode": "none", + "table.include.list": "public.p_payment_outbox", + "tombstones.on.delete": "false", + "transforms": "outbox", + "transforms.outbox.type": "io.debezium.transforms.outbox.EventRouter", + "transforms.outbox.table.field.event.id": "id", + "transforms.outbox.table.field.event.key": "aggregate_id", + "transforms.outbox.table.field.event.type": "event_type", + "transforms.outbox.table.field.event.payload": "payload", + "transforms.outbox.route.by.field": "event_type", + "transforms.outbox.route.topic.replacement": "${routedByValue}", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "key.converter.schemas.enable": "false", + "value.converter.schemas.enable": "false", + "transforms.outbox.table.expand.json.payload": "true", + "producer.acks": "all", + "producer.enable.idempotence": "true", + "producer.max.in.flight.requests.per.connection": "5", + "producer.retries": "100", + "producer.delivery.timeout.ms": "120000", + "producer.retry.backoff.ms": "500", + "producer.compression.type": "lz4", + "producer.linger.ms": "20", + "producer.batch.size": "65536" + } +} \ No newline at end of file diff --git a/infra/k8s/connectors/register-connectors.sh b/infra/k8s/connectors/register-connectors.sh new file mode 100644 index 00000000..7c4a201f --- /dev/null +++ b/infra/k8s/connectors/register-connectors.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +TARGET_URL=${CONNECT_URL:-"http://connect:8083"} + +echo "Waiting for Kafka Connect..." +while [ $(curl -s -o /dev/null -w "%{http_code}" $TARGET_URL) -ne 200 ]; do + sleep 3 +done + +echo "Registering connectors from /configs..." +for file in /configs/*.json; do + filename=$(basename "$file") + echo "Processing $filename..." + + sed -e "s|\${env:DB_HOST}|$DB_HOST|g" \ + -e "s|\${env:SPRING_DATASOURCE_USERNAME}|$SPRING_DATASOURCE_USERNAME|g" \ + -e "s|\${env:SPRING_DATASOURCE_PASSWORD}|$SPRING_DATASOURCE_PASSWORD|g" \ + -e "s|\${env:DB_NAME}|$DB_NAME|g" \ + "$file" > "/tmp/$filename" + + response=$(curl -s -X POST -H "Content-Type: application/json" \ + -d @"/tmp/$filename" \ + $TARGET_URL/connectors) + echo "Response for $filename: $response" +done + +echo "ALL connectors Created" \ No newline at end of file diff --git a/infra/k8s/kustomization.yaml b/infra/k8s/kustomization.yaml index 51a2819a..729c379e 100644 --- a/infra/k8s/kustomization.yaml +++ b/infra/k8s/kustomization.yaml @@ -5,14 +5,12 @@ kind: Kustomization resources: # Base - - base/namespace.yaml - - base/configmap.yaml -# - base/postgres.yaml -# - base/redis.yaml +# # RDS 접속용 pod +# - base/rds-relay.yaml # Kafka - base/kafka/kafka.yaml - - base/kafka/kafka-nodepool.yml + - base/kafka/kafka-nodepool.yaml - base/kafka/kafka-connect.yaml - base/kafka/connectors.yaml - base/kafka/kafka-ui.yaml @@ -25,7 +23,6 @@ resources: # Monitoring & Logging - base/monitoring/loki/loki.yaml - base/monitoring/loki/loki-config.yaml - - base/monitoring/monitoring-ingress.yaml - base/monitoring/fluent-bit/fluent-bit.yaml - base/monitoring/fluent-bit/fluent-bit-config.yaml @@ -39,38 +36,36 @@ resources: - base/monitoring/servicemonitors/spot-order-servicemonitor.yaml - base/monitoring/servicemonitors/spot-store-servicemonitor.yaml - base/monitoring/servicemonitors/spot-payment-servicemonitor.yaml - - # Spot Apps - - apps/spot-ingress.yaml - - apps/spot-gateway.yaml - - apps/spot-user.yaml - - apps/spot-store.yaml - - apps/spot-order.yaml - - apps/spot-payment.yaml + - base/monitoring/servicemonitors/kafka-broker-podmonitor.yaml + - base/monitoring/servicemonitors/kafka-exporter-servicemonitor.yaml + - base/monitoring/servicemonitors/temporal-servicemonitor.yaml # config 디렉토리의 yml 파일들을 ConfigMap으로 생성 configMapGenerator: - name: spot-app-config namespace: spot files: - - ../../config/common.yml - - ../../config/kafka-topics.yml - - ../../config/spot-gateway.yml - - ../../config/spot-user.yml - - ../../config/spot-store.yml - - ../../config/spot-order.yml - - ../../config/spot-payment.yml +# - ../../config/common.yml +# - ../../config/kafka-topics.yml +# - ../../config/spot-gateway.yml +# - ../../config/spot-user.yml +# - ../../config/spot-store.yml +# - ../../config/spot-order.yml +# - ../../config/spot-payment.yml + - config/spot-gateway.yml + - config/common.yml + - config/kafka-topics.yml + - config/spot-user.yml + - config/spot-store.yml + - config/spot-order.yml + - config/spot-payment.yml options: disableNameSuffixHash: true - - name: kafka-connect-init-config - namespace: spot + - name: kafka-metrics-config + namespace: infra files: - - ../../connectors/order-outbox.json - - ../../connectors/payment-outbox.json - - ../../connectors/register-connectors.sh - options: - disableNameSuffixHash: true + - base/monitoring/prometheus/kafka-metrics-config.yaml - name: grafana-dashboards-spot namespace: monitoring @@ -97,14 +92,37 @@ configMapGenerator: files: - base/monitoring/grafana/dashboards/15661.json + - name: grafana-dashboards-kafka-broker + namespace: monitoring + files: + - base/monitoring/grafana/dashboards/kafka-broker.json + + - name: grafana-dashboards-temporal + namespace: monitoring + files: + - base/monitoring/grafana/dashboards/temporal.json + + - name: grafana-dashboards-kafka-exporter + namespace: monitoring + files: + - base/monitoring/grafana/dashboards/kafka-exporter.json + secretGenerator: - name: spot-secrets namespace: spot envs: - - ../../.env +# - ../../.env + - config/.env options: disableNameSuffixHash: true + - name: spot-secrets + namespace: infra + envs: + # - ../../.env + - config/.env + options: + disableNameSuffixHash: true generatorOptions: disableNameSuffixHash: true \ No newline at end of file diff --git a/infra/spot-apps/.helmignore b/infra/spot-apps/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/infra/spot-apps/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/infra/spot-apps/Chart.yaml b/infra/spot-apps/Chart.yaml new file mode 100644 index 00000000..854d5b1a --- /dev/null +++ b/infra/spot-apps/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: spot-apps +description: Spot 프로젝트 app 설정 +type: application +version: 0.1.0 +appVersion: "1.0.0" \ No newline at end of file diff --git a/infra/spot-apps/output.yaml b/infra/spot-apps/output.yaml new file mode 100644 index 00000000..a1c3f945 --- /dev/null +++ b/infra/spot-apps/output.yaml @@ -0,0 +1,541 @@ +--- +# Source: spot-apps/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: spot-gateway + labels: + app: spot-gateway +spec: + type: ClusterIP + selector: + app: spot-gateway + ports: + - port: 80 + targetPort: 8080 +--- +# Source: spot-apps/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: spot-order + labels: + app: spot-order +spec: + type: ClusterIP + selector: + app: spot-order + ports: + - port: 8082 + targetPort: 8082 +--- +# Source: spot-apps/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: spot-payment + labels: + app: spot-payment +spec: + type: ClusterIP + selector: + app: spot-payment + ports: + - port: 8084 + targetPort: 8084 +--- +# Source: spot-apps/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: spot-store + labels: + app: spot-store +spec: + type: ClusterIP + selector: + app: spot-store + ports: + - port: 8083 + targetPort: 8083 +--- +# Source: spot-apps/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: spot-user + labels: + app: spot-user +spec: + type: ClusterIP + selector: + app: spot-user + ports: + - port: 8081 + targetPort: 8081 +--- +# Source: spot-apps/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: spot-gateway + labels: + app: spot-gateway +spec: + replicas: 1 + selector: + matchLabels: + app: spot-gateway + template: + metadata: + labels: + app: spot-gateway + spec: + containers: + - name: spot-gateway + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-gateway:latest + imagePullPolicy: Always + ports: + - containerPort: 8080 + env: + - name: SERVER_PORT + value: "8080" + - name: SPRING_PROFILES_ACTIVE + value: k8s + - name: LOGGING_CONFIG + value: classpath:logback-spring.xml + envFrom: + - secretRef: + name: spot-secrets + + # 리소스 설정 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 800m + memory: 1Gi + + # 헬스 체크 + readinessProbe: + httpGet: + path: /actuator/health + port: 8080 + initialDelaySeconds: 90 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 5 + successThreshold: 1 + + # 헬스 체크 (Liveness) + livenessProbe: + httpGet: + path: /actuator/health + port: 8080 + initialDelaySeconds: 120 + periodSeconds: 30 + timeoutSeconds: 5 + + # 볼륨 마운트 + volumeMounts: + - name: app-config + mountPath: /config + readOnly: true + + volumes: + - name: app-config + configMap: + name: spot-app-config +--- +# Source: spot-apps/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: spot-order + labels: + app: spot-order +spec: + replicas: 1 + selector: + matchLabels: + app: spot-order + template: + metadata: + labels: + app: spot-order + spec: + containers: + - name: spot-order + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-order:latest + imagePullPolicy: Always + ports: + - containerPort: 8082 + env: + - name: SERVER_PORT + value: "8082" + - name: SPRING_PROFILES_ACTIVE + value: k8s + - name: LOGGING_CONFIG + value: classpath:logback-spring.xml + envFrom: + - secretRef: + name: spot-secrets + + # 리소스 설정 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 800m + memory: 1Gi + + # 헬스 체크 + readinessProbe: + httpGet: + path: /actuator/health + port: 8082 + initialDelaySeconds: 90 + periodSeconds: 10 + + # 헬스 체크 (Liveness) + livenessProbe: + httpGet: + path: /actuator/health + port: 8082 + initialDelaySeconds: 120 + periodSeconds: 30 + + # 볼륨 마운트 + volumeMounts: + - name: app-config + mountPath: /config + readOnly: true + + volumes: + - name: app-config + configMap: + name: spot-app-config +--- +# Source: spot-apps/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: spot-payment + labels: + app: spot-payment +spec: + replicas: 1 + selector: + matchLabels: + app: spot-payment + template: + metadata: + labels: + app: spot-payment + spec: + containers: + - name: spot-payment + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-payment:latest + imagePullPolicy: Always + ports: + - containerPort: 8084 + env: + - name: SERVER_PORT + value: "8084" + - name: SPRING_PROFILES_ACTIVE + value: k8s + - name: LOGGING_CONFIG + value: classpath:logback-spring.xml + envFrom: + - secretRef: + name: spot-secrets + + # 리소스 설정 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 800m + memory: 1Gi + + # 헬스 체크 + readinessProbe: + httpGet: + path: /actuator/health + port: 8084 + initialDelaySeconds: 90 + periodSeconds: 10 + + # 헬스 체크 (Liveness) + livenessProbe: + httpGet: + path: /actuator/health + port: 8084 + initialDelaySeconds: 120 + periodSeconds: 30 + + # 볼륨 마운트 + volumeMounts: + - name: app-config + mountPath: /config + readOnly: true + + volumes: + - name: app-config + configMap: + name: spot-app-config +--- +# Source: spot-apps/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: spot-store + labels: + app: spot-store +spec: + replicas: 1 + selector: + matchLabels: + app: spot-store + template: + metadata: + labels: + app: spot-store + spec: + containers: + - name: spot-store + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-store:latest + imagePullPolicy: Always + ports: + - containerPort: 8083 + env: + - name: SERVER_PORT + value: "8083" + - name: SPRING_PROFILES_ACTIVE + value: k8s + - name: LOGGING_CONFIG + value: classpath:logback-spring.xml + envFrom: + - secretRef: + name: spot-secrets + + # 리소스 설정 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 800m + memory: 1Gi + + # 헬스 체크 + readinessProbe: + httpGet: + path: /actuator/health + port: 8083 + initialDelaySeconds: 90 + periodSeconds: 10 + + # 헬스 체크 (Liveness) + livenessProbe: + httpGet: + path: /actuator/health + port: 8083 + initialDelaySeconds: 120 + periodSeconds: 30 + + # 볼륨 마운트 + volumeMounts: + - name: app-config + mountPath: /config + readOnly: true + + volumes: + - name: app-config + configMap: + name: spot-app-config +--- +# Source: spot-apps/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: spot-user + labels: + app: spot-user +spec: + replicas: 1 + selector: + matchLabels: + app: spot-user + template: + metadata: + labels: + app: spot-user + spec: + containers: + - name: spot-user + image: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com/spot-user:latest + imagePullPolicy: Always + ports: + - containerPort: 8081 + env: + - name: SERVER_PORT + value: "8081" + - name: SPRING_PROFILES_ACTIVE + value: k8s + - name: LOGGING_CONFIG + value: classpath:logback-spring.xml + envFrom: + - secretRef: + name: spot-secrets + + # 리소스 설정 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 800m + memory: 1Gi + + # 헬스 체크 + readinessProbe: + httpGet: + path: /actuator/health + port: 8081 + initialDelaySeconds: 90 + periodSeconds: 10 + + # 헬스 체크 (Liveness) + livenessProbe: + httpGet: + path: /actuator/health + port: 8081 + initialDelaySeconds: 120 + periodSeconds: 30 + + # 볼륨 마운트 + volumeMounts: + - name: app-config + mountPath: /config + readOnly: true + + volumes: + - name: app-config + configMap: + name: spot-app-config +--- +# Source: spot-apps/templates/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: www-ingress + namespace: spot + annotations: + alb.ingress.kubernetes.io/load-balancer-name: "spot-dev-alb" + alb.ingress.kubernetes.io/scheme: "internet-facing" + alb.ingress.kubernetes.io/subnets: "subnet-0da7c6829952833c5, subnet-05c166744c29468c0" + alb.ingress.kubernetes.io/success-codes: "200" + alb.ingress.kubernetes.io/target-type: "ip" + alb.ingress.kubernetes.io/healthcheck-path: "/actuator/health" +spec: + ingressClassName: alb + rules: + # 도메인(Spot, Kafka, Temporal, Grafana) + - host: www.spotorder.org + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: spot-gateway + port: + number: 80 +--- +# Source: spot-apps/templates/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: kafka-ingress + namespace: infra + annotations: + alb.ingress.kubernetes.io/load-balancer-name: "spot-dev-alb" + alb.ingress.kubernetes.io/scheme: "internet-facing" + alb.ingress.kubernetes.io/subnets: "subnet-0da7c6829952833c5, subnet-05c166744c29468c0" + alb.ingress.kubernetes.io/success-codes: "200" + alb.ingress.kubernetes.io/target-type: "ip" + alb.ingress.kubernetes.io/healthcheck-path: "/" +spec: + ingressClassName: alb + rules: + # 도메인(Spot, Kafka, Temporal, Grafana) + - host: kafka.spotorder.org + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: kafka-ui-svc + port: + number: 80 +--- +# Source: spot-apps/templates/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: temporal-ingress + namespace: infra + annotations: + alb.ingress.kubernetes.io/load-balancer-name: "spot-dev-alb" + alb.ingress.kubernetes.io/scheme: "internet-facing" + alb.ingress.kubernetes.io/subnets: "subnet-0da7c6829952833c5, subnet-05c166744c29468c0" + alb.ingress.kubernetes.io/success-codes: "200" + alb.ingress.kubernetes.io/target-type: "ip" + alb.ingress.kubernetes.io/healthcheck-path: "/" +spec: + ingressClassName: alb + rules: + # 도메인(Spot, Kafka, Temporal, Grafana) + - host: temporal.spotorder.org + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: temporal-ui-svc + port: + number: 80 +--- +# Source: spot-apps/templates/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: grafana-ingress + namespace: monitoring + annotations: + alb.ingress.kubernetes.io/load-balancer-name: "spot-dev-alb" + alb.ingress.kubernetes.io/scheme: "internet-facing" + alb.ingress.kubernetes.io/subnets: "subnet-0da7c6829952833c5, subnet-05c166744c29468c0" + alb.ingress.kubernetes.io/success-codes: "200" + alb.ingress.kubernetes.io/target-type: "ip" + alb.ingress.kubernetes.io/healthcheck-path: "/api/health" +spec: + ingressClassName: alb + rules: + # 도메인(Spot, Kafka, Temporal, Grafana) + - host: grafana.spotorder.org + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: grafana-service + port: + number: 80 diff --git a/infra/spot-apps/templates/deployment.yaml b/infra/spot-apps/templates/deployment.yaml new file mode 100644 index 00000000..fe46c4c7 --- /dev/null +++ b/infra/spot-apps/templates/deployment.yaml @@ -0,0 +1,81 @@ +{{- range $key, $val := .Values.apps }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $val.name }} + labels: + app: {{ $val.name }} +spec: + replicas: {{ $.Values.global.replicas | default 1}} + selector: + matchLabels: + app: {{ $val.name }} + template: + metadata: + labels: + app: {{ $val.name }} + spec: + containers: + - name: {{ $val.name }} + image: {{ $.Values.global.repository }}/{{ $val.name }}:latest + imagePullPolicy: {{ $.Values.global.imagePullPolicy }} + ports: + - containerPort: {{ $val.targetPort | default $val.port }} + env: + - name: SPRING_PROFILES_ACTIVE + value: {{ $.Values.env.activeProfile }} + - name: LOGGING_CONFIG + value: {{ $.Values.env.loggingConfig }} + envFrom: + - secretRef: + name: {{ $.Values.config.secretName }} + + # 리소스 설정 + resources: + requests: + cpu: {{ $.Values.global.requests.cpu }} + memory: {{ $.Values.global.requests.memory}} + limits: + cpu: {{ $.Values.global.limits.cpu }} + memory: {{ $.Values.global.limits.memory }} + + # 헬스 체크 + readinessProbe: + httpGet: + path: {{ $.Values.global.path }} + port: {{ $val.targetPort | default $val.port }} + initialDelaySeconds: {{ $.Values.global.readiness.initial }} + periodSeconds: {{ $.Values.global.readiness.period }} + {{- if $val.timeoutSeconds }} + timeoutSeconds: {{ $val.timeoutSeconds }} + {{- end }} + {{- if $val.failureThreshold }} + failureThreshold: {{ $val.failureThreshold }} + {{- end }} + {{- if $val.successThreshold }} + successThreshold: {{ $val.successThreshold }} + {{- end }} + + # 헬스 체크 (Liveness) + livenessProbe: + httpGet: + path: {{ $.Values.global.path }} + port: {{ $val.targetPort | default $val.port }} + initialDelaySeconds: {{ $.Values.global.liveness.initial }} + periodSeconds: {{ $.Values.global.liveness.period }} + {{- if $val.timeoutSeconds }} + timeoutSeconds: {{ $val.timeoutSeconds }} + {{- end }} + + # 볼륨 마운트 + volumeMounts: + - name: app-config + mountPath: /config + readOnly: true + + volumes: + - name: app-config + configMap: + name: {{ $.Values.config.configMapName }} + {{- end }} \ No newline at end of file diff --git a/infra/spot-apps/templates/ingress.yaml b/infra/spot-apps/templates/ingress.yaml new file mode 100644 index 00000000..782cf3f5 --- /dev/null +++ b/infra/spot-apps/templates/ingress.yaml @@ -0,0 +1,38 @@ +{{- range .Values.ingress.subs }} +{{- $sub := . }} +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ $sub.name }}-ingress + namespace: {{ $sub.namespace }} + annotations: + {{- range $key, $value := $.Values.ingress.annotations }} + {{- /* ALB healthcheck-path일 때만 sub에 정의된 경로로 덮어쓰기 */ -}} + {{- if eq $key "alb.ingress.kubernetes.io/healthcheck-path" }} + {{ $key }}: {{ $sub.healthPath | default "/" | quote }} + {{- else }} + {{ $key }}: {{ $value | quote }} + {{- end }} + {{- end }} + + {{- /* ArgoCD */ -}} +{{/* {{- if $sub.annotations }}*/}} +{{/* {{- range $key, $value := $sub.annotations }}*/}} +{{/* {{ $key }}: {{ $value | quote }}*/}} +{{/* {{- end }}*/}} +{{/* {{- end }}*/}} +spec: + ingressClassName: {{ $.Values.ingress.className }} + rules: + - host: {{ $sub.name }}.{{ $.Values.domain }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: {{ $sub.serviceName }} + port: + number: 80 +{{- end}} \ No newline at end of file diff --git a/infra/spot-apps/templates/service.yaml b/infra/spot-apps/templates/service.yaml new file mode 100644 index 00000000..b363bb95 --- /dev/null +++ b/infra/spot-apps/templates/service.yaml @@ -0,0 +1,16 @@ +{{- range $key, $val := .Values.apps }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ $val.name }} + labels: + app: {{ $val.name }} +spec: + type: ClusterIP + selector: + app: {{ $val.name }} + ports: + - port: {{ $val.port }} + targetPort: {{ $val.targetPort | default $val.port }} + {{- end }} \ No newline at end of file diff --git a/infra/spot-apps/values.yaml b/infra/spot-apps/values.yaml new file mode 100644 index 00000000..c2dcb04d --- /dev/null +++ b/infra/spot-apps/values.yaml @@ -0,0 +1,116 @@ +# [1] Ingress 공통 설정 (기본값: AWS ALB) +ingress: + enabled: true + className: "alb" + + annotations: + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip # IP 모드 + alb.ingress.kubernetes.io/healthcheck-path: /actuator/health + alb.ingress.kubernetes.io/success-codes: "200" # 헬스 체크 성공 코드 + alb.ingress.kubernetes.io/group.name: spot-ingress + + # HTTPS/HTTP 설정 + alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}, {"HTTPS":443}]' + alb.ingress.kubernetes.io/ssl-redirect: '443' + alb.ingress.kubernetes.io/certificate-arn: "arn:aws:acm:ap-northeast-2:322546275072:certificate/9de3b66e-e361-43f9-bb92-14bc4d677cbf" +# alb.ingress.kubernetes.io/load-balancer-name: spot-dev-alb # tf위한 이름 고정 + + subs: + - name: www + namespace: spot + serviceName: spot-gateway + healthPath: /actuator/health + + - name: kafka + namespace: infra + serviceName: kafka-ui-svc + healthPath: / + + - name: temporal + namespace: infra + serviceName: temporal-ui-svc + healthPath: / + + - name: grafana + namespace: monitoring + serviceName: grafana-service + healthPath: "/api/health" + +# - name: argo +# namespace: argocd +# groupName: spot-ingress +# serviceName: argocd-server +# healthPath: / +# annotations: +# alb.ingress.kubernetes.io/backend-protocol: "HTTPS" +# alb.ingress.kubernetes.io/healthcheck-protocol: "HTTPS" + +# 도메인 (AWS) +domain: spotorder.org + +# [2] Deployment 공통 설정 (Global) +global: + name: spot + repository: 322546275072.dkr.ecr.ap-northeast-2.amazonaws.com + imagePullPolicy: Always + replicas: 1 + + # 리소스 제한 + requests: + cpu: "250m" + memory: "256Mi" + limits: + cpu: "800m" + memory: "1Gi" + + # 헬스 체크 + path: /actuator/health + + readiness: + initial: 90 + period: 10 + liveness: + initial: 120 + period: 30 + +# [3] app 목록 +apps: + gateway: + name: spot-gateway + port: 80 + targetPort: 8080 + timeoutSeconds: 5 + failureThreshold: 5 + successThreshold: 1 + + user: + name: spot-user + port: 8081 + + store: + name: spot-store + port: 8083 + + order: + name: spot-order + port: 8082 + + payment: + name: spot-payment + port: 8084 + +# [4] env +env: + activeProfile: "k8s" + loggingConfig: "classpath:logback-spring.xml" + postgres: + - db: POSTGRES_DB + value: "myapp_db" + - user: POSTGRES_USER + value: "admin" + +# [5] config +config: + secretName: spot-secrets + configMapName: spot-app-config \ No newline at end of file diff --git a/infra/spot-apps/values/local-values.yaml b/infra/spot-apps/values/local-values.yaml new file mode 100644 index 00000000..a94c2979 --- /dev/null +++ b/infra/spot-apps/values/local-values.yaml @@ -0,0 +1,20 @@ +# --- Ingress --- +ingress: + className: "nginx" + annotations: + kubernetes.io/ingress.class: nginx + alb.ingress.kubernetes.io/scheme: null + alb.ingress.kubernetes.io/target-type: null + alb.ingress.kubernetes.io/group.name: null + alb.ingress.kubernetes.io/healthcheck-path: null + alb.ingress.kubernetes.io/success-codes: null + alb.ingress.kubernetes.io/listen-ports: null + alb.ingress.kubernetes.io/ssl-redirect: null + alb.ingress.kubernetes.io/certificate-arn: null + +# domain +domain: localhost + +# [2] 로컬 환경 Deployment 설정 (Global) +global: + repository: spot-registry.localhost:5111 \ No newline at end of file diff --git a/infra/terraform/environments/dev/.terraform.lock.hcl b/infra/terraform/environments/dev/.terraform.lock.hcl index cdc1668d..efd52918 100644 --- a/infra/terraform/environments/dev/.terraform.lock.hcl +++ b/infra/terraform/environments/dev/.terraform.lock.hcl @@ -23,3 +23,63 @@ provider "registry.terraform.io/hashicorp/aws" { "zh:ff461571e3f233699bf690db319dfe46aec75e58726636a0d97dd9ac6e32fb70", ] } + +provider "registry.terraform.io/hashicorp/helm" { + version = "2.17.0" + constraints = "~> 2.12" + hashes = [ + "h1:kQMkcPVvHOguOqnxoEU2sm1ND9vCHiT8TvZ2x6v/Rsw=", + "zh:06fb4e9932f0afc1904d2279e6e99353c2ddac0d765305ce90519af410706bd4", + "zh:104eccfc781fc868da3c7fec4385ad14ed183eb985c96331a1a937ac79c2d1a7", + "zh:129345c82359837bb3f0070ce4891ec232697052f7d5ccf61d43d818912cf5f3", + "zh:3956187ec239f4045975b35e8c30741f701aa494c386aaa04ebabffe7749f81c", + "zh:66a9686d92a6b3ec43de3ca3fde60ef3d89fb76259ed3313ca4eb9bb8c13b7dd", + "zh:88644260090aa621e7e8083585c468c8dd5e09a3c01a432fb05da5c4623af940", + "zh:a248f650d174a883b32c5b94f9e725f4057e623b00f171936dcdcc840fad0b3e", + "zh:aa498c1f1ab93be5c8fbf6d48af51dc6ef0f10b2ea88d67bcb9f02d1d80d3930", + "zh:bf01e0f2ec2468c53596e027d376532a2d30feb72b0b5b810334d043109ae32f", + "zh:c46fa84cc8388e5ca87eb575a534ebcf68819c5a5724142998b487cb11246654", + "zh:d0c0f15ffc115c0965cbfe5c81f18c2e114113e7a1e6829f6bfd879ce5744fbb", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} + +provider "registry.terraform.io/hashicorp/kubernetes" { + version = "2.38.0" + constraints = "~> 2.25" + hashes = [ + "h1:soK8Lt0SZ6dB+HsypFRDzuX/npqlMU6M0fvyaR1yW0k=", + "zh:0af928d776eb269b192dc0ea0f8a3f0f5ec117224cd644bdacdc682300f84ba0", + "zh:1be998e67206f7cfc4ffe77c01a09ac91ce725de0abaec9030b22c0a832af44f", + "zh:326803fe5946023687d603f6f1bab24de7af3d426b01d20e51d4e6fbe4e7ec1b", + "zh:4a99ec8d91193af961de1abb1f824be73df07489301d62e6141a656b3ebfff12", + "zh:5136e51765d6a0b9e4dbcc3b38821e9736bd2136cf15e9aac11668f22db117d2", + "zh:63fab47349852d7802fb032e4f2b6a101ee1ce34b62557a9ad0f0f0f5b6ecfdc", + "zh:924fb0257e2d03e03e2bfe9c7b99aa73c195b1f19412ca09960001bee3c50d15", + "zh:b63a0be5e233f8f6727c56bed3b61eb9456ca7a8bb29539fba0837f1badf1396", + "zh:d39861aa21077f1bc899bc53e7233262e530ba8a3a2d737449b100daeb303e4d", + "zh:de0805e10ebe4c83ce3b728a67f6b0f9d18be32b25146aa89116634df5145ad4", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:faf23e45f0090eef8ba28a8aac7ec5d4fdf11a36c40a8d286304567d71c1e7db", + ] +} + +provider "registry.terraform.io/hashicorp/tls" { + version = "4.2.1" + constraints = ">= 4.0.0" + hashes = [ + "h1:akFNuHwvrtnYMBofieoeXhPJDhYZzJVu/Q/BgZK2fgg=", + "zh:0d1e7d07ac973b97fa228f46596c800de830820506ee145626f079dd6bbf8d8a", + "zh:5c7e3d4348cb4861ab812973ef493814a4b224bdd3e9d534a7c8a7c992382b86", + "zh:7c6d4a86cd7a4e9c1025c6b3a3a6a45dea202af85d870cddbab455fb1bd568ad", + "zh:7d0864755ba093664c4b2c07c045d3f5e3d7c799dda1a3ef33d17ed1ac563191", + "zh:83734f57950ab67c0d6a87babdb3f13c908cbe0a48949333f489698532e1391b", + "zh:951e3c285218ebca0cf20eaa4265020b4ef042fea9c6ade115ad1558cfe459e5", + "zh:b9543955b4297e1d93b85900854891c0e645d936d8285a190030475379c5c635", + "zh:bb1bd9e86c003d08c30c1b00d44118ed5bbbf6b1d2d6f7eaac4fa5c6ebea5933", + "zh:c9477bfe00653629cd77ddac3968475f7ad93ac3ca8bc45b56d1d9efb25e4a6e", + "zh:d4cfda8687f736d0cba664c22ec49dae1188289e214ef57f5afe6a7217854fed", + "zh:dc77ee066cf96532a48f0578c35b1eaf6dc4d8ddd0e3ae8e029a3b10676dd5d3", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} diff --git a/infra/terraform/environments/dev/locals.tf b/infra/terraform/environments/dev/locals.tf index 9512a84d..cbc91fd8 100644 --- a/infra/terraform/environments/dev/locals.tf +++ b/infra/terraform/environments/dev/locals.tf @@ -6,4 +6,19 @@ locals { Environment = var.environment ManagedBy = "terraform" } + + service_accounts = { + aws_load_balancer_controller = { + namespace = "kube-system" + service_account = "aws-load-balancer-controller" + policy_arn = "" + create_k8s_sa = true + } + ebs_csi_driver = { + namespace = "kube-system" + service_account = "ebs-csi-controller-sa" + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy" + create_k8s_sa = false + } + } } diff --git a/infra/terraform/environments/dev/main.tf b/infra/terraform/environments/dev/main.tf index 1cb06bfa..cfeeaa83 100644 --- a/infra/terraform/environments/dev/main.tf +++ b/infra/terraform/environments/dev/main.tf @@ -4,12 +4,12 @@ data "aws_caller_identity" "current" {} # ============================================================================= -# Network +# Network (SPOT) # ============================================================================= -module "network" { +module "network_spot" { source = "../../modules/network" - name_prefix = local.name_prefix + name_prefix = "${local.name_prefix}-spot" common_tags = local.common_tags vpc_cidr = var.vpc_cidr public_subnet_cidrs = var.public_subnet_cidrs @@ -18,6 +18,8 @@ module "network" { nat_instance_type = var.nat_instance_type } + + # ============================================================================= # Database # ============================================================================= @@ -26,9 +28,12 @@ module "database" { name_prefix = local.name_prefix common_tags = local.common_tags - vpc_id = module.network.vpc_id - vpc_cidr = module.network.vpc_cidr - subnet_ids = module.network.private_subnet_ids + vpc_id = module.network_spot.vpc_id + vpc_cidr = module.network_spot.vpc_cidr + subnet_ids = module.network_spot.private_subnet_ids + + allowed_security_group_ids = [module.eks.node_security_group_id] + db_name = var.db_name username = var.db_username password = var.db_password @@ -37,6 +42,8 @@ module "database" { engine_version = var.db_engine_version } + + # ============================================================================= # ECR (Multiple Repositories) # ============================================================================= @@ -49,125 +56,35 @@ module "ecr" { service_names = toset(keys(var.services)) } -# ============================================================================= -# ALB (Gateway Pass-through) -# ============================================================================= -module "alb" { - source = "../../modules/alb" - - name_prefix = local.name_prefix - common_tags = local.common_tags - vpc_id = module.network.vpc_id - vpc_cidr = module.network.vpc_cidr - subnet_ids = module.network.private_subnet_ids - - # Gateway만 ALB에 연결 - 모든 트래픽이 Spring Gateway로 전달됨 - services = { - "gateway" = { - container_port = var.services["gateway"].container_port - health_check_path = var.services["gateway"].health_check_path - path_patterns = ["/*"] - priority = 1 - } - } -} - -# ============================================================================= -# ECS (Multiple Services with Service Connect) -# ============================================================================= -module "ecs" { - source = "../../modules/ecs" - - project = var.project - environment = var.environment - name_prefix = local.name_prefix - common_tags = local.common_tags - region = var.region - vpc_id = module.network.vpc_id - subnet_ids = [module.network.public_subnet_a_id] # NAT 문제로 public 사용 - ecr_repository_urls = module.ecr.repository_urls - alb_security_group_id = module.alb.security_group_id - target_group_arns = module.alb.target_group_arns - alb_listener_arn = module.alb.listener_arn - assign_public_ip = true # NAT 문제로 public IP 사용 - - services = var.services - enable_service_connect = var.enable_service_connect - standby_mode = var.standby_mode - - # Database 연결 정보 - db_endpoint = module.database.endpoint - db_name = var.db_name - db_username = var.db_username - - # Redis 연결 정보 - redis_endpoint = module.elasticache.redis_endpoint - - # Kafka 연결 정보 - kafka_bootstrap_servers = module.kafka.bootstrap_servers - - # Parameter Store ARNs (민감 정보 주입) - parameter_arns = { - db_password = module.parameters.db_password_arn - jwt_secret = module.parameters.jwt_secret_arn - mail_password = module.parameters.mail_password_arn - toss_secret_key = module.parameters.toss_secret_key_arn - } - - # JWT 설정 (비민감 정보) - jwt_expire_ms = var.jwt_expire_ms - refresh_token_expire_days = var.refresh_token_expire_days - - # Mail 설정 (비민감 정보) - mail_username = var.mail_username - - # Toss 결제 설정 (비민감 정보) - toss_customer_key = var.toss_customer_key - - # 서비스 설정 - service_active_regions = var.service_active_regions - - depends_on = [module.parameters] -} - -# ============================================================================= -# API Gateway -# ============================================================================= -module "api_gateway" { - source = "../../modules/api-gateway" - - name_prefix = local.name_prefix - common_tags = local.common_tags - subnet_ids = module.network.private_subnet_ids - ecs_security_group_id = module.ecs.security_group_id - alb_listener_arn = module.alb.listener_arn -} - # ============================================================================= # DNS (Route 53 + ACM) # ============================================================================= module "dns" { source = "../../modules/dns" - name_prefix = local.name_prefix - common_tags = local.common_tags - domain_name = var.domain_name - create_api_domain = var.create_api_domain - api_gateway_id = module.api_gateway.api_id + name_prefix = local.name_prefix + common_tags = local.common_tags + domain_name = var.domain_name + + create_alb_record = var.create_alb_record + alb_name = "spot-dev-alb" + alb_record_name = "spotorder.org" } + # ============================================================================= # WAF (Web Application Firewall) # ============================================================================= module "waf" { source = "../../modules/waf" - name_prefix = local.name_prefix - common_tags = local.common_tags - api_gateway_stage_arn = module.api_gateway.stage_arn - rate_limit = var.waf_rate_limit + name_prefix = local.name_prefix + common_tags = local.common_tags + rate_limit = var.waf_rate_limit + log_retention_days = var.waf_log_retention_days } + # ============================================================================= # S3 (정적 파일 / 로그 저장) # ============================================================================= @@ -190,9 +107,9 @@ module "elasticache" { name_prefix = local.name_prefix common_tags = local.common_tags - vpc_id = module.network.vpc_id - subnet_ids = module.network.private_subnet_ids - allowed_security_group_ids = [module.ecs.security_group_id] + vpc_id = module.network_spot.vpc_id + subnet_ids = module.network_spot.private_subnet_ids + allowed_security_group_ids = [module.eks.node_security_group_id] node_type = var.redis_node_type num_cache_clusters = var.redis_num_cache_clusters engine_version = var.redis_engine_version @@ -206,10 +123,10 @@ module "kafka" { name_prefix = local.name_prefix common_tags = local.common_tags - vpc_id = module.network.vpc_id - vpc_cidr = module.network.vpc_cidr - subnet_id = module.network.public_subnet_a_id # NAT 문제로 public 사용 - allowed_security_group_ids = [module.ecs.security_group_id] + vpc_id = module.network_spot.vpc_id + vpc_cidr = module.network_spot.vpc_cidr + subnet_id = module.network_spot.public_subnet_a_id # NAT 문제로 public 사용 + allowed_security_group_ids = [module.eks.node_security_group_id] assign_public_ip = true instance_type = var.kafka_instance_type @@ -249,16 +166,92 @@ module "monitoring" { common_tags = local.common_tags alert_email = var.alert_email - # ECS 모니터링 (대표 서비스) - ecs_cluster_name = module.ecs.cluster_name - ecs_service_name = module.ecs.service_names["user"] - # RDS 모니터링 rds_instance_id = module.database.instance_id - # ALB 모니터링 - alb_arn_suffix = module.alb.arn_suffix - - # Redis 모니터링 (선택) + # Redis 모니터링 redis_cluster_id = "${local.name_prefix}-redis-001" } + + +# ============================================================================= +# eks +# ============================================================================= +module "eks" { + source = "../../modules/eks" + + name_prefix = "${local.name_prefix}-spot" + common_tags = local.common_tags + + cluster_name = "${var.cluster_name}-spot" + cluster_version = var.cluster_version + + vpc_id = module.network_spot.vpc_id + + subnet_ids = module.network_spot.private_subnet_ids + node_subnet_ids = module.network_spot.private_subnet_ids + + endpoint_private_access = true + endpoint_public_access = true + public_access_cidrs = var.eks_public_access_cidrs + + enable_node_group = true + node_desired_size = 1 + node_min_size = 1 + node_max_size = 1 + + enable_node_ssm = true +} + + +module "irsa" { + source = "../../modules/irsa" + + name_prefix = "${local.name_prefix}-spot" + common_tags = local.common_tags + + oidc_issuer_url = module.eks.oidc_issuer_url + service_accounts = local.service_accounts + + +} + +module "eks_addons" { + source = "../../modules/eks-addons" + + common_tags = local.common_tags + cluster_name = module.eks.cluster_name + ebs_csi_irsa_role_arn = module.irsa.service_account_role_arns["ebs_csi_driver"] + + + enable_vpc_cni = true + enable_coredns = true + enable_kube_proxy = true + enable_ebs_csi = true + +} + + +module "k8s_bootstrap" { + source = "../../modules/k8s-bootstrap" + + providers = { + kubernetes = kubernetes + helm = helm + } + + cluster_name = module.eks.cluster_name + alb_controller_chart_version = var.alb_controller_chart_version + + service_accounts = local.service_accounts + service_account_role_arns = module.irsa.service_account_role_arns + enable_lbc = true + enable_argocd_access = true + + depends_on = [ + module.eks, + module.irsa, + module.eks_addons + ] +} + diff --git a/infra/terraform/environments/dev/outputs.tf b/infra/terraform/environments/dev/outputs.tf index 66efc4be..68c092ed 100644 --- a/infra/terraform/environments/dev/outputs.tf +++ b/infra/terraform/environments/dev/outputs.tf @@ -3,7 +3,7 @@ # ============================================================================= output "vpc_id" { description = "VPC ID" - value = module.network.vpc_id + value = module.network_spot.vpc_id } # ============================================================================= @@ -27,60 +27,13 @@ output "ecr_repository_urls" { value = module.ecr.repository_urls } -# ============================================================================= -# ECS -# ============================================================================= -output "ecs_cluster_name" { - description = "ECS 클러스터 이름" - value = module.ecs.cluster_name -} - -output "ecs_service_names" { - description = "ECS 서비스 이름 맵" - value = module.ecs.service_names -} - -# ============================================================================= -# ALB -# ============================================================================= -output "alb_dns" { - description = "ALB DNS" - value = module.alb.alb_dns_name -} - -# ============================================================================= -# API Gateway -# ============================================================================= -output "api_url" { - description = "API Gateway URL" - value = module.api_gateway.api_endpoint -} - -# ============================================================================= -# DNS -# ============================================================================= -output "name_servers" { - description = "Route 53 네임서버 (도메인 등록 기관에 설정 필요)" - value = module.dns.name_servers -} - -output "api_custom_domain" { - description = "API 커스텀 도메인" - value = module.dns.api_domain -} - -output "certificate_arn" { - description = "SSL 인증서 ARN" - value = module.dns.certificate_arn -} - # ============================================================================= # WAF # ============================================================================= -output "waf_web_acl_arn" { - description = "WAF Web ACL ARN" - value = module.waf.web_acl_arn -} +# output "waf_web_acl_arn" { +# description = "WAF Web ACL ARN" +# value = module.waf.web_acl_arn +# } # ============================================================================= # S3 @@ -115,3 +68,32 @@ output "sns_alerts_topic_arn" { description = "알람 알림 SNS Topic ARN" value = module.monitoring.sns_topic_arn } + + +# ============================================================================= +# eks +# ============================================================================= + +output "eks_cluster_name" { + value = module.eks.cluster_name +} + +output "eks_cluster_endpoint" { + value = module.eks.cluster_endpoint +} + +output "eks_cluster_ca" { + value = module.eks.cluster_ca +} + +output "eks_node_security_group_id" { + value = module.eks.node_security_group_id +} + +output "oidc_issuer_url_spot" { + value = module.eks.oidc_issuer_url +} + +output "irsa_role_arns" { + value = module.irsa.service_account_role_arns +} diff --git a/infra/terraform/environments/dev/provider.tf b/infra/terraform/environments/dev/provider.tf index a6e68b57..97e3a371 100644 --- a/infra/terraform/environments/dev/provider.tf +++ b/infra/terraform/environments/dev/provider.tf @@ -6,6 +6,18 @@ terraform { source = "hashicorp/aws" version = "~> 5.0" } + tls = { + source = "hashicorp/tls" + version = ">= 4.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.25" + } + helm = { + source = "hashicorp/helm" + version = "~> 2.12" + } # postgresql = { # source = "cyrilgdn/postgresql" # version = "~> 1.21" @@ -41,4 +53,24 @@ provider "aws" { # resource "postgresql_schema" "users" { # name = "users" # var.services["user"].environment_vars["DB_SCHEMA"] 값과 일치해야 함 # owner = var.db_username -# } \ No newline at end of file +# } + +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_ca) + token = data.aws_eks_cluster_auth.this.token +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_ca) + token = data.aws_eks_cluster_auth.this.token + } +} + + diff --git a/infra/terraform/environments/dev/variables.tf b/infra/terraform/environments/dev/variables.tf index 09471ef0..a05c4ec5 100644 --- a/infra/terraform/environments/dev/variables.tf +++ b/infra/terraform/environments/dev/variables.tf @@ -33,6 +33,7 @@ variable "public_subnet_cidrs" { type = map(string) default = { "a" = "10.0.1.0/24" + "c" = "10.0.2.0/24" } } @@ -122,8 +123,8 @@ variable "services" { desired_count = 1 health_check_path = "/actuator/health" # 모든 트래픽을 Gateway로 몰아주기 위해 /* 패턴 사용 - path_patterns = ["/*"] - priority = 1 # 가장 높은 우선순위 + path_patterns = ["/*"] + priority = 1 # 가장 높은 우선순위 environment_vars = { SERVICE_NAME = "spot-gateway" } @@ -213,6 +214,13 @@ variable "waf_rate_limit" { default = 2000 } +variable "waf_log_retention_days" { + description = "WAF 로그 보관 일수" + type = number + default = 30 +} + + # ============================================================================= # S3 설정 # ============================================================================= @@ -265,6 +273,7 @@ variable "jwt_secret" { description = "JWT 시크릿 키" type = string sensitive = true + default = "jmXDQDnLqV+lOaPKMR06v4+RQ7aj2cj8LR+zgPOlz/GS989tptPtAmIpyaZHrsLOPqKoVtPus28YeXZTL8O8nw==" } variable "jwt_expire_ms" { @@ -341,7 +350,7 @@ variable "kafka_instance_type" { variable "kafka_volume_size" { description = "Kafka EBS 볼륨 크기 (GB)" type = number - default = 20 + default = 30 } variable "kafka_log_retention_hours" { @@ -349,3 +358,36 @@ variable "kafka_log_retention_hours" { type = number default = 168 # 7일 } + + +# ============================================================================= +# eks 설정 +# ============================================================================= +variable "cluster_name" { + type = string + default = "spot-cluster-test" +} + +variable "cluster_version" { + type = string + default = "1.29" +} + +# ============================================================================= +# k8s_bootstrap.tf +# ============================================================================= +variable "alb_controller_chart_version" { + type = string + description = "Helm chart version for aws-load-balancer-controller" + default = "1.7.2" +} + +variable "create_alb_record" { + type = bool + default = true +} +variable "eks_public_access_cidrs" { + description = "CIDRs allowed to access EKS public endpoint" + type = list(string) + default = [] +} diff --git a/infra/terraform/modules/alb/main.tf b/infra/terraform/modules/alb/main.tf deleted file mode 100644 index 8b6c2f68..00000000 --- a/infra/terraform/modules/alb/main.tf +++ /dev/null @@ -1,203 +0,0 @@ -# ============================================================================= -# ALB Security Group -# ============================================================================= -resource "aws_security_group" "alb_sg" { - name = "${var.name_prefix}-alb-sg" - vpc_id = var.vpc_id - - ingress { - from_port = 80 - to_port = 80 - protocol = "tcp" - cidr_blocks = [var.vpc_cidr] - } - - # HTTPS 인바운드 (Production) - dynamic "ingress" { - for_each = var.enable_https ? [1] : [] - content { - from_port = 443 - to_port = 443 - protocol = "tcp" - cidr_blocks = [var.vpc_cidr] - } - } - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-alb-sg" }) -} - - # ============================================================================= - # Application Load Balancer (Internal) - # ============================================================================= - resource "aws_lb" "main" { - name = "${var.name_prefix}-alb" - internal = true - load_balancer_type = "application" - security_groups = [aws_security_group.alb_sg.id] - subnets = var.subnet_ids - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-alb" }) - } - - # ============================================================================= - # Target Groups (Multiple Services) - # ============================================================================= - resource "aws_lb_target_group" "services" { - for_each = var.services - - name = "${var.name_prefix}-${each.key}-tg" - port = each.value.container_port - protocol = "HTTP" - vpc_id = var.vpc_id - target_type = "ip" - - health_check { - enabled = true - healthy_threshold = 2 - unhealthy_threshold = 3 - timeout = 10 - interval = 30 - path = each.value.health_check_path - matcher = "200" - } - - tags = merge(var.common_tags, { - Name = "${var.name_prefix}-${each.key}-tg" - Service = each.key - }) - } - -# ============================================================================= -# Green Target Groups (for Blue/Green Deployment) -# ============================================================================= -resource "aws_lb_target_group" "services_green" { - for_each = var.enable_blue_green ? var.services : {} - - name = "${var.name_prefix}-${each.key}-tg-g" - port = each.value.container_port - protocol = "HTTP" - vpc_id = var.vpc_id - target_type = "ip" - - health_check { - enabled = true - healthy_threshold = 2 - unhealthy_threshold = 3 - timeout = 10 - interval = 30 - path = each.value.health_check_path - matcher = "200" - } - - tags = merge(var.common_tags, { - Name = "${var.name_prefix}-${each.key}-tg-green" - Service = each.key - Color = "green" - }) -} - -# ============================================================================= -# ALB Listener - HTTP (Default action returns 404 or redirects to HTTPS) -# ============================================================================= -resource "aws_lb_listener" "main" { - load_balancer_arn = aws_lb.main.arn - port = 80 - protocol = "HTTP" - - default_action { - type = var.enable_https && var.certificate_arn != null ? "redirect" : "fixed-response" - - dynamic "redirect" { - for_each = var.enable_https && var.certificate_arn != null ? [1] : [] - content { - port = "443" - protocol = "HTTPS" - status_code = "HTTP_301" - } - } - - dynamic "fixed_response" { - for_each = var.enable_https && var.certificate_arn != null ? [] : [1] - content { - content_type = "application/json" - message_body = jsonencode({ error = "Not Found", message = "No matching route" }) - status_code = "404" - } - } - } -} - -# ============================================================================= -# ALB Listener - HTTPS (Production) -# ============================================================================= -resource "aws_lb_listener" "https" { - count = var.enable_https && var.certificate_arn != null ? 1 : 0 - - load_balancer_arn = aws_lb.main.arn - port = 443 - protocol = "HTTPS" - ssl_policy = var.ssl_policy - certificate_arn = var.certificate_arn - - default_action { - type = "fixed-response" - fixed_response { - content_type = "application/json" - message_body = jsonencode({ error = "Not Found", message = "No matching route" }) - status_code = "404" - } - } -} - -# ============================================================================= -# ALB Listener Rules - HTTP (Path-based Routing) -# ============================================================================= -resource "aws_lb_listener_rule" "services" { - for_each = var.enable_https && var.certificate_arn != null ? {} : var.services - - listener_arn = aws_lb_listener.main.arn - priority = each.value.priority - - action { - type = "forward" - target_group_arn = aws_lb_target_group.services[each.key].arn - } - - condition { - path_pattern { - values = each.value.path_patterns - } - } - - tags = merge(var.common_tags, { Service = each.key }) -} - -# ============================================================================= -# ALB Listener Rules - HTTPS (Path-based Routing for Production) -# ============================================================================= -resource "aws_lb_listener_rule" "services_https" { - for_each = var.enable_https && var.certificate_arn != null ? var.services : {} - - listener_arn = aws_lb_listener.https[0].arn - priority = each.value.priority - - action { - type = "forward" - target_group_arn = aws_lb_target_group.services[each.key].arn - } - - condition { - path_pattern { - values = each.value.path_patterns - } - } - - tags = merge(var.common_tags, { Service = each.key }) -} diff --git a/infra/terraform/modules/alb/outputs.tf b/infra/terraform/modules/alb/outputs.tf deleted file mode 100644 index 8c50db0e..00000000 --- a/infra/terraform/modules/alb/outputs.tf +++ /dev/null @@ -1,57 +0,0 @@ -output "alb_arn" { - description = "ALB ARN" - value = aws_lb.main.arn -} - -output "alb_dns_name" { - description = "ALB DNS 이름" - value = aws_lb.main.dns_name -} - -output "target_group_arns" { - description = "Target Group ARN 맵" - value = { for k, v in aws_lb_target_group.services : k => v.arn } -} - -output "target_group_names" { - description = "Target Group 이름 맵" - value = { for k, v in aws_lb_target_group.services : k => v.name } -} - -output "listener_arn" { - description = "Listener ARN (HTTP)" - value = aws_lb_listener.main.arn -} - -output "https_listener_arn" { - description = "HTTPS Listener ARN" - value = var.enable_https && var.certificate_arn != null ? aws_lb_listener.https[0].arn : null -} - -output "security_group_id" { - description = "ALB 보안그룹 ID" - value = aws_security_group.alb_sg.id -} - -output "arn_suffix" { - description = "ALB ARN suffix (CloudWatch용)" - value = aws_lb.main.arn_suffix -} - -output "target_group_arn_suffixes" { - description = "Target Group ARN suffix 맵 (CloudWatch용)" - value = { for k, v in aws_lb_target_group.services : k => v.arn_suffix } -} - -# ============================================================================= -# Blue/Green Outputs -# ============================================================================= -output "green_target_group_arns" { - description = "Green Target Group ARN 맵 (Blue/Green용)" - value = var.enable_blue_green ? { for k, v in aws_lb_target_group.services_green : k => v.arn } : {} -} - -output "green_target_group_names" { - description = "Green Target Group 이름 맵" - value = var.enable_blue_green ? { for k, v in aws_lb_target_group.services_green : k => v.name } : {} -} diff --git a/infra/terraform/modules/alb/variables.tf b/infra/terraform/modules/alb/variables.tf deleted file mode 100644 index f228910a..00000000 --- a/infra/terraform/modules/alb/variables.tf +++ /dev/null @@ -1,65 +0,0 @@ -variable "name_prefix" { - description = "리소스 네이밍 프리픽스" - type = string -} - -variable "common_tags" { - description = "공통 태그" - type = map(string) - default = {} -} - -variable "vpc_id" { - description = "VPC ID" - type = string -} - -variable "vpc_cidr" { - description = "VPC CIDR" - type = string -} - -variable "subnet_ids" { - description = "ALB 서브넷 ID 목록" - type = list(string) -} - -variable "services" { - description = "서비스 구성 맵" - type = map(object({ - container_port = number - health_check_path = string - path_patterns = list(string) - priority = number - })) -} - -# ============================================================================= -# HTTPS Settings -# ============================================================================= -variable "enable_https" { - description = "HTTPS 활성화" - type = bool - default = false -} - -variable "certificate_arn" { - description = "ACM 인증서 ARN" - type = string - default = null -} - -variable "ssl_policy" { - description = "SSL 정책" - type = string - default = "ELBSecurityPolicy-TLS13-1-2-2021-06" -} - -# ============================================================================= -# Blue/Green Deployment Settings -# ============================================================================= -variable "enable_blue_green" { - description = "Blue/Green 배포용 추가 Target Group 생성" - type = bool - default = false -} diff --git a/infra/terraform/modules/api-gateway/main.tf b/infra/terraform/modules/api-gateway/main.tf deleted file mode 100644 index 27713dd5..00000000 --- a/infra/terraform/modules/api-gateway/main.tf +++ /dev/null @@ -1,224 +0,0 @@ -# ============================================================================= -# API Gateway (HTTP API) -# ============================================================================= -resource "aws_apigatewayv2_api" "main" { - name = "${var.name_prefix}-api" - protocol_type = "HTTP" - - cors_configuration { - allow_origins = ["*"] - allow_methods = ["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"] - allow_headers = ["Content-Type", "Authorization", "X-Requested-With"] - expose_headers = ["X-Request-Id"] - max_age = 3600 - allow_credentials = false - } - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-api" }) -} - -# ============================================================================= -# Cognito User Pool -# ============================================================================= -resource "aws_cognito_user_pool" "main" { - count = var.enable_cognito ? 1 : 0 - name = var.cognito_user_pool_name != null ? var.cognito_user_pool_name : "${var.name_prefix}-user-pool" - - username_attributes = ["email"] - auto_verified_attributes = ["email"] - - password_policy { - minimum_length = 8 - require_lowercase = true - require_numbers = true - require_symbols = true - require_uppercase = true - temporary_password_validity_days = 7 - } - - email_configuration { - email_sending_account = "COGNITO_DEFAULT" - } - - mfa_configuration = "OPTIONAL" - - software_token_mfa_configuration { - enabled = true - } - - account_recovery_setting { - recovery_mechanism { - name = "verified_email" - priority = 1 - } - } - - schema { - name = "email" - attribute_data_type = "String" - mutable = true - required = true - string_attribute_constraints { - min_length = 1 - max_length = 256 - } - } - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-user-pool" }) -} - -resource "aws_cognito_user_pool_client" "main" { - count = var.enable_cognito ? 1 : 0 - name = "${var.name_prefix}-api-client" - user_pool_id = aws_cognito_user_pool.main[0].id - - generate_secret = false - allowed_oauth_flows_user_pool_client = true - allowed_oauth_flows = ["code", "implicit"] - allowed_oauth_scopes = ["email", "openid", "profile"] - callback_urls = var.cognito_callback_urls - logout_urls = var.cognito_logout_urls - supported_identity_providers = ["COGNITO"] - - explicit_auth_flows = [ - "ALLOW_REFRESH_TOKEN_AUTH", - "ALLOW_USER_SRP_AUTH", - "ALLOW_USER_PASSWORD_AUTH" - ] - - access_token_validity = 1 - id_token_validity = 1 - refresh_token_validity = 30 - - token_validity_units { - access_token = "hours" - id_token = "hours" - refresh_token = "days" - } -} - -resource "aws_cognito_user_pool_domain" "main" { - count = var.enable_cognito ? 1 : 0 - domain = var.name_prefix - user_pool_id = aws_cognito_user_pool.main[0].id -} - -# ============================================================================= -# JWT Authorizer -# ============================================================================= -resource "aws_apigatewayv2_authorizer" "cognito" { - count = var.enable_cognito ? 1 : 0 - - api_id = aws_apigatewayv2_api.main.id - authorizer_type = "JWT" - name = "${var.name_prefix}-cognito-authorizer" - identity_sources = ["$request.header.Authorization"] - - jwt_configuration { - audience = [aws_cognito_user_pool_client.main[0].id] - issuer = "https://${aws_cognito_user_pool.main[0].endpoint}" - } -} - -# ============================================================================= -# VPC Link -# ============================================================================= -resource "aws_apigatewayv2_vpc_link" "main" { - name = "${var.name_prefix}-vpc-link" - security_group_ids = [var.ecs_security_group_id] - subnet_ids = var.subnet_ids - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-vpc-link" }) -} - -# ============================================================================= -# Integration (VPC Link -> ALB) -# ============================================================================= -resource "aws_apigatewayv2_integration" "main" { - api_id = aws_apigatewayv2_api.main.id - integration_type = "HTTP_PROXY" - integration_method = "ANY" - integration_uri = var.alb_listener_arn - connection_type = "VPC_LINK" - connection_id = aws_apigatewayv2_vpc_link.main.id - - payload_format_version = "1.0" -} - -# ============================================================================= -# Routes (Public - No Auth Required) -# ============================================================================= -resource "aws_apigatewayv2_route" "public" { - for_each = var.enable_cognito ? toset(var.public_routes) : toset([]) - - api_id = aws_apigatewayv2_api.main.id - route_key = "ANY ${each.value}" - target = "integrations/${aws_apigatewayv2_integration.main.id}" -} - -# ============================================================================= -# Routes (Protected - Auth Required) -# ============================================================================= -resource "aws_apigatewayv2_route" "protected" { - for_each = var.enable_cognito ? toset(var.protected_route_patterns) : toset([]) - - api_id = aws_apigatewayv2_api.main.id - route_key = "ANY ${each.value}" - target = "integrations/${aws_apigatewayv2_integration.main.id}" - authorizer_id = aws_apigatewayv2_authorizer.cognito[0].id - authorization_type = "JWT" -} - -# ============================================================================= -# Route (Fallback - When Cognito Disabled) -# ============================================================================= -resource "aws_apigatewayv2_route" "main" { - count = var.enable_cognito ? 0 : 1 - - api_id = aws_apigatewayv2_api.main.id - route_key = "ANY /{proxy+}" - target = "integrations/${aws_apigatewayv2_integration.main.id}" -} - -# ============================================================================= -# Stage -# ============================================================================= -resource "aws_apigatewayv2_stage" "main" { - api_id = aws_apigatewayv2_api.main.id - name = "$default" - auto_deploy = true - - access_log_settings { - destination_arn = aws_cloudwatch_log_group.api_logs.arn - format = jsonencode({ - requestId = "$context.requestId" - ip = "$context.identity.sourceIp" - requestTime = "$context.requestTime" - httpMethod = "$context.httpMethod" - routeKey = "$context.routeKey" - status = "$context.status" - protocol = "$context.protocol" - responseLength = "$context.responseLength" - integrationError = "$context.integrationErrorMessage" - authorizerError = "$context.authorizer.error" - }) - } - - default_route_settings { - detailed_metrics_enabled = true - throttling_burst_limit = 5000 - throttling_rate_limit = 2000 - } - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-stage" }) -} - -# ============================================================================= -# CloudWatch Log Group for API Gateway -# ============================================================================= -resource "aws_cloudwatch_log_group" "api_logs" { - name = "/aws/apigateway/${var.name_prefix}" - retention_in_days = 30 - - tags = var.common_tags -} diff --git a/infra/terraform/modules/api-gateway/outputs.tf b/infra/terraform/modules/api-gateway/outputs.tf deleted file mode 100644 index a098edd1..00000000 --- a/infra/terraform/modules/api-gateway/outputs.tf +++ /dev/null @@ -1,42 +0,0 @@ -output "api_endpoint" { - description = "API Gateway 엔드포인트 URL" - value = aws_apigatewayv2_api.main.api_endpoint -} - -output "api_id" { - description = "API Gateway ID" - value = aws_apigatewayv2_api.main.id -} - -output "stage_arn" { - description = "API Gateway Stage ARN (WAF 연결용)" - value = aws_apigatewayv2_stage.main.arn -} - -output "execution_arn" { - description = "API Gateway Execution ARN" - value = aws_apigatewayv2_api.main.execution_arn -} - -# ============================================================================= -# Cognito Outputs -# ============================================================================= -output "cognito_user_pool_id" { - description = "Cognito User Pool ID" - value = var.enable_cognito ? aws_cognito_user_pool.main[0].id : null -} - -output "cognito_user_pool_client_id" { - description = "Cognito User Pool Client ID" - value = var.enable_cognito ? aws_cognito_user_pool_client.main[0].id : null -} - -output "cognito_user_pool_endpoint" { - description = "Cognito User Pool Endpoint" - value = var.enable_cognito ? aws_cognito_user_pool.main[0].endpoint : null -} - -output "cognito_domain" { - description = "Cognito Domain URL" - value = var.enable_cognito ? "https://${aws_cognito_user_pool_domain.main[0].domain}.auth.ap-northeast-2.amazoncognito.com" : null -} diff --git a/infra/terraform/modules/api-gateway/variables.tf b/infra/terraform/modules/api-gateway/variables.tf deleted file mode 100644 index 98af3525..00000000 --- a/infra/terraform/modules/api-gateway/variables.tf +++ /dev/null @@ -1,64 +0,0 @@ -variable "name_prefix" { - description = "리소스 네이밍 프리픽스" - type = string -} - -variable "common_tags" { - description = "공통 태그" - type = map(string) - default = {} -} - -variable "subnet_ids" { - description = "VPC Link 서브넷 ID 목록" - type = list(string) -} - -variable "ecs_security_group_id" { - description = "ECS 보안그룹 ID" - type = string -} - -variable "alb_listener_arn" { - description = "ALB Listener ARN" - type = string -} - -# ============================================================================= -# Cognito Settings -# ============================================================================= -variable "enable_cognito" { - description = "Cognito 인증 활성화" - type = bool - default = false -} - -variable "cognito_user_pool_name" { - description = "Cognito User Pool 이름" - type = string - default = null -} - -variable "cognito_callback_urls" { - description = "OAuth 콜백 URL 목록" - type = list(string) - default = ["https://localhost:3000/callback"] -} - -variable "cognito_logout_urls" { - description = "로그아웃 URL 목록" - type = list(string) - default = ["https://localhost:3000"] -} - -variable "public_routes" { - description = "인증이 필요없는 공개 라우트 패턴" - type = list(string) - default = ["/api/auth/*", "/health", "/actuator/health"] -} - -variable "protected_route_patterns" { - description = "보호된 라우트 패턴 목록" - type = list(string) - default = ["/api/*"] -} diff --git a/infra/terraform/modules/database/main.tf b/infra/terraform/modules/database/main.tf index 7fa13ed9..4b63c713 100644 --- a/infra/terraform/modules/database/main.tf +++ b/infra/terraform/modules/database/main.tf @@ -9,7 +9,7 @@ resource "aws_security_group" "db_sg" { from_port = 5432 to_port = 5432 protocol = "tcp" - cidr_blocks = [var.vpc_cidr] + security_groups = var.allowed_security_group_ids } egress { diff --git a/infra/terraform/modules/database/variables.tf b/infra/terraform/modules/database/variables.tf index 0eb08d5f..7202d761 100644 --- a/infra/terraform/modules/database/variables.tf +++ b/infra/terraform/modules/database/variables.tf @@ -121,3 +121,9 @@ variable "max_allocated_storage" { type = number default = null } + +variable "allowed_security_group_ids" { + type = list(string) + description = "Security group IDs allowed to access the database" + default = [] +} diff --git a/infra/terraform/modules/dns/main.tf b/infra/terraform/modules/dns/main.tf index b4e09cb3..f6dcbcc4 100644 --- a/infra/terraform/modules/dns/main.tf +++ b/infra/terraform/modules/dns/main.tf @@ -2,11 +2,22 @@ # Route 53 Hosted Zone # ============================================================================= resource "aws_route53_zone" "main" { - name = var.domain_name + count = var.create_hosted_zone ? 1 : 0 + name = var.domain_name tags = merge(var.common_tags, { Name = "${var.name_prefix}-zone" }) } +data "aws_route53_zone" "main" { + count = var.create_hosted_zone ? 0 : 1 + name = var.domain_name + private_zone = false +} + +locals { + zone_id = var.create_hosted_zone ? aws_route53_zone.main[0].zone_id : data.aws_route53_zone.main[0].zone_id +} + # ============================================================================= # ACM Certificate # ============================================================================= @@ -34,7 +45,7 @@ resource "aws_route53_record" "acm_validation" { } } - zone_id = aws_route53_zone.main.zone_id + zone_id = local.zone_id name = each.value.name type = each.value.type ttl = 60 @@ -42,6 +53,7 @@ resource "aws_route53_record" "acm_validation" { allow_overwrite = true } + # ============================================================================= # ACM Certificate Validation # ============================================================================= @@ -50,41 +62,27 @@ resource "aws_acm_certificate_validation" "main" { validation_record_fqdns = [for record in aws_route53_record.acm_validation : record.fqdn] } + # ============================================================================= -# API Gateway Custom Domain (Optional) +# EKS(ALB) Record # ============================================================================= -resource "aws_apigatewayv2_domain_name" "api" { - count = var.create_api_domain ? 1 : 0 - - domain_name = "api.${var.domain_name}" - - domain_name_configuration { - certificate_arn = aws_acm_certificate_validation.main.certificate_arn - endpoint_type = "REGIONAL" - security_policy = "TLS_1_2" - } - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-api-domain" }) +data "aws_lb" "ingress_alb" { + count = var.create_alb_record ? 1 : 0 + name = var.alb_name } -resource "aws_apigatewayv2_api_mapping" "api" { - count = var.create_api_domain ? 1 : 0 - - api_id = var.api_gateway_id - domain_name = aws_apigatewayv2_domain_name.api[0].id - stage = "$default" -} +resource "aws_route53_record" "alb" { + count = var.create_alb_record ? 1 : 0 -resource "aws_route53_record" "api" { - count = var.create_api_domain ? 1 : 0 - - zone_id = aws_route53_zone.main.zone_id - name = "api.${var.domain_name}" + zone_id = local.zone_id + name = var.alb_record_name type = "A" alias { - name = aws_apigatewayv2_domain_name.api[0].domain_name_configuration[0].target_domain_name - zone_id = aws_apigatewayv2_domain_name.api[0].domain_name_configuration[0].hosted_zone_id + name = data.aws_lb.ingress_alb[0].dns_name + zone_id = data.aws_lb.ingress_alb[0].zone_id evaluate_target_health = false } } + + diff --git a/infra/terraform/modules/dns/outputs.tf b/infra/terraform/modules/dns/outputs.tf index 7e827d95..cbbb3588 100644 --- a/infra/terraform/modules/dns/outputs.tf +++ b/infra/terraform/modules/dns/outputs.tf @@ -1,16 +1,13 @@ output "zone_id" { - description = "Route 53 Hosted Zone ID" - value = aws_route53_zone.main.zone_id + value = local.zone_id } output "zone_name" { - description = "Route 53 Hosted Zone 이름" - value = aws_route53_zone.main.name + value = var.domain_name } output "name_servers" { - description = "Route 53 네임서버 목록 (도메인 등록 기관에 설정 필요)" - value = aws_route53_zone.main.name_servers + value = var.create_hosted_zone ? aws_route53_zone.main[0].name_servers : null } output "certificate_arn" { @@ -18,12 +15,8 @@ output "certificate_arn" { value = aws_acm_certificate_validation.main.certificate_arn } -output "api_domain" { - description = "API 커스텀 도메인" - value = var.create_api_domain ? "api.${var.domain_name}" : null -} - -output "api_domain_target" { - description = "API Gateway 커스텀 도메인의 target domain name" - value = var.create_api_domain ? aws_apigatewayv2_domain_name.api[0].domain_name_configuration[0].target_domain_name : null -} +# ALB 레코드 FQDN +output "alb_record_fqdn" { + description = "ALB Alias 레코드 FQDN" + value = var.create_alb_record ? aws_route53_record.alb[0].fqdn : null +} \ No newline at end of file diff --git a/infra/terraform/modules/dns/variables.tf b/infra/terraform/modules/dns/variables.tf index 357eb59e..e77f45c4 100644 --- a/infra/terraform/modules/dns/variables.tf +++ b/infra/terraform/modules/dns/variables.tf @@ -13,15 +13,41 @@ variable "domain_name" { description = "도메인 이름 (ex: spotorder.org)" type = string } - -variable "create_api_domain" { - description = "API Gateway 커스텀 도메인 생성 여부" +variable "create_hosted_zone" { + type = bool + default = false +} +# ALB alias record 생성 여부 +variable "create_alb_record" { + description = "ALB(Route53 Alias) 레코드 생성 여부" type = bool default = true } -variable "api_gateway_id" { - description = "API Gateway ID" +variable "alb_record_name" { + description = "생성할 레코드 이름" + type = string + default = "" +} + +variable "alb_dns_name" { + description = "ALB DNS name (ex: xxx.ap-northeast-2.elb.amazonaws.com)" type = string default = "" } + +variable "alb_zone_id" { + description = "ALB Hosted Zone ID" + type = string + default = "" +} + +variable "associate_to_alb" { + type = bool + default = true +} + +variable "alb_name" { + description = "Ingress가 생성한 ALB 이름 (Ingress annotation load-balancer-name과 동일)" + type = string +} diff --git a/infra/terraform/modules/ecs/codedeploy.tf b/infra/terraform/modules/ecs/codedeploy.tf deleted file mode 100644 index 93757b8f..00000000 --- a/infra/terraform/modules/ecs/codedeploy.tf +++ /dev/null @@ -1,93 +0,0 @@ -# ============================================================================= -# CodeDeploy Application (for Blue/Green ECS Deployment) -# ============================================================================= -resource "aws_codedeploy_app" "main" { - count = var.enable_blue_green ? 1 : 0 - compute_platform = "ECS" - name = "${var.name_prefix}-ecs-app" - - tags = var.common_tags -} - -# ============================================================================= -# CodeDeploy IAM Role -# ============================================================================= -resource "aws_iam_role" "codedeploy" { - count = var.enable_blue_green ? 1 : 0 - name = "${var.name_prefix}-codedeploy-role" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [{ - Action = "sts:AssumeRole" - Effect = "Allow" - Principal = { - Service = "codedeploy.amazonaws.com" - } - }] - }) - - tags = var.common_tags -} - -resource "aws_iam_role_policy_attachment" "codedeploy" { - count = var.enable_blue_green ? 1 : 0 - role = aws_iam_role.codedeploy[0].name - policy_arn = "arn:aws:iam::aws:policy/AWSCodeDeployRoleForECS" -} - -# ============================================================================= -# CodeDeploy Deployment Groups (per service) -# ============================================================================= -resource "aws_codedeploy_deployment_group" "services" { - for_each = var.enable_blue_green ? local.active_services : {} - - app_name = aws_codedeploy_app.main[0].name - deployment_group_name = "${var.name_prefix}-${each.key}-dg" - deployment_config_name = var.deployment_config - service_role_arn = aws_iam_role.codedeploy[0].arn - - auto_rollback_configuration { - enabled = true - events = ["DEPLOYMENT_FAILURE"] - } - - blue_green_deployment_config { - deployment_ready_option { - action_on_timeout = "CONTINUE_DEPLOYMENT" - } - - terminate_blue_instances_on_deployment_success { - action = "TERMINATE" - termination_wait_time_in_minutes = var.termination_wait_time - } - } - - deployment_style { - deployment_option = "WITH_TRAFFIC_CONTROL" - deployment_type = "BLUE_GREEN" - } - - ecs_service { - cluster_name = aws_ecs_cluster.main.name - service_name = aws_ecs_service.services[each.key].name - } - - load_balancer_info { - target_group_pair_info { - prod_traffic_route { - listener_arns = [var.alb_listener_arn] - } - - target_group { - name = var.target_group_names[each.key] - } - - target_group { - name = lookup(var.target_group_names, "${each.key}-green", "${var.name_prefix}-${each.key}-tg-g") - } - } - } - - tags = merge(var.common_tags, { Service = each.key }) -} diff --git a/infra/terraform/modules/ecs/main.tf b/infra/terraform/modules/ecs/main.tf deleted file mode 100644 index ed6cfbf7..00000000 --- a/infra/terraform/modules/ecs/main.tf +++ /dev/null @@ -1,684 +0,0 @@ -# ============================================================================= -# Local Variables -# ============================================================================= -locals { - # Gateway 및 excluded_services에 포함된 서비스 필터링 - active_services = { - for k, v in var.services : k => v - if !contains(var.excluded_services, k) - } -} - -# ============================================================================= -# Cloud Map (Service Discovery Namespace) -# ============================================================================= -resource "aws_service_discovery_private_dns_namespace" "main" { - name = "${var.project}.local" - vpc = var.vpc_id - - tags = merge(var.common_tags, { Name = "${var.project}.local" }) -} - -# ============================================================================= -# Cloud Map Services (Multiple) - Only when Service Connect is disabled -# ============================================================================= -resource "aws_service_discovery_service" "services" { - for_each = var.enable_service_connect ? {} : var.services - - name = each.key - - dns_config { - namespace_id = aws_service_discovery_private_dns_namespace.main.id - routing_policy = "MULTIVALUE" - - dns_records { - ttl = 10 - type = "A" - } - dns_records { - ttl = 10 - type = "SRV" - } - } - - health_check_custom_config { - failure_threshold = 1 - } - - tags = merge(var.common_tags, { Service = each.key }) -} - -# ============================================================================= -# ECS Cluster (Single shared cluster) -# ============================================================================= -resource "aws_ecs_cluster" "main" { - name = "${var.name_prefix}-cluster" - - service_connect_defaults { - namespace = aws_service_discovery_private_dns_namespace.main.arn - } - - setting { - name = "containerInsights" - value = "enabled" - } - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-cluster" }) -} - -resource "aws_ecs_cluster_capacity_providers" "main" { - cluster_name = aws_ecs_cluster.main.name - - capacity_providers = ["FARGATE", "FARGATE_SPOT"] - - default_capacity_provider_strategy { - base = 1 - weight = 100 - capacity_provider = "FARGATE" - } -} - -# ============================================================================= -# MSA Security Group (Shared by all services) -# ============================================================================= -resource "aws_security_group" "msa_sg" { - name = "${var.name_prefix}-msa-sg" - description = "Security group for MSA services" - vpc_id = var.vpc_id - - # ALB에서 들어오는 트래픽 허용 (Gateway: 8080) - ingress { - description = "Traffic from ALB" - from_port = 8080 - to_port = 8080 - protocol = "tcp" - security_groups = [var.alb_security_group_id] - } - - # Service Connect를 위한 자기 참조 규칙 (서비스 간 통신: 8080-8084) - ingress { - description = "Inter-service communication" - from_port = 8080 - to_port = 8084 - protocol = "tcp" - self = true - } - - # Service Connect proxy port - ingress { - description = "Service Connect proxy" - from_port = 15000 - to_port = 15001 - protocol = "tcp" - self = true - } - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-msa-sg" }) -} - -# ============================================================================= -# IAM Role for ECS Task Execution -# ============================================================================= -resource "aws_iam_role" "ecs_task_execution_role" { - name = "${var.name_prefix}-ecs-task-execution-role" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [{ - Action = "sts:AssumeRole" - Effect = "Allow" - Principal = { Service = "ecs-tasks.amazonaws.com" } - }] - }) - - tags = var.common_tags -} - -resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_policy" { - role = aws_iam_role.ecs_task_execution_role.name - policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" -} - -# ============================================================================= -# SSM Parameter Store 읽기 권한 (Secrets 주입용) -# ============================================================================= -resource "aws_iam_role_policy" "ecs_task_execution_ssm" { - name = "${var.name_prefix}-ecs-ssm-policy" - role = aws_iam_role.ecs_task_execution_role.id - - policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Effect = "Allow" - Action = [ - "ssm:GetParameters", - "ssm:GetParameter" - ] - Resource = "arn:aws:ssm:${var.region}:*:parameter/${var.project}/${var.environment}/*" - }, - { - Effect = "Allow" - Action = [ - "kms:Decrypt" - ] - Resource = "*" - Condition = { - StringEquals = { - "kms:ViaService" = "ssm.${var.region}.amazonaws.com" - } - } - } - ] - }) -} - -# ============================================================================= -# IAM Role for ECS Task (Application level) -# ============================================================================= -resource "aws_iam_role" "ecs_task_role" { - name = "${var.name_prefix}-ecs-task-role" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [{ - Action = "sts:AssumeRole" - Effect = "Allow" - Principal = { Service = "ecs-tasks.amazonaws.com" } - }] - }) - - tags = var.common_tags -} - -# Service Connect requires CloudWatch permissions -resource "aws_iam_role_policy" "ecs_service_connect" { - name = "${var.name_prefix}-service-connect-policy" - role = aws_iam_role.ecs_task_role.id - - policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Effect = "Allow" - Action = [ - "logs:CreateLogStream", - "logs:PutLogEvents" - ] - Resource = "*" - } - ] - }) -} - -# ============================================================================= -# CloudWatch Log Groups (per service) -# ============================================================================= -resource "aws_cloudwatch_log_group" "services" { - for_each = var.services - - name = "/ecs/${var.project}-${each.key}" - retention_in_days = var.log_retention_days - - tags = merge(var.common_tags, { Service = each.key }) -} - -# ============================================================================= -# ECS Task Definitions (per service) -# ============================================================================= -resource "aws_ecs_task_definition" "services" { - for_each = var.services - - family = "${var.project}-${each.key}-task" - network_mode = "awsvpc" - requires_compatibilities = ["FARGATE"] - cpu = each.value.cpu - memory = each.value.memory - execution_role_arn = aws_iam_role.ecs_task_execution_role.arn - task_role_arn = aws_iam_role.ecs_task_role.arn - - container_definitions = jsonencode([ - { - name = "${var.project}-${each.key}-container" - image = "${var.ecr_repository_urls[each.key]}:latest" - essential = true - - portMappings = [{ - name = each.key - containerPort = each.value.container_port - hostPort = each.value.container_port - protocol = "tcp" - appProtocol = "http" - }] - - environment = concat( - # 공통 환경 변수 - [ - { - name = "SPRING_PROFILES_ACTIVE" - value = var.environment - }, - { - name = "SPRING_DATA_REDIS_HOST" - value = var.redis_endpoint - }, - { - name = "SPRING_DATA_REDIS_PORT" - value = "6379" - } - ], - # Kafka 환경 변수 (gateway 제외) - each.key != "gateway" && var.kafka_bootstrap_servers != "" ? [ - { - name = "SPRING_KAFKA_BOOTSTRAP_SERVERS" - value = var.kafka_bootstrap_servers - }, - { - name = "SPRING_KAFKA_CONSUMER_GROUP_ID" - value = "${var.project}-${each.key}" - }, - { - name = "SPRING_KAFKA_CONSUMER_AUTO_OFFSET_RESET" - value = "earliest" - } - ] : [], - # 백엔드 서비스 전용 (gateway 제외) - DB, JPA, JWT 설정 - each.key != "gateway" ? [ - { - name = "SPRING_DATASOURCE_URL" - value = "jdbc:postgresql://${var.db_endpoint}/${var.db_name}?currentSchema=${lookup(each.value.environment_vars, "DB_SCHEMA", each.key)}" - }, - { - name = "SPRING_DATASOURCE_USERNAME" - value = var.db_username - }, - { - name = "SPRING_JPA_HIBERNATE_DDL_AUTO" - value = "update" - }, - { - name = "SPRING_JPA_SHOW_SQL" - value = "false" - }, - { - name = "SPRING_JPA_PROPERTIES_HIBERNATE_DIALECT" - value = "org.hibernate.dialect.PostgreSQLDialect" - }, - { - name = "SPRING_JWT_EXPIRE_MS" - value = tostring(var.jwt_expire_ms) - }, - { - name = "SPRING_SECURITY_REFRESH_TOKEN_EXPIRE_DAYS" - value = tostring(var.refresh_token_expire_days) - }, - { - name = "SERVICE_ACTIVE_REGIONS" - value = var.service_active_regions - } - ] : [], - # Service Discovery 환경 변수 (Feign Client URLs) - each.key != "gateway" ? [ - { - name = "FEIGN_ORDER_URL" - value = "http://order.${var.project}.local:${var.services["order"].container_port}" - }, - { - name = "FEIGN_PAYMENT_URL" - value = "http://payment.${var.project}.local:${var.services["payment"].container_port}" - }, - { - name = "FEIGN_STORE_URL" - value = "http://store.${var.project}.local:${var.services["store"].container_port}" - }, - { - name = "FEIGN_USER_URL" - value = "http://user.${var.project}.local:${var.services["user"].container_port}" - } - ] : [], - # Mail 설정 (user 서비스용) - each.key == "user" ? [ - { - name = "SPRING_MAIL_HOST" - value = var.mail_host - }, - { - name = "SPRING_MAIL_PORT" - value = tostring(var.mail_port) - }, - { - name = "SPRING_MAIL_USERNAME" - value = var.mail_username - }, - { - name = "SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH" - value = "true" - }, - { - name = "SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE" - value = "true" - } - ] : [], - # Toss 결제 설정 (payment 서비스용) - each.key == "payment" ? [ - { - name = "TOSS_PAYMENTS_BASE_URL" - value = var.toss_base_url - }, - { - name = "TOSS_PAYMENTS_CUSTOMER_KEY" - value = var.toss_customer_key - } - ] : [], - # Gateway 전용 설정 - Spring Cloud Gateway 라우트 (WebFlux 버전용 새 property 이름) - each.key == "gateway" ? [ - # User Service - Auth 관련 - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_0_ID" - value = "user-login" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_0_URI" - value = "http://user.${var.project}.local:${var.services["user"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_0_PREDICATES_0" - value = "Path=/api/login" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_1_ID" - value = "user-join" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_1_URI" - value = "http://user.${var.project}.local:${var.services["user"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_1_PREDICATES_0" - value = "Path=/api/join" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_2_ID" - value = "user-auth" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_2_URI" - value = "http://user.${var.project}.local:${var.services["user"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_2_PREDICATES_0" - value = "Path=/api/auth/**" - }, - # User Service - Users & Admin - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_3_ID" - value = "user-service" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_3_URI" - value = "http://user.${var.project}.local:${var.services["user"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_3_PREDICATES_0" - value = "Path=/api/users/**" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_4_ID" - value = "admin-service" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_4_URI" - value = "http://user.${var.project}.local:${var.services["user"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_4_PREDICATES_0" - value = "Path=/api/admin/**" - }, - # Store Service - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_5_ID" - value = "store-service" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_5_URI" - value = "http://store.${var.project}.local:${var.services["store"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_5_PREDICATES_0" - value = "Path=/api/stores/**" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_6_ID" - value = "category-service" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_6_URI" - value = "http://store.${var.project}.local:${var.services["store"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_6_PREDICATES_0" - value = "Path=/api/categories/**" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_7_ID" - value = "review-service" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_7_URI" - value = "http://store.${var.project}.local:${var.services["store"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_7_PREDICATES_0" - value = "Path=/api/reviews/**" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_8_ID" - value = "menu-service" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_8_URI" - value = "http://store.${var.project}.local:${var.services["store"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_8_PREDICATES_0" - value = "Path=/api/menus/**" - }, - # Order Service - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_9_ID" - value = "order-service" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_9_URI" - value = "http://order.${var.project}.local:${var.services["order"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_9_PREDICATES_0" - value = "Path=/api/orders/**" - }, - # Payment Service - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_10_ID" - value = "payment-service" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_10_URI" - value = "http://payment.${var.project}.local:${var.services["payment"].container_port}" - }, - { - name = "SPRING_CLOUD_GATEWAY_SERVER_WEBFLUX_ROUTES_10_PREDICATES_0" - value = "Path=/api/payments/**" - }, - # Actuator 설정 (새 property 이름) - { - name = "MANAGEMENT_ENDPOINTS_WEB_EXPOSURE_INCLUDE" - value = "health,info,gateway" - }, - { - name = "MANAGEMENT_ENDPOINT_GATEWAY_ACCESS" - value = "unrestricted" - }, - # 디버깅용 로깅 - { - name = "LOGGING_LEVEL_ORG_SPRINGFRAMEWORK_CLOUD_GATEWAY" - value = "DEBUG" - } - ] : [], - # 서비스별 커스텀 환경 변수 - [for k, v in each.value.environment_vars : { - name = k - value = v - }] - ) - - # ============================================================= - # Secrets (Parameter Store에서 주입) - # ============================================================= - secrets = concat( - # 백엔드 서비스 (gateway 제외) - DB 비밀번호, JWT 시크릿 - each.key != "gateway" ? [ - { - name = "SPRING_DATASOURCE_PASSWORD" - valueFrom = var.parameter_arns.db_password - }, - { - name = "SPRING_JWT_SECRET" - valueFrom = var.parameter_arns.jwt_secret - } - ] : [], - # Mail 비밀번호 (user 서비스) - each.key == "user" && var.parameter_arns.mail_password != null ? [ - { - name = "SPRING_MAIL_PASSWORD" - valueFrom = var.parameter_arns.mail_password - } - ] : [], - # Toss 시크릿 키 (payment 서비스) - each.key == "payment" && var.parameter_arns.toss_secret_key != null ? [ - { - name = "TOSS_PAYMENTS_SECRET_KEY" - valueFrom = var.parameter_arns.toss_secret_key - } - ] : [] - ) - - logConfiguration = { - logDriver = "awslogs" - options = { - "awslogs-group" = aws_cloudwatch_log_group.services[each.key].name - "awslogs-region" = var.region - "awslogs-stream-prefix" = "ecs" - } - } - - healthCheck = { - command = ["CMD-SHELL", "curl -f http://localhost:${each.value.container_port}${each.value.health_check_path} || exit 1"] - interval = 30 - timeout = 5 - retries = 3 - startPeriod = 60 - } - } - ]) - - tags = merge(var.common_tags, { Service = each.key }) -} - -# ============================================================================= -# ECS Services (per service) - Active Services Only -# ============================================================================= -resource "aws_ecs_service" "services" { - for_each = local.active_services - - name = "${var.project}-${each.key}-service" - cluster = aws_ecs_cluster.main.id - task_definition = aws_ecs_task_definition.services[each.key].arn - desired_count = var.standby_mode ? 0 : each.value.desired_count - launch_type = "FARGATE" - - # 모든 active 서비스를 ALB에 연결 - dynamic "load_balancer" { - for_each = contains(keys(var.target_group_arns), each.key) ? [1] : [] - content { - target_group_arn = var.target_group_arns[each.key] - container_name = "${var.project}-${each.key}-container" - container_port = each.value.container_port - } - } - - network_configuration { - subnets = var.subnet_ids - security_groups = [aws_security_group.msa_sg.id] - assign_public_ip = var.assign_public_ip - } - - # Blue/Green 배포 컨트롤러 - dynamic "deployment_controller" { - for_each = var.enable_blue_green ? [1] : [] - content { - type = "CODE_DEPLOY" - } - } - - # 기본 롤링 배포 설정 (Blue/Green 비활성화시) - dynamic "deployment_circuit_breaker" { - for_each = var.enable_blue_green ? [] : [1] - content { - enable = true - rollback = true - } - } - - # Service Connect Configuration - dynamic "service_connect_configuration" { - for_each = var.enable_service_connect ? [1] : [] - content { - enabled = true - namespace = aws_service_discovery_private_dns_namespace.main.arn - - service { - port_name = each.key - discovery_name = each.key - - client_alias { - port = each.value.container_port - dns_name = "${each.key}.${var.project}.local" - } - } - - log_configuration { - log_driver = "awslogs" - options = { - "awslogs-group" = aws_cloudwatch_log_group.services[each.key].name - "awslogs-region" = var.region - "awslogs-stream-prefix" = "service-connect" - } - } - } - } - - # Service Discovery Registration (only when Service Connect is disabled) - dynamic "service_registries" { - for_each = var.enable_service_connect ? [] : [1] - content { - registry_arn = aws_service_discovery_service.services[each.key].arn - container_name = "${var.project}-${each.key}-container" - container_port = each.value.container_port - } - } - - depends_on = [var.alb_listener_arn] - - tags = merge(var.common_tags, { Service = each.key }) - - lifecycle { - ignore_changes = var.enable_blue_green ? [task_definition, load_balancer] : [] - } -} diff --git a/infra/terraform/modules/ecs/outputs.tf b/infra/terraform/modules/ecs/outputs.tf deleted file mode 100644 index fc8710db..00000000 --- a/infra/terraform/modules/ecs/outputs.tf +++ /dev/null @@ -1,57 +0,0 @@ -output "cluster_name" { - description = "ECS 클러스터 이름" - value = aws_ecs_cluster.main.name -} - -output "cluster_arn" { - description = "ECS 클러스터 ARN" - value = aws_ecs_cluster.main.arn -} - -output "service_names" { - description = "ECS 서비스 이름 맵" - value = { for k, v in aws_ecs_service.services : k => v.name } -} - -output "service_arns" { - description = "ECS 서비스 ARN 맵" - value = { for k, v in aws_ecs_service.services : k => v.id } -} - -output "security_group_id" { - description = "MSA 보안그룹 ID" - value = aws_security_group.msa_sg.id -} - -output "task_definition_arns" { - description = "Task Definition ARN 맵" - value = { for k, v in aws_ecs_task_definition.services : k => v.arn } -} - -output "cloudwatch_log_groups" { - description = "CloudWatch Log Group 이름 맵" - value = { for k, v in aws_cloudwatch_log_group.services : k => v.name } -} - -output "service_discovery_namespace_id" { - description = "Service Discovery Namespace ID" - value = aws_service_discovery_private_dns_namespace.main.id -} - -output "service_discovery_namespace_arn" { - description = "Service Discovery Namespace ARN" - value = aws_service_discovery_private_dns_namespace.main.arn -} - -# ============================================================================= -# CodeDeploy Outputs -# ============================================================================= -output "codedeploy_app_name" { - description = "CodeDeploy Application 이름" - value = var.enable_blue_green ? aws_codedeploy_app.main[0].name : null -} - -output "codedeploy_deployment_group_names" { - description = "CodeDeploy Deployment Group 이름 맵" - value = var.enable_blue_green ? { for k, v in aws_codedeploy_deployment_group.services : k => v.deployment_group_name } : {} -} diff --git a/infra/terraform/modules/ecs/variables.tf b/infra/terraform/modules/ecs/variables.tf deleted file mode 100644 index 3ecbf149..00000000 --- a/infra/terraform/modules/ecs/variables.tf +++ /dev/null @@ -1,261 +0,0 @@ -# ============================================================================= -# Project Settings -# ============================================================================= -variable "project" { - description = "프로젝트 이름" - type = string -} - -variable "environment" { - description = "환경 (dev, prod)" - type = string - default = "dev" -} - -variable "name_prefix" { - description = "리소스 네이밍 프리픽스" - type = string -} - -variable "common_tags" { - description = "공통 태그" - type = map(string) - default = {} -} - -variable "region" { - description = "AWS 리전" - type = string -} - -# ============================================================================= -# Network Settings -# ============================================================================= -variable "vpc_id" { - description = "VPC ID" - type = string -} - -variable "subnet_ids" { - description = "ECS 서비스 서브넷 ID 목록" - type = list(string) -} - -variable "assign_public_ip" { - description = "Public IP 할당 여부" - type = bool - default = true -} - -# ============================================================================= -# ALB Integration -# ============================================================================= -variable "alb_security_group_id" { - description = "ALB 보안그룹 ID" - type = string -} - -variable "target_group_arns" { - description = "ALB Target Group ARN 맵" - type = map(string) -} - -variable "alb_listener_arn" { - description = "ALB Listener ARN (의존성용)" - type = string -} - -# ============================================================================= -# ECR Integration -# ============================================================================= -variable "ecr_repository_urls" { - description = "ECR 저장소 URL 맵" - type = map(string) -} - -# ============================================================================= -# Services Configuration -# ============================================================================= -variable "services" { - description = "서비스 구성 맵" - type = map(object({ - container_port = number - cpu = string - memory = string - desired_count = number - health_check_path = string - path_patterns = list(string) - priority = number - environment_vars = map(string) - })) -} - -variable "enable_service_connect" { - description = "ECS Service Connect 활성화 여부" - type = bool - default = true -} - -variable "log_retention_days" { - description = "로그 보관 일수" - type = number - default = 30 -} - -# ============================================================================= -# Parameter Store ARNs (Secrets 주입용) -# ============================================================================= -variable "parameter_arns" { - description = "Parameter Store ARN 맵" - type = object({ - db_password = string - jwt_secret = string - mail_password = optional(string) - toss_secret_key = optional(string) - }) -} - -# ============================================================================= -# Database Settings -# ============================================================================= -variable "db_endpoint" { - description = "RDS 엔드포인트" - type = string -} - -variable "db_name" { - description = "데이터베이스 이름" - type = string -} - -variable "db_username" { - description = "데이터베이스 사용자 이름" - type = string - sensitive = true -} - -# ============================================================================= -# Redis Settings -# ============================================================================= -variable "redis_endpoint" { - description = "Redis 엔드포인트" - type = string - default = "" -} - -# ============================================================================= -# Kafka Settings -# ============================================================================= -variable "kafka_bootstrap_servers" { - description = "Kafka Bootstrap Servers" - type = string - default = "" -} - -# ============================================================================= -# JWT Settings -# ============================================================================= -variable "jwt_expire_ms" { - description = "JWT 만료 시간 (밀리초)" - type = number - default = 3600000 -} - -variable "refresh_token_expire_days" { - description = "리프레시 토큰 만료 일수" - type = number - default = 14 -} - -# ============================================================================= -# Mail Settings -# ============================================================================= -variable "mail_host" { - description = "SMTP 호스트" - type = string - default = "smtp.gmail.com" -} - -variable "mail_port" { - description = "SMTP 포트" - type = number - default = 587 -} - -variable "mail_username" { - description = "SMTP 사용자 이름" - type = string - default = "" -} - -# ============================================================================= -# Toss Payments Settings -# ============================================================================= -variable "toss_base_url" { - description = "Toss Payments API URL" - type = string - default = "https://api.tosspayments.com" -} - -variable "toss_customer_key" { - description = "Toss Payments 고객 키" - type = string - default = "customer_1" -} - -# ============================================================================= -# Service Settings -# ============================================================================= -variable "service_active_regions" { - description = "서비스 활성 지역" - type = string - default = "종로구" -} - -# ============================================================================= -# Standby Mode (비용 절감) -# ============================================================================= -variable "standby_mode" { - description = "스탠바이 모드 (true면 모든 서비스 desired_count = 0)" - type = bool - default = false -} - -# ============================================================================= -# Blue/Green Deployment -# ============================================================================= -variable "enable_blue_green" { - description = "Blue/Green 배포 활성화 (CodeDeploy)" - type = bool - default = false -} - -variable "excluded_services" { - description = "배포에서 제외할 서비스 목록 (예: gateway)" - type = list(string) - default = [] -} - -variable "target_group_names" { - description = "ALB Target Group 이름 맵" - type = map(string) - default = {} -} - -variable "green_target_group_arns" { - description = "Green Target Group ARN 맵 (Blue/Green용)" - type = map(string) - default = {} -} - -variable "deployment_config" { - description = "CodeDeploy 배포 구성" - type = string - default = "CodeDeployDefault.ECSAllAtOnce" -} - -variable "termination_wait_time" { - description = "이전 태스크 종료 대기 시간 (분)" - type = number - default = 5 -} diff --git a/infra/terraform/modules/eks-addons/main.tf b/infra/terraform/modules/eks-addons/main.tf new file mode 100644 index 00000000..77e5d91d --- /dev/null +++ b/infra/terraform/modules/eks-addons/main.tf @@ -0,0 +1,41 @@ +resource "aws_eks_addon" "vpc_cni" { + count = var.enable_vpc_cni ? 1 : 0 + cluster_name = var.cluster_name + addon_name = "vpc-cni" + addon_version = var.vpc_cni_version != "" ? var.vpc_cni_version : null + + resolve_conflicts_on_update = "OVERWRITE" + tags = var.common_tags +} + +resource "aws_eks_addon" "coredns" { + count = var.enable_coredns ? 1 : 0 + cluster_name = var.cluster_name + addon_name = "coredns" + addon_version = var.coredns_version != "" ? var.coredns_version : null + + resolve_conflicts_on_update = "OVERWRITE" + tags = var.common_tags +} + +resource "aws_eks_addon" "kube_proxy" { + count = var.enable_kube_proxy ? 1 : 0 + cluster_name = var.cluster_name + addon_name = "kube-proxy" + addon_version = var.kube_proxy_version != "" ? var.kube_proxy_version : null + + resolve_conflicts_on_update = "OVERWRITE" + tags = var.common_tags +} + +resource "aws_eks_addon" "ebs_csi" { + count = var.enable_ebs_csi ? 1 : 0 + cluster_name = var.cluster_name + addon_name = "aws-ebs-csi-driver" + addon_version = var.ebs_csi_version != "" ? var.ebs_csi_version : null + + resolve_conflicts_on_update = "OVERWRITE" + service_account_role_arn = var.ebs_csi_irsa_role_arn != "" ? var.ebs_csi_irsa_role_arn : null + + tags = var.common_tags +} diff --git a/infra/terraform/modules/eks-addons/output.tf b/infra/terraform/modules/eks-addons/output.tf new file mode 100644 index 00000000..5100729c --- /dev/null +++ b/infra/terraform/modules/eks-addons/output.tf @@ -0,0 +1,8 @@ +output "enabled_addons" { + value = { + vpc_cni = var.enable_vpc_cni + coredns = var.enable_coredns + kube_proxy = var.enable_kube_proxy + ebs_csi = var.enable_ebs_csi + } +} diff --git a/infra/terraform/modules/eks-addons/variables.tf b/infra/terraform/modules/eks-addons/variables.tf new file mode 100644 index 00000000..5449817d --- /dev/null +++ b/infra/terraform/modules/eks-addons/variables.tf @@ -0,0 +1,47 @@ +variable "common_tags" { + type = map(string) + default = {} +} +variable "cluster_name" { + type = string +} + +variable "enable_vpc_cni" { + type = bool + default = true +} + +variable "enable_coredns" { + type = bool + default = true +} +variable "enable_kube_proxy" { + type = bool + default = true +} +variable "enable_ebs_csi" { + type = bool + default = true +} + +variable "vpc_cni_version" { + type = string + default = "" +} +variable "coredns_version" { + type = string + default = "" +} +variable "kube_proxy_version" { + type = string + default = "" +} +variable "ebs_csi_version" { + type = string + default = "" +} + +variable "ebs_csi_irsa_role_arn" { + type = string + default = "" +} diff --git a/infra/terraform/modules/eks/main.tf b/infra/terraform/modules/eks/main.tf new file mode 100644 index 00000000..17143160 --- /dev/null +++ b/infra/terraform/modules/eks/main.tf @@ -0,0 +1,155 @@ +resource "aws_iam_role" "cluster" { + name = "${var.name_prefix}-eks-cluster-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = "sts:AssumeRole" + Principal = { Service = "eks.amazonaws.com" } + }] + }) + + tags = var.common_tags +} + +resource "aws_iam_role_policy_attachment" "cluster_policy" { + role = aws_iam_role.cluster.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy" +} + +resource "aws_iam_role_policy_attachment" "vpc_resource_controller" { + role = aws_iam_role.cluster.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController" +} + +resource "aws_eks_cluster" "this" { + name = var.cluster_name + role_arn = aws_iam_role.cluster.arn + version = var.cluster_version + + vpc_config { + subnet_ids = var.subnet_ids + endpoint_private_access = var.endpoint_private_access + endpoint_public_access = var.endpoint_public_access + public_access_cidrs = var.public_access_cidrs + } + + enabled_cluster_log_types = var.enabled_cluster_log_types + + tags = merge(var.common_tags, { Name = var.cluster_name }) + + depends_on = [ + aws_iam_role_policy_attachment.cluster_policy, + aws_iam_role_policy_attachment.vpc_resource_controller + ] +} + +resource "aws_security_group" "node" { + name = "${var.name_prefix}-eks-node-sg" + description = "EKS node security group" + vpc_id = var.vpc_id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(var.common_tags, { + Name = "${var.name_prefix}-eks-node-sg" + }) +} + +resource "aws_iam_role" "node" { + name = "${var.name_prefix}-eks-node-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = "sts:AssumeRole" + Principal = { Service = "ec2.amazonaws.com" } + }] + }) + + tags = var.common_tags +} + +resource "aws_iam_role_policy_attachment" "node_worker" { + role = aws_iam_role.node.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" +} + +resource "aws_iam_role_policy_attachment" "node_cni" { + role = aws_iam_role.node.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" +} + +resource "aws_iam_role_policy_attachment" "node_ecr" { + role = aws_iam_role.node.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" +} + +resource "aws_iam_role_policy_attachment" "node_ssm" { + count = var.enable_node_ssm ? 1 : 0 + role = aws_iam_role.node.name + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + +resource "aws_launch_template" "node" { + name_prefix = "${var.name_prefix}-eks-ng-" + + vpc_security_group_ids = [aws_security_group.node.id] + + metadata_options { + http_endpoint = "enabled" + http_tokens = "required" + http_put_response_hop_limit = 2 + } + + block_device_mappings { + device_name = "/dev/xvda" + ebs { + volume_size = var.node_disk_size + volume_type = "gp3" + encrypted = true + delete_on_termination = true + } + } + + tag_specifications { + resource_type = "instance" + tags = merge(var.common_tags, { + Name = "${var.name_prefix}-eks-node" + }) + } + + tags = var.common_tags +} + +resource "aws_eks_node_group" "default" { + count = var.enable_node_group && var.node_desired_size > 0 ? 1 : 0 + + cluster_name = aws_eks_cluster.this.name + node_group_name = "${var.name_prefix}-node-group" + node_role_arn = aws_iam_role.node.arn + subnet_ids = var.node_subnet_ids + + # ec2spot + capacity_type = "SPOT" + + + instance_types = var.node_instance_types + + scaling_config { + desired_size = var.node_desired_size + min_size = var.node_min_size + max_size = var.node_max_size + } + + tags = merge(var.common_tags, { + Name = "${var.name_prefix}-node-group" + }) +} diff --git a/infra/terraform/modules/eks/output.tf b/infra/terraform/modules/eks/output.tf new file mode 100644 index 00000000..130e488b --- /dev/null +++ b/infra/terraform/modules/eks/output.tf @@ -0,0 +1,36 @@ +output "cluster_name" { + value = aws_eks_cluster.this.name +} + +output "cluster_arn" { + value = aws_eks_cluster.this.arn +} + +output "cluster_endpoint" { + value = aws_eks_cluster.this.endpoint +} + + +output "cluster_ca" { + description = "EKS cluster CA certificate (base64 encoded)" + value = aws_eks_cluster.this.certificate_authority[0].data +} + + +output "oidc_issuer_url" { + value = aws_eks_cluster.this.identity[0].oidc[0].issuer +} + +output "node_security_group_id" { + value = aws_security_group.node.id +} + +output "node_role_arn" { + value = aws_iam_role.node.arn +} + + + +output "cluster_security_group_id" { + value = aws_eks_cluster.this.vpc_config[0].cluster_security_group_id +} \ No newline at end of file diff --git a/infra/terraform/modules/eks/variables.tf b/infra/terraform/modules/eks/variables.tf new file mode 100644 index 00000000..90ab639f --- /dev/null +++ b/infra/terraform/modules/eks/variables.tf @@ -0,0 +1,95 @@ +variable "name_prefix" { + type = string +} + +variable "common_tags" { + type = map(string) + default = {} +} + +variable "cluster_name" { + type = string +} + +variable "cluster_version" { + type = string + default = "1.29" +} + +variable "vpc_id" { + type = string +} + +variable "subnet_ids" { + type = list(string) +} + +variable "node_subnet_ids" { + type = list(string) +} + +variable "endpoint_private_access" { + type = bool + default = true +} + +variable "endpoint_public_access" { + type = bool + default = false +} + +variable "public_access_cidrs" { + type = list(string) + default = ["0.0.0.0/0"] +} + +variable "enabled_cluster_log_types" { + type = list(string) + default = ["api", "audit", "authenticator", "controllerManager", "scheduler"] +} + +variable "node_instance_types" { + type = list(string) + default = ["t3.large"] +} + +variable "node_capacity_type" { + type = string + default = "ON_DEMAND" +} + +variable "node_ami_type" { + type = string + default = "AL2_x86_64" +} + +variable "node_disk_size" { + type = number + default = 50 +} + +variable "node_desired_size" { + type = number + default = 2 +} + +variable "node_min_size" { + type = number + default = 2 +} + +variable "node_max_size" { + type = number + default = 4 +} + +variable "enable_node_ssm" { + type = bool + default = true +} + +variable "enable_node_group" { + description = "Worker Node Group 생성 여부" + type = bool + default = false +} diff --git a/infra/terraform/modules/elasticache/main.tf b/infra/terraform/modules/elasticache/main.tf index 67ab2806..2ddd25bf 100644 --- a/infra/terraform/modules/elasticache/main.tf +++ b/infra/terraform/modules/elasticache/main.tf @@ -1,7 +1,7 @@ # ============================================================================= # ElastiCache Redis - 캐시 및 세션 저장소 # ============================================================================= - +# # ----------------------------------------------------------------------------- # Subnet Group # ----------------------------------------------------------------------------- @@ -12,16 +12,16 @@ resource "aws_elasticache_subnet_group" "redis" { tags = merge(var.common_tags, { Name = "${var.name_prefix}-redis-subnet" }) } -# ----------------------------------------------------------------------------- -# Security Group -# ----------------------------------------------------------------------------- +# # ----------------------------------------------------------------------------- +# # Security Group +# # ----------------------------------------------------------------------------- resource "aws_security_group" "redis" { name = "${var.name_prefix}-redis-sg" description = "Security group for ElastiCache Redis" vpc_id = var.vpc_id ingress { - description = "Redis from ECS" + description = "Redis from EKS nodes" from_port = 6379 to_port = 6379 protocol = "tcp" diff --git a/infra/terraform/modules/irsa/main.tf b/infra/terraform/modules/irsa/main.tf new file mode 100644 index 00000000..e2c4d014 --- /dev/null +++ b/infra/terraform/modules/irsa/main.tf @@ -0,0 +1,82 @@ +data "tls_certificate" "oidc" { + url = var.oidc_issuer_url +} +resource "aws_iam_openid_connect_provider" "this" { + url = var.oidc_issuer_url + client_id_list = ["sts.amazonaws.com"] + thumbprint_list = [data.tls_certificate.oidc.certificates[0].sha1_fingerprint] + tags = var.common_tags +} + +resource "aws_iam_role" "service_account" { + for_each = var.service_accounts + + name = "${var.name_prefix}-${each.key}-irsa" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = "sts:AssumeRoleWithWebIdentity" + Principal = { Federated = aws_iam_openid_connect_provider.this.arn } + Condition = { + StringEquals = { + "${replace(var.oidc_issuer_url, "https://", "")}:sub" = "system:serviceaccount:${each.value.namespace}:${each.value.service_account}" + "${replace(var.oidc_issuer_url, "https://", "")}:aud" = "sts.amazonaws.com" + } + } + }] + }) + + tags = var.common_tags +} + +data "aws_iam_policy_document" "alb_controller" { + statement { + effect = "Allow" + actions = [ + "iam:CreateServiceLinkedRole", + "ec2:Describe*", + "elasticloadbalancing:*", + "ec2:CreateSecurityGroup", + "ec2:CreateTags", + "ec2:DeleteTags", + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress", + "acm:DescribeCertificate", + "acm:ListCertificates", + "acm:GetCertificate", + "waf-regional:*", + "wafv2:*", + "shield:*", + "cognito-idp:DescribeUserPoolClient" + ] + resources = ["*"] + } +} + +resource "aws_iam_policy" "alb_controller" { + name = "${var.name_prefix}-AWSLoadBalancerControllerIAMPolicy" + description = "IAM policy for AWS Load Balancer Controller" + policy = data.aws_iam_policy_document.alb_controller.json + tags = var.common_tags +} + +resource "aws_iam_role_policy_attachment" "alb_controller" { + role = aws_iam_role.service_account["aws_load_balancer_controller"].name + policy_arn = aws_iam_policy.alb_controller.arn +} + +resource "aws_iam_role_policy_attachment" "managed" { + for_each = { + for k, v in var.service_accounts : k => v + if try(length(v.policy_arn), 0) > 0 + } + + role = aws_iam_role.service_account[each.key].name + policy_arn = each.value.policy_arn +} + + + + diff --git a/infra/terraform/modules/irsa/output.tf b/infra/terraform/modules/irsa/output.tf new file mode 100644 index 00000000..f3a39ca7 --- /dev/null +++ b/infra/terraform/modules/irsa/output.tf @@ -0,0 +1,7 @@ +output "oidc_provider_arn" { + value = aws_iam_openid_connect_provider.this.arn +} + +output "service_account_role_arns" { + value = { for k, r in aws_iam_role.service_account : k => r.arn } +} diff --git a/infra/terraform/modules/irsa/variables.tf b/infra/terraform/modules/irsa/variables.tf new file mode 100644 index 00000000..d784c168 --- /dev/null +++ b/infra/terraform/modules/irsa/variables.tf @@ -0,0 +1,21 @@ +variable "name_prefix" { + type = string +} +variable "common_tags" { + type = map(string) + default = {} +} + +variable "oidc_issuer_url" { + type = string +} + +variable "service_accounts" { + type = map(object({ + namespace = string + service_account = string + policy_arn = optional(string) + create_k8s_sa = optional(bool, true) + })) + default = {} +} diff --git a/infra/terraform/modules/k8s-bootstrap/main.tf b/infra/terraform/modules/k8s-bootstrap/main.tf new file mode 100644 index 00000000..25d57596 --- /dev/null +++ b/infra/terraform/modules/k8s-bootstrap/main.tf @@ -0,0 +1,108 @@ +# ----------------------------------------------------------------------------- +# (A) ArgoCD Remote Access (namespace/sa/crb) +# ----------------------------------------------------------------------------- +resource "kubernetes_namespace_v1" "argocd" { + count = var.enable_argocd_access ? 1 : 0 + + metadata { + name = "argocd" + } +} + +resource "kubernetes_service_account_v1" "argocd_manager" { + count = var.enable_argocd_access ? 1 : 0 + + metadata { + name = "argocd-manager" + namespace = kubernetes_namespace_v1.argocd[0].metadata[0].name + } +} + +resource "kubernetes_cluster_role_binding_v1" "argocd_manager_admin" { + count = var.enable_argocd_access ? 1 : 0 + + metadata { + name = "argocd-manager-cluster-admin" + } + + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "cluster-admin" + } + + subject { + kind = "ServiceAccount" + name = kubernetes_service_account_v1.argocd_manager[0].metadata[0].name + namespace = kubernetes_service_account_v1.argocd_manager[0].metadata[0].namespace + } +} + +# ----------------------------------------------------------------------------- +# (B) AWS Load Balancer Controller (Helm) +# ----------------------------------------------------------------------------- +resource "helm_release" "aws_load_balancer_controller" { + count = var.enable_lbc ? 1 : 0 + name = "aws-load-balancer-controller" + namespace = "kube-system" + repository = "https://aws.github.io/eks-charts" + chart = "aws-load-balancer-controller" + version = var.alb_controller_chart_version + + set { + name = "clusterName" + value = var.cluster_name + } + + # IRSA에서 SA를 만들고 role-arn annotation 붙일 거면 false 유지 + set { + name = "serviceAccount.create" + value = "false" + } + + set { + name = "serviceAccount.name" + value = "aws-load-balancer-controller" + } +} + + + + +# irsa - SA 생성 +locals { + k8s_sas = { + for k, v in var.service_accounts : k => v + if try(v.create_k8s_sa, true) + } + + namespaces = toset([ + for k, v in local.k8s_sas : v.namespace + if v.namespace != "kube-system" && v.namespace != "default" + ]) +} + +resource "kubernetes_namespace_v1" "this" { + for_each = local.namespaces + + metadata { + name = each.value + } +} + +resource "kubernetes_service_account_v1" "this" { + for_each = { + for k, v in local.k8s_sas : k => v + if try(v.create_k8s_sa, true) + } + + metadata { + name = each.value.service_account + namespace = each.value.namespace + annotations = { + "eks.amazonaws.com/role-arn" = var.service_account_role_arns[each.key] + } + } + + depends_on = [kubernetes_namespace_v1.this] +} diff --git a/infra/terraform/modules/k8s-bootstrap/variables.tf b/infra/terraform/modules/k8s-bootstrap/variables.tf new file mode 100644 index 00000000..696dbf6c --- /dev/null +++ b/infra/terraform/modules/k8s-bootstrap/variables.tf @@ -0,0 +1,32 @@ +variable "cluster_name" { + type = string +} + +variable "alb_controller_chart_version" { + type = string +} + + +variable "enable_lbc" { + type = bool + default = true +} + +variable "enable_argocd_access" { + type = bool + default = true +} + + +# irsa - SA 생성 +variable "service_accounts" { + type = map(object({ + namespace = string + service_account = string + create_k8s_sa = optional(bool, true) + })) +} + +variable "service_account_role_arns" { + type = map(string) +} diff --git a/infra/terraform/modules/kafka/main.tf b/infra/terraform/modules/kafka/main.tf index 0bc11bb5..78346d09 100644 --- a/infra/terraform/modules/kafka/main.tf +++ b/infra/terraform/modules/kafka/main.tf @@ -1,23 +1,27 @@ # ============================================================================= # Kafka EC2 Module (KRaft Mode - Single/Multi Broker) # ============================================================================= - locals { kafka_port = 9092 kraft_port = 9093 internal_port = 9094 - # 브로커 배치: AZ-a에 1개, AZ-c에 2개 - brokers = var.broker_count > 1 ? { + effective_subnet_ids = length(var.subnet_ids) > 0 ? var.subnet_ids : (var.subnet_id != null ? [var.subnet_id] : []) + single_subnet = length(local.effective_subnet_ids) <= 1 + + brokers = var.broker_count > 1 ? ( + local.single_subnet ? { + "1" = { subnet_index = 0, az_suffix = "a" } + "2" = { subnet_index = 0, az_suffix = "a" } + "3" = { subnet_index = 0, az_suffix = "a" } + } : { "1" = { subnet_index = 0, az_suffix = "a" } "2" = { subnet_index = 1, az_suffix = "c" } "3" = { subnet_index = 1, az_suffix = "c" } - } : { + } + ) : { "1" = { subnet_index = 0, az_suffix = "a" } } - - # 사용할 서브넷 결정 - effective_subnet_ids = length(var.subnet_ids) > 0 ? var.subnet_ids : (var.subnet_id != null ? [var.subnet_id] : []) } # ============================================================================= @@ -27,7 +31,7 @@ resource "aws_security_group" "kafka" { name = "${var.name_prefix}-kafka-sg" vpc_id = var.vpc_id - # Kafka 클라이언트 포트 (ECS에서 접근) + # Kafka 클라이언트 포트 (EKS에서 접근) ingress { from_port = local.kafka_port to_port = local.kafka_port @@ -149,12 +153,12 @@ resource "aws_instance" "kafka" { volume_size = var.volume_size iops = 3000 throughput = 125 - delete_on_termination = var.broker_count == 1 # prod에서는 데이터 보존 + delete_on_termination = var.delete_on_termination encrypted = true } user_data = base64encode(templatefile( - var.broker_count > 1 ? "${path.module}/user-data-cluster.sh" : "${path.module}/user-data.sh", + var.broker_count > 1 ? "${path.module}/user-data-cluster.sh" : "${path.module}/user-data.sh", { kafka_version = var.kafka_version kafka_cluster_id = var.cluster_id @@ -180,13 +184,14 @@ resource "aws_instance" "kafka" { } } + # ============================================================================= # Route53 Private DNS (서비스 디스커버리용) # ============================================================================= resource "aws_route53_zone" "kafka" { count = var.create_private_dns ? 1 : 0 - name = "kafka.internal" + name = "${var.name_prefix}.kafka.internal" vpc { vpc_id = var.vpc_id @@ -195,7 +200,6 @@ resource "aws_route53_zone" "kafka" { tags = merge(var.common_tags, { Name = "${var.name_prefix}-kafka-zone" }) } -# 개별 브로커 DNS 레코드 resource "aws_route53_record" "kafka_brokers" { for_each = var.create_private_dns ? local.brokers : {} @@ -206,7 +210,6 @@ resource "aws_route53_record" "kafka_brokers" { records = [aws_instance.kafka[each.key].private_ip] } -# 클러스터 부트스트랩 레코드 (모든 브로커 IP) resource "aws_route53_record" "kafka_bootstrap" { count = var.create_private_dns ? 1 : 0 @@ -215,4 +218,4 @@ resource "aws_route53_record" "kafka_bootstrap" { type = "A" ttl = 60 records = [for k, v in aws_instance.kafka : v.private_ip] -} +} \ No newline at end of file diff --git a/infra/terraform/modules/kafka/variables.tf b/infra/terraform/modules/kafka/variables.tf index 0ae646aa..e333ef82 100644 --- a/infra/terraform/modules/kafka/variables.tf +++ b/infra/terraform/modules/kafka/variables.tf @@ -110,3 +110,9 @@ variable "create_private_dns" { type = bool default = false } + + +variable "delete_on_termination" { + type = bool + default = true +} \ No newline at end of file diff --git a/infra/terraform/modules/monitoring/main.tf b/infra/terraform/modules/monitoring/main.tf index 8a83e87f..c9a86536 100644 --- a/infra/terraform/modules/monitoring/main.tf +++ b/infra/terraform/modules/monitoring/main.tf @@ -21,7 +21,12 @@ resource "aws_sns_topic_subscription" "email" { # ----------------------------------------------------------------------------- # ECS Alarms # ----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- +# ECS Alarms (Optional) +# ----------------------------------------------------------------------------- resource "aws_cloudwatch_metric_alarm" "ecs_cpu_high" { + count = var.enable_ecs_alarms ? 1 : 0 + alarm_name = "${var.name_prefix}-ecs-cpu-high" comparison_operator = "GreaterThanThreshold" evaluation_periods = 2 @@ -43,6 +48,8 @@ resource "aws_cloudwatch_metric_alarm" "ecs_cpu_high" { } resource "aws_cloudwatch_metric_alarm" "ecs_memory_high" { + count = var.enable_ecs_alarms ? 1 : 0 + alarm_name = "${var.name_prefix}-ecs-memory-high" comparison_operator = "GreaterThanThreshold" evaluation_periods = 2 @@ -63,6 +70,7 @@ resource "aws_cloudwatch_metric_alarm" "ecs_memory_high" { tags = var.common_tags } + # ----------------------------------------------------------------------------- # RDS Alarms # ----------------------------------------------------------------------------- @@ -127,9 +135,11 @@ resource "aws_cloudwatch_metric_alarm" "rds_storage_low" { } # ----------------------------------------------------------------------------- -# ALB Alarms +# ALB Alarms (Optional) # ----------------------------------------------------------------------------- resource "aws_cloudwatch_metric_alarm" "alb_5xx_errors" { + count = var.enable_alb_alarms ? 1 : 0 + alarm_name = "${var.name_prefix}-alb-5xx-errors" comparison_operator = "GreaterThanThreshold" evaluation_periods = 2 @@ -151,6 +161,8 @@ resource "aws_cloudwatch_metric_alarm" "alb_5xx_errors" { } resource "aws_cloudwatch_metric_alarm" "alb_response_time" { + count = var.enable_alb_alarms ? 1 : 0 + alarm_name = "${var.name_prefix}-alb-response-time" comparison_operator = "GreaterThanThreshold" evaluation_periods = 2 @@ -170,6 +182,7 @@ resource "aws_cloudwatch_metric_alarm" "alb_response_time" { tags = var.common_tags } + # ----------------------------------------------------------------------------- # ElastiCache (Redis) Alarms # ----------------------------------------------------------------------------- diff --git a/infra/terraform/modules/monitoring/outputs.tf b/infra/terraform/modules/monitoring/outputs.tf index e315af4e..5b5e3332 100644 --- a/infra/terraform/modules/monitoring/outputs.tf +++ b/infra/terraform/modules/monitoring/outputs.tf @@ -14,12 +14,12 @@ output "sns_topic_name" { output "alarm_arns" { description = "생성된 CloudWatch Alarm ARN 목록" value = { - ecs_cpu = aws_cloudwatch_metric_alarm.ecs_cpu_high.arn - ecs_memory = aws_cloudwatch_metric_alarm.ecs_memory_high.arn + ecs_cpu = var.enable_ecs_alarms ? aws_cloudwatch_metric_alarm.ecs_cpu_high[0].arn : null + ecs_memory = var.enable_ecs_alarms ? aws_cloudwatch_metric_alarm.ecs_memory_high[0].arn : null rds_cpu = aws_cloudwatch_metric_alarm.rds_cpu_high.arn rds_connections = aws_cloudwatch_metric_alarm.rds_connections_high.arn rds_storage = aws_cloudwatch_metric_alarm.rds_storage_low.arn - alb_5xx = aws_cloudwatch_metric_alarm.alb_5xx_errors.arn - alb_response = aws_cloudwatch_metric_alarm.alb_response_time.arn + alb_5xx = var.enable_alb_alarms ? aws_cloudwatch_metric_alarm.alb_5xx_errors[0].arn : null + alb_response = var.enable_alb_alarms ? aws_cloudwatch_metric_alarm.alb_response_time[0].arn : null } } diff --git a/infra/terraform/modules/monitoring/variables.tf b/infra/terraform/modules/monitoring/variables.tf index c153416e..ab4e813e 100644 --- a/infra/terraform/modules/monitoring/variables.tf +++ b/infra/terraform/modules/monitoring/variables.tf @@ -26,11 +26,13 @@ variable "alert_email" { variable "ecs_cluster_name" { description = "ECS 클러스터 이름" type = string + default = "" } variable "ecs_service_name" { description = "ECS 서비스 이름" type = string + default = "" } variable "ecs_cpu_threshold" { @@ -77,6 +79,7 @@ variable "rds_storage_threshold_bytes" { variable "alb_arn_suffix" { description = "ALB ARN suffix (app/xxx/xxx 형식)" type = string + default = "" } variable "alb_5xx_threshold" { @@ -111,3 +114,18 @@ variable "redis_memory_threshold" { type = number default = 80 } + +# ============================================================================= +# Monitoring 옵션 +# ============================================================================= +variable "enable_ecs_alarms" { + description = "ECS 알람 생성 여부 (EKS 전환 시 보통 false)" + type = bool + default = false +} + +variable "enable_alb_alarms" { + description = "ALB 알람 생성 여부 (ALB ARN suffix가 확정된 경우에만 true)" + type = bool + default = false +} diff --git a/infra/terraform/modules/network/main.tf b/infra/terraform/modules/network/main.tf index 0831a44f..b570f196 100644 --- a/infra/terraform/modules/network/main.tf +++ b/infra/terraform/modules/network/main.tf @@ -13,25 +13,33 @@ resource "aws_vpc" "main" { # Public Subnets # ============================================================================= resource "aws_subnet" "public_a" { - vpc_id = aws_vpc.main.id - cidr_block = var.public_subnet_cidrs["a"] - availability_zone = var.availability_zones["a"] + vpc_id = aws_vpc.main.id + cidr_block = var.public_subnet_cidrs["a"] + availability_zone = var.availability_zones["a"] + map_public_ip_on_launch = true tags = merge(var.common_tags, { Name = "${var.name_prefix}-public-a" Tier = "public" + + "kubernetes.io/cluster/${var.eks_cluster_name}" = "shared" + "kubernetes.io/role/elb" = "1" }) } resource "aws_subnet" "public_c" { - count = var.use_nat_gateway && !var.single_nat_gateway ? 1 : (contains(keys(var.public_subnet_cidrs), "c") ? 1 : 0) - vpc_id = aws_vpc.main.id - cidr_block = lookup(var.public_subnet_cidrs, "c", "10.1.2.0/24") - availability_zone = var.availability_zones["c"] + count = var.use_nat_gateway && !var.single_nat_gateway ? 1 : (contains(keys(var.public_subnet_cidrs), "c") ? 1 : 0) + vpc_id = aws_vpc.main.id + cidr_block = lookup(var.public_subnet_cidrs, "c", "10.1.2.0/24") + availability_zone = var.availability_zones["c"] + map_public_ip_on_launch = true tags = merge(var.common_tags, { Name = "${var.name_prefix}-public-c" Tier = "public" + + "kubernetes.io/cluster/${var.eks_cluster_name}" = "shared" + "kubernetes.io/role/elb" = "1" }) } @@ -39,22 +47,33 @@ resource "aws_subnet" "public_c" { # Private Subnets # ============================================================================= resource "aws_subnet" "private_a" { - vpc_id = aws_vpc.main.id - cidr_block = var.private_subnet_cidrs["a"] - availability_zone = var.availability_zones["a"] + vpc_id = aws_vpc.main.id + cidr_block = var.private_subnet_cidrs["a"] + availability_zone = var.availability_zones["a"] + map_public_ip_on_launch = false tags = merge(var.common_tags, { Name = "${var.name_prefix}-private-a" Tier = "private" + + "kubernetes.io/cluster/${var.eks_cluster_name}" = "shared" + "kubernetes.io/role/internal-elb" = "1" }) } resource "aws_subnet" "private_c" { - vpc_id = aws_vpc.main.id - cidr_block = var.private_subnet_cidrs["c"] - availability_zone = var.availability_zones["c"] + vpc_id = aws_vpc.main.id + cidr_block = var.private_subnet_cidrs["c"] + availability_zone = var.availability_zones["c"] + map_public_ip_on_launch = false + + tags = merge(var.common_tags, { + Name = "${var.name_prefix}-private-c" + Tier = "private" - tags = merge(var.common_tags, { Name = "${var.name_prefix}-private-c" }) + "kubernetes.io/cluster/${var.eks_cluster_name}" = "shared" + "kubernetes.io/role/internal-elb" = "1" + }) } # ============================================================================= @@ -118,10 +137,13 @@ resource "aws_instance" "nat_instance" { user_data = <<-EOF #!/bin/bash - sudo sysctl -w net.ipv4.ip_forward=1 - sudo nft add table ip nat - sudo nft add chain ip nat postrouting { type nat hook postrouting priority 100 \; } - sudo nft add rule ip nat postrouting oifname eth0 masquerade + set -euo pipefail + sysctl -w net.ipv4.ip_forward=1 + + # NAT (masquerade) via nftables (AL2023) + nft list table ip nat >/dev/null 2>&1 || nft add table ip nat + nft list chain ip nat postrouting >/dev/null 2>&1 || nft add chain ip nat postrouting '{ type nat hook postrouting priority 100 ; }' + nft add rule ip nat postrouting oifname "eth0" masquerade 2>/dev/null || true EOF tags = merge(var.common_tags, { Name = "${var.name_prefix}-nat-instance" }) @@ -152,19 +174,21 @@ resource "aws_nat_gateway" "main" { depends_on = [aws_internet_gateway.igw] } - # ============================================================================= # Route Tables # ============================================================================= +# ------------------------- +# Public Route Table +# ------------------------- resource "aws_route_table" "public" { vpc_id = aws_vpc.main.id + tags = merge(var.common_tags, { Name = "${var.name_prefix}-public-rt" }) +} - route { - cidr_block = "0.0.0.0/0" - gateway_id = aws_internet_gateway.igw.id - } - - tags = merge(var.common_tags, { Name = "${var.name_prefix}-public-rt" }) +resource "aws_route" "public_internet" { + route_table_id = aws_route_table.public.id + destination_cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.igw.id } resource "aws_route_table_association" "public_a" { @@ -172,44 +196,75 @@ resource "aws_route_table_association" "public_a" { route_table_id = aws_route_table.public.id } +locals { + create_public_c = ( + var.use_nat_gateway && !var.single_nat_gateway + ? true + : contains(keys(var.public_subnet_cidrs), "c") + ) +} + resource "aws_route_table_association" "public_c" { - count = length(aws_subnet.public_c) > 0 ? 1 : 0 + count = local.create_public_c ? 1 : 0 subnet_id = aws_subnet.public_c[0].id route_table_id = aws_route_table.public.id } -# Private Route Table for AZ-a + + +# ------------------------- +# Private Route Table (AZ-a) +# ------------------------- resource "aws_route_table" "private_a" { vpc_id = aws_vpc.main.id + tags = merge(var.common_tags, { Name = "${var.name_prefix}-private-rt-a" }) +} - route { - cidr_block = "0.0.0.0/0" - nat_gateway_id = var.use_nat_gateway ? aws_nat_gateway.main[0].id : null - network_interface_id = var.use_nat_gateway ? null : aws_instance.nat_instance[0].primary_network_interface_id - } +# Private default route via NAT Gateway (when enabled) +resource "aws_route" "private_a_nat_gw" { + count = var.use_nat_gateway ? 1 : 0 + route_table_id = aws_route_table.private_a.id + destination_cidr_block = "0.0.0.0/0" + nat_gateway_id = aws_nat_gateway.main[0].id +} - tags = merge(var.common_tags, { Name = "${var.name_prefix}-private-rt-a" }) +# Private default route via NAT Instance (dev cost optimized) +resource "aws_route" "private_a_nat_instance" { + count = var.use_nat_gateway ? 0 : 1 + route_table_id = aws_route_table.private_a.id + destination_cidr_block = "0.0.0.0/0" + network_interface_id = aws_instance.nat_instance[0].primary_network_interface_id +} + +resource "aws_route_table_association" "private_a" { + subnet_id = aws_subnet.private_a.id + route_table_id = aws_route_table.private_a.id } -# Private Route Table for AZ-c (separate when using multi NAT Gateway) +# ------------------------- +# Private Route Table (AZ-c) +# - Multi NAT GW이면 별도 RT + NAT GW(1) +# - Single NAT GW이면 private_a RT를 공유 +# ------------------------- resource "aws_route_table" "private_c" { count = var.use_nat_gateway && !var.single_nat_gateway ? 1 : 0 vpc_id = aws_vpc.main.id - route { - cidr_block = "0.0.0.0/0" - nat_gateway_id = aws_nat_gateway.main[1].id - } - tags = merge(var.common_tags, { Name = "${var.name_prefix}-private-rt-c" }) } -resource "aws_route_table_association" "private_a" { - subnet_id = aws_subnet.private_a.id - route_table_id = aws_route_table.private_a.id +resource "aws_route" "private_c_nat_gw" { + count = var.use_nat_gateway && !var.single_nat_gateway ? 1 : 0 + route_table_id = aws_route_table.private_c[0].id + destination_cidr_block = "0.0.0.0/0" + nat_gateway_id = aws_nat_gateway.main[1].id } resource "aws_route_table_association" "private_c" { - subnet_id = aws_subnet.private_c.id - route_table_id = var.use_nat_gateway && !var.single_nat_gateway ? aws_route_table.private_c[0].id : aws_route_table.private_a.id + subnet_id = aws_subnet.private_c.id + route_table_id = ( + var.use_nat_gateway && !var.single_nat_gateway + ? aws_route_table.private_c[0].id + : aws_route_table.private_a.id + ) } diff --git a/infra/terraform/modules/network/outputs.tf b/infra/terraform/modules/network/outputs.tf index 008c656d..0a3cbad5 100644 --- a/infra/terraform/modules/network/outputs.tf +++ b/infra/terraform/modules/network/outputs.tf @@ -14,13 +14,15 @@ output "public_subnet_a_id" { } output "public_subnet_c_id" { - description = "Public Subnet C ID" - value = length(aws_subnet.public_c) > 0 ? aws_subnet.public_c[0].id : null + value = try(aws_subnet.public_c[0].id, null) } + output "public_subnet_ids" { - description = "Public Subnet IDs" - value = length(aws_subnet.public_c) > 0 ? [aws_subnet.public_a.id, aws_subnet.public_c[0].id] : [aws_subnet.public_a.id] + value = compact([ + aws_subnet.public_a.id, + try(aws_subnet.public_c[0].id, null) + ]) } output "private_subnet_a_id" { @@ -55,3 +57,10 @@ output "nat_type" { description = "NAT 유형 (gateway 또는 instance)" value = var.use_nat_gateway ? "gateway" : "instance" } + + + +# ============================================================================= +# argoCD - peer routing +# ============================================================================= + diff --git a/infra/terraform/modules/network/variables.tf b/infra/terraform/modules/network/variables.tf index fba795a0..556a0ae3 100644 --- a/infra/terraform/modules/network/variables.tf +++ b/infra/terraform/modules/network/variables.tf @@ -41,7 +41,7 @@ variable "nat_instance_type" { variable "use_nat_gateway" { description = "NAT Gateway 사용 여부 (false면 NAT Instance)" type = bool - default = false + default = true } variable "single_nat_gateway" { @@ -49,3 +49,12 @@ variable "single_nat_gateway" { type = bool default = true } + +# ============================================================================= +# EKS Cluster Name +# ============================================================================= +variable "eks_cluster_name" { + description = "EKS 클러스터 이름 (Subnet tag 용도)" + type = string + default = "spot-eks" +} diff --git a/infra/terraform/modules/parameter-store/outputs.tf b/infra/terraform/modules/parameter-store/outputs.tf index a1d19c00..00c777f0 100644 --- a/infra/terraform/modules/parameter-store/outputs.tf +++ b/infra/terraform/modules/parameter-store/outputs.tf @@ -3,7 +3,7 @@ # ============================================================================= # ============================================================================= -# Parameter ARNs (ECS Task Definition secrets 블록에서 사용) +# Parameter ARNs (EKS에서는 IRSA로 Pod에서 SSM GetParameter 권한 부여 후 사용) # ============================================================================= output "db_password_arn" { description = "DB Password Parameter ARN" @@ -39,14 +39,14 @@ output "redis_endpoint_arn" { # All Parameter ARNs (IAM Policy용) # ============================================================================= output "all_parameter_arns" { - description = "모든 Parameter ARN 목록 (IAM Policy용)" + description = "모든 Parameter ARN 목록 (EKS IRSA/IAM Policy용)" value = compact([ aws_ssm_parameter.db_password.arn, aws_ssm_parameter.jwt_secret.arn, - var.mail_password != "" ? aws_ssm_parameter.mail_password[0].arn : null, - var.toss_secret_key != "" ? aws_ssm_parameter.toss_secret_key[0].arn : null, + var.mail_password != "" ? aws_ssm_parameter.mail_password[0].arn : null, + var.toss_secret_key != "" ? aws_ssm_parameter.toss_secret_key[0].arn : null, aws_ssm_parameter.db_endpoint.arn, - var.redis_endpoint != "" ? aws_ssm_parameter.redis_endpoint[0].arn : null, + var.redis_endpoint != "" ? aws_ssm_parameter.redis_endpoint[0].arn : null, ]) } @@ -54,6 +54,6 @@ output "all_parameter_arns" { # Parameter Name Prefix (for wildcard IAM policies) # ============================================================================= output "parameter_prefix" { - description = "Parameter Store prefix for IAM policies" + description = "Parameter Store prefix (EKS IRSA/IAM wildcard policy에서 사용)" value = "/${var.project}/${var.environment}" } diff --git a/infra/terraform/modules/s3/main.tf b/infra/terraform/modules/s3/main.tf index 402c5cf7..db7747e5 100644 --- a/infra/terraform/modules/s3/main.tf +++ b/infra/terraform/modules/s3/main.tf @@ -206,7 +206,23 @@ resource "aws_s3_bucket_policy" "logs" { "aws:SourceAccount" = var.account_id } } + }, + # ALB Access Logs + { + Sid = "AllowALBAccessLogs" + Effect = "Allow" + Principal = { + Service = "elasticloadbalancing.amazonaws.com" + } + Action = "s3:PutObject" + Resource = "${aws_s3_bucket.logs.arn}/alb-logs/*" + Condition = { + StringEquals = { + "aws:SourceAccount" = var.account_id + } + } } + ] }) } diff --git a/infra/terraform/modules/waf/main.tf b/infra/terraform/modules/waf/main.tf index 5c7f91a0..05421317 100644 --- a/infra/terraform/modules/waf/main.tf +++ b/infra/terraform/modules/waf/main.tf @@ -1,9 +1,9 @@ # ============================================================================= -# WAF Web ACL for API Gateway +# WAF Web ALB (EKS Ingress via AWS Load Balancer Controller) # ============================================================================= resource "aws_wafv2_web_acl" "main" { name = "${var.name_prefix}-waf" - description = "WAF for API Gateway" + description = "WAF for ALB - EKS Ingress" scope = "REGIONAL" default_action { @@ -111,24 +111,38 @@ resource "aws_wafv2_web_acl" "main" { tags = merge(var.common_tags, { Name = "${var.name_prefix}-waf" }) } -# ============================================================================= -# WAF Association with API Gateway -# ============================================================================= -#resource "aws_wafv2_web_acl_association" "api_gateway" { -# count = var.api_gateway_stage_arn != "" ? 1 : 0 -# -# resource_arn = var.api_gateway_stage_arn -# web_acl_arn = aws_wafv2_web_acl.main.arn -#} + + + # ============================================================================= -# CloudWatch Log Group for WAF +# CloudWatch Log Group for WAF logs +# NOTE: WAF requires the log group name to start with "aws-waf-logs-" # ============================================================================= resource "aws_cloudwatch_log_group" "waf" { name = "aws-waf-logs-${var.name_prefix}" retention_in_days = var.log_retention_days + tags = var.common_tags +} - tags = var.common_tags +# ============================================================================= +# Allow WAF to write logs to CloudWatch Logs +# ============================================================================= +resource "aws_cloudwatch_log_resource_policy" "waf" { + policy_name = "aws-waf-logs-${var.name_prefix}" + + policy_document = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AWSWAFLoggingPermissions" + Effect = "Allow" + Principal = { Service = "wafv2.amazonaws.com" } + Action = ["logs:CreateLogStream", "logs:PutLogEvents"] + Resource = "${aws_cloudwatch_log_group.waf.arn}:*" + } + ] + }) } # ============================================================================= @@ -137,4 +151,6 @@ resource "aws_cloudwatch_log_group" "waf" { resource "aws_wafv2_web_acl_logging_configuration" "main" { log_destination_configs = [aws_cloudwatch_log_group.waf.arn] resource_arn = aws_wafv2_web_acl.main.arn -} + + depends_on = [aws_cloudwatch_log_resource_policy.waf] +} \ No newline at end of file diff --git a/infra/terraform/modules/waf/variables.tf b/infra/terraform/modules/waf/variables.tf index 6b92f8a7..bcea0f54 100644 --- a/infra/terraform/modules/waf/variables.tf +++ b/infra/terraform/modules/waf/variables.tf @@ -9,11 +9,6 @@ variable "common_tags" { default = {} } -variable "api_gateway_stage_arn" { - description = "API Gateway Stage ARN" - type = string - default = "" -} variable "rate_limit" { description = "5분당 최대 요청 수 (Rate Limiting)" @@ -26,3 +21,8 @@ variable "log_retention_days" { type = number default = 30 } + +variable "alb_name" { + type = string + default = "spot-dev-alb" +} diff --git a/run_docker.sh b/run_docker.sh deleted file mode 100755 index b2757bd4..00000000 --- a/run_docker.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -clear - -set -e - -echo "=== 기존 컨테이너 종료 및 삭제 ===" -docker compose down --remove-orphans - -docker volume ls -q | grep "kafka-data" | xargs -r docker volume rm -docker rm -f redis_cache local-postgres_db spot-gateway spot-user spot-store spot-order spot-payment 2>/dev/null || true - -echo "=== 각 MSA 서비스 빌드 ===" -for service in spot-gateway spot-user spot-store spot-order spot-payment; do - echo ">> $service 빌드 시작" - (cd "$service" && ./gradlew bootJar -x test) -done - -docker compose up --build -d - -mkdir -p ./logs -LOG_FILE="./logs/current_logs_$(date +'%Y%m%d_%H%M%S').txt" - -docker compose logs | \ - grep --line-buffered -v -E "redis_cache|local-postgres_db" | \ - tee -a "$LOG_FILE" \ No newline at end of file diff --git a/run_k3d.sh b/run_k3d.sh index 8e4429b1..f529014a 100755 --- a/run_k3d.sh +++ b/run_k3d.sh @@ -4,7 +4,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" CLUSTER_NAME="spot-cluster" -REGISTRY_NAME="spot-registry.localhost" +REGISTRY_NAME="127.0.0.1" REGISTRY_PORT="5111" # 로컬 레지스트리는 프록시 우회 @@ -89,6 +89,12 @@ create_cluster() { kubectl wait --for=condition=ready node --all --timeout=180s log_info "Cluster created successfully!" + + log_info "Creating Namespaces..." + + kubectl apply -f "$SCRIPT_DIR/infra/k8s/base/namespace.yaml" + + log_info "Creating Namespaces..." } build_and_push_images() { @@ -126,8 +132,6 @@ build_and_push_images() { install_argocd() { log_info "Installing ArgoCD..." - kubectl create namespace argocd --dry-run=client -o yaml | kubectl apply -f - - kubectl apply -n argocd --server-side -f https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml log_info "Waiting for ArgoCD to be ready..." @@ -143,8 +147,6 @@ install_argocd() { install_prometheus() { log_info "Installing Prometheus (kube-prometheus-stack) via Helm..." - kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f - - helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true helm repo update @@ -167,14 +169,13 @@ install_strimzi() { log_info "Installing Strimzi Kafka Operator via Helm..." kubectl create namespace strimzi --dry-run=client -o yaml | kubectl apply -f - - kubectl create namespace spot --dry-run=client -o yaml | kubectl apply -f - - + helm repo add strimzi https://strimzi.io/charts/ >/dev/null 2>&1 || true helm repo update helm upgrade --install strimzi-operator strimzi/strimzi-kafka-operator \ -n strimzi \ - --set watchNamespaces={spot} \ + --set watchNamespaces={infra} \ --wait log_info "Strimzi Operator installed successfully!" @@ -195,20 +196,34 @@ deploy_all() { # kubectl wait --for=condition=available deployment/redis -n spot --timeout=180s log_info "Waiting for Kafka Cluster (KRaft)..." - kubectl wait --for=condition=Ready kafka/spot-cluster -n spot --timeout=300s + kubectl wait --for=condition=Ready kafka/kafka-cluster -n infra --timeout=300s log_info "Waiting for Kafka Connect..." - kubectl wait --for=condition=Ready kafkaconnect/spot-connect -n spot --timeout=300s + kubectl wait --for=condition=Ready kafkaconnect/spot-connect -n infra --timeout=300s log_info "Waiting for Kafka UI..." - kubectl wait --for=condition=available deployment/kafka-ui -n spot --timeout=180s + kubectl wait --for=condition=available deployment/kafka-ui -n infra --timeout=180s log_info "Waiting for Temporal..." - kubectl wait --for=condition=available deployment/temporal -n spot --timeout=180s - kubectl wait --for=condition=available deployment/temporal-ui -n spot --timeout=180s + kubectl wait --for=condition=available deployment/temporal -n infra --timeout=180s + kubectl wait --for=condition=available deployment/temporal-ui -n infra --timeout=180s log_info "Infrastructure deployed successfully!" + # Apps 배포 (Helm) + log_info "Deploying Apps (Helm)..." + + CHART_PATH="$SCRIPT_DIR/infra/spot-apps" + + helm upgrade --install spot "$CHART_PATH" \ + -n spot \ + -f "$CHART_PATH/values/local-values.yaml" \ + --set global.secretName=spot-secrets \ + --wait \ + --timeout 10m + + log_info "Apps Deployed Successfully!" + log_info "Waiting for monitoring system to be ready..." kubectl wait --for=condition=available deployment/loki-deploy -n monitoring --timeout=180s || true kubectl wait --for=condition=available deployment/grafana-deploy -n monitoring --timeout=180s || true @@ -268,7 +283,7 @@ main() { cleanup_existing create_cluster build_and_push_images - install_argocd +# install_argocd install_prometheus install_strimzi deploy_all diff --git a/temporal b/temporal new file mode 160000 index 00000000..ca6536ad --- /dev/null +++ b/temporal @@ -0,0 +1 @@ +Subproject commit ca6536ada85941ddec669540f637925cfbc3e6de