From 4e47648cb17f6492b956480f8a37ad4950f190e1 Mon Sep 17 00:00:00 2001 From: lvalentine6 Date: Sat, 29 Nov 2025 23:57:04 +0900 Subject: [PATCH 1/9] chore: migrate Datadog monitors and SLOs to Terraform --- terraform/datadog/.terraform.lock.hcl | 47 ++++++ terraform/datadog/backend.tf | 9 ++ terraform/datadog/integrations.tf | 44 ++++++ terraform/datadog/locals.tf | 41 +++++ terraform/datadog/monitors.tf | 217 ++++++++++++++++++++++++++ terraform/datadog/outputs.tf | 1 + terraform/datadog/providers.tf | 19 +++ terraform/datadog/slos.tf | 60 +++++++ terraform/datadog/terraform.tfvars | 1 + terraform/datadog/variables.tf | 1 + 10 files changed, 440 insertions(+) create mode 100644 terraform/datadog/.terraform.lock.hcl create mode 100644 terraform/datadog/backend.tf create mode 100644 terraform/datadog/integrations.tf create mode 100644 terraform/datadog/locals.tf create mode 100644 terraform/datadog/monitors.tf create mode 100644 terraform/datadog/outputs.tf create mode 100644 terraform/datadog/providers.tf create mode 100644 terraform/datadog/slos.tf create mode 100644 terraform/datadog/terraform.tfvars create mode 100644 terraform/datadog/variables.tf diff --git a/terraform/datadog/.terraform.lock.hcl b/terraform/datadog/.terraform.lock.hcl new file mode 100644 index 00000000..e26b9c1a --- /dev/null +++ b/terraform/datadog/.terraform.lock.hcl @@ -0,0 +1,47 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/datadog/datadog" { + version = "3.81.0" + constraints = "~> 3.80" + hashes = [ + "h1:XUxhYjInwt1cfF80s42fU+ri4wX42zhWjbH7s42cCEE=", + "zh:200ab37bea383700f20621e7a310920496a2f341bd0fdee800de2d02f389c1cc", + "zh:2872776abeb51b7f882cdc49ca08b9ae1c45ee264c34092d6cd355dfc77d2d70", + "zh:4197d5532eeb1859333a246e088d7ba3c177d5453efcc8be84bea890fe1174e3", + "zh:5b31ab2ecd467bf18a8bb4407817d7da78b4fb5614776b248a3c9d8a850f8b40", + "zh:69dea7efa3c62d7c8c13a24694262ebeab398d29acf864089e55ab74be00ac9a", + "zh:794cb600d2986c442aa478aae9e35abe4b6ad0a825d5ddd8e87b1162a27d9653", + "zh:80ea1d581040702f0c2431673378989e40736d4268d5c26e2bac7d09e4079997", + "zh:95a2eb917286966a0f56696f395728d53ebd2f397311c944df014b1bc186927c", + "zh:a161e98a9900249bc81b835f6c715ff7501375f18e24bbf063000ee0b8f78d00", + "zh:a626843a6d017c5aae7ca761d5a2a03a5a2b0fe9d542805bd66ca7180beae6b2", + "zh:b8e443f025d14f047dcef653964568798c2a12979e453184146a16c513e778c5", + "zh:bb79839217aca5223dd58cb7998de43ffd2ee6e7cefa2661e52a7064b19d66fe", + "zh:c8ed857f7dd56dff35d5f753a3078fa41336f69f13ac11948d277dd12af9d0ae", + "zh:e3a85c2635ac77835d34398356fac043e4dba5d3b23ca56302afbe89c1cc300a", + ] +} + +provider "registry.terraform.io/hashicorp/aws" { + version = "6.23.0" + constraints = "6.23.0" + hashes = [ + "h1:rue7iU6J6MznSVz3EOAy/0Ko26Gh0oH1QYxqAxUmqWg=", + "zh:254eab8da4534fe937cb48992e4843d4165a6455eb6e989ef6cae7ebeb6fc107", + "zh:3c3913e4853c491ae3b65560fd5911c21959b57f2d1f890c252a12d4ac201852", + "zh:5baf2900b4b5fd7be40b0ce219359a6ec9a8896f267b932a249cee6cf0eccf09", + "zh:6c0784a51dfc31390a4d6a93f61c91c307fe8625872227236ba4dac5a59d33f6", + "zh:6c6af8b96d14893edb4e23c877a406ed85b970546c3b318edb20da05e505cd43", + "zh:732724c69ec82582804052ebe1f01d470e4af9a6c42a2a5d17df5da3d7e6e5af", + "zh:869c3c71510d9079ca7147a1660c233cffc872840995c4a3745ae26a59431414", + "zh:99db418c8b2473366a944164b7ce25c8fbcfd8ba6d1146154d885ce898351be0", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:a54c2a3672d50a4c17ca04230af7a236b3d7f13418cc4b1982c2c4523744d5de", + "zh:acfd7014b33af2f36da63ebda72c500dc3e9bf1f2497b5423480bfce7db22fe8", + "zh:af0a81fcdf53064521ff9fb3d4e3e98098d5de04ed2938704807b2ef454456b5", + "zh:b042d5546151256a08f58eb7eaa1f65911b77be4ae2988c993d7ee94de1520a8", + "zh:cd033e0e52a2f13b59ab2e9cb84870306f3143d0793e0161a7ec12765e949a28", + "zh:d04a9e1861a08bdbe3a2be4c77680933cbbd8b87e6879d8079c3e5cd8d0f1ee7", + ] +} diff --git a/terraform/datadog/backend.tf b/terraform/datadog/backend.tf new file mode 100644 index 00000000..5bcfc7cf --- /dev/null +++ b/terraform/datadog/backend.tf @@ -0,0 +1,9 @@ +terraform { + backend "s3" { + bucket = "eatda-tf-state" + key = "datadog/terraform.tfstate" + region = "ap-northeast-2" + encrypt = true + dynamodb_table = "eatda-tf-lock" + } +} diff --git a/terraform/datadog/integrations.tf b/terraform/datadog/integrations.tf new file mode 100644 index 00000000..da68d859 --- /dev/null +++ b/terraform/datadog/integrations.tf @@ -0,0 +1,44 @@ +resource "datadog_webhook" "discord_alert" { + custom_headers = null + encode_as = "json" + name = "discord-alert-channel" + payload = jsonencode({ + embeds = [{ + color = 15548997 + description = "$EVENT_MSG" + title = "$EVENT_TITLE" + url = "$LINK" + }] + }) + url = data.aws_ssm_parameter.discord_alert_webhook_url.value +} + +resource "datadog_webhook" "discord_warn" { + custom_headers = null + encode_as = "json" + name = "discord-warn-channel" + payload = jsonencode({ + embeds = [{ + color = 16776960 + description = "$EVENT_MSG" + title = "$EVENT_TITLE" + url = "$LINK" + }] + }) + url = data.aws_ssm_parameter.discord_warn_webhook_url.value +} + +resource "datadog_webhook" "discord-recovery" { + custom_headers = null + encode_as = "json" + name = "discord-warn-channel-recovery" + payload = jsonencode({ + embeds = [{ + color = 5763719 + description = "$EVENT_MSG" + title = "$EVENT_TITLE" + url = "$LINK" + }] + }) + url = data.aws_ssm_parameter.discord_recovery_webhook_url.value +} diff --git a/terraform/datadog/locals.tf b/terraform/datadog/locals.tf new file mode 100644 index 00000000..2bc2b919 --- /dev/null +++ b/terraform/datadog/locals.tf @@ -0,0 +1,41 @@ +data "aws_ssm_parameter" "datadog_api" { + name = "/prod/DD_API_KEY" +} + +data "aws_ssm_parameter" "datadog_app" { + name = "/prod/DD_APP_KEY" +} + +data "aws_ssm_parameter" "discord_alert_webhook_url" { + name = "/discord/discord_alert_webhook_url" +} + +data "aws_ssm_parameter" "discord_warn_webhook_url" { + name = "/discord/discord_warn_webhook_url" +} + +data "aws_ssm_parameter" "discord_recovery_webhook_url" { + name = "/discord/discord_recovery_webhook_url" +} + +locals { + notification_footer = <<-EOT + + --- + + {{#is_alert}} + 🚨 **CRITICAL ALERT** + @webhook-discord-alert-channel + {{/is_alert}} + + {{#is_warning}} + ⚠️ **WARNING ALERT** + @webhook-discord-warn-channel + {{/is_warning}} + + {{#is_recovery}} + βœ… **RECOVERY** + @webhook-discord-warn-channel-recovery + {{/is_recovery}} + EOT +} diff --git a/terraform/datadog/monitors.tf b/terraform/datadog/monitors.tf new file mode 100644 index 00000000..b6c829a4 --- /dev/null +++ b/terraform/datadog/monitors.tf @@ -0,0 +1,217 @@ +resource "datadog_monitor" "cpu_usage" { + draft_status = "published" + enable_logs_sample = null + enable_samples = null + escalation_message = null + evaluation_delay = 0 + force_delete = null + group_retention_duration = null + groupby_simple_monitor = null + include_tags = true + message = <<-EOT + ## 🚨 [System] CPU μ‚¬μš©λŸ‰ μž„κ³„μΉ˜ 초과 + **Host:** {{host.name}} / **Usage:** {{value}}% + + {{#is_alert}} + **[CRITICAL] CPUκ°€ 포화 μƒνƒœμž…λ‹ˆλ‹€. (90% 이상)** + - **영ν–₯:** μ• ν”Œλ¦¬μΌ€μ΄μ…˜ 처리 μ§€μ—° 및 νƒ€μž„μ•„μ›ƒ λ°œμƒ μœ„ν—˜ + - **쑰치:** `top` λͺ…λ Ήμ–΄λ‘œ κ³ λΆ€ν•˜ ν”„λ‘œμ„ΈμŠ€ 확인 및 μŠ€μΌ€μΌλ§ κ²€ν†  ν•„μš” + {{/is_alert}} + + {{#is_warning}} + **[WARNING] CPU μ‚¬μš©λŸ‰μ΄ μ¦κ°€ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. (70% 이상)** + - 배치 μž‘μ—…μ΄λ‚˜ νŠΉμ • ν”„λ‘œμ„ΈμŠ€μ˜ λ¦¬μ†ŒμŠ€ 점유율 확인 ν•„μš” + {{/is_warning}} + + ${local.notification_footer} + EOT + name = "CPU usage is high for host {{host.name}}" + new_group_delay = 300 + notification_preset_name = null + notify_audit = false + notify_by = [] + notify_no_data = false + on_missing_data = null + priority = null + query = "avg(last_5m):100 - avg:system.cpu.idle{*} by {host} > 90" + renotify_interval = 0 + renotify_occurrences = 0 + renotify_statuses = null + require_full_window = false + restricted_roles = null + tags = ["integration:host"] + timeout_h = 0 + type = "query alert" + validate = null + monitor_thresholds { + critical = jsonencode(90) + critical_recovery = null + ok = null + unknown = null + warning = jsonencode(70) + warning_recovery = null + } +} + +resource "datadog_monitor" "memory_usage" { + draft_status = "published" + enable_logs_sample = null + enable_samples = null + escalation_message = null + evaluation_delay = 0 + force_delete = null + group_retention_duration = null + groupby_simple_monitor = null + include_tags = false + message = <<-EOT + ## πŸ’Ύ [System] λ©”λͺ¨λ¦¬ λΆ€μ‘± μœ„ν—˜ + **Host:** {{host.name}} / **Usage:** {{value}}% + + {{#is_alert}} + **[CRITICAL] κ°€μš© λ©”λͺ¨λ¦¬κ°€ 10% λ―Έλ§Œμž…λ‹ˆλ‹€.** + - **영ν–₯:** OOM Killer둜 μΈν•œ μ£Όμš” ν”„λ‘œμ„ΈμŠ€ κ°•μ œ μ’…λ£Œ μœ„ν—˜ + - **쑰치:** λ©”λͺ¨λ¦¬ λˆ„μˆ˜ 확인 및 덀프 뢄석 ꢌμž₯ + {{/is_alert}} + + {{#is_warning}} + **[WARNING] λ©”λͺ¨λ¦¬ μ‚¬μš©λŸ‰μ΄ μ•ˆμ „ ꡬ간을 λ²—μ–΄λ‚¬μŠ΅λ‹ˆλ‹€.** + - 지속적인 증가 좔세인지 λͺ¨λ‹ˆν„°λ§ ν•„μš” + {{/is_warning}} + + ${local.notification_footer} + EOT + name = "Memory space is high for host {{host.name}}" + new_group_delay = 300 + notification_preset_name = null + notify_audit = false + on_missing_data = "default" + priority = null + query = "avg(last_5m):avg:system.mem.pct_usable{*} by {host} > 0.9" + renotify_interval = 0 + renotify_occurrences = 0 + renotify_statuses = null + require_full_window = false + restricted_roles = null + tags = ["integration:host"] + timeout_h = 0 + type = "query alert" + validate = null + monitor_thresholds { + critical = jsonencode(0.9) + critical_recovery = null + ok = null + unknown = null + warning = jsonencode(0.7) + warning_recovery = null + } +} + +resource "datadog_monitor" "eatda_availability" { + draft_status = "published" + enable_logs_sample = null + enable_samples = null + escalation_message = null + evaluation_delay = 0 + force_delete = null + group_retention_duration = null + groupby_simple_monitor = null + include_tags = true + message = <<-EOT + ## πŸ”₯ [Service] κ°€μš©μ„±(Availability) SLO μœ„ν—˜ + **Metric:** 30-day Availability Burn Rate + + {{#is_alert}} + **[CRITICAL] μ—λŸ¬μœ¨μ΄ κΈ‰μ¦ν•˜μ—¬ κ°€μš©μ„± λͺ©ν‘œλ₯Ό μœ„ν˜‘ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€.** + - **영ν–₯:** λ‹€μˆ˜μ˜ API μš”μ²­ μ‹€νŒ¨ (5xx Error) + - **쑰치:** APM을 ν†΅ν•œ μ—λŸ¬ 원인 뢄석 및 졜근 배포 사항 점검 + {{/is_alert}} + + {{#is_warning}} + **[WARNING] μ—λŸ¬ μ˜ˆμ‚° μ†Œμ§„ 속도가 빨라지고 μžˆμŠ΅λ‹ˆλ‹€.** + - 간헐적인 μ—λŸ¬ λ°œμƒ μ—¬λΆ€ 확인 ν•„μš” + {{/is_warning}} + + ${local.notification_footer} + EOT + name = "[Eatda-prod-api] Availability - 30d" + new_group_delay = 0 + notification_preset_name = null + notify_audit = false + notify_by = [] + notify_no_data = false + on_missing_data = null + priority = null + query = "burn_rate(\"c2ba09c7153a5bcd91e9ba4f92245579\").over(\"30d\").long_window(\"1h\").short_window(\"5m\") > 14.4" + renotify_interval = 0 + renotify_occurrences = 0 + renotify_statuses = null + require_full_window = false + restricted_roles = null + tags = [] + timeout_h = 0 + type = "slo alert" + validate = null + monitor_thresholds { + critical = jsonencode(14.4) + critical_recovery = null + ok = null + unknown = null + warning = jsonencode(5.6) + warning_recovery = null + } +} + +resource "datadog_monitor" "eatda_latency" { + draft_status = "published" + enable_logs_sample = null + enable_samples = null + escalation_message = null + evaluation_delay = 0 + force_delete = null + group_retention_duration = null + groupby_simple_monitor = null + include_tags = true + message = <<-EOT + ## 🐒 [Service] 응닡 μ§€μ—°(Latency) 감지 + **Target:** P95 < 500ms + + {{#is_alert}} + **[CRITICAL] 응닡 속도가 λͺ©ν‘œμΉ˜λ³΄λ‹€ ν˜„μ €νžˆ λŠλ¦½λ‹ˆλ‹€.** + - **영ν–₯:** μ‚¬μš©μž κ²½ν—˜ μ €ν•˜ 및 ν΄λΌμ΄μ–ΈνŠΈ νƒ€μž„μ•„μ›ƒ + - **쑰치:** Slow Query 확인 및 APM 병λͺ© ꡬ간 뢄석 + {{/is_alert}} + + {{#is_warning}} + **[WARNING] 응닡 속도가 ν‰μ†Œλ³΄λ‹€ λŠλ €μ§€κ³  μžˆμŠ΅λ‹ˆλ‹€.** + - νŠΉμ • API의 μ„±λŠ₯ μ €ν•˜μΈμ§€ 확인 ν•„μš” + {{/is_warning}} + + ${local.notification_footer} + EOT + name = "[Eatda-prod-api] Latency (P95 < 500ms)" + new_group_delay = 0 + notification_preset_name = null + notify_audit = false + notify_by = [] + notify_no_data = false + on_missing_data = null + priority = null + query = "burn_rate(\"7beca231285d5639b23be8d182cd8d4a\").over(\"30d\").long_window(\"6h\").short_window(\"30m\") > 14.4" + renotify_interval = 0 + renotify_occurrences = 0 + renotify_statuses = null + require_full_window = false + restricted_roles = null + tags = [] + timeout_h = 0 + type = "slo alert" + validate = null + monitor_thresholds { + critical = jsonencode(14.4) + critical_recovery = null + ok = null + unknown = null + warning = jsonencode(5.6) + warning_recovery = null + } +} diff --git a/terraform/datadog/outputs.tf b/terraform/datadog/outputs.tf new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/terraform/datadog/outputs.tf @@ -0,0 +1 @@ + diff --git a/terraform/datadog/providers.tf b/terraform/datadog/providers.tf new file mode 100644 index 00000000..652a5a99 --- /dev/null +++ b/terraform/datadog/providers.tf @@ -0,0 +1,19 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "6.23.0" + } + + datadog = { + source = "DataDog/datadog" + version = "~> 3.80" + } + } +} + +provider "datadog" { + api_key = data.aws_ssm_parameter.datadog_api.value + app_key = data.aws_ssm_parameter.datadog_app.value + api_url = "https://api.us5.datadoghq.com/" +} diff --git a/terraform/datadog/slos.tf b/terraform/datadog/slos.tf new file mode 100644 index 00000000..7faafd23 --- /dev/null +++ b/terraform/datadog/slos.tf @@ -0,0 +1,60 @@ +resource "datadog_service_level_objective" "eatda_latency" { + description = "30일간 P95κ°€ 500ms μ΄λ‚΄λ‘œ μœ μ§€λ˜λŠ” SLO" + force_delete = null + groups = null + monitor_ids = null + name = "[Eatda-prod-api] Latency (P95 < 500ms)" + tags = ["service:eatda-api-prod"] + target_threshold = 99 + timeframe = "30d" + type = "time_slice" + validate = null + warning_threshold = 99.5 + sli_specification { + time_slice { + comparator = "<=" + query_interval_seconds = 300 + threshold = 0.5 + query { + formula { + formula_expression = "query1" + } + query { + metric_query { + data_source = "metrics" + name = "query1" + query = "p95:trace.servlet.request{env:prod}" + } + } + } + } + } + thresholds { + target = 99 + timeframe = "30d" + warning = 99.5 + } +} + +resource "datadog_service_level_objective" "eatda_availability" { + description = "30일간 κ°€μš©μ„± SLO" + force_delete = null + groups = null + monitor_ids = null + name = "[Eatda-prod-api] Availability - 30d" + tags = ["service:eatda-api-prod"] + target_threshold = 99 + timeframe = "30d" + type = "metric" + validate = null + warning_threshold = 99.5 + query { + denominator = "count:trace.servlet.request{service:eatda-api-prod}.as_count()" + numerator = "count:trace.servlet.request{service:eatda-api-prod, !http.status_code:5*, error:false}.as_count()" + } + thresholds { + target = 99 + timeframe = "30d" + warning = 99.5 + } +} diff --git a/terraform/datadog/terraform.tfvars b/terraform/datadog/terraform.tfvars new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/terraform/datadog/terraform.tfvars @@ -0,0 +1 @@ + diff --git a/terraform/datadog/variables.tf b/terraform/datadog/variables.tf new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/terraform/datadog/variables.tf @@ -0,0 +1 @@ + From bb3616063152f7e99e581f10f00c95c907b05da1 Mon Sep 17 00:00:00 2001 From: lvalentine6 Date: Sun, 30 Nov 2025 00:20:55 +0900 Subject: [PATCH 2/9] =?UTF-8?q?chore:=20=EC=9B=94=EA=B0=84=20=EC=84=9C?= =?UTF-8?q?=EB=B2=84=20=EB=A6=AC=ED=8F=AC=ED=8A=B8=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/monthly-server-report.yml | 34 ++++ scripts/monthly-server-report.py | 187 ++++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 .github/workflows/monthly-server-report.yml create mode 100644 scripts/monthly-server-report.py diff --git a/.github/workflows/monthly-server-report.yml b/.github/workflows/monthly-server-report.yml new file mode 100644 index 00000000..5dea4122 --- /dev/null +++ b/.github/workflows/monthly-server-report.yml @@ -0,0 +1,34 @@ +name: monthly-server-report.yml + +on: + + schedule: + - cron: '30 0 1 * *' + workflow_dispatch: + +jobs: + send-report: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install Dependencies + run: | + pip install boto3 datadog-api-client requests python-dateutil + + - name: Run Report Script + env: + DD_API_KEY: ${{ secrets.DD_API_KEY }} + DD_APP_KEY: ${{ secrets.DD_APP_KEY }} + DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_MONTHLY_SERVER_REPORT_WEBHOOK }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.AWS_REGION }} + run: python scripts/monthly-server-report.py + diff --git a/scripts/monthly-server-report.py b/scripts/monthly-server-report.py new file mode 100644 index 00000000..0918e4fa --- /dev/null +++ b/scripts/monthly-server-report.py @@ -0,0 +1,187 @@ +import boto3 +import os +import requests +from datadog_api_client import ApiClient, Configuration +from datadog_api_client.v1.api.events_api import EventsApi +from datadog_api_client.v1.api.service_level_objectives_api import ServiceLevelObjectivesApi +from datetime import datetime +from dateutil.relativedelta import relativedelta + +DD_API_KEY = os.getenv("DD_API_KEY") +DD_APP_KEY = os.getenv("DD_APP_KEY") +DISCORD_WEBHOOK_URL = os.getenv("DISCORD_WEBHOOK_URL") +AWS_REGION = os.getenv("AWS_REGION", "ap-northeast-2") +WAF_WEB_ACL_NAME = "eatda-web-acl" + +SLO_AVAILABILITY_ID = "c2ba09c7153a5bcd91e9ba4f92245579" +SLO_LATENCY_ID = "7beca231285d5639b23be8d182cd8d4a" + + +def get_date_ranges(): + today = datetime.now() + this_month_start = today.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + last_month_start = this_month_start - relativedelta(months=1) + month_before_last_start = last_month_start - relativedelta(months=1) + + report_period = { + 'start_ts': int(last_month_start.timestamp()), + 'end_ts': int(this_month_start.timestamp()), + 'start_iso': last_month_start.strftime('%Y-%m-%d'), + 'end_iso': this_month_start.strftime('%Y-%m-%d'), + 'month_str': last_month_start.strftime("%Yλ…„ %mμ›”") + } + + prev_period = { + 'start_iso': month_before_last_start.strftime('%Y-%m-%d'), + 'end_iso': last_month_start.strftime('%Y-%m-%d') + } + + return report_period, prev_period + + +def normalize_slo_value(value): + if value is None: + return 0.0 + if 0 < value <= 1.0: + return value * 100 + return value + + +def get_datadog_metrics(start_ts, end_ts): + configuration = Configuration() + configuration.api_key["apiKeyAuth"] = DD_API_KEY + configuration.api_key["appKeyAuth"] = DD_APP_KEY + + data = {'slo_avail': 0.0, 'slo_latency': 0.0, 'alert_count': 0} + + try: + with ApiClient(configuration) as api_client: + slo_api = ServiceLevelObjectivesApi(api_client) + + try: + avail = slo_api.get_slo_history(SLO_AVAILABILITY_ID, from_ts=start_ts, to_ts=end_ts) + data['slo_avail'] = normalize_slo_value(avail.data.overall.sli_value) + except Exception as e: + print(f"⚠️ Error fetching Availability SLO: {e}") + + try: + latency = slo_api.get_slo_history(SLO_LATENCY_ID, from_ts=start_ts, to_ts=end_ts) + data['slo_latency'] = normalize_slo_value(latency.data.overall.sli_value) + except Exception as e: + print(f"⚠️ Error fetching Latency SLO: {e}") + + event_api = EventsApi(api_client) + try: + events = event_api.list_events( + start=start_ts, + end=end_ts, + tags="status:error,source:monitor,service:eatda-api-prod" + ) + data['alert_count'] = len(events.events) if events.events else 0 + except Exception as e: + print(f"⚠️ Error fetching Events: {e}") + except Exception as e: + print(f"❌ Critical Datadog API Error: {e}") + + return data + + +def get_aws_waf_stats(start_iso, end_iso): + client = boto3.client('cloudwatch', region_name=AWS_REGION) + + def get_metric(metric_name): + try: + response = client.get_metric_statistics( + Namespace='AWS/WAFV2', + MetricName=metric_name, + Dimensions=[ + {'Name': 'WebACL', 'Value': WAF_WEB_ACL_NAME}, + {'Name': 'Region', 'Value': AWS_REGION}, + {'Name': 'Rule', 'Value': 'ALL'} + ], + StartTime=start_iso, + EndTime=end_iso, + Period=2592000, + Statistics=['Sum'] + ) + if response['Datapoints']: + return int(response['Datapoints'][0]['Sum']) + print(f"⚠️ No datapoints for WAF metric: {metric_name}") + return 0 + except Exception as e: + print(f"❌ Error fetching WAF metric {metric_name}: {e}") + return 0 + + return { + 'allowed': get_metric('AllowedRequests'), + 'blocked': get_metric('BlockedRequests') + } + + +def get_total_cost(start_iso, end_iso): + client = boto3.client('ce', region_name='us-east-1') + try: + response = client.get_cost_and_usage( + TimePeriod={'Start': start_iso, 'End': end_iso}, + Granularity='MONTHLY', + Metrics=['UnblendedCost'] + ) + if response['ResultsByTime']: + return float(response['ResultsByTime'][0]['Total']['UnblendedCost']['Amount']) + return 0.0 + except Exception as e: + print(f"❌ Error fetching AWS Cost: {e}") + return 0.0 + + +def send_discord_report(): + print("πŸš€ Starting Monthly Report Generation...") + current_period, prev_period = get_date_ranges() + + dd_data = get_datadog_metrics(current_period['start_ts'], current_period['end_ts']) + waf_data = get_aws_waf_stats(current_period['start_iso'], current_period['end_iso']) + + curr_cost = get_total_cost(current_period['start_iso'], current_period['end_iso']) + prev_cost = get_total_cost(prev_period['start_iso'], prev_period['end_iso']) + + cost_diff = curr_cost - prev_cost + cost_diff_str = f"+${cost_diff:.2f}" if cost_diff >= 0 else f"-${abs(cost_diff):.2f}" + cost_emoji = "πŸ“ˆ" if cost_diff > 0 else "πŸ“‰" if cost_diff < 0 else "➑️" + + total_req = waf_data['allowed'] + waf_data['blocked'] + if total_req == 0: total_req = 1 + + message = f""" +πŸ“Š **[Eatda] {current_period['month_str']} μ›”κ°„ 톡합 리포트** + +**1. Datadog (μ„œλΉ„μŠ€ ν’ˆμ§ˆ)** +- 🩺 **κ°€μš©μ„± SLO:** `{dd_data['slo_avail']:.3f}%` +- 🐒 **응닡속도 SLO:** `{dd_data['slo_latency']:.3f}%` +- 🚨 **λ°œμƒν•œ μž₯μ• :** `{dd_data['alert_count']}건` + +**2. AWS WAF (λ³΄μ•ˆ)** +- πŸ›‘οΈ **총 μš”μ²­:** `{total_req:,}건` +- βœ… **ν—ˆμš©λ¨:** `{waf_data['allowed']:,}건` ({waf_data['allowed'] / total_req * 100:.1f}%) +- 🚫 **차단됨:** `{waf_data['blocked']:,}건` + +**3. AWS Cost (λΉ„μš©)** +- πŸ’° **이번 달:** `${curr_cost:.2f}` +- {cost_emoji} **μ „μ›” λŒ€λΉ„:** `{cost_diff_str}` ({'증가' if cost_diff > 0 else 'κ°μ†Œ' if cost_diff < 0 else 'λ³€λ™μ—†μŒ'}) +""".strip() + + try: + requests.post(DISCORD_WEBHOOK_URL, json={ + "username": "Eatda Manager", + "embeds": [{ + "title": f"πŸ“… {current_period['month_str']} 운영 κ²°μ‚° 보고", + "description": message, + "color": 5763719 if dd_data['slo_avail'] >= 99 else 15548997 + }] + }) + print("βœ… Report sent successfully to Discord!") + except Exception as e: + print(f"❌ Failed to send Discord webhook: {e}") + + +if __name__ == "__main__": + send_discord_report() From 529f815ed605891e3fdc3fdca4045701977d73b4 Mon Sep 17 00:00:00 2001 From: lvalentine6 Date: Sun, 30 Nov 2025 00:23:37 +0900 Subject: [PATCH 3/9] =?UTF-8?q?chore:=20=ED=85=8C=EC=8A=A4=ED=8A=B8?= =?UTF-8?q?=EC=9A=A9=20=EC=9B=8C=ED=81=AC=ED=94=8C=EB=A1=9C=20=EC=A1=B0?= =?UTF-8?q?=EA=B1=B4=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/monthly-server-report.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/monthly-server-report.yml b/.github/workflows/monthly-server-report.yml index 5dea4122..34d24cef 100644 --- a/.github/workflows/monthly-server-report.yml +++ b/.github/workflows/monthly-server-report.yml @@ -1,7 +1,9 @@ name: monthly-server-report.yml on: - + push: + branches: + - feat/PRODUCT-281 schedule: - cron: '30 0 1 * *' workflow_dispatch: From a388fc36f714ffb8e97fb8185adb83d95b911bab Mon Sep 17 00:00:00 2001 From: lvalentine6 Date: Sun, 30 Nov 2025 00:37:35 +0900 Subject: [PATCH 4/9] =?UTF-8?q?chore:=20=EB=B6=88=ED=95=84=EC=9A=94=20?= =?UTF-8?q?=EC=98=B5=EC=85=98=20=EC=82=AD=EC=A0=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/monthly-server-report.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/monthly-server-report.py b/scripts/monthly-server-report.py index 0918e4fa..1b21bf92 100644 --- a/scripts/monthly-server-report.py +++ b/scripts/monthly-server-report.py @@ -97,7 +97,6 @@ def get_metric(metric_name): Dimensions=[ {'Name': 'WebACL', 'Value': WAF_WEB_ACL_NAME}, {'Name': 'Region', 'Value': AWS_REGION}, - {'Name': 'Rule', 'Value': 'ALL'} ], StartTime=start_iso, EndTime=end_iso, From 71a63df32368ff64ce07abd1d71b2958e2ec06d2 Mon Sep 17 00:00:00 2001 From: lvalentine6 Date: Sun, 30 Nov 2025 00:52:05 +0900 Subject: [PATCH 5/9] =?UTF-8?q?fix:=20=EB=8D=B0=EC=9D=B4=ED=84=B0=EB=8F=85?= =?UTF-8?q?=20=EB=A6=AC=EC=A0=84=20=EC=B6=94=EA=B0=80=20=EB=B0=8F=20?= =?UTF-8?q?=EB=82=A0=EC=A7=9C=20=EC=98=B5=EC=85=98=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/monthly-server-report.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/monthly-server-report.py b/scripts/monthly-server-report.py index 1b21bf92..15551ad4 100644 --- a/scripts/monthly-server-report.py +++ b/scripts/monthly-server-report.py @@ -26,6 +26,8 @@ def get_date_ranges(): report_period = { 'start_ts': int(last_month_start.timestamp()), 'end_ts': int(this_month_start.timestamp()), + 'start_dt': last_month_start, + 'end_dt': this_month_start, 'start_iso': last_month_start.strftime('%Y-%m-%d'), 'end_iso': this_month_start.strftime('%Y-%m-%d'), 'month_str': last_month_start.strftime("%Yλ…„ %mμ›”") @@ -49,6 +51,7 @@ def normalize_slo_value(value): def get_datadog_metrics(start_ts, end_ts): configuration = Configuration() + configuration.server_variables["site"] = "us5" configuration.api_key["apiKeyAuth"] = DD_API_KEY configuration.api_key["appKeyAuth"] = DD_APP_KEY From 6372323d1145c86e3776759c6045f2669a68c1fd Mon Sep 17 00:00:00 2001 From: lvalentine6 Date: Sun, 30 Nov 2025 00:53:38 +0900 Subject: [PATCH 6/9] =?UTF-8?q?fix:=20=EB=8D=B0=EC=9D=B4=ED=84=B0=EB=8F=85?= =?UTF-8?q?=20=EB=A6=AC=EC=A0=84=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/monthly-server-report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/monthly-server-report.py b/scripts/monthly-server-report.py index 15551ad4..67703ae4 100644 --- a/scripts/monthly-server-report.py +++ b/scripts/monthly-server-report.py @@ -51,7 +51,7 @@ def normalize_slo_value(value): def get_datadog_metrics(start_ts, end_ts): configuration = Configuration() - configuration.server_variables["site"] = "us5" + configuration.host = "https://api.us5.datadoghq.com" configuration.api_key["apiKeyAuth"] = DD_API_KEY configuration.api_key["appKeyAuth"] = DD_APP_KEY From 7dc931dcd371860818d28c6577fe388d211f20c2 Mon Sep 17 00:00:00 2001 From: lvalentine6 Date: Sun, 30 Nov 2025 00:57:19 +0900 Subject: [PATCH 7/9] =?UTF-8?q?fix:=20=EB=82=A0=EC=A7=9C=20=ED=8F=AC?= =?UTF-8?q?=EB=A7=B7=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/monthly-server-report.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/monthly-server-report.py b/scripts/monthly-server-report.py index 67703ae4..d41730d2 100644 --- a/scripts/monthly-server-report.py +++ b/scripts/monthly-server-report.py @@ -89,7 +89,7 @@ def get_datadog_metrics(start_ts, end_ts): return data -def get_aws_waf_stats(start_iso, end_iso): +def get_aws_waf_stats(start_dt, end_dt): client = boto3.client('cloudwatch', region_name=AWS_REGION) def get_metric(metric_name): @@ -101,13 +101,14 @@ def get_metric(metric_name): {'Name': 'WebACL', 'Value': WAF_WEB_ACL_NAME}, {'Name': 'Region', 'Value': AWS_REGION}, ], - StartTime=start_iso, - EndTime=end_iso, - Period=2592000, + StartTime=start_dt, + EndTime=end_dt, + Period=86400, Statistics=['Sum'] ) if response['Datapoints']: - return int(response['Datapoints'][0]['Sum']) + return int(sum([dp['Sum'] for dp in response['Datapoints']])) + print(f"⚠️ No datapoints for WAF metric: {metric_name}") return 0 except Exception as e: @@ -141,7 +142,8 @@ def send_discord_report(): current_period, prev_period = get_date_ranges() dd_data = get_datadog_metrics(current_period['start_ts'], current_period['end_ts']) - waf_data = get_aws_waf_stats(current_period['start_iso'], current_period['end_iso']) + + waf_data = get_aws_waf_stats(current_period['start_dt'], current_period['end_dt']) curr_cost = get_total_cost(current_period['start_iso'], current_period['end_iso']) prev_cost = get_total_cost(prev_period['start_iso'], prev_period['end_iso']) From 8fa631cbdeb13aa36ce1de9408f8803a590c64f9 Mon Sep 17 00:00:00 2001 From: lvalentine6 Date: Sun, 30 Nov 2025 01:11:20 +0900 Subject: [PATCH 8/9] =?UTF-8?q?fix:=20=EB=82=A0=EC=A7=9C=20=EC=98=B5?= =?UTF-8?q?=EC=85=98=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/monthly-server-report.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/scripts/monthly-server-report.py b/scripts/monthly-server-report.py index d41730d2..40c541f0 100644 --- a/scripts/monthly-server-report.py +++ b/scripts/monthly-server-report.py @@ -93,19 +93,20 @@ def get_aws_waf_stats(start_dt, end_dt): client = boto3.client('cloudwatch', region_name=AWS_REGION) def get_metric(metric_name): + response = client.get_metric_statistics( + Namespace='AWS/WAFV2', + MetricName=metric_name, + Dimensions=[ + {'Name': 'WebACL', 'Value': WAF_WEB_ACL_NAME}, + {'Name': 'Rule', 'Value': 'ALL'}, + {'Name': 'Region', 'Value': AWS_REGION}, + ], + StartTime=start_dt, + EndTime=end_dt, + Period=86400, + Statistics=['Sum'] + ) try: - response = client.get_metric_statistics( - Namespace='AWS/WAFV2', - MetricName=metric_name, - Dimensions=[ - {'Name': 'WebACL', 'Value': WAF_WEB_ACL_NAME}, - {'Name': 'Region', 'Value': AWS_REGION}, - ], - StartTime=start_dt, - EndTime=end_dt, - Period=86400, - Statistics=['Sum'] - ) if response['Datapoints']: return int(sum([dp['Sum'] for dp in response['Datapoints']])) From 4f2e812b1b554b11a2b60fe401f451773f2056d1 Mon Sep 17 00:00:00 2001 From: lvalentine6 Date: Sun, 30 Nov 2025 01:22:59 +0900 Subject: [PATCH 9/9] =?UTF-8?q?fix:=20=ED=85=8C=EC=8A=A4=ED=8A=B8=20?= =?UTF-8?q?=EC=A1=B0=EA=B1=B4=20=EC=82=AD=EC=A0=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/monthly-server-report.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/monthly-server-report.yml b/.github/workflows/monthly-server-report.yml index 34d24cef..92297672 100644 --- a/.github/workflows/monthly-server-report.yml +++ b/.github/workflows/monthly-server-report.yml @@ -1,9 +1,6 @@ name: monthly-server-report.yml on: - push: - branches: - - feat/PRODUCT-281 schedule: - cron: '30 0 1 * *' workflow_dispatch: