diff --git a/modules/regional-go-service/README.md b/modules/regional-go-service/README.md index eb766227..d4fe0b83 100644 --- a/modules/regional-go-service/README.md +++ b/modules/regional-go-service/README.md @@ -75,7 +75,6 @@ No requirements. | Name | Source | Version | |------|--------|---------| | [audit-serviceaccount](#module\_audit-serviceaccount) | ../audit-serviceaccount | n/a | -| [otel-collector](#module\_otel-collector) | ../otel-collector | n/a | ## Resources @@ -86,6 +85,8 @@ No requirements. | [google_cloud_run_v2_service_iam_member.public-services-are-unauthenticated](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloud_run_v2_service_iam_member) | resource | | [google_monitoring_alert_policy.anomalous-service-access](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | | [google_monitoring_alert_policy.bad-rollout](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | +| [google_project_iam_member.metrics-writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | +| [google_project_iam_member.trace-writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [ko_build.this](https://registry.terraform.io/providers/ko-build/ko/latest/docs/resources/build) | resource | | [google_client_openid_userinfo.me](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_openid_userinfo) | data source | | [google_project.project](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/project) | data source | @@ -101,6 +102,7 @@ No requirements. | [labels](#input\_labels) | Labels to apply to the service. 
| `map(string)` | `{}` | no | | [name](#input\_name) | n/a | `string` | n/a | yes | | [notification\_channels](#input\_notification\_channels) | List of notification channels to alert. | `list(string)` | n/a | yes | +| [otel\_collector\_image](#input\_otel\_collector\_image) | The otel collector image to use as a base. Must be on gcr.io or dockerhub. | `string` | `"chainguard/opentelemetry-collector-contrib:latest"` | no | | [project\_id](#input\_project\_id) | n/a | `string` | n/a | yes | | [regional-volumes](#input\_regional-volumes) | The volumes to make available to the containers in the service for mounting. |
list(object({
name = string
gcs = optional(map(object({
bucket = string
read_only = optional(bool, true)
})), {})
nfs = optional(map(object({
server = string
path = string
read_only = optional(bool, true)
})), {})
}))
| `[]` | no | | [regions](#input\_regions) | A map from region names to a network and subnetwork. A pub/sub topic and ingress service (publishing to the respective topic) will be created in each region, with the ingress service configured to egress all traffic via the specified subnetwork. |
map(object({
network = string
subnet = string
}))
| n/a | yes |
diff --git a/modules/regional-go-service/main.tf b/modules/regional-go-service/main.tf
index ee925e9e..db50fad9 100644
--- a/modules/regional-go-service/main.tf
+++ b/modules/regional-go-service/main.tf
@@ -18,6 +18,18 @@ module "audit-serviceaccount" {
   notification_channels = var.notification_channels
 }
 
+resource "google_project_iam_member" "metrics-writer" {
+  project = var.project_id
+  role    = "roles/monitoring.metricWriter"
+  member  = "serviceAccount:${var.service_account}"
+}
+
+resource "google_project_iam_member" "trace-writer" {
+  project = var.project_id
+  role    = "roles/cloudtrace.agent"
+  member  = "serviceAccount:${var.service_account}"
+}
+
 // Build each of the application images from source.
 resource "ko_build" "this" {
   for_each = var.containers
@@ -33,14 +45,6 @@ resource "cosign_sign" "this" {
   conflict = "REPLACE"
 }
 
-// Build our otel-collector sidecar image.
-module "otel-collector" {
-  source = "../otel-collector"
-
-  project_id      = var.project_id
-  service_account = var.service_account
-}
-
 // Deploy the service into each of our regions.
resource "google_cloud_run_v2_service" "this" { for_each = var.regions @@ -132,7 +136,15 @@ resource "google_cloud_run_v2_service" "this" { } } } - containers { image = module.otel-collector.image } + containers { + image = var.otel_collector_image + // config via env is an option; https://pkg.go.dev/go.opentelemetry.io/collector/service#section-readme + args = ["--config=env:OTEL_CONFIG"] + env { + name = "OTEL_CONFIG" + value = file("${path.module}/otel-config/config.yaml") + } + } dynamic "volumes" { for_each = var.volumes diff --git a/modules/regional-go-service/otel-config/config.yaml b/modules/regional-go-service/otel-config/config.yaml new file mode 100644 index 00000000..7200c61a --- /dev/null +++ b/modules/regional-go-service/otel-config/config.yaml @@ -0,0 +1,83 @@ +receivers: + prometheus: + config: + scrape_configs: + - job_name: "localhost" + scrape_interval: 10s + static_configs: + # TODO: make this configurable + - targets: ["localhost:2112"] + # Do not relabel job and instance labels if existed. + honor_labels: true + metric_relabel_configs: + - source_labels: [ __name__ ] + regex: '^prometheus_.*' + action: drop + - source_labels: [ __name__ ] + regex: '^process_.*' + action: drop + - source_labels: [ __name__ ] + regex: '^go_.*' + action: drop + +processors: + batch: + # batch metrics before sending to reduce API usage + send_batch_max_size: 200 + send_batch_size: 200 + timeout: 5s + + memory_limiter: + # drop metrics if memory usage gets too high + check_interval: 1s + limit_percentage: 65 + spike_limit_percentage: 20 + + # automatically detect Cloud Run resource metadata + resourcedetection: + detectors: [env, gcp] + + resource: + attributes: + # Add instance_id as a resource attribute, so to avoid race conditions + # between multiple otel sidecar instance uploading overlapping time series + # to the same buckets. 
+ - key: service.instance.id + from_attribute: faas.id + action: upsert + # The `gcp` resourcedetection processor sets `faas.name` to the name of the + # Cloud Run service or the Cloud Run job. + - from_attribute: faas.name + # The googlemanagedprometheus exporter consumes `service.name` attribute + # and set the `job` resource label to this value. (See + # https://github.com/GoogleCloudPlatform/opentelemetry-operations-go/pull/764) + key: "service.name" + action: upsert + +exporters: + googlemanagedprometheus: + sending_queue: + enabled: true + # we are handling metrics for a single pod, no need to have + # too many senders. this will also avoid out-of-order data. + num_consumers: 1 + +extensions: + health_check: + +service: + telemetry: + logs: + # We don't want to see scraper startup logging every + # cold start. + level: "error" + # Stack trace is less useful and break lines. + disable_stacktrace: true + encoding: json + + extensions: [health_check] + pipelines: + metrics: + receivers: [prometheus] + processors: [batch, memory_limiter, resourcedetection, resource] + exporters: [googlemanagedprometheus] diff --git a/modules/regional-go-service/variables.tf b/modules/regional-go-service/variables.tf index 813e312f..7eb01fc0 100644 --- a/modules/regional-go-service/variables.tf +++ b/modules/regional-go-service/variables.tf @@ -166,3 +166,9 @@ variable "labels" { type = map(string) default = {} } + +variable "otel_collector_image" { + type = string + default = "chainguard/opentelemetry-collector-contrib:latest" + description = "The otel collector image to use as a base. Must be on gcr.io or dockerhub." +}