diff --git a/README.md b/README.md index df812ae..9b089c5 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,7 @@ Configuration could be defined in [prometheus-rds-exporter.yaml](https://github. | collect-quotas | Collect AWS RDS quotas (AWS quotas API) | true | | collect-usages | Collect AWS RDS usages (AWS Cloudwatch API) | true | | debug | Enable debug mode | | +| enable-otel-traces | Enable OpenTelemetry traces. See [configuration](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/) | false | | listen-address | Address to listen on for web interface | :9043 | | log-format | Log format (`text` or `json`) | json | | metrics-path | Path under which to expose metrics | /metrics | @@ -538,3 +539,23 @@ make helm-test # Helm unit test make kubeconform # Kubernetes manifest validation make checkcov # Check misconfigurations ``` + +### Tracing + +Prometheus RDS Exporter includes an OpenTelemetry trace exporter to facilitate troubleshooting. + +Traces can be forwarded to any OpenTelemetry server using the gRPC protocol. + +1. Export the `OTEL_EXPORTER_OTLP_ENDPOINT` variable. + + ```bash + export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 + ``` + + See [OTEL SDK configuration](https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/#general-sdk-configuration) and [OpenTelemetry environment variables](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/) for all options. + +1. 
Start exporter with OpenTelemetry enabled + + ```bash + prometheus-rds-exporter --enable-otel-traces + ``` diff --git a/cmd/root.go b/cmd/root.go index d1b78df..241006c 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -45,6 +45,7 @@ type exporterConfig struct { CollectMaintenances bool `mapstructure:"collect-maintenances"` CollectQuotas bool `mapstructure:"collect-quotas"` CollectUsages bool `mapstructure:"collect-usages"` + OTELTracesEnabled bool `mapstructure:"enable-otel-traces"` } func run(configuration exporterConfig) { @@ -86,10 +87,11 @@ func run(configuration exporterConfig) { prometheus.MustRegister(collector) serverConfiguration := http.Config{ - ListenAddress: configuration.ListenAddress, - MetricPath: configuration.MetricPath, - TLSCertPath: configuration.TLSCertPath, - TLSKeyPath: configuration.TLSKeyPath, + ListenAddress: configuration.ListenAddress, + MetricPath: configuration.MetricPath, + TLSCertPath: configuration.TLSCertPath, + TLSKeyPath: configuration.TLSKeyPath, + OTELTracesEnabled: configuration.OTELTracesEnabled, } server := http.New(*logger, serverConfiguration) @@ -124,6 +126,7 @@ func NewRootCommand() (*cobra.Command, error) { cmd.Flags().StringVarP(&cfgFile, "config", "c", "", "config file (default is $HOME/prometheus-rds-exporter.yaml)") cmd.Flags().BoolP("debug", "d", false, "Enable debug mode") + cmd.Flags().BoolP("enable-otel-traces", "", false, "Enable OpenTelemetry traces") cmd.Flags().StringP("log-format", "l", "json", "Log format (text or json)") cmd.Flags().StringP("metrics-path", "", "/metrics", "Path under which to expose metrics") cmd.Flags().StringP("tls-cert-path", "", "", "Path to TLS certificate") @@ -149,6 +152,11 @@ func NewRootCommand() (*cobra.Command, error) { return cmd, fmt.Errorf("failed to bind 'log-format' parameter: %w", err) } + err = viper.BindPFlag("enable-otel-traces", cmd.Flags().Lookup("enable-otel-traces")) + if err != nil { + return cmd, fmt.Errorf("failed to bind 'enable-otel-traces' parameter: %w", err) 
+ } + err = viper.BindPFlag("metrics-path", cmd.Flags().Lookup("metrics-path")) if err != nil { return cmd, fmt.Errorf("failed to bind 'metrics-path' parameter: %w", err) diff --git a/configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml b/configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml index 5ac69c0..a3fcd71 100644 --- a/configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml +++ b/configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml @@ -20,6 +20,10 @@ # Path to private key for TLS # tls-key-path: "" +# Enable OpenTelemetry traces +# See https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter for configuration parameters +# enable-otel-traces: true + # # AWS credentials # diff --git a/go.mod b/go.mod index e8aa7a2..073e8e1 100644 --- a/go.mod +++ b/go.mod @@ -12,10 +12,15 @@ require ( github.com/aws/aws-sdk-go-v2/service/rds v1.75.1 github.com/aws/aws-sdk-go-v2/service/servicequotas v1.21.2 github.com/aws/aws-sdk-go-v2/service/sts v1.28.4 - github.com/prometheus/client_golang v1.18.0 + github.com/prometheus/client_golang v1.19.0 github.com/spf13/cobra v1.8.0 github.com/spf13/viper v1.18.2 github.com/stretchr/testify v1.9.0 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.47.0 + go.opentelemetry.io/otel v1.22.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.22.0 + go.opentelemetry.io/otel/sdk v1.22.0 + go.opentelemetry.io/otel/trace v1.22.0 golang.org/x/exp v0.0.0-20231127185646-65229373498e ) @@ -29,9 +34,15 @@ require ( github.com/aws/aws-sdk-go-v2/service/ssooidc v1.23.2 // indirect github.com/aws/smithy-go v1.20.1 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/go-logr/logr v1.4.1 // 
indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect @@ -49,9 +60,16 @@ require ( github.com/spf13/cast v1.6.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/subosito/gotenv v1.6.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.22.0 // indirect + go.opentelemetry.io/otel/metric v1.22.0 // indirect + go.opentelemetry.io/proto/otlp v1.0.0 // indirect go.uber.org/multierr v1.11.0 // indirect + golang.org/x/net v0.20.0 // indirect golang.org/x/sys v0.16.0 // indirect golang.org/x/text v0.14.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20231106174013-bbf56f31fb17 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20231120223509-83a465c0220f // indirect + google.golang.org/grpc v1.60.1 // indirect google.golang.org/protobuf v1.33.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 6ea1777..8a2a95a 100644 --- a/go.sum +++ b/go.sum @@ -34,6 +34,8 @@ github.com/aws/smithy-go v1.20.1 h1:4SZlSlMr36UEqC7XOyRVb27XMeZubNcBNN+9IgEPIQw= github.com/aws/smithy-go v1.20.1/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= +github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod 
h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= @@ -41,12 +43,27 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/glog v1.1.2 h1:DVjP2PbBOzHyzA+dn3WhHIq4NdVu3Q+pvivFICf/7fo= +github.com/golang/glog v1.1.2/go.mod h1:zR+okUeTbrL6EL3xHUDxZuEtGv04p5shwip1+mL/rLQ= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 
h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 h1:YBftPWNWd4WwGqtY2yeZL2ef8rHAxPBD8KFhJpmcqms= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0/go.mod h1:YN5jB8ie0yfIUg6VvR9Kz84aCaG7AsGZnLjhHbUqwPg= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -105,14 +122,45 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.47.0 h1:sv9kVfal0MK0wBMCOGr+HeJm9v803BkJxGrk2au7j08= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.47.0/go.mod h1:SK2UL73Zy1quvRPonmOmRDiWk1KBV3LyIeeIxcEApWw= +go.opentelemetry.io/otel v1.22.0 h1:xS7Ku+7yTFvDfDraDIJVpw7XPyuHlB9MCiqqX5mcJ6Y= +go.opentelemetry.io/otel v1.22.0/go.mod h1:eoV4iAi3Ea8LkAEI9+GFT44O6T/D0GWAVFyZVCC6pMI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.22.0 h1:9M3+rhx7kZCIQQhQRYaZCdNu1V73tm4TvXs2ntl98C4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.22.0/go.mod h1:noq80iT8rrHP1SfybmPiRGc9dc5M8RPmGvtwo7Oo7tc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.22.0 h1:H2JFgRcGiyHg7H7bwcwaQJYrNFqCqrbTQ8K4p1OvDu8= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.22.0/go.mod h1:WfCWp1bGoYK8MeULtI15MmQVczfR+bFkk0DF3h06QmQ= +go.opentelemetry.io/otel/metric v1.22.0 h1:lypMQnGyJYeuYPhOM/bgjbFM6WE44W1/T45er4d8Hhg= +go.opentelemetry.io/otel/metric v1.22.0/go.mod 
h1:evJGjVpZv0mQ5QBRJoBF64yMuOf4xCWdXjK8pzFvliY= +go.opentelemetry.io/otel/sdk v1.22.0 h1:6coWHw9xw7EfClIC/+O31R8IY3/+EiRFHevmHafB2Gw= +go.opentelemetry.io/otel/sdk v1.22.0/go.mod h1:iu7luyVGYovrRpe2fmj3CVKouQNdTOkxtLzPvPz1DOc= +go.opentelemetry.io/otel/trace v1.22.0 h1:Hg6pPujv0XG9QaVbGOBVHunyuLcCC3jN7WEhPx83XD0= +go.opentelemetry.io/otel/trace v1.22.0/go.mod h1:RbbHXVqKES9QhzZq/fE5UnOSILqRt40a21sPw2He1xo= +go.opentelemetry.io/proto/otlp v1.0.0 h1:T0TX0tmXU8a3CbNXzEKGeU5mIVOdf0oykP+u2lIVU/I= +go.opentelemetry.io/proto/otlp v1.0.0/go.mod h1:Sy6pihPLfYHkr3NkUbEhGHFhINUSI/v80hjKIs5JXpM= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= golang.org/x/exp v0.0.0-20231127185646-65229373498e h1:Gvh4YaCaXNs6dKTlfgismwWZKyjVZXwOPfIyUaqU3No= golang.org/x/exp v0.0.0-20231127185646-65229373498e/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= +golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= +golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto v0.0.0-20231106174013-bbf56f31fb17 h1:wpZ8pe2x1Q3f2KyT5f8oP/fa9rHAKgFPr/HZdNuS+PQ= +google.golang.org/genproto v0.0.0-20231106174013-bbf56f31fb17/go.mod h1:J7XzRzVy1+IPwWHZUzoD0IccYZIrXILAQpc+Qy9CMhY= +google.golang.org/genproto/googleapis/api v0.0.0-20231106174013-bbf56f31fb17 
h1:JpwMPBpFN3uKhdaekDpiNlImDdkUAyiJ6ez/uxGaUSo= +google.golang.org/genproto/googleapis/api v0.0.0-20231106174013-bbf56f31fb17/go.mod h1:0xJLfVdJqpAPl8tDg1ujOCGzx6LFLttXT5NhllGOXY4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231120223509-83a465c0220f h1:ultW7fxlIvee4HYrtnaRPon9HpEgFk5zYpmfMgtKB5I= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231120223509-83a465c0220f/go.mod h1:L9KNLi232K1/xB6f7AlSX692koaRnKaWSR0stBki0Yc= +google.golang.org/grpc v1.60.1 h1:26+wFr+cNqSGFcOXcabYC0lUVJVRa2Sb2ortSK7VrEU= +google.golang.org/grpc v1.60.1/go.mod h1:OlCHIeLYqSSsLi6i49B5QGdzaMZK9+M7LXN2FKz4eGM= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/internal/app/cloudwatch/usage.go b/internal/app/cloudwatch/usage.go index 8a76a49..7924d79 100644 --- a/internal/app/cloudwatch/usage.go +++ b/internal/app/cloudwatch/usage.go @@ -11,8 +11,12 @@ import ( aws_cloudwatch "github.com/aws/aws-sdk-go-v2/service/cloudwatch" aws_cloudwatch_types "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" converter "github.com/qonto/prometheus-rds-exporter/internal/app/unit" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/codes" ) +var tracer = otel.Tracer("github/qonto/prometheus-rds-exporter/internal/app/cloudwatch") + type UsageMetrics struct { AllocatedStorage float64 DBInstances float64 @@ -112,14 +116,16 @@ func generateCloudWatchQueriesForUsage() *aws_cloudwatch.GetMetricDataInput { } } -func NewUsageFetcher(client CloudWatchClient, logger slog.Logger) *usageFetcher { +func NewUsageFetcher(ctx context.Context, client CloudWatchClient, logger 
slog.Logger) *usageFetcher { return &usageFetcher{ + ctx: ctx, client: client, logger: &logger, } } type usageFetcher struct { + ctx context.Context client CloudWatchClient statistics Statistics logger *slog.Logger @@ -131,6 +137,9 @@ func (u *usageFetcher) GetStatistics() Statistics { // GetUsageMetrics returns RDS service usages metrics func (u *usageFetcher) GetUsageMetrics() (UsageMetrics, error) { + _, span := tracer.Start(u.ctx, "collect-usage") + defer span.End() + metrics := UsageMetrics{} query := generateCloudWatchQueriesForUsage() @@ -152,10 +161,15 @@ func (u *usageFetcher) GetUsageMetrics() (UsageMetrics, error) { if len(m.Values) > 0 { err = metrics.Update(*m.Label, m.Values[0]) if err != nil { + span.SetStatus(codes.Error, "can't update internal values") + span.RecordError(err) + return metrics, fmt.Errorf("can't update internal values: %w", err) } } } + span.SetStatus(codes.Ok, "metrics fetched") + return metrics, nil } diff --git a/internal/app/cloudwatch/usage_test.go b/internal/app/cloudwatch/usage_test.go index 3d4ec37..ba444db 100644 --- a/internal/app/cloudwatch/usage_test.go +++ b/internal/app/cloudwatch/usage_test.go @@ -1,6 +1,7 @@ package cloudwatch_test import ( + "context" "log/slog" "testing" @@ -42,7 +43,8 @@ func TestGetUsageMetrics(t *testing.T) { }, } - fetcher := cloudwatch.NewUsageFetcher(client, slog.Logger{}) + ctx := context.TODO() + fetcher := cloudwatch.NewUsageFetcher(ctx, client, slog.Logger{}) result, err := fetcher.GetUsageMetrics() require.NoError(t, err, "GetUsageMetrics must succeed") diff --git a/internal/app/ec2/ec2.go b/internal/app/ec2/ec2.go index f3491ee..0f68376 100644 --- a/internal/app/ec2/ec2.go +++ b/internal/app/ec2/ec2.go @@ -7,13 +7,18 @@ import ( aws_ec2 "github.com/aws/aws-sdk-go-v2/service/ec2" aws_ec2_types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/qonto/prometheus-rds-exporter/internal/app/trace" converter "github.com/qonto/prometheus-rds-exporter/internal/app/unit" + 
"go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/codes" ) const ( maxInstanceTypesPerEC2APIRequest int = 100 // Limit the number of instance types per request due to AWS API limits ) +var tracer = otel.Tracer("github/qonto/prometheus-rds-exporter/internal/app/ec2") + type EC2InstanceMetrics struct { MaximumIops int32 MaximumThroughput float64 @@ -33,13 +38,15 @@ type EC2Client interface { DescribeInstanceTypes(ctx context.Context, input *aws_ec2.DescribeInstanceTypesInput, fn ...func(*aws_ec2.Options)) (*aws_ec2.DescribeInstanceTypesOutput, error) } -func NewFetcher(client EC2Client) *EC2Fetcher { +func NewFetcher(context context.Context, client EC2Client) *EC2Fetcher { return &EC2Fetcher{ + ctx: context, client: client, } } type EC2Fetcher struct { + ctx context.Context client EC2Client statistics Statistics } @@ -51,9 +58,17 @@ func (e *EC2Fetcher) GetStatistics() Statistics { // GetDBInstanceTypeInformation returns information about specified AWS EC2 instance types // AWS RDS API use "db." prefix while AWS EC2 API don't so we must remove it to obtains instance type information func (e *EC2Fetcher) GetDBInstanceTypeInformation(instanceTypes []string) (Metrics, error) { + ctx, span := tracer.Start(e.ctx, "collect-ec2-metrics") + defer span.End() + metrics := make(map[string]EC2InstanceMetrics) for _, instances := range chunkBy(instanceTypes, maxInstanceTypesPerEC2APIRequest) { + _, instanceTypeSpan := tracer.Start(ctx, "collect-ec2-instance-types-metrics") + defer instanceTypeSpan.End() + + instanceTypeSpan.SetAttributes(trace.AWSInstanceTypesCount(int64(len(instances)))) + // Remove "db." 
prefix from instance types instanceTypesToFetch := make([]aws_ec2_types.InstanceType, len(instances)) for i, instance := range instances { @@ -64,6 +79,9 @@ func (e *EC2Fetcher) GetDBInstanceTypeInformation(instanceTypes []string) (Metri resp, err := e.client.DescribeInstanceTypes(context.TODO(), input) if err != nil { + instanceTypeSpan.SetStatus(codes.Error, "can't fetch describe instance types") + instanceTypeSpan.RecordError(err) + return Metrics{}, fmt.Errorf("can't fetch describe instance types: %w", err) } @@ -78,6 +96,8 @@ func (e *EC2Fetcher) GetDBInstanceTypeInformation(instanceTypes []string) (Metri Memory: converter.MegaBytesToBytes(*i.MemoryInfo.SizeInMiB), } } + + instanceTypeSpan.SetStatus(codes.Ok, "metrics fetched") } return Metrics{ diff --git a/internal/app/ec2/ec2_test.go b/internal/app/ec2/ec2_test.go index 3d7d830..5a8ba2f 100644 --- a/internal/app/ec2/ec2_test.go +++ b/internal/app/ec2/ec2_test.go @@ -1,6 +1,7 @@ package ec2_test import ( + "context" "testing" "github.com/qonto/prometheus-rds-exporter/internal/app/ec2" @@ -11,10 +12,11 @@ import ( ) func TestGetDBInstanceTypeInformation(t *testing.T) { + context := context.TODO() client := mock.EC2Client{} instanceTypes := []string{"db.t3.large", "db.t3.small"} - fetcher := ec2.NewFetcher(client) + fetcher := ec2.NewFetcher(context, client) result, err := fetcher.GetDBInstanceTypeInformation(instanceTypes) require.NoError(t, err, "GetDBInstanceTypeInformation must succeed") diff --git a/internal/app/exporter/exporter.go b/internal/app/exporter/exporter.go index 1be2f77..df616c6 100644 --- a/internal/app/exporter/exporter.go +++ b/internal/app/exporter/exporter.go @@ -2,6 +2,7 @@ package exporter import ( + "context" "fmt" "log/slog" "strconv" @@ -13,6 +14,10 @@ import ( "github.com/qonto/prometheus-rds-exporter/internal/app/rds" "github.com/qonto/prometheus-rds-exporter/internal/app/servicequotas" "github.com/qonto/prometheus-rds-exporter/internal/infra/build" + + "go.opentelemetry.io/otel" + 
"go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" ) const ( @@ -20,6 +25,8 @@ const ( exporterDownStatusCode float64 = 0 ) +var tracer = otel.Tracer("github/qonto/prometheus-rds-exporter/internal/app/exporter") + type Configuration struct { CollectInstanceMetrics bool CollectInstanceTags bool @@ -48,6 +55,7 @@ type metrics struct { } type rdsCollector struct { + ctx context.Context wg sync.WaitGroup logger slog.Logger counters counters @@ -346,7 +354,7 @@ func (c *rdsCollector) fetchMetrics() error { // Fetch RDS instances metrics c.logger.Info("get RDS metrics") - rdsFetcher := rds.NewFetcher(c.rdsClient, rds.Configuration{ + rdsFetcher := rds.NewFetcher(c.ctx, c.rdsClient, rds.Configuration{ CollectLogsSize: c.configuration.CollectLogsSize, CollectMaintenances: c.configuration.CollectMaintenances, }) @@ -385,6 +393,9 @@ func (c *rdsCollector) getCloudwatchMetrics(client cloudwatch.CloudWatchClient, defer c.wg.Done() c.logger.Debug("fetch cloudwatch metrics") + _, span := tracer.Start(c.ctx, "collect-cloudwatch-metrics") + defer span.End() + fetcher := cloudwatch.NewRDSFetcher(client, c.logger) metrics, err := fetcher.GetRDSInstanceMetrics(instanceIdentifiers) @@ -402,7 +413,7 @@ func (c *rdsCollector) getUsagesMetrics(client cloudwatch.CloudWatchClient) { defer c.wg.Done() c.logger.Debug("fetch usage metrics") - fetcher := cloudwatch.NewUsageFetcher(client, c.logger) + fetcher := cloudwatch.NewUsageFetcher(c.ctx, client, c.logger) metrics, err := fetcher.GetUsageMetrics() if err != nil { @@ -420,7 +431,7 @@ func (c *rdsCollector) getEC2Metrics(client ec2.EC2Client, instanceTypes []strin defer c.wg.Done() c.logger.Debug("fetch EC2 metrics") - fetcher := ec2.NewFetcher(client) + fetcher := ec2.NewFetcher(c.ctx, client) metrics, err := fetcher.GetDBInstanceTypeInformation(instanceTypes) if err != nil { @@ -436,18 +447,26 @@ func (c *rdsCollector) getEC2Metrics(client ec2.EC2Client, instanceTypes []strin func (c *rdsCollector) 
getQuotasMetrics(client servicequotas.ServiceQuotasClient) { defer c.wg.Done() + + ctx, span := tracer.Start(c.ctx, "collect-quota-metrics") + defer span.End() + c.logger.Debug("fetch quotas") - fetcher := servicequotas.NewFetcher(client) + fetcher := servicequotas.NewFetcher(ctx, client) metrics, err := fetcher.GetRDSQuotas() if err != nil { c.counters.Errors++ c.logger.Error(fmt.Sprintf("can't fetch service quota metrics: %s", err)) + span.SetStatus(codes.Error, "can't fetch service quota metrics") + span.RecordError(err) } c.counters.ServiceQuotasAPICalls += fetcher.GetStatistics().UsageAPICall c.metrics.ServiceQuota = metrics + + span.SetStatus(codes.Ok, "quota fetched") } func (c *rdsCollector) getInstanceTagLabels(dbidentifier string, instance rds.RdsInstanceMetrics) (keys []string, values []string) { @@ -476,6 +495,11 @@ func (c *rdsCollector) Collect(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric(c.exporterBuildInformation, prometheus.GaugeValue, 1, build.Version, build.CommitSHA, build.Date) ch <- prometheus.MustNewConstMetric(c.errors, prometheus.CounterValue, c.counters.Errors) + var span trace.Span + + c.ctx, span = tracer.Start(context.TODO(), "collect-metrics") + defer span.End() + // Get all metrics err := c.fetchMetrics() if err != nil { @@ -483,8 +507,14 @@ func (c *rdsCollector) Collect(ch chan<- prometheus.Metric) { // Mark exporter as down ch <- prometheus.MustNewConstMetric(c.up, prometheus.CounterValue, exporterDownStatusCode) + span.SetStatus(codes.Error, "failed to get metrics") + span.RecordError(err) + return } + + span.End() + ch <- prometheus.MustNewConstMetric(c.up, prometheus.CounterValue, exporterUpStatusCode) // RDS metrics diff --git a/internal/app/rds/rds.go b/internal/app/rds/rds.go index fb0972b..2c86c4c 100644 --- a/internal/app/rds/rds.go +++ b/internal/app/rds/rds.go @@ -12,6 +12,9 @@ import ( aws_rds "github.com/aws/aws-sdk-go-v2/service/rds" aws_rds_types "github.com/aws/aws-sdk-go-v2/service/rds/types" 
converter "github.com/qonto/prometheus-rds-exporter/internal/app/unit" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/codes" + semconv "go.opentelemetry.io/otel/semconv/v1.24.0" ) type Configuration struct { @@ -91,6 +94,8 @@ const ( replicaRole string = "replica" ) +var tracer = otel.Tracer("github/qonto/prometheus-rds-exporter/internal/app/rds") + var instanceStatuses = map[string]int{ "available": InstanceStatusAvailable, "backing-up": InstanceStatusBackingUp, @@ -109,14 +114,16 @@ type RDSClient interface { DescribeDBLogFiles(context.Context, *aws_rds.DescribeDBLogFilesInput, ...func(*aws_rds.Options)) (*aws_rds.DescribeDBLogFilesOutput, error) } -func NewFetcher(client RDSClient, configuration Configuration) RDSFetcher { +func NewFetcher(ctx context.Context, client RDSClient, configuration Configuration) RDSFetcher { return RDSFetcher{ + ctx: ctx, client: client, configuration: configuration, } } type RDSFetcher struct { + ctx context.Context client RDSClient statistics Statistics configuration Configuration @@ -126,13 +133,19 @@ func (r *RDSFetcher) GetStatistics() Statistics { return r.statistics } -func (r *RDSFetcher) getPendingMaintenances() (map[string]string, error) { +func (r *RDSFetcher) getPendingMaintenances(ctx context.Context) (map[string]string, error) { + _, span := tracer.Start(ctx, "collect-pending-maintenances") + defer span.End() + instances := make(map[string]string) inputMaintenance := &aws_rds.DescribePendingMaintenanceActionsInput{} maintenances, err := r.client.DescribePendingMaintenanceActions(context.TODO(), inputMaintenance) if err != nil { + span.SetStatus(codes.Error, "failed to get maintenances") + span.RecordError(err) + return nil, fmt.Errorf("can't describe pending maintenance actions: %w", err) } @@ -161,10 +174,15 @@ func (r *RDSFetcher) getPendingMaintenances() (map[string]string, error) { instances[dbIdentifier] = maintenanceMode } + span.SetStatus(codes.Ok, "maintenances fetched") + return instances, nil } func 
(r *RDSFetcher) GetInstancesMetrics() (Metrics, error) { + ctx, span := tracer.Start(r.ctx, "collect-instance-metrics") + defer span.End() + metrics := make(map[string]RdsInstanceMetrics) var err error @@ -172,8 +190,11 @@ func (r *RDSFetcher) GetInstancesMetrics() (Metrics, error) { var instanceMaintenances map[string]string if r.configuration.CollectMaintenances { - instanceMaintenances, err = r.getPendingMaintenances() + instanceMaintenances, err = r.getPendingMaintenances(ctx) if err != nil { + span.SetStatus(codes.Error, "can't get RDS maintenances") + span.RecordError(err) + return Metrics{}, fmt.Errorf("can't get RDS maintenances: %w", err) } } @@ -182,30 +203,43 @@ func (r *RDSFetcher) GetInstancesMetrics() (Metrics, error) { paginator := aws_rds.NewDescribeDBInstancesPaginator(r.client, input) for paginator.HasMorePages() { + instanceCtx, instanceSpan := tracer.Start(ctx, "collect-rds-instances") + defer instanceSpan.End() + r.statistics.RdsAPICall++ output, err := paginator.NextPage(context.TODO()) if err != nil { - return Metrics{}, fmt.Errorf("can't get instances: %w", err) + span.SetStatus(codes.Error, "can't get RDS instances") + span.RecordError(err) + + return Metrics{}, fmt.Errorf("can't get RDS instances: %w", err) } for _, dbInstance := range output.DBInstances { dbIdentifier := dbInstance.DBInstanceIdentifier - instanceMetrics, err := r.computeInstanceMetrics(dbInstance, instanceMaintenances) + instanceMetrics, err := r.computeInstanceMetrics(instanceCtx, dbInstance, instanceMaintenances) if err != nil { + span.SetStatus(codes.Error, "can't compute instance metrics") + span.RecordError(err) + return Metrics{}, fmt.Errorf("can't compute instance metrics for %s: %w", *dbIdentifier, err) } metrics[*dbIdentifier] = instanceMetrics } + + instanceSpan.SetStatus(codes.Ok, "instance metrics fetch") } + span.SetStatus(codes.Ok, "metrics fetched") + return Metrics{Instances: metrics}, nil } // computeInstanceMetrics returns metrics about the specified 
instance -func (r *RDSFetcher) computeInstanceMetrics(dbInstance aws_rds_types.DBInstance, instanceMaintenances map[string]string) (RdsInstanceMetrics, error) { +func (r *RDSFetcher) computeInstanceMetrics(ctx context.Context, dbInstance aws_rds_types.DBInstance, instanceMaintenances map[string]string) (RdsInstanceMetrics, error) { dbIdentifier := dbInstance.DBInstanceIdentifier var iops int64 @@ -253,7 +287,7 @@ func (r *RDSFetcher) computeInstanceMetrics(dbInstance aws_rds_types.DBInstance, if r.configuration.CollectLogsSize { var err error - logFilesSize, err = r.getLogFilesSize(*dbIdentifier) + logFilesSize, err = r.getLogFilesSize(ctx, *dbIdentifier) if err != nil { return RdsInstanceMetrics{}, fmt.Errorf("can't get log files size for %d: %w", dbIdentifier, err) } @@ -312,7 +346,12 @@ func (r *RDSFetcher) computeInstanceMetrics(dbInstance aws_rds_types.DBInstance, } // getLogFilesSize returns the size of all logs on the specified instance -func (r *RDSFetcher) getLogFilesSize(dbidentifier string) (*int64, error) { +func (r *RDSFetcher) getLogFilesSize(ctx context.Context, dbidentifier string) (*int64, error) { + _, span := tracer.Start(ctx, "collect-instance-log") + defer span.End() + + span.SetAttributes(semconv.DBInstanceID(dbidentifier)) + var filesSize *int64 input := &aws_rds.DescribeDBLogFilesInput{DBInstanceIdentifier: &dbidentifier} @@ -326,6 +365,9 @@ func (r *RDSFetcher) getLogFilesSize(dbidentifier string) (*int64, error) { return filesSize, nil } + span.SetStatus(codes.Error, "can't describe db logs files") + span.RecordError(err) + return filesSize, fmt.Errorf("can't describe db logs files for %s: %w", dbidentifier, err) } diff --git a/internal/app/rds/rds_test.go b/internal/app/rds/rds_test.go index cb05865..41dd1d9 100644 --- a/internal/app/rds/rds_test.go +++ b/internal/app/rds/rds_test.go @@ -1,6 +1,7 @@ package rds_test import ( + "context" "fmt" "reflect" "testing" @@ -21,9 +22,10 @@ func TestGetMetrics(t *testing.T) { rdsInstance := 
mock.NewRdsInstance() mockDescribeDBInstancesOutput := &aws_rds.DescribeDBInstancesOutput{DBInstances: []aws_rds_types.DBInstance{*rdsInstance}} + ctx := context.TODO() client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{CollectLogsSize: true} - fetcher := rds.NewFetcher(client, configuration) + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -67,10 +69,11 @@ func TestGP2StorageType(t *testing.T) { rdsInstanceWithLargeDisk.StorageType = aws.String("gp2") rdsInstanceWithLargeDisk.AllocatedStorage = aws.Int32(20000) + ctx := context.TODO() mockDescribeDBInstancesOutput := &aws_rds.DescribeDBInstancesOutput{DBInstances: []aws_rds_types.DBInstance{*rdsInstanceWithSmallDisk, *rdsInstanceWithMediumDisk, *rdsInstanceWithLargeDisk}} client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{} - fetcher := rds.NewFetcher(client, configuration) + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -95,10 +98,11 @@ func TestGP3StorageType(t *testing.T) { rdsInstanceWithLargeDisk.AllocatedStorage = aws.Int32(500) rdsInstanceWithLargeDisk.Iops = aws.Int32(12000) + ctx := context.TODO() mockDescribeDBInstancesOutput := &aws_rds.DescribeDBInstancesOutput{DBInstances: []aws_rds_types.DBInstance{*rdsInstanceWithSmallDisk, *rdsInstanceWithLargeDisk}} client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{} - fetcher := rds.NewFetcher(client, configuration) + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -123,10 +127,11 @@ func TestIO1StorageType(t *testing.T) { 
rdsInstanceWithHighIOPS.StorageType = aws.String("io1") rdsInstanceWithHighIOPS.Iops = aws.Int32(64000) + ctx := context.TODO() mockDescribeDBInstancesOutput := &aws_rds.DescribeDBInstancesOutput{DBInstances: []aws_rds_types.DBInstance{*rdsInstanceWithSmallIOPS, *rdsInstanceWithMediumIOPS, *rdsInstanceWithLargeIOPS, *rdsInstanceWithHighIOPS}} client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{} - fetcher := rds.NewFetcher(client, configuration) + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -156,7 +161,8 @@ func TestIO2StorageType(t *testing.T) { mockDescribeDBInstancesOutput := &aws_rds.DescribeDBInstancesOutput{DBInstances: []aws_rds_types.DBInstance{*rdsInstanceWithSmallIOPS, *rdsInstanceWithMediumIOPS, *rdsInstanceWithLargeIOPS, *rdsInstanceWithHighIOPS}} client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{} - fetcher := rds.NewFetcher(client, configuration) + ctx := context.TODO() + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -188,7 +194,8 @@ func TestLogSize(t *testing.T) { DescribeDBLogFilesOutput: mockDescribeDBLogFilesOutput, } configuration := rds.Configuration{CollectLogsSize: true} - fetcher := rds.NewFetcher(client, configuration) + ctx := context.TODO() + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -205,7 +212,8 @@ func TestLogSizeInCreation(t *testing.T) { DescribeDBLogFilesOutputError: &aws_rds_types.DBInstanceNotFoundFault{}, } configuration := rds.Configuration{CollectLogsSize: true} - fetcher := rds.NewFetcher(client, configuration) + ctx := context.TODO() + fetcher 
:= rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() var emptyInt64 *int64 @@ -224,7 +232,8 @@ func TestReplicaNode(t *testing.T) { client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{CollectLogsSize: true} - fetcher := rds.NewFetcher(client, configuration) + ctx := context.TODO() + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -275,7 +284,8 @@ func TestPendingModification(t *testing.T) { client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{} - fetcher := rds.NewFetcher(client, configuration) + ctx := context.TODO() + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -289,9 +299,10 @@ func TestPendingModificationDueToInstanceModification(t *testing.T) { rdsInstance.PendingModifiedValues = &pendingModifications mockDescribeDBInstancesOutput := &aws_rds.DescribeDBInstancesOutput{DBInstances: []aws_rds_types.DBInstance{*rdsInstance}} + ctx := context.TODO() client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{} - fetcher := rds.NewFetcher(client, configuration) + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -304,9 +315,10 @@ func TestPendingModificationDueToUnappliedParameterGroup(t *testing.T) { rdsInstance.DBParameterGroups = []aws_rds_types.DBParameterGroupStatus{{DBParameterGroupName: aws.String("my_parameter_group"), ParameterApplyStatus: aws.String("pending-reboot")}} mockDescribeDBInstancesOutput := &aws_rds.DescribeDBInstancesOutput{DBInstances: []aws_rds_types.DBInstance{*rdsInstance}} + 
ctx := context.TODO() client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{} - fetcher := rds.NewFetcher(client, configuration) + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() require.NoError(t, err, "GetInstancesMetrics must succeed") @@ -320,9 +332,10 @@ func TestInstanceAge(t *testing.T) { rdsInstance.InstanceCreateTime = &creationDate mockDescribeDBInstancesOutput := &aws_rds.DescribeDBInstancesOutput{DBInstances: []aws_rds_types.DBInstance{*rdsInstance}} + ctx := context.TODO() client := mock.RDSClient{DescribeDBInstancesOutput: mockDescribeDBInstancesOutput} configuration := rds.Configuration{} - fetcher := rds.NewFetcher(client, configuration) + fetcher := rds.NewFetcher(ctx, client, configuration) metrics, err := fetcher.GetInstancesMetrics() expectedAge := time.Since(*rdsInstance.InstanceCreateTime) diff --git a/internal/app/servicequotas/servicequotas.go b/internal/app/servicequotas/servicequotas.go index fc7d404..f61b6ec 100644 --- a/internal/app/servicequotas/servicequotas.go +++ b/internal/app/servicequotas/servicequotas.go @@ -7,13 +7,17 @@ import ( "fmt" aws_servicequotas "github.com/aws/aws-sdk-go-v2/service/servicequotas" + "github.com/qonto/prometheus-rds-exporter/internal/app/trace" converter "github.com/qonto/prometheus-rds-exporter/internal/app/unit" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/codes" "golang.org/x/exp/slog" ) var ( errNoQuota = errors.New("no AWS quota with this code") - errQuotaError = errors.New("aws return error for this quota") + errQuotaError = errors.New("AWS return error for this quota") + tracer = otel.Tracer("github/qonto/prometheus-rds-exporter/internal/app/servicequotas") ) const ( @@ -40,13 +44,15 @@ type ServiceQuotasClient interface { GetServiceQuota(ctx context.Context, input *aws_servicequotas.GetServiceQuotaInput, optFns ...func(*aws_servicequotas.Options)) 
(*aws_servicequotas.GetServiceQuotaOutput, error) } -func NewFetcher(client ServiceQuotasClient) *serviceQuotaFetcher { +func NewFetcher(ctx context.Context, client ServiceQuotasClient) *serviceQuotaFetcher { return &serviceQuotaFetcher{ + ctx: ctx, client: client, } } type serviceQuotaFetcher struct { + ctx context.Context logger *slog.Logger client ServiceQuotasClient statistics Statistics @@ -58,6 +64,11 @@ func (s *serviceQuotaFetcher) GetStatistics() Statistics { // GetQuota retrieves and returns the AWS quota value for the specified serviceCode and quotaCode func (s *serviceQuotaFetcher) getQuota(serviceCode string, quotaCode string) (float64, error) { + _, span := tracer.Start(s.ctx, "get-quota") + defer span.End() + + span.SetAttributes(trace.AWSQuotaServiceCode(serviceCode), trace.AWSQuotaCode(quotaCode)) + params := &aws_servicequotas.GetServiceQuotaInput{ ServiceCode: &serviceCode, QuotaCode: "aCode, @@ -65,22 +76,32 @@ func (s *serviceQuotaFetcher) getQuota(serviceCode string, quotaCode string) (fl s.statistics.UsageAPICall++ - result, err := s.client.GetServiceQuota(context.TODO(), params) + result, err := s.client.GetServiceQuota(s.ctx, params) if err != nil { + span.SetStatus(codes.Error, "failed to get quota") + span.RecordError(err) + return 0, fmt.Errorf("can't get %s/%s service quota: %w", serviceCode, quotaCode, err) } // AWS response payload could contains errors (eg. 
missing permission) if result.Quota.ErrorReason != nil { s.logger.Error("AWS quota error: ", "errorCode", result.Quota.ErrorReason.ErrorCode, "message", *result.Quota.ErrorReason.ErrorMessage) + span.SetStatus(codes.Error, "failed to fetch quota") + span.RecordError(errQuotaError) return 0, errQuotaError } if result.Quota == nil { + span.SetStatus(codes.Error, "no quota") + span.RecordError(errQuotaError) + return 0, fmt.Errorf("no quota for %s/%s: %w", serviceCode, quotaCode, errNoQuota) } + span.SetStatus(codes.Ok, "quota fetched") + return *result.Quota.Value, nil } diff --git a/internal/app/servicequotas/servicequotas_test.go b/internal/app/servicequotas/servicequotas_test.go index 6e2ec94..628fda6 100644 --- a/internal/app/servicequotas/servicequotas_test.go +++ b/internal/app/servicequotas/servicequotas_test.go @@ -1,6 +1,7 @@ package servicequotas_test import ( + "context" "testing" "github.com/qonto/prometheus-rds-exporter/internal/app/servicequotas" @@ -12,9 +13,10 @@ import ( ) func TestGetRDSQuotas(t *testing.T) { + context := context.TODO() client := mock.ServiceQuotasClient{} - result, err := servicequotas.NewFetcher(client).GetRDSQuotas() + result, err := servicequotas.NewFetcher(context, client).GetRDSQuotas() require.NoError(t, err, "GetRDSQuotas must succeed") assert.Equal(t, mock.DBinstancesQuota, result.DBinstances, "DbInstance quota is incorrect") assert.Equal(t, converter.GigaBytesToBytes(mock.TotalStorage), result.TotalStorage, "Total storage quota is incorrect") diff --git a/internal/app/trace/attibute.go b/internal/app/trace/attibute.go new file mode 100644 index 0000000..d1272b8 --- /dev/null +++ b/internal/app/trace/attibute.go @@ -0,0 +1,28 @@ +// Package trace provides OTEL tracing resources +package trace + +import "go.opentelemetry.io/otel/attribute" + +// Attribute Naming convention +// +// Use namespacing to avoid name clashes. Delimit the namespaces using a dot character. 
For example service.version denotes the service version where service is the namespace and version is an attribute in that namespace. +// For each multi-word dot-delimited component of the attribute name separate the words by underscores (i.e. use snake_case). +// See https://opentelemetry.io/docs/specs/semconv/general/attribute-naming/ + +const ( + AWSServiceCodeOtelKey = attribute.Key("qonto.prometheus_rds_exporter.aws.quota.service_code") + AWSQuotaCodeOtelKey = attribute.Key("qonto.prometheus_rds_exporter.aws.quota.code") + AWSInstanceTypesCountKey = attribute.Key("qonto.prometheus_rds_exporter.aws.instance-types-count") +) + +func AWSQuotaServiceCode(val string) attribute.KeyValue { + return AWSServiceCodeOtelKey.String(val) +} + +func AWSQuotaCode(val string) attribute.KeyValue { + return AWSQuotaCodeOtelKey.String(val) +} + +func AWSInstanceTypesCount(val int64) attribute.KeyValue { + return AWSInstanceTypesCountKey.Int64(val) +} diff --git a/internal/app/trace/attibute_test.go b/internal/app/trace/attibute_test.go new file mode 100644 index 0000000..cf9bed7 --- /dev/null +++ b/internal/app/trace/attibute_test.go @@ -0,0 +1,43 @@ +package trace_test + +import ( + "reflect" + "testing" + + "github.com/qonto/prometheus-rds-exporter/internal/app/trace" + "go.opentelemetry.io/otel/attribute" +) + +func TestOtelKeys(t *testing.T) { + type test struct { + name string + want attribute.KeyValue + key string + value any + } + + tests := []test{ + {"QuotaServiceCode", trace.AWSQuotaServiceCode("unittest"), "qonto.prometheus_rds_exporter.aws.quota.service_code", "unittest"}, + {"QuotaCode", trace.AWSQuotaCode("unittest"), "qonto.prometheus_rds_exporter.aws.quota.code", "unittest"}, + {"InstanceTypesCount", trace.AWSInstanceTypesCount(42), "qonto.prometheus_rds_exporter.aws.instance-types-count", int64(42)}, + } + + for _, tc := range tests { + if !reflect.DeepEqual(string(tc.want.Key), tc.key) { + t.Fatalf("%s: expected key: %v, got: %v", tc.name, tc.want.Key, tc.key) 
+ } + + switch tc.value.(type) { + case string: + if !reflect.DeepEqual(tc.want.Value.AsString(), tc.value) { + t.Fatalf("%s: expected value: %v, got: %v", tc.name, tc.want.Value.AsString(), tc.value) + } + case int64: + if !reflect.DeepEqual(tc.want.Value.AsInt64(), tc.value) { + t.Fatalf("%s: expected value: %v, got: %v", tc.name, tc.want.Value.AsInt64(), tc.value) + } + default: + t.Fatalf("%s: %s type is not implemented. Add it to the test suite", tc.name, reflect.TypeOf(tc.value)) + } + } +} diff --git a/internal/infra/http/server.go b/internal/infra/http/server.go index a9f5aa9..67bcd1c 100644 --- a/internal/infra/http/server.go +++ b/internal/infra/http/server.go @@ -15,6 +15,8 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/qonto/prometheus-rds-exporter/internal/infra/build" + "github.com/qonto/prometheus-rds-exporter/internal/infra/tracing" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" ) const ( @@ -33,10 +35,11 @@ type Component struct { } type Config struct { - MetricPath string - ListenAddress string - TLSKeyPath string - TLSCertPath string + MetricPath string + ListenAddress string + TLSKeyPath string + TLSCertPath string + OTELTracesEnabled bool } func New(logger slog.Logger, config Config) (component Component) { @@ -66,7 +69,23 @@ func (c *Component) Start() error { return fmt.Errorf("hompage initialization failed: %w", err) } - http.Handle("/", homepage) + if c.config.OTELTracesEnabled { + c.logger.Debug("Enable OTEL traces") + + tracer, err := tracing.SetupOTelSDK(ctx) + if err != nil { + return fmt.Errorf("failed to initialize tracing: %w", err) + } + + defer func() { + err := tracer.Shutdown(ctx) + if err != nil { + c.logger.Error("failed to shutdown tracing. 
Some spans may be missing") + } + }() + } + + http.Handle("/", otelhttp.NewHandler(homepage, "homepage")) http.Handle(c.config.MetricPath, promhttp.Handler()) signalChan := make(chan os.Signal, 1) diff --git a/internal/infra/tracing/tracing.go b/internal/infra/tracing/tracing.go new file mode 100644 index 0000000..0203674 --- /dev/null +++ b/internal/infra/tracing/tracing.go @@ -0,0 +1,52 @@ +// Package tracing provides OpenTelemetry trace +package tracing + +import ( + "context" + "fmt" + "time" + + "github.com/qonto/prometheus-rds-exporter/internal/infra/build" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/sdk/resource" + "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.20.0" +) + +// SetupOTelSDK bootstraps the OpenTelemetry pipeline. +// If it does not return an error, make sure to call shutdown for proper cleanup. +func SetupOTelSDK(ctx context.Context) (*trace.TracerProvider, error) { + tracerProvider, err := newTraceProvider() + if err != nil { + return tracerProvider, err + } + + otel.SetTracerProvider(tracerProvider) + + return tracerProvider, err +} + +func newTraceProvider() (*trace.TracerProvider, error) { + grpcExporter, err := otlptracegrpc.New(context.TODO()) + if err != nil { + return nil, fmt.Errorf("could not initialize gRPC exporter: %w", err) + } + + resources, err := resource.New(context.Background(), + resource.WithAttributes(semconv.ServiceName("prometheus-rds-exporter"), semconv.ServiceVersion(build.Version)), + resource.WithFromEnv(), // pull attributes from OTEL_RESOURCE_ATTRIBUTES and OTEL_SERVICE_NAME environment variables + resource.WithOS(), // This option configures a set of Detectors that discover OS information + resource.WithContainer(), // This option configures a set of Detectors that discover container information + ) + if err != nil { + return nil, fmt.Errorf("failed to initialize otel resources: %w", err) + } + + 
traceProvider := trace.NewTracerProvider( + trace.WithResource(resources), + trace.WithBatcher(grpcExporter, trace.WithBatchTimeout(time.Second)), + ) + + return traceProvider, nil +}