diff --git a/README.md b/README.md
index 3c2cb19..7fdbf00 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,8 @@ It collects key metrics about:
 
 | Name | Labels | Description |
 | ---- | ------ | ----------- |
+| rds_allocated_disk_iops_average | `aws_account_id`, `aws_region`, `dbidentifier` | Allocated disk IOPS |
+| rds_allocated_disk_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Allocated disk throughput |
 | rds_allocated_storage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Allocated storage |
 | rds_api_call_total | `api`, `aws_account_id`, `aws_region` | Number of call to AWS API |
 | rds_backup_retention_period_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | Automatic DB snapshots retention period |
@@ -55,19 +57,19 @@ It collects key metrics about:
 | rds_free_storage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Free storage on the instance |
 | rds_freeable_memory_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Amount of available random access memory. For MariaDB, MySQL, Oracle, and PostgreSQL DB instances, this metric reports the value of the MemAvailable field of /proc/meminfo |
 | rds_instance_age_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | Time since instance creation |
+| rds_instance_baseline_iops_average | `aws_account_id`, `aws_region`, `instance_class` | Baseline IOPS of underlying EC2 instance class |
+| rds_instance_baseline_throughput_bytes | `aws_account_id`, `aws_region`, `instance_class` | Baseline throughput of underlying EC2 instance class |
 | rds_instance_info | `arn`, `aws_account_id`, `aws_region`, `dbi_resource_id`, `dbidentifier`, `deletion_protection`, `engine`, `engine_version`, `instance_class`, `multi_az`, `performance_insights_enabled`, `pending_maintenance`, `pending_modified_values`, `role`, `source_dbidentifier`, `storage_type`, `ca_certificate_identifier` | RDS instance information |
 | rds_instance_log_files_size_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Total of log files on the instance |
-| rds_instance_baseline_iops_average | `aws_account_id`, `aws_region`, `instance_class` | Baseline IOPS of underlying EC2 instance class |
 | rds_instance_max_iops_average | `aws_account_id`, `aws_region`, `instance_class` | Maximum IOPS of underlying EC2 instance class |
 | rds_instance_max_throughput_bytes | `aws_account_id`, `aws_region`, `instance_class` | Maximum throughput of underlying EC2 instance class |
-| rds_instance_baseline_throughput_bytes | `aws_account_id`, `aws_region`, `instance_class` | Baseline throughput of underlying EC2 instance class |
 | rds_instance_memory_bytes | `aws_account_id`, `aws_region`, `instance_class` | Instance class memory |
 | rds_instance_status | `aws_account_id`, `aws_region`, `dbidentifier` | Instance status (1: ok, 0: can't scrap metrics) |
 | rds_instance_tags | `aws_account_id`, `aws_region`, `dbidentifier`, `tag_`... | AWS tags attached to the instance |
 | rds_instance_vcpu_average | `aws_account_id`, `aws_region`, `instance_class` | Total vCPU for this instance class |
 | rds_max_allocated_storage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Upper limit in gibibytes to which Amazon RDS can automatically scale the storage of the DB instance |
-| rds_max_disk_iops_average | `aws_account_id`, `aws_region`, `dbidentifier` | Max IOPS for the instance |
-| rds_max_storage_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Max storage throughput |
+| rds_max_disk_iops_average | `aws_account_id`, `aws_region`, `dbidentifier` | Max disk IOPS evaluated with disk IOPS and EC2 capacity |
+| rds_max_storage_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Max disk throughput evaluated with disk throughput and EC2 capacity |
 | rds_maximum_used_transaction_ids_average | `aws_account_id`, `aws_region`, `dbidentifier` | Maximum transaction IDs that have been used. Applies to only PostgreSQL |
 | rds_quota_max_dbinstances_average | `aws_account_id`, `aws_region` | Maximum number of RDS instances allowed in the AWS account |
 | rds_quota_maximum_db_instance_snapshots_average | `aws_account_id`, `aws_region` | Maximum number of manual DB instance snapshots |
diff --git a/configs/grafana/panels/instance.libsonnet b/configs/grafana/panels/instance.libsonnet
index e52fc56..004dd4e 100644
--- a/configs/grafana/panels/instance.libsonnet
+++ b/configs/grafana/panels/instance.libsonnet
@@ -160,16 +160,15 @@ local colors = common.colors;
     ]),
 
   diskIOPSScaling:
-    ts.base('Disk IOPS', "Regardless of the allocated disk IOPS, the EC2 instance behind RDS also has disk IOPS limits. You can't use more IOPS than EC2's instance limit. Burst IOPS are supported 30 minutes at least once every 24 hours.", [queries.instance.disk.iops.usage, queries.instance.disk.iops.max, queries.instance.disk.iops.instanceTypeBaseline, queries.instance.disk.iops.instanceTypeBurst])
+    ts.base('Disk IOPS', 'The RDS instance cannot use more disk IOPS than supported by the EC2 instance baseline, but it can burst 30 minutes at least once every 24 hours. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-optimized.html', [queries.instance.disk.iops.usage, queries.instance.disk.iops.allocated, queries.instance.disk.iops.instanceTypeBaseline, queries.instance.disk.iops.instanceTypeBurst])
     + options.legend.withSortBy('Max')
     + options.legend.withSortDesc(true)
     + standardOptions.withUnit('locale')
     + standardOptions.withOverrides([
-      fieldOverride.byName.new('Max')
+      fieldOverride.byName.new('Allocated')
       + standardOptions.override.byType.withPropertiesFromOptions(
         color.withMode('fixed')
         + color.withFixedColor(colors.warning)
-        + standardOptions.withDisplayName('Allocated')
         + custom.withFillOpacity(0)
       ),
       fieldOverride.byRegexp.new('.* burst')
@@ -189,16 +188,15 @@ local colors = common.colors;
     ]),
 
   diskThroughputScaling:
-    ts.base('Disk throughput', "Regardless of the allocated disk throughput, the EC2 instance behind RDS also has disk throughput limits. You can't use more throughput than EC2's instance limit. Burst throughput is supported 30 minutes at least once every 24 hours.", [queries.instance.disk.throughput.usage, queries.instance.disk.throughput.max, queries.instance.disk.throughput.instanceTypeBaseline, queries.instance.disk.throughput.instanceTypeBurst])
+    ts.base('Disk throughput', 'The RDS instance cannot use more disk throughput than supported by the EC2 instance baseline, but it can burst 30 minutes at least once every 24 hours. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-optimized.html', [queries.instance.disk.throughput.usage, queries.instance.disk.throughput.allocated, queries.instance.disk.throughput.instanceTypeBaseline, queries.instance.disk.throughput.instanceTypeBurst])
     + options.legend.withSortBy('Max')
     + options.legend.withSortDesc(true)
     + standardOptions.withUnit('bytes')
     + standardOptions.withOverrides([
-      fieldOverride.byName.new('Max')
+      fieldOverride.byName.new('Allocated')
       + standardOptions.override.byType.withPropertiesFromOptions(
         color.withMode('fixed')
         + color.withFixedColor(colors.warning)
-        + standardOptions.withDisplayName('Allocated')
         + custom.withFillOpacity(0)
       ),
       fieldOverride.byRegexp.new('.* burst')
@@ -247,7 +245,7 @@ local colors = common.colors;
     + ts.singleMetric,
 
   diskIOPS:
-    ts.base('Disk IOPS usage', 'Total of read and write disk IOPS regarding instance IOPS limits. For optimal performances, you should not reach IOPS limits', [queries.instance.disk.iops.max, queries.instance.disk.iops.read, queries.instance.disk.iops.write])
+    ts.base('Disk IOPS usage', 'Total of read and write disk IOPS regarding RDS instance IOPS limits. For optimal performance, you should not reach IOPS limits', [queries.instance.disk.iops.max, queries.instance.disk.iops.read, queries.instance.disk.iops.write])
     + standardOptions.withOverrides([
       fieldOverride.byName.new('Max')
       + standardOptions.override.byType.withPropertiesFromOptions(
@@ -273,7 +271,7 @@ local colors = common.colors;
     ]),
 
   diskThroughput:
-    ts.base('Disk throughput', 'The average number of bytes read/write from disk per second. For optimal performances, you should not reach disk throughput', [queries.instance.disk.throughput.read, queries.instance.disk.throughput.write, queries.instance.disk.throughput.max])
+    ts.base('Disk throughput', 'The average number of bytes read/write from disk per second regarding RDS instance disk throughput limits. For optimal performance, you should not reach disk throughput limits', [queries.instance.disk.throughput.read, queries.instance.disk.throughput.write, queries.instance.disk.throughput.max])
     + standardOptions.withDecimals(0)
     + standardOptions.withUnit('bytes')
     + standardOptions.withOverrides([
diff --git a/configs/grafana/public/rds-instance.json b/configs/grafana/public/rds-instance.json
index 8c0ac59..980f859 100644
--- a/configs/grafana/public/rds-instance.json
+++ b/configs/grafana/public/rds-instance.json
@@ -1504,7 +1504,7 @@
         "type": "datasource",
         "uid": "-- Mixed --"
       },
-      "description": "Total of read and write disk IOPS regarding instance IOPS limits. For optimal performances, you should not reach IOPS limits",
+      "description": "Total of read and write disk IOPS regarding RDS instance IOPS limits. For optimal performance, you should not reach IOPS limits",
       "fieldConfig": {
         "defaults": {
           "custom": {
@@ -1795,7 +1795,7 @@
         "type": "datasource",
         "uid": "-- Mixed --"
       },
-      "description": "The average number of bytes read/write from disk per second. For optimal performances, you should not reach disk throughput",
+      "description": "The average number of bytes read/write from disk per second regarding RDS instance disk throughput limits. For optimal performance, you should not reach disk throughput limits",
       "fieldConfig": {
         "defaults": {
           "custom": {
@@ -2551,7 +2551,7 @@
         "type": "datasource",
         "uid": "-- Mixed --"
      },
-      "description": "Regardless of the allocated disk IOPS, the EC2 instance behind RDS also has disk IOPS limits. You can't use more IOPS than EC2's instance limit. Burst IOPS are supported 30 minutes at least once every 24 hours.",
+      "description": "The RDS instance cannot use more disk IOPS than supported by the EC2 instance baseline, but it can burst 30 minutes at least once every 24 hours. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-optimized.html",
       "fieldConfig": {
         "defaults": {
           "custom": {
@@ -2565,7 +2565,7 @@
             {
               "matcher": {
                 "id": "byName",
-                "options": "Max"
+                "options": "Allocated"
               },
               "properties": [
                 {
@@ -2578,10 +2578,6 @@
                 {
                   "id": "custom.fillOpacity",
                   "value": 0
-                },
-                {
-                  "id": "displayName",
-                  "value": "Allocated"
                 }
               ]
             },
@@ -2670,8 +2666,8 @@
            "type": "prometheus",
            "uid": "$datasource"
          },
-          "expr": "max(rds_max_disk_iops_average{aws_account_id=\"$aws_account_id\",aws_region=\"$aws_region\",dbidentifier=\"$dbidentifier\"})\n",
-          "legendFormat": "Max"
+          "expr": "max(rds_allocated_disk_iops_average{dbidentifier=\"$dbidentifier\"})\n",
+          "legendFormat": "Allocated"
        },
        {
          "datasource": {
@@ -2773,7 +2769,7 @@
         "type": "datasource",
         "uid": "-- Mixed --"
       },
-      "description": "Regardless of the allocated disk throughput, the EC2 instance behind RDS also has disk throughput limits. You can't use more throughput than EC2's instance limit. Burst throughput is supported 30 minutes at least once every 24 hours.",
+      "description": "The RDS instance cannot use more disk throughput than supported by the EC2 instance baseline, but it can burst 30 minutes at least once every 24 hours. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-optimized.html",
       "fieldConfig": {
         "defaults": {
           "custom": {
@@ -2787,7 +2783,7 @@
             {
               "matcher": {
                 "id": "byName",
-                "options": "Max"
+                "options": "Allocated"
               },
               "properties": [
                 {
@@ -2800,10 +2796,6 @@
                 {
                   "id": "custom.fillOpacity",
                   "value": 0
-                },
-                {
-                  "id": "displayName",
-                  "value": "Allocated"
                 }
               ]
             },
@@ -2892,8 +2884,8 @@
            "type": "prometheus",
            "uid": "$datasource"
          },
-          "expr": "max(rds_max_storage_throughput_bytes{dbidentifier=\"$dbidentifier\"})\n",
-          "legendFormat": "Max"
+          "expr": "max(rds_allocated_disk_throughput_bytes{dbidentifier=\"$dbidentifier\"})\n",
+          "legendFormat": "Allocated"
        },
        {
          "datasource": {
diff --git a/configs/grafana/queries/instance.libsonnet b/configs/grafana/queries/instance.libsonnet
index fbfc4aa..a059ba3 100644
--- a/configs/grafana/queries/instance.libsonnet
+++ b/configs/grafana/queries/instance.libsonnet
@@ -221,6 +221,15 @@ local variables = import '../variables.libsonnet';
         )
         + prometheusQuery.withLegendFormat('Usage'),
 
+      allocated:
+        prometheusQuery.new(
+          '$' + variables.datasource.name,
+          |||
+            max(rds_allocated_disk_iops_average{dbidentifier="$dbidentifier"})
+          |||
+        )
+        + prometheusQuery.withLegendFormat('Allocated'),
+
       instanceTypeBurst:
         prometheusQuery.new(
           '$' + variables.datasource.name,
@@ -287,6 +296,15 @@ local variables = import '../variables.libsonnet';
         )
         + prometheusQuery.withLegendFormat('Max'),
 
+      allocated:
+        prometheusQuery.new(
+          '$' + variables.datasource.name,
+          |||
+            max(rds_allocated_disk_throughput_bytes{dbidentifier="$dbidentifier"})
+          |||
+        )
+        + prometheusQuery.withLegendFormat('Allocated'),
+
       instanceTypeBurst:
         prometheusQuery.new(
           '$' + variables.datasource.name,
diff --git a/internal/app/exporter/exporter.go b/internal/app/exporter/exporter.go
index cfb4970..d823cd5 100644
--- a/internal/app/exporter/exporter.go
+++ b/internal/app/exporter/exporter.go
@@ -74,6 +74,8 @@ type rdsCollector struct {
     dBLoadCPU               *prometheus.Desc
     dBLoadNonCPU            *prometheus.Desc
     allocatedStorage        *prometheus.Desc
+    allocatedDiskIOPS       *prometheus.Desc
+    allocatedDiskThroughput *prometheus.Desc
     information             *prometheus.Desc
     instanceBaselineIops    *prometheus.Desc
     instanceMaximumIops     *prometheus.Desc
@@ -138,6 +140,14 @@ func NewCollector(logger slog.Logger, collectorConfiguration Configuration, awsA
             "Allocated storage",
             []string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
         ),
+        allocatedDiskIOPS: prometheus.NewDesc("rds_allocated_disk_iops_average",
+            "Allocated disk IOPS",
+            []string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
+        ),
+        allocatedDiskThroughput: prometheus.NewDesc("rds_allocated_disk_throughput_bytes",
+            "Allocated disk throughput",
+            []string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
+        ),
         information: prometheus.NewDesc("rds_instance_info",
             "RDS instance information",
             []string{"aws_account_id", "aws_region", "dbidentifier", "dbi_resource_id", "instance_class", "engine", "engine_version", "storage_type", "multi_az", "deletion_protection", "role", "source_dbidentifier", "pending_modified_values", "pending_maintenance", "performance_insights_enabled", "ca_certificate_identifier", "arn"}, nil,
@@ -151,11 +161,11 @@ func NewCollector(logger slog.Logger, collectorConfiguration Configuration, awsA
             []string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
         ),
         maxIops: prometheus.NewDesc("rds_max_disk_iops_average",
-            "Max IOPS for the instance",
+            "Max disk IOPS evaluated with disk IOPS and EC2 capacity",
             []string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
         ),
         storageThroughput: prometheus.NewDesc("rds_max_storage_throughput_bytes",
-            "Max storage throughput",
+            "Max disk throughput evaluated with disk throughput and EC2 capacity",
             []string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
         ),
         readThroughput: prometheus.NewDesc("rds_read_throughput_bytes",
@@ -305,6 +315,8 @@ func (c *rdsCollector) Describe(ch chan<- *prometheus.Desc) {
     ch <- c.DBLoad
     ch <- c.age
     ch <- c.allocatedStorage
+    ch <- c.allocatedDiskIOPS
+    ch <- c.allocatedDiskThroughput
     ch <- c.apiCall
     ch <- c.apiCall
     ch <- c.backupRetentionPeriod
@@ -561,11 +573,23 @@ func (c *rdsCollector) Collect(ch chan<- prometheus.Metric) {
             instance.Arn,
         )
         ch <- prometheus.MustNewConstMetric(c.maxAllocatedStorage, prometheus.GaugeValue, float64(instance.MaxAllocatedStorage), c.awsAccountID, c.awsRegion, dbidentifier)
-        ch <- prometheus.MustNewConstMetric(c.maxIops, prometheus.GaugeValue, float64(instance.MaxIops), c.awsAccountID, c.awsRegion, dbidentifier)
+        ch <- prometheus.MustNewConstMetric(c.allocatedDiskIOPS, prometheus.GaugeValue, float64(instance.MaxIops), c.awsAccountID, c.awsRegion, dbidentifier)
+        ch <- prometheus.MustNewConstMetric(c.allocatedDiskThroughput, prometheus.GaugeValue, float64(instance.StorageThroughput), c.awsAccountID, c.awsRegion, dbidentifier)
         ch <- prometheus.MustNewConstMetric(c.status, prometheus.GaugeValue, float64(instance.Status), c.awsAccountID, c.awsRegion, dbidentifier)
-        ch <- prometheus.MustNewConstMetric(c.storageThroughput, prometheus.GaugeValue, float64(instance.StorageThroughput), c.awsAccountID, c.awsRegion, dbidentifier)
         ch <- prometheus.MustNewConstMetric(c.backupRetentionPeriod, prometheus.GaugeValue, float64(instance.BackupRetentionPeriod), c.awsAccountID, c.awsRegion, dbidentifier)
 
+        maxIops := instance.MaxIops
+        storageThroughput := float64(instance.StorageThroughput)
+
+        // RDS disk performance is also limited by the EBS capacity of the underlying EC2 instance
+        if ec2Metrics, ok := c.metrics.EC2.Instances[instance.DBInstanceClass]; ok {
+            maxIops = min(instance.MaxIops, int64(ec2Metrics.BaselineIOPS))
+            storageThroughput = min(float64(instance.StorageThroughput), ec2Metrics.BaselineThroughput)
+        }
+
+        ch <- prometheus.MustNewConstMetric(c.maxIops, prometheus.GaugeValue, float64(maxIops), c.awsAccountID, c.awsRegion, dbidentifier)
+        ch <- prometheus.MustNewConstMetric(c.storageThroughput, prometheus.GaugeValue, storageThroughput, c.awsAccountID, c.awsRegion, dbidentifier)
+
         if c.configuration.CollectInstanceTags {
             names, values := c.getInstanceTagLabels(dbidentifier, instance)