Skip to content

Commit

Permalink
Merge pull request #178 from qonto/add-allocated-storage
Browse files Browse the repository at this point in the history
Fix storage performance information
  • Loading branch information
vmercierfr authored May 27, 2024
2 parents a3653fd + b197bf3 commit 1907bf0
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 34 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ It collects key metrics about:

| Name | Labels | Description |
| ---- | ------ | ----------- |
| rds_allocated_disk_iops_average | `aws_account_id`, `aws_region`, `dbidentifier` | Allocated disk IOPS |
| rds_allocated_disk_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Allocated disk throughput |
| rds_allocated_storage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Allocated storage |
| rds_api_call_total | `api`, `aws_account_id`, `aws_region` | Number of calls to the AWS API |
| rds_backup_retention_period_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | Automatic DB snapshots retention period |
Expand All @@ -55,19 +57,19 @@ It collects key metrics about:
| rds_free_storage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Free storage on the instance |
| rds_freeable_memory_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Amount of available random access memory. For MariaDB, MySQL, Oracle, and PostgreSQL DB instances, this metric reports the value of the MemAvailable field of /proc/meminfo |
| rds_instance_age_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | Time since instance creation |
| rds_instance_baseline_iops_average | `aws_account_id`, `aws_region`, `instance_class` | Baseline IOPS of underlying EC2 instance class |
| rds_instance_baseline_throughput_bytes | `aws_account_id`, `aws_region`, `instance_class` | Baseline throughput of underlying EC2 instance class |
| rds_instance_info | `arn`, `aws_account_id`, `aws_region`, `dbi_resource_id`, `dbidentifier`, `deletion_protection`, `engine`, `engine_version`, `instance_class`, `multi_az`, `performance_insights_enabled`, `pending_maintenance`, `pending_modified_values`, `role`, `source_dbidentifier`, `storage_type`, `ca_certificate_identifier` | RDS instance information |
| rds_instance_log_files_size_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Total of log files on the instance |
| rds_instance_baseline_iops_average | `aws_account_id`, `aws_region`, `instance_class` | Baseline IOPS of underlying EC2 instance class |
| rds_instance_max_iops_average | `aws_account_id`, `aws_region`, `instance_class` | Maximum IOPS of underlying EC2 instance class |
| rds_instance_max_throughput_bytes | `aws_account_id`, `aws_region`, `instance_class` | Maximum throughput of underlying EC2 instance class |
| rds_instance_baseline_throughput_bytes | `aws_account_id`, `aws_region`, `instance_class` | Baseline throughput of underlying EC2 instance class |
| rds_instance_memory_bytes | `aws_account_id`, `aws_region`, `instance_class` | Instance class memory |
| rds_instance_status | `aws_account_id`, `aws_region`, `dbidentifier` | Instance status (1: ok, 0: can't scrape metrics) |
| rds_instance_tags | `aws_account_id`, `aws_region`, `dbidentifier`, `tag_<AWS_TAG>`... | AWS tags attached to the instance |
| rds_instance_vcpu_average | `aws_account_id`, `aws_region`, `instance_class` | Total vCPU for this instance class |
| rds_max_allocated_storage_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Upper limit in bytes to which Amazon RDS can automatically scale the storage of the DB instance |
| rds_max_disk_iops_average | `aws_account_id`, `aws_region`, `dbidentifier` | Max IOPS for the instance |
| rds_max_storage_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Max storage throughput |
| rds_max_disk_iops_average | `aws_account_id`, `aws_region`, `dbidentifier` | Max disk IOPS evaluated with disk IOPS and EC2 capacity |
| rds_max_storage_throughput_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Max disk throughput evaluated with disk throughput and EC2 capacity |
| rds_maximum_used_transaction_ids_average | `aws_account_id`, `aws_region`, `dbidentifier` | Maximum transaction IDs that have been used. Applies to only PostgreSQL |
| rds_quota_max_dbinstances_average | `aws_account_id`, `aws_region` | Maximum number of RDS instances allowed in the AWS account |
| rds_quota_maximum_db_instance_snapshots_average | `aws_account_id`, `aws_region` | Maximum number of manual DB instance snapshots |
Expand Down
14 changes: 6 additions & 8 deletions configs/grafana/panels/instance.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -160,16 +160,15 @@ local colors = common.colors;
]),

diskIOPSScaling:
ts.base('Disk IOPS', "Regardless of the allocated disk IOPS, the EC2 instance behind RDS also has disk IOPS limits. You can't use more IOPS than EC2's instance limit. Burst IOPS are supported 30 minutes at least once every 24 hours.", [queries.instance.disk.iops.usage, queries.instance.disk.iops.max, queries.instance.disk.iops.instanceTypeBaseline, queries.instance.disk.iops.instanceTypeBurst])
ts.base('Disk IOPS', 'The RDS instance cannot use more disk IOPS than supported by the EC2 instance baseline, but it can burst 30 minutes at least once every 24 hours. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-optimized.html', [queries.instance.disk.iops.usage, queries.instance.disk.iops.allocated, queries.instance.disk.iops.instanceTypeBaseline, queries.instance.disk.iops.instanceTypeBurst])
+ options.legend.withSortBy('Max')
+ options.legend.withSortDesc(true)
+ standardOptions.withUnit('locale')
+ standardOptions.withOverrides([
fieldOverride.byName.new('Max')
fieldOverride.byName.new('Allocated')
+ standardOptions.override.byType.withPropertiesFromOptions(
color.withMode('fixed')
+ color.withFixedColor(colors.warning)
+ standardOptions.withDisplayName('Allocated')
+ custom.withFillOpacity(0)
),
fieldOverride.byRegexp.new('.* burst')
Expand All @@ -189,16 +188,15 @@ local colors = common.colors;
]),

diskThroughputScaling:
ts.base('Disk throughput', "Regardless of the allocated disk throughput, the EC2 instance behind RDS also has disk throughput limits. You can't use more throughput than EC2's instance limit. Burst throughput is supported 30 minutes at least once every 24 hours.", [queries.instance.disk.throughput.usage, queries.instance.disk.throughput.max, queries.instance.disk.throughput.instanceTypeBaseline, queries.instance.disk.throughput.instanceTypeBurst])
ts.base('Disk throughput', 'The RDS instance cannot use more disk throughput than supported by the EC2 instance baseline, but it can burst 30 minutes at least once every 24 hours. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-optimized.html', [queries.instance.disk.throughput.usage, queries.instance.disk.throughput.allocated, queries.instance.disk.throughput.instanceTypeBaseline, queries.instance.disk.throughput.instanceTypeBurst])
+ options.legend.withSortBy('Max')
+ options.legend.withSortDesc(true)
+ standardOptions.withUnit('bytes')
+ standardOptions.withOverrides([
fieldOverride.byName.new('Max')
fieldOverride.byName.new('Allocated')
+ standardOptions.override.byType.withPropertiesFromOptions(
color.withMode('fixed')
+ color.withFixedColor(colors.warning)
+ standardOptions.withDisplayName('Allocated')
+ custom.withFillOpacity(0)
),
fieldOverride.byRegexp.new('.* burst')
Expand Down Expand Up @@ -247,7 +245,7 @@ local colors = common.colors;
+ ts.singleMetric,

diskIOPS:
ts.base('Disk IOPS usage', 'Total of read and write disk IOPS regarding instance IOPS limits. For optimal performances, you should not reach IOPS limits', [queries.instance.disk.iops.max, queries.instance.disk.iops.read, queries.instance.disk.iops.write])
ts.base('Disk IOPS usage', 'Total of read and write disk IOPS regarding RDS instance IOPS limits. For optimal performances, you should not reach IOPS limits', [queries.instance.disk.iops.max, queries.instance.disk.iops.read, queries.instance.disk.iops.write])
+ standardOptions.withOverrides([
fieldOverride.byName.new('Max')
+ standardOptions.override.byType.withPropertiesFromOptions(
Expand All @@ -273,7 +271,7 @@ local colors = common.colors;
]),

diskThroughput:
ts.base('Disk throughput', 'The average number of bytes read/write from disk per second. For optimal performances, you should not reach disk throughput', [queries.instance.disk.throughput.read, queries.instance.disk.throughput.write, queries.instance.disk.throughput.max])
ts.base('Disk throughput', 'The average number of bytes read/write from disk per second regarding RDS instance disk throughput limits. For optimal performances, you should not reach disk throughput', [queries.instance.disk.throughput.read, queries.instance.disk.throughput.write, queries.instance.disk.throughput.max])
+ standardOptions.withDecimals(0)
+ standardOptions.withUnit('bytes')
+ standardOptions.withOverrides([
Expand Down
28 changes: 10 additions & 18 deletions configs/grafana/public/rds-instance.json
Original file line number Diff line number Diff line change
Expand Up @@ -1504,7 +1504,7 @@
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Total of read and write disk IOPS regarding instance IOPS limits. For optimal performances, you should not reach IOPS limits",
"description": "Total of read and write disk IOPS regarding RDS instance IOPS limits. For optimal performances, you should not reach IOPS limits",
"fieldConfig": {
"defaults": {
"custom": {
Expand Down Expand Up @@ -1795,7 +1795,7 @@
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "The average number of bytes read/write from disk per second. For optimal performances, you should not reach disk throughput",
"description": "The average number of bytes read/write from disk per second regarding RDS instance disk throughput limits. For optimal performances, you should not reach disk throughput",
"fieldConfig": {
"defaults": {
"custom": {
Expand Down Expand Up @@ -2551,7 +2551,7 @@
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Regardless of the allocated disk IOPS, the EC2 instance behind RDS also has disk IOPS limits. You can't use more IOPS than EC2's instance limit. Burst IOPS are supported 30 minutes at least once every 24 hours.",
"description": "The RDS instance cannot use more disk IOPS than supported by the EC2 instance baseline, but it can burst 30 minutes at least once every 24 hours. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-optimized.html",
"fieldConfig": {
"defaults": {
"custom": {
Expand All @@ -2565,7 +2565,7 @@
{
"matcher": {
"id": "byName",
"options": "Max"
"options": "Allocated"
},
"properties": [
{
Expand All @@ -2578,10 +2578,6 @@
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "displayName",
"value": "Allocated"
}
]
},
Expand Down Expand Up @@ -2670,8 +2666,8 @@
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(rds_max_disk_iops_average{aws_account_id=\"$aws_account_id\",aws_region=\"$aws_region\",dbidentifier=\"$dbidentifier\"})\n",
"legendFormat": "Max"
"expr": "max(rds_allocated_disk_iops_average{dbidentifier=\"$dbidentifier\"})\n",
"legendFormat": "Allocated"
},
{
"datasource": {
Expand Down Expand Up @@ -2773,7 +2769,7 @@
"type": "datasource",
"uid": "-- Mixed --"
},
"description": "Regardless of the allocated disk throughput, the EC2 instance behind RDS also has disk throughput limits. You can't use more throughput than EC2's instance limit. Burst throughput is supported 30 minutes at least once every 24 hours.",
"description": "The RDS instance cannot use more disk throughput than supported by the EC2 instance baseline, but it can burst 30 minutes at least once every 24 hours. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-optimized.html",
"fieldConfig": {
"defaults": {
"custom": {
Expand All @@ -2787,7 +2783,7 @@
{
"matcher": {
"id": "byName",
"options": "Max"
"options": "Allocated"
},
"properties": [
{
Expand All @@ -2800,10 +2796,6 @@
{
"id": "custom.fillOpacity",
"value": 0
},
{
"id": "displayName",
"value": "Allocated"
}
]
},
Expand Down Expand Up @@ -2892,8 +2884,8 @@
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(rds_max_storage_throughput_bytes{dbidentifier=\"$dbidentifier\"})\n",
"legendFormat": "Max"
"expr": "max(rds_allocated_disk_throughput_bytes{dbidentifier=\"$dbidentifier\"})\n",
"legendFormat": "Allocated"
},
{
"datasource": {
Expand Down
18 changes: 18 additions & 0 deletions configs/grafana/queries/instance.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,15 @@ local variables = import '../variables.libsonnet';
)
+ prometheusQuery.withLegendFormat('Usage'),

allocated:
prometheusQuery.new(
'$' + variables.datasource.name,
|||
max(rds_allocated_disk_iops_average{dbidentifier="$dbidentifier"})
|||
)
+ prometheusQuery.withLegendFormat('Allocated'),

instanceTypeBurst:
prometheusQuery.new(
'$' + variables.datasource.name,
Expand Down Expand Up @@ -287,6 +296,15 @@ local variables = import '../variables.libsonnet';
)
+ prometheusQuery.withLegendFormat('Max'),

allocated:
prometheusQuery.new(
'$' + variables.datasource.name,
|||
max(rds_allocated_disk_throughput_bytes{dbidentifier="$dbidentifier"})
|||
)
+ prometheusQuery.withLegendFormat('Allocated'),

instanceTypeBurst:
prometheusQuery.new(
'$' + variables.datasource.name,
Expand Down
32 changes: 28 additions & 4 deletions internal/app/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ type rdsCollector struct {
dBLoadCPU *prometheus.Desc
dBLoadNonCPU *prometheus.Desc
allocatedStorage *prometheus.Desc
allocatedDiskIOPS *prometheus.Desc
allocatedDiskThroughput *prometheus.Desc
information *prometheus.Desc
instanceBaselineIops *prometheus.Desc
instanceMaximumIops *prometheus.Desc
Expand Down Expand Up @@ -138,6 +140,14 @@ func NewCollector(logger slog.Logger, collectorConfiguration Configuration, awsA
"Allocated storage",
[]string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
),
allocatedDiskIOPS: prometheus.NewDesc("rds_allocated_disk_iops_average",
"Allocated disk IOPS",
[]string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
),
allocatedDiskThroughput: prometheus.NewDesc("rds_allocated_disk_throughput_bytes",
"Allocated disk throughput",
[]string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
),
information: prometheus.NewDesc("rds_instance_info",
"RDS instance information",
[]string{"aws_account_id", "aws_region", "dbidentifier", "dbi_resource_id", "instance_class", "engine", "engine_version", "storage_type", "multi_az", "deletion_protection", "role", "source_dbidentifier", "pending_modified_values", "pending_maintenance", "performance_insights_enabled", "ca_certificate_identifier", "arn"}, nil,
Expand All @@ -151,11 +161,11 @@ func NewCollector(logger slog.Logger, collectorConfiguration Configuration, awsA
[]string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
),
maxIops: prometheus.NewDesc("rds_max_disk_iops_average",
"Max IOPS for the instance",
"Max disk IOPS evaluated with disk IOPS and EC2 capacity",
[]string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
),
storageThroughput: prometheus.NewDesc("rds_max_storage_throughput_bytes",
"Max storage throughput",
"Max disk throughput evaluated with disk throughput and EC2 capacity",
[]string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
),
readThroughput: prometheus.NewDesc("rds_read_throughput_bytes",
Expand Down Expand Up @@ -305,6 +315,8 @@ func (c *rdsCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.DBLoad
ch <- c.age
ch <- c.allocatedStorage
ch <- c.allocatedDiskIOPS
ch <- c.allocatedDiskThroughput
ch <- c.apiCall
ch <- c.apiCall
ch <- c.backupRetentionPeriod
Expand Down Expand Up @@ -561,11 +573,23 @@ func (c *rdsCollector) Collect(ch chan<- prometheus.Metric) {
instance.Arn,
)
ch <- prometheus.MustNewConstMetric(c.maxAllocatedStorage, prometheus.GaugeValue, float64(instance.MaxAllocatedStorage), c.awsAccountID, c.awsRegion, dbidentifier)
ch <- prometheus.MustNewConstMetric(c.maxIops, prometheus.GaugeValue, float64(instance.MaxIops), c.awsAccountID, c.awsRegion, dbidentifier)
ch <- prometheus.MustNewConstMetric(c.allocatedDiskIOPS, prometheus.GaugeValue, float64(instance.MaxIops), c.awsAccountID, c.awsRegion, dbidentifier)
ch <- prometheus.MustNewConstMetric(c.allocatedDiskThroughput, prometheus.GaugeValue, float64(instance.StorageThroughput), c.awsAccountID, c.awsRegion, dbidentifier)
ch <- prometheus.MustNewConstMetric(c.status, prometheus.GaugeValue, float64(instance.Status), c.awsAccountID, c.awsRegion, dbidentifier)
ch <- prometheus.MustNewConstMetric(c.storageThroughput, prometheus.GaugeValue, float64(instance.StorageThroughput), c.awsAccountID, c.awsRegion, dbidentifier)
ch <- prometheus.MustNewConstMetric(c.backupRetentionPeriod, prometheus.GaugeValue, float64(instance.BackupRetentionPeriod), c.awsAccountID, c.awsRegion, dbidentifier)

maxIops := instance.MaxIops
storageThroughput := float64(instance.StorageThroughput)

	// RDS disk performance is limited by the EBS volume attached to the RDS instance
if ec2Metrics, ok := c.metrics.EC2.Instances[instance.DBInstanceClass]; ok {
maxIops = min(instance.MaxIops, int64(ec2Metrics.BaselineIOPS))
storageThroughput = min(float64(instance.StorageThroughput), ec2Metrics.BaselineThroughput)
}

ch <- prometheus.MustNewConstMetric(c.maxIops, prometheus.GaugeValue, float64(maxIops), c.awsAccountID, c.awsRegion, dbidentifier)
ch <- prometheus.MustNewConstMetric(c.storageThroughput, prometheus.GaugeValue, storageThroughput, c.awsAccountID, c.awsRegion, dbidentifier)

if c.configuration.CollectInstanceTags {
names, values := c.getInstanceTagLabels(dbidentifier, instance)

Expand Down

0 comments on commit 1907bf0

Please sign in to comment.