Skip to content

Commit

Permalink
feat(metrics): Add rds_instance_baseline_iops_average and rds_instanc…
Browse files Browse the repository at this point in the history
…e_baseline_throughput_bytes metrics with instance baseline capacity

Many instance types can sustain their maximum performance for only 30 minutes at least once every 24 hours, after which they revert to their baseline performance.

See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-optimized.html

Since we don't collect this information yet, IOPS/throughput is massively over-evaluated for some instances (e.g. db.t3.small instances).
  • Loading branch information
vmercierfr committed May 24, 2024
1 parent 0b71507 commit 93a4d24
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 36 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ It collects key metrics about:
| rds_instance_age_seconds | `aws_account_id`, `aws_region`, `dbidentifier` | Time since instance creation |
| rds_instance_info | `arn`, `aws_account_id`, `aws_region`, `dbi_resource_id`, `dbidentifier`, `deletion_protection`, `engine`, `engine_version`, `instance_class`, `multi_az`, `performance_insights_enabled`, `pending_maintenance`, `pending_modified_values`, `role`, `source_dbidentifier`, `storage_type`, `ca_certificate_identifier` | RDS instance information |
| rds_instance_log_files_size_bytes | `aws_account_id`, `aws_region`, `dbidentifier` | Total of log files on the instance |
| rds_instance_baseline_iops_average | `aws_account_id`, `aws_region`, `instance_class` | Baseline IOPS of underlying EC2 instance class |
| rds_instance_max_iops_average | `aws_account_id`, `aws_region`, `instance_class` | Maximum IOPS of underlying EC2 instance class |
| rds_instance_max_throughput_bytes | `aws_account_id`, `aws_region`, `instance_class` | Maximum throughput of underlying EC2 instance class |
| rds_instance_baseline_throughput_bytes | `aws_account_id`, `aws_region`, `instance_class` | Baseline throughput of underlying EC2 instance class |
| rds_instance_memory_bytes | `aws_account_id`, `aws_region`, `instance_class` | Instance class memory |
| rds_instance_status | `aws_account_id`, `aws_region`, `dbidentifier` | Instance status (1: ok, 0: can't scrape metrics) |
| rds_instance_tags | `aws_account_id`, `aws_region`, `dbidentifier`, `tag_<AWS_TAG>`... | AWS tags attached to the instance |
Expand Down
13 changes: 9 additions & 4 deletions internal/app/ec2/ec2.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ const (
var tracer = otel.Tracer("github/qonto/prometheus-rds-exporter/internal/app/ec2")

// EC2InstanceMetrics holds the per-instance-class capacity information
// collected from the EC2 DescribeInstanceTypes API for the instance class
// backing an RDS instance.
//
// Note: the scraped diff duplicated the pre-commit field lines; only the
// post-commit field set is kept here (duplicate fields do not compile).
type EC2InstanceMetrics struct {
	BaselineIOPS       int32   // EbsInfo.EbsOptimizedInfo.BaselineIops
	BaselineThroughput float64 // baseline throughput; fetcher stores it converted to bytes (see MegaBytesToBytes usage)
	MaximumIops        int32   // EbsInfo.EbsOptimizedInfo.MaximumIops
	MaximumThroughput  float64 // maximum throughput; fetcher stores it converted to bytes
	Memory             int64   // MemoryInfo.SizeInMiB as returned by the API
	Vcpu               int32   // VCpuInfo.DefaultVCpus
}

type Metrics struct {
Expand Down Expand Up @@ -100,6 +102,9 @@ func (e *EC2Fetcher) GetDBInstanceTypeInformation(instanceTypes []string) (Metri
}

if i.EbsInfo != nil && i.EbsInfo.EbsOptimizedInfo != nil {
instanceMetrics.BaselineIOPS = aws.ToInt32(i.EbsInfo.EbsOptimizedInfo.BaselineIops)
instanceMetrics.BaselineThroughput = converter.MegaBytesToBytes(aws.ToFloat64(i.EbsInfo.EbsOptimizedInfo.BaselineThroughputInMBps))

instanceMetrics.MaximumIops = aws.ToInt32(i.EbsInfo.EbsOptimizedInfo.MaximumIops)
instanceMetrics.MaximumThroughput = converter.MegaBytesToBytes(aws.ToFloat64(i.EbsInfo.EbsOptimizedInfo.MaximumThroughputInMBps))
}
Expand Down
50 changes: 30 additions & 20 deletions internal/app/ec2/ec2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,40 @@ func TestGetDBInstanceTypeInformation(t *testing.T) {
client := mock.EC2Client{}

// Each case pins the capacity metrics expected for one RDS instance class,
// driven by the canned mock EC2 responses.
//
// NOTE(review): the fetcher converts BaselineThroughputInMBps with
// converter.MegaBytesToBytes before storing it (same as the maximum
// throughput), so the baseline expectations below apply the same conversion;
// the committed diff compared the unconverted mock value — confirm against
// the fetcher.
testCases := []struct {
	instanceType       string
	vCPU               int32
	memory             int64
	baselineIops       int32
	maximumIops        int32
	baselineThroughput float64
	maximumThroughput  float64
}{
	{
		instanceType:       "t3.large",
		vCPU:               mock.InstanceT3Large.Vcpu,
		memory:             converter.MegaBytesToBytes(mock.InstanceT3Large.Memory),
		baselineIops:       mock.InstanceT3Large.BaselineIOPS,
		baselineThroughput: converter.MegaBytesToBytes(mock.InstanceT3Large.BaselineThroughput),
		maximumIops:        mock.InstanceT3Large.MaximumIops,
		maximumThroughput:  converter.MegaBytesToBytes(mock.InstanceT3Large.MaximumThroughput),
	},
	{
		instanceType:       "t3.small",
		vCPU:               mock.InstanceT3Small.Vcpu,
		memory:             converter.MegaBytesToBytes(mock.InstanceT3Small.Memory),
		baselineIops:       mock.InstanceT3Small.BaselineIOPS,
		baselineThroughput: converter.MegaBytesToBytes(mock.InstanceT3Small.BaselineThroughput),
		maximumIops:        mock.InstanceT3Small.MaximumIops,
		maximumThroughput:  converter.MegaBytesToBytes(mock.InstanceT3Small.MaximumThroughput),
	},
	{
		instanceType:       "t2.small",
		vCPU:               mock.InstanceT2Small.Vcpu,
		memory:             converter.MegaBytesToBytes(mock.InstanceT2Small.Memory),
		baselineIops:       0, // no baseline IOPS for non-EBS-optimized instances
		baselineThroughput: 0, // no baseline throughput for non-EBS-optimized instances
		maximumIops:        0, // no maximum IOPS for non-EBS-optimized instances
		maximumThroughput:  0, // no maximum throughput for non-EBS-optimized instances
	},
}
expectedAPICalls := float64(1)
Expand All @@ -64,6 +72,8 @@ func TestGetDBInstanceTypeInformation(t *testing.T) {

assert.Equal(t, tc.vCPU, instance.Vcpu, "vCPU don't match")
assert.Equal(t, tc.memory, instance.Memory, "Memory don't match")
assert.Equal(t, tc.baselineIops, instance.BaselineIOPS, "Baseline IOPS don't match")
assert.Equal(t, tc.baselineThroughput, instance.BaselineThroughput, "Baseline throughput don't match")
assert.Equal(t, tc.maximumIops, instance.MaximumIops, "Maximum IOPS don't match")
assert.Equal(t, tc.maximumThroughput, instance.MaximumThroughput, "Maximum throughput don't match")
})
Expand Down
32 changes: 20 additions & 12 deletions internal/app/ec2/mock/ec2.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,22 @@ import (

// InstanceT3Large is the canned instance-type data the mock EC2 client
// returns for "t3.large". Throughput values are in MBps as fed to
// BaselineThroughputInMBps/MaximumThroughputInMBps; Memory is fed to
// MemoryInfo.SizeInMiB.
//
//nolint:golint,gomnd
var InstanceT3Large = ec2.EC2InstanceMetrics{
	BaselineIOPS:       4000,
	BaselineThroughput: 86.88,
	MaximumIops:        15700,
	MaximumThroughput:  347.5,
	Memory:             8,
	Vcpu:               2,
}

// InstanceT3Small is the canned instance-type data the mock EC2 client
// returns for "t3.small". Throughput values are in MBps as fed to
// BaselineThroughputInMBps/MaximumThroughputInMBps; Memory is fed to
// MemoryInfo.SizeInMiB.
//
//nolint:golint,gomnd
var InstanceT3Small = ec2.EC2InstanceMetrics{
	BaselineIOPS:       1000,
	BaselineThroughput: 21.75,
	MaximumIops:        11800,
	MaximumThroughput:  260.62,
	Memory:             2,
	Vcpu:               2,
}

//nolint:golint,gomnd
Expand All @@ -45,8 +49,10 @@ func (m EC2Client) DescribeInstanceTypes(ctx context.Context, input *aws_ec2.Des
VCpuInfo: &aws_ec2_types.VCpuInfo{DefaultVCpus: &InstanceT3Large.Vcpu},
MemoryInfo: &aws_ec2_types.MemoryInfo{SizeInMiB: &InstanceT3Large.Memory},
EbsInfo: &aws_ec2_types.EbsInfo{EbsOptimizedInfo: &aws_ec2_types.EbsOptimizedInfo{
MaximumIops: &InstanceT3Large.MaximumIops,
MaximumThroughputInMBps: &InstanceT3Large.MaximumThroughput,
BaselineIops: &InstanceT3Large.BaselineIOPS,
BaselineThroughputInMBps: &InstanceT3Large.BaselineThroughput,
MaximumIops: &InstanceT3Large.MaximumIops,
MaximumThroughputInMBps: &InstanceT3Large.MaximumThroughput,
}},
})
case "t3.small":
Expand All @@ -55,8 +61,10 @@ func (m EC2Client) DescribeInstanceTypes(ctx context.Context, input *aws_ec2.Des
VCpuInfo: &aws_ec2_types.VCpuInfo{DefaultVCpus: &InstanceT3Small.Vcpu},
MemoryInfo: &aws_ec2_types.MemoryInfo{SizeInMiB: &InstanceT3Small.Memory},
EbsInfo: &aws_ec2_types.EbsInfo{EbsOptimizedInfo: &aws_ec2_types.EbsOptimizedInfo{
MaximumIops: &InstanceT3Small.MaximumIops,
MaximumThroughputInMBps: &InstanceT3Small.MaximumThroughput,
BaselineIops: &InstanceT3Small.BaselineIOPS,
BaselineThroughputInMBps: &InstanceT3Small.BaselineThroughput,
MaximumIops: &InstanceT3Small.MaximumIops,
MaximumThroughputInMBps: &InstanceT3Small.MaximumThroughput,
}},
})
case "t2.small":
Expand Down
14 changes: 14 additions & 0 deletions internal/app/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ type rdsCollector struct {
dBLoadNonCPU *prometheus.Desc
allocatedStorage *prometheus.Desc
information *prometheus.Desc
instanceBaselineIops *prometheus.Desc
instanceMaximumIops *prometheus.Desc
instanceBaselineThroughput *prometheus.Desc
instanceMaximumThroughput *prometheus.Desc
instanceMemory *prometheus.Desc
instanceVCPU *prometheus.Desc
Expand Down Expand Up @@ -192,10 +194,18 @@ func NewCollector(logger slog.Logger, collectorConfiguration Configuration, awsA
"Maximum throughput of underlying EC2 instance class",
[]string{"aws_account_id", "aws_region", "instance_class"}, nil,
),
instanceBaselineThroughput: prometheus.NewDesc("rds_instance_baseline_throughput_bytes",
"Baseline throughput of underlying EC2 instance class",
[]string{"aws_account_id", "aws_region", "instance_class"}, nil,
),
instanceMaximumIops: prometheus.NewDesc("rds_instance_max_iops_average",
"Maximum IOPS of underlying EC2 instance class",
[]string{"aws_account_id", "aws_region", "instance_class"}, nil,
),
instanceBaselineIops: prometheus.NewDesc("rds_instance_baseline_iops_average",
"Baseline IOPS of underlying EC2 instance class",
[]string{"aws_account_id", "aws_region", "instance_class"}, nil,
),
freeStorageSpace: prometheus.NewDesc("rds_free_storage_bytes",
"Free storage on the instance",
[]string{"aws_account_id", "aws_region", "dbidentifier"}, nil,
Expand Down Expand Up @@ -308,7 +318,9 @@ func (c *rdsCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.freeStorageSpace
ch <- c.freeableMemory
ch <- c.information
ch <- c.instanceBaselineIops
ch <- c.instanceMaximumIops
ch <- c.instanceBaselineThroughput
ch <- c.instanceMaximumThroughput
ch <- c.instanceMemory
ch <- c.instanceVCPU
Expand Down Expand Up @@ -654,6 +666,8 @@ func (c *rdsCollector) Collect(ch chan<- prometheus.Metric) {
// EC2 metrics
ch <- prometheus.MustNewConstMetric(c.apiCall, prometheus.CounterValue, c.counters.EC2APIcalls, c.awsAccountID, c.awsRegion, "ec2")
for instanceType, instance := range c.metrics.EC2.Instances {
ch <- prometheus.MustNewConstMetric(c.instanceBaselineIops, prometheus.GaugeValue, float64(instance.BaselineIOPS), c.awsAccountID, c.awsRegion, instanceType)
ch <- prometheus.MustNewConstMetric(c.instanceBaselineThroughput, prometheus.GaugeValue, instance.BaselineThroughput, c.awsAccountID, c.awsRegion, instanceType)
ch <- prometheus.MustNewConstMetric(c.instanceMaximumIops, prometheus.GaugeValue, float64(instance.MaximumIops), c.awsAccountID, c.awsRegion, instanceType)
ch <- prometheus.MustNewConstMetric(c.instanceMaximumThroughput, prometheus.GaugeValue, instance.MaximumThroughput, c.awsAccountID, c.awsRegion, instanceType)
ch <- prometheus.MustNewConstMetric(c.instanceMemory, prometheus.GaugeValue, float64(instance.Memory), c.awsAccountID, c.awsRegion, instanceType)
Expand Down

0 comments on commit 93a4d24

Please sign in to comment.