From 7ec9bccf09ed46ddb04ef3822b61a28d5651a368 Mon Sep 17 00:00:00 2001 From: Roi Vazquez Date: Wed, 4 Oct 2023 10:12:29 +0200 Subject: [PATCH 1/2] Improve redis server status metrics --- controllers/sentinel_controller.go | 5 +++ pkg/redis/metrics/cluster_status_metrics.go | 50 +++++++++++++++++++++ pkg/redis/metrics/sentinel_metrics.go | 21 +-------- 3 files changed, 57 insertions(+), 19 deletions(-) create mode 100644 pkg/redis/metrics/cluster_status_metrics.go diff --git a/controllers/sentinel_controller.go b/controllers/sentinel_controller.go index 9ec88a2b..64a5d60c 100644 --- a/controllers/sentinel_controller.go +++ b/controllers/sentinel_controller.go @@ -178,6 +178,11 @@ func (r *SentinelReconciler) reconcileStatus(ctx context.Context, instance *saas log.Error(merr, "DiscoveryError") } + // publish metrics based on the discovered cluster status + if err := metrics.FromShardedCluster(ctx, cluster, false, instance.GetName()); err != nil { + log.Error(err, "unable to publish redis cluster status metrics") + } + shards := make(saasv1alpha1.MonitoredShards, len(cluster.Shards)) for idx, shard := range cluster.Shards { shards[idx] = saasv1alpha1.MonitoredShard{ diff --git a/pkg/redis/metrics/cluster_status_metrics.go b/pkg/redis/metrics/cluster_status_metrics.go new file mode 100644 index 00000000..0ff64ac8 --- /dev/null +++ b/pkg/redis/metrics/cluster_status_metrics.go @@ -0,0 +1,50 @@ +package metrics + +import ( + "context" + + "github.com/3scale/saas-operator/pkg/redis/sharded" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + serverInfo = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "server_info", + Namespace: "saas_redis_cluster_status", + Help: `"redis server info"`, + }, + []string{"resource", "shard", "redis_server_host", "redis_server_alias", "role", "read_only"}) +) + +func init() { + // Register custom metrics with the global prometheus registry + metrics.Registry.MustRegister(serverInfo) +} + +func FromShardedCluster(ctx context.Context, cluster *sharded.Cluster, refresh bool, resource string) error { + + if refresh { + err := cluster.SentinelDiscover(ctx, sharded.SlaveReadOnlyDiscoveryOpt) + if err != nil { + return err + } + } + + for _, shard := range cluster.Shards { + + for _, server := range shard.Servers { + ro, ok := server.Config["slave-read-only"] + if !ok { + ro = "no" + } + serverInfo.With(prometheus.Labels{"resource": resource, "shard": shard.Name, + "redis_server_host": server.ID(), "redis_server_alias": server.GetAlias(), + "role": string(server.Role), "read_only": ro, + }).Set(float64(1)) + } + } + + return nil +} diff --git a/pkg/redis/metrics/sentinel_metrics.go b/pkg/redis/metrics/sentinel_metrics.go index 7dd50417..248e2d9d 100644 --- a/pkg/redis/metrics/sentinel_metrics.go +++ b/pkg/redis/metrics/sentinel_metrics.go @@ -16,13 +16,6 @@ import ( ) var ( - serverInfo = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "server_info", - Namespace: "saas_redis_sentinel", - Help: `"redis server info"`, - }, - []string{"sentinel", "shard", "redis_server", "role"}) linkPendingCommands = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "link_pending_commands", @@ -86,8 +79,8 @@ var ( func init() { // Register custom metrics with the global prometheus registry metrics.Registry.MustRegister( - serverInfo, linkPendingCommands, lastOkPingReply, roleReportedTime, - numSlaves, numOtherSentinels, masterLinkDownTime, slaveReplOffset, + linkPendingCommands, lastOkPingReply, roleReportedTime, numSlaves, + numOtherSentinels, masterLinkDownTime, slaveReplOffset, ) } @@ -176,7 +169,6 @@ func (smg *SentinelMetricsGatherer) Stop() { smg.cancel() // Reset all gauge metrics so the values related to // this exporter are deleted from the collection - serverInfo.Reset() linkPendingCommands.Reset() lastOkPingReply.Reset() roleReportedTime.Reset() @@ -195,10 +187,6 @@ func (smg *SentinelMetricsGatherer) gatherMetrics(ctx context.Context) error { for _, master := range mresult { - serverInfo.With(prometheus.Labels{"sentinel": smg.sentinelURI, "shard": master.Name, - "redis_server": fmt.Sprintf("%s:%d", master.IP, master.Port), "role": master.RoleReported, - }).Set(float64(1)) - linkPendingCommands.With(prometheus.Labels{"sentinel": smg.sentinelURI, "shard": master.Name, "redis_server": fmt.Sprintf("%s:%d", master.IP, master.Port), "role": master.RoleReported, }).Set(float64(master.LinkPendingCommands)) @@ -235,10 +223,6 @@ func (smg *SentinelMetricsGatherer) gatherMetrics(ctx context.Context) error { for _, slave := range sresult { - serverInfo.With(prometheus.Labels{"sentinel": smg.sentinelURI, "shard": master.Name, - "redis_server": fmt.Sprintf("%s:%d", slave.IP, slave.Port), "role": slave.RoleReported, - }).Set(float64(1)) - linkPendingCommands.With(prometheus.Labels{"sentinel": smg.sentinelURI, "shard": master.Name, "redis_server": fmt.Sprintf("%s:%d", slave.IP, slave.Port), "role": slave.RoleReported, }).Set(float64(slave.LinkPendingCommands)) @@ -272,7 +256,6 @@ func (smg *SentinelMetricsGatherer) gatherMetrics(ctx context.Context) error { } func cleanupMetrics(labels prometheus.Labels) { - serverInfo.Delete(labels) linkPendingCommands.Delete(labels) lastOkPingReply.Delete(labels) roleReportedTime.Delete(labels) From 9e4509fc0b9d5a53cdf35290889746aab946cb61 Mon Sep 17 00:00:00 2001 From: Roi Vazquez Date: Wed, 4 Oct 2023 11:53:43 +0200 Subject: [PATCH 2/2] Add separate ro and rw slave counters --- pkg/redis/metrics/cluster_status_metrics.go | 34 +++++++++++++++++++-- pkg/redis/metrics/sentinel_metrics.go | 16 +--------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/pkg/redis/metrics/cluster_status_metrics.go b/pkg/redis/metrics/cluster_status_metrics.go index 0ff64ac8..1ab14b64 100644 --- a/pkg/redis/metrics/cluster_status_metrics.go +++ b/pkg/redis/metrics/cluster_status_metrics.go @@ -3,6 +3,7 @@ package metrics import ( "context" + "github.com/3scale/saas-operator/pkg/redis/client" "github.com/3scale/saas-operator/pkg/redis/sharded" "github.com/prometheus/client_golang/prometheus" "sigs.k8s.io/controller-runtime/pkg/metrics" @@ -13,14 +14,30 @@ var ( prometheus.GaugeOpts{ Name: "server_info", Namespace: "saas_redis_cluster_status", - Help: `"redis server info"`, + Help: "redis cluster member info", }, []string{"resource", "shard", "redis_server_host", "redis_server_alias", "role", "read_only"}) + roSlaveCount = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "ro_slave_count", + Namespace: "saas_redis_cluster_status", + Help: "read-only slave count", + }, + []string{"resource", "shard"}, + ) + rwSlaveCount = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "rw_slave_count", + Namespace: "saas_redis_cluster_status", + Help: "read-write slave count", + }, + []string{"resource", "shard"}, + ) ) func init() { // Register custom metrics with the global prometheus registry - metrics.Registry.MustRegister(serverInfo) + metrics.Registry.MustRegister(serverInfo, roSlaveCount, rwSlaveCount) } func FromShardedCluster(ctx context.Context, cluster *sharded.Cluster, refresh bool, resource string) error { @@ -33,6 +50,8 @@ func FromShardedCluster(ctx context.Context, cluster *sharded.Cluster, refresh b } for _, shard := range cluster.Shards { + roslave := 0 + rwslave := 0 for _, server := range shard.Servers { ro, ok := server.Config["slave-read-only"] @@ -43,7 +62,18 @@ func FromShardedCluster(ctx context.Context, cluster *sharded.Cluster, refresh b "redis_server_host": server.ID(), "redis_server_alias": server.GetAlias(), "role": string(server.Role), "read_only": ro, }).Set(float64(1)) + + if server.Role == client.Slave { + if ro == "yes" { + roslave++ + } else { + rwslave++ + } + } } + + roSlaveCount.With(prometheus.Labels{"resource": resource, "shard": shard.Name}).Set(float64(roslave)) + rwSlaveCount.With(prometheus.Labels{"resource": resource, "shard": shard.Name}).Set(float64(rwslave)) } return nil diff --git a/pkg/redis/metrics/sentinel_metrics.go b/pkg/redis/metrics/sentinel_metrics.go index 248e2d9d..9e17c9fe 100644 --- a/pkg/redis/metrics/sentinel_metrics.go +++ b/pkg/redis/metrics/sentinel_metrics.go @@ -40,14 +40,6 @@ var ( }, []string{"sentinel", "shard", "redis_server", "role"}, ) - numSlaves = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "num_slaves", - Namespace: "saas_redis_sentinel", - Help: `"sentinel master num-slaves"`, - }, - []string{"sentinel", "shard", "redis_server", "role"}, - ) numOtherSentinels = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "num_other_sentinels", @@ -79,7 +71,7 @@ var ( func init() { // Register custom metrics with the global prometheus registry metrics.Registry.MustRegister( - linkPendingCommands, lastOkPingReply, roleReportedTime, numSlaves, + linkPendingCommands, lastOkPingReply, roleReportedTime, numOtherSentinels, masterLinkDownTime, slaveReplOffset, ) } @@ -172,7 +164,6 @@ func (smg *SentinelMetricsGatherer) Stop() { linkPendingCommands.Reset() lastOkPingReply.Reset() roleReportedTime.Reset() - numSlaves.Reset() numOtherSentinels.Reset() masterLinkDownTime.Reset() slaveReplOffset.Reset() @@ -199,10 +190,6 @@ func (smg *SentinelMetricsGatherer) gatherMetrics(ctx context.Context) error { "redis_server": fmt.Sprintf("%s:%d", master.IP, master.Port), "role": master.RoleReported, }).Set(float64(master.RoleReportedTime)) - numSlaves.With(prometheus.Labels{"sentinel": smg.sentinelURI, "shard": master.Name, - "redis_server": fmt.Sprintf("%s:%d", master.IP, master.Port), "role": master.RoleReported, - }).Set(float64(master.NumSlaves)) - numOtherSentinels.With(prometheus.Labels{"sentinel": smg.sentinelURI, "shard": master.Name, "redis_server": fmt.Sprintf("%s:%d", master.IP, master.Port), "role": master.RoleReported, }).Set(float64(master.NumOtherSentinels)) @@ -259,7 +246,6 @@ func cleanupMetrics(labels prometheus.Labels) { linkPendingCommands.Delete(labels) lastOkPingReply.Delete(labels) roleReportedTime.Delete(labels) - numSlaves.Delete(labels) numOtherSentinels.Delete(labels) masterLinkDownTime.Delete(labels) slaveReplOffset.Delete(labels)