Skip to content

Commit 5cfdd01

Browse files
authored
Metrics: refactoring, shard state logging (#65)
1 parent 9168eb8 commit 5cfdd01

File tree

4 files changed

+48
-25
lines changed

4 files changed

+48
-25
lines changed

internal/metrics/metrics.go

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,22 @@
11
package metrics
22

3-
import "github.com/prometheus/client_golang/prometheus"
3+
import (
4+
"github.com/prometheus/client_golang/prometheus"
5+
)
46

// Metric name constants. Each one is passed as the Name field of the
// matching prometheus *Opts literal in the var block further down.
57
const (
68
discoveryInstanceDurations = "instance_durations"
79
discoveryClusterDurations = "cluster_durations"
810
shardCriticalLevel = "critical_level"
911
shardState = "state"
// shardStateEvent names the counter vector shardStateCounter.
12+
shardStateEvent = "shard_state_event"
13+
)
14+
// Label name constants shared by the metric vectors and by the helper
// functions that fill prometheus.Labels maps, so both sides spell the keys
// identically.
15+
const (
16+
labelClusterName = "cluster_name"
17+
labelHostName = "hostname"
18+
labelShardState = "shard_state"
19+
labelShardUUID = "shard_uuid"
1020
)
1121

1222
var (
@@ -20,42 +30,49 @@ var (
2030
Name: discoveryInstanceDurations,
2131
Help: "Instance discovery latencies in seconds",
2232
Buckets: discoveryInstanceDurationsBuckets,
23-
}, []string{"cluster_name", "hostname"})
33+
}, []string{labelClusterName, labelHostName})
2434

// discoveryClusterDurationsSum is a histogram vector of cluster discovery
// latencies (seconds, per the Help text), keyed by the cluster_name label and
// bucketed by discoveryClusterDurationsBuckets.
2535
discoveryClusterDurationsSum = prometheus.NewHistogramVec(prometheus.HistogramOpts{
2636
Subsystem: "discovery",
2737
Name: discoveryClusterDurations,
2838
Help: "Cluster discovery latencies in seconds",
2939
Buckets: discoveryClusterDurationsBuckets,
30-
}, []string{"cluster_name"})
40+
}, []string{labelClusterName})
3141

// shardCriticalLevelGauge is a gauge vector holding the critical level of a
// replica set. The added label list is (cluster_name, shard_uuid); the removed
// one additionally carried "master_uri" and used "uuid" as the label name.
3242
shardCriticalLevelGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
3343
Subsystem: "shard",
3444
Name: shardCriticalLevel,
3545
Help: "Critical level of the replica set",
36-
}, []string{"cluster_name", "uuid", "master_uri"})
46+
}, []string{labelClusterName, labelShardUUID})
3747

// shardStateGauge is a gauge vector with one series per
// (cluster_name, shard_uuid, shard_state) combination; per its Help text the
// value is 1 when the shard is in that state and 0 otherwise.
// NOTE(review): the Help text still says "the state label", but the added
// label list names that label via labelShardState ("shard_state") - consider
// updating the Help string to match.
3848
shardStateGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
3949
Subsystem: "shard",
4050
Name: shardState,
4151
Help: "The state of each shard in the cluster; it will have one line for each possible state of each shard. A value of 1 means the shard is in the state specified by the state label, a value of 0 means it is not.",
42-
}, []string{"cluster_name", "uuid", "master_uri", "state"})
52+
}, []string{labelClusterName, labelShardUUID, labelShardState})
4353

// discoveryErrors counts errors raised during discovery. The removed version
// was a CounterVec labelled by "cluster_name" and "uri"; the added version is
// a plain Counter with no labels.
// NOTE(review): without labels the counter can no longer attribute an error
// to a cluster or URI - confirm that this loss of detail is intended.
44-
discoveryErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
54+
discoveryErrors = prometheus.NewCounter(prometheus.CounterOpts{
4555
Subsystem: "discovery",
4656
Name: "errors",
4757
Help: "Errors that happen during discovery process",
48-
}, []string{"cluster_name", "uri"})
58+
})
59+
// shardStateCounter is a counter vector of discovered shard state events,
// keyed by (cluster_name, shard_uuid, shard_state). It is incremented by
// RecordDiscoveredShardState.
// NOTE(review): Prometheus naming guidance suggests a "_total" suffix for
// counters; the name here is "shard_state_event" - confirm this is deliberate.
60+
shardStateCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
61+
Subsystem: "orchestrator",
62+
Name: shardStateEvent,
63+
Help: "Discovered shard state event",
64+
}, []string{labelClusterName, labelShardUUID, labelShardState})
4965
)
5066

// init touches the discovery error counter with a zero increment and then
// registers every collector of this package through prometheus.MustRegister
// (a Must-style helper, so a failed registration is expected to panic).
5167
func init() {
52-
discoveryErrors.With(prometheus.Labels{"cluster_name": "", "uri": ""}).Add(0)
// Add(0) leaves the value unchanged; presumably it is here so the series is
// exported with value 0 before the first error occurs - TODO confirm.
68+
discoveryErrors.Add(0)
5369
prometheus.MustRegister(
5470
discoveryInstanceDurationsSum,
5571
discoveryClusterDurationsSum,
5672
shardCriticalLevelGauge,
5773
shardStateGauge,
5874
discoveryErrors,
75+
shardStateCounter,
5976
)
6077
}
6178

@@ -95,26 +112,30 @@ func StartClusterDiscovery(clusterName string) Transaction {
95112
return txn.Start()
96113
}
97114

// The removed signature took an extra masterURI argument that became a third
// label value.
98-
func SetShardCriticalLevel(clusterName, uuid, masterURI string, level int) {
99-
shardCriticalLevelGauge.WithLabelValues(clusterName, uuid, masterURI).Set(float64(level))
// SetShardCriticalLevel stores level, converted to float64, in
// shardCriticalLevelGauge for the series selected by clusterName and uuid.
// WithLabelValues is positional, so the argument order must match the label
// order given when the gauge vector was created.
115+
func SetShardCriticalLevel(clusterName, uuid string, level int) {
116+
shardCriticalLevelGauge.WithLabelValues(clusterName, uuid).Set(float64(level))
100117
}
101118

// The removed signature also accepted masterURI.
102-
func SetShardState(clusterName, uuid, masterURI, state string, active bool) {
// SetShardState writes 1 (active is true) or 0 (active is false) to the
// shardStateGauge series identified by clusterName, uuid and state.
119+
func SetShardState(clusterName, uuid, state string, active bool) {
// v is the gauge value: 0 by default, 1 when the shard is in this state.
103120
v := float64(0)
104121
if active {
105122
v = 1
106123
}
107124
shardStateGauge.With(prometheus.Labels{
// Removed lines: string-literal label keys, including "master_uri".
108-
"cluster_name": clusterName,
109-
"uuid": uuid,
110-
"master_uri": masterURI,
111-
"state": state,
// Added lines: label keys come from the shared label constants.
125+
labelClusterName: clusterName,
126+
labelShardUUID: uuid,
127+
labelShardState: state,
112128
}).Set(v)
113129
}
114130

// Removed version: incremented a per-(cluster_name, uri) series of the
// discoveryErrors counter vector. Its closing lines are the unchanged
// "}).Inc()" and "}" at the end of this hunk.
115-
func RecordDiscoveryError(clusterName, uri string) {
116-
discoveryErrors.With(prometheus.Labels{
117-
"cluster_name": clusterName,
118-
"uri": uri,
// RecordDiscoveryError increments the unlabelled discoveryErrors counter by
// one.
131+
func RecordDiscoveryError() {
132+
discoveryErrors.Inc()
133+
}
134+
// RecordDiscoveredShardState increments shardStateCounter by one for the
// series identified by clusterName, shardUUID and state.
135+
func RecordDiscoveredShardState(clusterName, shardUUID, state string) {
136+
shardStateCounter.With(prometheus.Labels{
137+
labelClusterName: clusterName,
138+
labelShardUUID: shardUUID,
139+
labelShardState: state,
119140
}).Inc()
120141
}

internal/vshard/cluster.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ func (c *Cluster) Discover() {
292292
conn := c.Connector(router.URI)
293293
resp := conn.Exec(ctx, vshardRouterInfoQuery)
294294
if resp.Error != nil {
295-
metrics.RecordDiscoveryError(c.Name, router.URI)
295+
metrics.RecordDiscoveryError()
296296
c.logger.
297297
Err(resp.Error).
298298
Str("URI", router.URI).
@@ -302,7 +302,7 @@ func (c *Cluster) Discover() {
302302

303303
updatedRI, err := ParseRouterInfo(resp.Data)
304304
if err != nil {
305-
metrics.RecordDiscoveryError(c.Name, router.URI)
305+
metrics.RecordDiscoveryError()
306306
c.logger.Err(err).
307307
Str("URI", router.URI).
308308
Msg("Failed to discover the topology of the cluster using router")
@@ -371,7 +371,7 @@ func (c *Cluster) Discover() {
371371
ns.ReplicaSets = append(ns.ReplicaSets, set)
372372

373373
code, _ := set.HealthStatus()
374-
metrics.SetShardCriticalLevel(c.Name, string(set.UUID), set.MasterURI, int(code))
374+
metrics.SetShardCriticalLevel(c.Name, string(set.UUID), int(code))
375375
c.logDiscoveredReplicaSet(set)
376376
}
377377

@@ -446,7 +446,7 @@ func (c *Cluster) discoverInstance(ctx context.Context, inst *Instance) {
446446
conn := c.Connector(inst.URI)
447447
resp := conn.Exec(ctx, vshardInstanceInfoQuery)
448448
if resp.Error != nil {
449-
metrics.RecordDiscoveryError(c.Name, inst.URI)
449+
metrics.RecordDiscoveryError()
450450
c.logger.Err(resp.Error).
451451
Str("URI", inst.URI).
452452
Str("UUID", string(inst.UUID)).
@@ -457,7 +457,7 @@ func (c *Cluster) discoverInstance(ctx context.Context, inst *Instance) {
457457

458458
info, err := ParseInstanceInfo(resp.Data)
459459
if err != nil {
460-
metrics.RecordDiscoveryError(c.Name, inst.URI)
460+
metrics.RecordDiscoveryError()
461461
c.logger.Err(err).
462462
Str("URI", inst.URI).
463463
Str("UUID", string(inst.UUID)).

internal/vshard/orchestrator/failover.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"github.com/rs/zerolog"
1212
"github.com/viciious/go-tarantool"
1313

14+
"github.com/shmel1k/qumomf/internal/metrics"
1415
"github.com/shmel1k/qumomf/internal/quorum"
1516
"github.com/shmel1k/qumomf/internal/util"
1617
"github.com/shmel1k/qumomf/internal/vshard"
@@ -148,6 +149,7 @@ func (f *failover) checkAndRecover(ctx context.Context, analysis *ReplicationAna
148149
Str("master_uri", analysis.Set.MasterURI).
149150
Logger()
150151
logger.WithLevel(f.sampler.sample(analysis)).Str("analysis", analysis.String()).Msg("checkAndRecover")
152+
metrics.RecordDiscoveredShardState(f.cluster.Name, string(analysis.Set.UUID), string(analysis.State))
151153

152154
recvFunc, desc := f.getCheckAndRecoveryFunc(analysis.State)
153155
if recvFunc == nil {

internal/vshard/orchestrator/monitor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ func (m *storageMonitor) checkCluster(stream AnalysisWriteStream) {
8686

8787
for _, state := range ReplicaSetStateEnum {
8888
active := state == analysis.State
89-
metrics.SetShardState(m.cluster.Name, string(set.UUID), set.MasterURI, string(state), active)
89+
metrics.SetShardState(m.cluster.Name, string(set.UUID), string(state), active)
9090
}
9191
}
9292
}(set)

0 commit comments

Comments
 (0)