diff --git a/.github/workflows/static_checks_etc.yml b/.github/workflows/static_checks_etc.yml index 42222b01580..2e7e6f585db 100644 --- a/.github/workflows/static_checks_etc.yml +++ b/.github/workflows/static_checks_etc.yml @@ -205,4 +205,4 @@ jobs: echo 'We wish to maintain a consistent changelog directory, please run `go run ./go/tools/releases/releases.go`, commit and push again.' echo 'Running `go run ./go/tools/releases/releases.go` on CI yields the following changes:' echo "$output" - echo "" \ No newline at end of file + echo "" diff --git a/go/stats/timings.go b/go/stats/timings.go index 9048bedee11..cd51ec77a66 100644 --- a/go/stats/timings.go +++ b/go/stats/timings.go @@ -62,10 +62,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings { return t } -// Reset will clear histograms: used during testing +// Reset will clear histograms and counters: used during testing func (t *Timings) Reset() { t.mu.RLock() t.histograms = make(map[string]*Histogram) + t.totalCount.Set(0) + t.totalTime.Set(0) t.mu.RUnlock() } diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index ba846ebc147..f3b2ce8d2b4 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -68,9 +68,14 @@ type EmergencyReparentOptions struct { // counters for Emergency Reparent Shard var ( - ersCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run") - ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded") - ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed") + // TODO(timvaillancourt): remove legacyERS* gauges in v19+. + legacyERSCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run") + legacyERSSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded") + legacyERSFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed") + + ersCounter = stats.NewCountersWithMultiLabels("emergency_reparent_counts", "Number of times Emergency Reparent Shard has been run", + []string{"Keyspace", "Shard", "Result"}, + ) ) // NewEmergencyReparenter returns a new EmergencyReparenter object, ready to @@ -98,26 +103,33 @@ func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, l // keyspace and shard. func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) { var err error + statsLabels := []string{keyspace, shard} + // First step is to lock the shard for the given operation, if not already locked if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil { var unlock func(*error) opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias) ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction) if err != nil { + ersCounter.Add(append(statsLabels, failureResult), 1) return nil, err } defer unlock(&err) } // dispatch success or failure of ERS + startTime := time.Now() ev := &events.Reparent{} defer func() { + reparentShardOpTimings.Add("EmergencyReparentShard", time.Since(startTime)) switch err { case nil: - ersSuccessCounter.Add(1) + legacyERSSuccessCounter.Add(1) + ersCounter.Add(append(statsLabels, successResult), 1) event.DispatchUpdate(ev, "finished EmergencyReparentShard") default: - ersFailureCounter.Add(1) + legacyERSFailureCounter.Add(1) + ersCounter.Add(append(statsLabels, failureResult), 1) event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error()) } }() @@ -141,7 +153,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) { // log the starting of the operation and increment the counter erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard) - ersCounter.Add(1) + legacyERSCounter.Add(1) var ( stoppedReplicationSnapshot *replicationSnapshot diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go index af7b33a3151..40c3c295f30 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go @@ -2747,10 +2747,12 @@ func TestEmergencyReparenter_waitForAllRelayLogsToApply(t *testing.T) { } } -func TestEmergencyReparenterCounters(t *testing.T) { - ersCounter.Set(0) - ersSuccessCounter.Set(0) - ersFailureCounter.Set(0) +func TestEmergencyReparenterStats(t *testing.T) { + ersCounter.ResetAll() + legacyERSCounter.Reset() + legacyERSSuccessCounter.Reset() + legacyERSFailureCounter.Reset() + reparentShardOpTimings.Reset() emergencyReparentOps := EmergencyReparentOptions{} tmc := &testutil.TabletManagerClient{ @@ -2879,9 +2881,13 @@ func TestEmergencyReparenterCounters(t *testing.T) { require.NoError(t, err) // check the counter values - require.EqualValues(t, 1, ersCounter.Get()) - require.EqualValues(t, 1, ersSuccessCounter.Get()) - require.EqualValues(t, 0, ersFailureCounter.Get()) + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, ersCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 1, "EmergencyReparentShard": 1}, reparentShardOpTimings.Counts()) + + // check the legacy counter values + require.EqualValues(t, 1, legacyERSCounter.Get()) + require.EqualValues(t, 1, legacyERSSuccessCounter.Get()) + require.EqualValues(t, 0, legacyERSFailureCounter.Get()) // set emergencyReparentOps to request a non existent tablet emergencyReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{ @@ -2894,9 +2900,13 @@ func TestEmergencyReparenterCounters(t *testing.T) { require.Error(t, err) // check the counter values - require.EqualValues(t, 2, ersCounter.Get()) - require.EqualValues(t, 1, ersSuccessCounter.Get()) - require.EqualValues(t, 1, ersFailureCounter.Get()) + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, ersCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 2, "EmergencyReparentShard": 2}, reparentShardOpTimings.Counts()) + + // check the legacy counter values + require.EqualValues(t, 2, legacyERSCounter.Get()) + require.EqualValues(t, 1, legacyERSSuccessCounter.Get()) + require.EqualValues(t, 1, legacyERSFailureCounter.Get()) } func TestEmergencyReparenter_findMostAdvanced(t *testing.T) { diff --git a/go/vt/vtctl/reparentutil/planned_reparenter.go b/go/vt/vtctl/reparentutil/planned_reparenter.go index 4f46d234e25..31b222f9df5 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter.go @@ -26,6 +26,7 @@ import ( "vitess.io/vitess/go/event" "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/topo" @@ -39,6 +40,13 @@ import ( "vitess.io/vitess/go/vt/proto/vtrpc" ) +// counters for Planned Reparent Shard +var ( + prsCounter = stats.NewCountersWithMultiLabels("planned_reparent_counts", "Number of times Planned Reparent Shard has been run", + []string{"Keyspace", "Shard", "Result"}, + ) +) + // PlannedReparenter performs PlannedReparentShard operations. type PlannedReparenter struct { ts *topo.Server @@ -88,11 +96,14 @@ func NewPlannedReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, log // both the current and desired primary are reachable and in a good state. func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts PlannedReparentOptions) (*events.Reparent, error) { var err error + statsLabels := []string{keyspace, shard} + if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil { var unlock func(*error) opts.lockAction = pr.getLockAction(opts) ctx, unlock, err = pr.ts.LockShard(ctx, keyspace, shard, opts.lockAction) if err != nil { + prsCounter.Add(append(statsLabels, failureResult), 1) return nil, err } defer unlock(&err) @@ -101,18 +112,23 @@ func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, if opts.NewPrimaryAlias == nil && opts.AvoidPrimaryAlias == nil { shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard) if err != nil { + prsCounter.Add(append(statsLabels, failureResult), 1) return nil, err } opts.AvoidPrimaryAlias = shardInfo.PrimaryAlias } + startTime := time.Now() ev := &events.Reparent{} defer func() { + reparentShardOpTimings.Add("PlannedReparentShard", time.Since(startTime)) switch err { case nil: + prsCounter.Add(append(statsLabels, successResult), 1) event.DispatchUpdate(ev, "finished PlannedReparentShard") default: + prsCounter.Add(append(statsLabels, failureResult), 1) event.DispatchUpdate(ev, "failed PlannedReparentShard: "+err.Error()) } }() diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index 09900c4a624..ec20eb19352 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3719,3 +3719,96 @@ func AssertReparentEventsEqual(t *testing.T, expected *events.Reparent, actual * AssertReparentEventsEqualWithMessage(t, expected, actual, "") } + +func TestPlannedReparenterStats(t *testing.T) { + prsCounter.ResetAll() + reparentShardOpTimings.Reset() + + tmc := &testutil.TabletManagerClient{ + PrimaryPositionResults: map[string]struct { + Position string + Error error + }{ + "zone1-0000000100": { + Position: "position1", + Error: nil, + }, + }, + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000100": nil, + }, + SetReplicationSourceResults: map[string]error{ + "zone1-0000000101": nil, + }, + SetReadWriteResults: map[string]error{ + "zone1-0000000100": nil, + }, + } + shards := []*vtctldatapb.Shard{ + { + Keyspace: "testkeyspace", + Name: "-", + }, + } + tablets := []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + Keyspace: "testkeyspace", + Shard: "-", + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Type: topodatapb.TabletType_REPLICA, + Keyspace: "testkeyspace", + Shard: "-", + }, + } + plannedReparentOps := PlannedReparentOptions{ + NewPrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + } + keyspace := "testkeyspace" + shard := "-" + ts := memorytopo.NewServer("zone1") + + ctx := context.Background() + logger := logutil.NewMemoryLogger() + + testutil.AddShards(ctx, t, ts, shards...) + testutil.AddTablets(ctx, t, ts, &testutil.AddTabletOptions{ + AlsoSetShardPrimary: true, + SkipShardCreation: false, + }, tablets...) + + prp := NewPlannedReparenter(ts, tmc, logger) + // run a successful prs + _, err := prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps) + require.NoError(t, err) + + // check the counter values + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, prsCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 1, "PlannedReparentShard": 1}, reparentShardOpTimings.Counts()) + + // set plannedReparentOps to request a non existent tablet + plannedReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{ + Cell: "bogus", + Uid: 100, + } + + // run a failing prs + _, err = prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps) + require.Error(t, err) + + // check the counter values + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, prsCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 2, "PlannedReparentShard": 2}, reparentShardOpTimings.Counts()) +} diff --git a/go/vt/vtctl/reparentutil/util.go b/go/vt/vtctl/reparentutil/util.go index f4cebc3dd7d..ac51a45c65f 100644 --- a/go/vt/vtctl/reparentutil/util.go +++ b/go/vt/vtctl/reparentutil/util.go @@ -23,6 +23,7 @@ import ( "time" "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" @@ -38,6 +39,12 @@ import ( "vitess.io/vitess/go/vt/proto/vtrpc" ) +var ( + reparentShardOpTimings = stats.NewTimings("reparent_shard_operation_timings", "Timings of reparent shard operations", "Operation") + failureResult = "failure" + successResult = "success" +) + // ChooseNewPrimary finds a tablet that should become a primary after reparent. // The criteria for the new primary-elect are (preferably) to be in the same // cell as the current primary, and to be different from avoidPrimaryAlias. The