From 670570d540935eccb6f45c8c42ccd153bcaa6a93 Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Mon, 7 Oct 2024 10:16:34 +0530 Subject: [PATCH] Remember all tablets in VTOrc (#16888) Signed-off-by: Manan Gupta --- go/test/endtoend/vtorc/general/vtorc_test.go | 40 +++++++++++++++++++ go/test/endtoend/vtorc/utils/utils.go | 26 +++++++++++++ go/vt/vtorc/inst/analysis_dao.go | 5 +++ go/vt/vtorc/inst/analysis_dao_test.go | 41 ++++++++++++++++++++ go/vt/vtorc/logic/tablet_discovery.go | 3 -- go/vt/vtorc/logic/tablet_discovery_test.go | 22 +++++++++++ 6 files changed, 134 insertions(+), 3 deletions(-) diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 88cd7b65d63..329601deb0c 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -492,6 +492,46 @@ func TestMultipleDurabilities(t *testing.T) { assert.NotNil(t, primary, "should have elected a primary") } +// TestDrainedTablet tests that we don't forget drained tablets and they still show up in the vtorc output. +func TestDrainedTablet(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + defer cluster.PanicHandler(t) + + // Setup a normal cluster and start vtorc + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{}, 1, "") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] + + // find any replica tablet other than the current primary + var replica *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + if tablet.Alias != curPrimary.Alias { + replica = tablet + break + } + } + require.NotNil(t, replica, "could not find any replica tablet") + + output, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput( + "ChangeTabletType", replica.Alias, "DRAINED") + require.NoError(t, err, "error in changing tablet type output - %s", output) + + // Make sure VTOrc sees the drained tablets and doesn't forget them. + utils.WaitForDrainedTabletInVTOrc(t, vtOrcProcess, 1) + + output, err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput( + "ChangeTabletType", replica.Alias, "REPLICA") + require.NoError(t, err, "error in changing tablet type output - %s", output) + + // We have no drained tablets anymore. Wait for VTOrc to have processed that. + utils.WaitForDrainedTabletInVTOrc(t, vtOrcProcess, 0) +} + // TestDurabilityPolicySetLater tests that VTOrc works even if the durability policy of the keyspace is // set after VTOrc has been started. func TestDurabilityPolicySetLater(t *testing.T) { diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 63500377f47..89847b94605 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -1204,3 +1204,29 @@ func SemiSyncExtensionLoaded(t *testing.T, tablet *cluster.Vttablet) (mysql.Semi return conn.SemiSyncExtensionLoaded() } + +// WaitForDrainedTabletInVTOrc waits for VTOrc to see the specified number of drained tablet. +func WaitForDrainedTabletInVTOrc(t *testing.T, vtorcInstance *cluster.VTOrcProcess, count int) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + ticker := time.NewTicker(100 * time.Millisecond) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + t.Errorf("timed out waiting for drained tablet in VTOrc") + return + case <-ticker.C: + statusCode, res, err := vtorcInstance.MakeAPICall("api/database-state") + if err != nil || statusCode != 200 { + continue + } + found := strings.Count(res, fmt.Sprintf(`"tablet_type": "%d"`, topodatapb.TabletType_DRAINED)) + if found == count { + return + } + } + } +} diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index 0268a8b183a..99df358e330 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -294,6 +294,11 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna return nil } + // We don't want to run any fixes on any non-replica type tablet. + if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) { + return nil + } + primaryTablet := &topodatapb.Tablet{} if str := m.GetString("primary_tablet_info"); str != "" { if err := opts.Unmarshal([]byte(str), primaryTablet); err != nil { diff --git a/go/vt/vtorc/inst/analysis_dao_test.go b/go/vt/vtorc/inst/analysis_dao_test.go index a83e975c747..c061d54ebb3 100644 --- a/go/vt/vtorc/inst/analysis_dao_test.go +++ b/go/vt/vtorc/inst/analysis_dao_test.go @@ -424,6 +424,47 @@ func TestGetReplicationAnalysisDecision(t *testing.T) { keyspaceWanted: "ks", shardWanted: "0", codeWanted: ReplicationStopped, + }, { + name: "No recoveries on drained tablets", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + DurabilityPolicy: "none", + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 3, + CountValidOracleGTIDReplicas: 4, + CountLoggingReplicas: 2, + IsPrimary: 1, + }, { + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_DRAINED, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + DurabilityPolicy: "none", + PrimaryTabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + }, + LastCheckValid: 1, + ReadOnly: 1, + ReplicationStopped: 1, + }}, + keyspaceWanted: "ks", + shardWanted: "0", + codeWanted: NoProblem, }, { name: "ReplicaMisconfigured", info: []*test.InfoForRecoveryAnalysis{{ diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go index 593b846a72e..25820bc5184 100644 --- a/go/vt/vtorc/logic/tablet_discovery.go +++ b/go/vt/vtorc/logic/tablet_discovery.go @@ -204,9 +204,6 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an var wg sync.WaitGroup for _, tabletInfo := range tablets { tablet := tabletInfo.Tablet - if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) { - continue - } tabletAliasString := topoproto.TabletAliasString(tablet.Alias) latestInstances[tabletAliasString] = true old, err := inst.ReadTablet(tabletAliasString) diff --git a/go/vt/vtorc/logic/tablet_discovery_test.go b/go/vt/vtorc/logic/tablet_discovery_test.go index 7acb29dcc5b..bc9eeba1fb7 100644 --- a/go/vt/vtorc/logic/tablet_discovery_test.go +++ b/go/vt/vtorc/logic/tablet_discovery_test.go @@ -200,6 +200,8 @@ func TestRefreshTabletsInKeyspaceShard(t *testing.T) { return nil }) tab100.MysqlPort = 100 + // We refresh once more to ensure we don't affect the next tests since we've made a change again. + refreshTabletsInKeyspaceShard(ctx, keyspace, shard, func(tabletAlias string) {}, false, nil) }() // Let's assume tab100 restarted on a different pod. This would change its tablet hostname and port _, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error { @@ -212,6 +214,26 @@ func TestRefreshTabletsInKeyspaceShard(t *testing.T) { // Also the old tablet should be forgotten verifyRefreshTabletsInKeyspaceShard(t, false, 1, tablets, nil) }) + + t.Run("Replica becomes a drained tablet", func(t *testing.T) { + defer func() { + _, err = ts.UpdateTabletFields(context.Background(), tab101.Alias, func(tablet *topodatapb.Tablet) error { + tablet.Type = topodatapb.TabletType_REPLICA + return nil + }) + tab101.Type = topodatapb.TabletType_REPLICA + // We refresh once more to ensure we don't affect the next tests since we've made a change again. + refreshTabletsInKeyspaceShard(ctx, keyspace, shard, func(tabletAlias string) {}, false, nil) + }() + // A replica tablet can be converted to drained type if it has an errant GTID. + _, err = ts.UpdateTabletFields(context.Background(), tab101.Alias, func(tablet *topodatapb.Tablet) error { + tablet.Type = topodatapb.TabletType_DRAINED + return nil + }) + tab101.Type = topodatapb.TabletType_DRAINED + // We expect 1 tablet to be refreshed since its type has been changed. + verifyRefreshTabletsInKeyspaceShard(t, false, 1, tablets, nil) + }) } func TestShardPrimary(t *testing.T) {