From 16fa7a3641af3cbfbce7525c57e4dc462e2c555b Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:10:01 +0530 Subject: [PATCH] Errant GTID detection fix for VTOrc (#17287) Signed-off-by: Manan Gupta --- go/vt/vtorc/inst/instance_dao.go | 11 ++++-- go/vt/vtorc/inst/instance_dao_test.go | 49 +++++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index c8ff218710f..45291394c56 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -362,7 +362,7 @@ Cleanup: // Add replication group ancestry UUID as well. Otherwise, VTOrc thinks there are errant GTIDs in group // members and its replicas, even though they are not. instance.AncestryUUID = strings.Trim(instance.AncestryUUID, ",") - err = detectErrantGTIDs(tabletAlias, instance, tablet) + err = detectErrantGTIDs(instance, tablet) } latency.Stop("instance") @@ -390,13 +390,18 @@ Cleanup: } // detectErrantGTIDs detects the errant GTIDs on an instance. -func detectErrantGTIDs(tabletAlias string, instance *Instance, tablet *topodatapb.Tablet) (err error) { +func detectErrantGTIDs(instance *Instance, tablet *topodatapb.Tablet) (err error) { // If the tablet is not replicating from anyone, then it could be the previous primary. // We should check for errant GTIDs by finding the difference with the shard's current primary. if instance.primaryExecutedGtidSet == "" && instance.SourceHost == "" { var primaryInstance *Instance primaryAlias, _, _ := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard) if primaryAlias != "" { + // Check if the current tablet is the primary. + // If it is, then we don't need to run errant gtid detection on it. + if primaryAlias == instance.InstanceAlias { + return nil + } primaryInstance, _, _ = ReadInstance(primaryAlias) } // Only run errant GTID detection, if we are sure that the data read of the current primary @@ -434,7 +439,7 @@ func detectErrantGTIDs(tabletAlias string, instance *Instance, tablet *topodatap if err == nil { var gtidCount int64 gtidCount, err = replication.GTIDCount(instance.GtidErrant) - currentErrantGTIDCount.Set(tabletAlias, gtidCount) + currentErrantGTIDCount.Set(instance.InstanceAlias, gtidCount) } } } diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index f248ded5e2b..0d59de0588f 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -854,7 +854,7 @@ func TestDetectErrantGTIDs(t *testing.T) { primaryTablet := &topodatapb.Tablet{ Alias: &topodatapb.TabletAlias{ Cell: "zone-1", - Uid: 100, + Uid: 101, }, Keyspace: keyspaceName, Shard: shardName, @@ -881,7 +881,8 @@ func TestDetectErrantGTIDs(t *testing.T) { require.NoError(t, err) } - err = detectErrantGTIDs(topoproto.TabletAliasString(tablet.Alias), tt.instance, tablet) + tt.instance.InstanceAlias = topoproto.TabletAliasString(tablet.Alias) + err = detectErrantGTIDs(tt.instance, tablet) if tt.wantErr { require.Error(t, err) return @@ -891,3 +892,47 @@ func TestDetectErrantGTIDs(t *testing.T) { }) } } + +// TestPrimaryErrantGTIDs tests that we don't run Errant GTID detection on the primary tablet itself! +func TestPrimaryErrantGTIDs(t *testing.T) { + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + db.ClearVTOrcDatabase() + keyspaceName := "ks" + shardName := "0" + tablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone-1", + Uid: 100, + }, + Keyspace: keyspaceName, + Shard: shardName, + } + instance := &Instance{ + SourceHost: "", + ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341", + InstanceAlias: topoproto.TabletAliasString(tablet.Alias), + } + + // Save shard record for the primary tablet. + err := SaveShard(topo.NewShardInfo(keyspaceName, shardName, &topodatapb.Shard{ + PrimaryAlias: tablet.Alias, + }, nil)) + require.NoError(t, err) + + // Store the tablet record and the instance. + err = SaveTablet(tablet) + require.NoError(t, err) + err = WriteInstance(instance, true, nil) + require.NoError(t, err) + + // After this if we read a new information for the record that updates its + // gtid set further, we shouldn't be detecting errant GTIDs on it since it is the primary! + // We shouldn't be comparing it with a previous version of itself! + instance.ExecutedGtidSet = "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-351" + err = detectErrantGTIDs(instance, tablet) + require.NoError(t, err) + require.EqualValues(t, "", instance.GtidErrant) +}