Skip to content

Commit

Permalink
Fix errant GTID detection in VTOrc to also work with a replica not co…
Browse files Browse the repository at this point in the history
…nnected to any primary (#17267)

Signed-off-by: Manan Gupta <manan@planetscale.com>
  • Loading branch information
GuptaManan100 authored Nov 25, 2024
1 parent 0b51839 commit 072d1fa
Show file tree
Hide file tree
Showing 3 changed files with 229 additions and 29 deletions.
58 changes: 58 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,64 @@ func TestPrimaryElection(t *testing.T) {
require.Len(t, res.Rows, 1, "There should only be 1 primary tablet which was elected")
}

// TestErrantGTIDOnPreviousPrimary tests that VTOrc is able to detect errant GTIDs on a previously demoted primary
// if it has an errant GTID.
func TestErrantGTIDOnPreviousPrimary(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, []string{"--change-tablets-with-errant-gtid-to-drained"}, cluster.VTOrcConfiguration{}, 1, "")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]

// find primary from topo
curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

var replica, otherReplica *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
// we know we have only two tablets, so the "other" one must be the new primary
if tablet.Alias != curPrimary.Alias {
if replica == nil {
replica = tablet
} else {
otherReplica = tablet
}
}
}
require.NotNil(t, replica, "should be able to find a replica")
require.NotNil(t, otherReplica, "should be able to find 2nd replica")
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second)

// Disable global recoveries for the cluster.
vtOrcProcess.DisableGlobalRecoveries(t)

// Run PRS to promote a different replica.
output, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput(
"PlannedReparentShard",
fmt.Sprintf("%s/%s", keyspace.Name, shard0.Name),
"--new-primary", replica.Alias)
require.NoError(t, err, "error in PlannedReparentShard output - %s", output)

// Stop replicatin on the previous primary to simulate it not reparenting properly.
// Also insert an errant GTID on the previous primary.
err = utils.RunSQLs(t, []string{
"STOP REPLICA",
"RESET REPLICA ALL",
"set global read_only=OFF",
"insert into vt_ks.vt_insert_test(id, msg) values (10173, 'test 178342')",
}, curPrimary, "")
require.NoError(t, err)

// Wait for VTOrc to detect the errant GTID and change the tablet to a drained type.
vtOrcProcess.EnableGlobalRecoveries(t)

// Wait for the tablet to be drained.
utils.WaitForTabletType(t, curPrimary, "drained")
}

// Cases to test:
// 1. create cluster with 1 replica and 1 rdonly, let orc choose primary
// verify rdonly is not elected, only replica
Expand Down
82 changes: 53 additions & 29 deletions go/vt/vtorc/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,35 +357,7 @@ Cleanup:
// Add replication group ancestry UUID as well. Otherwise, VTOrc thinks there are errant GTIDs in group
// members and its replicas, even though they are not.
instance.AncestryUUID = strings.Trim(instance.AncestryUUID, ",")
if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" {
// Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID.
// This is because vtorc may pool primary and replica at an inconvenient timing,
// such that the replica may _seems_ to have more entries than the primary, when in fact
// it's just that the primary's probing is stale.
redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet)
for _, uuid := range strings.Split(instance.AncestryUUID, ",") {
if uuid != instance.ServerUUID {
redactedExecutedGtidSet.RemoveUUID(uuid)
}
if instance.IsCoPrimary && uuid == instance.ServerUUID {
// If this is a co-primary, then this server is likely to show its own generated GTIDs as errant,
// because its co-primary has not applied them yet
redactedExecutedGtidSet.RemoveUUID(uuid)
}
}
// Avoid querying the database if there's no point:
if !redactedExecutedGtidSet.IsEmpty() {
redactedPrimaryExecutedGtidSet, _ := NewOracleGtidSet(instance.primaryExecutedGtidSet)
redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID)

instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String())
if err == nil {
var gtidCount int64
gtidCount, err = replication.GTIDCount(instance.GtidErrant)
currentErrantGTIDCount.Set(tabletAlias, gtidCount)
}
}
}
err = detectErrantGTIDs(tabletAlias, instance, tablet)
}

latency.Stop("instance")
Expand All @@ -412,6 +384,58 @@ Cleanup:
return nil, err
}

// detectErrantGTIDs detects the errant GTIDs on an instance.
func detectErrantGTIDs(tabletAlias string, instance *Instance, tablet *topodatapb.Tablet) (err error) {
// If the tablet is not replicating from anyone, then it could be the previous primary.
// We should check for errant GTIDs by finding the difference with the shard's current primary.
if instance.primaryExecutedGtidSet == "" && instance.SourceHost == "" {
var primaryInstance *Instance
primaryAlias, _, _ := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard)
if primaryAlias != "" {
primaryInstance, _, _ = ReadInstance(primaryAlias)
}
// Only run errant GTID detection, if we are sure that the data read of the current primary
// is up-to-date enough to reflect that it has been promoted. This is needed to prevent
// flagging incorrect errant GTIDs. If we were to use old data, we could have some GTIDs
// accepted by the old primary (this tablet) that don't show in the new primary's set.
if primaryInstance != nil {
if primaryInstance.SourceHost == "" {
instance.primaryExecutedGtidSet = primaryInstance.ExecutedGtidSet
}
}
}
if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" {
// Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID.
// This is because vtorc may pool primary and replica at an inconvenient timing,
// such that the replica may _seems_ to have more entries than the primary, when in fact
// it's just that the primary's probing is stale.
redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet)
for _, uuid := range strings.Split(instance.AncestryUUID, ",") {
if uuid != instance.ServerUUID {
redactedExecutedGtidSet.RemoveUUID(uuid)
}
if instance.IsCoPrimary && uuid == instance.ServerUUID {
// If this is a co-primary, then this server is likely to show its own generated GTIDs as errant,
// because its co-primary has not applied them yet
redactedExecutedGtidSet.RemoveUUID(uuid)
}
}
// Avoid querying the database if there's no point:
if !redactedExecutedGtidSet.IsEmpty() {
redactedPrimaryExecutedGtidSet, _ := NewOracleGtidSet(instance.primaryExecutedGtidSet)
redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID)

instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String())
if err == nil {
var gtidCount int64
gtidCount, err = replication.GTIDCount(instance.GtidErrant)
currentErrantGTIDCount.Set(tabletAlias, gtidCount)
}
}
}
return err
}

// getKeyspaceShardName returns a single string having both the keyspace and shard
func getKeyspaceShardName(keyspace, shard string) string {
return fmt.Sprintf("%v:%v", keyspace, shard)
Expand Down
118 changes: 118 additions & 0 deletions go/vt/vtorc/inst/instance_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"vitess.io/vitess/go/vt/external/golib/sqlutils"
"vitess.io/vitess/go/vt/log"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/vtorc/config"
"vitess.io/vitess/go/vt/vtorc/db"
Expand Down Expand Up @@ -773,3 +774,120 @@ func TestExpireTableData(t *testing.T) {
})
}
}

func TestDetectErrantGTIDs(t *testing.T) {
tests := []struct {
name string
instance *Instance
primaryInstance *Instance
wantErr bool
wantErrantGTID string
}{
{
name: "No errant GTIDs",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34",
primaryExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10591,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff,230ea8ea-81e3-11e4-972a-e25ec4bd140a",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
SourceUUID: "230ea8ea-81e3-11e4-972a-e25ec4bd140a",
},
}, {
name: "Errant GTIDs on replica",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:34",
primaryExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10591,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff,230ea8ea-81e3-11e4-972a-e25ec4bd140a",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
SourceUUID: "230ea8ea-81e3-11e4-972a-e25ec4bd140a",
},
wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:34",
},
{
name: "No errant GTIDs on old primary",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
},
primaryInstance: &Instance{
SourceHost: "",
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
},
},
{
name: "Errant GTIDs on old primary",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-342",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
},
primaryInstance: &Instance{
SourceHost: "",
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
},
wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:342",
}, {
name: "Old information for new primary",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-342",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
},
primaryInstance: &Instance{
SourceHost: "localhost",
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-311",
},
},
}

keyspaceName := "ks"
shardName := "0"
tablet := &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone-1",
Uid: 100,
},
Keyspace: keyspaceName,
Shard: shardName,
}
primaryTablet := &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone-1",
Uid: 100,
},
Keyspace: keyspaceName,
Shard: shardName,
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Clear the database after the test. The easiest way to do that is to run all the initialization commands again.
defer func() {
db.ClearVTOrcDatabase()
}()
db.ClearVTOrcDatabase()

// Save shard record for the primary tablet.
err := SaveShard(topo.NewShardInfo(keyspaceName, shardName, &topodatapb.Shard{
PrimaryAlias: primaryTablet.Alias,
}, nil))
require.NoError(t, err)

if tt.primaryInstance != nil {
tt.primaryInstance.InstanceAlias = topoproto.TabletAliasString(primaryTablet.Alias)
err = SaveTablet(primaryTablet)
require.NoError(t, err)
err = WriteInstance(tt.primaryInstance, true, nil)
require.NoError(t, err)
}

err = detectErrantGTIDs(topoproto.TabletAliasString(tablet.Alias), tt.instance, tablet)
if tt.wantErr {
require.Error(t, err)
return
}
require.NoError(t, err)
require.EqualValues(t, tt.wantErrantGTID, tt.instance.GtidErrant)
})
}
}

0 comments on commit 072d1fa

Please sign in to comment.