Skip to content

Commit

Permalink
Fixes for avoidance of hosts taking backup in PRS & ERS (vitessio#17300)
Browse files Browse the repository at this point in the history
Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com>
  • Loading branch information
ejortegau committed Dec 10, 2024
1 parent 6f406db commit 35ad672
Show file tree
Hide file tree
Showing 12 changed files with 663 additions and 791 deletions.
178 changes: 84 additions & 94 deletions go/vt/proto/replicationdata/replicationdata.pb.go

Large diffs are not rendered by default.

38 changes: 2 additions & 36 deletions go/vt/proto/replicationdata/replicationdata_vtproto.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

959 changes: 469 additions & 490 deletions go/vt/proto/tabletmanagerdata/tabletmanagerdata.pb.go

Large diffs are not rendered by default.

72 changes: 2 additions & 70 deletions go/vt/proto/tabletmanagerdata/tabletmanagerdata_vtproto.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 14 additions & 3 deletions go/vt/vtctl/reparentutil/replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,6 @@ func stopReplicationAndBuildStatusMaps(
logger.Infof("getting replication position from %v", alias)

stopReplicationStatus, err := tmc.StopReplicationAndGetStatus(groupCtx, tabletInfo.Tablet, replicationdatapb.StopReplicationMode_IOTHREADONLY)
m.Lock()
res.tabletsBackupState[alias] = stopReplicationStatus.GetBackupRunning()
m.Unlock()
if err != nil {
sqlErr, isSQLErr := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError)
if isSQLErr && sqlErr != nil && sqlErr.Number() == sqlerror.ERNotReplica {
Expand All @@ -277,6 +274,20 @@ func stopReplicationAndBuildStatusMaps(
err = vterrors.Wrapf(err, "error when getting replication status for alias %v: %v", alias, err)
}
} else {
isTakingBackup := false

// Prefer the most up-to-date information regarding whether the tablet is taking a backup from the After
// replication status, but fall back to the Before status if After is nil.
if stopReplicationStatus.After != nil {
isTakingBackup = stopReplicationStatus.After.BackupRunning
} else if stopReplicationStatus.Before != nil {
isTakingBackup = stopReplicationStatus.Before.BackupRunning
}

m.Lock()
res.tabletsBackupState[alias] = isTakingBackup
m.Unlock()

var sqlThreadRunning bool
// Check if the sql thread was running for the tablet
sqlThreadRunning, err = SQLThreadWasRunning(stopReplicationStatus)
Expand Down
82 changes: 82 additions & 0 deletions go/vt/vtctl/reparentutil/replication_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
waitForAllTablets bool
expectedStatusMap map[string]*replicationdatapb.StopReplicationStatus
expectedPrimaryStatusMap map[string]*replicationdatapb.PrimaryStatus
expectedTakingBackup map[string]bool
expectedTabletsReachable []*topodatapb.Tablet
shouldErr bool
}{
Expand Down Expand Up @@ -375,6 +376,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -443,6 +445,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -527,6 +530,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -621,6 +625,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -714,6 +719,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -786,6 +792,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
Uid: 101,
},
}},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
shouldErr: false,
},
Expand Down Expand Up @@ -847,6 +854,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{
"zone1-0000000100": {
Position: "primary-position-100",
Expand Down Expand Up @@ -921,6 +929,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{}, // zone1-0000000100 fails to demote, so does not appear
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -1044,6 +1053,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
Uid: 101,
},
}},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
shouldErr: false,
},
Expand Down Expand Up @@ -1093,6 +1103,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -1288,8 +1299,78 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
},
}},
stopReplicasTimeout: time.Minute,
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false, "zone1-0000000102": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
shouldErr: false,
}, {
name: "Handle nil replication status After. No segfaulting when determining backup status, and fall back to Before status",
durability: "none",
tmc: &stopReplicationAndBuildStatusMapsTestTMClient{
stopReplicationAndGetStatusResults: map[string]*struct {
StopStatus *replicationdatapb.StopReplicationStatus
Err error
}{
"zone1-0000000100": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429100:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
},
"zone1-0000000101": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
"zone1-0000000101": {
Tablet: &topodatapb.Tablet{
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
},
},
},
ignoredTablets: sets.New[string](),
expectedStatusMap: map[string]*replicationdatapb.StopReplicationStatus{
"zone1-0000000100": {
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429100:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
"zone1-0000000101": {
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": true, "zone1-0000000101": true},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, {
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
}},
shouldErr: false,
},
}

Expand Down Expand Up @@ -1317,6 +1398,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
for idx, tablet := range res.reachableTablets {
assert.True(t, topoproto.IsTabletInList(tablet, tt.expectedTabletsReachable), "TabletsReached[%d] not found - %s", idx, topoproto.TabletAliasString(tablet.Alias))
}
assert.Equal(t, tt.expectedTakingBackup, res.tabletsBackupState)
})
}
}
Expand Down
Loading

0 comments on commit 35ad672

Please sign in to comment.