Skip to content

Commit

Permalink
[release-19.0] Fix RemoveTablet during TabletExternallyReparented
Browse files Browse the repository at this point in the history
… causing connection issues (#16371) (#16567)

Signed-off-by: Arthur Schreiber <arthurschreiber@github.com>
Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com>
  • Loading branch information
vitess-bot[bot] authored Aug 9, 2024
1 parent b24a583 commit fe1499d
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 1 deletion.
22 changes: 21 additions & 1 deletion go/vt/discovery/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,20 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) {
// delete from healthy list
healthy, ok := hc.healthy[key]
if ok && len(healthy) > 0 {
hc.recomputeHealthy(key)
if tabletType == topodata.TabletType_PRIMARY {
// If the deleted tablet was a primary,
// and it matches what we think is the current active primary,
// clear the healthy list for the primary.
//
// See the logic in `updateHealth` for more details.
alias := tabletAliasString(topoproto.TabletAliasString(healthy[0].Tablet.Alias))
if alias == tabletAlias {
hc.healthy[key] = []*TabletHealth{}
}
} else {
// Simply recompute the list of healthy tablets for all other tablet types.
hc.recomputeHealthy(key)
}
}
}
}()
Expand Down Expand Up @@ -570,6 +583,13 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ
hc.broadcast(th)
}

// recomputeHealthy recomputes the healthy tablets for the given key.
//
// This filters out tablets that might be healthy, but are not part of the current
// cell or cell alias. It also performs filtering of tablets based on replication lag,
// if configured to do so.
//
// This should not be called for primary tablets.
func (hc *HealthCheckImpl) recomputeHealthy(key KeyspaceShardTabletType) {
all := hc.healthData[key]
allArray := make([]*TabletHealth, 0, len(all))
Expand Down
121 changes: 121 additions & 0 deletions go/vt/discovery/healthcheck_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,127 @@ func TestRemoveTablet(t *testing.T) {
assert.Empty(t, a, "wrong result, expected empty list")
}

// When an external primary failover is performed,
// the demoted primary will advertise itself as a `PRIMARY`
// tablet until it recognizes that it was demoted,
// and until all in-flight operations have either finished
// (successfully or unsuccessfully, see `--shutdown_grace_period` flag).
//
// During this time, operations like `RemoveTablet` should not lead
// to multiple tablets becoming valid targets for `PRIMARY`.
func TestRemoveTabletDuringExternalReparenting(t *testing.T) {
ctx := utils.LeakCheckContext(t)

// reset error counters
hcErrorCounters.ResetAll()
ts := memorytopo.NewServer(ctx, "cell")
defer ts.Close()
hc := createTestHc(ctx, ts)
// close healthcheck
defer hc.Close()

firstTablet := createTestTablet(0, "cell", "a")
firstTablet.Type = topodatapb.TabletType_PRIMARY

secondTablet := createTestTablet(1, "cell", "b")
secondTablet.Type = topodatapb.TabletType_REPLICA

thirdTablet := createTestTablet(2, "cell", "c")
thirdTablet.Type = topodatapb.TabletType_REPLICA

firstTabletHealthStream := make(chan *querypb.StreamHealthResponse)
firstTabletConn := createFakeConn(firstTablet, firstTabletHealthStream)
firstTabletConn.errCh = make(chan error)

secondTabletHealthStream := make(chan *querypb.StreamHealthResponse)
secondTabletConn := createFakeConn(secondTablet, secondTabletHealthStream)
secondTabletConn.errCh = make(chan error)

thirdTabletHealthStream := make(chan *querypb.StreamHealthResponse)
thirdTabletConn := createFakeConn(thirdTablet, thirdTabletHealthStream)
thirdTabletConn.errCh = make(chan error)

resultChan := hc.Subscribe()

hc.AddTablet(firstTablet)
<-resultChan

hc.AddTablet(secondTablet)
<-resultChan

hc.AddTablet(thirdTablet)
<-resultChan

firstTabletPrimaryTermStartTimestamp := time.Now().Unix() - 10

firstTabletHealthStream <- &querypb.StreamHealthResponse{
TabletAlias: firstTablet.Alias,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
Serving: true,

PrimaryTermStartTimestamp: firstTabletPrimaryTermStartTimestamp,
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
}
<-resultChan

secondTabletHealthStream <- &querypb.StreamHealthResponse{
TabletAlias: secondTablet.Alias,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
Serving: true,

PrimaryTermStartTimestamp: 0,
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
}
<-resultChan

thirdTabletHealthStream <- &querypb.StreamHealthResponse{
TabletAlias: thirdTablet.Alias,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
Serving: true,

PrimaryTermStartTimestamp: 0,
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
}
<-resultChan

secondTabletPrimaryTermStartTimestamp := time.Now().Unix()

// Simulate a failover
firstTabletHealthStream <- &querypb.StreamHealthResponse{
TabletAlias: firstTablet.Alias,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
Serving: true,

PrimaryTermStartTimestamp: firstTabletPrimaryTermStartTimestamp,
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
}
<-resultChan

secondTabletHealthStream <- &querypb.StreamHealthResponse{
TabletAlias: secondTablet.Alias,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
Serving: true,

PrimaryTermStartTimestamp: secondTabletPrimaryTermStartTimestamp,
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
}
<-resultChan

hc.RemoveTablet(thirdTablet)

// `secondTablet` should be the primary now
expectedTabletStats := []*TabletHealth{{
Tablet: secondTablet,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
Serving: true,
Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
PrimaryTermStartTime: secondTabletPrimaryTermStartTimestamp,
}}

actualTabletStats := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
mustMatch(t, expectedTabletStats, actualTabletStats, "unexpected result")
}

// TestGetHealthyTablets tests the functionality of GetHealthyTabletStats.
func TestGetHealthyTablets(t *testing.T) {
ctx := utils.LeakCheckContext(t)
Expand Down

0 comments on commit fe1499d

Please sign in to comment.