Skip to content

Commit

Permalink
Add debugging for stale shard health stats on vtgates
Browse files (browse the repository at this point in the history)
  • Loading branch information
tanjinx committed Jan 10, 2025
1 parent a03067d commit 20ec3f2
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 0 deletions.
5 changes: 5 additions & 0 deletions go/vt/discovery/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,9 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ
// keyspace and shard are not expected to change, but just in case ...
// move this tabletHealthCheck to the correct map
oldTargetKey := KeyFromTarget(prevTarget)

log.Infof("deleting tablet %v from health stats", th.Tablet)

delete(hc.healthData[oldTargetKey], tabletAlias)
_, ok := hc.healthData[targetKey]
if !ok {
Expand Down Expand Up @@ -554,6 +557,7 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ
alias := tabletAliasString(topoproto.TabletAliasString(healthy[0].Tablet.Alias))
// Clear healthy list for primary if the existing tablet is down
if alias == tabletAlias {
log.Warningf("Removing tablet %v from the healthy map.", tabletAlias)
hc.healthy[targetKey] = []*TabletHealth{}
}
}
Expand All @@ -563,6 +567,7 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ
// We re-sort the healthy tablet list whenever we get a health update for tablets we can route to.
// Tablets from other cells for non-primary targets should not trigger a re-sort;
// they should also be excluded from healthy list.
log.Infof("Recomputing tablet healthy stats for %v", th.Tablet)
if th.Target.TabletType != topodata.TabletType_PRIMARY && hc.isIncluded(th.Target.TabletType, th.Tablet.Alias) {
hc.recomputeHealthy(targetKey)
}
Expand Down
4 changes: 4 additions & 0 deletions go/vt/discovery/tablet_health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St
}
thc.setServingState(serving, reason)

log.Infof("healthcheck update for tablet %v: serving: %v, reason: %s", thc.Tablet, thc.Serving, reason)

// notify downstream for primary change
hc.updateHealth(thc.SimpleCopy(), prevTarget, trivialUpdate, thc.Serving)
return nil
Expand Down Expand Up @@ -326,6 +328,7 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) {
}
// trivialUpdate = false because this is an error
// up = false because we did not get a healthy response
log.Errorf("healthcheck got error for tablet %v : %v", thc.Tablet, err.Error())
hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false)
}
// If there was a timeout send an error. We do this after stream has returned.
Expand All @@ -337,6 +340,7 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) {
hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1)
// trivialUpdate = false because this is an error
// up = false because we did not get a healthy response within the timeout
log.Warningf("healthcheck timed out for tablet %v", thc.Tablet)
hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false)
}

Expand Down

0 comments on commit 20ec3f2

Please sign in to comment.