From 20ec3f28e1abe6c71e7df27f7fbd1262adbc2fe4 Mon Sep 17 00:00:00 2001 From: Tanjin Xu Date: Thu, 9 Jan 2025 18:35:02 -0800 Subject: [PATCH] Add debugging for stale shard health stats on vtgates --- go/vt/discovery/healthcheck.go | 5 +++++ go/vt/discovery/tablet_health_check.go | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 440737bd696..c26aac00097 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -516,6 +516,9 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ // keyspace and shard are not expected to change, but just in case ... // move this tabletHealthCheck to the correct map oldTargetKey := KeyFromTarget(prevTarget) + + log.Infof("deleting tablet %v from health stats", th.Tablet) + delete(hc.healthData[oldTargetKey], tabletAlias) _, ok := hc.healthData[targetKey] if !ok { @@ -554,6 +557,7 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ alias := tabletAliasString(topoproto.TabletAliasString(healthy[0].Tablet.Alias)) // Clear healthy list for primary if the existing tablet is down if alias == tabletAlias { + log.Warningf("Removing tablet %v from the healthy map.", tabletAlias) hc.healthy[targetKey] = []*TabletHealth{} } } @@ -563,6 +567,7 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ // We re-sort the healthy tablet list whenever we get a health update for tablets we can route to. // Tablets from other cells for non-primary targets should not trigger a re-sort; // they should also be excluded from healthy list. 
if th.Target.TabletType != topodata.TabletType_PRIMARY && hc.isIncluded(th.Target.TabletType, th.Tablet.Alias) { + log.Infof("Recomputing tablet healthy stats for %v", th.Tablet) hc.recomputeHealthy(targetKey) } diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index 05ab47dee05..cd14cc6df88 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -224,6 +224,8 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St } thc.setServingState(serving, reason) + log.Infof("healthcheck update for tablet %v: serving: %v, reason: %s", thc.Tablet, thc.Serving, reason) + // notify downstream for primary change hc.updateHealth(thc.SimpleCopy(), prevTarget, trivialUpdate, thc.Serving) return nil @@ -326,6 +328,7 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { } // trivialUpdate = false because this is an error // up = false because we did not get a healthy response + log.Errorf("healthcheck got error for tablet %v : %v", thc.Tablet, err.Error()) hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false) } // If there was a timeout send an error. We do this after stream has returned. @@ -337,6 +340,7 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1) // trivialUpdate = false because this is an error // up = false because we did not get a healthy response within the timeout + log.Warningf("healthcheck timed out for tablet %v", thc.Tablet) hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false) }