diff --git a/pkg/tso/global_allocator.go b/pkg/tso/global_allocator.go index a37bcc73881..5678b40278a 100644 --- a/pkg/tso/global_allocator.go +++ b/pkg/tso/global_allocator.go @@ -196,8 +196,21 @@ func (gta *GlobalTSOAllocator) IsInitialize() bool { } // UpdateTSO is used to update the TSO in memory and the time window in etcd. -func (gta *GlobalTSOAllocator) UpdateTSO() error { - return gta.timestampOracle.UpdateTimestamp(gta.member.GetLeadership()) +func (gta *GlobalTSOAllocator) UpdateTSO() (err error) { + // When meet network partition, we need to manually retry to update the global tso, + // next request succeeds with the new endpoint, according to https://github.com/etcd-io/etcd/issues/8711 + maxRetryCount := 3 + for i := 0; i < maxRetryCount; i++ { + err = gta.timestampOracle.UpdateTimestamp(gta.member.GetLeadership()) + if err == nil { + return nil + } + log.Warn("try to update the global tso but failed", errs.ZapError(err)) + // Etcd client retry with roundRobinQuorumBackoff https://github.com/etcd-io/etcd/blob/d62cdeee4863001b09e772ed013eb1342a1d0f89/client/v3/client.go#L488 + // And its default interval is 25ms, so we sleep 50ms here. https://github.com/etcd-io/etcd/blob/d62cdeee4863001b09e772ed013eb1342a1d0f89/client/v3/options.go#L53 + time.Sleep(50 * time.Millisecond) + } + return } // SetTSO sets the physical part with given TSO.