From b8e8d7a35951def8639cbfd29f5b096107a3ff50 Mon Sep 17 00:00:00 2001 From: Eric Weber Date: Tue, 6 Aug 2024 19:12:58 +0000 Subject: [PATCH] feat(timeout): ensure all replicas can time out quickly when necessary Longhorn 8711 Signed-off-by: Eric Weber --- pkg/backend/file/file.go | 2 +- pkg/controller/control.go | 31 +++++++++++++++++++------------ pkg/dataconn/client.go | 6 +++++- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/pkg/backend/file/file.go b/pkg/backend/file/file.go index 9c43c3439..b4012fa42 100644 --- a/pkg/backend/file/file.go +++ b/pkg/backend/file/file.go @@ -163,5 +163,5 @@ func (r *Wrapper) GetTimeoutChannel() chan struct{} { } func (r *Wrapper) GetDurationSinceResponse() time.Duration { - return -1 + return time.Duration(0) } diff --git a/pkg/controller/control.go b/pkg/controller/control.go index 43f638417..93601f685 100644 --- a/pkg/controller/control.go +++ b/pkg/controller/control.go @@ -1442,28 +1442,35 @@ func (c *Controller) checkBackendTimeouts(shortTimeout, longTimeout time.Duratio c.RLock() defer c.RUnlock() - if backend := c.backend.backends[addressToTimeOut]; backend.mode == types.RW { - // The last backend we tried to stop via timeout is still not ERR. Don't try another one yet. - // TODO: We could speed this up by checking for durationSinceResponce < 0 instead. + if backend, ok := c.backend.backends[addressToTimeOut]; ok && backend.backend.GetDurationSinceResponse() >= 0 { + // The last backend we tried to stop via timeout hasn't acknowledged it. Don't try another one yet. return addressToTimeOut } addressToTimeOutLong := "" addressToTimeOutShort := "" - rwBackendCount := 0 + backendsNotTimedOut := 0 for address, backend := range c.backend.backends { - if backend.mode == types.RW { - rwBackendCount += 1 - if backend.backend.GetDurationSinceResponse() > longTimeout && addressToTimeOutLong == "" { - addressToTimeOutLong = address - } else if backend.backend.GetDurationSinceResponse() > shortTimeout { - addressToTimeOutShort = address - } + durationSinceResponse := backend.backend.GetDurationSinceResponse() + if durationSinceResponse < 0 { + // This backend has acknowledged our request to time out. + // It would look cleaner to check for ERR versus RW mode here, but a backend won't actually transition to + // ERR until the completion of at least one I/O operation. If, for example, all three backends for an + // engine are unreachable, none of the backends can transition to ERR until all of them have timed out (and + // the operation fails). We want to be able to time out the other two backends in due time instead of + // waiting for some TCP error, etc. + continue + } + backendsNotTimedOut += 1 + if durationSinceResponse > longTimeout && addressToTimeOutLong == "" { + addressToTimeOutLong = address + } else if durationSinceResponse > shortTimeout { + addressToTimeOutShort = address } } if addressToTimeOutLong != "" { addressToTimeOut = addressToTimeOutLong - } else if addressToTimeOutShort != "" && rwBackendCount > 1 { + } else if addressToTimeOutShort != "" && backendsNotTimedOut > 1 { // Only use addressToTimeOutShort if there is another available backend. addressToTimeOut = addressToTimeOutShort } else { diff --git a/pkg/dataconn/client.go b/pkg/dataconn/client.go index 68a845df0..177d98e83 100644 --- a/pkg/dataconn/client.go +++ b/pkg/dataconn/client.go @@ -160,7 +160,7 @@ func (c *Client) loop() { } ioInflight = 0 - c.durationSinceResponse.Store(-1) + c.durationSinceResponse.Store(-1) // Indicate successful timeout to the upper layer. } for { @@ -169,6 +169,10 @@ func (c *Client) loop() { return case <-ticker.C: // Keep the upper layer informed of outstanding I/O times. + if c.durationSinceResponse.Load() < 0 { + // We have already been asked to time out. + continue + } if lastIOTime.IsZero() { c.durationSinceResponse.Store(0) } else {