Skip to content

Commit

Permalink
feat(timeout): ensure all replicas can time out quickly when necessary
Browse files Browse the repository at this point in the history
Longhorn 8711

Signed-off-by: Eric Weber <eric.weber@suse.com>
  • Loading branch information
ejweber committed Aug 6, 2024
1 parent b50a821 commit b8e8d7a
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 14 deletions.
2 changes: 1 addition & 1 deletion pkg/backend/file/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,5 +163,5 @@ func (r *Wrapper) GetTimeoutChannel() chan struct{} {
}

func (r *Wrapper) GetDurationSinceResponse() time.Duration {
return -1
return time.Duration(0)
}
31 changes: 19 additions & 12 deletions pkg/controller/control.go
Original file line number Diff line number Diff line change
Expand Up @@ -1442,28 +1442,35 @@ func (c *Controller) checkBackendTimeouts(shortTimeout, longTimeout time.Duratio
c.RLock()
defer c.RUnlock()

if backend := c.backend.backends[addressToTimeOut]; backend.mode == types.RW {
// The last backend we tried to stop via timeout is still not ERR. Don't try another one yet.
// TODO: We could speed this up by checking for durationSinceResponse < 0 instead.
if backend, ok := c.backend.backends[addressToTimeOut]; ok && backend.backend.GetDurationSinceResponse() >= 0 {
// The last backend we tried to stop via timeout hasn't acknowledged it. Don't try another one yet.
return addressToTimeOut
}

addressToTimeOutLong := ""
addressToTimeOutShort := ""
rwBackendCount := 0
backendsNotTimedOut := 0
for address, backend := range c.backend.backends {
if backend.mode == types.RW {
rwBackendCount += 1
if backend.backend.GetDurationSinceResponse() > longTimeout && addressToTimeOutLong == "" {
addressToTimeOutLong = address
} else if backend.backend.GetDurationSinceResponse() > shortTimeout {
addressToTimeOutShort = address
}
durationSinceResponse := backend.backend.GetDurationSinceResponse()
if durationSinceResponse < 0 {
// This backend has acknowledged our request to time out.
// It would look cleaner to check for ERR versus RW mode here, but a backend won't actually transition to
// ERR until the completion of at least one I/O operation. If, for example, all three backends for an
// engine are unreachable, none of the backends can transition to ERR until all of them have timed out (and
// the operation fails). We want to be able to time out the other two backends in due time instead of
// waiting for some TCP error, etc.
continue
}
backendsNotTimedOut += 1
if durationSinceResponse > longTimeout && addressToTimeOutLong == "" {
addressToTimeOutLong = address
} else if durationSinceResponse > shortTimeout {
addressToTimeOutShort = address
}
}
if addressToTimeOutLong != "" {
addressToTimeOut = addressToTimeOutLong
} else if addressToTimeOutShort != "" && rwBackendCount > 1 {
} else if addressToTimeOutShort != "" && backendsNotTimedOut > 1 {
// Only use addressToTimeOutShort if there is another available backend.
addressToTimeOut = addressToTimeOutShort
} else {
Expand Down
6 changes: 5 additions & 1 deletion pkg/dataconn/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ func (c *Client) loop() {
}

ioInflight = 0
c.durationSinceResponse.Store(-1)
c.durationSinceResponse.Store(-1) // Indicate successful timeout to the upper layer.
}

for {
Expand All @@ -169,6 +169,10 @@ func (c *Client) loop() {
return
case <-ticker.C:
// Keep the upper layer informed of outstanding I/O times.
if c.durationSinceResponse.Load() < 0 {
// We have already been asked to time out.
continue
}
if lastIOTime.IsZero() {
c.durationSinceResponse.Store(0)
} else {
Expand Down

0 comments on commit b8e8d7a

Please sign in to comment.