Skip to content

Commit

Permalink
fix(restore): dr volume failed by delete lock
Browse files Browse the repository at this point in the history
A DR volume fails to start an incremental restoration in the target
cluster when the source cluster is deleting the eliminated backup,
because of the delete lock in the backup store.

The full restoration case was handled in issue 3055. This change adds a
regular expression pattern that also matches the similar failure message,
so the backoff mechanism retries both full and incremental restorations.

Ref: 6750

Signed-off-by: James Lu <james.lu@suse.com>
  • Loading branch information
mantissahz authored and David Ko committed Oct 2, 2023
1 parent cec7693 commit 71432f6
Showing 1 changed file with 9 additions and 3 deletions.
12 changes: 9 additions & 3 deletions controller/engine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"reflect"
"regexp"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -42,7 +43,7 @@ import (

// Message fragments used when inspecting replica and restore errors.
const (
// NOTE(review): presumably a name prefix for replicas that cannot be
// identified; its usage is not visible in this excerpt — confirm.
unknownReplicaPrefix = "UNKNOWN-"
// Literal substring of the failed-lock error for a full backup restore;
// checked with strings.Contains against the replica error text.
restoreGetLockFailedMsg = "error initiating full backup restore: failed lock"
// Regular expression source (compiled with the regexp package) matching the
// failed-lock error for either a full or an incremental backup restore.
restoreGetLockFailedPatternMsg = "error initiating (full|incremental) backup restore: failed lock"
// NOTE(review): the two fragments below look like error-message substrings
// for restores that are already running or already done; their usage is not
// visible in this excerpt — confirm.
restoreAlreadyInProgressMsg = "already in progress"
restoreAlreadyRestoredBackupMsg = "already restored backup"
)
Expand Down Expand Up @@ -1483,7 +1484,7 @@ func handleRestoreError(log logrus.FieldLogger, engine *longhorn.Engine, rsMap m
continue
}

if strings.Contains(re.Error(), restoreGetLockFailedMsg) {
if isReplicaRestoreFailedLockError(&re) {
log.WithError(re).Warnf("Ignored failed locked restore error from replica %v", re.Address)
// Register the name with a restore backoff entry
backoff.Next(engine.Name, time.Now())
Expand All @@ -1504,6 +1505,11 @@ func handleRestoreError(log logrus.FieldLogger, engine *longhorn.Engine, rsMap m
return nil
}

func isReplicaRestoreFailedLockError(err *imclient.ReplicaError) bool {
failedLock := regexp.MustCompile(restoreGetLockFailedPatternMsg)
return failedLock.MatchString(err.Error())
}

func handleRestoreErrorForCompatibleEngine(log logrus.FieldLogger, engine *longhorn.Engine, rsMap map[string]*longhorn.RestoreStatus, backoff *flowcontrol.Backoff, err error) error {
taskErr, ok := err.(imclient.TaskError)
if !ok {
Expand All @@ -1517,7 +1523,7 @@ func handleRestoreErrorForCompatibleEngine(log logrus.FieldLogger, engine *longh
continue
}

if strings.Contains(re.Error(), restoreGetLockFailedMsg) {
if isReplicaRestoreFailedLockError(&re) {
log.WithError(re).Warnf("Ignored failed locked restore error from replica %v", re.Address)
// Register the name with a restore backoff entry
backoff.Next(engine.Name, time.Now())
Expand Down

0 comments on commit 71432f6

Please sign in to comment.