From 71432f6574ce1809cbcc41ffbed0c95e66554d15 Mon Sep 17 00:00:00 2001 From: James Lu Date: Mon, 2 Oct 2023 09:33:18 +0800 Subject: [PATCH] fix(restore): dr volume failed by delete lock A DR volume fails to start an incremental restoration in the target cluster while the source cluster is deleting an obsolete backup under a delete lock in the backup store. The full-restoration case was handled in issue 3055; this change adds a regular expression pattern that also matches the equivalent incremental-restore failure message, so that the backoff mechanism retries both full and incremental restorations. Ref: 6750 Signed-off-by: James Lu --- controller/engine_controller.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/controller/engine_controller.go b/controller/engine_controller.go index 970e5d5339..23c6c5aa35 100644 --- a/controller/engine_controller.go +++ b/controller/engine_controller.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "reflect" + "regexp" "strconv" "strings" "sync" @@ -42,7 +43,7 @@ import ( const ( unknownReplicaPrefix = "UNKNOWN-" - restoreGetLockFailedMsg = "error initiating full backup restore: failed lock" + restoreGetLockFailedPatternMsg = "error initiating (full|incremental) backup restore: failed lock" restoreAlreadyInProgressMsg = "already in progress" restoreAlreadyRestoredBackupMsg = "already restored backup" ) @@ -1483,7 +1484,7 @@ func handleRestoreError(log logrus.FieldLogger, engine *longhorn.Engine, rsMap m continue } - if strings.Contains(re.Error(), restoreGetLockFailedMsg) { + if isReplicaRestoreFailedLockError(&re) { log.WithError(re).Warnf("Ignored failed locked restore error from replica %v", re.Address) // Register the name with a restore backoff entry backoff.Next(engine.Name, time.Now()) @@ -1504,6 +1505,11 @@ func handleRestoreError(log logrus.FieldLogger, engine *longhorn.Engine, rsMap m return nil } +func isReplicaRestoreFailedLockError(err *imclient.ReplicaError) bool { + failedLock := regexp.MustCompile(restoreGetLockFailedPatternMsg) + return 
failedLock.MatchString(err.Error()) +} + func handleRestoreErrorForCompatibleEngine(log logrus.FieldLogger, engine *longhorn.Engine, rsMap map[string]*longhorn.RestoreStatus, backoff *flowcontrol.Backoff, err error) error { taskErr, ok := err.(imclient.TaskError) if !ok { @@ -1517,7 +1523,7 @@ func handleRestoreErrorForCompatibleEngine(log logrus.FieldLogger, engine *longh continue } - if strings.Contains(re.Error(), restoreGetLockFailedMsg) { + if isReplicaRestoreFailedLockError(&re) { log.WithError(re).Warnf("Ignored failed locked restore error from replica %v", re.Address) // Register the name with a restore backoff entry backoff.Next(engine.Name, time.Now())