From 21f853b6c904b31de7fcf786bf3333c38af44be8 Mon Sep 17 00:00:00 2001 From: Derek Su Date: Sat, 24 Jun 2023 20:01:12 +0800 Subject: [PATCH] Try to reattach volume if volume is detached or engine is dead unexpectedly Reattach volume if - volume is detached unexpectedly and there are still healthy replicas - engine is dead unexpectedly and there are still healthy replicas when the volume is not attached Longhorn 6155 Signed-off-by: Derek Su --- constant/events.go | 6 +++--- controller/volume_controller.go | 26 +++++++++++++++----------- manager/volume.go | 2 +- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/constant/events.go b/constant/events.go index 8c10f31a9f..de48a0b3ba 100644 --- a/constant/events.go +++ b/constant/events.go @@ -42,9 +42,9 @@ const ( EventReasonUnknown = "Unknown" EventReasonFailedEviction = "FailedEviction" - EventReasonDetachedUnexpectly = "DetachedUnexpectly" - EventReasonRemount = "Remount" - EventReasonAutoSalvaged = "AutoSalvaged" + EventReasonDetachedUnexpectedly = "DetachedUnexpectedly" + EventReasonRemount = "Remount" + EventReasonAutoSalvaged = "AutoSalvaged" EventReasonFetching = "Fetching" EventReasonFetched = "Fetched" diff --git a/controller/volume_controller.go b/controller/volume_controller.go index d0b25ce315..659fef91c8 100644 --- a/controller/volume_controller.go +++ b/controller/volume_controller.go @@ -1330,19 +1330,23 @@ func (c *VolumeController) ReconcileVolumeState(v *longhorn.Volume, es map[strin return nil } - // reattach volume if detached unexpected and there are still healthy replicas - if e.Status.CurrentState == longhorn.InstanceStateError && v.Status.CurrentNodeID != "" { - log.Warn("Reattaching the volume since engine of volume dead unexpectedly") - msg := fmt.Sprintf("Engine of volume %v dead unexpectedly, reattach the volume", v.Name) - c.eventRecorder.Event(v, corev1.EventTypeWarning, constant.EventReasonDetachedUnexpectly, msg) - e.Spec.LogRequested = true - for _, r := range rs { - if 
r.Status.CurrentState == longhorn.InstanceStateRunning { - r.Spec.LogRequested = true - rs[r.Name] = r + // Reattach volume if + // - volume is detached unexpectedly and there are still healthy replicas + // - engine dead unexpectedly and there are still healthy replicas when the volume is not attached + if e.Status.CurrentState == longhorn.InstanceStateError { + if v.Status.CurrentNodeID != "" || (v.Spec.NodeID != "" && v.Status.CurrentNodeID == "" && v.Status.State != longhorn.VolumeStateAttached) { + log.Warn("Reattaching the volume since engine of volume dead unexpectedly") + msg := fmt.Sprintf("Engine of volume %v dead unexpectedly, reattach the volume", v.Name) + c.eventRecorder.Event(v, corev1.EventTypeWarning, constant.EventReasonDetachedUnexpectedly, msg) + e.Spec.LogRequested = true + for _, r := range rs { + if r.Status.CurrentState == longhorn.InstanceStateRunning { + r.Spec.LogRequested = true + rs[r.Name] = r + } } + v.Status.Robustness = longhorn.VolumeRobustnessFaulted } - v.Status.Robustness = longhorn.VolumeRobustnessFaulted } } diff --git a/manager/volume.go b/manager/volume.go index fa3298c0aa..2130e16a92 100644 --- a/manager/volume.go +++ b/manager/volume.go @@ -754,7 +754,7 @@ func (m *VolumeManager) EngineUpgrade(volumeName, image string) (v *longhorn.Vol return nil, err } if image != defaultEngineImage { - return nil, fmt.Errorf("updrading to %v is not allowed. "+ + return nil, fmt.Errorf("upgrading to %v is not allowed. "+ "Only allow to upgrade to the default engine image %v because the setting "+ "`Concurrent Automatic Engine Upgrade Per Node Limit` is greater than 0", image, defaultEngineImage)