From a3768493385042b65da343d569daa9e977ebf928 Mon Sep 17 00:00:00 2001 From: James Munson Date: Mon, 19 Aug 2024 10:38:12 -0600 Subject: [PATCH] fix(manager): fix logic for when RWX workload is restarted after node failure Signed-off-by: James Munson --- controller/kubernetes_pod_controller.go | 25 +++++++++++++++---------- controller/share_manager_controller.go | 20 +++----------------- controller/volume_controller.go | 6 +++--- datastore/longhorn.go | 14 ++++++++++++++ 4 files changed, 35 insertions(+), 30 deletions(-) diff --git a/controller/kubernetes_pod_controller.go b/controller/kubernetes_pod_controller.go index b9e2ce3d3d..47a36f992e 100644 --- a/controller/kubernetes_pod_controller.go +++ b/controller/kubernetes_pod_controller.go @@ -209,19 +209,12 @@ func (kc *KubernetesPodController) handleWorkloadPodDeletionIfCSIPluginPodIsDown return nil } - storageNetworkSetting, err := kc.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork) + isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume() if err != nil { - log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetwork) + log.WithError(err).Warnf("%s. Failed to check isStorageNetwork", logAbort) return nil } - - storageNetworkForRWXVolumeEnabled, err := kc.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled) - if err != nil { - log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetworkForRWXVolumeEnabled) - return nil - } - - if !types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled) { + if !isStorageNetworkForRWXVolume { return nil } @@ -476,6 +469,11 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod * return err } + isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume() + if err != nil { + kc.logger.WithError(err).Warn("Failed to check isStorageNetwork, assuming not") + } + // Only delete pod which has startTime < vol.Status.RemountRequestAt AND timeNow > vol.Status.RemountRequestAt + delayDuration // The delayDuration is to make sure that we don't repeatedly delete the pod too fast // when vol.Status.RemountRequestAt is updated too quickly by volumeController @@ -493,6 +491,13 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod * if vol.Status.RemountRequestedAt == "" { continue } + + // NFS clients can generally recover without a restart/remount when the NFS server restarts using the same Cluster IP. + // A remount is required when the storage network for RWX is in use because the new NFS server has a different IP. + if isRegularRWXVolume(vol) && !isStorageNetworkForRWXVolume { + continue + } + remountRequestedAt, err := time.Parse(time.RFC3339, vol.Status.RemountRequestedAt) if err != nil { return err diff --git a/controller/share_manager_controller.go b/controller/share_manager_controller.go index 1f2e16dcc6..c232d58f11 100644 --- a/controller/share_manager_controller.go +++ b/controller/share_manager_controller.go @@ -451,7 +451,7 @@ func (c *ShareManagerController) syncShareManagerEndpoint(sm *longhorn.ShareMana return nil } - storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume() + storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume() if err != nil { return err } @@ -1045,20 +1045,6 @@ func (c *ShareManagerController) getShareManagerTolerationsFromStorageClass(sc * return tolerations } -func (c *ShareManagerController) isStorageNetworkForRWXVolume() (bool, error) { - storageNetwork, err := c.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork) - if err != nil { - return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetwork) - } - - storageNetworkForRWXVolumeEnabled, err := c.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled) - if err != nil { - return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetworkForRWXVolumeEnabled) - } - - return types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled), nil -} - func (c *ShareManagerController) checkStorageNetworkApplied() (bool, error) { targetSettings := []types.SettingName{types.SettingNameStorageNetwork, types.SettingNameStorageNetworkForRWXVolumeEnabled} for _, item := range targetSettings { @@ -1091,7 +1077,7 @@ func (c *ShareManagerController) canCleanupService(shareManagerName string) (boo return false, nil } - storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume() + storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume() if err != nil { return false, err } @@ -1362,7 +1348,7 @@ func (c *ShareManagerController) createServiceManifest(sm *longhorn.ShareManager log := getLoggerForShareManager(c.logger, sm) - storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume() + storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume() if err != nil { log.WithError(err).Warnf("Failed to check storage network for RWX volume") } diff --git a/controller/volume_controller.go b/controller/volume_controller.go index 83f4b041e9..4f8a2ecb85 100644 --- a/controller/volume_controller.go +++ b/controller/volume_controller.go @@ -4509,9 +4509,9 @@ func (c *VolumeController) ReconcileShareManagerState(volume *longhorn.Volume) e log.Infof("Updated image for share manager from %v to %v", sm.Spec.Image, c.smImage) } - // kill the workload pods, when the share manager goes into error state - // easiest approach is to set the RemountRequestedAt variable, - // since that is already responsible for killing the workload pods + // Give the workload pods a chance to restart when the share manager goes into error state. + // Easiest approach is to set the RemountRequestedAt variable. Pods will make that decision + // in the kubernetes_pod_controller. if sm.Status.State == longhorn.ShareManagerStateError || sm.Status.State == longhorn.ShareManagerStateUnknown { volume.Status.RemountRequestedAt = c.nowHandler() msg := fmt.Sprintf("Volume %v requested remount at %v", volume.Name, volume.Status.RemountRequestedAt) diff --git a/datastore/longhorn.go b/datastore/longhorn.go index d7cd2b6b37..e18d67a985 100644 --- a/datastore/longhorn.go +++ b/datastore/longhorn.go @@ -5576,3 +5576,17 @@ func (s *DataStore) IsPVMountOptionReadOnly(volume *longhorn.Volume) (bool, erro } return false, nil } + +func (s *DataStore) IsStorageNetworkForRWXVolume() (bool, error) { + storageNetworkSetting, err := s.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork) + if err != nil { + return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetwork) + } + + storageNetworkForRWXVolumeEnabled, err := s.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled) + if err != nil { + return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetworkForRWXVolumeEnabled) + } + + return types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled), nil +}