From f05f9c0391475a9df16006a9dd52bfac3270ce7c Mon Sep 17 00:00:00 2001 From: James Munson Date: Mon, 19 Aug 2024 10:38:12 -0600 Subject: [PATCH] fix(manager): fix logic for when RWX workload is restarted after node failure Signed-off-by: James Munson (cherry picked from commit a3768493385042b65da343d569daa9e977ebf928) # Conflicts: # datastore/longhorn.go --- controller/kubernetes_pod_controller.go | 25 ++++++++++-------- controller/share_manager_controller.go | 20 +++------------ controller/volume_controller.go | 6 ++--- datastore/longhorn.go | 34 +++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 30 deletions(-) diff --git a/controller/kubernetes_pod_controller.go b/controller/kubernetes_pod_controller.go index cc91d5cd96..762e332764 100644 --- a/controller/kubernetes_pod_controller.go +++ b/controller/kubernetes_pod_controller.go @@ -209,19 +209,12 @@ func (kc *KubernetesPodController) handleWorkloadPodDeletionIfCSIPluginPodIsDown return nil } - storageNetworkSetting, err := kc.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork) + isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume() if err != nil { - log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetwork) + log.WithError(err).Warnf("%s. Failed to check isStorageNetwork", logAbort) return nil } - - storageNetworkForRWXVolumeEnabled, err := kc.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled) - if err != nil { - log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetworkForRWXVolumeEnabled) - return nil - } - - if !types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled) { + if !isStorageNetworkForRWXVolume { return nil } @@ -476,6 +469,11 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod * return err } + isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume() + if err != nil { + kc.logger.WithError(err).Warn("Failed to check isStorageNetwork, assuming not") + } + // Only delete pod which has startTime < vol.Status.RemountRequestAt AND timeNow > vol.Status.RemountRequestAt + delayDuration // The delayDuration is to make sure that we don't repeatedly delete the pod too fast // when vol.Status.RemountRequestAt is updated too quickly by volumeController @@ -493,6 +491,13 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod * if vol.Status.RemountRequestedAt == "" { continue } + + // NFS clients can generally recover without a restart/remount when the NFS server restarts using the same Cluster IP. + // A remount is required when the storage network for RWX is in use because the new NFS server has a different IP. + if isRegularRWXVolume(vol) && !isStorageNetworkForRWXVolume { + continue + } + remountRequestedAt, err := time.Parse(time.RFC3339, vol.Status.RemountRequestedAt) if err != nil { return err diff --git a/controller/share_manager_controller.go b/controller/share_manager_controller.go index 1f2e16dcc6..c232d58f11 100644 --- a/controller/share_manager_controller.go +++ b/controller/share_manager_controller.go @@ -451,7 +451,7 @@ func (c *ShareManagerController) syncShareManagerEndpoint(sm *longhorn.ShareMana return nil } - storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume() + storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume() if err != nil { return err } @@ -1045,20 +1045,6 @@ func (c *ShareManagerController) getShareManagerTolerationsFromStorageClass(sc * return tolerations } -func (c *ShareManagerController) isStorageNetworkForRWXVolume() (bool, error) { - storageNetwork, err := c.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork) - if err != nil { - return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetwork) - } - - storageNetworkForRWXVolumeEnabled, err := c.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled) - if err != nil { - return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetworkForRWXVolumeEnabled) - } - - return types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled), nil -} - func (c *ShareManagerController) checkStorageNetworkApplied() (bool, error) { targetSettings := []types.SettingName{types.SettingNameStorageNetwork, types.SettingNameStorageNetworkForRWXVolumeEnabled} for _, item := range targetSettings { @@ -1091,7 +1077,7 @@ func (c *ShareManagerController) canCleanupService(shareManagerName string) (boo return false, nil } - storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume() + storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume() if err != nil { return false, err } @@ -1362,7 +1348,7 @@ func (c *ShareManagerController) createServiceManifest(sm *longhorn.ShareManager log := getLoggerForShareManager(c.logger, sm) - storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume() + storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume() if err != nil { log.WithError(err).Warnf("Failed to check storage network for RWX volume") } diff --git a/controller/volume_controller.go b/controller/volume_controller.go index b1dbdc92a6..147248f843 100644 --- a/controller/volume_controller.go +++ b/controller/volume_controller.go @@ -4524,9 +4524,9 @@ func (c *VolumeController) ReconcileShareManagerState(volume *longhorn.Volume) e log.Infof("Updated image for share manager from %v to %v", sm.Spec.Image, c.smImage) } - // kill the workload pods, when the share manager goes into error state - // easiest approach is to set the RemountRequestedAt variable, - // since that is already responsible for killing the workload pods + // Give the workload pods a chance to restart when the share manager goes into error state. + // Easiest approach is to set the RemountRequestedAt variable. Pods will make that decision + // in the kubernetes_pod_controller. if sm.Status.State == longhorn.ShareManagerStateError || sm.Status.State == longhorn.ShareManagerStateUnknown { volume.Status.RemountRequestedAt = c.nowHandler() msg := fmt.Sprintf("Volume %v requested remount at %v", volume.Name, volume.Status.RemountRequestedAt) diff --git a/datastore/longhorn.go b/datastore/longhorn.go index d7ce3a709e..f477ea1316 100644 --- a/datastore/longhorn.go +++ b/datastore/longhorn.go @@ -5517,3 +5517,37 @@ func (s *DataStore) GetOneBackingImageReadyNodeDisk(backingImage *longhorn.Backi return nil, "", fmt.Errorf("failed to find one ready backing image %v", backingImage.Name) } + +func (s *DataStore) IsPVMountOptionReadOnly(volume *longhorn.Volume) (bool, error) { + kubeStatus := volume.Status.KubernetesStatus + if kubeStatus.PVName == "" { + return false, nil + } + + pv, err := s.GetPersistentVolumeRO(kubeStatus.PVName) + if err != nil { + return false, err + } + if pv != nil { + for _, opt := range pv.Spec.MountOptions { + if opt == "ro" { + return true, nil + } + } + } + return false, nil +} + +func (s *DataStore) IsStorageNetworkForRWXVolume() (bool, error) { + storageNetworkSetting, err := s.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork) + if err != nil { + return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetwork) + } + + storageNetworkForRWXVolumeEnabled, err := s.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled) + if err != nil { + return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetworkForRWXVolumeEnabled) + } + + return types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled), nil +}