Skip to content

Commit

Permalink
fix(manager): fix logic for when RWX workload is restarted after node…
Browse files Browse the repository at this point in the history
… failure

Signed-off-by: James Munson <james.munson@suse.com>
(cherry picked from commit a376849)

# Conflicts:
#	datastore/longhorn.go
  • Loading branch information
james-munson committed Sep 18, 2024
1 parent bc46b6d commit f05f9c0
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 30 deletions.
25 changes: 15 additions & 10 deletions controller/kubernetes_pod_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,19 +209,12 @@ func (kc *KubernetesPodController) handleWorkloadPodDeletionIfCSIPluginPodIsDown
return nil
}

storageNetworkSetting, err := kc.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume()
if err != nil {
log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetwork)
log.WithError(err).Warnf("%s. Failed to check isStorageNetwork", logAbort)
return nil
}

storageNetworkForRWXVolumeEnabled, err := kc.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
if err != nil {
log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetworkForRWXVolumeEnabled)
return nil
}

if !types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled) {
if !isStorageNetworkForRWXVolume {
return nil
}

Expand Down Expand Up @@ -476,6 +469,11 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod *
return err
}

isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume()
if err != nil {
kc.logger.WithError(err).Warn("Failed to check isStorageNetwork, assuming not")
}

// Only delete pod which has startTime < vol.Status.RemountRequestAt AND timeNow > vol.Status.RemountRequestAt + delayDuration
// The delayDuration is to make sure that we don't repeatedly delete the pod too fast
// when vol.Status.RemountRequestAt is updated too quickly by volumeController
Expand All @@ -493,6 +491,13 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod *
if vol.Status.RemountRequestedAt == "" {
continue
}

// NFS clients can generally recover without a restart/remount when the NFS server restarts using the same Cluster IP.
// A remount is required when the storage network for RWX is in use because the new NFS server has a different IP.
if isRegularRWXVolume(vol) && !isStorageNetworkForRWXVolume {
continue
}

remountRequestedAt, err := time.Parse(time.RFC3339, vol.Status.RemountRequestedAt)
if err != nil {
return err
Expand Down
20 changes: 3 additions & 17 deletions controller/share_manager_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ func (c *ShareManagerController) syncShareManagerEndpoint(sm *longhorn.ShareMana
return nil
}

storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume()
if err != nil {
return err
}
Expand Down Expand Up @@ -1045,20 +1045,6 @@ func (c *ShareManagerController) getShareManagerTolerationsFromStorageClass(sc *
return tolerations
}

func (c *ShareManagerController) isStorageNetworkForRWXVolume() (bool, error) {
storageNetwork, err := c.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
if err != nil {
return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetwork)
}

storageNetworkForRWXVolumeEnabled, err := c.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
if err != nil {
return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetworkForRWXVolumeEnabled)
}

return types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled), nil
}

func (c *ShareManagerController) checkStorageNetworkApplied() (bool, error) {
targetSettings := []types.SettingName{types.SettingNameStorageNetwork, types.SettingNameStorageNetworkForRWXVolumeEnabled}
for _, item := range targetSettings {
Expand Down Expand Up @@ -1091,7 +1077,7 @@ func (c *ShareManagerController) canCleanupService(shareManagerName string) (boo
return false, nil
}

storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume()
if err != nil {
return false, err
}
Expand Down Expand Up @@ -1362,7 +1348,7 @@ func (c *ShareManagerController) createServiceManifest(sm *longhorn.ShareManager

log := getLoggerForShareManager(c.logger, sm)

storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume()
if err != nil {
log.WithError(err).Warnf("Failed to check storage network for RWX volume")
}
Expand Down
6 changes: 3 additions & 3 deletions controller/volume_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4524,9 +4524,9 @@ func (c *VolumeController) ReconcileShareManagerState(volume *longhorn.Volume) e
log.Infof("Updated image for share manager from %v to %v", sm.Spec.Image, c.smImage)
}

// kill the workload pods, when the share manager goes into error state
// easiest approach is to set the RemountRequestedAt variable,
// since that is already responsible for killing the workload pods
// Give the workload pods a chance to restart when the share manager goes into error state.
// Easiest approach is to set the RemountRequestedAt variable. Pods will make that decision
// in the kubernetes_pod_controller.
if sm.Status.State == longhorn.ShareManagerStateError || sm.Status.State == longhorn.ShareManagerStateUnknown {
volume.Status.RemountRequestedAt = c.nowHandler()
msg := fmt.Sprintf("Volume %v requested remount at %v", volume.Name, volume.Status.RemountRequestedAt)
Expand Down
34 changes: 34 additions & 0 deletions datastore/longhorn.go
Original file line number Diff line number Diff line change
Expand Up @@ -5517,3 +5517,37 @@ func (s *DataStore) GetOneBackingImageReadyNodeDisk(backingImage *longhorn.Backi

return nil, "", fmt.Errorf("failed to find one ready backing image %v", backingImage.Name)
}

func (s *DataStore) IsPVMountOptionReadOnly(volume *longhorn.Volume) (bool, error) {
kubeStatus := volume.Status.KubernetesStatus
if kubeStatus.PVName == "" {
return false, nil
}

pv, err := s.GetPersistentVolumeRO(kubeStatus.PVName)
if err != nil {
return false, err
}
if pv != nil {
for _, opt := range pv.Spec.MountOptions {
if opt == "ro" {
return true, nil
}
}
}
return false, nil
}

func (s *DataStore) IsStorageNetworkForRWXVolume() (bool, error) {
storageNetworkSetting, err := s.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
if err != nil {
return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetwork)
}

storageNetworkForRWXVolumeEnabled, err := s.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
if err != nil {
return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetworkForRWXVolumeEnabled)
}

return types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled), nil
}

0 comments on commit f05f9c0

Please sign in to comment.