Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(manager): fix logic for when RWX workload is restarted after node… (backport #3077) #3169

Merged
merged 1 commit into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions controller/kubernetes_pod_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,19 +209,12 @@ func (kc *KubernetesPodController) handleWorkloadPodDeletionIfCSIPluginPodIsDown
return nil
}

storageNetworkSetting, err := kc.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume()
if err != nil {
log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetwork)
log.WithError(err).Warnf("%s. Failed to check isStorageNetwork", logAbort)
return nil
}

storageNetworkForRWXVolumeEnabled, err := kc.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
if err != nil {
log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetworkForRWXVolumeEnabled)
return nil
}

if !types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled) {
if !isStorageNetworkForRWXVolume {
return nil
}

Expand Down Expand Up @@ -476,6 +469,11 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod *
return err
}

isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume()
if err != nil {
kc.logger.WithError(err).Warn("Failed to check isStorageNetwork, assuming not")
}

// Only delete pod which has startTime < vol.Status.RemountRequestAt AND timeNow > vol.Status.RemountRequestAt + delayDuration
// The delayDuration is to make sure that we don't repeatedly delete the pod too fast
// when vol.Status.RemountRequestAt is updated too quickly by volumeController
Expand All @@ -493,6 +491,13 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod *
if vol.Status.RemountRequestedAt == "" {
continue
}

// NFS clients can generally recover without a restart/remount when the NFS server restarts using the same Cluster IP.
// A remount is required when the storage network for RWX is in use because the new NFS server has a different IP.
if isRegularRWXVolume(vol) && !isStorageNetworkForRWXVolume {
continue
}

remountRequestedAt, err := time.Parse(time.RFC3339, vol.Status.RemountRequestedAt)
if err != nil {
return err
Expand Down
20 changes: 3 additions & 17 deletions controller/share_manager_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ func (c *ShareManagerController) syncShareManagerEndpoint(sm *longhorn.ShareMana
return nil
}

storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume()
if err != nil {
return err
}
Expand Down Expand Up @@ -1045,20 +1045,6 @@ func (c *ShareManagerController) getShareManagerTolerationsFromStorageClass(sc *
return tolerations
}

func (c *ShareManagerController) isStorageNetworkForRWXVolume() (bool, error) {
storageNetwork, err := c.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
if err != nil {
return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetwork)
}

storageNetworkForRWXVolumeEnabled, err := c.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
if err != nil {
return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetworkForRWXVolumeEnabled)
}

return types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled), nil
}

func (c *ShareManagerController) checkStorageNetworkApplied() (bool, error) {
targetSettings := []types.SettingName{types.SettingNameStorageNetwork, types.SettingNameStorageNetworkForRWXVolumeEnabled}
for _, item := range targetSettings {
Expand Down Expand Up @@ -1091,7 +1077,7 @@ func (c *ShareManagerController) canCleanupService(shareManagerName string) (boo
return false, nil
}

storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume()
if err != nil {
return false, err
}
Expand Down Expand Up @@ -1362,7 +1348,7 @@ func (c *ShareManagerController) createServiceManifest(sm *longhorn.ShareManager

log := getLoggerForShareManager(c.logger, sm)

storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume()
if err != nil {
log.WithError(err).Warnf("Failed to check storage network for RWX volume")
}
Expand Down
6 changes: 3 additions & 3 deletions controller/volume_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4524,9 +4524,9 @@ func (c *VolumeController) ReconcileShareManagerState(volume *longhorn.Volume) e
log.Infof("Updated image for share manager from %v to %v", sm.Spec.Image, c.smImage)
}

// kill the workload pods, when the share manager goes into error state
// easiest approach is to set the RemountRequestedAt variable,
// since that is already responsible for killing the workload pods
// Give the workload pods a chance to restart when the share manager goes into error state.
// Easiest approach is to set the RemountRequestedAt variable. Pods will make that decision
// in the kubernetes_pod_controller.
if sm.Status.State == longhorn.ShareManagerStateError || sm.Status.State == longhorn.ShareManagerStateUnknown {
volume.Status.RemountRequestedAt = c.nowHandler()
msg := fmt.Sprintf("Volume %v requested remount at %v", volume.Name, volume.Status.RemountRequestedAt)
Expand Down
34 changes: 34 additions & 0 deletions datastore/longhorn.go
Original file line number Diff line number Diff line change
Expand Up @@ -5517,3 +5517,37 @@ func (s *DataStore) GetOneBackingImageReadyNodeDisk(backingImage *longhorn.Backi

return nil, "", fmt.Errorf("failed to find one ready backing image %v", backingImage.Name)
}

func (s *DataStore) IsPVMountOptionReadOnly(volume *longhorn.Volume) (bool, error) {
kubeStatus := volume.Status.KubernetesStatus
if kubeStatus.PVName == "" {
return false, nil
}

pv, err := s.GetPersistentVolumeRO(kubeStatus.PVName)
if err != nil {
return false, err
}
if pv != nil {
for _, opt := range pv.Spec.MountOptions {
if opt == "ro" {
return true, nil
}
}
}
return false, nil
}

func (s *DataStore) IsStorageNetworkForRWXVolume() (bool, error) {
storageNetworkSetting, err := s.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
if err != nil {
return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetwork)
}

storageNetworkForRWXVolumeEnabled, err := s.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
if err != nil {
return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetworkForRWXVolumeEnabled)
}

return types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled), nil
}