Skip to content

Commit

Permalink
fix(csi): Invalid mount point with volume detached
Browse files Browse the repository at this point in the history
Volume detached because only one replica failed and not remount the
`globalmount`. We need to handle it at NodePublishVolume because
os.ReadDir will be passed when it becomes a read-only file system
after an unexpected volume detaching.

Use an empty file to test the file system

Ref: 4814, 5801

Signed-off-by: James Lu <james.lu@suse.com>
  • Loading branch information
mantissahz committed Jul 5, 2023
1 parent 57b4596 commit 06f4aa0
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 4 deletions.
2 changes: 1 addition & 1 deletion csi/node_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
if volume.State != string(longhorn.VolumeStateAttached) || volume.Controllers[0].Endpoint == "" {
log.Infof("Volume %v hasn't been attached yet, unmounting potential mount point %v", volumeID, targetPath)
if err := unmount(targetPath, mounter); err != nil {
log.WithError(err).Warn("Failed to unmount")
log.WithError(err).Warnf("Failed to unmount: %v", targetPath)
}
return nil, status.Errorf(codes.InvalidArgument, "volume %s hasn't been attached yet", volumeID)
}
Expand Down
41 changes: 38 additions & 3 deletions csi/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"io"
"os"
"strconv"
"strings"
Expand All @@ -29,6 +30,8 @@ const (
defaultStaleReplicaTimeout = 2880

defaultForceUmountTimeout = 30 * time.Second

tempTestMountPointValidStatusFile = "/.longhornmountpointesttstatus"
)

// NewForcedParamsExec creates a osExecutor that allows for adding additional params to later occurring Run calls
Expand Down Expand Up @@ -239,6 +242,38 @@ func parseJSONRecurringJobs(jsonRecurringJobs string) ([]longhornclient.Recurrin
return recurringJobs, nil
}

func syncMountPointDirectory(targetPath string) error {
d, err := os.OpenFile(targetPath, os.O_SYNC, 0750)
if err != nil {
return err
}
defer d.Close()

if _, err := d.Readdirnames(1); err != nil {
if !errors.Is(err, io.EOF) {
return err
}
}

// it would not always return `Input/Output Error` or `read-only file system` errors if we only use ReadDir() or Readdirnames() without an I/O operation
// an I/O operation will make the targetPath mount point invalid immediately
tempFile := targetPath + tempTestMountPointValidStatusFile
f, err := os.OpenFile(tempFile, os.O_CREATE|os.O_RDWR|os.O_SYNC|os.O_TRUNC, 0666)
if err != nil {
return err
}
defer func() {
if err := f.Close(); err != nil {
logrus.WithError(err).Warnf("Failed to close %v", tempFile)
}
if err := os.Remove(tempFile); err != nil {
logrus.WithError(err).Warnf("Failed to remove %v", tempFile)
}
}()

return f.Sync()
}

// ensureMountPoint evaluates whether a path is a valid mountPoint
// in case the targetPath does not exists it will create a path and return false
// in case where the mount point exists but is corrupt, the mount point will be cleaned up and a error is returned
Expand All @@ -252,9 +287,9 @@ func ensureMountPoint(targetPath string, mounter mount.Interface) (bool, error)

IsCorruptedMnt := mount.IsCorruptedMnt(err)
if !IsCorruptedMnt {
logrus.Debugf("Mount point %v try reading dir to make sure it's healthy", targetPath)
if _, err := os.ReadDir(targetPath); err != nil {
logrus.Debugf("Mount point %v was identified as corrupt by ReadDir", targetPath)
logrus.Debugf("Mount point %v try opening and syncing dir to make sure it's healthy", targetPath)
if err := syncMountPointDirectory(targetPath); err != nil {
logrus.WithError(err).Warnf("Mount point %v was identified as corrupt by opening and syncing", targetPath)
IsCorruptedMnt = true
}
}
Expand Down

0 comments on commit 06f4aa0

Please sign in to comment.