From 87bc581f7263ee6ace36cb68c5b82ad032b80b44 Mon Sep 17 00:00:00 2001 From: James Lu Date: Thu, 6 Jul 2023 13:50:34 +0800 Subject: [PATCH] fix(csi): Invalid mount point with volume detached Volume detached because only one replica failed and not remount the `globalmount`. We need to handle it at NodePublishVolume because os.ReadDir will be passed when it becomes a read-only file system after an unexpected volume detaching. Use an empty file to test the file system Ref: 4814, 5801 Signed-off-by: James Lu (cherry picked from commit a9d62894e51a1a3ddb00369ff88992f50355b65e) # Conflicts: # csi/node_server.go # csi/util.go --- csi/node_server.go | 4 ++++ csi/util.go | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/csi/node_server.go b/csi/node_server.go index 7a36a0ae9c..7c8203ead6 100644 --- a/csi/node_server.go +++ b/csi/node_server.go @@ -120,7 +120,11 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis if volume.State != string(longhorn.VolumeStateAttached) || volume.Controllers[0].Endpoint == "" { logrus.Debugf("volume %v hasn't been attached yet, try unmounting potential mount point %v", volumeID, targetPath) if err := unmount(targetPath, mounter); err != nil { +<<<<<<< HEAD logrus.Debugf("failed to unmount error: %v", err) +======= + log.WithError(err).Warnf("Failed to unmount: %v", targetPath) +>>>>>>> a9d62894 (fix(csi): Invalid mount point with volume detached) } return nil, status.Errorf(codes.InvalidArgument, "volume %s hasn't been attached yet", volumeID) } diff --git a/csi/util.go b/csi/util.go index ac4a704e41..b5bd51bd85 100644 --- a/csi/util.go +++ b/csi/util.go @@ -4,7 +4,9 @@ import ( "context" "encoding/json" "fmt" + "io" "os" + "path" "strconv" "strings" "time" @@ -29,6 +31,8 @@ const ( defaultStaleReplicaTimeout = 2880 defaultForceUmountTimeout = 30 * time.Second + + tempTestMountPointValidStatusFile = ".longhorn-volume-mount-point-test.tmp" ) // NewForcedParamsExec creates a osExecutor that allows for adding additional params to later occurring Run calls @@ -215,6 +219,42 @@ func parseJSONRecurringJobs(jsonRecurringJobs string) ([]longhornclient.Recurrin return recurringJobs, nil } +func syncMountPointDirectory(targetPath string) error { + d, err := os.OpenFile(targetPath, os.O_SYNC, 0750) + if err != nil { + return err + } + defer d.Close() + + if _, err := d.Readdirnames(1); err != nil { + if !errors.Is(err, io.EOF) { + return err + } + } + + // it would not always return `Input/Output Error` or `read-only file system` errors if we only use ReadDir() or Readdirnames() without an I/O operation + // an I/O operation will make the targetPath mount point invalid immediately + tempFile := path.Join(targetPath, tempTestMountPointValidStatusFile) + f, err := os.OpenFile(tempFile, os.O_CREATE|os.O_RDWR|os.O_SYNC|os.O_TRUNC, 0666) + if err != nil { + return err + } + defer func() { + if err := f.Close(); err != nil { + logrus.WithError(err).Warnf("Failed to close %v", tempFile) + } + if err := os.Remove(tempFile); err != nil { + logrus.WithError(err).Warnf("Failed to remove %v", tempFile) + } + }() + + if _, err := f.WriteString(tempFile); err != nil { + return err + } + + return nil +} + // ensureMountPoint evaluates whether a path is a valid mountPoint // in case the targetPath does not exists it will create a path and return false // in case where the mount point exists but is corrupt, the mount point will be cleaned up and a error is returned @@ -228,9 +268,15 @@ func ensureMountPoint(targetPath string, mounter mount.Interface) (bool, error) IsCorruptedMnt := mount.IsCorruptedMnt(err) if !IsCorruptedMnt { +<<<<<<< HEAD logrus.Debugf("mount point %v try reading dir to make sure it's healthy", targetPath) if _, err := os.ReadDir(targetPath); err != nil { logrus.Debugf("mount point %v was identified as corrupt by ReadDir", targetPath) +======= + logrus.Debugf("Mount point %v try opening and syncing dir to make sure it's healthy", targetPath) + if err := syncMountPointDirectory(targetPath); err != nil { + logrus.WithError(err).Warnf("Mount point %v was identified as corrupt by opening and syncing", targetPath) +>>>>>>> a9d62894 (fix(csi): Invalid mount point with volume detached) IsCorruptedMnt = true } }