From 09bef581cd6460ec66f8d8fb2706d83e1020f88c Mon Sep 17 00:00:00 2001 From: Derek Su Date: Wed, 20 Sep 2023 15:55:19 +0800 Subject: [PATCH] rwx volume: use soft mode with long timeout for nfs client When an RWX volume is attached, a share-manager pod with an embedded userspace NFS server is created and the volume is exported. Longhorn hard-mounts the remote exported share and then provides it to the workload. When the share-manager pod or its embedded NFS server crashes or becomes unreachable, the 'hard' mount option keeps the client retrying its connection to the NFS server, which prevents data loss. The downside is that a hard mount to an unreachable server can keep the node from shutting down. The root cause is that the Linux kernel tries to maintain filesystem stability: it will not allow a filesystem to be unmounted until all of its pending IO has been written back to storage, and the system cannot shut down until all filesystems are unmounted. That underlying issue is currently unresolved, so this change switches the NFS client to a soft mount with a long timeout instead. Longhorn 6655 Signed-off-by: Derek Su --- csi/node_server.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/csi/node_server.go b/csi/node_server.go index 13faeba868..79bc5d8f56 100644 --- a/csi/node_server.go +++ b/csi/node_server.go @@ -240,11 +240,11 @@ func (ns *NodeServer) nodeStageSharedVolume(volumeID, shareEndpoint, targetPath "vers=4.1", "noresvport", // "sync", // sync mode is prohibitively expensive on the client, so we allow for host defaults - "intr", - "hard", - //"soft", // for this release we use soft mode, so we can always cleanup mount points - //"timeo=30", // This is tenths of a second, so a 3 second timeout, each retrans the timeout will be linearly increased, 3s, 6s, 9s - //"retrans=3", // We try the io operation for a total of 3 times, before failing, max runtime of 18s + // "intr", + //"hard", + "soft", // for this release we use soft mode, so we can always cleanup mount points + "timeo=150", // This is tenths of a second, so a 15 second timeout, each retrans the timeout will be linearly increased, 15s, 30s, 45s + "retrans=3", 
// We try the io operation for a total of 3 times, before failing } }