From 8b1028e866c2663dbe58a19d6656cd182b9afd6c Mon Sep 17 00:00:00 2001 From: James Lu Date: Fri, 6 Sep 2024 16:11:36 +0800 Subject: [PATCH] fix(node): add the node.Status.Condition `ModulesLoaded` Check if the module `dm_crypt` is enabled as the first module. ref: longhorn/longhorn 9153 Signed-off-by: James Lu --- controller/node_controller.go | 156 ++++++++++++++++++++------ controller/node_controller_test.go | 10 ++ k8s/pkg/apis/longhorn/v1beta2/node.go | 3 + 3 files changed, 132 insertions(+), 37 deletions(-) diff --git a/controller/node_controller.go b/controller/node_controller.go index 3b62c4e493..24311da984 100644 --- a/controller/node_controller.go +++ b/controller/node_controller.go @@ -26,6 +26,7 @@ import ( v1core "k8s.io/client-go/kubernetes/typed/core/v1" lhexec "github.com/longhorn/go-common-libs/exec" + lhio "github.com/longhorn/go-common-libs/io" lhns "github.com/longhorn/go-common-libs/ns" lhtypes "github.com/longhorn/go-common-libs/types" @@ -47,6 +48,8 @@ const ( unknownDiskID = "UNKNOWN_DISKID" + kernelConfigFilePathPrefix = "/host/boot/config-" + snapshotChangeEventQueueMax = 1048576 ) @@ -922,6 +925,7 @@ func (nc *NodeController) environmentCheck(kubeNode *corev1.Node, node *longhorn namespaces := []lhtypes.Namespace{lhtypes.NamespaceMnt, lhtypes.NamespaceNet} nc.syncPackagesInstalled(kubeNode, node, namespaces) nc.syncMultipathd(node, namespaces) + nc.checkModulesLoaded(kubeNode, node, namespaces) nc.syncNFSClientVersion(kubeNode, node, namespaces) } @@ -1027,21 +1031,49 @@ func (nc *NodeController) syncMultipathd(node *longhorn.Node, namespaces []lhtyp node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, "", "") } -func (nc *NodeController) syncNFSClientVersion(kubeNode *corev1.Node, node *longhorn.Node, namespaces []lhtypes.Namespace) { - kernelVersion := kubeNode.Status.NodeInfo.KernelVersion - nfsClientVersions := []string{"CONFIG_NFS_V4_2", "CONFIG_NFS_V4_1", "CONFIG_NFS_V4"} +func (nc *NodeController) checkModulesLoaded(kubeNode *corev1.Node, node *longhorn.Node, namespaces []lhtypes.Namespace) { + modules := map[string]string{ + "CONFIG_DM_CRYPT": "dm_crypt", + } - nsexec, err := lhns.NewNamespaceExecutor(lhtypes.ProcessNone, lhtypes.HostProcDirectory, namespaces) + notFoundModulesUsingkmod, err := checkModulesLoadedUsingkmod(modules) if err != nil { - node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, + node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, string(longhorn.NodeConditionReasonNamespaceExecutorErr), - fmt.Sprintf("Failed to get namespace executor: %v", err.Error())) + fmt.Sprintf("Failed to check kernel modules: %v", err.Error())) + return + } + + if len(notFoundModulesUsingkmod) == 0 { + node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusTrue, "", "") + return + } + + notLoadedModules, err := checkModulesLoadedByConfigFile(nc.logger, notFoundModulesUsingkmod, kubeNode.Status.NodeInfo.KernelVersion) + if err != nil { + node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, + string(longhorn.NodeConditionReasonCheckKernelConfigFailed), + fmt.Sprintf("Failed to check kernel config file for kernel modules %v: %v", notFoundModulesUsingkmod, err.Error())) + return + } + + if len(notLoadedModules) != 0 { + node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, + string(longhorn.NodeConditionReasonModulesNotLoaded), + fmt.Sprintf("Modules %v are not loaded on node %v", notLoadedModules, node.Name)) return } - kernelConfigPath := "/boot/config-" + kernelVersion - args := []string{kernelConfigPath} - if _, err := nsexec.Execute(nil, "ls", args, lhtypes.ExecuteDefaultTimeout); err != nil { + node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusTrue, "", "") +} + +func (nc *NodeController) syncNFSClientVersion(kubeNode *corev1.Node, node *longhorn.Node, namespaces []lhtypes.Namespace) { + kernelVersion := kubeNode.Status.NodeInfo.KernelVersion + nfsClientVersions := []string{"CONFIG_NFS_V4_2", "CONFIG_NFS_V4_1", "CONFIG_NFS_V4"} + + kernelConfigPath := kernelConfigFilePathPrefix + kernelVersion + configContent, err := lhio.ReadFileContent(kernelConfigPath) + if err != nil { node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, string(longhorn.NodeConditionReasonKernelConfigIsNotFound), fmt.Sprintf("Unable to find %v for checking %v: %v", kernelConfigPath, nfsClientVersions, err.Error())) @@ -1049,38 +1081,16 @@ func (nc *NodeController) syncNFSClientVersion(kubeNode *corev1.Node, node *long } for _, ver := range nfsClientVersions { - args := []string{ver + "=", kernelConfigPath} - result, err := nsexec.Execute(nil, "grep", args, lhtypes.ExecuteDefaultTimeout) + moduleEnabled, err := checkKernelModuleEabled(nc.logger, configContent, ver, "nfs") if err != nil { - nc.logger.WithError(err).Debugf("Failed to find kernel config %v on node %v", ver, node.Name) - continue + node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, + string(longhorn.NodeConditionReasonNamespaceExecutorErr), + fmt.Sprintf("Failed to check kernel module %v: %v", ver, err.Error())) + return } - enabled := strings.TrimSpace(strings.Split(result, "=")[1]) - switch enabled { - case "y": + if moduleEnabled { node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusTrue, "", "") return - case "m": - kmodResult, err := lhexec.NewExecutor().Execute(nil, "kmod", []string{"list"}, lhtypes.ExecuteDefaultTimeout) - if err != nil { - node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, - string(longhorn.NodeConditionReasonNFSClientIsNotFound), - fmt.Sprintf("Failed to execute command `kmod`: %v", err.Error())) - return - } - res, err := lhexec.NewExecutor().ExecuteWithStdinPipe("grep", []string{"nfs"}, kmodResult, lhtypes.ExecuteDefaultTimeout) - if err != nil { - node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, - string(longhorn.NodeConditionReasonNFSClientIsNotFound), - fmt.Sprintf("Failed to execute command `grep`: %v", err.Error())) - return - } - if res != "" { - node.Status.Conditions = types.SetCondition(node.Status.Conditions, longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusTrue, "", "") - return - } - default: - nc.logger.Debugf("Unknown kernel config value for %v: %v", ver, enabled) } } @@ -1089,6 +1099,78 @@ func (nc *NodeController) syncNFSClientVersion(kubeNode *corev1.Node, node *long fmt.Sprintf("NFS clients %v not found. At least one should be enabled", nfsClientVersions)) } +func checkModulesLoadedUsingkmod(modules map[string]string) (map[string]string, error) { + kmodResult, err := lhexec.NewExecutor().Execute(nil, "kmod", []string{"list"}, lhtypes.ExecuteDefaultTimeout) + if err != nil { + return nil, err + } + + notFoundModules := map[string]string{} + for config, module := range modules { + if !strings.Contains(kmodResult, module) { + notFoundModules[config] = module + } + } + + return notFoundModules, nil +} + +func checkModulesLoadedByConfigFile(log *logrus.Entry, modules map[string]string, kernelVersion string) ([]string, error) { + kernelConfigPath := kernelConfigFilePathPrefix + kernelVersion + configContent, err := lhio.ReadFileContent(kernelConfigPath) + if err != nil { + return nil, err + } + + notLoadedModules := []string{} + for config, module := range modules { + moduleEnabled, err := checkKernelModuleEabled(log, configContent, config, module) + if err != nil { + return nil, err + } + + if !moduleEnabled { + notLoadedModules = append(notLoadedModules, module) + } + } + + return notLoadedModules, nil +} + +func checkKernelModuleEabled(log *logrus.Entry, configContent, module, kmodName string) (bool, error) { + confingLine := "" + + configs := strings.Split(configContent, "\n") + for _, config := range configs { + if strings.Contains(config, module) { + confingLine = config + break + } + } + if confingLine == "" { + log.Debugf("Kernel config %v not found", module) + return false, nil + } + + enabled := strings.TrimSpace(strings.Split(confingLine, "=")[1]) + switch enabled { + case "y": + return true, nil + case "m": + kmodResult, err := lhexec.NewExecutor().Execute(nil, "kmod", []string{"list"}, lhtypes.ExecuteDefaultTimeout) + if err != nil { + return false, errors.Wrap(err, "Failed to execute command `kmod`") + } + if strings.Contains(kmodResult, kmodName) { + return true, nil + } + default: + log.Debugf("Unknown kernel config value for %v: %v", module, enabled) + } + + return false, nil +} + func (nc *NodeController) getImTypeDataEngines(node *longhorn.Node) map[longhorn.InstanceManagerType][]longhorn.DataEngineType { log := getLoggerForNode(nc.logger, node) diff --git a/controller/node_controller_test.go b/controller/node_controller_test.go index b88c85c7c6..b99704c678 100644 --- a/controller/node_controller_test.go +++ b/controller/node_controller_test.go @@ -186,6 +186,7 @@ func (s *NodeControllerSuite) TestManagerPodUp(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusTrue, ""), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, }, @@ -273,6 +274,7 @@ func (s *NodeControllerSuite) TestManagerPodDown(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonNoMountPropagationSupport), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, }, @@ -360,6 +362,7 @@ func (s *NodeControllerSuite) TestKubeNodeDown(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusTrue, ""), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, }, @@ -447,6 +450,7 @@ func (s *NodeControllerSuite) TestKubeNodePressure(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusTrue, ""), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, }, @@ -569,6 +573,7 @@ func (s *NodeControllerSuite) TestUpdateDiskStatus(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusTrue, ""), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, DiskStatus: map[string]*longhorn.DiskStatus{ @@ -720,6 +725,7 @@ func (s *NodeControllerSuite) TestCleanDiskStatus(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusTrue, ""), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, DiskStatus: map[string]*longhorn.DiskStatus{ @@ -877,6 +883,7 @@ func (s *NodeControllerSuite) TestDisableDiskOnFilesystemChange(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusTrue, ""), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, DiskStatus: map[string]*longhorn.DiskStatus{ @@ -1005,6 +1012,7 @@ func (s *NodeControllerSuite) TestCreateDefaultInstanceManager(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusTrue, ""), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, DiskStatus: map[string]*longhorn.DiskStatus{ @@ -1150,6 +1158,7 @@ func (s *NodeControllerSuite) TestCleanupRedundantInstanceManagers(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusTrue, ""), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, DiskStatus: map[string]*longhorn.DiskStatus{ @@ -1265,6 +1274,7 @@ func (s *NodeControllerSuite) TestCleanupAllInstanceManagers(c *C) { newNodeCondition(longhorn.NodeConditionTypeMountPropagation, longhorn.ConditionStatusTrue, ""), newNodeCondition(longhorn.NodeConditionTypeRequiredPackages, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonUnknownOS), newNodeCondition(longhorn.NodeConditionTypeMultipathd, longhorn.ConditionStatusTrue, ""), + newNodeCondition(longhorn.NodeConditionTypeModulesLoaded, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonCheckKernelConfigFailed), newNodeCondition(longhorn.NodeConditionTypeNFSClientInstalled, longhorn.ConditionStatusFalse, longhorn.NodeConditionReasonKernelConfigIsNotFound), }, DiskStatus: map[string]*longhorn.DiskStatus{}, diff --git a/k8s/pkg/apis/longhorn/v1beta2/node.go b/k8s/pkg/apis/longhorn/v1beta2/node.go index 7943543bbf..7a8d071fb7 100644 --- a/k8s/pkg/apis/longhorn/v1beta2/node.go +++ b/k8s/pkg/apis/longhorn/v1beta2/node.go @@ -6,6 +6,7 @@ const ( NodeConditionTypeReady = "Ready" NodeConditionTypeMountPropagation = "MountPropagation" NodeConditionTypeMultipathd = "Multipathd" + NodeConditionTypeModulesLoaded = "ModulesLoaded" NodeConditionTypeRequiredPackages = "RequiredPackages" NodeConditionTypeNFSClientInstalled = "NFSClientInstalled" NodeConditionTypeSchedulable = "Schedulable" @@ -22,8 +23,10 @@ const ( NodeConditionReasonMultipathdIsRunning = "MultipathdIsRunning" NodeConditionReasonUnknownOS = "UnknownOS" NodeConditionReasonNamespaceExecutorErr = "NamespaceExecutorErr" + NodeConditionReasonModulesNotLoaded = "ModulesNotLoaded" NodeConditionReasonPackagesNotInstalled = "PackagesNotInstalled" NodeConditionReasonKernelConfigIsNotFound = "KernelConfigIsNotFound" + NodeConditionReasonCheckKernelConfigFailed = "CheckKernelConfigFailed" NodeConditionReasonNFSClientIsNotFound = "NFSClientIsNotFound" NodeConditionReasonKubernetesNodeCordoned = "KubernetesNodeCordoned" )