From 9d96d0f3a600fa54c565a2d568f93a0ec7cd605f Mon Sep 17 00:00:00 2001 From: Derek Su Date: Tue, 27 Feb 2024 05:57:10 +0000 Subject: [PATCH] Do not create any instance manager pods if one already exists An extra v2 instance manager requires an additional CPU core and 2 GiB of memory. If the resources are insufficient, the instance manager pod will be recreated and retried. To minimize unnecessary retries, do not create any new instance manager pods if one is already running. Longhorn 6001 Longhorn 8456 Signed-off-by: Derek Su --- controller/instance_manager_controller.go | 9 +++++ controller/node_controller.go | 47 +++++++++++++++++++---- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/controller/instance_manager_controller.go b/controller/instance_manager_controller.go index 39f5fa0d97..37b6d78ceb 100644 --- a/controller/instance_manager_controller.go +++ b/controller/instance_manager_controller.go @@ -532,6 +532,15 @@ func (imc *InstanceManagerController) handlePod(im *longhorn.InstanceManager) er return err } + // An instance manager pod for v2 volumes needs to consume huge pages, and disks managed by the + // pod cannot be managed by another pod. Therefore, if an instance manager pod is running on a node, + // an extra instance manager pod for v2 volumes should not be created. 
+ if types.IsDataEngineV2(im.Spec.DataEngine) { + if im.Spec.DesireState == longhorn.InstanceManagerStateStopped { + return nil + } + } + if err := imc.createInstanceManagerPod(im); err != nil { return err } diff --git a/controller/node_controller.go b/controller/node_controller.go index 8c70389c55..fa003c6ecc 100644 --- a/controller/node_controller.go +++ b/controller/node_controller.go @@ -1122,6 +1122,21 @@ func (nc *NodeController) syncInstanceManagers(node *longhorn.Node) error { if err := nc.ds.DeleteInstanceManager(im.Name); err != nil { return err } + + if types.IsDataEngineV2(dataEngine) { + im, err := nc.ds.GetDefaultInstanceManagerByNodeRO(nc.controllerID, dataEngine) + if err != nil { + return errors.Wrap(err, "failed to get default instance manager for v2 data engine") + } + + if im.Spec.DesireState != longhorn.InstanceManagerStateRunning { + nc.logger.Infof("Updating default instance manager %v to running state for v2 data engine", im.Name) + im.Spec.DesireState = longhorn.InstanceManagerStateRunning + if _, err := nc.ds.UpdateInstanceManager(im); err != nil { + return errors.Wrap(err, "failed to update default instance manager for v2 data engine") + } + } + } } } if !defaultInstanceManagerCreated && imType == longhorn.InstanceManagerTypeAllInOne { @@ -1129,6 +1144,8 @@ func (nc *NodeController) syncInstanceManagers(node *longhorn.Node) error { if err != nil { return err } + + desireState := longhorn.InstanceManagerStateRunning if types.IsDataEngineV2(dataEngine) { disabled, err := nc.ds.IsV2DataEngineDisabledForNode(node.Name) if err != nil { @@ -1137,10 +1154,25 @@ func (nc *NodeController) syncInstanceManagers(node *longhorn.Node) error { if disabled { continue } + + ims, err := nc.ds.ListInstanceManagersBySelectorRO(nc.controllerID, "", longhorn.InstanceManagerTypeAllInOne, dataEngine) + if err != nil { + return errors.Wrap(err, "failed to list instance managers for v2 data engine") + } + foundRunningInstanceManager := false + for _, im := range 
ims { + if im.Status.CurrentState == longhorn.InstanceManagerStateRunning { + foundRunningInstanceManager = true + break + } + } + if foundRunningInstanceManager { + desireState = longhorn.InstanceManagerStateStopped + } } - log.Infof("Creating default instance manager %v, image: %v, dataEngine: %v", imName, defaultInstanceManagerImage, dataEngine) - if _, err := nc.createInstanceManager(node, imName, defaultInstanceManagerImage, imType, dataEngine); err != nil { + log.Infof("Creating default instance manager %v, image: %v, dataEngine: %v, desireState: %v", imName, defaultInstanceManagerImage, dataEngine, desireState) + if _, err := nc.createInstanceManager(node, imName, defaultInstanceManagerImage, imType, dataEngine, desireState); err != nil { return err } } @@ -1149,16 +1181,17 @@ func (nc *NodeController) syncInstanceManagers(node *longhorn.Node) error { return nil } -func (nc *NodeController) createInstanceManager(node *longhorn.Node, imName, imImage string, imType longhorn.InstanceManagerType, dataEngine longhorn.DataEngineType) (*longhorn.InstanceManager, error) { +func (nc *NodeController) createInstanceManager(node *longhorn.Node, imName, imImage string, imType longhorn.InstanceManagerType, dataEngine longhorn.DataEngineType, desireState longhorn.InstanceManagerState) (*longhorn.InstanceManager, error) { instanceManager := &longhorn.InstanceManager{ ObjectMeta: metav1.ObjectMeta{ Name: imName, }, Spec: longhorn.InstanceManagerSpec{ - Image: imImage, - NodeID: node.Name, - Type: imType, - DataEngine: dataEngine, + Image: imImage, + NodeID: node.Name, + Type: imType, + DataEngine: dataEngine, + DesireState: desireState, }, }