From 1d6b033c1f2dba07bd5ab7674a918ab8e8001681 Mon Sep 17 00:00:00 2001 From: Phan Le Date: Tue, 16 Jul 2024 16:34:32 -0700 Subject: [PATCH] Refactor isResponsibleFor to avoid deadlock in double failure case longhorn-6205 Signed-off-by: Phan Le --- controller/share_manager_controller.go | 67 +++++++++++++++----------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/controller/share_manager_controller.go b/controller/share_manager_controller.go index 54069b9ca4..c2a9b0576e 100644 --- a/controller/share_manager_controller.go +++ b/controller/share_manager_controller.go @@ -1485,35 +1485,6 @@ func (c *ShareManagerController) isShareManagerPodStale(sm *longhorn.ShareManage func (c *ShareManagerController) isResponsibleFor(sm *longhorn.ShareManager) (bool, error) { log := getLoggerForShareManager(c.logger, sm) - // If the lease is stale, we assume the owning node is down but not officially dead. - // Some node has to take over, and it might as well be this one, if another one - // has not already. - leaseExpired, leaseHolder, err := c.isShareManagerPodStale(sm) - if err == nil && leaseExpired { - // Avoid race between nodes by checking for an existing interim owner. - if leaseHolder != sm.Status.OwnerID { - log.Infof("Interim owner %v already took responsibility for stale lease-holder %v", sm.Status.OwnerID, leaseHolder) - return false, nil - } - - log.Infof("Interim owner %v taking responsibility for stale lease-holder %v", c.controllerID, leaseHolder) - - // This code should be moved from here to the sync loop, but it requires care. - // The first naive attempt caused repeated calls to cleanup and delete/create. - if err := c.markShareManagerDelinquent(sm); err != nil { - log.WithError(err).Warnf("Failed to update leease to set delinquent condition for %v", sm.Name) - } - - // Also, turn off the admission webhook on the suspected node. Trying to talk to it - // will delay any effort to modify resources. - if err := c.ds.RemoveLabelFromManagerPod(leaseHolder, types.GetAdmissionWebhookLabel()); err != nil { - log.WithError(err).Warnf("Failed to turn off admission webhook on node %v", leaseHolder) - } - - // In any event, this node takes responsibility. - return true, nil - } - // We prefer keeping the owner of the share manager CR to be the node // where the share manager pod got scheduled and is running. preferredOwnerID := "" @@ -1542,5 +1513,43 @@ func (c *ShareManagerController) isResponsibleFor(sm *longhorn.ShareManager) (bo continueToBeOwner := currentNodeSchedulable && !preferredOwnerSchedulable && c.controllerID == sm.Status.OwnerID requiresNewOwner := currentNodeSchedulable && !preferredOwnerSchedulable && !currentOwnerSchedulable + isNodeUnavailable := func(node string) bool { + isUnavailable, err := c.ds.IsNodeDownOrDeletedOrMissingManager(node) + if node != "" && err != nil { + logrus.Errorf("Error while checking IsNodeDownOrDeletedOrMissingManager for object %v, node %v: %v", name, node, err) + } + return node == "" || isUnavailable + } + + // If the lease is stale, we assume the owning node is down but not officially dead. + // Some node has to take over, and it might as well be this one, if another one + // has not already. + isStale, leaseHolder, err := c.isShareManagerPodStale(sm) + if err == nil && isStale { + // Avoid race between nodes by checking for an existing interim owner. + if leaseHolder != sm.Status.OwnerID && + c.controllerID != sm.Status.OwnerID && + !isNodeUnavailable(sm.Status.OwnerID) && + c.ds.IsNodeSchedulable(sm.Status.OwnerID) { + return false, nil + } + log.Infof("Interim owner %v taking responsibility for stale lease-holder %v", c.controllerID, leaseHolder) + + // TODO: move this to main sync loop + // This code should be moved from here to the sync loop, but it requires care. + // The first naive attempt caused repeated calls to cleanup and delete/create. + if err := c.markShareManagerDelinquent(sm); err != nil { + log.WithError(err).Warnf("Failed to update leease to set delinquent condition for %v", sm.Name) + } + + // Also, turn off the admission webhook on the suspected node. Trying to talk to it + // will delay any effort to modify resources. + if err := c.ds.RemoveLabelFromManagerPod(leaseHolder, types.GetAdmissionWebhookLabel()); err != nil { + log.WithError(err).Warnf("Failed to turn off admission webhook on node %v", leaseHolder) + } + + return currentNodeSchedulable, nil + } + return isPreferredOwner || continueToBeOwner || requiresNewOwner, nil }