Skip to content

Commit

Permalink
Refactor isResponsibleFor to avoid deadlock in double failure case
Browse files Browse the repository at this point in the history
longhorn-6205

Signed-off-by: Phan Le <phan.le@suse.com>
  • Loading branch information
PhanLe1010 committed Jul 16, 2024
1 parent 0b142b5 commit 1d6b033
Showing 1 changed file with 38 additions and 29 deletions.
67 changes: 38 additions & 29 deletions controller/share_manager_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1485,35 +1485,6 @@ func (c *ShareManagerController) isShareManagerPodStale(sm *longhorn.ShareManage
func (c *ShareManagerController) isResponsibleFor(sm *longhorn.ShareManager) (bool, error) {
log := getLoggerForShareManager(c.logger, sm)

// If the lease is stale, we assume the owning node is down but not officially dead.
// Some node has to take over, and it might as well be this one, if another one
// has not already.
leaseExpired, leaseHolder, err := c.isShareManagerPodStale(sm)
if err == nil && leaseExpired {
// Avoid race between nodes by checking for an existing interim owner.
if leaseHolder != sm.Status.OwnerID {
log.Infof("Interim owner %v already took responsibility for stale lease-holder %v", sm.Status.OwnerID, leaseHolder)
return false, nil
}

log.Infof("Interim owner %v taking responsibility for stale lease-holder %v", c.controllerID, leaseHolder)

// This code should be moved from here to the sync loop, but it requires care.
// The first naive attempt caused repeated calls to cleanup and delete/create.
if err := c.markShareManagerDelinquent(sm); err != nil {
log.WithError(err).Warnf("Failed to update leease to set delinquent condition for %v", sm.Name)
}

// Also, turn off the admission webhook on the suspected node. Trying to talk to it
// will delay any effort to modify resources.
if err := c.ds.RemoveLabelFromManagerPod(leaseHolder, types.GetAdmissionWebhookLabel()); err != nil {
log.WithError(err).Warnf("Failed to turn off admission webhook on node %v", leaseHolder)
}

// In any event, this node takes responsibility.
return true, nil
}

// We prefer keeping the owner of the share manager CR to be the node
// where the share manager pod got scheduled and is running.
preferredOwnerID := ""
Expand Down Expand Up @@ -1542,5 +1513,43 @@ func (c *ShareManagerController) isResponsibleFor(sm *longhorn.ShareManager) (bo
continueToBeOwner := currentNodeSchedulable && !preferredOwnerSchedulable && c.controllerID == sm.Status.OwnerID
requiresNewOwner := currentNodeSchedulable && !preferredOwnerSchedulable && !currentOwnerSchedulable

isNodeUnavailable := func(node string) bool {
isUnavailable, err := c.ds.IsNodeDownOrDeletedOrMissingManager(node)
if node != "" && err != nil {
logrus.Errorf("Error while checking IsNodeDownOrDeletedOrMissingManager for object %v, node %v: %v", name, node, err)
}
return node == "" || isUnavailable
}

// If the lease is stale, we assume the owning node is down but not officially dead.
// Some node has to take over, and it might as well be this one, if another one
// has not already.
isStale, leaseHolder, err := c.isShareManagerPodStale(sm)
if err == nil && isStale {
// Avoid race between nodes by checking for an existing interim owner.
if leaseHolder != sm.Status.OwnerID &&
c.controllerID != sm.Status.OwnerID &&
!isNodeUnavailable(sm.Status.OwnerID) &&
c.ds.IsNodeSchedulable(sm.Status.OwnerID) {
return false, nil
}
log.Infof("Interim owner %v taking responsibility for stale lease-holder %v", c.controllerID, leaseHolder)

// TODO: move this to main sync loop
// This code should be moved from here to the sync loop, but it requires care.
// The first naive attempt caused repeated calls to cleanup and delete/create.
if err := c.markShareManagerDelinquent(sm); err != nil {
log.WithError(err).Warnf("Failed to update leease to set delinquent condition for %v", sm.Name)
}

// Also, turn off the admission webhook on the suspected node. Trying to talk to it

Check notice on line 1545 in controller/share_manager_controller.go

View check run for this annotation

codefactor.io / CodeFactor

controller/share_manager_controller.go#L1485-L1545

Complex Method
// will delay any effort to modify resources.
if err := c.ds.RemoveLabelFromManagerPod(leaseHolder, types.GetAdmissionWebhookLabel()); err != nil {
log.WithError(err).Warnf("Failed to turn off admission webhook on node %v", leaseHolder)
}

return currentNodeSchedulable, nil
}

return isPreferredOwner || continueToBeOwner || requiresNewOwner, nil
}

0 comments on commit 1d6b033

Please sign in to comment.