Consider a node with a failed replica as still used
Only treat such nodes as used when scheduling new replicas. Maintain the
previous behavior when checking for reusable replicas.

Longhorn 8043

Signed-off-by: Eric Weber <eric.weber@suse.com>
(cherry picked from commit 29a895c)

# Conflicts:
#	scheduler/replica_scheduler.go
#	scheduler/replica_scheduler_test.go
ejweber authored and mergify[bot] committed Mar 7, 2024
1 parent 828947d commit aa66553
Showing 2 changed files with 771 additions and 3 deletions.
83 changes: 80 additions & 3 deletions scheduler/replica_scheduler.go
@@ -88,7 +88,7 @@ func (rcs *ReplicaScheduler) ScheduleReplica(replica *longhorn.Replica, replicas
nodeDisksMap[node.Name] = disks
}

diskCandidates, multiError := rcs.getDiskCandidates(nodeCandidates, nodeDisksMap, replicas, volume, true)
diskCandidates, multiError := rcs.getDiskCandidates(nodeCandidates, nodeDisksMap, replicas, volume, true, false)

// there's no disk that fit for current replica
if len(diskCandidates) == 0 {
@@ -143,7 +143,17 @@ func getNodesWithEvictingReplicas(replicas map[string]*longhorn.Replica, nodeInf
return nodesWithEvictingReplicas
}

func (rcs *ReplicaScheduler) getDiskCandidates(nodeInfo map[string]*longhorn.Node, nodeDisksMap map[string]map[string]struct{}, replicas map[string]*longhorn.Replica, volume *longhorn.Volume, requireSchedulingCheck bool) (map[string]*Disk, util.MultiError) {
// getDiskCandidates returns a map of the most appropriate disks a replica can be scheduled to (assuming it can be
// scheduled at all). For example, consider a case in which there are two disks on nodes without a replica for a volume
// and two disks on nodes with a replica for the same volume. getDiskCandidates only returns the disks without a
// replica, even if the replica can legally be scheduled on all four disks.
// Some callers (e.g. CheckAndReuseFailedReplicas) do not consider a node or zone to be used if it contains a failed
// replica. ignoreFailedReplicas == true supports this use case.
func (rcs *ReplicaScheduler) getDiskCandidates(nodeInfo map[string]*longhorn.Node,
nodeDisksMap map[string]map[string]struct{},
replicas map[string]*longhorn.Replica,
volume *longhorn.Volume,
requireSchedulingCheck, ignoreFailedReplicas bool) (map[string]*Disk, util.MultiError) {
multiError := util.NewMultiError()

nodeSoftAntiAffinity, err := rcs.ds.GetSettingAsBool(types.SettingNameReplicaSoftAntiAffinity)
@@ -174,6 +184,20 @@ func (rcs *ReplicaScheduler) getDiskCandidates(nodeInfo map[string]*longhorn.Nod
}
multiError.Append(errors)
}
<<<<<<< HEAD
=======
diskCandidates = filterDisksWithMatchingReplicas(diskCandidates, replicas, diskSoftAntiAffinity)
return diskCandidates, multiError
}

usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones := getCurrentNodesAndZones(replicas, nodeInfo,
ignoreFailedReplicas)

allowEmptyNodeSelectorVolume, err := rcs.ds.GetSettingAsBool(types.SettingNameAllowEmptyNodeSelectorVolume)
if err != nil {
err = errors.Wrapf(err, "failed to get %v setting", types.SettingNameAllowEmptyNodeSelectorVolume)
multiError.Append(util.NewMultiError(err.Error()))
>>>>>>> 29a895c2 (Consider a node with a failed replica as still used)
return map[string]*Disk{}, multiError
}

@@ -471,7 +495,9 @@ func (rcs *ReplicaScheduler) CheckAndReuseFailedReplica(replicas map[string]*lon
}
}

diskCandidates, _ := rcs.getDiskCandidates(availableNodesInfo, availableNodeDisksMap, replicas, volume, false)
// Call getDiskCandidates with ignoreFailedReplicas == true since we want the list of candidates to include disks
// that already contain a failed replica.
diskCandidates, _ := rcs.getDiskCandidates(availableNodesInfo, availableNodeDisksMap, replicas, volume, false, true)

var reusedReplica *longhorn.Replica
for _, suggestDisk := range diskCandidates {
@@ -793,3 +819,54 @@ func findDiskSpecAndDiskStatusInNode(diskUUID string, node *longhorn.Node) (long
}
return longhorn.DiskSpec{}, longhorn.DiskStatus{}, false
}
<<<<<<< HEAD
=======

// getCurrentNodesAndZones returns the nodes and zones a replica is already scheduled to. Some callers do not consider a
// node or zone to be used if it contains a failed replica. ignoreFailedReplicas == true supports this use case.
func getCurrentNodesAndZones(replicas map[string]*longhorn.Replica, nodeInfo map[string]*longhorn.Node,
ignoreFailedReplicas bool) (map[string]*longhorn.Node,
map[string]bool, map[string]bool, map[string]bool) {
usedNodes := map[string]*longhorn.Node{}
usedZones := map[string]bool{}
onlyEvictingNodes := map[string]bool{}
onlyEvictingZones := map[string]bool{}

for _, r := range replicas {
if r.Spec.NodeID == "" {
continue
}
if r.DeletionTimestamp != nil {
continue
}
if r.Spec.FailedAt != "" && ignoreFailedReplicas {
continue
}

if node, ok := nodeInfo[r.Spec.NodeID]; ok {
if r.Spec.EvictionRequested {
if _, ok := usedNodes[r.Spec.NodeID]; !ok {
// This is an evicting replica on a thus far unused node. We won't change this again unless we
// find a non-evicting replica on this node.
onlyEvictingNodes[node.Name] = true
}
if used := usedZones[node.Status.Zone]; !used {
// This is an evicting replica in a thus far unused zone. We won't change this again unless we
// find a non-evicting replica in this zone.
onlyEvictingZones[node.Status.Zone] = true
}
} else {
// There is now at least one replica on this node and in this zone that is not evicting.
onlyEvictingNodes[node.Name] = false
onlyEvictingZones[node.Status.Zone] = false
}

usedNodes[node.Name] = node
// For empty zone label, we treat them as one zone.
usedZones[node.Status.Zone] = true
}
}

return usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones
}
>>>>>>> 29a895c2 (Consider a node with a failed replica as still used)
691 changes: 691 additions & 0 deletions scheduler/replica_scheduler_test.go (diff not loaded)
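
For readers following the behavioral change, here is a minimal, self-contained Go sketch of the new ignoreFailedReplicas bookkeeping. The Replica and Node types and the usedNodesAndZones helper are simplified stand-ins invented for illustration (the real code works on longhorn API structs and also tracks eviction-only nodes and zones); only the skip-failed-replica check mirrors the diff above.

```go
package main

import "fmt"

// Replica and Node are simplified stand-ins for the longhorn.Replica and longhorn.Node
// structs; the field names are illustrative and do not match the real API types.
type Replica struct {
	NodeID   string
	FailedAt string
	Deleting bool
}

type Node struct {
	Name string
	Zone string
}

// usedNodesAndZones is a pared-down analogue of getCurrentNodesAndZones: a node and its
// zone count as used by any scheduled, non-deleting replica, but failed replicas can be
// skipped when ignoreFailedReplicas is true.
func usedNodesAndZones(replicas map[string]*Replica, nodes map[string]*Node,
	ignoreFailedReplicas bool) (map[string]*Node, map[string]bool) {
	usedNodes := map[string]*Node{}
	usedZones := map[string]bool{}
	for _, r := range replicas {
		if r.NodeID == "" || r.Deleting {
			continue
		}
		if r.FailedAt != "" && ignoreFailedReplicas {
			continue // a failed replica does not block reuse of its node or zone
		}
		if node, ok := nodes[r.NodeID]; ok {
			usedNodes[node.Name] = node
			usedZones[node.Zone] = true // replicas with an empty zone label share the "" zone
		}
	}
	return usedNodes, usedZones
}

func main() {
	nodes := map[string]*Node{"node-1": {Name: "node-1", Zone: "zone-a"}}
	replicas := map[string]*Replica{
		"r-1": {NodeID: "node-1", FailedAt: "2024-03-07T00:00:00Z"},
	}

	// Scheduling a new replica (ignoreFailedReplicas == false): the failed replica still
	// marks node-1 and zone-a as used, so anti-affinity steers the new replica elsewhere.
	usedN, usedZ := usedNodesAndZones(replicas, nodes, false)
	fmt.Println(len(usedN), len(usedZ)) // 1 1

	// Reusing a failed replica (ignoreFailedReplicas == true): node-1 and zone-a are not
	// considered used, so the failed replica's disk stays a candidate for reuse.
	usedN, usedZ = usedNodesAndZones(replicas, nodes, true)
	fmt.Println(len(usedN), len(usedZ)) // 0 0
}
```

The two calls in main correspond to the two call sites in the diff: ScheduleReplica passes ignoreFailedReplicas == false, so a node holding a failed replica still counts as used for anti-affinity, while CheckAndReuseFailedReplica passes true, so that node's disks remain candidates for reusing the failed replica.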
