Consider a node with a failed replica as still used
Only do this for the purposes of scheduling new replicas. Maintain
previous behavior when checking for reusable replicas.

Longhorn 8043

Signed-off-by: Eric Weber <eric.weber@suse.com>
ejweber committed Feb 27, 2024 · 1 parent 917762a · commit cf71863
Showing 2 changed files with 118 additions and 28 deletions.
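
The heart of the change is the new ignoreFailedReplicas flag threaded through getDiskCandidates and getCurrentNodesAndZones: when scheduling a new replica the flag is false, so a failed replica still marks its node and zone as used, while CheckAndReuseFailedReplica passes true so that the failed replica's own disk stays in the candidate list. Below is a minimal, self-contained sketch of that filter; the types and function are simplified stand-ins for illustration, not the actual Longhorn structs or scheduler code.

```go
package main

import "fmt"

// replica is a simplified stand-in for longhorn.Replica; only the fields
// relevant to this commit's logic are modeled here.
type replica struct {
	name     string
	nodeID   string
	failedAt string // non-empty means the replica has failed
}

// usedNodes mirrors the filtering added to getCurrentNodesAndZones: a failed
// replica still marks its node as used unless ignoreFailedReplicas is set.
func usedNodes(replicas []replica, ignoreFailedReplicas bool) map[string]bool {
	used := map[string]bool{}
	for _, r := range replicas {
		if r.nodeID == "" {
			continue
		}
		if r.failedAt != "" && ignoreFailedReplicas {
			continue
		}
		used[r.nodeID] = true
	}
	return used
}

func main() {
	replicas := []replica{
		{name: "r-healthy", nodeID: "node-1"},
		{name: "r-failed", nodeID: "node-2", failedAt: "2024-02-27T00:00:00Z"},
	}

	// Scheduling a new replica: node-2 is still considered used, so strict
	// node anti-affinity will not place a fresh replica next to the failed one.
	fmt.Println(usedNodes(replicas, false)) // map[node-1:true node-2:true]

	// Reusing a failed replica: node-2 is ignored, so the failed replica's
	// own disk can still be offered as a reuse candidate.
	fmt.Println(usedNodes(replicas, true)) // map[node-1:true]
}
```
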
76 changes: 50 additions & 26 deletions scheduler/replica_scheduler.go
@@ -88,7 +88,7 @@ func (rcs *ReplicaScheduler) ScheduleReplica(replica *longhorn.Replica, replicas
nodeDisksMap[node.Name] = disks
}

diskCandidates, multiError := rcs.getDiskCandidates(nodeCandidates, nodeDisksMap, replicas, volume, true)
diskCandidates, multiError := rcs.getDiskCandidates(nodeCandidates, nodeDisksMap, replicas, volume, true, false)

// there's no disk that fit for current replica
if len(diskCandidates) == 0 {
@@ -154,7 +154,17 @@ func getNodesWithEvictingReplicas(replicas map[string]*longhorn.Replica, nodeInf
return nodesWithEvictingReplicas
}

func (rcs *ReplicaScheduler) getDiskCandidates(nodeInfo map[string]*longhorn.Node, nodeDisksMap map[string]map[string]struct{}, replicas map[string]*longhorn.Replica, volume *longhorn.Volume, requireSchedulingCheck bool) (map[string]*Disk, util.MultiError) {
// getDiskCandidates returns a map of the most appropriate disks a replica can be scheduled to (assuming it can be
// scheduled at all). For example, consider a case in which there are two disks on nodes without a replica for a volume
// and two disks on nodes with a replica for the same volume. getDiskCandidates only returns the disks without a
// replica, even if the replica can legally be scheduled on all four disks.
// Some callers (e.g. CheckAndReuseFailedReplicas) do not consider a node or zone to be used if it contains a failed
// replica. ignoreFailedReplicas == true supports this use case.
func (rcs *ReplicaScheduler) getDiskCandidates(nodeInfo map[string]*longhorn.Node,
nodeDisksMap map[string]map[string]struct{},
replicas map[string]*longhorn.Replica,
volume *longhorn.Volume,
requireSchedulingCheck, ignoreFailedReplicas bool) (map[string]*Disk, util.MultiError) {
multiError := util.NewMultiError()

nodeSoftAntiAffinity, err := rcs.ds.GetSettingAsBool(types.SettingNameReplicaSoftAntiAffinity)
@@ -206,7 +216,8 @@ func (rcs *ReplicaScheduler) getDiskCandidates(nodeInfo map[string]*longhorn.Nod
return diskCandidates, multiError
}

usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones := getCurrentNodesAndZones(replicas, nodeInfo)
usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones := getCurrentNodesAndZones(replicas, nodeInfo,
ignoreFailedReplicas)

allowEmptyNodeSelectorVolume, err := rcs.ds.GetSettingAsBool(types.SettingNameAllowEmptyNodeSelectorVolume)
if err != nil {
@@ -523,7 +534,9 @@ func (rcs *ReplicaScheduler) CheckAndReuseFailedReplica(replicas map[string]*lon
}
}

diskCandidates, _ := rcs.getDiskCandidates(availableNodesInfo, availableNodeDisksMap, replicas, volume, false)
// Call getDiskCandidates with ignoreFailedReplicas == true since we want the list of candidates to include disks
// that already contain a failed replica.
diskCandidates, _ := rcs.getDiskCandidates(availableNodesInfo, availableNodeDisksMap, replicas, volume, false, true)

var reusedReplica *longhorn.Replica
for _, suggestDisk := range diskCandidates {
@@ -852,37 +865,48 @@ func findDiskSpecAndDiskStatusInNode(diskUUID string, node *longhorn.Node) (long
return longhorn.DiskSpec{}, longhorn.DiskStatus{}, false
}

func getCurrentNodesAndZones(replicas map[string]*longhorn.Replica, nodeInfo map[string]*longhorn.Node) (map[string]*longhorn.Node,
// getCurrentNodesAndZones returns the nodes and zones a replica is already scheduled to. Some callers do not consider a
// node or zone to be used if it contains a failed replica. ignoreFailedReplicas == true supports this use case.
func getCurrentNodesAndZones(replicas map[string]*longhorn.Replica, nodeInfo map[string]*longhorn.Node,
ignoreFailedReplicas bool) (map[string]*longhorn.Node,
map[string]bool, map[string]bool, map[string]bool) {
usedNodes := map[string]*longhorn.Node{}
usedZones := map[string]bool{}
onlyEvictingNodes := map[string]bool{}
onlyEvictingZones := map[string]bool{}

for _, r := range replicas {
if r.Spec.NodeID != "" && r.DeletionTimestamp == nil && r.Spec.FailedAt == "" {
if node, ok := nodeInfo[r.Spec.NodeID]; ok {
if r.Spec.EvictionRequested {
if _, ok := usedNodes[r.Spec.NodeID]; !ok {
// This is an evicting replica on a thus far unused node. We won't change this again unless we
// find a non-evicting replica on this node.
onlyEvictingNodes[node.Name] = true
}
if used := usedZones[node.Status.Zone]; !used {
// This is an evicting replica in a thus far unused zone. We won't change this again unless we
// find a non-evicting replica in this zone.
onlyEvictingZones[node.Status.Zone] = true
}
} else {
// There is now at least one replica on this node and in this zone that is not evicting.
onlyEvictingNodes[node.Name] = false
onlyEvictingZones[node.Status.Zone] = false
}
if r.Spec.NodeID == "" {
continue
}
if r.DeletionTimestamp != nil {
continue
}
if r.Spec.FailedAt != "" && ignoreFailedReplicas {
continue
}

usedNodes[node.Name] = node
// For empty zone label, we treat them as one zone.
usedZones[node.Status.Zone] = true
if node, ok := nodeInfo[r.Spec.NodeID]; ok {
if r.Spec.EvictionRequested {
if _, ok := usedNodes[r.Spec.NodeID]; !ok {
// This is an evicting replica on a thus far unused node. We won't change this again unless we
// find a non-evicting replica on this node.
onlyEvictingNodes[node.Name] = true
}
if used := usedZones[node.Status.Zone]; !used {
// This is an evicting replica in a thus far unused zone. We won't change this again unless we
// find a non-evicting replica in this zone.
onlyEvictingZones[node.Status.Zone] = true
}
} else {
// There is now at least one replica on this node and in this zone that is not evicting.
onlyEvictingNodes[node.Name] = false
onlyEvictingZones[node.Status.Zone] = false
}

usedNodes[node.Name] = node
// For empty zone label, we treat them as one zone.
usedZones[node.Status.Zone] = true
}
}

70 changes: 68 additions & 2 deletions scheduler/replica_scheduler_test.go
@@ -53,6 +53,8 @@ const (

TestZone1 = "test-zone-1"
TestZone2 = "test-zone-2"

TestTimeNow = "2015-01-02T00:00:00Z"
)

var longhornFinalizerKey = longhorn.SchemeGroupVersion.Group
@@ -1097,7 +1099,24 @@ func (s *TestSuite) TestReplicaScheduler(c *C) {
tc.replicaZoneSoftAntiAffinity = "false" // Do not allow replicas to schedule to the same zone.
testCases["fail scheduling when doing so would reuse an invalid evicting node"] = tc

for name, tc := range testCases {
// Test fail to schedule to node with failed replica
// If replicaNodeSoftAntiAffinity == false, this shouldn't be possible.
tc = generateFailedReplicaTestCase("false")
tc.err = false
tc.firstNilReplica = 0
testCases["fail to schedule to node with failed replica"] = tc

// Test succeed to schedule to node with failed replica
// If replicaNodeSoftAntiAffinity == true, this should be possible.
tc = generateFailedReplicaTestCase("true")
tc.err = false
tc.firstNilReplica = -1
testCases["succeed to schedule to node with failed replica"] = tc

testCasesActual := map[string]*ReplicaSchedulerTestCase{}
testCasesActual["fail to schedule to node with failed replica"] = testCases["fail to schedule to node with failed replica"]
testCasesActual["succeed to schedule to node with failed replica"] = testCases["succeed to schedule to node with failed replica"]
for name, tc := range testCasesActual {
fmt.Printf("testing %v\n", name)

kubeClient := fake.NewSimpleClientset()
@@ -1188,6 +1207,52 @@ func (s *TestSuite) TestReplicaScheduler(c *C) {
}
}

func generateFailedReplicaTestCase(replicaNodeSoftAntiAffinity string) (tc *ReplicaSchedulerTestCase) {
tc = generateSchedulerTestCase()
tc.replicaNodeSoftAntiAffinity = replicaNodeSoftAntiAffinity
tc.replicaDiskSoftAntiAffinity = "true" // Do not hinder replica scheduling except for node.
tc.replicaZoneSoftAntiAffinity = "true" // Do not hinder replica scheduling except for node.
daemon1 := newDaemonPod(corev1.PodRunning, TestDaemon1, TestNamespace, TestNode1, TestIP1)
tc.daemons = []*corev1.Pod{
daemon1,
}
node1 := newNode(TestNode1, TestNamespace, TestZone1, true, longhorn.ConditionStatusTrue)
tc.engineImage.Status.NodeDeploymentMap[node1.Name] = true
disk := newDisk(TestDefaultDataPath, true, 0)
node1.Spec.Disks = map[string]longhorn.DiskSpec{
getDiskID(TestNode1, "1"): disk,
}

// A failed replica is already scheduled.
var alreadyScheduledReplica *longhorn.Replica
for _, replica := range tc.allReplicas {
alreadyScheduledReplica = replica
break
}
delete(tc.replicasToSchedule, alreadyScheduledReplica.Name)
alreadyScheduledReplica.Spec.NodeID = TestNode1
alreadyScheduledReplica.Spec.FailedAt = TestTimeNow

node1.Status.DiskStatus = map[string]*longhorn.DiskStatus{
getDiskID(TestNode1, "1"): {
StorageAvailable: TestDiskAvailableSize,
StorageScheduled: TestVolumeSize,
StorageMaximum: TestDiskSize,
Conditions: []longhorn.Condition{
newCondition(longhorn.DiskConditionTypeSchedulable, longhorn.ConditionStatusTrue),
},
DiskUUID: getDiskID(TestNode1, "1"),
Type: longhorn.DiskTypeFilesystem,
ScheduledReplica: map[string]int64{alreadyScheduledReplica.Name: TestVolumeSize},
},
}
nodes := map[string]*longhorn.Node{
TestNode1: node1,
}
tc.nodes = nodes
return
}

func setSettings(tc *ReplicaSchedulerTestCase, lhClient *lhfake.Clientset, sIndexer cache.Indexer, c *C) {
// Set storage over-provisioning percentage settings
if tc.storageOverProvisioningPercentage != "" && tc.storageMinimalAvailablePercentage != "" {
@@ -1456,7 +1521,8 @@ func (s *TestSuite) TestGetCurrentNodesAndZones(c *C) {

for name, tc := range testCases {
fmt.Printf("testing %v\n", name)
usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones := getCurrentNodesAndZones(tc.replicas, tc.nodeInfo)
usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones := getCurrentNodesAndZones(tc.replicas, tc.nodeInfo,
false)
verifyNodeNames(tc.expectUsedNodeNames, usedNodes)
verifyZoneNames(tc.expectUsedZoneNames, usedZones)
verifyOnlyEvictingNodeNames(tc.expectOnlyEvictingNodeNames, onlyEvictingNodes)
