diff --git a/scheduler/replica_scheduler.go b/scheduler/replica_scheduler.go
index f5cc47786c..3f06123a40 100644
--- a/scheduler/replica_scheduler.go
+++ b/scheduler/replica_scheduler.go
@@ -88,7 +88,7 @@ func (rcs *ReplicaScheduler) ScheduleReplica(replica *longhorn.Replica, replicas
 		nodeDisksMap[node.Name] = disks
 	}
 
-	diskCandidates, multiError := rcs.getDiskCandidates(nodeCandidates, nodeDisksMap, replicas, volume, true)
+	diskCandidates, multiError := rcs.getDiskCandidates(nodeCandidates, nodeDisksMap, replicas, volume, true, false)
 
 	// there's no disk that fit for current replica
 	if len(diskCandidates) == 0 {
@@ -154,7 +154,17 @@ func getNodesWithEvictingReplicas(replicas map[string]*longhorn.Replica, nodeInf
 	return nodesWithEvictingReplicas
 }
 
-func (rcs *ReplicaScheduler) getDiskCandidates(nodeInfo map[string]*longhorn.Node, nodeDisksMap map[string]map[string]struct{}, replicas map[string]*longhorn.Replica, volume *longhorn.Volume, requireSchedulingCheck bool) (map[string]*Disk, util.MultiError) {
+// getDiskCandidates returns a map of the most appropriate disks a replica can be scheduled to (assuming it can be
+// scheduled at all). For example, consider a case in which there are two disks on nodes without a replica for a volume
+// and two disks on nodes with a replica for the same volume. getDiskCandidates only returns the disks without a
+// replica, even if the replica can legally be scheduled on all four disks.
+// Some callers (e.g. CheckAndReuseFailedReplica) do not consider a node or zone to be used if it contains a failed
+// replica. ignoreFailedReplicas == true supports this use case.
+func (rcs *ReplicaScheduler) getDiskCandidates(nodeInfo map[string]*longhorn.Node,
+	nodeDisksMap map[string]map[string]struct{},
+	replicas map[string]*longhorn.Replica,
+	volume *longhorn.Volume,
+	requireSchedulingCheck, ignoreFailedReplicas bool) (map[string]*Disk, util.MultiError) {
 	multiError := util.NewMultiError()
 
 	nodeSoftAntiAffinity, err := rcs.ds.GetSettingAsBool(types.SettingNameReplicaSoftAntiAffinity)
@@ -206,7 +216,8 @@ func (rcs *ReplicaScheduler) getDiskCandidates(nodeInfo map[string]*longhorn.Nod
 		return diskCandidates, multiError
 	}
 
-	usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones := getCurrentNodesAndZones(replicas, nodeInfo)
+	usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones := getCurrentNodesAndZones(replicas, nodeInfo,
+		ignoreFailedReplicas)
 
 	allowEmptyNodeSelectorVolume, err := rcs.ds.GetSettingAsBool(types.SettingNameAllowEmptyNodeSelectorVolume)
 	if err != nil {
@@ -523,7 +534,9 @@ func (rcs *ReplicaScheduler) CheckAndReuseFailedReplica(replicas map[string]*lon
 		}
 	}
 
-	diskCandidates, _ := rcs.getDiskCandidates(availableNodesInfo, availableNodeDisksMap, replicas, volume, false)
+	// Call getDiskCandidates with ignoreFailedReplicas == true since we want the list of candidates to include disks
+	// that already contain a failed replica.
+	diskCandidates, _ := rcs.getDiskCandidates(availableNodesInfo, availableNodeDisksMap, replicas, volume, false, true)
 
 	var reusedReplica *longhorn.Replica
 	for _, suggestDisk := range diskCandidates {
@@ -852,7 +865,10 @@ func findDiskSpecAndDiskStatusInNode(diskUUID string, node *longhorn.Node) (long
 	return longhorn.DiskSpec{}, longhorn.DiskStatus{}, false
 }
 
-func getCurrentNodesAndZones(replicas map[string]*longhorn.Replica, nodeInfo map[string]*longhorn.Node) (map[string]*longhorn.Node,
+// getCurrentNodesAndZones returns the nodes and zones a replica is already scheduled to. Some callers do not consider a
+// node or zone to be used if it contains a failed replica. ignoreFailedReplicas == true supports this use case.
+func getCurrentNodesAndZones(replicas map[string]*longhorn.Replica, nodeInfo map[string]*longhorn.Node,
+	ignoreFailedReplicas bool) (map[string]*longhorn.Node,
 	map[string]bool, map[string]bool, map[string]bool) {
 	usedNodes := map[string]*longhorn.Node{}
 	usedZones := map[string]bool{}
@@ -860,29 +876,37 @@ func getCurrentNodesAndZones(replicas map[string]*longhorn.Replica, nodeInfo map
 	onlyEvictingZones := map[string]bool{}
 
 	for _, r := range replicas {
-		if r.Spec.NodeID != "" && r.DeletionTimestamp == nil && r.Spec.FailedAt == "" {
-			if node, ok := nodeInfo[r.Spec.NodeID]; ok {
-				if r.Spec.EvictionRequested {
-					if _, ok := usedNodes[r.Spec.NodeID]; !ok {
-						// This is an evicting replica on a thus far unused node. We won't change this again unless we
-						// find a non-evicting replica on this node.
-						onlyEvictingNodes[node.Name] = true
-					}
-					if used := usedZones[node.Status.Zone]; !used {
-						// This is an evicting replica in a thus far unused zone. We won't change this again unless we
-						// find a non-evicting replica in this zone.
-						onlyEvictingZones[node.Status.Zone] = true
-					}
-				} else {
-					// There is now at least one replica on this node and in this zone that is not evicting.
-					onlyEvictingNodes[node.Name] = false
-					onlyEvictingZones[node.Status.Zone] = false
-				}
+		if r.Spec.NodeID == "" {
+			continue
+		}
+		if r.DeletionTimestamp != nil {
+			continue
+		}
+		if r.Spec.FailedAt != "" && ignoreFailedReplicas {
+			continue
+		}
 
-				usedNodes[node.Name] = node
-				// For empty zone label, we treat them as one zone.
-				usedZones[node.Status.Zone] = true
+		if node, ok := nodeInfo[r.Spec.NodeID]; ok {
+			if r.Spec.EvictionRequested {
+				if _, ok := usedNodes[r.Spec.NodeID]; !ok {
+					// This is an evicting replica on a thus far unused node. We won't change this again unless we
+					// find a non-evicting replica on this node.
+					onlyEvictingNodes[node.Name] = true
+				}
+				if used := usedZones[node.Status.Zone]; !used {
+					// This is an evicting replica in a thus far unused zone. We won't change this again unless we
+					// find a non-evicting replica in this zone.
+					onlyEvictingZones[node.Status.Zone] = true
+				}
+			} else {
+				// There is now at least one replica on this node and in this zone that is not evicting.
+				onlyEvictingNodes[node.Name] = false
+				onlyEvictingZones[node.Status.Zone] = false
 			}
+
+			usedNodes[node.Name] = node
+			// For empty zone label, we treat them as one zone.
+			usedZones[node.Status.Zone] = true
 		}
 	}
 
diff --git a/scheduler/replica_scheduler_test.go b/scheduler/replica_scheduler_test.go
index d92f74e456..be38bb7479 100644
--- a/scheduler/replica_scheduler_test.go
+++ b/scheduler/replica_scheduler_test.go
@@ -53,6 +53,8 @@ const (
 
 	TestZone1 = "test-zone-1"
 	TestZone2 = "test-zone-2"
+
+	TestTimeNow = "2015-01-02T00:00:00Z"
 )
 
 var longhornFinalizerKey = longhorn.SchemeGroupVersion.Group
@@ -1097,7 +1099,21 @@ func (s *TestSuite) TestReplicaScheduler(c *C) {
 	tc.replicaZoneSoftAntiAffinity = "false" // Do not allow replicas to schedule to the same zone.
 	testCases["fail scheduling when doing so would reuse an invalid evicting node"] = tc
 
+	// Test fail to schedule to a node with a failed replica.
+	// If replicaNodeSoftAntiAffinity == false, scheduling to a node that already has a failed replica should fail.
+	tc = generateFailedReplicaTestCase("false")
+	tc.err = false
+	tc.firstNilReplica = 0
+	testCases["fail to schedule to node with failed replica"] = tc
+
+	// Test succeed to schedule to a node with a failed replica.
+	// If replicaNodeSoftAntiAffinity == true, scheduling to a node that already has a failed replica should succeed.
+	tc = generateFailedReplicaTestCase("true")
+	tc.err = false
+	tc.firstNilReplica = -1
+	testCases["succeed to schedule to node with failed replica"] = tc
+
 	for name, tc := range testCases {
 		fmt.Printf("testing %v\n", name)
 
 		kubeClient := fake.NewSimpleClientset()
@@ -1188,6 +1204,52 @@ func (s *TestSuite) TestReplicaScheduler(c *C) {
 	}
 }
 
+func generateFailedReplicaTestCase(replicaNodeSoftAntiAffinity string) (tc *ReplicaSchedulerTestCase) {
+	tc = generateSchedulerTestCase()
+	tc.replicaNodeSoftAntiAffinity = replicaNodeSoftAntiAffinity
+	tc.replicaDiskSoftAntiAffinity = "true" // Do not hinder replica scheduling except for node.
+	tc.replicaZoneSoftAntiAffinity = "true" // Do not hinder replica scheduling except for node.
+	daemon1 := newDaemonPod(corev1.PodRunning, TestDaemon1, TestNamespace, TestNode1, TestIP1)
+	tc.daemons = []*corev1.Pod{
+		daemon1,
+	}
+	node1 := newNode(TestNode1, TestNamespace, TestZone1, true, longhorn.ConditionStatusTrue)
+	tc.engineImage.Status.NodeDeploymentMap[node1.Name] = true
+	disk := newDisk(TestDefaultDataPath, true, 0)
+	node1.Spec.Disks = map[string]longhorn.DiskSpec{
+		getDiskID(TestNode1, "1"): disk,
+	}
+
+	// A failed replica is already scheduled.
+	var alreadyScheduledReplica *longhorn.Replica
+	for _, replica := range tc.allReplicas {
+		alreadyScheduledReplica = replica
+		break
+	}
+	delete(tc.replicasToSchedule, alreadyScheduledReplica.Name)
+	alreadyScheduledReplica.Spec.NodeID = TestNode1
+	alreadyScheduledReplica.Spec.FailedAt = TestTimeNow
+
+	node1.Status.DiskStatus = map[string]*longhorn.DiskStatus{
+		getDiskID(TestNode1, "1"): {
+			StorageAvailable: TestDiskAvailableSize,
+			StorageScheduled: TestVolumeSize,
+			StorageMaximum:   TestDiskSize,
+			Conditions: []longhorn.Condition{
+				newCondition(longhorn.DiskConditionTypeSchedulable, longhorn.ConditionStatusTrue),
+			},
+			DiskUUID:         getDiskID(TestNode1, "1"),
+			Type:             longhorn.DiskTypeFilesystem,
+			ScheduledReplica: map[string]int64{alreadyScheduledReplica.Name: TestVolumeSize},
+		},
+	}
+	nodes := map[string]*longhorn.Node{
+		TestNode1: node1,
+	}
+	tc.nodes = nodes
+	return
+}
+
 func setSettings(tc *ReplicaSchedulerTestCase, lhClient *lhfake.Clientset, sIndexer cache.Indexer, c *C) {
 	// Set storage over-provisioning percentage settings
 	if tc.storageOverProvisioningPercentage != "" && tc.storageMinimalAvailablePercentage != "" {
@@ -1456,7 +1518,8 @@ func (s *TestSuite) TestGetCurrentNodesAndZones(c *C) {
 	for name, tc := range testCases {
 		fmt.Printf("testing %v\n", name)
 
-		usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones := getCurrentNodesAndZones(tc.replicas, tc.nodeInfo)
+		usedNodes, usedZones, onlyEvictingNodes, onlyEvictingZones := getCurrentNodesAndZones(tc.replicas, tc.nodeInfo,
+			false)
 		verifyNodeNames(tc.expectUsedNodeNames, usedNodes)
 		verifyZoneNames(tc.expectUsedZoneNames, usedZones)
 		verifyOnlyEvictingNodeNames(tc.expectOnlyEvictingNodeNames, onlyEvictingNodes)
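
As a quick illustration of what the new `ignoreFailedReplicas` flag changes, the sketch below distills the filtering rule that `getCurrentNodesAndZones` now applies at its two call sites. It is a minimal, self-contained example with stand-in types: the `replica` struct and `countsAsUsed` helper are invented for this note and are not Longhorn's actual `Replica` type or scheduler API.

```go
package main

import "fmt"

// replica is a stand-in for the handful of longhorn.Replica fields the new
// filtering rule inspects; it is not the real Longhorn type.
type replica struct {
	NodeID   string
	Deleting bool
	FailedAt string
}

// countsAsUsed mirrors the skip logic added to getCurrentNodesAndZones: a
// replica marks its node and zone as used unless it is unscheduled, being
// deleted, or failed while the caller asked to ignore failed replicas.
func countsAsUsed(r replica, ignoreFailedReplicas bool) bool {
	if r.NodeID == "" {
		return false
	}
	if r.Deleting {
		return false
	}
	if r.FailedAt != "" && ignoreFailedReplicas {
		return false
	}
	return true
}

func main() {
	failed := replica{NodeID: "node-1", FailedAt: "2015-01-02T00:00:00Z"}

	// ScheduleReplica now passes ignoreFailedReplicas == false, so a failed
	// replica still occupies its node and, under hard node anti-affinity,
	// keeps a new replica from landing there.
	fmt.Println(countsAsUsed(failed, false)) // true

	// CheckAndReuseFailedReplica passes ignoreFailedReplicas == true, so the
	// disk holding the failed replica stays in the candidate list for reuse.
	fmt.Println(countsAsUsed(failed, true)) // false
}
```

This is also why the two new test cases only toggle `replicaNodeSoftAntiAffinity`: with the failed replica counted as using TestNode1, hard node anti-affinity leaves no schedulable node, while soft anti-affinity lets the new replica share that node.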