Skip to content

Commit

Permalink
fix(qrm): filter NUMAs make shared_cores with numa_binding won't caus…
Browse files Browse the repository at this point in the history
…e normal shared_cores in short supply
  • Loading branch information
csfldf committed Jun 12, 2024
1 parent fcbba49 commit 3a0e668
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 7 deletions.
52 changes: 46 additions & 6 deletions pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ func (p *DynamicPolicy) sharedCoresWithNUMABindingHintHandler(_ context.Context,
}

machineState := p.state.GetMachineState()
podEntries := p.state.GetPodEntries()

var hints map[string]*pluginapi.ListOfTopologyHints

allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
Expand All @@ -263,7 +265,6 @@ func (p *DynamicPolicy) sharedCoresWithNUMABindingHintHandler(_ context.Context,

// regenerateHints failed. need to clear container record and re-calculate.
if hints == nil {
podEntries := p.state.GetPodEntries()
delete(podEntries[req.PodUid], req.ContainerName)
if len(podEntries[req.PodUid]) == 0 {
delete(podEntries, req.PodUid)
Expand All @@ -282,7 +283,7 @@ func (p *DynamicPolicy) sharedCoresWithNUMABindingHintHandler(_ context.Context,

if hints == nil {
var calculateErr error
hints, calculateErr = p.calculateHintsForNUMABindingSharedCores(reqInt, machineState, req.Annotations)
hints, calculateErr = p.calculateHintsForNUMABindingSharedCores(reqInt, podEntries, machineState, req.Annotations)
if calculateErr != nil {
return nil, fmt.Errorf("calculateHintsForNUMABindingSharedCores failed with error: %v", calculateErr)
}
Expand Down Expand Up @@ -351,19 +352,58 @@ func (p *DynamicPolicy) filterNUMANodesByHintPreferLowThreshold(reqInt int,
continue
}

if float64(availableCPUQuantity)/float64(allocatableCPUQuantity) >= p.cpuNUMAHintPreferLowThreshold {
availableRatio := float64(availableCPUQuantity) / float64(allocatableCPUQuantity)

general.Infof("NUMA: %d, availableCPUQuantity: %d, allocatableCPUQuantity: %d, availableRatio: %.2f, cpuNUMAHintPreferLowThreshold:%.2f",
nodeID, availableCPUQuantity, allocatableCPUQuantity, availableRatio, p.cpuNUMAHintPreferLowThreshold)

if availableRatio >= p.cpuNUMAHintPreferLowThreshold {
filteredNUMANodes = append(filteredNUMANodes, nodeID)
}
}

return filteredNUMANodes
}

func (p *DynamicPolicy) calculateHintsForNUMABindingSharedCores(reqInt int, machineState state.NUMANodeMap,
// filterNUMANodesByNonBindingSharedRequestedQuantity returns the subset of numaNodes
// that can be offered as candidates without leaving normal (non-binding) shared_cores
// short of CPUs.
//
// A NUMA node that is not contained in nonBindingNUMAs is always kept. A NUMA node
// contained in nonBindingNUMAs is kept only if nonBindingNUMAsCPUQuantity, reduced by
// that node's allocatable CPU count (its filtered default CPU set minus p.reservedCPUs),
// is still at least nonBindingSharedRequestedQuantity. The relative order of numaNodes
// is preserved in the result.
func (p *DynamicPolicy) filterNUMANodesByNonBindingSharedRequestedQuantity(nonBindingSharedRequestedQuantity,
	nonBindingNUMAsCPUQuantity int,
	nonBindingNUMAs machine.CPUSet,
	machineState state.NUMANodeMap, numaNodes []int,
) []int {
	candidates := make([]int, 0, len(numaNodes))

	for _, numaID := range numaNodes {
		// nodes outside the non-binding set need no capacity check
		if !nonBindingNUMAs.Contains(numaID) {
			candidates = append(candidates, numaID)
			continue
		}

		numaAllocatable := machineState[numaID].GetFilteredDefaultCPUSet(nil, nil).Difference(p.reservedCPUs).Size()

		// what would remain in the non-binding NUMAs if this node were taken away
		remainingQuantity := nonBindingNUMAsCPUQuantity - numaAllocatable

		if remainingQuantity < nonBindingSharedRequestedQuantity {
			general.Infof("filter out NUMA: %d since taking it will cause normal shared_cores in short supply;"+
				" nonBindingNUMAsCPUQuantity: %d, targetNUMAAllocatableCPUQuantity: %d, nonBindingSharedRequestedQuantity: %d",
				numaID, nonBindingNUMAsCPUQuantity, numaAllocatable, nonBindingSharedRequestedQuantity)
			continue
		}

		// taking this non-binding NUMA as a candidate for shared_cores with numa_binding
		// won't cause normal shared_cores in short supply
		candidates = append(candidates, numaID)
	}

	return candidates
}

func (p *DynamicPolicy) calculateHintsForNUMABindingSharedCores(reqInt int, podEntries state.PodEntries,
machineState state.NUMANodeMap,
reqAnnotations map[string]string,
) (map[string]*pluginapi.ListOfTopologyHints, error) {
numaNodes := machineState.GetFilteredNUMASetWithAnnotations(
state.CheckNUMABindingSharedCoresAntiAffinity, reqAnnotations).ToSliceInt()
nonBindingNUMAsCPUQuantity := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs, nil, state.CheckNUMABinding).Size()
nonBindingNUMAs := machineState.GetFilteredNUMASet(state.CheckNUMABinding)
nonBindingSharedRequestedQuantity := state.GetNonBindingSharedRequestedQuantityFromPodEntries(podEntries)

numaNodes := p.filterNUMANodesByNonBindingSharedRequestedQuantity(nonBindingSharedRequestedQuantity,
nonBindingNUMAsCPUQuantity, nonBindingNUMAs, machineState,
machineState.GetFilteredNUMASetWithAnnotations(state.CheckNUMABindingSharedCoresAntiAffinity, reqAnnotations).ToSliceInt())

hints := map[string]*pluginapi.ListOfTopologyHints{
string(v1.ResourceCPU): {
Expand Down
1 change: 0 additions & 1 deletion pkg/agent/qrm-plugins/cpu/dynamicpolicy/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,6 @@ func (ns *NUMANodeState) GetAvailableCPUQuantity(reservedCPUs machine.CPUSet) in
}

for _, allocationInfo := range containerEntries {
// sidecar doesn't contribute to allocated quantity currently
if allocationInfo == nil ||
!CheckSharedNUMABinding(allocationInfo) {
continue
Expand Down
21 changes: 21 additions & 0 deletions pkg/agent/qrm-plugins/cpu/dynamicpolicy/state/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,27 @@ func GetSharedQuantityMapFromPodEntries(podEntries PodEntries, ignoreAllocationI
return poolsQuantityMap, nil
}

// GetNonBindingSharedRequestedQuantityFromPodEntries returns the total quantity
// requested by shared_cores without numa_binding, rounded up to the nearest integer.
//
// Pool entries are skipped, as are nil allocation infos and those that either fail
// CheckShared or pass CheckNUMABinding. The per-container request is obtained via
// GetContainerRequestedCores and summed as float64 before the final ceiling.
func GetNonBindingSharedRequestedQuantityFromPodEntries(podEntries PodEntries) int {
	var totalRequested float64

	for _, containerEntries := range podEntries {
		if containerEntries.IsPoolEntry() {
			continue
		}

		for _, info := range containerEntries {
			// only non-nil, shared, non-numa-binding containers contribute
			if info != nil && CheckShared(info) && !CheckNUMABinding(info) {
				totalRequested += GetContainerRequestedCores()(info)
			}
		}
	}

	return int(math.Ceil(totalRequested))
}

// GenerateMachineStateFromPodEntries returns NUMANodeMap for given resource based on
// machine info and reserved resources along with existed pod entries
func GenerateMachineStateFromPodEntries(topology *machine.CPUTopology, podEntries PodEntries, policyName string) (NUMANodeMap, error) {
Expand Down

0 comments on commit 3a0e668

Please sign in to comment.