Skip to content

Commit

Permalink
fix(qrm): fix fallback takeByNUMABalance to TakeHTByNUMABalance
Browse files Browse the repository at this point in the history
  • Loading branch information
luomingmeng committed Nov 1, 2024
1 parent 9c388ef commit 46e968b
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 15 deletions.
19 changes: 11 additions & 8 deletions pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ type DynamicPolicy struct {
transitionPeriod time.Duration
cpuNUMAHintPreferPolicy string
cpuNUMAHintPreferLowThreshold float64

reservedReclaimedCPUsSize int
}

func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration,
Expand Down Expand Up @@ -206,10 +208,11 @@ func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration
reclaimRelativeRootCgroupPath: conf.ReclaimRelativeRootCgroupPath,
numaBindingReclaimRelativeRootCgroupPaths: common.GetNUMABindingReclaimRelativeRootCgroupPaths(conf.ReclaimRelativeRootCgroupPath,
agentCtx.CPUDetails.NUMANodes().ToSliceNoSortInt()),
podDebugAnnoKeys: conf.PodDebugAnnoKeys,
podAnnotationKeptKeys: conf.PodAnnotationKeptKeys,
podLabelKeptKeys: conf.PodLabelKeptKeys,
transitionPeriod: 30 * time.Second,
podDebugAnnoKeys: conf.PodDebugAnnoKeys,
podAnnotationKeptKeys: conf.PodAnnotationKeptKeys,
podLabelKeptKeys: conf.PodLabelKeptKeys,
transitionPeriod: 30 * time.Second,
reservedReclaimedCPUsSize: general.Max(reservedReclaimedCPUsSize, agentCtx.KatalystMachineInfo.NumNUMANodes),
}

// register allocation behaviors for pods with different QoS level
Expand Down Expand Up @@ -1078,13 +1081,13 @@ func (p *DynamicPolicy) initReclaimPool() error {
state.WrapAllocationMetaFilter((*commonstate.AllocationMeta).CheckDedicatedNUMABinding)).Difference(noneResidentCPUs)

var initReclaimedCPUSetSize int
if availableCPUs.Size() >= reservedReclaimedCPUsSize {
initReclaimedCPUSetSize = reservedReclaimedCPUsSize
if availableCPUs.Size() >= p.reservedReclaimedCPUsSize {
initReclaimedCPUSetSize = p.reservedReclaimedCPUsSize
} else {
initReclaimedCPUSetSize = availableCPUs.Size()
}

reclaimedCPUSet, _, err := calculator.TakeByNUMABalance(p.machineInfo, availableCPUs, initReclaimedCPUSetSize)
reclaimedCPUSet, _, err := calculator.TakeHTByNUMABalance(p.machineInfo, availableCPUs, initReclaimedCPUSetSize)
if err != nil {
return fmt.Errorf("takeByNUMABalance faild in initReclaimPool for %s and %s with error: %v",
commonstate.PoolNameShare, commonstate.PoolNameReclaim, err)
Expand All @@ -1094,7 +1097,7 @@ func (p *DynamicPolicy) initReclaimPool() error {
// todo: noneResidentCPUs is the same as reservedCPUs, why should we do this?
allAvailableCPUs := p.machineInfo.CPUDetails.CPUs().Difference(p.reservedCPUs)
if reclaimedCPUSet.IsEmpty() {
reclaimedCPUSet, _, err = calculator.TakeByNUMABalance(p.machineInfo, allAvailableCPUs, reservedReclaimedCPUsSize)
reclaimedCPUSet, _, err = calculator.TakeHTByNUMABalance(p.machineInfo, allAvailableCPUs, p.reservedReclaimedCPUsSize)
if err != nil {
return fmt.Errorf("fallback takeByNUMABalance faild in initReclaimPool for %s with error: %v",
commonstate.PoolNameReclaim, err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ func (p *DynamicPolicy) applyBlocks(blockCPUSet advisorapi.BlockCPUSet, resp *ad
allAvailableCPUs := p.machineInfo.CPUDetails.CPUs().Difference(p.reservedCPUs)

var tErr error
reclaimPoolCPUSet, _, tErr = calculator.TakeByNUMABalance(p.machineInfo, allAvailableCPUs, reservedReclaimedCPUsSize)
reclaimPoolCPUSet, _, tErr = calculator.TakeHTByNUMABalance(p.machineInfo, allAvailableCPUs, p.reservedReclaimedCPUsSize)
if tErr != nil {
return fmt.Errorf("fallback takeByNUMABalance faild in applyBlocks for reclaimPoolCPUSet with error: %v", tErr)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1428,7 +1428,7 @@ func (p *DynamicPolicy) generatePoolsAndIsolation(poolsQuantityMap map[string]ma

if !p.state.GetAllowSharedCoresOverlapReclaimedCores() {
enableReclaim := p.dynamicConfig.GetDynamicConfiguration().EnableReclaim
if !enableReclaim && poolsCPUSet[commonstate.PoolNameReclaim].Size() > reservedReclaimedCPUsSize {
if !enableReclaim && poolsCPUSet[commonstate.PoolNameReclaim].Size() > p.reservedReclaimedCPUsSize {
poolsCPUSet[commonstate.PoolNameReclaim] = p.apportionReclaimedPool(
poolsCPUSet, poolsCPUSet[commonstate.PoolNameReclaim].Clone(), nonBindingPoolsQuantityMap)
general.Infof("apportionReclaimedPool finished, current %s pool: %s",
Expand Down Expand Up @@ -1461,9 +1461,9 @@ func (p *DynamicPolicy) generatePoolsAndIsolation(poolsQuantityMap map[string]ma
if poolsCPUSet[commonstate.PoolNameReclaim].IsEmpty() {
// the reclaimed pool must always exist when the node isn't in hybrid mode, even if that causes overlap
allAvailableCPUs := p.machineInfo.CPUDetails.CPUs().Difference(p.reservedCPUs)
reclaimedCPUSet, _, tErr := calculator.TakeByNUMABalance(p.machineInfo, allAvailableCPUs, reservedReclaimedCPUsSize)
reclaimedCPUSet, _, tErr := calculator.TakeHTByNUMABalance(p.machineInfo, allAvailableCPUs, p.reservedReclaimedCPUsSize)
if tErr != nil {
err = fmt.Errorf("fallback takeByNUMABalance faild in generatePoolsAndIsolation for reclaimedCPUSet with error: %v", tErr)
err = fmt.Errorf("fallback TakeHTByNUMABalance faild in generatePoolsAndIsolation for reclaimedCPUSet with error: %v", tErr)
return
}

Expand Down Expand Up @@ -1570,7 +1570,7 @@ func (p *DynamicPolicy) apportionReclaimedPool(poolsCPUSet map[string]machine.CP
totalSize += poolCPUs.Size()
}

availableSize := reclaimedCPUs.Size() - reservedReclaimedCPUsSize
availableSize := reclaimedCPUs.Size() - p.reservedReclaimedCPUsSize
if availableSize <= 0 || totalSize == 0 {
return reclaimedCPUs
}
Expand All @@ -1587,7 +1587,7 @@ func (p *DynamicPolicy) apportionReclaimedPool(poolsCPUSet map[string]machine.CP

var err error
var cpuset machine.CPUSet
cpuset, reclaimedCPUs, err = calculator.TakeByNUMABalance(p.machineInfo, reclaimedCPUs, proportionalSize)
cpuset, reclaimedCPUs, err = calculator.TakeHTByNUMABalance(p.machineInfo, reclaimedCPUs, proportionalSize)
if err != nil {
general.Errorf("take %d cpus from reclaimedCPUs: %s, size: %d failed with error: %v",
proportionalSize, reclaimedCPUs.String(), reclaimedCPUs.Size(), err)
Expand All @@ -1597,7 +1597,7 @@ func (p *DynamicPolicy) apportionReclaimedPool(poolsCPUSet map[string]machine.CP
poolsCPUSet[poolName] = poolCPUs.Union(cpuset)
general.Infof("take %s to %s; prev: %s, current: %s", cpuset.String(), poolName, poolCPUs.String(), poolsCPUSet[poolName].String())

if reclaimedCPUs.Size() <= reservedReclaimedCPUsSize {
if reclaimedCPUs.Size() <= p.reservedReclaimedCPUsSize {
break
}
}
Expand Down

0 comments on commit 46e968b

Please sign in to comment.