diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go index 170e1c811..5e091b718 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go @@ -135,6 +135,8 @@ type DynamicPolicy struct { transitionPeriod time.Duration cpuNUMAHintPreferPolicy string cpuNUMAHintPreferLowThreshold float64 + + reservedReclaimedCPUsSize int } func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration, @@ -206,10 +208,11 @@ func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration reclaimRelativeRootCgroupPath: conf.ReclaimRelativeRootCgroupPath, numaBindingReclaimRelativeRootCgroupPaths: common.GetNUMABindingReclaimRelativeRootCgroupPaths(conf.ReclaimRelativeRootCgroupPath, agentCtx.CPUDetails.NUMANodes().ToSliceNoSortInt()), - podDebugAnnoKeys: conf.PodDebugAnnoKeys, - podAnnotationKeptKeys: conf.PodAnnotationKeptKeys, - podLabelKeptKeys: conf.PodLabelKeptKeys, - transitionPeriod: 30 * time.Second, + podDebugAnnoKeys: conf.PodDebugAnnoKeys, + podAnnotationKeptKeys: conf.PodAnnotationKeptKeys, + podLabelKeptKeys: conf.PodLabelKeptKeys, + transitionPeriod: 30 * time.Second, + reservedReclaimedCPUsSize: general.Max(reservedReclaimedCPUsSize, agentCtx.KatalystMachineInfo.NumNUMANodes), } // register allocation behaviors for pods with different QoS level @@ -1078,13 +1081,13 @@ func (p *DynamicPolicy) initReclaimPool() error { state.WrapAllocationMetaFilter((*commonstate.AllocationMeta).CheckDedicatedNUMABinding)).Difference(noneResidentCPUs) var initReclaimedCPUSetSize int - if availableCPUs.Size() >= reservedReclaimedCPUsSize { - initReclaimedCPUSetSize = reservedReclaimedCPUsSize + if availableCPUs.Size() >= p.reservedReclaimedCPUsSize { + initReclaimedCPUSetSize = p.reservedReclaimedCPUsSize } else { initReclaimedCPUSetSize = availableCPUs.Size() } - reclaimedCPUSet, _, err := calculator.TakeByNUMABalance(p.machineInfo, availableCPUs, initReclaimedCPUSetSize) + reclaimedCPUSet, _, err := calculator.TakeHTByNUMABalance(p.machineInfo, availableCPUs, initReclaimedCPUSetSize) if err != nil { return fmt.Errorf("takeByNUMABalance faild in initReclaimPool for %s and %s with error: %v", commonstate.PoolNameShare, commonstate.PoolNameReclaim, err) @@ -1094,7 +1097,7 @@ func (p *DynamicPolicy) initReclaimPool() error { // todo: noneResidentCPUs is the same as reservedCPUs, why should we do this? allAvailableCPUs := p.machineInfo.CPUDetails.CPUs().Difference(p.reservedCPUs) if reclaimedCPUSet.IsEmpty() { - reclaimedCPUSet, _, err = calculator.TakeByNUMABalance(p.machineInfo, allAvailableCPUs, reservedReclaimedCPUsSize) + reclaimedCPUSet, _, err = calculator.TakeHTByNUMABalance(p.machineInfo, allAvailableCPUs, p.reservedReclaimedCPUsSize) if err != nil { return fmt.Errorf("fallback takeByNUMABalance faild in initReclaimPool for %s with error: %v", commonstate.PoolNameReclaim, err) diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_advisor_handler.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_advisor_handler.go index 957b77add..d2d145e2b 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_advisor_handler.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_advisor_handler.go @@ -522,7 +522,7 @@ func (p *DynamicPolicy) applyBlocks(blockCPUSet advisorapi.BlockCPUSet, resp *ad allAvailableCPUs := p.machineInfo.CPUDetails.CPUs().Difference(p.reservedCPUs) var tErr error - reclaimPoolCPUSet, _, tErr = calculator.TakeByNUMABalance(p.machineInfo, allAvailableCPUs, reservedReclaimedCPUsSize) + reclaimPoolCPUSet, _, tErr = calculator.TakeHTByNUMABalance(p.machineInfo, allAvailableCPUs, p.reservedReclaimedCPUsSize) if tErr != nil { return fmt.Errorf("fallback takeByNUMABalance faild in applyBlocks for reclaimPoolCPUSet with error: %v", tErr) } diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go index 7bf77c7d3..c6e8cd514 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go @@ -1428,7 +1428,7 @@ func (p *DynamicPolicy) generatePoolsAndIsolation(poolsQuantityMap map[string]ma if !p.state.GetAllowSharedCoresOverlapReclaimedCores() { enableReclaim := p.dynamicConfig.GetDynamicConfiguration().EnableReclaim - if !enableReclaim && poolsCPUSet[commonstate.PoolNameReclaim].Size() > reservedReclaimedCPUsSize { + if !enableReclaim && poolsCPUSet[commonstate.PoolNameReclaim].Size() > p.reservedReclaimedCPUsSize { poolsCPUSet[commonstate.PoolNameReclaim] = p.apportionReclaimedPool( poolsCPUSet, poolsCPUSet[commonstate.PoolNameReclaim].Clone(), nonBindingPoolsQuantityMap) general.Infof("apportionReclaimedPool finished, current %s pool: %s", @@ -1461,9 +1461,9 @@ func (p *DynamicPolicy) generatePoolsAndIsolation(poolsQuantityMap map[string]ma if poolsCPUSet[commonstate.PoolNameReclaim].IsEmpty() { // for reclaimed pool, we must make them exist when the node isn't in hybrid mode even if cause overlap allAvailableCPUs := p.machineInfo.CPUDetails.CPUs().Difference(p.reservedCPUs) - reclaimedCPUSet, _, tErr := calculator.TakeByNUMABalance(p.machineInfo, allAvailableCPUs, reservedReclaimedCPUsSize) + reclaimedCPUSet, _, tErr := calculator.TakeHTByNUMABalance(p.machineInfo, allAvailableCPUs, p.reservedReclaimedCPUsSize) if tErr != nil { - err = fmt.Errorf("fallback takeByNUMABalance faild in generatePoolsAndIsolation for reclaimedCPUSet with error: %v", tErr) + err = fmt.Errorf("fallback TakeHTByNUMABalance faild in generatePoolsAndIsolation for reclaimedCPUSet with error: %v", tErr) return } @@ -1570,7 +1570,7 @@ func (p *DynamicPolicy) apportionReclaimedPool(poolsCPUSet map[string]machine.CP totalSize += poolCPUs.Size() } - availableSize := reclaimedCPUs.Size() - reservedReclaimedCPUsSize + availableSize := reclaimedCPUs.Size() - p.reservedReclaimedCPUsSize if availableSize <= 0 || totalSize == 0 { return reclaimedCPUs } @@ -1587,7 +1587,7 @@ func (p *DynamicPolicy) apportionReclaimedPool(poolsCPUSet map[string]machine.CP var err error var cpuset machine.CPUSet - cpuset, reclaimedCPUs, err = calculator.TakeByNUMABalance(p.machineInfo, reclaimedCPUs, proportionalSize) + cpuset, reclaimedCPUs, err = calculator.TakeHTByNUMABalance(p.machineInfo, reclaimedCPUs, proportionalSize) if err != nil { general.Errorf("take %d cpus from reclaimedCPUs: %s, size: %d failed with error: %v", proportionalSize, reclaimedCPUs.String(), reclaimedCPUs.Size(), err) @@ -1597,7 +1597,7 @@ func (p *DynamicPolicy) apportionReclaimedPool(poolsCPUSet map[string]machine.CP poolsCPUSet[poolName] = poolCPUs.Union(cpuset) general.Infof("take %s to %s; prev: %s, current: %s", cpuset.String(), poolName, poolCPUs.String(), poolsCPUSet[poolName].String()) - if reclaimedCPUs.Size() <= reservedReclaimedCPUsSize { + if reclaimedCPUs.Size() <= p.reservedReclaimedCPUsSize { break } }