From 431175edcbd7b6c1c97335a985d6eec2ec71f2a1 Mon Sep 17 00:00:00 2001 From: linzhecheng Date: Mon, 4 Mar 2024 20:17:27 +0800 Subject: [PATCH] refactor(sysadvisor): refine memory guard The current memory protection mechanism is achieved by setting the memory limit of the Besteffort group to avoid memory overflows. However, when calculating the limit, the cache of high-priority services is considered as reclaimable, so that direct memory reclamation can still be triggered during memory bursts. This MR refactors the calculation logic. First, we calculate the safety memory upper limit for each available NUMA node, and then sum them up to obtain the result. The upper limit for each NUMA node is calculated as follows: NUMA free memory + Besteffort group memory used on the NUMA node - reserved memory. Signed-off-by: linzhecheng --- .../qosaware/resource/memory/advisor_test.go | 144 +++++++++++++----- .../resource/memory/plugin/memory_guard.go | 89 ++++++----- pkg/metaserver/agent/metric/fake_metric.go | 4 +- pkg/metaserver/agent/metric/metric_impl.go | 2 +- .../provisioner/malachite/provisioner.go | 14 +- pkg/metaserver/agent/metric/types/metric.go | 2 +- pkg/util/metric/store.go | 10 +- 7 files changed, 169 insertions(+), 96 deletions(-) diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go index 69c109515..62af4d07a 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go @@ -175,7 +175,7 @@ type containerNUMAMetric struct { metricValue metricutil.MetricData podUID string containerName string - numdID int + numaID int } type cgroupMetric struct { @@ -184,6 +184,13 @@ type cgroupMetric struct { cgroupPath string } +type cgroupNUMAMetric struct { + metricName string + metricValue metricutil.MetricData + numaID int + cgroupPath string +} + var defaultPodList = []*v1.Pod{ 
{ ObjectMeta: metav1.ObjectMeta{ @@ -388,6 +395,33 @@ var cgroupMetrics = []cgroupMetric{ }, } +var cgroupNUMAMetrics = []cgroupNUMAMetric{ + { + metricName: coreconsts.MetricsMemTotalPerNumaCgroup, + numaID: 0, + cgroupPath: "/kubepods/besteffort", + metricValue: metricutil.MetricData{Value: 6 << 30}, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaCgroup, + numaID: 1, + cgroupPath: "/kubepods/besteffort", + metricValue: metricutil.MetricData{Value: 6 << 30}, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaCgroup, + numaID: 2, + cgroupPath: "/kubepods/besteffort", + metricValue: metricutil.MetricData{Value: 6 << 30}, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaCgroup, + numaID: 3, + cgroupPath: "/kubepods/besteffort", + metricValue: metricutil.MetricData{Value: 6 << 30}, + }, +} + func TestUpdate(t *testing.T) { t.Parallel() @@ -405,6 +439,7 @@ func TestUpdate(t *testing.T) { containerMetrics []containerMetric containerNUMAMetrics []containerNUMAMetric cgroupMetrics []cgroupMetric + cgroupNUMAMetrics []cgroupNUMAMetric metricsFetcherSynced *bool wantAdviceResult types.InternalMemoryCalculationResult }{ @@ -648,42 +683,42 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 10 << 20}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 9 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 10 << 20}, podUID: "uid1", containerName: "c1", - numdID: 1, + numaID: 1, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 9 << 30}, podUID: "uid2", containerName: "c2", - 
numdID: 1, + numaID: 1, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 1, + numaID: 1, }, }, wantAdviceResult: types.InternalMemoryCalculationResult{ @@ -702,7 +737,39 @@ func TestUpdate(t *testing.T) { }, }, { - name: "set reclaimed group memory limit", + name: "set reclaimed group memory limit(succeeded)", + pools: map[string]*types.PoolInfo{ + state.PoolNameReserve: { + PoolName: state.PoolNameReserve, + TopologyAwareAssignments: map[int]machine.CPUSet{ + 0: machine.MustParse("0"), + 1: machine.MustParse("24"), + }, + OriginalTopologyAwareAssignments: map[int]machine.CPUSet{ + 0: machine.MustParse("0"), + 1: machine.MustParse("24"), + }, + }, + }, + reclaimedEnable: true, + needRecvAdvices: true, + wantHeadroom: *resource.NewQuantity(996<<30, resource.DecimalSI), + nodeMetrics: defaultNodeMetrics, + numaMetrics: defaultNumaMetrics, + cgroupMetrics: cgroupMetrics, + cgroupNUMAMetrics: cgroupNUMAMetrics, + plugins: []types.MemoryAdvisorPluginName{memadvisorplugin.MemoryGuard}, + wantAdviceResult: types.InternalMemoryCalculationResult{ + ExtraEntries: []types.ExtraMemoryAdvices{ + { + CgroupPath: "/kubepods/besteffort", + Values: map[string]string{string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.Itoa(240 << 30)}, + }, + }, + }, + }, + { + name: "set reclaimed group memory limit(failed)", pools: map[string]*types.PoolInfo{ state.PoolNameReserve: { PoolName: state.PoolNameReserve, @@ -721,15 +788,9 @@ func TestUpdate(t *testing.T) { wantHeadroom: *resource.NewQuantity(996<<30, resource.DecimalSI), nodeMetrics: defaultNodeMetrics, numaMetrics: defaultNumaMetrics, - cgroupMetrics: cgroupMetrics, plugins: []types.MemoryAdvisorPluginName{memadvisorplugin.MemoryGuard}, wantAdviceResult: types.InternalMemoryCalculationResult{ - ExtraEntries: []types.ExtraMemoryAdvices{ - { - CgroupPath: "/kubepods/besteffort", - Values: 
map[string]string{string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.Itoa(375 << 30)}, - }, - }, + ExtraEntries: []types.ExtraMemoryAdvices{}, }, }, { @@ -1127,28 +1188,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid4", containerName: "c4", - numdID: 0, + numaID: 0, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1286,28 +1347,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 10}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 10}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 10}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 512 << 10}, podUID: "uid4", containerName: "c4", - numdID: 0, + numaID: 0, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1412,28 +1473,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: 
"uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 512 << 20}, podUID: "uid4", containerName: "c4", - numdID: 0, + numaID: 0, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1571,28 +1632,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid4", containerName: "c4", - numdID: 0, + numaID: 0, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1705,28 +1766,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 2, + numaID: 2, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid4", containerName: "c4", - numdID: 2, + numaID: 2, }, }, wantHeadroom: *resource.NewQuantity(980<<30, 
resource.DecimalSI), @@ -1837,28 +1898,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 2, + numaID: 2, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid4", containerName: "c4", - numdID: 2, + numaID: 2, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1896,11 +1957,14 @@ func TestUpdate(t *testing.T) { metricsFetcher.SetContainerMetric(containerMetric.podUID, containerMetric.containerName, containerMetric.metricName, containerMetric.metricValue) } for _, containerNUMAMetric := range tt.containerNUMAMetrics { - metricsFetcher.SetContainerNumaMetric(containerNUMAMetric.podUID, containerNUMAMetric.containerName, strconv.Itoa(containerNUMAMetric.numdID), containerNUMAMetric.metricName, containerNUMAMetric.metricValue) + metricsFetcher.SetContainerNumaMetric(containerNUMAMetric.podUID, containerNUMAMetric.containerName, strconv.Itoa(containerNUMAMetric.numaID), containerNUMAMetric.metricName, containerNUMAMetric.metricValue) } for _, qosClassMetric := range tt.cgroupMetrics { metricsFetcher.SetCgroupMetric(qosClassMetric.cgroupPath, qosClassMetric.metricName, qosClassMetric.metricValue) } + for _, cgroupNUMAMetric := range tt.cgroupNUMAMetrics { + metricsFetcher.SetCgroupNumaMetric(cgroupNUMAMetric.cgroupPath, cgroupNUMAMetric.numaID, cgroupNUMAMetric.metricName, cgroupNUMAMetric.metricValue) + } if tt.metricsFetcherSynced != nil { metricsFetcher.SetSynced(*tt.metricsFetcherSynced) } diff --git 
a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go index 046bccc40..7bc27275b 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go @@ -17,12 +17,14 @@ limitations under the License. package plugin import ( + "math" "strconv" "go.uber.org/atomic" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/helper" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/consts" @@ -33,6 +35,9 @@ import ( const ( MemoryGuard = "memory-guard" + + reconcileStatusSucceeded = "succeeded" + reconcileStatusFailed = "failed" ) type memoryGuard struct { @@ -41,7 +46,9 @@ type memoryGuard struct { emitter metrics.MetricEmitter reclaimRelativeRootCgroupPath string reclaimMemoryLimit *atomic.Int64 + reconcileStatus *atomic.String minCriticalWatermark int64 + conf *config.Configuration } func NewMemoryGuard(conf *config.Configuration, extraConfig interface{}, metaReader metacache.MetaReader, metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter) MemoryAdvisorPlugin { @@ -51,72 +58,64 @@ func NewMemoryGuard(conf *config.Configuration, extraConfig interface{}, metaRea emitter: emitter, reclaimRelativeRootCgroupPath: conf.ReclaimRelativeRootCgroupPath, reclaimMemoryLimit: atomic.NewInt64(-1), + reconcileStatus: atomic.NewString(reconcileStatusFailed), minCriticalWatermark: conf.MinCriticalWatermark, + conf: conf, } } func (mg *memoryGuard) Reconcile(status *types.MemoryPressureStatus) error { - memoryTotal, err := mg.metaServer.GetNodeMetric(consts.MetricMemTotalSystem) - if 
err != nil { - return err - } - - memoryFree, err := mg.metaReader.GetNodeMetric(consts.MetricMemFreeSystem) - if err != nil { - return err - } - - memoryCache, err := mg.metaReader.GetNodeMetric(consts.MetricMemPageCacheSystem) - if err != nil { - return err - } - - memoryBuffer, err := mg.metaReader.GetNodeMetric(consts.MetricMemBufferSystem) - if err != nil { - return err - } - - scaleFactor, err := mg.metaReader.GetNodeMetric(consts.MetricMemScaleFactorSystem) + mg.reconcileStatus.Store(reconcileStatusFailed) + reclaimMemoryLimit := .0 + availNUMAs, _, err := helper.GetAvailableNUMAsAndReclaimedCores(mg.conf, mg.metaReader, mg.metaServer) if err != nil { return err } - criticalWatermark := general.MaxFloat64(float64(mg.minCriticalWatermark*int64(mg.metaServer.NumNUMANodes)), memoryTotal.Value*scaleFactor.Value/10000) - buffer := memoryFree.Value + memoryCache.Value + memoryBuffer.Value - criticalWatermark - if buffer < 0 { - buffer = 0 - } - - reclaimGroupRss, err := mg.metaReader.GetCgroupMetric(mg.reclaimRelativeRootCgroupPath, consts.MetricMemRssCgroup) + watermarkScaleFactor, err := mg.metaServer.GetNodeMetric(consts.MetricMemScaleFactorSystem) if err != nil { + general.ErrorS(err, "Can not get system watermark scale factor") return err } - reclaimGroupUsed, err := mg.metaReader.GetCgroupMetric(mg.reclaimRelativeRootCgroupPath, consts.MetricMemUsageCgroup) - if err != nil { - return err + for _, numaID := range availNUMAs.ToSliceInt() { + reclaimedCoresUsed, err := mg.metaServer.GetCgroupNumaMetric(mg.reclaimRelativeRootCgroupPath, numaID, consts.MetricsMemTotalPerNumaCgroup) + if err != nil { + return err + } + + numaTotal, err := mg.metaServer.GetNumaMetric(numaID, consts.MetricMemTotalNuma) + if err != nil { + return err + } + numaFree, err := mg.metaServer.GetNumaMetric(numaID, consts.MetricMemFreeNuma) + if err != nil { + return err + } + + criticalWatermark := math.Max(float64(mg.minCriticalWatermark), 
numaTotal.Value*watermarkScaleFactor.Value/float64(10000)) + reclaimMemoryLimit += reclaimedCoresUsed.Value + + math.Max(numaFree.Value-criticalWatermark, 0) + + general.InfoS("NUMA memory info", "numaID", numaID, + "criticalWatermark", general.FormatMemoryQuantity(criticalWatermark), + "reclaimedCoresUsed", general.FormatMemoryQuantity(reclaimedCoresUsed.Value), + "numaTotal", general.FormatMemoryQuantity(numaTotal.Value), + "numaFree", general.FormatMemoryQuantity(numaFree.Value), + "reclaimMemoryLimit", general.FormatMemoryQuantity(reclaimMemoryLimit)) } - reclaimMemoryLimit := general.MaxFloat64(reclaimGroupUsed.Value, reclaimGroupRss.Value+buffer) - - general.InfoS("memory details", - "system total", general.FormatMemoryQuantity(memoryTotal.Value), - "system free", general.FormatMemoryQuantity(memoryFree.Value), - "system cache", general.FormatMemoryQuantity(memoryCache.Value), - "system buffer", general.FormatMemoryQuantity(memoryBuffer.Value), - "system scaleFactor", general.FormatMemoryQuantity(scaleFactor.Value), - "criticalWatermark", general.FormatMemoryQuantity(criticalWatermark), - "buffer", general.FormatMemoryQuantity(buffer), - "reclaim cgroup rss", general.FormatMemoryQuantity(reclaimGroupRss.Value), - "reclaim cgroup used", general.FormatMemoryQuantity(reclaimGroupUsed.Value), - ) - mg.reclaimMemoryLimit.Store(int64(reclaimMemoryLimit)) + mg.reconcileStatus.Store(reconcileStatusSucceeded) return nil } func (mg *memoryGuard) GetAdvices() types.InternalMemoryCalculationResult { + if mg.reconcileStatus.Load() == reconcileStatusFailed { + general.Errorf("failed to get last reconcile result") + return types.InternalMemoryCalculationResult{} + } result := types.InternalMemoryCalculationResult{ ExtraEntries: []types.ExtraMemoryAdvices{ { diff --git a/pkg/metaserver/agent/metric/fake_metric.go b/pkg/metaserver/agent/metric/fake_metric.go index 42d8f4d29..cc35b9c08 100644 --- a/pkg/metaserver/agent/metric/fake_metric.go +++ 
b/pkg/metaserver/agent/metric/fake_metric.go @@ -162,10 +162,10 @@ func (f *FakeMetricsFetcher) GetCgroupMetric(cgroupPath, metricName string) (met return f.metricStore.GetCgroupMetric(cgroupPath, metricName) } -func (f *FakeMetricsFetcher) SetCgroupNumaMetric(cgroupPath, numaNode, metricName string, data metric.MetricData) { +func (f *FakeMetricsFetcher) SetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string, data metric.MetricData) { f.metricStore.SetCgroupNumaMetric(cgroupPath, numaNode, metricName, data) } -func (f *FakeMetricsFetcher) GetCgroupNumaMetric(cgroupPath, numaNode, metricName string) (metric.MetricData, error) { +func (f *FakeMetricsFetcher) GetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string) (metric.MetricData, error) { return f.checkMetricDataExpire(f.metricStore.GetCgroupNumaMetric(cgroupPath, numaNode, metricName)) } diff --git a/pkg/metaserver/agent/metric/metric_impl.go b/pkg/metaserver/agent/metric/metric_impl.go index b0c443e8f..c3108de7f 100644 --- a/pkg/metaserver/agent/metric/metric_impl.go +++ b/pkg/metaserver/agent/metric/metric_impl.go @@ -278,7 +278,7 @@ func (f *MetricsFetcherImpl) GetCgroupMetric(cgroupPath, metricName string) (uti return f.checkMetricDataExpire(f.metricStore.GetCgroupMetric(cgroupPath, metricName)) } -func (f *MetricsFetcherImpl) GetCgroupNumaMetric(cgroupPath, numaNode, metricName string) (utilmetric.MetricData, error) { +func (f *MetricsFetcherImpl) GetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string) (utilmetric.MetricData, error) { return f.checkMetricDataExpire(f.metricStore.GetCgroupNumaMetric(cgroupPath, numaNode, metricName)) } diff --git a/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go b/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go index 0a84565f7..64c87a82a 100644 --- a/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go +++ b/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go @@ -589,7 
+589,12 @@ func (m *MalachiteMetricsProvisioner) processCgroupPerNumaMemoryData(cgroupPath updateTime := time.Unix(cgStats.V1.Memory.UpdateTime, 0) for _, data := range numaStats { - numaID := strings.TrimPrefix(data.NumaName, "N") + numaIDStr := strings.TrimPrefix(data.NumaName, "N") + numaID, err := strconv.Atoi(numaIDStr) + if err != nil { + klog.ErrorS(err, "failed to parse numa", "str", numaIDStr) + continue + } m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemTotalPerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(data.HierarchicalTotal << pageShift)}) m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemFilePerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(data.HierarchicalFile << pageShift)}) m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemAnonPerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(data.HierarchicalAnon << pageShift)}) @@ -599,7 +604,12 @@ func (m *MalachiteMetricsProvisioner) processCgroupPerNumaMemoryData(cgroupPath updateTime := time.Unix(cgStats.V2.Memory.UpdateTime, 0) for numa, data := range numaStats { - numaID := strings.TrimPrefix(numa, "N") + numaIDStr := strings.TrimPrefix(numa, "N") + numaID, err := strconv.Atoi(numaIDStr) + if err != nil { + klog.ErrorS(err, "failed to parse numaIDStr", "str", numaIDStr) + continue + } total := data.Anon + data.File + data.Unevictable m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemTotalPerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(total)}) m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemFilePerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(data.File)}) diff --git a/pkg/metaserver/agent/metric/types/metric.go b/pkg/metaserver/agent/metric/types/metric.go index a19a99bcf..53e304004 100644 --- a/pkg/metaserver/agent/metric/types/metric.go +++ b/pkg/metaserver/agent/metric/types/metric.go 
@@ -86,7 +86,7 @@ type MetricsReader interface { // GetCgroupMetric get metric of cgroup path: /kubepods/burstable, /kubepods/besteffort, etc. GetCgroupMetric(cgroupPath, metricName string) (metric.MetricData, error) // GetCgroupNumaMetric get NUMA metric of qos class: /kubepods/burstable, /kubepods/besteffort, etc. - GetCgroupNumaMetric(cgroupPath, numaNode, metricName string) (metric.MetricData, error) + GetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string) (metric.MetricData, error) HasSynced() bool } diff --git a/pkg/util/metric/store.go b/pkg/util/metric/store.go index 1affa464c..b23461c05 100644 --- a/pkg/util/metric/store.go +++ b/pkg/util/metric/store.go @@ -48,7 +48,7 @@ type MetricStore struct { podContainerNumaMetricMap map[string]map[string]map[string]map[string]MetricData // map[podUID]map[containerName]map[numaNode]map[metricName]data podVolumeMetricMap map[string]map[string]map[string]MetricData // map[podUID]map[volumeName]map[metricName]data cgroupMetricMap map[string]map[string]MetricData // map[cgroupPath]map[metricName]value - cgroupNumaMetricMap map[string]map[string]map[string]MetricData // map[cgroupPath]map[numaNode]map[metricName]value + cgroupNumaMetricMap map[string]map[int]map[string]MetricData // map[cgroupPath]map[numaNode]map[metricName]value } func NewMetricStore() *MetricStore { @@ -62,7 +62,7 @@ func NewMetricStore() *MetricStore { podContainerNumaMetricMap: make(map[string]map[string]map[string]map[string]MetricData), podVolumeMetricMap: make(map[string]map[string]map[string]MetricData), cgroupMetricMap: make(map[string]map[string]MetricData), - cgroupNumaMetricMap: make(map[string]map[string]map[string]MetricData), + cgroupNumaMetricMap: make(map[string]map[int]map[string]MetricData), } } @@ -301,13 +301,13 @@ func (c *MetricStore) GetCgroupMetric(cgroupPath, metricName string) (MetricData return data, nil } -func (c *MetricStore) SetCgroupNumaMetric(cgroupPath, numaNode, metricName string, data MetricData) { 
+func (c *MetricStore) SetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string, data MetricData) { c.mutex.Lock() defer c.mutex.Unlock() numaMetrics, ok := c.cgroupNumaMetricMap[cgroupPath] if !ok { - numaMetrics = make(map[string]map[string]MetricData) + numaMetrics = make(map[int]map[string]MetricData) c.cgroupNumaMetricMap[cgroupPath] = numaMetrics } metrics, ok := numaMetrics[numaNode] @@ -318,7 +318,7 @@ func (c *MetricStore) SetCgroupNumaMetric(cgroupPath, numaNode, metricName strin metrics[metricName] = data } -func (c *MetricStore) GetCgroupNumaMetric(cgroupPath, numaNode, metricName string) (MetricData, error) { +func (c *MetricStore) GetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string) (MetricData, error) { c.mutex.RLock() defer c.mutex.RUnlock() numaMetrics, ok := c.cgroupNumaMetricMap[cgroupPath]