diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go index 69c109515d..ef5ba667a0 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go @@ -175,7 +175,7 @@ type containerNUMAMetric struct { metricValue metricutil.MetricData podUID string containerName string - numdID int + numaID int } type cgroupMetric struct { @@ -184,6 +184,13 @@ type cgroupMetric struct { cgroupPath string } +type cgroupNUMAMetric struct { + metricName string + metricValue metricutil.MetricData + numaID int + cgroupPath string +} + var defaultPodList = []*v1.Pod{ { ObjectMeta: metav1.ObjectMeta{ @@ -388,6 +395,33 @@ var cgroupMetrics = []cgroupMetric{ }, } +var cgroupNUMAMetrics = []cgroupNUMAMetric{ + { + metricName: coreconsts.MetricsMemTotalPerNumaCgroup, + numaID: 0, + cgroupPath: "/kubepods/besteffort", + metricValue: metricutil.MetricData{Value: 6 << 30}, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaCgroup, + numaID: 1, + cgroupPath: "/kubepods/besteffort", + metricValue: metricutil.MetricData{Value: 6 << 30}, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaCgroup, + numaID: 2, + cgroupPath: "/kubepods/besteffort", + metricValue: metricutil.MetricData{Value: 6 << 30}, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaCgroup, + numaID: 3, + cgroupPath: "/kubepods/besteffort", + metricValue: metricutil.MetricData{Value: 6 << 30}, + }, +} + func TestUpdate(t *testing.T) { t.Parallel() @@ -405,6 +439,7 @@ func TestUpdate(t *testing.T) { containerMetrics []containerMetric containerNUMAMetrics []containerNUMAMetric cgroupMetrics []cgroupMetric + cgroupNUMAMetrics []cgroupNUMAMetric metricsFetcherSynced *bool wantAdviceResult types.InternalMemoryCalculationResult }{ @@ -648,42 +683,42 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 10 << 20}, 
podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 9 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 10 << 20}, podUID: "uid1", containerName: "c1", - numdID: 1, + numaID: 1, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 9 << 30}, podUID: "uid2", containerName: "c2", - numdID: 1, + numaID: 1, }, { metricName: coreconsts.MetricsMemFilePerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 1, + numaID: 1, }, }, wantAdviceResult: types.InternalMemoryCalculationResult{ @@ -702,7 +737,39 @@ func TestUpdate(t *testing.T) { }, }, { - name: "set reclaimed group memory limit", + name: "set reclaimed group memory limit(succeeded)", + pools: map[string]*types.PoolInfo{ + state.PoolNameReserve: { + PoolName: state.PoolNameReserve, + TopologyAwareAssignments: map[int]machine.CPUSet{ + 0: machine.MustParse("0"), + 1: machine.MustParse("24"), + }, + OriginalTopologyAwareAssignments: map[int]machine.CPUSet{ + 0: machine.MustParse("0"), + 1: machine.MustParse("24"), + }, + }, + }, + reclaimedEnable: true, + needRecvAdvices: true, + wantHeadroom: *resource.NewQuantity(996<<30, resource.DecimalSI), + nodeMetrics: defaultNodeMetrics, + numaMetrics: defaultNumaMetrics, + cgroupMetrics: cgroupMetrics, + cgroupNUMAMetrics: cgroupNUMAMetrics, + plugins: []types.MemoryAdvisorPluginName{memadvisorplugin.MemoryGuard}, + wantAdviceResult: types.InternalMemoryCalculationResult{ + ExtraEntries: []types.ExtraMemoryAdvices{ + { + CgroupPath: "/kubepods/besteffort", + Values: 
map[string]string{string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.Itoa(216 << 30)}, + }, + }, + }, + }, + { + name: "set reclaimed group memory limit(failed)", pools: map[string]*types.PoolInfo{ state.PoolNameReserve: { PoolName: state.PoolNameReserve, @@ -721,15 +788,9 @@ func TestUpdate(t *testing.T) { wantHeadroom: *resource.NewQuantity(996<<30, resource.DecimalSI), nodeMetrics: defaultNodeMetrics, numaMetrics: defaultNumaMetrics, - cgroupMetrics: cgroupMetrics, plugins: []types.MemoryAdvisorPluginName{memadvisorplugin.MemoryGuard}, wantAdviceResult: types.InternalMemoryCalculationResult{ - ExtraEntries: []types.ExtraMemoryAdvices{ - { - CgroupPath: "/kubepods/besteffort", - Values: map[string]string{string(memoryadvisor.ControlKnobKeyMemoryLimitInBytes): strconv.Itoa(375 << 30)}, - }, - }, + ExtraEntries: []types.ExtraMemoryAdvices{}, }, }, { @@ -1127,28 +1188,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid4", containerName: "c4", - numdID: 0, + numaID: 0, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1286,28 +1347,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 10}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 10}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { 
metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 10}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 512 << 10}, podUID: "uid4", containerName: "c4", - numdID: 0, + numaID: 0, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1412,28 +1473,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 512 << 20}, podUID: "uid4", containerName: "c4", - numdID: 0, + numaID: 0, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1571,28 +1632,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid3", containerName: "c3", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid4", containerName: "c4", - numdID: 0, + numaID: 0, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1705,28 +1766,28 @@ func TestUpdate(t 
*testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 2, + numaID: 2, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid4", containerName: "c4", - numdID: 2, + numaID: 2, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1837,28 +1898,28 @@ func TestUpdate(t *testing.T) { metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid2", containerName: "c2", - numdID: 0, + numaID: 0, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 2 << 30}, podUID: "uid1", containerName: "c1", - numdID: 2, + numaID: 2, }, { metricName: coreconsts.MetricsMemAnonPerNumaContainer, metricValue: metricutil.MetricData{Value: 1 << 30}, podUID: "uid4", containerName: "c4", - numdID: 2, + numaID: 2, }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), @@ -1896,11 +1957,14 @@ func TestUpdate(t *testing.T) { metricsFetcher.SetContainerMetric(containerMetric.podUID, containerMetric.containerName, containerMetric.metricName, containerMetric.metricValue) } for _, containerNUMAMetric := range tt.containerNUMAMetrics { - metricsFetcher.SetContainerNumaMetric(containerNUMAMetric.podUID, containerNUMAMetric.containerName, strconv.Itoa(containerNUMAMetric.numdID), containerNUMAMetric.metricName, containerNUMAMetric.metricValue) + 
metricsFetcher.SetContainerNumaMetric(containerNUMAMetric.podUID, containerNUMAMetric.containerName, strconv.Itoa(containerNUMAMetric.numaID), containerNUMAMetric.metricName, containerNUMAMetric.metricValue) } for _, qosClassMetric := range tt.cgroupMetrics { metricsFetcher.SetCgroupMetric(qosClassMetric.cgroupPath, qosClassMetric.metricName, qosClassMetric.metricValue) } + for _, cgroupNUMAMetric := range tt.cgroupNUMAMetrics { + metricsFetcher.SetCgroupNumaMetric(cgroupNUMAMetric.cgroupPath, cgroupNUMAMetric.numaID, cgroupNUMAMetric.metricName, cgroupNUMAMetric.metricValue) + } if tt.metricsFetcherSynced != nil { metricsFetcher.SetSynced(*tt.metricsFetcherSynced) } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go index 046bccc40c..85780b9fca 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/memory_guard.go @@ -17,12 +17,14 @@ limitations under the License. 
package plugin import ( + "math" "strconv" "go.uber.org/atomic" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/helper" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/consts" @@ -31,8 +33,15 @@ import ( "github.com/kubewharf/katalyst-core/pkg/util/general" ) +type reconcileStatus string + const ( MemoryGuard = "memory-guard" + // criticalWaterMarkScaleFactor is the multiple of the per-NUMA critical watermark used as the safe watermark; only free memory above it is added to the reclaimed cgroup memory limit + criticalWaterMarkScaleFactor = 2 + + reconcileStatusSucceeded = "succeeded" + reconcileStatusFailed = "failed" ) type memoryGuard struct { @@ -41,7 +50,9 @@ type memoryGuard struct { emitter metrics.MetricEmitter reclaimRelativeRootCgroupPath string reclaimMemoryLimit *atomic.Int64 + reconcileStatus *atomic.String minCriticalWatermark int64 + conf *config.Configuration } func NewMemoryGuard(conf *config.Configuration, extraConfig interface{}, metaReader metacache.MetaReader, metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter) MemoryAdvisorPlugin { @@ -51,72 +62,64 @@ func NewMemoryGuard(conf *config.Configuration, extraConfig interface{}, metaRea emitter: emitter, reclaimRelativeRootCgroupPath: conf.ReclaimRelativeRootCgroupPath, reclaimMemoryLimit: atomic.NewInt64(-1), + reconcileStatus: atomic.NewString(reconcileStatusFailed), minCriticalWatermark: conf.MinCriticalWatermark, + conf: conf, } } func (mg *memoryGuard) Reconcile(status *types.MemoryPressureStatus) error { - memoryTotal, err := mg.metaServer.GetNodeMetric(consts.MetricMemTotalSystem) + mg.reconcileStatus.Store(reconcileStatusFailed) + reclaimMemoryLimit := .0 + availNUMAs, _, err := helper.GetAvailableNUMAsAndReclaimedCores(mg.conf, mg.metaReader, mg.metaServer) if err != nil { return err } - 
memoryFree, err := mg.metaReader.GetNodeMetric(consts.MetricMemFreeSystem) + watermarkScaleFactor, err := mg.metaServer.GetNodeMetric(consts.MetricMemScaleFactorSystem) if err != nil { + general.ErrorS(err, "Can not get system watermark scale factor") return err } - memoryCache, err := mg.metaReader.GetNodeMetric(consts.MetricMemPageCacheSystem) - if err != nil { - return err + for _, numaID := range availNUMAs.ToSliceInt() { + reclaimedCoresUsed, err := mg.metaServer.GetCgroupNumaMetric(mg.reclaimRelativeRootCgroupPath, numaID, consts.MetricsMemTotalPerNumaCgroup) + if err != nil { + return err + } + + numaTotal, err := mg.metaServer.GetNumaMetric(numaID, consts.MetricMemTotalNuma) + if err != nil { + return err + } + numaFree, err := mg.metaServer.GetNumaMetric(numaID, consts.MetricMemFreeNuma) + if err != nil { + return err + } + + criticalWatermark := math.Max(float64(mg.minCriticalWatermark), numaTotal.Value*watermarkScaleFactor.Value/float64(10000)) + reclaimMemoryLimit += reclaimedCoresUsed.Value + + math.Max(numaFree.Value-criticalWaterMarkScaleFactor*criticalWatermark, 0) + + general.InfoS("NUMA memory info", "numaID", numaID, + "criticalWatermark", general.FormatMemoryQuantity(criticalWatermark), + "reclaimedCoresUsed", general.FormatMemoryQuantity(reclaimedCoresUsed.Value), + "numaTotal", general.FormatMemoryQuantity(numaTotal.Value), + "numaFree", general.FormatMemoryQuantity(numaFree.Value), + "reclaimMemoryLimit", general.FormatMemoryQuantity(reclaimMemoryLimit)) } - memoryBuffer, err := mg.metaReader.GetNodeMetric(consts.MetricMemBufferSystem) - if err != nil { - return err - } - - scaleFactor, err := mg.metaReader.GetNodeMetric(consts.MetricMemScaleFactorSystem) - if err != nil { - return err - } - - criticalWatermark := general.MaxFloat64(float64(mg.minCriticalWatermark*int64(mg.metaServer.NumNUMANodes)), memoryTotal.Value*scaleFactor.Value/10000) - buffer := memoryFree.Value + memoryCache.Value + memoryBuffer.Value - criticalWatermark - if buffer 
< 0 { - buffer = 0 - } - - reclaimGroupRss, err := mg.metaReader.GetCgroupMetric(mg.reclaimRelativeRootCgroupPath, consts.MetricMemRssCgroup) - if err != nil { - return err - } - - reclaimGroupUsed, err := mg.metaReader.GetCgroupMetric(mg.reclaimRelativeRootCgroupPath, consts.MetricMemUsageCgroup) - if err != nil { - return err - } - - reclaimMemoryLimit := general.MaxFloat64(reclaimGroupUsed.Value, reclaimGroupRss.Value+buffer) - - general.InfoS("memory details", - "system total", general.FormatMemoryQuantity(memoryTotal.Value), - "system free", general.FormatMemoryQuantity(memoryFree.Value), - "system cache", general.FormatMemoryQuantity(memoryCache.Value), - "system buffer", general.FormatMemoryQuantity(memoryBuffer.Value), - "system scaleFactor", general.FormatMemoryQuantity(scaleFactor.Value), - "criticalWatermark", general.FormatMemoryQuantity(criticalWatermark), - "buffer", general.FormatMemoryQuantity(buffer), - "reclaim cgroup rss", general.FormatMemoryQuantity(reclaimGroupRss.Value), - "reclaim cgroup used", general.FormatMemoryQuantity(reclaimGroupUsed.Value), - ) - mg.reclaimMemoryLimit.Store(int64(reclaimMemoryLimit)) + mg.reconcileStatus.Store(reconcileStatusSucceeded) return nil } func (mg *memoryGuard) GetAdvices() types.InternalMemoryCalculationResult { + if mg.reconcileStatus.Load() == reconcileStatusFailed { + general.Errorf("failed to get last reconcile result") + return types.InternalMemoryCalculationResult{} + } result := types.InternalMemoryCalculationResult{ ExtraEntries: []types.ExtraMemoryAdvices{ { diff --git a/pkg/metaserver/agent/metric/fake_metric.go b/pkg/metaserver/agent/metric/fake_metric.go index 42d8f4d293..cc35b9c089 100644 --- a/pkg/metaserver/agent/metric/fake_metric.go +++ b/pkg/metaserver/agent/metric/fake_metric.go @@ -162,10 +162,10 @@ func (f *FakeMetricsFetcher) GetCgroupMetric(cgroupPath, metricName string) (met return f.metricStore.GetCgroupMetric(cgroupPath, metricName) } -func (f *FakeMetricsFetcher) 
SetCgroupNumaMetric(cgroupPath, numaNode, metricName string, data metric.MetricData) { +func (f *FakeMetricsFetcher) SetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string, data metric.MetricData) { f.metricStore.SetCgroupNumaMetric(cgroupPath, numaNode, metricName, data) } -func (f *FakeMetricsFetcher) GetCgroupNumaMetric(cgroupPath, numaNode, metricName string) (metric.MetricData, error) { +func (f *FakeMetricsFetcher) GetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string) (metric.MetricData, error) { return f.checkMetricDataExpire(f.metricStore.GetCgroupNumaMetric(cgroupPath, numaNode, metricName)) } diff --git a/pkg/metaserver/agent/metric/metric_impl.go b/pkg/metaserver/agent/metric/metric_impl.go index b0c443e8f4..c3108de7f6 100644 --- a/pkg/metaserver/agent/metric/metric_impl.go +++ b/pkg/metaserver/agent/metric/metric_impl.go @@ -278,7 +278,7 @@ func (f *MetricsFetcherImpl) GetCgroupMetric(cgroupPath, metricName string) (uti return f.checkMetricDataExpire(f.metricStore.GetCgroupMetric(cgroupPath, metricName)) } -func (f *MetricsFetcherImpl) GetCgroupNumaMetric(cgroupPath, numaNode, metricName string) (utilmetric.MetricData, error) { +func (f *MetricsFetcherImpl) GetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string) (utilmetric.MetricData, error) { return f.checkMetricDataExpire(f.metricStore.GetCgroupNumaMetric(cgroupPath, numaNode, metricName)) } diff --git a/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go b/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go index 0461c36203..b72af41b88 100644 --- a/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go +++ b/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go @@ -484,7 +484,12 @@ func (m *MalachiteMetricsProvisioner) processCgroupPerNumaMemoryData(cgroupPath updateTime := time.Unix(cgStats.V1.Memory.UpdateTime, 0) for _, data := range numaStats { - numaID := strings.TrimPrefix(data.NumaName, "N") + 
numaIDStr := strings.TrimPrefix(data.NumaName, "N") + numaID, err := strconv.Atoi(numaIDStr) + if err != nil { + klog.ErrorS(err, "failed to parse numa", "str", numaIDStr) + continue + } m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemTotalPerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(data.HierarchicalTotal << pageShift)}) m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemFilePerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(data.HierarchicalFile << pageShift)}) m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemAnonPerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(data.HierarchicalAnon << pageShift)}) @@ -494,7 +499,12 @@ func (m *MalachiteMetricsProvisioner) processCgroupPerNumaMemoryData(cgroupPath updateTime := time.Unix(cgStats.V2.Memory.UpdateTime, 0) for numa, data := range numaStats { - numaID := strings.TrimPrefix(numa, "N") + numaIDStr := strings.TrimPrefix(numa, "N") + numaID, err := strconv.Atoi(numaIDStr) + if err != nil { + klog.ErrorS(err, "failed to parse numaIDStr", "str", numaIDStr) + continue + } total := data.Anon + data.File + data.Unevictable m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemTotalPerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(total)}) m.metricStore.SetCgroupNumaMetric(cgroupPath, numaID, consts.MetricsMemFilePerNumaCgroup, utilmetric.MetricData{Time: &updateTime, Value: float64(data.File)}) diff --git a/pkg/metaserver/agent/metric/types/metric.go b/pkg/metaserver/agent/metric/types/metric.go index a19a99bcfd..53e304004f 100644 --- a/pkg/metaserver/agent/metric/types/metric.go +++ b/pkg/metaserver/agent/metric/types/metric.go @@ -86,7 +86,7 @@ type MetricsReader interface { // GetCgroupMetric get metric of cgroup path: /kubepods/burstable, /kubepods/besteffort, etc. 
GetCgroupMetric(cgroupPath, metricName string) (metric.MetricData, error) // GetCgroupNumaMetric get NUMA metric of qos class: /kubepods/burstable, /kubepods/besteffort, etc. - GetCgroupNumaMetric(cgroupPath, numaNode, metricName string) (metric.MetricData, error) + GetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string) (metric.MetricData, error) HasSynced() bool } diff --git a/pkg/util/metric/store.go b/pkg/util/metric/store.go index c041892487..3fb88ad701 100644 --- a/pkg/util/metric/store.go +++ b/pkg/util/metric/store.go @@ -47,7 +47,7 @@ type MetricStore struct { podContainerNumaMetricMap map[string]map[string]map[string]map[string]MetricData // map[podUID]map[containerName]map[numaNode]map[metricName]data podVolumeMetricMap map[string]map[string]map[string]MetricData // map[podUID]map[volumeName]map[metricName]data cgroupMetricMap map[string]map[string]MetricData // map[cgroupPath]map[metricName]value - cgroupNumaMetricMap map[string]map[string]map[string]MetricData // map[cgroupPath]map[numaNode]map[metricName]value + cgroupNumaMetricMap map[string]map[int]map[string]MetricData // map[cgroupPath]map[numaNode]map[metricName]value } func NewMetricStore() *MetricStore { @@ -60,7 +60,7 @@ func NewMetricStore() *MetricStore { podContainerNumaMetricMap: make(map[string]map[string]map[string]map[string]MetricData), podVolumeMetricMap: make(map[string]map[string]map[string]MetricData), cgroupMetricMap: make(map[string]map[string]MetricData), - cgroupNumaMetricMap: make(map[string]map[string]map[string]MetricData), + cgroupNumaMetricMap: make(map[string]map[int]map[string]MetricData), } } @@ -277,13 +277,13 @@ func (c *MetricStore) GetCgroupMetric(cgroupPath, metricName string) (MetricData return data, nil } -func (c *MetricStore) SetCgroupNumaMetric(cgroupPath, numaNode, metricName string, data MetricData) { +func (c *MetricStore) SetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string, data MetricData) { c.mutex.Lock() defer 
c.mutex.Unlock() numaMetrics, ok := c.cgroupNumaMetricMap[cgroupPath] if !ok { - numaMetrics = make(map[string]map[string]MetricData) + numaMetrics = make(map[int]map[string]MetricData) c.cgroupNumaMetricMap[cgroupPath] = numaMetrics } metrics, ok := numaMetrics[numaNode] @@ -294,7 +294,7 @@ func (c *MetricStore) SetCgroupNumaMetric(cgroupPath, numaNode, metricName strin metrics[metricName] = data } -func (c *MetricStore) GetCgroupNumaMetric(cgroupPath, numaNode, metricName string) (MetricData, error) { +func (c *MetricStore) GetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string) (MetricData, error) { c.mutex.RLock() defer c.mutex.RUnlock() numaMetrics, ok := c.cgroupNumaMetricMap[cgroupPath]