Skip to content

Commit

Permalink
topology-aware: implement metrics collection.
Browse files Browse the repository at this point in the history
Implement collection of per-zone Prometheus metrics.
Currently we collect the following for each pool/zone:
  - name, cpuset and memset
  - shared pool capacity, allocation, available amount
  - memory capacity, allocation, available amount
  - number of containers
  - number of containers in the shared pool

Signed-off-by: Krisztian Litkey <krisztian.litkey@intel.com>
  • Loading branch information
klihub committed Nov 11, 2024
1 parent 57708c8 commit bf184c5
Show file tree
Hide file tree
Showing 2 changed files with 330 additions and 17 deletions.
329 changes: 329 additions & 0 deletions cmd/plugins/topology-aware/policy/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,329 @@
// Copyright The NRI Plugins Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
"slices"
"strings"

libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
policyapi "github.com/containers/nri-plugins/pkg/resmgr/policy"
"github.com/containers/nri-plugins/pkg/utils/cpuset"
"github.com/prometheus/client_golang/prometheus"
)

// TopologyAwareMetrics bundles the per-zone data polled from the
// topology-aware policy with the prometheus gauges used to export it.
type TopologyAwareMetrics struct {
// ZoneNames holds the names of the zones recorded in Zones.
ZoneNames []string
// Zones maps a zone (pool) name to the data last polled for it.
Zones map[string]*Zone
// Metrics holds the prometheus gauge vectors the zone data is published through.
Metrics Metrics
// registered is not referenced by any code visible in this file.
// NOTE(review): presumably meant to track collector registration — confirm or remove.
registered bool
}

// Zone is a snapshot of the resources and container counts of a single
// topology zone (pool), as gathered by PollMetrics.
type Zone struct {
// Name is the name of the pool this zone was created for.
Name string
// Cpus is the union of the reserved, isolated and sharable CPUs of the
// pool's total supply, captured when the zone is first seen.
Cpus cpuset.CPUSet
// Mems is the node mask built from the pool's memoryAll memset, captured
// when the zone is first seen.
Mems libmem.NodeMask
// SharedPool is the union of the sharable and reserved CPUs of the pool's
// free supply at the time of the last poll.
SharedPool cpuset.CPUSet
// SharedAssigned is the sum of granted reserved and granted shared CPU of
// the free supply. It is divided by 1000 when exported
// (presumably milli-CPU units — TODO confirm).
SharedAssigned int
// SharedAvailable is the allocatable shared CPU of the free supply. It is
// divided by 1000 when exported (presumably milli-CPU units — TODO confirm).
SharedAvailable int
// MemCapacity is the memory allocator's reported capacity for Mems,
// captured when the zone is first seen.
MemCapacity int64
// MemAssigned is the memory allocator's reported usage for the zone's memory nodes.
MemAssigned int64
// MemAvailable is the memory allocator's reported available amount for the
// zone's memory nodes.
MemAvailable int64
// ContainerCount is the number of grants whose CPU node is this pool.
ContainerCount int
// SharedContainerCount is the number of those grants with a non-zero
// reserved or shared CPU portion.
SharedContainerCount int
}

// Metrics holds the prometheus gauge vectors used to export per-zone data.
// Every vector is labelled with the zone name; some carry additional
// "cpus" or "mems" labels.
type Metrics struct {
// zone exports topologyaware_policy_zone_cpu_capacity (labels: zone, cpus, mems).
zone *prometheus.GaugeVec
// cpuSharedCapacity exports topologyaware_policy_zone_cpu_shared_capacity (labels: zone, cpus).
cpuSharedCapacity *prometheus.GaugeVec
// cpuSharedAssigned exports topologyaware_policy_zone_cpu_shared_assigned (labels: zone, cpus).
cpuSharedAssigned *prometheus.GaugeVec
// cpuSharedAvailable exports topologyaware_policy_zone_cpu_shared_available (labels: zone, cpus).
cpuSharedAvailable *prometheus.GaugeVec
// memCapacity exports topologyaware_zone_mem_capacity (labels: zone, mems).
memCapacity *prometheus.GaugeVec
// memAssigned exports topologyaware_zone_mem_assigned (labels: zone, mems).
memAssigned *prometheus.GaugeVec
// memAvailable exports topologyaware_zone_mem_available (labels: zone, mems).
memAvailable *prometheus.GaugeVec
// containerCount exports topologyaware_zone_container_count (label: zone).
containerCount *prometheus.GaugeVec
// sharedContainerCount exports topologyaware_zone_shared_container_count (label: zone).
sharedContainerCount *prometheus.GaugeVec
}

// NewTopologyAwareMetrics creates an empty set of zone metrics together with
// the (unregistered) prometheus gauge vectors used to export them.
func NewTopologyAwareMetrics() *TopologyAwareMetrics {
	// gauge builds a gauge vector with the given name, help text and label names.
	gauge := func(name, help string, labels ...string) *prometheus.GaugeVec {
		opts := prometheus.GaugeOpts{
			Name: name,
			Help: help,
		}
		return prometheus.NewGaugeVec(opts, labels)
	}

	gauges := Metrics{
		zone: gauge(
			"topologyaware_policy_zone_cpu_capacity",
			"A topology zone of CPUs.",
			"zone", "cpus", "mems",
		),
		cpuSharedCapacity: gauge(
			"topologyaware_policy_zone_cpu_shared_capacity",
			"Capacity of shared CPU pool of a topology zone.",
			"zone", "cpus",
		),
		cpuSharedAssigned: gauge(
			"topologyaware_policy_zone_cpu_shared_assigned",
			"Assigned amount of shared CPU pool of a topology zone.",
			"zone", "cpus",
		),
		cpuSharedAvailable: gauge(
			"topologyaware_policy_zone_cpu_shared_available",
			"Available amount of shared CPU pool of a topology zone.",
			"zone", "cpus",
		),
		memCapacity: gauge(
			"topologyaware_zone_mem_capacity",
			"Memory capacity of a topology zone.",
			"zone", "mems",
		),
		memAssigned: gauge(
			"topologyaware_zone_mem_assigned",
			"Amount of assigned memory of a topology zone.",
			"zone", "mems",
		),
		memAvailable: gauge(
			"topologyaware_zone_mem_available",
			"Amount of available memory of a topology zone.",
			"zone", "mems",
		),
		containerCount: gauge(
			"topologyaware_zone_container_count",
			"Number of containers assigned to a topology zone.",
			"zone",
		),
		sharedContainerCount: gauge(
			"topologyaware_zone_shared_container_count",
			"Number of containers in the shared CPU pool of a topology zone.",
			"zone",
		),
	}

	return &TopologyAwareMetrics{
		Zones:   make(map[string]*Zone),
		Metrics: gauges,
	}
}

// DescribeMetrics returns the prometheus descriptors of all policy-specific
// gauge vectors, creating the metrics on first use.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
	if p.metrics == nil {
		p.metrics = NewTopologyAwareMetrics()
	}

	// Gauges are described in this fixed order.
	gauges := []*prometheus.GaugeVec{
		p.metrics.Metrics.zone,
		p.metrics.Metrics.cpuSharedCapacity,
		p.metrics.Metrics.cpuSharedAssigned,
		p.metrics.Metrics.cpuSharedAvailable,
		p.metrics.Metrics.memCapacity,
		p.metrics.Metrics.memAssigned,
		p.metrics.Metrics.memAvailable,
		p.metrics.Metrics.containerCount,
		p.metrics.Metrics.sharedContainerCount,
	}

	// Describe writes to a channel, so feed it from a goroutine and drain it
	// here; closing the channel ends the range loop below.
	descCh := make(chan *prometheus.Desc)
	go func() {
		defer close(descCh)
		for _, gauge := range gauges {
			gauge.Describe(descCh)
		}
	}()

	result := []*prometheus.Desc{}
	for desc := range descCh {
		log.Info(" described metric %s", desc.String())
		result = append(result, desc)
	}

	return result
}

// CollectMetrics generates prometheus metrics from the policy's gauge vectors,
// which are kept up to date by PollMetrics.
//
// The pm argument is currently unused; the data is read from p.metrics. If no
// metrics have been created yet (neither DescribeMetrics nor PollMetrics has
// run), there is nothing to collect and (nil, nil) is returned instead of
// dereferencing a nil pointer.
func (p *policy) CollectMetrics(pm policyapi.Metrics) ([]prometheus.Metric, error) {
	m := p.metrics
	if m == nil {
		return nil, nil
	}

	// Gauges are collected in this fixed order.
	gauges := []*prometheus.GaugeVec{
		m.Metrics.zone,
		m.Metrics.cpuSharedCapacity,
		m.Metrics.cpuSharedAssigned,
		m.Metrics.cpuSharedAvailable,
		m.Metrics.memCapacity,
		m.Metrics.memAssigned,
		m.Metrics.memAvailable,
		m.Metrics.containerCount,
		m.Metrics.sharedContainerCount,
	}

	// Collect writes to a channel, so feed it from a goroutine and drain it
	// here; closing the channel ends the range loop below.
	ch := make(chan prometheus.Metric)
	go func() {
		defer close(ch)
		for _, gauge := range gauges {
			gauge.Collect(ch)
		}
	}()

	metrics := []prometheus.Metric{}
	for metric := range ch {
		// This runs for every metric on every collection, so log at debug
		// level like the per-poll message in PollMetrics.
		log.Debug(" collected metric %s", metric.Desc().String())
		metrics = append(metrics, metric)
	}

	return metrics, nil
}

// PollMetrics provides policy metrics for monitoring.
//
// For every pool it refreshes the corresponding Zone snapshot and the
// prometheus gauges derived from it, creating the zone on first sight. The
// returned value is the policy's *TopologyAwareMetrics.
func (p *policy) PollMetrics() policyapi.Metrics {
	if p.metrics == nil {
		p.metrics = NewTopologyAwareMetrics()
	}

	m := p.metrics

	// Count containers per CPU node in a single pass over the grants instead
	// of rescanning all grants for every pool.
	var (
		containers = make(map[string]int, len(p.pools))
		sharedctrs = make(map[string]int, len(p.pools))
	)
	for _, g := range p.allocations.grants {
		node := g.GetCPUNode().Name()
		containers[node]++
		if g.ReservedPortion() != 0 || g.CPUPortion() != 0 {
			sharedctrs[node]++
		}
	}

	var (
		// depth records each pool's distance from the root for sorting zone names.
		depth     = make(map[string]int, len(p.pools))
		zoneAdded = false
	)

	for _, pool := range p.pools {
		var (
			name       = pool.Name()
			zone       = m.Zones[name]
			free       = pool.FreeSupply().(*supply)
			mems       = libmem.NewNodeMask(pool.GetMemset(memoryAll).Members()...)
			sharedPool = free.SharableCPUs().Union(free.ReservedCPUs())
		)

		depth[name] = pool.RootDistance()

		if zone == nil {
			var (
				capa = pool.GetSupply().(*supply)
				cpus = capa.ReservedCPUs().Union(capa.IsolatedCPUs()).Union(capa.SharableCPUs())
			)
			zone = &Zone{
				Name:        name,
				Cpus:        cpus,
				Mems:        mems,
				MemCapacity: p.memAllocator.ZoneCapacity(mems),
			}

			m.Zones[name] = zone
			m.ZoneNames = append(m.ZoneNames, name)
			zoneAdded = true

			// Use MemsetString() for every "mems" label so that all memory
			// series of a zone carry the same label value and can be joined.
			m.Metrics.zone.WithLabelValues(
				zone.Name,
				zone.Cpus.String(),
				zone.Mems.MemsetString(),
			).Set(float64(zone.Cpus.Size()))

			m.Metrics.memCapacity.WithLabelValues(
				zone.Name,
				zone.Mems.MemsetString(),
			).Set(float64(zone.MemCapacity))

			log.Info(" created metrics for zone %s", name)
		}

		log.Debug("polling metrics for zone %s...", name)

		// The shared pool gauges are labelled with the pool's cpuset. When the
		// cpuset changes, drop the series of the previous cpuset, otherwise it
		// would be exported forever with a stale value.
		if prev := zone.SharedPool.String(); prev != sharedPool.String() {
			m.Metrics.cpuSharedCapacity.DeleteLabelValues(zone.Name, prev)
			m.Metrics.cpuSharedAssigned.DeleteLabelValues(zone.Name, prev)
			m.Metrics.cpuSharedAvailable.DeleteLabelValues(zone.Name, prev)
		}

		zone.SharedPool = sharedPool
		zone.SharedAssigned = free.GrantedReserved() + free.GrantedShared()
		zone.SharedAvailable = free.AllocatableSharedCPU()
		zone.MemAssigned = p.memAllocator.ZoneUsage(mems)
		zone.MemAvailable = p.memAllocator.ZoneAvailable(mems)
		zone.ContainerCount = containers[name]
		zone.SharedContainerCount = sharedctrs[name]

		var (
			cpusLabel = zone.SharedPool.String()
			memsLabel = zone.Mems.MemsetString()
		)

		m.Metrics.cpuSharedCapacity.WithLabelValues(
			zone.Name,
			cpusLabel,
		).Set(float64(zone.SharedPool.Size()))

		m.Metrics.cpuSharedAssigned.WithLabelValues(
			zone.Name,
			cpusLabel,
		).Set(float64(zone.SharedAssigned) / 1000.0)

		m.Metrics.cpuSharedAvailable.WithLabelValues(
			zone.Name,
			cpusLabel,
		).Set(float64(zone.SharedAvailable) / 1000.0)

		m.Metrics.memAssigned.WithLabelValues(
			zone.Name,
			memsLabel,
		).Set(float64(zone.MemAssigned))

		m.Metrics.memAvailable.WithLabelValues(
			zone.Name,
			memsLabel,
		).Set(float64(zone.MemAvailable))

		m.Metrics.containerCount.WithLabelValues(
			zone.Name,
		).Set(float64(zone.ContainerCount))

		m.Metrics.sharedContainerCount.WithLabelValues(
			zone.Name,
		).Set(float64(zone.SharedContainerCount))
	}

	// Keep zone names ordered by distance from the root, then by name. The
	// order can only change when a zone was added, so sort only then. (The
	// original guarded this with a p.metrics == nil check that could never be
	// true at this point, so the names were never sorted.)
	if zoneAdded {
		slices.SortFunc(m.ZoneNames, func(a, b string) int {
			if diff := depth[a] - depth[b]; diff != 0 {
				return diff
			}
			return strings.Compare(a, b)
		})
	}

	return p.metrics
}
18 changes: 1 addition & 17 deletions cmd/plugins/topology-aware/policy/topology-aware-policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ import (
"github.com/containers/nri-plugins/pkg/utils/cpuset"
"k8s.io/apimachinery/pkg/api/resource"

"github.com/prometheus/client_golang/prometheus"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
"github.com/containers/nri-plugins/pkg/cpuallocator"
"github.com/containers/nri-plugins/pkg/resmgr/cache"
Expand Down Expand Up @@ -68,6 +66,7 @@ type policy struct {
cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy
memAllocator *libmem.Allocator
coldstartOff bool // coldstart forced off (have movable PMEM zones)
metrics *TopologyAwareMetrics
}

var opt = &cfgapi.Config{}
Expand Down Expand Up @@ -306,21 +305,6 @@ func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
return false, nil
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
return nil
}

// PollMetrics provides policy metrics for monitoring.
func (p *policy) PollMetrics() policyapi.Metrics {
return nil
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *policy) CollectMetrics(policyapi.Metrics) ([]prometheus.Metric, error) {
return nil, nil
}

// GetTopologyZones returns the policy/pool data for 'topology zone' CRDs.
func (p *policy) GetTopologyZones() []*policyapi.TopologyZone {
zones := []*policyapi.TopologyZone{}
Expand Down

0 comments on commit bf184c5

Please sign in to comment.