From 9ae5f99b1155190b11c220d7ade9e70af4eb006b Mon Sep 17 00:00:00 2001
From: wsy327643
Date: Mon, 6 Nov 2023 14:48:37 +0800
Subject: [PATCH] feat: add dcgm-export metrics collection

---
 pkg/bootstrap/bootstrap.go   |   3 +
 pkg/openmetric/dcgm.go       | 141 +++++++++++++++++++++++++++++++++++
 pkg/openmetric/openmetric.go |  21 +++++
 3 files changed, 165 insertions(+)
 create mode 100644 pkg/openmetric/dcgm.go

diff --git a/pkg/bootstrap/bootstrap.go b/pkg/bootstrap/bootstrap.go
index f5fce57..a20a921 100644
--- a/pkg/bootstrap/bootstrap.go
+++ b/pkg/bootstrap/bootstrap.go
@@ -390,6 +390,9 @@ func (b *AgentBootstrap) setupSidecarAgent() error {
 		return err
 	}
 
+	om := openmetric.NewManager(ctm)
+	om.Start()
+
 	lsm := logstream.NewManager()
 	b.LSM = lsm
 
diff --git a/pkg/openmetric/dcgm.go b/pkg/openmetric/dcgm.go
new file mode 100644
index 0000000..1f6f479
--- /dev/null
+++ b/pkg/openmetric/dcgm.go
@@ -0,0 +1,141 @@
+/*
+ * Copyright 2022 Holoinsight Project Authors. Licensed under Apache-2.0.
+ */
+
+package openmetric
+
+import (
+	"context"
+	"github.com/prometheus/common/model"
+	"github.com/prometheus/prometheus/discovery"
+	"github.com/prometheus/prometheus/discovery/targetgroup"
+	"github.com/traas-stack/holoinsight-agent/pkg/cri"
+	"github.com/traas-stack/holoinsight-agent/pkg/ioc"
+	v1 "k8s.io/api/core/v1"
+	"net"
+	"strings"
+	"sync"
+)
+
+const (
+	// dcgmPodNameKeyword is the name substring used to recognize the dcgm-exporter pod.
+	dcgmPodNameKeyword = "dcgm-export"
+
+	// dcgmMetricsPort is the port of the scrape address built for that pod.
+	dcgmMetricsPort = "9400"
+)
+
+type (
+	// dcgmConfig is the service discovery config of the "dcgm" scrape job.
+	dcgmConfig struct {
+	}
+	// dcgmDiscovery publishes the first pod whose name contains
+	// dcgmPodNameKeyword as a scrape target. mutex serializes once; last is
+	// the most recently published pod, nil when none is published.
+	dcgmDiscovery struct {
+		c     *dcgmConfig
+		ctx   context.Context
+		up    chan<- []*targetgroup.Group
+		mutex sync.Mutex
+		last  *v1.Pod
+	}
+)
+
+// OnAnyPodChanged re-evaluates the target. The discoverer registers itself
+// for this callback through ioc.Crii.AddListener in Run.
+func (o *dcgmDiscovery) OnAnyPodChanged() {
+	o.once(false)
+}
+
+// Name returns the name of this discovery config.
+func (c *dcgmConfig) Name() string {
+	return "dcgm"
+}
+
+// NewDiscoverer returns a dcgmDiscovery bound to c. The options are ignored.
+func (c *dcgmConfig) NewDiscoverer(discovery.DiscovererOptions) (discovery.Discoverer, error) {
+	return &dcgmDiscovery{c: c}, nil
+}
+
+// Run publishes the current target once, then publishes again on every pod
+// change until ctx is done.
+func (o *dcgmDiscovery) Run(ctx context.Context, up chan<- []*targetgroup.Group) {
+	o.ctx = ctx
+	o.up = up
+	ioc.Crii.AddListener(o)
+	defer ioc.Crii.RemoveListener(o)
+
+	o.once(true)
+
+	<-ctx.Done()
+}
+
+// buildPod converts pod into a target group holding a single target at
+// <PodIP>:<dcgmMetricsPort>. A pod without an IP yields a group that carries
+// only its source.
+func (o *dcgmDiscovery) buildPod(pod *v1.Pod) *targetgroup.Group {
+	tg := &targetgroup.Group{
+		Source: podSource(pod),
+	}
+	// PodIP can be empty when a pod is starting or has been evicted.
+	if len(pod.Status.PodIP) == 0 {
+		return tg
+	}
+
+	tg.Labels = podLabels(pod)
+	tg.Labels[model.MetaLabelPrefix+"kubernetes_namespace"] = lv(pod.Namespace)
+	tg.Labels[model.MetaLabelPrefix+"kubernetes_pod_name"] = lv(pod.Name)
+
+	// JoinHostPort adds the brackets an IPv6 pod IP needs.
+	tg.Targets = append(tg.Targets, model.LabelSet{
+		model.AddressLabel: lv(net.JoinHostPort(pod.Status.PodIP, dcgmMetricsPort)),
+	})
+
+	return tg
+}
+
+// once looks up the dcgm-exporter pod and sends a target group update when the
+// target differs from the last published one. init reports whether this is the
+// initial call made from Run; it is currently not consulted.
+func (o *dcgmDiscovery) once(init bool) {
+	o.mutex.Lock()
+	defer o.mutex.Unlock()
+
+	var targetPod *cri.Pod
+	for _, pod := range ioc.Crii.GetAllPods() {
+		if strings.Contains(pod.GetName(), dcgmPodNameKeyword) {
+			targetPod = pod
+			break
+		}
+	}
+
+	var groups []*targetgroup.Group
+	switch {
+	case targetPod != nil:
+		cur := targetPod.Pod
+		// Compare UID and IP instead of Generation: status updates such as a
+		// late IP assignment do not bump the generation.
+		if o.last != nil && o.last.UID == cur.UID && o.last.Status.PodIP == cur.Status.PodIP {
+			return
+		}
+		// A replacement pod under another source must not leave the old target behind.
+		if o.last != nil && podSource(o.last) != podSource(cur) {
+			groups = append(groups, &targetgroup.Group{Source: podSource(o.last)})
+		}
+		groups = append(groups, o.buildPod(cur))
+		o.last = cur
+	case o.last != nil:
+		// The pod is gone: a group without targets for its source drops the target.
+		groups = append(groups, &targetgroup.Group{Source: podSource(o.last)})
+		o.last = nil
+	default:
+		// No pod now and none published before: nothing to send.
+		return
+	}
+
+	// Give up when ctx is done so a stopped receiver cannot block the listener forever.
+	select {
+	case o.up <- groups:
+	case <-o.ctx.Done():
+	}
+}
diff --git a/pkg/openmetric/openmetric.go b/pkg/openmetric/openmetric.go
index 1c182bd..9db2187 100644
--- a/pkg/openmetric/openmetric.go
+++ b/pkg/openmetric/openmetric.go
@@ -8,6 +8,7 @@ import (
 	"context"
 	"fmt"
 	"github.com/prometheus/common/model"
+	"github.com/prometheus/prometheus/pkg/relabel"
 	"github.com/traas-stack/holoinsight-agent/pkg/collecttask"
 	"github.com/traas-stack/holoinsight-agent/pkg/cri/impl/netproxy"
 	"go.uber.org/zap"
@@ -192,6 +193,26 @@ func (m *Manager) Start() {
 				m.jobs[t.Key] = scrapeConfig
 				changed = true
 			}
+
+			m.jobs["dcgm"] = &config.ScrapeConfig{
+				JobName:         "dcgm",
+				HonorLabels:     true,
+				HonorTimestamps: false,
+				Params:          nil,
+				ScrapeInterval:  model.Duration(5 * time.Second),
+				ScrapeTimeout:   model.Duration(5 * time.Second),
+				MetricsPath:     "/metrics",
+				Scheme:          "http",
+				SampleLimit:     0,
+				TargetLimit:     0,
+				ServiceDiscoveryConfigs: discovery.Configs{
+					&dcgmConfig{},
+				},
+				RelabelConfigs:       []*relabel.Config{},
+				MetricRelabelConfigs: nil,
+			}
+
+			changed = true
 			//
 			//logger.Infoz("[openmetric] add kubernetes-pod")
 			//