From 63f011467009f295288404d0ce8fad1f99cb126b Mon Sep 17 00:00:00 2001 From: Roman Briskine Date: Wed, 3 Apr 2024 11:13:25 +0200 Subject: [PATCH] Bug fixes in gpu and node collectors --- gpu.go | 17 ++++++++++------- gpus.go | 2 +- node.go | 12 +++++------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/gpu.go b/gpu.go index 43971ae..272c2d3 100644 --- a/gpu.go +++ b/gpu.go @@ -31,7 +31,10 @@ type GPUMetrics struct { } func GPUGetMetrics() map[string]*GPUMetrics { - args := []string{"-a", "-h", "--Format='Nodes: ,Gres: ,GresUsed:'", "--state=idle,allocated"} + // Space prefix ensures that the full value is displayed. Without it, the value is truncated + // to 20 chars. + // Suppress your natural instinct to add single quotes around Format + args := []string{"-a", "-h", "--Format=Nodes: ,Gres: ,GresUsed: ", "--state=idle,allocated"} output := string(Execute("sinfo", args)) return ParseGPUMetrics(output) } @@ -127,11 +130,11 @@ func (cc *GPUCollector) Describe(ch chan<- *prometheus.Desc) { } func (cc *GPUCollector) Collect(ch chan<- prometheus.Metric) { - cm := GPUGetMetrics() - for gpu_type := range cm { - ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, float64(cm[gpu_type].alloc)) - ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, float64(cm[gpu_type].idle)) - ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, float64(cm[gpu_type].total)) - ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm[gpu_type].utilization) + gpus := GPUGetMetrics() + for gpu := range gpus { + ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, float64(gpus[gpu].alloc), gpu) + ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, float64(gpus[gpu].idle), gpu) + ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, float64(gpus[gpu].total), gpu) + ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, gpus[gpu].utilization, gpu) 
} } diff --git a/gpus.go b/gpus.go index 7eef1f3..5b7125e 100644 --- a/gpus.go +++ b/gpus.go @@ -31,7 +31,7 @@ type GPUsMetrics struct { } func GPUsGetMetrics() *GPUsMetrics { - args := []string{"-a", "-h", "--Format='Nodes: ,Gres: ,GresUsed:'", "--state=idle,allocated"} + args := []string{"-a", "-h", "--Format=Nodes: ,Gres: ,GresUsed: ", "--state=idle,allocated"} output := string(Execute("sinfo", args)) return ParseGPUsMetrics(output) } diff --git a/node.go b/node.go index bf2f759..b82ffdf 100644 --- a/node.go +++ b/node.go @@ -16,8 +16,6 @@ along with this program. If not, see . */ package main import ( - "log" - "os/exec" "sort" "strconv" "strings" @@ -82,11 +80,11 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics { // NodeData executes the sinfo command to get data for each node // It returns the output of the sinfo command func NodeData() []byte { - cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong") - out, err := cmd.Output() - if err != nil { - log.Fatal(err) - } + // Space prefix ensures that the full value is displayed. Without it, the value is truncated + // to 20 chars. + // Suppress your natural instinct to add single quotes around Format + args := []string{"-h", "-N", "--Format=NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong: "} + out := Execute("sinfo", args) return out }