Skip to content

Commit

Permalink
Bug fixes in gpu and node collectors
Browse files Browse the repository at this point in the history
  • Loading branch information
brisk022 committed Apr 3, 2024
1 parent 17b1d12 commit 63f0114
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 15 deletions.
17 changes: 10 additions & 7 deletions gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@ type GPUMetrics struct {
}

// GPUGetMetrics runs sinfo through Execute with the arguments built below,
// converts the output to a string and returns the result of ParseGPUMetrics
// applied to it.
//
// NOTE(review): this block comes from a rendered diff without +/- markers, so
// it appears to contain both the removed and the added `args` line. Confirm
// against the repository before treating it as compilable code.
func GPUGetMetrics() map[string]*GPUMetrics {
// Presumably the removed line: the Format value is wrapped in single quotes.
args := []string{"-a", "-h", "--Format='Nodes: ,Gres: ,GresUsed:'", "--state=idle,allocated"}
// space prefix ensures that the full value is displayed. Without it, the value is truncated
// to 20 chars.
// Suppress your natural instinct to add single quotes around Format
args := []string{"-a", "-h", "--Format=Nodes: ,Gres: ,GresUsed: ", "--state=idle,allocated"}
output := string(Execute("sinfo", args))
return ParseGPUMetrics(output)
}
Expand Down Expand Up @@ -127,11 +130,11 @@ func (cc *GPUCollector) Describe(ch chan<- *prometheus.Desc) {
}

// Collect obtains the metrics map from GPUGetMetrics and, for every key in it,
// sends the alloc, idle, total and utilization values on ch as gauge metrics
// built with prometheus.MustNewConstMetric.
//
// NOTE(review): this block comes from a rendered diff without +/- markers, so
// it appears to contain both the removed loop and the added loop. Confirm
// against the repository before treating it as compilable code.
func (cc *GPUCollector) Collect(ch chan<- prometheus.Metric) {
// Presumably the removed version: no trailing argument is passed to
// MustNewConstMetric.
cm := GPUGetMetrics()
for gpu_type := range cm {
ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, float64(cm[gpu_type].alloc))
ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, float64(cm[gpu_type].idle))
ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, float64(cm[gpu_type].total))
ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm[gpu_type].utilization)
// Presumably the added version: the map key is passed as an extra trailing
// argument (a label value; confirm against the Desc definitions).
gpus := GPUGetMetrics()
for gpu := range gpus {
ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, float64(gpus[gpu].alloc), gpu)
ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, float64(gpus[gpu].idle), gpu)
ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, float64(gpus[gpu].total), gpu)
ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, gpus[gpu].utilization, gpu)
}
}
2 changes: 1 addition & 1 deletion gpus.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type GPUsMetrics struct {
}

// GPUsGetMetrics runs sinfo through Execute with the arguments built below,
// converts the output to a string and returns the result of ParseGPUsMetrics
// applied to it.
//
// NOTE(review): this block comes from a rendered diff without +/- markers, so
// it appears to contain both the removed and the added `args` line. Confirm
// against the repository before treating it as compilable code.
func GPUsGetMetrics() *GPUsMetrics {
// Presumably the removed line: the Format value is wrapped in single quotes.
args := []string{"-a", "-h", "--Format='Nodes: ,Gres: ,GresUsed:'", "--state=idle,allocated"}
// Presumably the added line: no single quotes, and every field ends in ": ".
args := []string{"-a", "-h", "--Format=Nodes: ,Gres: ,GresUsed: ", "--state=idle,allocated"}
output := string(Execute("sinfo", args))
return ParseGPUsMetrics(output)
}
Expand Down
12 changes: 5 additions & 7 deletions node.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
package main

import (
"log"
"os/exec"
"sort"
"strconv"
"strings"
Expand Down Expand Up @@ -82,11 +80,11 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
// NodeData executes the sinfo command to get data for each node.
// It returns the output of the sinfo command.
//
// NOTE(review): this block comes from a rendered diff without +/- markers, so
// it appears to contain both the removed and the added implementation. Confirm
// against the repository before treating it as compilable code.
func NodeData() []byte {
// Presumably the removed version: runs sinfo directly with exec.Command and
// calls log.Fatal if the command returns an error.
cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong")
out, err := cmd.Output()
if err != nil {
log.Fatal(err)
}
// Presumably the added version: runs sinfo through the Execute helper.
// Space prefix ensures that the full value is displayed. Without it, the value is truncated
// to 20 chars.
// Suppress your natural instinct to add single quotes around Format
args := []string{"-h", "-N", "--Format=NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong: "}
out := Execute("sinfo", args)
return out
}

Expand Down

0 comments on commit 63f0114

Please sign in to comment.