diff --git a/Dockerfile b/Dockerfile index af27cf5..11ec021 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,25 @@ -FROM golang:1.17-alpine as builder +FROM golang:1.20 as builder ARG TARGETARCH COPY . . RUN mkdir -p /app \ && unset GOPATH \ - && GOOS=linux GOARCH=${TARGETARCH} go build -o /app/jetson-exporter + && CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build -o /app/jetson-exporter + +FROM waggle/plugin-base:1.1.1-base + +RUN apt-get update \ + && apt-get install -y \ + gnupg \ + ca-certificates \ + nano + +COPY etc/apt/sources.list.d/nvidia-l4t-apt-source.list \ + /etc/apt/sources.list.d/nvidia-l4t-apt-source.list +RUN apt-key adv --fetch-key http://repo.download.nvidia.com/jetson/jetson-ota-public.asc \ + && apt-get update \ + && apt-get install --no-install-recommends -y \ + nvidia-l4t-tools -FROM golang:1.17-alpine COPY --from=builder /app/ /app/ WORKDIR /app CMD ["/app/jetson-exporter"] \ No newline at end of file diff --git a/Makefile b/Makefile index 1f8b3e0..1bdbcf5 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ build: - go build -o ./out/jetson-exporter jetson_exporter.go + CGO_ENABLED=0 go build -o ./out/jetson-exporter . build-arm64: - GOOS=linux GOARCH=arm64 go build -o ./out/jetson-exporter jetson_exporter.go + CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -o ./out/jetson-exporter . diff --git a/README.md b/README.md index 6f701f8..fc51376 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,10 @@ Jetson exporter is a metric provider for Jetson Tegra GPU. Scrapers can hit `/metrics` endpoint to get Prometheus-formatted metrics. 
# Metrics -Provided metrics include, -- **sys.metrics.gpu.average.1s**: exponential moving average of GPU utilization over last 1 second -- **sys.metrics.gpu.average.5s**: exponential moving average of GPU utilization over last 5 second -- **sys.metrics.gpu.average.15s**: exponential moving average of GPU utilization over last 15 second +Provided metrics can be found in [tegrastats.go](./tegrastats.go) # Kubernetes -The exporter can be deployed as Kubernetes DaemonSet to provide the metrics per Jetson device. +The Jetson exporter can be deployed as a Kubernetes DaemonSet to provide the metrics per Jetson device. # Main Advantage Current Jetson platform for CUDA GPU (Sep 2022) is implemented differently from Desktop (amd64) CUDA platform. This blocks Jetson users from taking full features of Nvidia tools for device monitoring. `tegrastats` only provides a snapshot of GPU utilization which also makes users difficult to monitor usage while running CUDA-enabled programs. This exporter aggregates GPU utilization and provides wider picture of how CUDA GPU performs. @@ -16,6 +13,3 @@ Current Jetson platform for CUDA GPU (Sep 2022) is implemented differently from # Limitation - Jetson GPU shares memory with CPU such that this exporter does not provide GPU memory usage - We have not found a way to map GPU utilization with a process ID to identify which process is using the resource. This means that GPU utilization does not necessarily come from a particular program, but could come from other program running at the same time. - -# Developer Note -Current provided metrics are limited to a few metrics. More metrics may be added if there are needs. 
diff --git a/data/test_tegrastats.txt b/data/test_tegrastats.txt new file mode 100644 index 0000000..c872d7f --- /dev/null +++ b/data/test_tegrastats.txt @@ -0,0 +1,76 @@ +# HELP tegra_cpu_frequency_hz CPU Clock frequency +# TYPE tegra_cpu_frequency_hz gauge +tegra_cpu_frequency_hz{cpu="1"} 1.42e+09 +tegra_cpu_frequency_hz{cpu="2"} 1.42e+09 +tegra_cpu_frequency_hz{cpu="3"} 1.42e+09 +tegra_cpu_frequency_hz{cpu="4"} 1.42e+09 +tegra_cpu_frequency_hz{cpu="5"} 1.42e+09 +tegra_cpu_frequency_hz{cpu="6"} 1.42e+09 +# HELP tegra_cpu_util_percentage Utilization of CPU in percentage +# TYPE tegra_cpu_util_percentage gauge +tegra_cpu_util_percentage{cpu="1"} 47 +tegra_cpu_util_percentage{cpu="2"} 23 +tegra_cpu_util_percentage{cpu="3"} 32 +tegra_cpu_util_percentage{cpu="4"} 22 +tegra_cpu_util_percentage{cpu="5"} 31 +tegra_cpu_util_percentage{cpu="6"} 96 +# HELP tegra_emc_frequency_hz External memory controller clock frequency +# TYPE tegra_emc_frequency_hz gauge +tegra_emc_frequency_hz 1.6e+09 +# HELP tegra_emc_util_percentage Utilization of external memory controller in percentage +# TYPE tegra_emc_util_percentage gauge +tegra_emc_util_percentage 2 +# HELP tegra_gpu_frequency_hz GPU clock frequency +# TYPE tegra_gpu_frequency_hz gauge +tegra_gpu_frequency_hz 1.109e+09 +# HELP tegra_gpu_util_percentage Utilization of GPU in percentage +# TYPE tegra_gpu_util_percentage gauge +tegra_gpu_util_percentage 0 +# HELP tegra_last_updated_timestamp_epoch An epoch time of when the stats were collected from the system +# TYPE tegra_last_updated_timestamp_epoch gauge +tegra_last_updated_timestamp_epoch 1.701465532e+09 +# HELP tegra_lfb_nblock_count Count of largest free block +# TYPE tegra_lfb_nblock_count gauge +tegra_lfb_nblock_count 7 +# HELP tegra_lfb_size_bytes Size of largest free block +# TYPE tegra_lfb_size_bytes gauge +tegra_lfb_size_bytes 4.194304e+06 +# HELP tegra_mts_bg_percentage Time spent in foreground tasks +# TYPE tegra_mts_bg_percentage gauge +tegra_mts_bg_percentage 9 +# 
HELP tegra_mts_fg_percentage Time spent in background tasks +# TYPE tegra_mts_fg_percentage gauge +tegra_mts_fg_percentage 1 +# HELP tegra_ram_total_bytes Total memory +# TYPE tegra_ram_total_bytes gauge +tegra_ram_total_bytes 8.148484096e+09 +# HELP tegra_ram_used_bytes Current used memory +# TYPE tegra_ram_used_bytes gauge +tegra_ram_used_bytes 5.500829696e+09 +# HELP tegra_swap_cached_bytes Current swap cache memory +# TYPE tegra_swap_cached_bytes gauge +tegra_swap_cached_bytes 2.9360128e+08 +# HELP tegra_swap_total_bytes Total swap memory +# TYPE tegra_swap_total_bytes gauge +tegra_swap_total_bytes 2.1253586944e+10 +# HELP tegra_swap_used_bytes Current swap used memory +# TYPE tegra_swap_used_bytes gauge +tegra_swap_used_bytes 1.030750208e+09 +# HELP tegra_temperature_celcius Temperature reading in Celcius +# TYPE tegra_temperature_celcius gauge +tegra_temperature_celcius{sensor="ao"} 29 +tegra_temperature_celcius{sensor="aux"} 30 +tegra_temperature_celcius{sensor="cpu"} 33.5 +tegra_temperature_celcius{sensor="gpu"} 31.5 +tegra_temperature_celcius{sensor="pmic"} 100 +tegra_temperature_celcius{sensor="thermal"} 31.350000381469727 +# HELP tegra_wattage_average_milliwatts Averaged Watts of the hardware +# TYPE tegra_wattage_average_milliwatts gauge +tegra_wattage_average_milliwatts{sensor="vdd_cpu_gpu_cv"} 2119 +tegra_wattage_average_milliwatts{sensor="vdd_in"} 5510 +tegra_wattage_average_milliwatts{sensor="vdd_soc"} 1051 +# HELP tegra_wattage_current_milliwatts Current Watts of the hardware +# TYPE tegra_wattage_current_milliwatts gauge +tegra_wattage_current_milliwatts{sensor="vdd_cpu_gpu_cv"} 2706 +tegra_wattage_current_milliwatts{sensor="vdd_in"} 6140 +tegra_wattage_current_milliwatts{sensor="vdd_soc"} 1074 diff --git a/etc/apt/sources.list.d/nvidia-l4t-apt-source.list b/etc/apt/sources.list.d/nvidia-l4t-apt-source.list new file mode 100644 index 0000000..2666206 --- /dev/null +++ b/etc/apt/sources.list.d/nvidia-l4t-apt-source.list @@ -0,0 +1,2 @@ +deb 
https://repo.download.nvidia.com/jetson/common r32.4 main +deb https://repo.download.nvidia.com/jetson/t194 r32.4 main \ No newline at end of file diff --git a/go.mod b/go.mod index da8d4b1..8157c1c 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/waggle-sensor/jetson-exporter -go 1.17 +go 1.20 require ( github.com/influxdata/influxdb-client-go/v2 v2.11.0 @@ -10,6 +10,7 @@ require ( require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.1.2 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect github.com/deepmap/oapi-codegen v1.8.2 // indirect github.com/golang/protobuf v1.5.2 // indirect github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 // indirect diff --git a/go.sum b/go.sum index d6c1e8e..35c1547 100644 --- a/go.sum +++ b/go.sum @@ -121,7 +121,6 @@ github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= -github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= @@ -216,14 +215,11 @@ github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6Mwd github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasttemplate v1.0.1/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8= github.com/valyala/fasttemplate v1.2.1/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= @@ -322,7 +318,6 @@ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -516,9 +511,7 @@ gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod 
h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/influxdb.go b/influxdb.go index 796a94c..1484830 100644 --- a/influxdb.go +++ b/influxdb.go @@ -1,5 +1,9 @@ package main +// This is archived to minimize functionality of jetson exporter +// If metrics need to be published use metrics collection agents +// like Grafana agent, Telegraf, Fluentd, etc. 
+ import ( "time" diff --git a/jetson_exporter.go b/jetson_exporter.go index 48b794b..040b88b 100644 --- a/jetson_exporter.go +++ b/jetson_exporter.go @@ -6,6 +6,8 @@ import ( "log" "net/http" "os" + "os/signal" + "syscall" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" @@ -23,36 +25,48 @@ func main() { var port string metricsPath := "/metrics" flag.StringVar(&port, "port", getenv("PORT", "9091"), "Port number to listen") - var collectorConfig TegraGPUCollectorConfig - flag.IntVar(&collectorConfig.CollectionIntervalInMilli, "sampling", 100, "Sampling interval in milliseconds") - flag.StringVar(&collectorConfig.LoadPath, "loadpath", "/sys/devices/gpu.0/load", "Path to GPU load") - flag.StringVar(&collectorConfig.CurrentDeviceFrqPathRex, "devfreqpathrex", "/sys/devices/gpu.0/devfreq/*/cur_freq", "Path described in Regression to current frequency of GPU device") - var publisherConfig PublisherConfig - flag.StringVar(&publisherConfig.NodeName, "nodename", getenv("KUBENODE", ""), "Name of the Kubernetes node") - flag.StringVar(&publisherConfig.InfluxDBURL, "influxdb-url", getenv("INFLUXDB_URL", ""), "InfluxDB URL") - flag.StringVar(&publisherConfig.InfluxDBToken, "influxdb-token", getenv("INFLUXDB_TOKEN", ""), "InfluxDB token") - flag.StringVar(&publisherConfig.InfluxDBOrganization, "influxdb-org", getenv("INFLUXDB_ORG", "waggle"), "InfluxDB organization") - flag.StringVar(&publisherConfig.InfluxDBBucket, "influxdb-bucket", getenv("INFLUXDB_BUCKET", "waggle"), "InfluxDB bucket") - flag.IntVar(&publisherConfig.InfluxDBPublishInterval, "influxdb-interval", 1, "InlufxDB publishing interval in seconds") flag.Parse() - fmt.Println("Jetson exporter started") - fmt.Println("Parameters are:") - fmt.Printf("\t Sampling Interval: %d millisecond\n", collectorConfig.CollectionIntervalInMilli) - fmt.Printf("\t Loadpath: %s\n", collectorConfig.LoadPath) - fmt.Printf("\t Endpoint: %s\n", metricsPath) - collector := 
NewTegraGPUCollector(&collectorConfig) - collector.Configure() - stopCh := make(chan bool, 1) + tegrastats := NewTegraStats() + + log.Println("Jetson exporter starts...") + log.Println("Parameters are:") + log.Printf("\t Endpoint: %s\n", metricsPath) + log.Printf("\t TegraStats command: %v", tegrastats.GetTegraStatsCommandWithArguments()) + + // watch signals to terminate external programs cleanly. + sigc := make(chan os.Signal, 1) + signal.Notify(sigc, + syscall.SIGHUP, + syscall.SIGINT, + syscall.SIGTERM, + syscall.SIGQUIT) + + log.Println("Executing the tegrastats command in the background...") + err := tegrastats.Start() + if err != nil { + panic(err) + } reg := prometheus.NewRegistry() reg.MustRegister(collectors.NewGoCollector()) - reg.MustRegister(collector) - go collector.RunUntil(stopCh) - if publisherConfig.InfluxDBURL != "" { - fmt.Println("InfluxDB URL is provided. Metrics will be published.") - fmt.Printf("\t Publishing Interval: %d second(s) \n", publisherConfig.InfluxDBPublishInterval) - publisher := NewInfluxDBPublisher(publisherConfig, collector) - go publisher.RunUntil(stopCh) - } + reg.MustRegister(tegrastats) http.Handle(metricsPath, promhttp.HandlerFor(reg, promhttp.HandlerOpts{EnableOpenMetrics: true})) - log.Fatal(http.ListenAndServe(fmt.Sprintf("0.0.0.0:%s", port), nil)) + sige := make(chan error, 1) + go func() { + err := http.ListenAndServe(fmt.Sprintf("0.0.0.0:%s", port), nil) + sige <- err + }() + + for { + select { + case err := <-sige: + log.Println("HTTP listener returned with an error") + log.Printf("%s\n", err) + tegrastats.Close() + return + case <-sigc: + log.Printf("OS signal received. 
// Acknowledgement: the expressions and parser come from
// https://github.com/rbonghi/jetson_stats/blob/master/jtop/core/tegra_parse.py
// Detailed description on how to interpret the tegrastats output
// https://docs.nvidia.com/drive/drive_os_5.1.6.1L/nvvib_docs/index.html#page/DRIVE_OS_Linux_SDK_Development_Guide/Utilities/util_tegrastats.html
//
// Each expression picks one section out of a single tegrastats output line.
// They are compiled once at package initialisation.
var (
	regSwap      = regexp.MustCompile(`SWAP (\d+)\/(\d+)(\w)B( ?)\(cached (\d+)(\w)B\)`)
	regCPU       = regexp.MustCompile(`CPU \[(.*?)\]`)
	regValueFreq = regexp.MustCompile(`\b(\d+)%@(\d+)`)
	regRAM       = regexp.MustCompile(`RAM (\d+)\/(\d+)(\w)B( ?)\(lfb (\d+)x(\d+)(\w)B\)`)
	regEMC       = regexp.MustCompile(`EMC_FREQ \b(\d+)%@(\d+)`)
	regMTS       = regexp.MustCompile(`MTS fg (\d+)% bg (\d+)%`)
	regGPU       = regexp.MustCompile(`GR3D_FREQ \b(\d+)%@(\d+)`)
	regWatt      = regexp.MustCompile(`\b(\w+) ([0-9.]+)\/([0-9.]+)\b`)
	regTemp      = regexp.MustCompile(`\b(\w+)@(-?[0-9.]+)C\b`)
)

// Multipliers that turn a value carrying a K/M/G unit prefix into bytes.
// They are constants so nothing can reassign them at run time.
const (
	gBytes = 1024 * 1024 * 1024
	mBytes = 1024 * 1024
	kBytes = 1024
)

// parseUnit returns the byte multiplier for a one-letter unit prefix
// ("K", "M" or "G"). Any other input, including the empty string and
// lower-case letters, yields 1 so the value is passed through unscaled.
func parseUnit(s string) int {
	switch s {
	case "K":
		return kBytes
	case "M":
		return mBytes
	case "G":
		return gBytes
	default:
		return 1
	}
}

// parseValueWithFreq extracts the first "<value>%@<freq>" pair found in s,
// e.g. "47%@1420" gives v=47 and f=1420.
//
// When s holds no such pair both results are 0. A component that cannot be
// converted to an int (for example because it overflows) is left at 0 while
// the other component is still returned.
func parseValueWithFreq(s string) (v int, f int) {
	// Only the first pair is ever used, so there is no need to collect
	// every match in s.
	m := regValueFreq.FindStringSubmatch(s)
	if m == nil {
		return 0, 0
	}
	if value, err := strconv.Atoi(m[1]); err == nil {
		v = value
	}
	if freq, err := strconv.Atoi(m[2]); err == nil {
		f = freq
	}
	return v, f
}
NewTegraStats() *TegraStats { + return NewTegraStatsWithCommand(exec.Command("tegrastats", "--interval", "2000")) +} + +func NewTegraStatsWithCommand(c *exec.Cmd) *TegraStats { + newTegraStats := &TegraStats{ + cmd: c, + m: map[string]*prometheus.Desc{ + "tegra_last_updated_timestamp_epoch": prometheus.NewDesc( + "tegra_last_updated_timestamp_epoch", + "An epoch time of when the stats were collected from the system", nil, nil, + ), + "tegra_temperature_celcius": prometheus.NewDesc( + "tegra_temperature_celcius", + "Temperature reading in Celcius", nil, nil, + ), + "tegra_cpu_frequency_hz": prometheus.NewDesc( + "tegra_cpu_frequency_hz", + "CPU Clock frequency", nil, nil, + ), + "tegra_cpu_util_percentage": prometheus.NewDesc( + "tegra_cpu_util_percentage", + "Utilization of CPU in percentage", nil, nil, + ), + "tegra_emc_frequency_hz": prometheus.NewDesc( + "tegra_emc_frequency_hz", + "External memory controller clock frequency", nil, nil, + ), + "tegra_emc_util_percentage": prometheus.NewDesc( + "tegra_emc_util_percentage", + "Utilization of external memory controller in percentage", nil, nil, + ), + "tegra_gpu_frequency_hz": prometheus.NewDesc( + "tegra_gpu_frequency_hz", + "GPU clock frequency", nil, nil, + ), + "tegra_gpu_util_percentage": prometheus.NewDesc( + "tegra_gpu_util_percentage", + "Utilization of GPU in percentage", nil, nil, + ), + "tegra_lfb_nblock_count": prometheus.NewDesc( + "tegra_lfb_nblock_count", + "Count of largest free block", nil, nil, + ), + "tegra_lfb_size_bytes": prometheus.NewDesc( + "tegra_lfb_size_bytes", + "Size of largest free block", nil, nil, + ), + "tegra_mts_bg_percentage": prometheus.NewDesc( + "tegra_mts_bg_percentage", + "Time spent in foreground tasks", nil, nil, + ), + "tegra_mts_fg_percentage": prometheus.NewDesc( + "tegra_mts_fg_percentage", + "Time spent in background tasks", nil, nil, + ), + "tegra_ram_total_bytes": prometheus.NewDesc( + "tegra_ram_total_bytes", + "Total memory", nil, nil, + ), + 
"tegra_ram_used_bytes": prometheus.NewDesc( + "tegra_ram_used_bytes", + "Current used memory", nil, nil, + ), + "tegra_swap_total_bytes": prometheus.NewDesc( + "tegra_swap_total_bytes", + "Total swap memory", nil, nil, + ), + "tegra_swap_cached_bytes": prometheus.NewDesc( + "tegra_swap_cached_bytes", + "Current swap cache memory", nil, nil, + ), + "tegra_swap_used_bytes": prometheus.NewDesc( + "tegra_swap_used_bytes", + "Current swap used memory", nil, nil, + ), + "tegra_wattage_current_milliwatts": prometheus.NewDesc( + "tegra_wattage_current_milliwatts", + "Current Watts of the hardware", nil, nil, + ), + "tegra_wattage_average_milliwatts": prometheus.NewDesc( + "tegra_wattage_average_milliwatts", + "Averaged Watts of the hardware", nil, nil, + ), + }, + } + return newTegraStats +} + +func (t *TegraStats) GetTegraStatsCommandWithArguments() []string { + return t.cmd.Args +} + +func (t *TegraStats) parseTegraStats(s string) []prometheus.Metric { + metrics := []prometheus.Metric{} + // RAM + if m := regRAM.FindAllStringSubmatch(s, len(s)); m != nil { + // [[RAM 5246/7771MB (lfb 7x4MB) 5246 7771 M 7 4 M]] + unitRAM := parseUnit(m[0][3]) + if ramUsed, err := strconv.Atoi(m[0][1]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_ram_used_bytes"], + prometheus.GaugeValue, + float64(ramUsed*unitRAM), + )) + } + if ramTotal, err := strconv.Atoi(m[0][2]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_ram_total_bytes"], + prometheus.GaugeValue, + float64(ramTotal*unitRAM), + )) + } + if lfbBlockCount, err := strconv.Atoi(m[0][5]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_lfb_nblock_count"], + prometheus.GaugeValue, + float64(lfbBlockCount), + )) + } + unitlfb := parseUnit(m[0][7]) + if blockSize, err := strconv.Atoi(m[0][6]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_lfb_size_bytes"], + prometheus.GaugeValue, + 
float64(blockSize*unitlfb), + )) + } + } + + // SWAP + if m := regSwap.FindAllStringSubmatch(s, len(s)); m != nil { + // [[SWAP 983/20269MB (cached 280MB) 983 20269 M 280 M]] + unitSwap := parseUnit(m[0][3]) + if swapUsed, err := strconv.Atoi(m[0][1]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_swap_used_bytes"], + prometheus.GaugeValue, + float64(swapUsed*unitSwap), + )) + } + if swapTotal, err := strconv.Atoi(m[0][2]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_swap_total_bytes"], + prometheus.GaugeValue, + float64(swapTotal*unitSwap), + )) + } + unitCached := parseUnit(m[0][6]) + if swapCached, err := strconv.Atoi(m[0][5]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_swap_cached_bytes"], + prometheus.GaugeValue, + float64(swapCached*unitCached), + )) + } + } + + // CPU + if m := regCPU.FindAllStringSubmatch(s, len(s)); m != nil { + // [[CPU [47%@1420,23%@1420,32%@1420,22%@1420,31%@1420,96%@1420] 47%@1420,23%@1420,32%@1420,22%@1420,31%@1420,96%@1420]] + labels := []string{"cpu"} + dUtil := prometheus.NewDesc("tegra_cpu_util_percentage", "Utilization of CPU in percentage", labels, nil) + dFreq := prometheus.NewDesc("tegra_cpu_frequency_hz", "CPU Clock frequency", labels, nil) + for i, v := range strings.Split(m[0][1], ",") { + if v == "off" { + continue + } + p, f := parseValueWithFreq(v) + metrics = append(metrics, prometheus.MustNewConstMetric( + dUtil, + prometheus.GaugeValue, + float64(p), + fmt.Sprintf("%d", i+1), + )) + metrics = append(metrics, prometheus.MustNewConstMetric( + dFreq, + prometheus.GaugeValue, + // tegrastats outputs the frequency in MHz + float64(f*1e6), + fmt.Sprintf("%d", i+1), + )) + } + } + + // EMC_FREQ + if m := regEMC.FindAllStringSubmatch(s, len(s)); m != nil { + // [[EMC_FREQ 2%@1600 2 1600]] + if emcUsed, err := strconv.Atoi(m[0][1]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + 
t.m["tegra_emc_util_percentage"], + prometheus.GaugeValue, + float64(emcUsed), + )) + } + if emcFreq, err := strconv.Atoi(m[0][2]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_emc_frequency_hz"], + prometheus.GaugeValue, + // tegrastats outputs the frequency in MHz + float64(emcFreq*1e6), + )) + } + } + + // GPU + if m := regGPU.FindAllStringSubmatch(s, len(s)); m != nil { + // [[GR3D_FREQ 0%@1109 0 1109]] + if gpuUsed, err := strconv.Atoi(m[0][1]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_gpu_util_percentage"], + prometheus.GaugeValue, + float64(gpuUsed), + )) + } + if gpuFreq, err := strconv.Atoi(m[0][2]); err == nil { + // tegrastats outputs the frequency in MHz + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_gpu_frequency_hz"], + prometheus.GaugeValue, + // tegrastats outputs the frequency in MHz + float64(gpuFreq*1e6), + )) + } + } + + // MTS + if m := regMTS.FindAllStringSubmatch(s, len(s)); m != nil { + // [[MTS fg 1% bg 9% 1 9]] + if fg, err := strconv.Atoi(m[0][1]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_mts_fg_percentage"], + prometheus.GaugeValue, + float64(fg), + )) + } + if bg, err := strconv.Atoi(m[0][2]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + t.m["tegra_mts_bg_percentage"], + prometheus.GaugeValue, + float64(bg), + )) + } + } + + // Temperature + if m := regTemp.FindAllStringSubmatch(s, len(s)); m != nil { + // [[AO@29C AO 29] [GPU@31.5C GPU 31.5] [PMIC@100C PMIC 100] [AUX@30C AUX 30] [CPU@33.5C CPU 33.5] [thermal@31.35C thermal 31.35]] + labels := []string{"sensor"} + dTemp := prometheus.NewDesc("tegra_temperature_celcius", "Temperature reading in Celcius", labels, nil) + for _, v := range m { + if temp, err := strconv.ParseFloat(v[2], 32); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + dTemp, + prometheus.GaugeValue, + temp, + 
strings.ToLower(v[1]), + )) + } + } + } + + // Watts + if m := regWatt.FindAllStringSubmatch(s, len(s)); m != nil { + // [[VDD_IN 6140/5510 VDD_IN 6140 5510] [VDD_CPU_GPU_CV 2706/2119 VDD_CPU_GPU_CV 2706 2119] [VDD_SOC 1074/1051 VDD_SOC 1074 1051]] + labels := []string{"sensor"} + dCurWatt := prometheus.NewDesc("tegra_wattage_current_milliwatts", "Current Watts of the hardware", labels, nil) + dAvgWatt := prometheus.NewDesc("tegra_wattage_average_milliwatts", "Averaged Watts of the hardware", labels, nil) + for _, v := range m { + if wattCurrent, err := strconv.Atoi(v[2]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + dCurWatt, + prometheus.GaugeValue, + float64(wattCurrent), + strings.ToLower(v[1]), + )) + } + if wattAveraged, err := strconv.Atoi(v[3]); err == nil { + metrics = append(metrics, prometheus.MustNewConstMetric( + dAvgWatt, + prometheus.GaugeValue, + float64(wattAveraged), + strings.ToLower(v[1]), + )) + } + } + } + return metrics +} + +func (t *TegraStats) AddTegraStatsString(s string) { + t.mu.Lock() + t.TegraStatsString = s + t.TegraStatsLastUpdated = time.Now() + t.mu.Unlock() +} + +func (t *TegraStats) AddTegraStatsStringRaw(s string, _t time.Time) { + t.mu.Lock() + t.TegraStatsString = s + t.TegraStatsLastUpdated = _t + t.mu.Unlock() +} + +func (t *TegraStats) Start() error { + if t.cmd == nil { + return fmt.Errorf("no tegrastats command is provided") + } + stdout, err := t.cmd.StdoutPipe() + if err != nil { + return err + } + // stderr, err := t.cmd.StderrPipe() + // if err != nil { + // return err + // } + err = t.cmd.Start() + if err != nil { + return err + } + log.Printf("tegrastats starts with the process %d", t.cmd.Process.Pid) + go func() { + scanner := bufio.NewScanner(stdout) + for scanner.Scan() { + if err := scanner.Err(); err != nil { + log.Println(err) + } else { + t.AddTegraStatsString(scanner.Text()) + } + } + }() + return nil +} + +func (t *TegraStats) Describe(ch chan<- *prometheus.Desc) { + for _, d 
:= range t.m { + ch <- d + } +} + +func (t *TegraStats) Collect(ch chan<- prometheus.Metric) { + t.mu.Lock() + statsStr := t.TegraStatsString + lastUpdated := t.TegraStatsLastUpdated + t.mu.Unlock() + // + if statsStr == "" { + log.Printf("Prometheus Collect called, but tegrastats not yet collected.") + return + } + if cm := t.parseTegraStats(statsStr); len(cm) > 0 { + ch <- prometheus.MustNewConstMetric( + t.m["tegra_last_updated_timestamp_epoch"], + prometheus.GaugeValue, + float64(lastUpdated.Unix()), + ) + for _, c := range cm { + ch <- c + } + } +} + +func (t *TegraStats) Close() { + if t.cmd != nil { + if t.cmd.Process != nil { + log.Printf("Killing tegrastats process %d...", t.cmd.Process.Pid) + t.cmd.Process.Kill() + } + } +} diff --git a/tegrastats_test.go b/tegrastats_test.go new file mode 100644 index 0000000..a6e3448 --- /dev/null +++ b/tegrastats_test.go @@ -0,0 +1,37 @@ +package main + +import ( + "os" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestParseTegraStats(t *testing.T) { + testLine := "RAM 5246/7771MB (lfb 7x4MB) SWAP 983/20269MB (cached 280MB) CPU [47%@1420,23%@1420,32%@1420,22%@1420,31%@1420,96%@1420] EMC_FREQ 2%@1600 GR3D_FREQ 0%@1109 APE 150 MTS fg 1% bg 9% AO@29C GPU@31.5C PMIC@100C AUX@30C CPU@33.5C thermal@31.35C VDD_IN 6140/5510 VDD_CPU_GPU_CV 2706/2119 VDD_SOC 1074/1051" + c := NewTegraStats() + // Because the metrics include the time that represent when the stats collected, + // we inject an arbitrary number that matches with the metrics file. + c.AddTegraStatsStringRaw(testLine, time.Unix(int64(1701465532), 0)) + reg := prometheus.NewRegistry() + reg.MustRegister(c) + + metricsFile := "data/test_tegrastats.txt" + // NOTE: Uncomment below to generate a new file. 
+ // Then, you will need to comment this out again for this unittest + // err := prometheus.WriteToTextfile(metricsFile, reg) + // if err != nil { + // t.Fatalf("Metric comparison failed: %s", err) + // } + + wantMetrics, err := os.Open(metricsFile) + if err != nil { + t.Fatalf("unable to read input test file %s", metricsFile) + } + err = testutil.GatherAndCompare(reg, wantMetrics) + if err != nil { + t.Fatalf("Metric comparison failed: %s", err) + } +}