From 337fb73346b715afe999204e2a72fd30c806a4c1 Mon Sep 17 00:00:00 2001 From: Lars T Hansen Date: Thu, 14 Nov 2024 14:47:13 +0100 Subject: [PATCH] Fix #201 - report per-gpu per-sample statistics, akin to 'load' --- src/amd.rs | 32 +- src/gpu.rs | 51 +- src/nvidia.rs | 397 ++++++++++----- src/ps.rs | 98 +++- src/sysinfo.rs | 4 +- src/testdata/nvidia-smi-output.txt | 778 +++++++++++++++++++++++++++++ tests/gpuinfo.sh | 27 + tests/run_tests.sh | 1 + 8 files changed, 1238 insertions(+), 150 deletions(-) create mode 100644 src/testdata/nvidia-smi-output.txt create mode 100755 tests/gpuinfo.sh diff --git a/src/amd.rs b/src/amd.rs index 741c00b..6f37d6d 100644 --- a/src/amd.rs +++ b/src/amd.rs @@ -1,18 +1,7 @@ -/// Get info about AMD graphics cards by parsing the output of rocm-smi. -/// -/// This is pretty hacky! Something better than this is likely needed and hopefully possible. -/// -/// The returned information is keyed by (device, pid) so that if a process uses multiple devices, -/// the total utilization for the process must be summed across devices. We do this to be -/// compatible with the NVIDIA module (nvidia.rs). -/// -/// There is no information here about absolute memory usage numbers. The cards I have don't -/// support getting that information. Other cards might. In that case, the --showmemusage switch -/// (can be combined with --showgpupids in a single invocation) might be useful. -/// -/// Even though the output is presented in the same format as for NVIDIA, we only have partial stats -/// about the usage of various processes on the various devices. We divide the utilization of a -/// device by the number of processes on the device. This is approximate. +// Get info about AMD graphics cards by parsing the output of rocm-smi. +// +// This is pretty hacky! Something better than this is likely needed and hopefully possible. + use crate::command::{self, CmdError}; use crate::gpu; use crate::ps::UserTable; @@ -35,17 +24,24 @@ pub fn probe() -> Option> { } impl gpu::GPU for AmdGPU { - fn get_manufacturer(&self) -> String { + fn get_manufacturer(&mut self) -> String { "AMD".to_string() } - fn get_configuration(&self) -> Result, String> { + fn get_card_configuration(&mut self) -> Result, String> { get_amd_configuration() } - fn get_utilization(&self, user_by_pid: &UserTable) -> Result, String> { + fn get_process_utilization( + &mut self, + user_by_pid: &UserTable, + ) -> Result, String> { get_amd_utilization(user_by_pid) } + + fn get_card_utilization(&mut self) -> Result, String> { + Ok(vec![]) + } } // On all nodes we've looked at (ML systems, Lumi), /sys/module/amdgpu exists iff there are AMD diff --git a/src/gpu.rs b/src/gpu.rs index 6c2d036..d08c3cf 100644 --- a/src/gpu.rs +++ b/src/gpu.rs @@ -1,8 +1,10 @@ use crate::amd; use crate::nvidia; -use crate::ps::UserTable; +use crate::ps; -#[derive(PartialEq)] +// Per-sample process information, across cards + +#[derive(PartialEq, Default, Clone)] pub struct Process { pub device: Option, // Device ID pub pid: usize, // Process ID @@ -14,9 +16,13 @@ pub struct Process { pub command: String, // The command, _unknown_ for zombies, _noinfo_ if not known } +// Used to tag a Process entry when the uid can't be determined + pub const ZOMBIE_UID: usize = 666666; -#[derive(PartialEq, Default)] +// Sample-invariant card information + +#[derive(PartialEq, Default, Clone)] pub struct Card { pub bus_addr: String, pub index: i32, // Card index (changes at boot) @@ -33,12 +39,45 @@ pub struct Card { pub max_mem_clock_mhz: i32, } +// Per-sample card information, across processes + +#[derive(PartialEq, Default, Clone)] +pub struct CardState { + pub index: i32, // Stable card identifier + pub fan_speed_pct: f32, + pub compute_mode: String, + pub perf_state: String, + pub mem_reserved_kib: i64, + pub mem_used_kib: i64, + pub gpu_utilization_pct: f32, + pub mem_utilization_pct: f32, + pub temp_c: i32, + pub power_watt: i32, + pub power_limit_watt: i32, + pub ce_clock_mhz: i32, + pub mem_clock_mhz: i32, +} + +// Abstract GPU information across GPU types. +// +// As get_manufacturer() is for the GPU object as a whole and not per-card, we are currently +// assuming that nodes don't have cards from multiple manufacturers. +// +// get_card_configuration() and get_card_utilization() return vectors that are sorted by their index +// fields, and indices shall be tightly packed. + pub trait GPU { - fn get_manufacturer(&self) -> String; - fn get_configuration(&self) -> Result, String>; - fn get_utilization(&self, user_by_pid: &UserTable) -> Result, String>; + fn get_manufacturer(&mut self) -> String; + fn get_card_configuration(&mut self) -> Result, String>; + fn get_process_utilization( + &mut self, + user_by_pid: &ps::UserTable, + ) -> Result, String>; + fn get_card_utilization(&mut self) -> Result, String>; } +// Probe the system for GPUs. + pub fn probe() -> Option> { if let Some(nvidia) = nvidia::probe() { Some(nvidia) diff --git a/src/nvidia.rs b/src/nvidia.rs index 710a2c4..aaa6f11 100644 --- a/src/nvidia.rs +++ b/src/nvidia.rs @@ -1,11 +1,5 @@ -/// Run nvidia-smi and return a vector of process samples. -/// -/// The information is keyed by (device, pid) so that if a process uses multiple devices, the total -/// utilization for the process must be summed across devices. (This is the natural mode of output -/// for `nvidia-smi pmon`.) -/// -/// Crucially, the data are sampling data: they contain no (long) running averages, but are -/// snapshots of the system at the time the sample is taken. +// Get info about Nvidia graphics cards by parsing the output of nvidia-smi. + use crate::command::{self, CmdError}; use crate::gpu; use crate::ps::UserTable; @@ -16,28 +10,65 @@ use crate::TIMEOUT_SECONDS; use crate::util::map; use std::path::Path; -pub struct NvidiaGPU {} +pub struct NvidiaGPU { + // At the moment, all this information is the result of a single run of nvidia-smi, so we cache + // it since there will otherwise be two runs. + // + // TODO: It's possible the process information should be derived from this run, too. + info: Option, String>>, +} + +#[derive(Default)] +struct PerCard { + info: gpu::Card, + state: gpu::CardState, +} pub fn probe() -> Option> { if nvidia_present() { - Some(Box::new(NvidiaGPU {})) + Some(Box::new(NvidiaGPU { info: None })) } else { None } } impl gpu::GPU for NvidiaGPU { - fn get_manufacturer(&self) -> String { + fn get_manufacturer(&mut self) -> String { "NVIDIA".to_string() } - fn get_configuration(&self) -> Result, String> { - get_nvidia_configuration() + fn get_card_configuration(&mut self) -> Result, String> { + if self.info.is_none() { + self.info = Some(get_nvidia_configuration(&vec!["-a"])) + } + match self.info.as_ref().unwrap() { + Ok(data) => Ok(data + .iter() + .map(|pc| pc.info.clone()) + .collect::>()), + Err(e) => Err(e.clone()), + } } - fn get_utilization(&self, user_by_pid: &UserTable) -> Result, String> { + fn get_process_utilization( + &mut self, + user_by_pid: &UserTable, + ) -> Result, String> { get_nvidia_utilization(user_by_pid) } + + fn get_card_utilization(&mut self) -> Result, String> { + if self.info.is_none() { + self.info = Some(get_nvidia_configuration(&vec!["-a"])) + } + match self.info.as_ref().unwrap() { + Ok(data) => Ok(data + .iter() + .map(|pc| pc.state.clone()) + .collect::>()), + Err(e) => Err(e.clone()), + } + } } // On all nodes we've looked at (Fox, Betzy, ML systems), /sys/module/nvidia exists iff there are @@ -48,8 +79,9 @@ fn nvidia_present() -> bool { } // `nvidia-smi -a` (aka `nvidia-smi -q`) dumps a lot of information about all the cards in a -// semi-structured form. It is fairly slow, the reason being it also obtains information about -// running processes. But if we only run this function for sysinfo that's OK. +// semi-structured form. Without additional arguments it is fairly slow, the reason being it also +// obtains information about running processes. But if we only run it without more arguments for +// sysinfo then that's OK. For other purposes, adding -d ,... is helpful for performance. // // In brief, the input is a set of lines with a preamble followed by zero or more cards. // Indentation indicates nesting of sections and subsections. Everything ends implicitly; if an @@ -78,121 +110,198 @@ fn nvidia_present() -> bool { // just take that to be the card index. (In contrast, the Minor Number does not always follow that // order.) -pub fn get_nvidia_configuration() -> Result, String> { - match command::safe_command("nvidia-smi", &["-a"], TIMEOUT_SECONDS) { - Ok(raw_text) => { - enum State { - Preamble, - InCard, - FbMemoryUsage, - GpuPowerReadings, - MaxClocks, - } - let mut cuda = "".to_string(); - let mut driver = "".to_string(); - let mut state = State::Preamble; - let mut cards = vec![]; - let mut card: gpu::Card = Default::default(); - 'next_line: for l in raw_text.lines() { - 'reprocess_line: loop { - match state { - State::Preamble => { - if l.starts_with("CUDA Version") { - cuda = field_value(l); - } else if l.starts_with("Driver Version") { - driver = field_value(l); - } else if l.starts_with("GPU ") { - if !card.bus_addr.is_empty() { - cards.push(card); - } - card = Default::default(); - card.bus_addr = l[4..].to_string(); - card.driver = driver.clone(); - card.firmware = cuda.clone(); - card.index = cards.len() as i32; - state = State::InCard; - } - continue 'next_line; +fn get_nvidia_configuration(smi_args: &[&str]) -> Result, String> { + match command::safe_command("nvidia-smi", smi_args, TIMEOUT_SECONDS) { + Ok(raw_text) => Ok(parse_nvidia_configuration(&raw_text)), + Err(CmdError::CouldNotStart(_)) => Ok(vec![]), + Err(e) => Err(format!("{:?}", e)), + } +} + +fn parse_nvidia_configuration(raw_text: &str) -> Vec { + enum State { + Preamble, + InCard, + FbMemoryUsage, + GpuPowerReadings, + MaxClocks, + Clocks, + Utilization, + Temperature, + } + let mut cuda = "".to_string(); + let mut driver = "".to_string(); + let mut state = State::Preamble; + let mut cards = vec![]; + let mut card: PerCard = Default::default(); + 'next_line: for l in raw_text.lines() { + 'reprocess_line: loop { + match state { + State::Preamble => { + if l.starts_with("CUDA Version") { + cuda = field_value(l); + } else if l.starts_with("Driver Version") { + driver = field_value(l); + } else if l.starts_with("GPU ") { + if !card.info.bus_addr.is_empty() { + cards.push(card); } - State::InCard => { - if !l.starts_with(" ") { - state = State::Preamble; - continue 'reprocess_line; - } - if l.starts_with(" Product Name") { - card.model = field_value(l); - } else if l.starts_with(" Product Architecture") { - card.arch = field_value(l); - } else if l.starts_with(" GPU UUID") { - card.uuid = field_value(l); - } else if l == " FB Memory Usage" { - state = State::FbMemoryUsage; - } else if l == " GPU Power Readings" { - state = State::GpuPowerReadings; - } else if l == " Max Clocks" { - state = State::MaxClocks; - } - continue 'next_line; + card = Default::default(); + card.info.bus_addr = l[4..].to_string(); + card.info.driver = driver.clone(); + card.info.firmware = cuda.clone(); + card.info.index = cards.len() as i32; + card.state.index = card.info.index; + state = State::InCard; + } + continue 'next_line; + } + State::InCard => { + if !l.starts_with(" ") { + state = State::Preamble; + continue 'reprocess_line; + } + if l.starts_with(" Product Name") { + card.info.model = field_value(l); + } else if l.starts_with(" Product Architecture") { + card.info.arch = field_value(l); + } else if l.starts_with(" GPU UUID") { + card.info.uuid = field_value(l); + } else if l.starts_with(" Fan Speed") { + if let Ok(n) = field_value_stripped(l, "%").parse::() { + card.state.fan_speed_pct = n; } - State::FbMemoryUsage => { - if !l.starts_with(" ") { - state = State::InCard; - continue 'reprocess_line; - } - if l.starts_with(" Total") { - if let Ok(n) = field_value_stripped(l, "MiB").parse::() { - card.mem_size_kib = n * 1024; - } - } - continue 'next_line; + } else if l.starts_with(" Compute Mode") { + card.state.compute_mode = field_value(l); + } else if l.starts_with(" Performance State") { + card.state.perf_state = field_value(l); + } else if l == " FB Memory Usage" { + state = State::FbMemoryUsage; + } else if l == " GPU Power Readings" { + state = State::GpuPowerReadings; + } else if l == " Max Clocks" { + state = State::MaxClocks; + } else if l == " Clocks" { + state = State::Clocks; + } else if l == " Utilization" { + state = State::Utilization; + } else if l == " Temperature" { + state = State::Temperature; + } + continue 'next_line; + } + State::FbMemoryUsage => { + if !l.starts_with(" ") { + state = State::InCard; + continue 'reprocess_line; + } + if l.starts_with(" Total") { + if let Ok(n) = field_value_stripped(l, "MiB").parse::() { + card.info.mem_size_kib = n * 1024; } - State::GpuPowerReadings => { - if !l.starts_with(" ") { - state = State::InCard; - continue 'reprocess_line; - } - if l.starts_with(" Current Power Limit") { - if let Ok(n) = field_value_stripped(l, "W").parse::() { - card.power_limit_watt = n.ceil() as i32; - } - } else if l.starts_with(" Min Power Limit") { - if let Ok(n) = field_value_stripped(l, "W").parse::() { - card.min_power_limit_watt = n.ceil() as i32; - } - } else if l.starts_with(" Max Power Limit") { - if let Ok(n) = field_value_stripped(l, "W").parse::() { - card.max_power_limit_watt = n.ceil() as i32; - } - } - continue 'next_line; + } else if l.starts_with(" Reserved") { + if let Ok(n) = field_value_stripped(l, "MiB").parse::() { + card.state.mem_reserved_kib = n * 1024; } - State::MaxClocks => { - if !l.starts_with(" ") { - state = State::InCard; - continue 'reprocess_line; - } - if l.starts_with(" SM") { - if let Ok(n) = field_value_stripped(l, "MHz").parse::() { - card.max_ce_clock_mhz = n; - } - } else if l.starts_with(" Memory") { - if let Ok(n) = field_value_stripped(l, "MHz").parse::() { - card.max_mem_clock_mhz = n; - } - } - continue 'next_line; + } else if l.starts_with(" Used") { + if let Ok(n) = field_value_stripped(l, "MiB").parse::() { + card.state.mem_used_kib = n * 1024; } } + continue 'next_line; + } + State::GpuPowerReadings => { + if !l.starts_with(" ") { + state = State::InCard; + continue 'reprocess_line; + } + if l.starts_with(" Current Power Limit") { + if let Ok(n) = field_value_stripped(l, "W").parse::() { + card.info.power_limit_watt = n.ceil() as i32; + card.state.power_limit_watt = card.info.power_limit_watt; + } + } else if l.starts_with(" Min Power Limit") { + if let Ok(n) = field_value_stripped(l, "W").parse::() { + card.info.min_power_limit_watt = n.ceil() as i32; + } + } else if l.starts_with(" Max Power Limit") { + if let Ok(n) = field_value_stripped(l, "W").parse::() { + card.info.max_power_limit_watt = n.ceil() as i32; + } + } else if l.starts_with(" Power Draw") { + if let Ok(n) = field_value_stripped(l, "W").parse::() { + card.state.power_watt = n.ceil() as i32; + } + } + continue 'next_line; + } + State::MaxClocks => { + if !l.starts_with(" ") { + state = State::InCard; + continue 'reprocess_line; + } + if l.starts_with(" SM") { + if let Ok(n) = field_value_stripped(l, "MHz").parse::() { + card.info.max_ce_clock_mhz = n; + } + } else if l.starts_with(" Memory") { + if let Ok(n) = field_value_stripped(l, "MHz").parse::() { + card.info.max_mem_clock_mhz = n; + } + } + continue 'next_line; + } + State::Clocks => { + if !l.starts_with(" ") { + state = State::InCard; + continue 'reprocess_line; + } + if l.starts_with(" SM") { + if let Ok(n) = field_value_stripped(l, "MHz").parse::() { + card.state.ce_clock_mhz = n; + } + } else if l.starts_with(" Memory") { + if let Ok(n) = field_value_stripped(l, "MHz").parse::() { + card.state.mem_clock_mhz = n; + } + } + continue 'next_line; + } + State::Utilization => { + if !l.starts_with(" ") { + state = State::InCard; + continue 'reprocess_line; + } + if l.starts_with(" Gpu") { + if let Ok(n) = field_value_stripped(l, "%").parse::() { + card.state.gpu_utilization_pct = n; + } + } else if l.starts_with(" Memory") { + if let Ok(n) = field_value_stripped(l, "%").parse::() { + card.state.mem_utilization_pct = n; + } + } + continue 'next_line; + } + State::Temperature => { + if !l.starts_with(" ") { + state = State::InCard; + continue 'reprocess_line; + } + if l.starts_with(" GPU Current Temp") { + if let Ok(n) = field_value_stripped(l, "C").parse::() { + card.state.temp_c = n; + } + } + continue 'next_line; } } - if !card.bus_addr.is_empty() { - cards.push(card); - } - Ok(cards) } - Err(CmdError::CouldNotStart(_)) => Ok(vec![]), - Err(e) => Err(format!("{:?}", e)), } + if !card.info.bus_addr.is_empty() { + cards.push(card); + } + cards } fn field_value(l: &str) -> String { @@ -661,3 +770,49 @@ fn test_parsed_bad_query_output5() { 1864615, y1426"; assert!(parse_query_output(text, &mkusers()).is_err()); } + +#[test] +fn test_parse_nvidia_configuration() { + // Some fields in that output have been anonymized and a few have been changed to make it more + // interesting. + let cs = parse_nvidia_configuration(std::include_str!("testdata/nvidia-smi-output.txt")); + + // Check # of cards and that they are plausibly independent + assert!(cs.len() == 4); + assert!(cs[0].info.bus_addr == "00000000:18:00.0"); + assert!(cs[0].info.index == 0); + assert!(cs[1].info.bus_addr == "00000000:3B:00.0"); + assert!(cs[1].info.index == 1); + assert!(cs[2].info.bus_addr == "00000000:86:00.0"); + assert!(cs[2].info.index == 2); + assert!(cs[3].info.bus_addr == "00000000:AF:00.0"); + assert!(cs[3].info.index == 3); + + // Check details of cs[3] (more interesting than cs[0]) + let c = &cs[3]; + assert!(c.info.model == "NVIDIA GeForce RTX 2080 Ti"); + assert!(c.info.arch == "Turing"); + assert!(c.info.driver == "545.23.08"); + assert!(c.info.firmware == "12.3"); + assert!(c.info.uuid == "GPU-198d6802-0000-0000-0000-000000000000"); + assert!(c.info.mem_size_kib == 11264*1024); + assert!(c.info.power_limit_watt == 250); + assert!(c.info.max_power_limit_watt == 280); + assert!(c.info.min_power_limit_watt == 100); + assert!(c.info.max_ce_clock_mhz == 2100); + assert!(c.info.max_mem_clock_mhz == 7000); + + assert!(c.state.index == 3); + assert!(c.state.fan_speed_pct == 28.0); + assert!(c.state.compute_mode == "Default"); + assert!(c.state.perf_state == "P8"); + assert!(c.state.mem_reserved_kib == 252*1024); + assert!(c.state.mem_used_kib == 3*1024); + assert!(c.state.gpu_utilization_pct == 5.0); + assert!(c.state.mem_utilization_pct == 8.0); + assert!(c.state.temp_c == 34); + assert!(c.state.power_watt == 19); // ceil(18.10) + assert!(c.state.power_limit_watt == 250); + assert!(c.state.ce_clock_mhz == 300); + assert!(c.state.mem_clock_mhz == 405); +} diff --git a/src/ps.rs b/src/ps.rs index a4fcf92..44964df 100644 --- a/src/ps.rs +++ b/src/ps.rs @@ -340,12 +340,58 @@ fn do_create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timesta let mut gpu_status = GpuStatus::Ok; let gpu_utilization: Vec; + let mut gpu_info: String = "".to_string(); match gpu::probe() { None => { gpu_status = GpuStatus::UnknownFailure; } - Some(gpu) => { - match gpu.get_utilization(&user_by_pid) { + Some(mut gpu) => { + match gpu.get_card_utilization() { + Err(_) => {} + Ok(ref cards) => { + let mut s = "".to_string(); + s = add_key(s, "fan%", cards, |c: &gpu::CardState| { + nonzero(c.fan_speed_pct as i64) + }); + s = add_key(s, "mode", cards, |c: &gpu::CardState| { + if c.compute_mode == "Default" { + "".to_string() + } else { + c.compute_mode.clone() + } + }); + s = add_key(s, "perf", cards, |c: &gpu::CardState| c.perf_state.clone()); + // Reserved memory is really not interesting, it's possible it would have been + // interesting as part of the card configuration. + //s = add_key(s, "mreskib", cards, |c: &gpu::CardState| nonzero(c.mem_reserved_kib)); + s = add_key(s, "musekib", cards, |c: &gpu::CardState| { + nonzero(c.mem_used_kib) + }); + s = add_key(s, "cutil%", cards, |c: &gpu::CardState| { + nonzero(c.gpu_utilization_pct as i64) + }); + s = add_key(s, "mutil%", cards, |c: &gpu::CardState| { + nonzero(c.mem_utilization_pct as i64) + }); + s = add_key(s, "tempc", cards, |c: &gpu::CardState| { + nonzero(c.temp_c.into()) + }); + s = add_key(s, "poww", cards, |c: &gpu::CardState| { + nonzero(c.power_watt.into()) + }); + s = add_key(s, "powlimw", cards, |c: &gpu::CardState| { + nonzero(c.power_limit_watt.into()) + }); + s = add_key(s, "cez", cards, |c: &gpu::CardState| { + nonzero(c.ce_clock_mhz.into()) + }); + s = add_key(s, "memz", cards, |c: &gpu::CardState| { + nonzero(c.mem_clock_mhz.into()) + }); + gpu_info = s; + } + } + match gpu.get_process_utilization(&user_by_pid) { Err(_e) => { gpu_status = GpuStatus::UnknownFailure; } @@ -496,6 +542,7 @@ fn do_create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timesta } else { None }, + if !did_print { Some(&gpu_info) } else { None }, ) { Ok(did_print_one) => did_print = did_print_one || did_print, Err(_) => { @@ -541,6 +588,7 @@ fn do_create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timesta } else { None }, + if !did_print { Some(&gpu_info) } else { None }, ); } @@ -548,6 +596,44 @@ fn do_create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timesta let _ = writer.flush(); } +fn add_key( + mut s: String, + key: &str, + cards: &[gpu::CardState], + extract: fn(&gpu::CardState) -> String, +) -> String { + let mut vs = "".to_string(); + let mut any = false; + let mut first = true; + for c in cards { + let v = extract(c); + if !first { + vs = vs + "|"; + } + if v != "" { + any = true; + vs = vs + &v; + } + first = false; + } + if any { + if s != "" { + s += ","; + } + s + key + "=" + &vs + } else { + s + } +} + +fn nonzero(x: i64) -> String { + if x == 0 { + "".to_string() + } else { + format!("{:?}", x) + } +} + fn filter_proc(proc_info: &ProcInfo, params: &PrintParameters) -> bool { let mut included = false; @@ -619,6 +705,7 @@ fn print_record( params: &PrintParameters, proc_info: &ProcInfo, per_cpu_secs: Option<&[u64]>, + gpu_info: Option<&str>, ) -> Result { // Mandatory fields. @@ -694,7 +781,12 @@ fn print_record( if params.opts.load { if let Some(cpu_secs) = per_cpu_secs { if cpu_secs.len() > 0 { - fields.push(format!("load={}", encode_cpu_secs_base45el(cpu_secs))) + fields.push(format!("load={}", encode_cpu_secs_base45el(cpu_secs))); + } + } + if let Some(gpu_info) = gpu_info { + if gpu_info != "" { + fields.push(format!("gpuinfo={gpu_info}")); } } } diff --git a/src/sysinfo.rs b/src/sysinfo.rs index f1c6495..387c952 100644 --- a/src/sysinfo.rs +++ b/src/sysinfo.rs @@ -29,8 +29,8 @@ fn do_show_system( let mem_by = procfs::get_memtotal_kib(fs)? * 1024; let mem_gib = (mem_by as f64 / GIB as f64).round() as i64; let (mut cards, manufacturer) = match gpu::probe() { - Some(device) => ( - match device.get_configuration() { + Some(mut device) => ( + match device.get_card_configuration() { Ok(cards) => cards, Err(_) => vec![], }, diff --git a/src/testdata/nvidia-smi-output.txt b/src/testdata/nvidia-smi-output.txt new file mode 100644 index 0000000..a294f07 --- /dev/null +++ b/src/testdata/nvidia-smi-output.txt @@ -0,0 +1,778 @@ + +==============NVSMI LOG============== + +Timestamp : Fri Nov 15 10:23:11 2024 +Driver Version : 545.23.08 +CUDA Version : 12.3 + +Attached GPUs : 4 +GPU 00000000:18:00.0 + Product Name : NVIDIA GeForce RTX 2080 Ti + Product Brand : GeForce + Product Architecture : Turing + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-2a8f06c3-0000-0000-0000-000000000000 + Minor Number : 0 + VBIOS Version : 90.02.0B.40.09 + MultiGPU Board : No + Board ID : 0x1800 + Board Part Number : N/A + GPU Part Number : 1E04-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G001.0000.02.04 + OEM Object : 1.1 + ECC Object : N/A + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU C2C Mode : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x18 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x1E0410DE + Bus Id : 00000000:18:00.0 + Sub System Id : 0x86751043 + GPU Link Info + PCIe Generation + Max : 3 + Current : 3 + Device Current : 3 + Device Max : 3 + Host Max : 3 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 3058000 KB/s + Rx Throughput : 310000 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 56 % + Performance State : P2 + Clocks Event Reasons + Idle : Not Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 11264 MiB + Reserved : 252 MiB + Used : 10081 MiB + Free : 929 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 8 MiB + Free : 248 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 70 % + Memory : 39 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : N/A + Pending : N/A + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows : N/A + Temperature + GPU Current Temp : 86 C + GPU T.Limit Temp : N/A + GPU Shutdown Temp : 94 C + GPU Slowdown Temp : 91 C + GPU Max Operating Temp : 89 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating Temp : N/A + GPU Power Readings + Power Draw : 191.43 W + Current Power Limit : 250.00 W + Requested Power Limit : 250.00 W + Default Power Limit : 250.00 W + Min Power Limit : 110.00 W + Max Power Limit : 290.00 W + GPU Memory Power Readings + Power Draw : N/A + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 1485 MHz + SM : 1485 MHz + Memory : 6800 MHz + Video : 1380 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 2100 MHz + SM : 2100 MHz + Memory : 7000 MHz + Video : 1950 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : N/A + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 2620449 + Type : C + Name : python3 + Used GPU Memory : 9876 MiB + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 3066537 + Type : C + Name : /itf-fi-ml/home/user1/.julia/juliaup/julia-1.9.4+0.x64.linux.gnu/bin/julia + Used GPU Memory : 202 MiB + +GPU 00000000:3B:00.0 + Product Name : NVIDIA GeForce RTX 2080 Ti + Product Brand : GeForce + Product Architecture : Turing + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-16bca67b-0000-0000-0000-000000000000 + Minor Number : 1 + VBIOS Version : 90.02.0B.00.BB + MultiGPU Board : No + Board ID : 0x3b00 + Board Part Number : N/A + GPU Part Number : 1E07-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G001.0000.02.04 + OEM Object : 1.1 + ECC Object : N/A + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU C2C Mode : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x3B + Device : 0x00 + Domain : 0x0000 + Device Id : 0x1E0710DE + Bus Id : 00000000:3B:00.0 + Sub System Id : 0x86661043 + GPU Link Info + PCIe Generation + Max : 3 + Current : 1 + Device Current : 1 + Device Max : 3 + Host Max : 3 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 30 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 11264 MiB + Reserved : 252 MiB + Used : 1299 MiB + Free : 9711 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 6 MiB + Free : 250 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : N/A + Pending : N/A + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows : N/A + Temperature + GPU Current Temp : 40 C + GPU T.Limit Temp : N/A + GPU Shutdown Temp : 94 C + GPU Slowdown Temp : 91 C + GPU Max Operating Temp : 89 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating Temp : N/A + GPU Power Readings + Power Draw : 40.23 W + Current Power Limit : 250.00 W + Requested Power Limit : 250.00 W + Default Power Limit : 250.00 W + Min Power Limit : 100.00 W + Max Power Limit : 300.00 W + GPU Memory Power Readings + Power Draw : N/A + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 300 MHz + SM : 300 MHz + Memory : 405 MHz + Video : 540 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 2100 MHz + SM : 2100 MHz + Memory : 7000 MHz + Video : 1950 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : N/A + Fabric + State : N/A + Status : N/A + Processes + GPU instance ID : N/A + Compute instance ID : N/A + Process ID : 2482652 + Type : C + Name : python3 + Used GPU Memory : 1296 MiB + +GPU 00000000:86:00.0 + Product Name : NVIDIA GeForce RTX 2080 Ti + Product Brand : GeForce + Product Architecture : Turing + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-e53e5dfb-0000-0000-0000-000000000000 + Minor Number : 2 + VBIOS Version : 90.02.0B.40.09 + MultiGPU Board : No + Board ID : 0x8600 + Board Part Number : N/A + GPU Part Number : 1E04-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G001.0000.02.04 + OEM Object : 1.1 + ECC Object : N/A + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU C2C Mode : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x86 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x1E0410DE + Bus Id : 00000000:86:00.0 + Sub System Id : 0x86751043 + GPU Link Info + PCIe Generation + Max : 3 + Current : 1 + Device Current : 1 + Device Max : 3 + Host Max : 3 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 28 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 11264 MiB + Reserved : 252 MiB + Used : 3 MiB + Free : 11007 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 0 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : N/A + Pending : N/A + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows : N/A + Temperature + GPU Current Temp : 34 C + GPU T.Limit Temp : N/A + GPU Shutdown Temp : 94 C + GPU Slowdown Temp : 91 C + GPU Max Operating Temp : 89 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating Temp : N/A + GPU Power Readings + Power Draw : 40.26 W + Current Power Limit : 250.00 W + Requested Power Limit : 250.00 W + Default Power Limit : 250.00 W + Min Power Limit : 100.00 W + Max Power Limit : 280.00 W + GPU Memory Power Readings + Power Draw : N/A + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 300 MHz + SM : 300 MHz + Memory : 405 MHz + Video : 540 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 2100 MHz + SM : 2100 MHz + Memory : 7000 MHz + Video : 1950 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : N/A + Fabric + State : N/A + Status : N/A + Processes : None + +GPU 00000000:AF:00.0 + Product Name : NVIDIA GeForce RTX 2080 Ti + Product Brand : GeForce + Product Architecture : Turing + Display Mode : Disabled + Display Active : Disabled + Persistence Mode : Enabled + Addressing Mode : None + MIG Mode + Current : N/A + Pending : N/A + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-198d6802-0000-0000-0000-000000000000 + Minor Number : 3 + VBIOS Version : 90.02.0B.40.09 + MultiGPU Board : No + Board ID : 0xaf00 + Board Part Number : N/A + GPU Part Number : 1E04-300-A1 + FRU Part Number : N/A + Module ID : 1 + Inforom Version + Image Version : G001.0000.02.04 + OEM Object : 1.1 + ECC Object : N/A + Power Management Object : N/A + Inforom BBX Object Flush + Latest Timestamp : N/A + Latest Duration : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GSP Firmware Version : N/A + GPU C2C Mode : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + GPU Reset Status + Reset Required : No + Drain and Reset Recommended : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0xAF + Device : 0x00 + Domain : 0x0000 + Device Id : 0x1E0410DE + Bus Id : 00000000:AF:00.0 + Sub System Id : 0x86751043 + GPU Link Info + PCIe Generation + Max : 3 + Current : 1 + Device Current : 1 + Device Max : 3 + Host Max : 3 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Atomic Caps Inbound : N/A + Atomic Caps Outbound : N/A + Fan Speed : 28 % + Performance State : P8 + Clocks Event Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 11264 MiB + Reserved : 252 MiB + Used : 3 MiB + Free : 11007 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 4 MiB + Free : 252 MiB + Conf Compute Protected Memory Usage + Total : 0 MiB + Used : 0 MiB + Free : 0 MiB + Compute Mode : Default + Utilization + Gpu : 5 % + Memory : 8 % + Encoder : 0 % + Decoder : 0 % + JPEG : 0 % + OFA : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + ECC Mode + Current : N/A + Pending : N/A + ECC Errors + Volatile + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Aggregate + SRAM Correctable : N/A + SRAM Uncorrectable : N/A + DRAM Correctable : N/A + DRAM Uncorrectable : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Remapped Rows : N/A + Temperature + GPU Current Temp : 34 C + GPU T.Limit Temp : N/A + GPU Shutdown Temp : 94 C + GPU Slowdown Temp : 91 C + GPU Max Operating Temp : 89 C + GPU Target Temperature : 84 C + Memory Current Temp : N/A + Memory Max Operating Temp : N/A + GPU Power Readings + Power Draw : 18.10 W + Current Power Limit : 250.00 W + Requested Power Limit : 250.00 W + Default Power Limit : 250.00 W + Min Power Limit : 100.00 W + Max Power Limit : 280.00 W + GPU Memory Power Readings + Power Draw : N/A + Module Power Readings + Power Draw : N/A + Current Power Limit : N/A + Requested Power Limit : N/A + Default Power Limit : N/A + Min Power Limit : N/A + Max Power Limit : N/A + Clocks + Graphics : 300 MHz + SM : 300 MHz + Memory : 405 MHz + Video : 540 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Deferred Clocks + Memory : N/A + Max Clocks + Graphics : 2100 MHz + SM : 2100 MHz + Memory : 7000 MHz + Video : 1950 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Voltage + Graphics : N/A + Fabric + State : N/A + Status : N/A + Processes : None + diff --git a/tests/gpuinfo.sh b/tests/gpuinfo.sh new file mode 100755 index 0000000..6bf5d51 --- /dev/null +++ b/tests/gpuinfo.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# +# Test that we can emit per-gpu load data properly. + +set -e + +# Currently testing this only on nvidia. +if [[ ! -e /sys/module/nvidia ]]; then + exit 0 +fi + +( cd .. ; cargo build ) + +# The field is going to be there because cards always have some non-default data (fan speeds, +# performance state, power, clocks). + +loadlines=$(../target/debug/sonar ps --load | grep -E ',"?gpuinfo=' | wc -l) +if [[ $loadlines -ne 1 ]]; then + echo "Did not emit gpuinfo data properly - not exactly 1: $loadlines" + exit 1 +fi + +loadlines=$(../target/debug/sonar ps | grep -E ',"?gpuinfo=' | wc -l) +if [[ $loadlines -ne 0 ]]; then + echo "Did not emit gpuinfo data properly - not exactly 0: $loadlines" + exit 1 +fi diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 45a02e6..32695c3 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -18,6 +18,7 @@ for test in command-line \ exclude-commands \ exclude-system-jobs \ exclude-users \ + gpuinfo \ hostname \ interrupt \ load \