Skip to content

Commit fcfc724

Browse files
author
Lars T Hansen
committed
Fix #200 - Extract more GPU information for sysinfo
1 parent df4d563 commit fcfc724

File tree

4 files changed

+214
-53
lines changed

4 files changed

+214
-53
lines changed

src/amd.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use crate::util::map;
2828
// accelerators present.
2929

3030
fn amd_present() -> bool {
31-
return Path::new("/sys/module/amdgpu").exists()
31+
return Path::new("/sys/module/amdgpu").exists();
3232
}
3333

3434
// We only have one machine with AMD GPUs at UiO and rocm-smi is unable to show e.g. how much memory
@@ -49,7 +49,7 @@ fn amd_present() -> bool {
4949

5050
pub fn get_amd_configuration() -> Option<Vec<gpu::Card>> {
5151
if !amd_present() {
52-
return None
52+
return None;
5353
}
5454
match command::safe_command("rocm-smi", &["--showproductname"], TIMEOUT_SECONDS) {
5555
Ok(raw_text) => {
@@ -61,7 +61,7 @@ pub fn get_amd_configuration() -> Option<Vec<gpu::Card>> {
6161
if let Some((_, after)) = l.split_once("Card series:") {
6262
cards.push(gpu::Card {
6363
model: after.trim().to_string(),
64-
mem_size_kib: 0,
64+
..Default::default()
6565
});
6666
}
6767
}
@@ -83,7 +83,7 @@ pub fn get_amd_configuration() -> Option<Vec<gpu::Card>> {
8383

8484
pub fn get_amd_information(user_by_pid: &UserTable) -> Result<Vec<gpu::Process>, String> {
8585
if !amd_present() {
86-
return Ok(vec![])
86+
return Ok(vec![]);
8787
}
8888

8989
// I've not been able to combine the two invocations of rocm-smi yet; we have to run the command

src/gpu.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,19 @@ pub struct Process {
1212

1313
pub const ZOMBIE_UID: usize = 666666;
1414

15-
#[derive(PartialEq)]
15+
#[derive(PartialEq, Default)]
1616
pub struct Card {
17-
pub model: String,
17+
pub bus_addr: String,
18+
pub index: i32, // Card index (changes at boot)
19+
pub model: String, // NVIDIA: Product Name
20+
pub arch: String, // NVIDIA: Product Architecture
21+
pub driver: String, // NVIDIA: driver version
22+
pub firmware: String, // NVIDIA: CUDA version
23+
pub uuid: String, // NVIDIA: The uuid
1824
pub mem_size_kib: i64,
25+
pub power_limit_watt: i32, // "current", but probably changes rarely
26+
pub max_power_limit_watt: i32,
27+
pub min_power_limit_watt: i32,
28+
pub max_ce_clock_mhz: i32,
29+
pub max_mem_clock_mhz: i32,
1930
}

src/nvidia.rs

Lines changed: 149 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -12,84 +12,191 @@ use crate::ps::UserTable;
1212
use crate::util;
1313
use crate::TIMEOUT_SECONDS;
1414

15-
use std::path::Path;
1615
#[cfg(test)]
1716
use crate::util::map;
17+
use std::path::Path;
1818

1919
// On all nodes we've looked at (Fox, Betzy, ML systems), /sys/module/nvidia exists iff there are
2020
// nvidia accelerators present.
2121

2222
fn nvidia_present() -> bool {
23-
return Path::new("/sys/module/nvidia").exists()
23+
return Path::new("/sys/module/nvidia").exists();
2424
}
2525

26-
// `nvidia-smi -a` dumps a lot of information about all the cards in a semi-structured form,
27-
// each line a textual keyword/value pair.
26+
// `nvidia-smi -a` (aka `nvidia-smi -q`) dumps a lot of information about all the cards in a
27+
// semi-structured form. It is fairly slow because it also obtains information about
28+
// running processes. But if we only run this function for sysinfo that's OK.
29+
//
30+
// In brief, the input is a set of lines with a preamble followed by zero or more cards.
31+
// Indentation indicates nesting of sections and subsections. Everything ends implicitly; if an
32+
// indent-4 line is encountered inside a section then that ends the section, if an indent-0 line is
33+
// encountered inside a section or card then that ends the card.
34+
//
35+
// Against that background, these regexes matching full lines describe a state machine:
36+
//
37+
// - a line matching /^CUDA Version\s*:\s*(.*)$/ registers the common CUDA version
38+
// - a line matching /^Driver Version\s*:\s*(.*)$/ registers the common driver version
39+
// - a line matching /^GPU (.*)/ starts a new card, the card is named by $1.
40+
// - a line matching /^\s{4}(${name})\s*:\s*(.*)$/ names a keyword-value pair not in a section
41+
// where $1 is the keyword and $2 is the value; ${name} is /[A-Z][^:]*/
42+
// - a line matching /^\s{4}(${name})$/ is the start of a top-level section
43+
// - a line matching /^\s{8}(${name})\s*:\s*(.*)$/ names a keyword-value pair in a section,
44+
// where $1 is the keyword and $2 is the value
45+
// - a line matching /^\s+(.*)$/ but not any of the above is either a subsubsection value,
46+
// a subsubsection start, or other gunk we don't care about
47+
// - a blank line or eof marks the end of the card
2848
//
29-
// "Product Name" names the card. Following the string "FB Memory Usage", "Total" has the
30-
// memory of the card.
49+
// To avoid building a lexer/parser or playing with regexes we can match against the entire line or
50+
// the beginning of line, within a context. Note the use of "==" rather than "starts_with" to enter
51+
// into subsections is deliberate, as several subsections may start with the same word ("Clocks").
3152
//
32-
// Parsing all the output lines in order yields the information about all the cards.
53+
// It looks like nvidia-smi enumerates cards in a consistent order by increasing bus address, so
54+
// just take that to be the card index. (In contrast, the Minor Number does not always follow that
55+
// order.)
3356

3457
pub fn get_nvidia_configuration() -> Option<Vec<gpu::Card>> {
3558
if !nvidia_present() {
36-
return None
59+
return None;
3760
}
61+
3862
match command::safe_command("nvidia-smi", &["-a"], TIMEOUT_SECONDS) {
3963
Ok(raw_text) => {
64+
enum State {
65+
Preamble,
66+
InCard,
67+
FbMemoryUsage,
68+
GpuPowerReadings,
69+
MaxClocks,
70+
}
71+
let mut cuda = "".to_string();
72+
let mut driver = "".to_string();
73+
let mut state = State::Preamble;
4074
let mut cards = vec![];
41-
let mut looking_for_total = false;
42-
let mut model_name = None;
43-
for l in raw_text.lines() {
44-
// The regular expressions that trigger state transitions are really these:
45-
//
46-
// /^\s*Product Name\s*:\s*(.*)$/
47-
// /^\s*FB Memory Usage\s*$/
48-
// /^\s*Total\s*:\s*(\d+)\s*MiB\s*$/
49-
//
50-
// but we simplify a bit and use primitive string manipulation.
51-
let l = l.trim();
52-
if looking_for_total {
53-
if l.starts_with("Total") && l.ends_with("MiB") {
54-
if let Some((_, after)) = l.split_once(':') {
55-
let rest = after.strip_suffix("MiB").expect("Suffix checked").trim();
56-
if let Ok(n) = rest.parse::<i64>() {
57-
if let Some(m) = model_name {
58-
cards.push(gpu::Card {
59-
model: m,
60-
mem_size_kib: n * 1024,
61-
});
62-
model_name = None;
75+
let mut card: gpu::Card = Default::default();
76+
'next_line: for l in raw_text.lines() {
77+
'reprocess_line: loop {
78+
match state {
79+
State::Preamble => {
80+
if l.starts_with("CUDA Version") {
81+
cuda = field_value(l);
82+
} else if l.starts_with("Driver Version") {
83+
driver = field_value(l);
84+
} else if l.starts_with("GPU ") {
85+
if !card.bus_addr.is_empty() {
86+
cards.push(card);
6387
}
88+
card = Default::default();
89+
card.bus_addr = l[4..].to_string();
90+
card.driver = driver.clone();
91+
card.firmware = cuda.clone();
92+
card.index = cards.len() as i32;
93+
state = State::InCard;
6494
}
95+
continue 'next_line;
6596
}
66-
}
67-
} else {
68-
if l.starts_with("Product Name") {
69-
if let Some((_, rest)) = l.split_once(':') {
70-
model_name = Some(rest.trim().to_string());
71-
continue;
97+
State::InCard => {
98+
if !l.starts_with(" ") {
99+
state = State::Preamble;
100+
continue 'reprocess_line;
101+
}
102+
if l.starts_with(" Product Name") {
103+
card.model = field_value(l);
104+
} else if l.starts_with(" Product Architecture") {
105+
card.arch = field_value(l);
106+
} else if l.starts_with(" GPU UUID") {
107+
card.uuid = field_value(l);
108+
} else if l == " FB Memory Usage" {
109+
state = State::FbMemoryUsage;
110+
} else if l == " GPU Power Readings" {
111+
state = State::GpuPowerReadings;
112+
} else if l == " Max Clocks" {
113+
state = State::MaxClocks;
114+
}
115+
continue 'next_line;
116+
}
117+
State::FbMemoryUsage => {
118+
if !l.starts_with(" ") {
119+
state = State::InCard;
120+
continue 'reprocess_line;
121+
}
122+
if l.starts_with(" Total") {
123+
if let Ok(n) = field_value_stripped(l, "MiB").parse::<i64>() {
124+
card.mem_size_kib = n * 1024;
125+
}
126+
}
127+
continue 'next_line;
128+
}
129+
State::GpuPowerReadings => {
130+
if !l.starts_with(" ") {
131+
state = State::InCard;
132+
continue 'reprocess_line;
133+
}
134+
if l.starts_with(" Current Power Limit") {
135+
if let Ok(n) = field_value_stripped(l, "W").parse::<f64>() {
136+
card.power_limit_watt = n.ceil() as i32;
137+
}
138+
} else if l.starts_with(" Min Power Limit") {
139+
if let Ok(n) = field_value_stripped(l, "W").parse::<f64>() {
140+
card.min_power_limit_watt = n.ceil() as i32;
141+
}
142+
} else if l.starts_with(" Max Power Limit") {
143+
if let Ok(n) = field_value_stripped(l, "W").parse::<f64>() {
144+
card.max_power_limit_watt = n.ceil() as i32;
145+
}
146+
}
147+
continue 'next_line;
148+
}
149+
State::MaxClocks => {
150+
if !l.starts_with(" ") {
151+
state = State::InCard;
152+
continue 'reprocess_line;
153+
}
154+
if l.starts_with(" SM") {
155+
if let Ok(n) = field_value_stripped(l, "MHz").parse::<i32>() {
156+
card.max_ce_clock_mhz = n;
157+
}
158+
} else if l.starts_with(" Memory") {
159+
if let Ok(n) = field_value_stripped(l, "MHz").parse::<i32>() {
160+
card.max_mem_clock_mhz = n;
161+
}
162+
}
163+
continue 'next_line;
72164
}
73-
}
74-
if l.starts_with("FB Memory Usage") {
75-
looking_for_total = true;
76-
continue;
77165
}
78166
}
79-
looking_for_total = false;
167+
}
168+
if !card.bus_addr.is_empty() {
169+
cards.push(card);
80170
}
81171
Some(cards)
82172
}
83173
Err(_) => None,
84174
}
85175
}
86176

177+
// Return the value part of a `keyword : value` line: everything after the first ':' in `l`, with
// surrounding whitespace removed.  Only the first ':' separates keyword from value, so a value
// that itself contains ':' is returned intact.  If `l` contains no ':' at all the result is the
// empty string.
fn field_value(l: &str) -> String {
    l.split_once(':')
        .map(|(_, value)| value.trim().to_string())
        .unwrap_or_default()
}
184+
185+
// Return the value part of a `keyword : value <suffix>` line with the trailing `suffix` (typically
// a unit such as "MiB", "W" or "MHz") removed and surrounding whitespace trimmed.
//
// The value is everything after the first ':' in `l`.  If `l` has no ':' or the value does not end
// with `suffix` (for example when the value is "N/A"), the result is the empty string, which the
// callers' numeric parse then rejects.
//
// The value is trimmed *before* the suffix is stripped so that trailing whitespace after the unit
// does not cause an otherwise well-formed field to be dropped.
fn field_value_stripped(l: &str, suffix: &str) -> String {
    l.split_once(':')
        .and_then(|(_, value)| value.trim().strip_suffix(suffix))
        .map(|number| number.trim().to_string())
        .unwrap_or_default()
}
193+
87194
// Err(e) really means the command started running but failed, for the reason given. If the
88195
// command could not be found or no card is present, we return Ok(vec![]).
89196

90197
pub fn get_nvidia_information(user_by_pid: &UserTable) -> Result<Vec<gpu::Process>, String> {
91198
if !nvidia_present() {
92-
return Ok(vec![])
199+
return Ok(vec![]);
93200
}
94201
match command::safe_command(NVIDIA_PMON_COMMAND, NVIDIA_PMON_ARGS, TIMEOUT_SECONDS) {
95202
Ok(pmon_raw_text) => {

src/sysinfo.rs

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ fn do_show_system(
4343
} else {
4444
""
4545
};
46-
let (gpu_desc, gpu_cards, gpumem_gb) = if !cards.is_empty() {
46+
let (gpu_desc, gpu_cards, gpumem_gb, gpu_info) = if !cards.is_empty() {
4747
// Sort cards
4848
cards.sort_by(|a: &gpu::Card, b: &gpu::Card| {
4949
if a.model == b.model {
@@ -58,7 +58,10 @@ fn do_show_system(
5858
let mut gpu_desc = "".to_string();
5959
while i < cards.len() {
6060
let first = i;
61-
while i < cards.len() && cards[i] == cards[first] {
61+
while i < cards.len()
62+
&& cards[i].model == cards[first].model
63+
&& cards[i].mem_size_kib == cards[first].mem_size_kib
64+
{
6265
i += 1;
6366
}
6467
let memsize = if cards[first].mem_size_kib > 0 {
@@ -75,9 +78,46 @@ fn do_show_system(
7578
for c in &cards {
7679
total_mem_by += c.mem_size_kib * 1024;
7780
}
78-
(gpu_desc, gpu_cards, total_mem_by / GIB as i64)
81+
82+
// Compute the info blobs
83+
let mut gpu_info = "".to_string();
84+
for c in &cards {
85+
if !gpu_info.is_empty() {
86+
gpu_info += ","
87+
}
88+
let gpu::Card {
89+
bus_addr,
90+
index,
91+
model,
92+
arch,
93+
driver,
94+
firmware,
95+
uuid,
96+
mem_size_kib,
97+
power_limit_watt,
98+
max_power_limit_watt,
99+
min_power_limit_watt,
100+
max_ce_clock_mhz,
101+
max_mem_clock_mhz,
102+
} = c;
103+
let bus_addr = util::json_quote(bus_addr);
104+
let model = util::json_quote(model);
105+
let arch = util::json_quote(arch);
106+
let driver = util::json_quote(driver);
107+
let firmware = util::json_quote(firmware);
108+
gpu_info += &format!(
109+
r###"
110+
{{"bus_addr":"{bus_addr}", "index":{index}, "uuid":"{uuid}",
111+
"model":"{model}", "arch":"{arch}", "driver":"{driver}", "firmware":"{firmware}",
112+
"mem_size_kib":{mem_size_kib},
113+
"power_limit_watt":{power_limit_watt}, "max_power_limit_watt":{max_power_limit_watt}, "min_power_limit_watt":{min_power_limit_watt},
114+
"max_ce_clock_mhz":{max_ce_clock_mhz}, "max_mem_clock_mhz":{max_mem_clock_mhz}}}"###
115+
);
116+
}
117+
118+
(gpu_desc, gpu_cards, total_mem_by / GIB as i64, gpu_info)
79119
} else {
80-
("".to_string(), 0, 0)
120+
("".to_string(), 0, 0, "".to_string())
81121
};
82122
let timestamp = util::json_quote(timestamp);
83123
let hostname = util::json_quote(&hostname);
@@ -89,15 +129,18 @@ fn do_show_system(
89129
// Note the field names here are used by decoders that are developed separately, and they should
90130
// be considered set in stone.
91131

132+
let version = util::json_quote(env!("CARGO_PKG_VERSION"));
92133
let s = format!(
93134
r#"{{
135+
"version": "{version}",
94136
"timestamp": "{timestamp}",
95137
"hostname": "{hostname}",
96138
"description": "{description}",
97139
"cpu_cores": {cpu_cores},
98140
"mem_gb": {mem_gib},
99141
"gpu_cards": {gpu_cards},
100-
"gpumem_gb": {gpumem_gb}
142+
"gpumem_gb": {gpumem_gb},
143+
"gpu_info": [{gpu_info}]
101144
}}
102145
"#
103146
);

0 commit comments

Comments
 (0)