@@ -12,84 +12,191 @@ use crate::ps::UserTable;
12
12
use crate :: util;
13
13
use crate :: TIMEOUT_SECONDS ;
14
14
15
- use std:: path:: Path ;
16
15
#[ cfg( test) ]
17
16
use crate :: util:: map;
17
+ use std:: path:: Path ;
18
18
19
19
/// Report whether NVIDIA accelerators appear to be present on this node.
///
/// On all nodes we've looked at (Fox, Betzy, ML systems), /sys/module/nvidia exists iff there are
/// nvidia accelerators present, so the existence of that path is used as the test.
fn nvidia_present() -> bool {
    Path::new("/sys/module/nvidia").exists()
}
25
25
26
- // `nvidia-smi -a` dumps a lot of information about all the cards in a semi-structured form,
27
- // each line a textual keyword/value pair.
26
+ // `nvidia-smi -a` (aka `nvidia-smi -q`) dumps a lot of information about all the cards in a
27
+ // semi-structured form. It is fairly slow, the reason being it also obtains information about
28
+ // running processes. But if we only run this function for sysinfo that's OK.
29
+ //
30
+ // In brief, the input is a set of lines with a preamble followed by zero or more cards.
31
+ // Indentation indicates nesting of sections and subsections. Everything ends implicitly; if an
32
+ // indent-4 line is encountered inside a section then that ends the section, if an indent-0 line is
33
+ // encountered inside a section or card then that ends the card.
34
+ //
35
+ // Against that background, these regexes matching full lines describe a state machine:
36
+ //
37
+ // - a line matching /^CUDA Version\s*:\s*(.*)$/ registers the common CUDA version
38
+ // - a line matching /^Driver Version\s*:\s*(.*)$/ registers the common driver version
39
+ // - a line matching /^GPU (.*)/ starts a new card, the card is named by $1.
40
+ // - a line matching /^\s{4}(${name})\s*:\s*(.*)$/ names a keyword-value pair not in a section
41
+ // where $1 is the keyword and $2 is the value; ${name} is /[A-Z][^:]*/
42
+ // - a line matching /^\s{4}(${name})$/ is the start of a top-level section
43
+ // - a line matching /^\s{8}(${name})\s*:\s*(.*)$/ names a keyword-value pair in a section,
44
+ // where $1 is the keyword and $2 is the value
45
+ // - a line matching /^\s+(.*)$/ but not any of the above is either a subsubsection value,
46
+ // a subsubsection start, or other gunk we don't care about
47
+ // - a blank line or eof marks the end of the card
28
48
//
29
- // "Product Name" names the card. Following the string "FB Memory Usage", "Total" has the
30
- // memory of the card.
49
+ // To avoid building a lexer/parser or playing with regexes we can match against the entire line or
50
+ // the beginning of line, within a context. Note the use of "==" rather than "starts_with" to enter
51
+ // into subsections is deliberate, as several subsections may start with the same word ("Clocks").
31
52
//
32
- // Parsing all the output lines in order yields the information about all the cards.
53
+ // It looks like nvidia-smi enumerates cards in a consistent order by increasing bus address, so
54
+ // just take that to be the card index. (In contrast, the Minor Number does not always follow that
55
+ // order.)
33
56
34
57
pub fn get_nvidia_configuration ( ) -> Option < Vec < gpu:: Card > > {
35
58
if !nvidia_present ( ) {
36
- return None
59
+ return None ;
37
60
}
61
+
38
62
match command:: safe_command ( "nvidia-smi" , & [ "-a" ] , TIMEOUT_SECONDS ) {
39
63
Ok ( raw_text) => {
64
+ enum State {
65
+ Preamble ,
66
+ InCard ,
67
+ FbMemoryUsage ,
68
+ GpuPowerReadings ,
69
+ MaxClocks ,
70
+ }
71
+ let mut cuda = "" . to_string ( ) ;
72
+ let mut driver = "" . to_string ( ) ;
73
+ let mut state = State :: Preamble ;
40
74
let mut cards = vec ! [ ] ;
41
- let mut looking_for_total = false ;
42
- let mut model_name = None ;
43
- for l in raw_text. lines ( ) {
44
- // The regular expressions that trigger state transitions are really these:
45
- //
46
- // /^\s*Product Name\s*:\s*(.*)$/
47
- // /^\s*FB Memory Usage\s*$/
48
- // /^\s*Total\s*:\s*(\d+)\s*MiB\s*$/
49
- //
50
- // but we simplify a bit and use primitive string manipulation.
51
- let l = l. trim ( ) ;
52
- if looking_for_total {
53
- if l. starts_with ( "Total" ) && l. ends_with ( "MiB" ) {
54
- if let Some ( ( _, after) ) = l. split_once ( ':' ) {
55
- let rest = after. strip_suffix ( "MiB" ) . expect ( "Suffix checked" ) . trim ( ) ;
56
- if let Ok ( n) = rest. parse :: < i64 > ( ) {
57
- if let Some ( m) = model_name {
58
- cards. push ( gpu:: Card {
59
- model : m,
60
- mem_size_kib : n * 1024 ,
61
- } ) ;
62
- model_name = None ;
75
+ let mut card: gpu:: Card = Default :: default ( ) ;
76
+ ' next_line: for l in raw_text. lines ( ) {
77
+ ' reprocess_line: loop {
78
+ match state {
79
+ State :: Preamble => {
80
+ if l. starts_with ( "CUDA Version" ) {
81
+ cuda = field_value ( l) ;
82
+ } else if l. starts_with ( "Driver Version" ) {
83
+ driver = field_value ( l) ;
84
+ } else if l. starts_with ( "GPU " ) {
85
+ if !card. bus_addr . is_empty ( ) {
86
+ cards. push ( card) ;
63
87
}
88
+ card = Default :: default ( ) ;
89
+ card. bus_addr = l[ 4 ..] . to_string ( ) ;
90
+ card. driver = driver. clone ( ) ;
91
+ card. firmware = cuda. clone ( ) ;
92
+ card. index = cards. len ( ) as i32 ;
93
+ state = State :: InCard ;
64
94
}
95
+ continue ' next_line;
65
96
}
66
- }
67
- } else {
68
- if l. starts_with ( "Product Name" ) {
69
- if let Some ( ( _, rest) ) = l. split_once ( ':' ) {
70
- model_name = Some ( rest. trim ( ) . to_string ( ) ) ;
71
- continue ;
97
+ State :: InCard => {
98
+ if !l. starts_with ( " " ) {
99
+ state = State :: Preamble ;
100
+ continue ' reprocess_line;
101
+ }
102
+ if l. starts_with ( " Product Name" ) {
103
+ card. model = field_value ( l) ;
104
+ } else if l. starts_with ( " Product Architecture" ) {
105
+ card. arch = field_value ( l) ;
106
+ } else if l. starts_with ( " GPU UUID" ) {
107
+ card. uuid = field_value ( l) ;
108
+ } else if l == " FB Memory Usage" {
109
+ state = State :: FbMemoryUsage ;
110
+ } else if l == " GPU Power Readings" {
111
+ state = State :: GpuPowerReadings ;
112
+ } else if l == " Max Clocks" {
113
+ state = State :: MaxClocks ;
114
+ }
115
+ continue ' next_line;
116
+ }
117
+ State :: FbMemoryUsage => {
118
+ if !l. starts_with ( " " ) {
119
+ state = State :: InCard ;
120
+ continue ' reprocess_line;
121
+ }
122
+ if l. starts_with ( " Total" ) {
123
+ if let Ok ( n) = field_value_stripped ( l, "MiB" ) . parse :: < i64 > ( ) {
124
+ card. mem_size_kib = n * 1024 ;
125
+ }
126
+ }
127
+ continue ' next_line;
128
+ }
129
+ State :: GpuPowerReadings => {
130
+ if !l. starts_with ( " " ) {
131
+ state = State :: InCard ;
132
+ continue ' reprocess_line;
133
+ }
134
+ if l. starts_with ( " Current Power Limit" ) {
135
+ if let Ok ( n) = field_value_stripped ( l, "W" ) . parse :: < f64 > ( ) {
136
+ card. power_limit_watt = n. ceil ( ) as i32 ;
137
+ }
138
+ } else if l. starts_with ( " Min Power Limit" ) {
139
+ if let Ok ( n) = field_value_stripped ( l, "W" ) . parse :: < f64 > ( ) {
140
+ card. min_power_limit_watt = n. ceil ( ) as i32 ;
141
+ }
142
+ } else if l. starts_with ( " Max Power Limit" ) {
143
+ if let Ok ( n) = field_value_stripped ( l, "W" ) . parse :: < f64 > ( ) {
144
+ card. max_power_limit_watt = n. ceil ( ) as i32 ;
145
+ }
146
+ }
147
+ continue ' next_line;
148
+ }
149
+ State :: MaxClocks => {
150
+ if !l. starts_with ( " " ) {
151
+ state = State :: InCard ;
152
+ continue ' reprocess_line;
153
+ }
154
+ if l. starts_with ( " SM" ) {
155
+ if let Ok ( n) = field_value_stripped ( l, "MHz" ) . parse :: < i32 > ( ) {
156
+ card. max_ce_clock_mhz = n;
157
+ }
158
+ } else if l. starts_with ( " Memory" ) {
159
+ if let Ok ( n) = field_value_stripped ( l, "MHz" ) . parse :: < i32 > ( ) {
160
+ card. max_mem_clock_mhz = n;
161
+ }
162
+ }
163
+ continue ' next_line;
72
164
}
73
- }
74
- if l. starts_with ( "FB Memory Usage" ) {
75
- looking_for_total = true ;
76
- continue ;
77
165
}
78
166
}
79
- looking_for_total = false ;
167
+ }
168
+ if !card. bus_addr . is_empty ( ) {
169
+ cards. push ( card) ;
80
170
}
81
171
Some ( cards)
82
172
}
83
173
Err ( _) => None ,
84
174
}
85
175
}
86
176
177
/// Return the value part of a `keyword : value` line, with surrounding whitespace removed.
///
/// The line is split at its first `:`; anything after that, including further colons, is the
/// value.  A line without a `:` yields the empty string.
fn field_value(l: &str) -> String {
    l.split_once(':')
        .map(|(_, rest)| rest.trim().to_string())
        .unwrap_or_default()
}
184
+
185
/// Return the value part of a `keyword : value suffix` line with `suffix` and surrounding
/// whitespace removed, e.g. `"Total : 81920 MiB"` with suffix `"MiB"` yields `"81920"`.
///
/// The line is split at its first `:`.  If there is no `:` or the value does not end with
/// `suffix`, the empty string is returned.
fn field_value_stripped(l: &str, suffix: &str) -> String {
    l.split_once(':')
        // Trim before stripping so that trailing whitespace after the suffix does not prevent
        // the suffix from matching.
        .and_then(|(_, rest)| rest.trim().strip_suffix(suffix))
        .map(|value| value.trim().to_string())
        .unwrap_or_default()
}
193
+
87
194
// Err(e) really means the command started running but failed, for the reason given. If the
88
195
// command could not be found or no card is present, we return Ok(vec![]).
89
196
90
197
pub fn get_nvidia_information ( user_by_pid : & UserTable ) -> Result < Vec < gpu:: Process > , String > {
91
198
if !nvidia_present ( ) {
92
- return Ok ( vec ! [ ] )
199
+ return Ok ( vec ! [ ] ) ;
93
200
}
94
201
match command:: safe_command ( NVIDIA_PMON_COMMAND , NVIDIA_PMON_ARGS , TIMEOUT_SECONDS ) {
95
202
Ok ( pmon_raw_text) => {
0 commit comments