@@ -19,10 +19,18 @@ use crate::ps::UserTable;
19
19
use crate :: TIMEOUT_SECONDS ;
20
20
21
21
use std:: cmp:: Ordering ;
22
+ use std:: path:: Path ;
22
23
23
24
#[ cfg( test) ]
24
25
use crate :: util:: map;
25
26
27
// On all nodes we've looked at (ML systems, Lumi), /sys/module/amdgpu exists iff there are AMD
// accelerators present.
//
// Returns true when that path exists.  Note that `Path::exists` also yields false when the path
// cannot be inspected (eg a permission error), so a failed probe is treated the same as "no AMD
// accelerators present".

fn amd_present() -> bool {
    Path::new("/sys/module/amdgpu").exists()
}
33
+
26
34
// We only have one machine with AMD GPUs at UiO and rocm-smi is unable to show eg how much memory
27
35
// is installed on each card on this machine, so this is pretty limited. But we are at least able
28
36
// to extract gross information about the installed cards.
@@ -40,6 +48,9 @@ use crate::util::map;
40
48
// too small. This is presumably all driver dependent.)
41
49
42
50
pub fn get_amd_configuration ( ) -> Option < Vec < gpu:: Card > > {
51
+ if !amd_present ( ) {
52
+ return None
53
+ }
43
54
match command:: safe_command ( "rocm-smi" , & [ "--showproductname" ] , TIMEOUT_SECONDS ) {
44
55
Ok ( raw_text) => {
45
56
let mut cards = vec ! [ ] ;
@@ -65,12 +76,16 @@ pub fn get_amd_configuration() -> Option<Vec<gpu::Card>> {
65
76
}
66
77
}
67
78
68
- /// Get information about AMD cards.
69
- ///
70
- /// Err(e) really means the command started running but failed, for the reason given. If the
71
- /// command could not be found, we return Ok(vec![]).
79
+ // Get information about AMD cards.
80
+ //
81
+ // Err(e) really means the command started running but failed, for the reason given. If the
82
+ // command could not be found or no card is present, we return Ok(vec![]).
72
83
73
84
pub fn get_amd_information ( user_by_pid : & UserTable ) -> Result < Vec < gpu:: Process > , String > {
85
+ if !amd_present ( ) {
86
+ return Ok ( vec ! [ ] )
87
+ }
88
+
74
89
// I've not been able to combine the two invocations of rocm-smi yet; we have to run the command
75
90
// twice. Not a happy situation.
76
91
0 commit comments