Skip to content

Commit 0c618d0

Browse files
author
Lars T Hansen
committed
Fix #44 - check for presence of GPUs before running probes
1 parent 3a503b1 commit 0c618d0

File tree

2 files changed

+34
-5
lines changed

2 files changed

+34
-5
lines changed

src/amd.rs

+19-4
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,18 @@ use crate::ps::UserTable;
1919
use crate::TIMEOUT_SECONDS;
2020

2121
use std::cmp::Ordering;
22+
use std::path::Path;
2223

2324
#[cfg(test)]
2425
use crate::util::map;
2526

27+
// On all nodes we've looked at (ML systems, Lumi), /sys/module/amdgpu exists iff there are AMD
28+
// accelerators present.
29+
30+
// Returns true when the path /sys/module/amdgpu exists, false otherwise.
//
// `Path::exists` also yields false when the path cannot be inspected (e.g. a permission error),
// so any such failure is reported as "not present" rather than as an error.
fn amd_present() -> bool {
    Path::new("/sys/module/amdgpu").exists()
}
33+
2634
// We only have one machine with AMD GPUs at UiO and rocm-smi is unable to show eg how much memory
2735
// is installed on each card on this machine, so this is pretty limited. But we are at least able
2836
// to extract gross information about the installed cards.
@@ -40,6 +48,9 @@ use crate::util::map;
4048
// too small. This is presumably all driver dependent.)
4149

4250
pub fn get_amd_configuration() -> Option<Vec<gpu::Card>> {
51+
if !amd_present() {
52+
return None
53+
}
4354
match command::safe_command("rocm-smi", &["--showproductname"], TIMEOUT_SECONDS) {
4455
Ok(raw_text) => {
4556
let mut cards = vec![];
@@ -65,12 +76,16 @@ pub fn get_amd_configuration() -> Option<Vec<gpu::Card>> {
6576
}
6677
}
6778

68-
/// Get information about AMD cards.
69-
///
70-
/// Err(e) really means the command started running but failed, for the reason given. If the
71-
/// command could not be found, we return Ok(vec![]).
79+
// Get information about AMD cards.
80+
//
81+
// Err(e) really means the command started running but failed, for the reason given. If the
82+
// command could not be found or no card is present, we return Ok(vec![]).
7283

7384
pub fn get_amd_information(user_by_pid: &UserTable) -> Result<Vec<gpu::Process>, String> {
85+
if !amd_present() {
86+
return Ok(vec![])
87+
}
88+
7489
// I've not been able to combine the two invocations of rocm-smi yet; we have to run the command
7590
// twice. Not a happy situation.
7691

src/nvidia.rs

+15-1
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,17 @@ use crate::ps::UserTable;
1212
use crate::util;
1313
use crate::TIMEOUT_SECONDS;
1414

15+
use std::path::Path;
1516
#[cfg(test)]
1617
use crate::util::map;
1718

19+
// On all nodes we've looked at (Fox, Betzy, ML systems), /sys/module/nvidia exists iff there are
20+
// NVIDIA accelerators present.
21+
22+
// Returns true when the path /sys/module/nvidia exists, false otherwise.
//
// `Path::exists` also yields false when the path cannot be inspected (e.g. a permission error),
// so any such failure is reported as "not present" rather than as an error.
fn nvidia_present() -> bool {
    Path::new("/sys/module/nvidia").exists()
}
25+
1826
// `nvidia-smi -a` dumps a lot of information about all the cards in a semi-structured form,
1927
// each line a textual keyword/value pair.
2028
//
@@ -24,6 +32,9 @@ use crate::util::map;
2432
// Parsing all the output lines in order yields the information about all the cards.
2533

2634
pub fn get_nvidia_configuration() -> Option<Vec<gpu::Card>> {
35+
if !nvidia_present() {
36+
return None
37+
}
2738
match command::safe_command("nvidia-smi", &["-a"], TIMEOUT_SECONDS) {
2839
Ok(raw_text) => {
2940
let mut cards = vec![];
@@ -74,9 +85,12 @@ pub fn get_nvidia_configuration() -> Option<Vec<gpu::Card>> {
7485
}
7586

7687
// Err(e) really means the command started running but failed, for the reason given. If the
77-
// command could not be found, we return Ok(vec![]).
88+
// command could not be found or no card is present, we return Ok(vec![]).
7889

7990
pub fn get_nvidia_information(user_by_pid: &UserTable) -> Result<Vec<gpu::Process>, String> {
91+
if !nvidia_present() {
92+
return Ok(vec![])
93+
}
8094
match command::safe_command(NVIDIA_PMON_COMMAND, NVIDIA_PMON_ARGS, TIMEOUT_SECONDS) {
8195
Ok(pmon_raw_text) => {
8296
let mut processes = parse_pmon_output(&pmon_raw_text, user_by_pid)?;

0 commit comments

Comments
 (0)