Skip to content

Commit

Permalink
Fix #201 - report per-gpu per-sample statistics, akin to 'load'
Browse files Browse the repository at this point in the history
  • Loading branch information
Lars T Hansen committed Nov 15, 2024
1 parent 9d623ce commit 634f427
Show file tree
Hide file tree
Showing 7 changed files with 325 additions and 63 deletions.
32 changes: 14 additions & 18 deletions src/amd.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,7 @@
/// Get info about AMD graphics cards by parsing the output of rocm-smi.
///
/// This is pretty hacky! Something better than this is likely needed and hopefully possible.
///
/// The returned information is keyed by (device, pid) so that if a process uses multiple devices,
/// the total utilization for the process must be summed across devices. We do this to be
/// compatible with the NVIDIA module (nvidia.rs).
///
/// There is no information here about absolute memory usage numbers. The cards I have don't
/// support getting that information. Other cards might. In that case, the --showmemusage switch
/// (can be combined with --showgpupids in a single invocation) might be useful.
///
/// Even though the output is presented in the same format as for NVIDIA, we only have partial stats
/// about the usage of various processes on the various devices. We divide the utilization of a
/// device by the number of processes on the device. This is approximate.
// Get info about AMD graphics cards by parsing the output of rocm-smi.
//
// This is pretty hacky! Something better than this is likely needed and hopefully possible.

use crate::command::{self, CmdError};
use crate::gpu;
use crate::ps::UserTable;
Expand All @@ -35,17 +24,24 @@ pub fn probe() -> Option<Box<dyn gpu::GPU>> {
}

impl gpu::GPU for AmdGPU {
fn get_manufacturer(&self) -> String {
fn get_manufacturer(&mut self) -> String {
"AMD".to_string()
}

fn get_configuration(&self) -> Result<Vec<gpu::Card>, String> {
fn get_card_configuration(&mut self) -> Result<Vec<gpu::Card>, String> {
get_amd_configuration()
}

fn get_utilization(&self, user_by_pid: &UserTable) -> Result<Vec<gpu::Process>, String> {
fn get_process_utilization(
&mut self,
user_by_pid: &UserTable,
) -> Result<Vec<gpu::Process>, String> {
get_amd_utilization(user_by_pid)
}

fn get_card_utilization(&mut self) -> Result<Vec<gpu::CardState>, String> {
Ok(vec![])
}
}

// On all nodes we've looked at (ML systems, Lumi), /sys/module/amdgpu exists iff there are AMD
Expand Down
51 changes: 45 additions & 6 deletions src/gpu.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use crate::amd;
use crate::nvidia;
use crate::ps::UserTable;
use crate::ps;

#[derive(PartialEq)]
// Per-sample process information, across cards

#[derive(PartialEq, Default, Clone)]
pub struct Process {
pub device: Option<usize>, // Device ID
pub pid: usize, // Process ID
Expand All @@ -14,9 +16,13 @@ pub struct Process {
pub command: String, // The command, _unknown_ for zombies, _noinfo_ if not known
}

// Used to tag a Process entry when the uid can't be determined

pub const ZOMBIE_UID: usize = 666666;

#[derive(PartialEq, Default)]
// Sample-invariant card information

#[derive(PartialEq, Default, Clone)]
pub struct Card {
pub bus_addr: String,
pub index: i32, // Card index (changes at boot)
Expand All @@ -33,12 +39,45 @@ pub struct Card {
pub max_mem_clock_mhz: i32,
}

// Per-sample card information, across processes

// One card's state as observed at a single sample.  Values of this type are returned in a
// Vec by GPU::get_card_utilization().
//
// Units are encoded in the field-name suffixes (_pct, _kib, _c, _watt, _mhz).  The struct
// derives Default, so every field has a zero/empty default value.
//
// NOTE(review): how an unavailable reading is represented (default value vs something else)
// is not visible in this definition - confirm against the code that fills it in.
#[derive(PartialEq, Default, Clone)]
pub struct CardState {
pub index: i32, // Stable card identifier
pub fan_speed_pct: f32, // Percent, per the _pct suffix
pub compute_mode: String, // Free-form string; the set of possible values is not defined here
pub perf_state: String, // Free-form string; the set of possible values is not defined here
pub mem_reserved_kib: i64, // KiB, per the _kib suffix
pub mem_used_kib: i64, // KiB, per the _kib suffix
pub gpu_utilization_pct: f32, // Percent, per the _pct suffix
pub mem_utilization_pct: f32, // Percent, per the _pct suffix
pub temp_c: i32, // Presumably degrees Celsius (_c suffix) - TODO confirm
pub power_watt: i32, // Watts, per the _watt suffix
pub power_limit_watt: i32, // Watts, per the _watt suffix
pub ce_clock_mhz: i32, // MHz; "ce" presumably means compute engine - TODO confirm
pub mem_clock_mhz: i32, // MHz, per the _mhz suffix
}

// Abstract GPU information across GPU types.
//
// As get_manufacturer() is for the GPU object as a whole and not per-card, we are currently
// assuming that nodes don't have cards from multiple manufacturers.
//
// get_card_configuration() and get_card_utilization() return vectors that are sorted by their index
// fields, and indices shall be tightly packed.

pub trait GPU {
fn get_manufacturer(&self) -> String;
fn get_configuration(&self) -> Result<Vec<Card>, String>;
fn get_utilization(&self, user_by_pid: &UserTable) -> Result<Vec<Process>, String>;
fn get_manufacturer(&mut self) -> String;
fn get_card_configuration(&mut self) -> Result<Vec<Card>, String>;
fn get_process_utilization(
&mut self,
user_by_pid: &ps::UserTable,
) -> Result<Vec<Process>, String>;
fn get_card_utilization(&mut self) -> Result<Vec<CardState>, String>;
}

// Probe the system for GPUs.

pub fn probe() -> Option<Box<dyn GPU>> {
if let Some(nvidia) = nvidia::probe() {
Some(nvidia)
Expand Down
Loading

0 comments on commit 634f427

Please sign in to comment.