From 5c655a3fec596a2e63b07dd4860c09d6b02a3f9a Mon Sep 17 00:00:00 2001 From: Lars T Hansen Date: Thu, 20 Jun 2024 10:23:54 +0200 Subject: [PATCH 1/4] Fix #178 - print system load data --- .zenodo.json | 3 +- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 31 +++++++++++++++-- src/procfs.rs | 68 ++++++++++++++++++++++++++++++------- src/ps.rs | 93 ++++++++++++++++++++++++++++++++++++++++++++++----- 6 files changed, 171 insertions(+), 28 deletions(-) diff --git a/.zenodo.json b/.zenodo.json index d955a28..3b14c0b 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -9,6 +9,7 @@ "name": "Bockwoldt, Mathias" }, { + "affiliation": "University of Oslo", "name": "Hansen, Lars T." }, { @@ -25,5 +26,5 @@ "id": "GPL-3.0" }, "title": "sonar: Tool to profile usage of HPC resources by regularly probing processes using ps.", - "version": "0.11.0" + "version": "0.12.0" } diff --git a/Cargo.lock b/Cargo.lock index 46d10a9..ee17e31 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,7 +10,7 @@ checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "sonar" -version = "0.11.1" +version = "0.12.0" dependencies = [ "libc", "subprocess", diff --git a/Cargo.toml b/Cargo.toml index e665c79..1fc94e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sonar" -version = "0.11.1" +version = "0.12.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/README.md b/README.md index 48a551d..6132fb5 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,6 @@ Usage: sonar Commands: ps Take a snapshot of the currently running processes sysinfo Extract system information - analyze Not yet implemented help Print this message or the help of the given subcommand(s) Options: @@ -50,6 +49,11 @@ affect the output nevertheless, ie, most changes not covered by changes to the m These rules are new with v0.8.0. +### Changes in v0.12.x + +**System load data introduced**. Added the `load` field which is printed with one of the records +per sonar invocation. (v0.12.0) + ### Changes in v0.11.x **Better `ps` data**. More data points. (v0.11.0) @@ -179,6 +183,27 @@ v=0.7.0,time=2023-08-10T11:09:41+02:00,host=somehost,cores=8,user=someone,job=0, v=0.7.0,time=2023-08-10T11:09:41+02:00,host=somehost,cores=8,user=someone,job=0,cmd=slack,cpu%=3.9,cpukib=716924,gpus=none,gpu%=0,gpumem%=0,gpukib=0,cputime_sec=266 ``` +### Version 0.12.0 `ps` output format + +Version 0.12.0 adds one field: + +`load` (optional, default blank): This is an encoding of the per-cpu time usage in seconds on the +node since boot. It is the same for all records and is therefore printed only with one of them per +sonar invocation. The encoding is an array of N+1 u64 values for an N-cpu node. The first value is +the "base" value, it is to be added to all the subsequent values. The remaining are per-cpu values +in order from cpu0 through cpuN-1. Each value is encoded little-endian base-45, with the initial +character of each value chosen from a different set than the subsequent characters. The character +sets are: + +``` +INITIAL = "(){}[]<>+-abcdefghijklmnopqrstuvwxyz!@#$%^&*_" +SUBSEQUENT = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ~|';:.?/`" +``` + +The base-45 digits of the value `897` are (in little-endian order) 42 and 19, and the encoding of +this value is thus `&J`. As the initial character is from a different character set, no explicit +separator is needed in the array - the initial digit acts as a separator. + ### Version 0.11.0 `ps` output format Version 0.11.0 adds one field: @@ -366,9 +391,9 @@ Numeric fields that are zero may or may not be omitted by the producer. Note the v0.9.0 `sysinfo` output does not carry a version number. -## Collect results with `sonar analyze` :construction: +## Collect and analyze results -The `analyze` command is work in progress. Sonar data are used by two other tools: +Sonar data are used by two other tools: * [JobGraph](https://github.com/NordicHPC/jobgraph) provides high-level plots of system activity. Mapping files for JobGraph can be found in the [data](data) folder. diff --git a/src/procfs.rs b/src/procfs.rs index cfc56b0..01f372d 100644 --- a/src/procfs.rs +++ b/src/procfs.rs @@ -103,18 +103,58 @@ fn i32_field(l: &str) -> Result { pub fn get_process_information( fs: &dyn procfsapi::ProcfsAPI, memtotal_kib: usize, -) -> Result, String> { - // The boot time is the `btime` field of /proc/stat. It is measured in seconds since epoch. We - // need this to compute the process's real time, which we need to compute ps-compatible cpu - // utilization. +) -> Result<(HashMap, u64, Vec), String> { + // We need this for a lot of things. On x86 and x64 this is always 100 but in principle it + // might be something else, so read the true value. + + let ticks_per_sec = fs.clock_ticks_per_sec() as u64; + if ticks_per_sec == 0 { + return Err("Could not get a sensible CLK_TCK".to_string()); + } + + // Extract system data from /proc/stat. https://man7.org/linux/man-pages/man5/procfs.5.html. + // + // The per-CPU usage is the sum of some fields of the `cpuN` lines. These are in ticks since + // boot. In addition there is an across-the-system line called simply `cpu` with the same + // format. These data are useful for analyzing core bindings. + // + // The boot time is first field of the `btime` line of /proc/stat. It is measured in seconds + // since epoch. We need this to compute the process's real time, which we need to compute + // ps-compatible cpu utilization. let mut boot_time = 0; + let mut cpu_total_secs = 0; + let mut per_cpu_secs = vec![]; let stat_s = fs.read_to_string("stat")?; for l in stat_s.split('\n') { - if l.starts_with("btime ") { + if l.starts_with("cpu") { + // Based on sysstat sources, the "nice" time is not included in the "user" time. (But + // guest times, which we ignore here, are included in their overall times.) And + // irq/softirq numbers can be a substantial fraction of "system" time. So sum user, + // nice, sys, irq, and softirq as a sensible proxy for time spent on "work" on the CPU. + const STAT_FIELDS: [usize; 5] = [1, 2, 3, 6, 7]; + + let fields = l.split_ascii_whitespace().collect::>(); + let mut sum = 0; + for i in STAT_FIELDS { + sum += parse_usize_field(&fields, i, l, "stat", 0, "cpu")? as u64; + } + if l.starts_with("cpu ") { + cpu_total_secs = sum / ticks_per_sec; + } else { + let cpu_no = + match fields[0][3..].parse::() { + Ok(x) => x, + Err(_) => { continue } // Too harsh to error out + }; + if per_cpu_secs.len() < cpu_no + 1 { + per_cpu_secs.resize(cpu_no + 1, 0u64); + } + per_cpu_secs[cpu_no] = sum / ticks_per_sec; + } + } else if l.starts_with("btime ") { let fields = l.split_ascii_whitespace().collect::>(); boot_time = parse_usize_field(&fields, 1, l, "stat", 0, "btime")? as u64; - break; } } if boot_time == 0 { @@ -138,10 +178,7 @@ pub fn get_process_information( let mut result = HashMap::::new(); let mut ppids = HashSet::::new(); let mut user_table = UserTable::new(); - let clock_ticks_per_sec = fs.clock_ticks_per_sec() as f64; - if clock_ticks_per_sec == 0.0 { - return Err("Could not get a sensible CLK_TCK".to_string()); - } + let clock_ticks_per_sec = ticks_per_sec as f64; for (pid, uid) in pids { // Basic system variables. Intermediate time values are represented in ticks to prevent @@ -375,7 +412,7 @@ pub fn get_process_information( p.has_children = ppids.contains(&p.pid); } - Ok(result) + Ok((result, cpu_total_secs, per_cpu_secs)) } // The UserTable optimizes uid -> name lookup. @@ -518,7 +555,7 @@ DirectMap1G: 11534336 kB let fs = procfsapi::MockFS::new(files, pids, users, now); let memtotal_kib = get_memtotal_kib(&fs).unwrap(); - let mut info = get_process_information(&fs, memtotal_kib).unwrap(); + let (mut info, total_secs, per_cpu_secs) = get_process_information(&fs, memtotal_kib).unwrap(); assert!(info.len() == 1); let mut xs = info.drain(); let p = xs.next().expect("Something").1; @@ -542,6 +579,11 @@ DirectMap1G: 11534336 kB assert!(p.mem_size_kib == size); assert!(p.rssanon_kib == rssanon); + + assert!(total_secs == (241155 + 582 + 127006 + 0 + 3816) / 100); // "cpu " line of "stat" data + assert!(per_cpu_secs.len() == 8); + assert!(per_cpu_secs[0] == (32528 + 189 + 19573 + 0 + 1149) / 100); // "cpu0 " line of "stat" data + assert!(per_cpu_secs[7] == (27582 + 61 + 12558 + 0 + 426) / 100); // "cpu7 " line of "stat" data } #[test] @@ -585,7 +627,7 @@ pub fn procfs_dead_and_undead_test() { let fs = procfsapi::MockFS::new(files, pids, users, procfsapi::unix_now()); let memtotal_kib = get_memtotal_kib(&fs).unwrap(); - let mut info = get_process_information(&fs, memtotal_kib).unwrap(); + let (mut info, _, _) = get_process_information(&fs, memtotal_kib).unwrap(); // 4020 should be dropped - it's dead assert!(info.len() == 2); diff --git a/src/ps.rs b/src/ps.rs index 26482e2..f1539be 100644 --- a/src/ps.rs +++ b/src/ps.rs @@ -285,13 +285,14 @@ fn do_create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timesta } }; - let procinfo_output = match procfs::get_process_information(&fs, memtotal_kib) { - Ok(result) => result, - Err(msg) => { - log::error(&format!("procfs failed: {msg}")); - return; - } - }; + let (procinfo_output, _cpu_total_secs, per_cpu_secs) = + match procfs::get_process_information(&fs, memtotal_kib) { + Ok(result) => result, + Err(msg) => { + log::error(&format!("procfs failed: {msg}")); + return; + } + }; let pprocinfo_output = &procinfo_output; @@ -514,7 +515,16 @@ fn do_create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timesta let mut did_print = false; for c in candidates { - match print_record(&mut writer, &print_params, &c) { + match print_record( + &mut writer, + &print_params, + &c, + if !did_print { + Some(&per_cpu_secs) + } else { + None + }, + ) { Ok(did_print_one) => did_print = did_print_one || did_print, Err(_) => { // Discard the error: there's nothing very sensible we can do at this point if the @@ -550,7 +560,16 @@ fn do_create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timesta gpu_status: GpuStatus::Ok, }; // Discard the error, see above. - let _ = print_record(&mut writer, &print_params, &synth); + let _ = print_record( + &mut writer, + &print_params, + &synth, + if !did_print { + Some(&per_cpu_secs) + } else { + None + }, + ); } // Discard the error code, see above. @@ -627,6 +646,7 @@ fn print_record( writer: &mut dyn io::Write, params: &PrintParameters, proc_info: &ProcInfo, + per_cpu_secs: Option<&[u64]>, ) -> Result { // Mandatory fields. @@ -699,6 +719,11 @@ fn print_record( if proc_info.rolledup > 0 { fields.push(format!("rolledup={}", proc_info.rolledup)); } + if let Some(cpu_secs) = per_cpu_secs { + if cpu_secs.len() > 0 { + fields.push(format!("load={}", encode_cpu_secs_base45el(cpu_secs))) + } + } let mut s = "".to_string(); for f in fields { @@ -713,3 +738,53 @@ fn print_record( Ok(true) } + +// Encode a nonempty u64 array compactly. +// +// The values to be represented are always cpu seconds of active time since boot, one item per cpu. +// The output must be ASCII text (32 <= c < 128), ideally without ',' or '"' or '\' or ' ' to not +// make it difficult for the various output formats we use. +// +// We have many encodings to choose from, see https://github.com/NordicHPC/sonar/issues/178. +// +// The encoding here first finds the minimum input value and subtracts that from all entries. The +// minimum value, and all the entries, are then emitted as unsigned little-endian base-45 with the +// initial digit chosen from a different character set to indicate that it is initial. The +// algorithm is simple and reasonably fast for our data volumes. + +fn encode_cpu_secs_base45el(cpu_secs: &[u64]) -> String { + let base = *cpu_secs + .iter() + .reduce(std::cmp::min) + .expect("Non-empty"); + let mut s = encode_u64_base45el(base); + for x in cpu_secs { + s += encode_u64_base45el(*x - base).as_str(); + } + return s; +} + +// The only character unused by the encoding, other than the ones we're not allowed to use, is '='. +const BASE: u64 = 45; +const INITIAL: &[u8] = "(){}[]<>+-abcdefghijklmnopqrstuvwxyz!@#$%^&*_".as_bytes(); +const SUBSEQUENT: &[u8] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ~|';:.?/`".as_bytes(); + +fn encode_u64_base45el(mut x: u64) -> String { + let mut s = String::from(INITIAL[(x % BASE) as usize] as char); + x /= BASE; + while x > 0 { + s.push(SUBSEQUENT[(x % BASE) as usize] as char); + x /= BASE; + } + return s; +} + +#[test] +pub fn test_encoding() { + assert!(INITIAL.len() == BASE as usize); + assert!(SUBSEQUENT.len() == BASE as usize); + // This should be *1, *0, *29, *43, 1, *11 with * denoting an INITIAL char. + let v = vec![1, 30, 89, 12]; + println!("{}", encode_cpu_secs_base45el(&v)); + assert!(encode_cpu_secs_base45el(&v) == ")(t*1b"); +} From 00702d9181488d724d336f50e15f98263f754bfd Mon Sep 17 00:00:00 2001 From: Lars T Hansen Date: Mon, 24 Jun 2024 08:51:06 +0200 Subject: [PATCH 2/4] Print load= only when --load argument present --- src/main.rs | 10 ++++++++++ src/ps.rs | 9 ++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 452d4f2..3ae3601 100644 --- a/src/main.rs +++ b/src/main.rs @@ -51,6 +51,10 @@ enum Commands { /// Create a per-host lockfile in this directory and exit early if the file exists on startup [default: none] lockdir: Option, + + /// One output record per Sonar invocation will contain a load= field with an encoding of + /// the per-cpu usage since boot. + load: bool, }, /// Extract system information Sysinfo {}, @@ -76,6 +80,7 @@ fn main() { exclude_users, exclude_commands, lockdir, + load, } => { let opts = ps::PsOptions { rollup: *rollup, @@ -84,6 +89,7 @@ fn main() { min_mem_percent: *min_mem_percent, min_cpu_time: *min_cpu_time, exclude_system_jobs: *exclude_system_jobs, + load: *load, exclude_users: if let Some(s) = exclude_users { s.split(',').collect::>() } else { @@ -131,6 +137,7 @@ fn command_line() -> Commands { let mut exclude_users = None; let mut exclude_commands = None; let mut lockdir = None; + let mut load = false; while next < args.len() { let arg = args[next].as_ref(); next += 1; @@ -138,6 +145,8 @@ fn command_line() -> Commands { (next, batchless) = (new_next, true); } else if let Some(new_next) = bool_arg(arg, &args, next, "--rollup") { (next, rollup) = (new_next, true); + } else if let Some(new_next) = bool_arg(arg, &args, next, "--load") { + (next, load) = (new_next, true); } else if let Some(new_next) = bool_arg(arg, &args, next, "--exclude-system-jobs") { @@ -192,6 +201,7 @@ fn command_line() -> Commands { exclude_users, exclude_commands, lockdir, + load, } } "sysinfo" => Commands::Sysinfo {}, diff --git a/src/ps.rs b/src/ps.rs index f1539be..8246ee4 100644 --- a/src/ps.rs +++ b/src/ps.rs @@ -173,6 +173,7 @@ pub struct PsOptions<'a> { pub exclude_users: Vec<&'a str>, pub exclude_commands: Vec<&'a str>, pub lockdir: Option, + pub load: bool, } pub fn create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timestamp: &str) { @@ -719,9 +720,11 @@ fn print_record( if proc_info.rolledup > 0 { fields.push(format!("rolledup={}", proc_info.rolledup)); } - if let Some(cpu_secs) = per_cpu_secs { - if cpu_secs.len() > 0 { - fields.push(format!("load={}", encode_cpu_secs_base45el(cpu_secs))) + if params.opts.load { + if let Some(cpu_secs) = per_cpu_secs { + if cpu_secs.len() > 0 { + fields.push(format!("load={}", encode_cpu_secs_base45el(cpu_secs))) + } } } From 5ac0cbe625ca05ab4ca8d85f533b39d7cf34943c Mon Sep 17 00:00:00 2001 From: Lars T Hansen Date: Mon, 24 Jun 2024 09:48:03 +0200 Subject: [PATCH 3/4] Improve a comment --- src/ps.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/ps.rs b/src/ps.rs index 8246ee4..51103d6 100644 --- a/src/ps.rs +++ b/src/ps.rs @@ -744,16 +744,19 @@ fn print_record( // Encode a nonempty u64 array compactly. // -// The values to be represented are always cpu seconds of active time since boot, one item per cpu. // The output must be ASCII text (32 <= c < 128), ideally without ',' or '"' or '\' or ' ' to not -// make it difficult for the various output formats we use. +// make it difficult for the various output formats we use. Also avoid DEL, because it is a weird +// control character. // // We have many encodings to choose from, see https://github.com/NordicHPC/sonar/issues/178. // -// The encoding here first finds the minimum input value and subtracts that from all entries. The +// The values to be represented are always cpu seconds of active time since boot, one item per cpu, +// and it is assumed that they are roughly in the vicinity of each other (the largest is rarely more +// than 4x the smallest, say). The assumption does not affect correctness, only compactness. +// +// The encoding first finds the minimum input value and subtracts that from all entries. The // minimum value, and all the entries, are then emitted as unsigned little-endian base-45 with the -// initial digit chosen from a different character set to indicate that it is initial. The -// algorithm is simple and reasonably fast for our data volumes. +// initial digit chosen from a different character set to indicate that it is initial. fn encode_cpu_secs_base45el(cpu_secs: &[u64]) -> String { let base = *cpu_secs From 9972eaf3b8a9e058a65c30e67d54091450196b17 Mon Sep 17 00:00:00 2001 From: Lars T Hansen Date: Mon, 24 Jun 2024 12:39:51 +0200 Subject: [PATCH 4/4] Add integration test --- tests/load.sh | 18 ++++++++++++++++++ tests/run_tests.sh | 1 + 2 files changed, 19 insertions(+) create mode 100755 tests/load.sh diff --git a/tests/load.sh b/tests/load.sh new file mode 100755 index 0000000..cffd26d --- /dev/null +++ b/tests/load.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# +# Test that we can emit per-cpu load data properly. + +set -e + +( cd .. ; cargo build ) +loadlines=$(../target/debug/sonar ps --load | grep ',load=' | wc -l) +if [[ $loadlines -ne 1 ]]; then + echo "Did not emit load data properly - not exactly 1" + exit 1 +fi + +loadlines=$(../target/debug/sonar ps | grep ',load=' | wc -l) +if [[ $loadlines -ne 0 ]]; then + echo "Did not emit load data properly - not exactly 0" + exit 1 +fi diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 26c1bf6..45a02e6 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -20,6 +20,7 @@ for test in command-line \ exclude-users \ hostname \ interrupt \ + load \ lockfile \ min-cpu-time \ ps-syntax \