Skip to content

Commit

Permalink
enter the gpu destroyer
Browse files Browse the repository at this point in the history
  • Loading branch information
d3v-null committed Jan 11, 2024
1 parent 6100a73 commit 2de4d8b
Show file tree
Hide file tree
Showing 5 changed files with 271 additions and 28 deletions.
9 changes: 7 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ authors = [
"Christopher H. Jordan <christopherjordan87@gmail.com>",
"Jack L. B. Line <jack.line@curtin.edu.au>",
"Marcin Sokolowski <marcin.sokolowski@curtin.edu.au>",
"Dev Null <dev.null@curtin.edu.au>",
]
edition = "2021"
rust-version = "1.64"
Expand All @@ -14,6 +15,7 @@ description = "Primary beam code for the Murchison Widefield Array (MWA) radio t
repository = "https://github.com/MWATelescope/mwa_hyperbeam"
homepage = "https://github.com/MWATelescope/mwa_hyperbeam"
exclude = [".github/*", "fee_pols.pdf"]
autotests = true

# Make a rust library, as well as static and C-compatible dynamic libraries
# available as "libmwa_hyperbeam.a" and "libmwa_hyperbeam.so".
Expand All @@ -36,8 +38,8 @@ hip = ["hip-sys", "cc"]
gpu-single = []

[profile.release]
lto = "thin"
codegen-units = 1 # Set this to 1 in Cargo.toml to allow for maximum size reduction optimizations
# lto = "thin"
# codegen-units = 1 # Set this to 1 in Cargo.toml to allow for maximum size reduction optimizations

[dependencies]
cfg-if = "1.0.0"
Expand Down Expand Up @@ -100,6 +102,9 @@ required-features = ["cuda"]
name = "analytic_hip"
required-features = ["hip"]

[[example]]
name = "gpu_destroyer"

[patch.crates-io]
# marlu = { path = "../Marlu" }
# marlu = { git = "https://github.com/MWATelescope/Marlu", branch = "DUT1" }
27 changes: 25 additions & 2 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,14 +215,30 @@ mod gpu {
#[cfg(feature = "cuda-static")]
println!("cargo:rustc-link-lib=static=cudart_static");

match env::var("DEBUG").as_deref() {
Ok("false") => (),
_ => {cuda_target.flag("-G");},
};

cuda_target
};

#[cfg(feature = "hip")]
let mut gpu_target = {
const DEFAULT_HIP_ARCHES: &[&str] = &["gfx90a"];

let hip_path = hip_sys::hiprt::get_hip_path();
println!("cargo:rerun-if-env-changed=HIP_PATH");
let hip_path = match env::var_os("HIP_PATH") {
Some(p) => {
println!("cargo:warning=HIP_PATH set from env {}", p.to_string_lossy());
std::path::PathBuf::from(p)
}
None => {
let hip_path = hip_sys::hiprt::get_hip_path();
println!("cargo:warning=HIP_PATH set from hip_sys {}", hip_path.display());
hip_path
},
};
if !hip_path.exists() {
panic!("Couldn't find HIP path at {}", hip_path.display());
}
Expand All @@ -243,13 +259,15 @@ mod gpu {
let mut hip_target = cc::Build::new();
hip_target
.compiler(compiler)
.include(hip_path.join("include/hip"))
// .include(hip_path.join("include/hip"))
.include("src/gpu_common/")
.file("src/fee/gpu/fee.cu")
.file("src/analytic/gpu/analytic.cu");

hip_target.flag("-O0"); // <- hip can't handle optimizations

println!("cargo:rerun-if-env-changed=ROCM_VER");
println!("cargo:rerun-if-env-changed=ROCM_PATH");
println!("cargo:rerun-if-env-changed=HYPERBEAM_HIP_ARCH");
println!("cargo:rerun-if-env-changed=HYPERDRIVE_HIP_ARCH");
let arches: Vec<String> = match (
Expand All @@ -276,6 +294,11 @@ mod gpu {
hip_target.flag(&format!("--offload-arch={arch}"));
}

match env::var("DEBUG").as_deref() {
Ok("false") => (),
_ => hip_target.flag("-ggdb"),
};

hip_target
};

Expand Down
170 changes: 170 additions & 0 deletions examples/gpu_destroyer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

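//! Example code using hyperbeam's GPU code with Rust. This example is a stress
//! test: many threads repeatedly prepare a GPU beam and compute Jones matrices
//! on the GPU, and each result is checked against values computed on the CPU.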
//!
//! Build and run with something like:
//! `cargo run --release --features=hip --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5`
//! `cargo run --release --features=hip,gpu-single --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5`
//!
//! If the "gpu-single" feature is given, then single-precision floats are used
//! on the GPU. This trades precision for speed. The speed gain is considerable if
//! using a desktop GPU.
//!
//! If you want to use hyperbeam in your own Rust crate, then check out the latest
//! version on crates.io:
//!
//! https://crates.io/crates/mwa_hyperbeam
use std::f64::consts::{FRAC_PI_2, PI};

use mwa_hyperbeam::{fee::FEEBeam, AzEl, GpuFloat, Jones};
use ndarray::prelude::*;
use rayon::prelude::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut args = std::env::args().skip(1);
let num_directions: usize = args
.next()
.expect("number of directions supplied")
.parse()
.expect("number of directions is a number");

println!(
"GPU float precision is {} bits",
std::mem::size_of::<GpuFloat>() * 8
);

let beam_file = args.next();
// If we were given a file, use it. Otherwise, fall back on MWA_BEAM_FILE.
let beam = match beam_file {
Some(f) => FEEBeam::new(f)?,
None => FEEBeam::new_from_env()?,
};

// Set up our "GPU beam".
let num_freqs = 1;
let freqs_hz = (0..num_freqs).map(|i| (150_000 + 1000 * i) as u32).collect::<Vec<_>>();
let latitude_rad = Some(-0.4660608448386394); // MWA
let iau_order = true;

// Delays and amps correspond to dipoles in the "M&C order". See
// https://wiki.mwatelescope.org/pages/viewpage.action?pageId=48005139) for
// more info. Here, each row of the 2D array corresponds to a tile. In this
// example, all delays and amps are the same, but they are allowed to vary
// between tiles.
let num_tiles = 1;
let delays = Array2::zeros((num_tiles, 16));
let amps = Array2::ones((num_tiles, 16));
let norm_to_zenith = true;

// Set up the directions to test. The type depends on the GPU precision.
let (az, za): (Vec<_>, Vec<_>) = (0..num_directions)
.map(|i| {
(
0.4 + 0.3 * PI * (i as f64 / num_directions as f64),
0.3 + 0.4 * FRAC_PI_2 * (i as f64 / num_directions as f64),
)
})
.unzip();

for ((az, za), i) in az.iter().zip(za.iter()).zip(0..) {
println!("i {:3} az={:.3} za={:.3}", i, az, za);
}

// compute on CPU for comparison.
let mut cpu_jones =
Array3::from_elem((delays.dim().0, freqs_hz.len(), az.len()), Jones::default());

for ((mut out, delays), amps) in cpu_jones
.outer_iter_mut()
.zip(delays.outer_iter())
.zip(amps.outer_iter())
{
for (mut out, &freq) in out.outer_iter_mut().zip(freqs_hz.iter()) {
let cpu_results = beam
.calc_jones_array_pair(
&az,
&za,
freq,
delays.as_slice().unwrap(),
amps.as_slice().unwrap(),
norm_to_zenith,
latitude_rad,
iau_order,
)
.unwrap();

// Demote the CPU results if we have to.
#[cfg(feature = "gpu-single")]
let cpu_results: Vec<Jones<f32>> = cpu_results.into_iter().map(|j| j.into()).collect();

out.assign(&Array1::from(cpu_results));
}
}

// let gpu_az = az.iter().map(|&a| a as GpuFloat).collect::<Vec<_>>();
// let gpu_za = za.iter().map(|&a| a as GpuFloat).collect::<Vec<_>>();
let azels = az.iter().zip(za.iter()).map(|(&az, &za)| AzEl { az, el: FRAC_PI_2 - za }).collect::<Vec<_>>();

let num_attempts = 9999;
(0..num_attempts).into_par_iter().for_each(|i| {
let gpu_beam =
unsafe { beam.gpu_prepare(freqs_hz.as_slice(), delays.view(), amps.view(), norm_to_zenith).expect("beam.gpu_prepare") };

// Call hyperbeam GPU code.
let gpu_jones = gpu_beam.calc_jones(azels.as_slice(), latitude_rad, iau_order).expect("gpu_beam.calc_jones");

assert_eq!(gpu_jones.dim(), cpu_jones.dim());

// Compare the differences with the CPU-generated Jones matrices
#[cfg(not(feature = "gpu-single"))]
let mut min_norm = [f64::MAX; 4];
#[cfg(feature = "gpu-single")]
let mut min_norm = [f32::MAX; 4];
for (((&cpu, &gpu), az), za) in cpu_jones
.iter()
.zip(gpu_jones.iter())
.zip(az.iter())
.zip(za.iter())
{
let norm = (cpu - gpu).norm_sqr();
#[cfg(not(feature = "gpu-single"))]
let norm_sum: f64 = norm.iter().sum();
#[cfg(feature = "gpu-single")]
let norm_sum: f32 = norm.iter().sum();

if norm_sum < min_norm.iter().sum() {
min_norm = norm;
}
#[cfg(not(feature = "gpu-single"))]
if norm_sum < 1e-12_f64 { continue }
#[cfg(feature = "gpu-single")]
if norm_sum < 1e-6_f32 { continue }

panic!("attempt {i} failed az={az:.3} za={za:.3} norm={norm:?}")
}

println!("attempt {i} passed, min_norm={min_norm:?}");
});

Ok(())
}

/*
DEBUG=1 cargo build --example gpu_destroyer --features=hip,hdf5-static --profile dev
RAYON_NUM_THREADS=3 target/debug/examples/gpu_destroyer 9999
cat > rocgdbinit <<EOF
set auto-load safe-path /
dir $PYTHONPATH
set amdgpu precise-memory on
set breakpoint pending on
set disassemble-next-line on
break examples/gpu_destroyer.rs:133
run
info reg
thread apply all backtrace
EOF
RAYON_NUM_THREADS=3 rocgdb -x rocgdbinit --args target/debug/examples/gpu_destroyer 9999
*/
26 changes: 19 additions & 7 deletions src/fee/ffi/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,12 @@ fn test_calc_jones_gpu_via_ffi() {
let delays = array![[3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]];
let amps =
array![[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]];
let (az, za): (Vec<_>, Vec<_>) = (0..1025)
let n_dirs = std::env::var("N_DIRS")
.unwrap_or_else(|_| "1025".to_string()) // 192 passes, 193 fails on rocm-5.7.1
.parse::<usize>()
.unwrap();
assert!(n_dirs < 26904);
let (az, za): (Vec<_>, Vec<_>) = (0..n_dirs)
.map(|i| {
(
0.45 + i as GpuFloat / 10000.0,
Expand Down Expand Up @@ -350,7 +355,7 @@ fn test_calc_jones_gpu_via_ffi() {
// Compare with CPU results.
let mut jones_cpu = Array3::zeros((delays.dim().0, freqs.len(), az.len()));
// Maybe need to regenerate the directions, depending on the GPU precision.
let (az, za): (Vec<_>, Vec<_>) = (0..1025)
let (az, za): (Vec<_>, Vec<_>) = (0..n_dirs)
.map(|i| (0.45 + i as f64 / 10000.0, 0.45 + i as f64 / 10000.0))
.unzip();
for ((mut out, delays), amps) in jones_cpu
Expand Down Expand Up @@ -387,12 +392,19 @@ fn test_calc_jones_gpu_via_ffi() {
free_fee_beam(beam);
}

#[cfg(not(feature = "gpu-single"))]
assert_abs_diff_eq!(jones_gpu, jones_cpu, epsilon = 1e-15);

#[cfg(feature = "gpu-single")]
// The errors are heavily dependent on the directions.
assert_abs_diff_eq!(jones_gpu, jones_cpu, epsilon = 1e-6);
for ((gpu, cpu), az) in jones_gpu
.iter()
.zip(jones_cpu.iter())
.zip(az.iter())
{
#[cfg(not(feature = "gpu-single"))]
assert!(abs_diff_eq!(gpu, cpu, epsilon = 1e-15), "az: {az} cpu: {cpu} gpu: {gpu}");

#[cfg(feature = "gpu-single")]
assert!(abs_diff_eq!(gpu, cpu, epsilon = 1e-6), "az: {az} cpu: {cpu} gpu: {gpu}");
}

}

// Tests to expose errors follow.
Expand Down
Loading

0 comments on commit 2de4d8b

Please sign in to comment.