Skip to content

Commit a9534f4

Browse files
committed
enter the gpu destroyer
1 parent 6100a73 commit a9534f4

File tree

5 files changed

+346
-27
lines changed

5 files changed

+346
-27
lines changed

Cargo.toml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ authors = [
55
"Christopher H. Jordan <christopherjordan87@gmail.com>",
66
"Jack L. B. Line <jack.line@curtin.edu.au>",
77
"Marcin Sokolowski <marcin.sokolowski@curtin.edu.au>",
8+
"Dev Null <dev.null@curtin.edu.au>",
89
]
910
edition = "2021"
1011
rust-version = "1.64"
@@ -14,6 +15,7 @@ description = "Primary beam code for the Murchison Widefield Array (MWA) radio t
1415
repository = "https://github.com/MWATelescope/mwa_hyperbeam"
1516
homepage = "https://github.com/MWATelescope/mwa_hyperbeam"
1617
exclude = [".github/*", "fee_pols.pdf"]
18+
autotests = true
1719

1820
# Make a rust library, as well as static and C-compatible dynamic libraries
1921
# available as "libmwa_hyperbeam.a" and "libmwa_hyperbeam.so".
@@ -36,8 +38,8 @@ hip = ["hip-sys", "cc"]
3638
gpu-single = []
3739

3840
[profile.release]
39-
lto = "thin"
40-
codegen-units = 1 # Set this to 1 in Cargo.toml to allow for maximum size reduction optimizations
41+
# lto = "thin"
42+
# codegen-units = 1 # Set this to 1 in Cargo.toml to allow for maximum size reduction optimizations
4143

4244
[dependencies]
4345
cfg-if = "1.0.0"
@@ -100,6 +102,9 @@ required-features = ["cuda"]
100102
name = "analytic_hip"
101103
required-features = ["hip"]
102104

105+
[[example]]
106+
name = "gpu_destroyer"
107+
103108
[patch.crates-io]
104109
# marlu = { path = "../Marlu" }
105110
# marlu = { git = "https://github.com/MWATelescope/Marlu", branch = "DUT1" }

build.rs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,14 +215,30 @@ mod gpu {
215215
#[cfg(feature = "cuda-static")]
216216
println!("cargo:rustc-link-lib=static=cudart_static");
217217

218+
match env::var("DEBUG").as_deref() {
219+
Ok("false") => (),
220+
_ => {cuda_target.flag("-G");},
221+
};
222+
218223
cuda_target
219224
};
220225

221226
#[cfg(feature = "hip")]
222227
let mut gpu_target = {
223228
const DEFAULT_HIP_ARCHES: &[&str] = &["gfx90a"];
224229

225-
let hip_path = hip_sys::hiprt::get_hip_path();
230+
println!("cargo:rerun-if-env-changed=HIP_PATH");
231+
let hip_path = match env::var_os("HIP_PATH") {
232+
Some(p) => {
233+
println!("cargo:warning=HIP_PATH set from env {}", p.to_string_lossy());
234+
std::path::PathBuf::from(p)
235+
}
236+
None => {
237+
let hip_path = hip_sys::hiprt::get_hip_path();
238+
println!("cargo:warning=HIP_PATH set from hip_sys {}", hip_path.display());
239+
hip_path
240+
},
241+
};
226242
if !hip_path.exists() {
227243
panic!("Couldn't find HIP path at {}", hip_path.display());
228244
}
@@ -248,8 +264,19 @@ mod gpu {
248264
.file("src/fee/gpu/fee.cu")
249265
.file("src/analytic/gpu/analytic.cu");
250266

267+
println!("cargo:rerun-if-env-changed=HIP_FLAGS");
268+
match env::var_os("HIP_FLAGS") {
269+
Some(p) => {
270+
println!("cargo:warning=HIP_FLAGS set from env {}", p.to_string_lossy());
271+
hip_target.flag(&p.to_string_lossy());
272+
}
273+
None => (),
274+
}
275+
251276
hip_target.flag("-O0"); // <- hip can't handle optimizations
252277

278+
println!("cargo:rerun-if-env-changed=ROCM_VER");
279+
println!("cargo:rerun-if-env-changed=ROCM_PATH");
253280
println!("cargo:rerun-if-env-changed=HYPERBEAM_HIP_ARCH");
254281
println!("cargo:rerun-if-env-changed=HYPERDRIVE_HIP_ARCH");
255282
let arches: Vec<String> = match (
@@ -276,6 +303,11 @@ mod gpu {
276303
hip_target.flag(&format!("--offload-arch={arch}"));
277304
}
278305

306+
match env::var("DEBUG").as_deref() {
307+
Ok("false") => (),
308+
_ => {hip_target.flag("-ggdb");},
309+
};
310+
279311
hip_target
280312
};
281313

examples/gpu_destroyer.rs

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4+
5+
//! Example code using hyperbeam's GPU code with Rust.
6+
//!
7+
//! Build and run with something like:
8+
//! `cargo run --release --features=hip --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5`
9+
//! `cargo run --release --features=hip,gpu-single --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5`
10+
//!
11+
//! If the "gpu-single" feature is given, then single-precision floats are used
12+
//! on the GPU. This trades precision for speed. The speed gain is considerable if
13+
//! using a desktop GPU.
14+
//!
15+
//! If you want to use hyperbeam in your own Rust crate, then check out the latest
16+
//! version on crates.io:
17+
//!
18+
//! https://crates.io/crates/mwa_hyperbeam
19+
20+
use std::f64::consts::{FRAC_PI_2, PI};
21+
22+
use mwa_hyperbeam::{fee::FEEBeam, AzEl, GpuFloat, Jones};
23+
use ndarray::prelude::*;
24+
use rayon::prelude::*;
25+
26+
// #[inline(never)]
27+
pub fn paniq_() -> () {}
28+
29+
fn main() -> Result<(), Box<dyn std::error::Error>> {
30+
let mut args = std::env::args().skip(1);
31+
let num_directions: usize = args
32+
.next()
33+
.expect("number of directions supplied")
34+
.parse()
35+
.expect("number of directions is a number");
36+
37+
println!(
38+
"GPU float precision is {} bits",
39+
std::mem::size_of::<GpuFloat>() * 8
40+
);
41+
42+
let beam_file = args.next();
43+
// If we were given a file, use it. Otherwise, fall back on MWA_BEAM_FILE.
44+
let beam = match beam_file {
45+
Some(f) => FEEBeam::new(f)?,
46+
None => FEEBeam::new_from_env()?,
47+
};
48+
49+
// Set up our "GPU beam".
50+
let num_freqs = 1;
51+
let freqs_hz = (0..num_freqs).map(|i| (150_000 + 1000 * i) as u32).collect::<Vec<_>>();
52+
let latitude_rad = Some(-0.4660608448386394); // MWA
53+
let iau_order = true;
54+
55+
// Delays and amps correspond to dipoles in the "M&C order". See
56+
// https://wiki.mwatelescope.org/pages/viewpage.action?pageId=48005139) for
57+
// more info. Here, each row of the 2D array corresponds to a tile. In this
58+
// example, all delays and amps are the same, but they are allowed to vary
59+
// between tiles.
60+
let num_tiles = 1;
61+
let delays = Array2::zeros((num_tiles, 16));
62+
let amps = Array2::ones((num_tiles, 16));
63+
let norm_to_zenith = true;
64+
65+
// Set up the directions to test. The type depends on the GPU precision.
66+
let (az, za): (Vec<_>, Vec<_>) = (0..num_directions)
67+
.map(|i| {
68+
(
69+
0.4 + 0.3 * PI * (i as f64 / num_directions as f64),
70+
0.3 + 0.4 * FRAC_PI_2 * (i as f64 / num_directions as f64),
71+
)
72+
})
73+
.unzip();
74+
75+
// for ((az, za), i) in az.iter().zip(za.iter()).zip(0..) {
76+
// println!("i {:3} az={:.3} za={:.3}", i, az, za);
77+
// }
78+
79+
// compute on CPU for comparison.
80+
let mut cpu_jones =
81+
Array3::from_elem((delays.dim().0, freqs_hz.len(), az.len()), Jones::default());
82+
83+
for ((mut out, delays), amps) in cpu_jones
84+
.outer_iter_mut()
85+
.zip(delays.outer_iter())
86+
.zip(amps.outer_iter())
87+
{
88+
for (mut out, &freq) in out.outer_iter_mut().zip(freqs_hz.iter()) {
89+
let cpu_results = beam
90+
.calc_jones_array_pair(
91+
&az,
92+
&za,
93+
freq,
94+
delays.as_slice().unwrap(),
95+
amps.as_slice().unwrap(),
96+
norm_to_zenith,
97+
latitude_rad,
98+
iau_order,
99+
)
100+
.unwrap();
101+
102+
// Demote the CPU results if we have to.
103+
#[cfg(feature = "gpu-single")]
104+
let cpu_results: Vec<Jones<f32>> = cpu_results.into_iter().map(|j| j.into()).collect();
105+
106+
out.assign(&Array1::from(cpu_results));
107+
}
108+
}
109+
110+
// let gpu_az = az.iter().map(|&a| a as GpuFloat).collect::<Vec<_>>();
111+
// let gpu_za = za.iter().map(|&a| a as GpuFloat).collect::<Vec<_>>();
112+
let azels: Vec<_> = az.iter().zip(za.iter()).map(|(&az, &za)| AzEl { az, el: FRAC_PI_2 - za }).collect();
113+
114+
let num_attempts = 9999;
115+
(0..num_attempts).into_par_iter().for_each(|i| {
116+
let gpu_beam =
117+
unsafe { beam.gpu_prepare(freqs_hz.as_slice(), delays.view(), amps.view(), norm_to_zenith).expect("beam.gpu_prepare") };
118+
119+
// Call hyperbeam GPU code.
120+
let gpu_jones = gpu_beam.calc_jones(azels.as_slice(), latitude_rad, iau_order).expect("gpu_beam.calc_jones");
121+
122+
// assert_eq!(gpu_jones.dim(), cpu_jones.dim());
123+
124+
// Compare the differences with the CPU-generated Jones matrices
125+
let mut pass = true;
126+
for (&cpu, &gpu) in cpu_jones
127+
.iter()
128+
.zip(gpu_jones.iter())
129+
{
130+
let norm = (cpu - gpu).norm_sqr();
131+
// #[cfg(feature = "gpu-single")]
132+
// if norm.iter().sum::<f32>() > 1e-6_f32 { pass = false; break }
133+
// #[cfg(not(feature = "gpu-single"))]
134+
if norm.iter().sum::<f64>() > 1e-12_f64 { pass = false; paniq_(); break; }
135+
}
136+
let (min_norm, max_norm) = cpu_jones.iter().zip(gpu_jones.iter()).map(|(&cpu, &gpu)|
137+
(cpu - gpu).norm_sqr()
138+
).fold((f64::MAX, f64::MIN),
139+
|(min, max), norm|
140+
// #[cfg(feature = "gpu-single")]
141+
// {let s:f32=norm.iter().sum(); (min.min(s), max.max(s))}
142+
// #[cfg(not(feature = "gpu-single"))]
143+
{let s:f64=norm.iter().sum(); (min.min(s), max.max(s))}
144+
);
145+
if pass {
146+
eprintln!("attempt {:4} passed, min_norm={:?} max_norm={:?}", i, min_norm, max_norm);
147+
} else {
148+
eprintln!("attempt {:4} failed, min_norm={:?} max_norm={:?}", i, min_norm, max_norm);
149+
}
150+
151+
});
152+
153+
Ok(())
154+
}
155+
156+
/*
157+
DEBUG=1 cargo build --example gpu_destroyer --features=hip,hdf5-static --profile dev
158+
RAYON_NUM_THREADS=3 target/debug/examples/gpu_destroyer 9999
159+
cat > rocgdbinit <<EOF
160+
set auto-load safe-path /
161+
dir $PYTHONPATH
162+
set amdgpu precise-memory on
163+
set breakpoint pending on
164+
set disassemble-next-line on
165+
166+
break gpu_destroyer::paniq_
167+
commands
168+
up
169+
info locals
170+
thread apply all backtrace 5
171+
quit 1
172+
end
173+
break mwa_hyperbeam::fee::FEEBeam::gpu_prepare
174+
commands
175+
info args
176+
continue
177+
end
178+
break mwa_hyperbeam::fee::gpu::FEEBeamGpu::calc_jones
179+
commands
180+
info args
181+
continue
182+
end
183+
184+
run
185+
EOF
186+
# info reg
187+
# thread apply all backtrace
188+
# info threads
189+
# quit 1
190+
# EOF
191+
export MWA_BEAM_FILE="mwa_full_embedded_element_pattern.h5"
192+
[ -f $MWA_BEAM_FILE ] || wget -O "$MWA_BEAM_FILE" $'http://ws.mwatelescope.org/static/'$MWA_BEAM_FILE
193+
export ROCM_VER=5.4.6 RUST_VER=stable
194+
for ROCM_VER in system 5.4.6 5.5.3 5.6.1 5.7.0 5.7.1; do
195+
load_rocm $ROCM_VER
196+
cargo +$RUST_VER build --features=hdf5-static,hip --example gpu_destroyer && \
197+
RAYON_NUM_THREADS=2 rocgdb -x rocgdbinit --args target/debug/examples/gpu_destroyer 1 \
198+
| tee gpu_destroyer-$ROCM_VER.log 2>&1
199+
done
200+
201+
202+
load_rocm $ROCM_VER
203+
cargo +$RUST_VER build --features=hdf5-static,hip --example gpu_destroyer
204+
RAYON_NUM_THREADS=3 target/debug/examples/gpu_destroyer 9999
205+
RAYON_NUM_THREADS=4 rocgdb -x rocgdbinit --args target/debug/examples/gpu_destroyer 9999
206+
*/
207+
208+
/*
209+
attempt 676 passed, min_norm=[0.0, 0.0, 0.0, 0.0] [17/1978]
210+
attempt 2524 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
211+
attempt 343 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
212+
thread '<unnamed>' panicked at examples/gpu_destroyer.rs:146:13:
213+
attempt 1341 failed az=0.400 za=0.300 norm=[1.7982786046614562e-6, 0.0022572476886556668, 0.0023788106642785314, 9.57502419389
214+
7969e-8]
215+
attempt 5026 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
216+
attempt 3769 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
217+
attempt 677 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
218+
attempt 5027 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
219+
attempt 1346 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
220+
attempt 2525 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
221+
attempt 344 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
222+
attempt 3770 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
223+
attempt 678 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
224+
thread '<unnamed>' panicked at examples/gpu_destroyer.rs:146:13:
225+
attempt 345 failed az=0.400 za=0.300 norm=[1.7982786046614562e-6, 0.0022572476886556668, 0.0023788106642785314, 9.575024193897
226+
969e-8]
227+
attempt 1347 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
228+
attempt 3771 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
229+
attempt 2526 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
230+
attempt 1348 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
231+
attempt 5028 passed, min_norm=[1.4095292761800317e-18, 4.554040319507578e-16, 4.808375260085222e-16, 2.1365382960150389e-19]
232+
attempt 679 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
233+
attempt 346 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
234+
attempt 2527 passed, min_norm=[0.0, 0.0, 0.0, 0.0]
235+
236+
*/
237+

src/fee/ffi/tests.rs

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,12 @@ fn test_calc_jones_gpu_via_ffi() {
295295
let delays = array![[3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]];
296296
let amps =
297297
array![[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]];
298-
let (az, za): (Vec<_>, Vec<_>) = (0..1025)
298+
let n_dirs = std::env::var("N_DIRS")
299+
.unwrap_or_else(|_| "1025".to_string()) // 192 passes, 193 fails on rocm-5.7.1
300+
.parse::<usize>()
301+
.unwrap();
302+
assert!(n_dirs < 26904);
303+
let (az, za): (Vec<_>, Vec<_>) = (0..n_dirs)
299304
.map(|i| {
300305
(
301306
0.45 + i as GpuFloat / 10000.0,
@@ -350,7 +355,7 @@ fn test_calc_jones_gpu_via_ffi() {
350355
// Compare with CPU results.
351356
let mut jones_cpu = Array3::zeros((delays.dim().0, freqs.len(), az.len()));
352357
// Maybe need to regenerate the directions, depending on the GPU precision.
353-
let (az, za): (Vec<_>, Vec<_>) = (0..1025)
358+
let (az, za): (Vec<_>, Vec<_>) = (0..n_dirs)
354359
.map(|i| (0.45 + i as f64 / 10000.0, 0.45 + i as f64 / 10000.0))
355360
.unzip();
356361
for ((mut out, delays), amps) in jones_cpu
@@ -387,12 +392,19 @@ fn test_calc_jones_gpu_via_ffi() {
387392
free_fee_beam(beam);
388393
}
389394

390-
#[cfg(not(feature = "gpu-single"))]
391-
assert_abs_diff_eq!(jones_gpu, jones_cpu, epsilon = 1e-15);
392-
393-
#[cfg(feature = "gpu-single")]
394395
// The errors are heavily dependent on the directions.
395-
assert_abs_diff_eq!(jones_gpu, jones_cpu, epsilon = 1e-6);
396+
for ((gpu, cpu), az) in jones_gpu
397+
.iter()
398+
.zip(jones_cpu.iter())
399+
.zip(az.iter())
400+
{
401+
#[cfg(not(feature = "gpu-single"))]
402+
assert!(abs_diff_eq!(gpu, cpu, epsilon = 1e-15), "az: {az} cpu: {cpu} gpu: {gpu}");
403+
404+
#[cfg(feature = "gpu-single")]
405+
assert!(abs_diff_eq!(gpu, cpu, epsilon = 1e-6), "az: {az} cpu: {cpu} gpu: {gpu}");
406+
}
407+
396408
}
397409

398410
// Tests to expose errors follow.

0 commit comments

Comments
 (0)