|
| 1 | +// This Source Code Form is subject to the terms of the Mozilla Public |
| 2 | +// License, v. 2.0. If a copy of the MPL was not distributed with this |
| 3 | +// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 4 | + |
| 5 | +//! Example code using hyperbeam's GPU code with Rust. |
| 6 | +//! |
| 7 | +//! Build and run with something like: |
| 8 | +//! `cargo run --release --features=hip --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5` |
| 9 | +//! `cargo run --release --features=hip,gpu-single --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5` |
| 10 | +//! |
| 11 | +//! If the "gpu-single" feature is given, then single-precision floats are used |
| 12 | +//! on the GPU. This trades precision for speed. The speed gain is considerable if |
| 13 | +//! using a desktop GPU. |
| 14 | +//! |
| 15 | +//! If you want to use hyperbeam in your own Rust crate, then check out the latest |
| 16 | +//! version on crates.io: |
| 17 | +//! |
| 18 | +//! https://crates.io/crates/mwa_hyperbeam |
| 19 | +
|
| 20 | +use std::f64::consts::{FRAC_PI_2, PI}; |
| 21 | + |
| 22 | +use mwa_hyperbeam::{fee::FEEBeam, AzEl, GpuFloat, Jones}; |
| 23 | +use ndarray::prelude::*; |
| 24 | +use rayon::prelude::*; |
| 25 | + |
| 26 | +// #[inline(never)] |
| 27 | +pub fn paniq_() -> () {} |
| 28 | + |
| 29 | +fn main() -> Result<(), Box<dyn std::error::Error>> { |
| 30 | + let mut args = std::env::args().skip(1); |
| 31 | + let num_directions: usize = args |
| 32 | + .next() |
| 33 | + .expect("number of directions supplied") |
| 34 | + .parse() |
| 35 | + .expect("number of directions is a number"); |
| 36 | + |
| 37 | + println!( |
| 38 | + "GPU float precision is {} bits", |
| 39 | + std::mem::size_of::<GpuFloat>() * 8 |
| 40 | + ); |
| 41 | + |
| 42 | + let beam_file = args.next(); |
| 43 | + // If we were given a file, use it. Otherwise, fall back on MWA_BEAM_FILE. |
| 44 | + let beam = match beam_file { |
| 45 | + Some(f) => FEEBeam::new(f)?, |
| 46 | + None => FEEBeam::new_from_env()?, |
| 47 | + }; |
| 48 | + |
| 49 | + // Set up our "GPU beam". |
| 50 | + let num_freqs = 1; |
| 51 | + let freqs_hz = (0..num_freqs).map(|i| (150_000 + 1000 * i) as u32).collect::<Vec<_>>(); |
| 52 | + let latitude_rad = Some(-0.4660608448386394); // MWA |
| 53 | + let iau_order = true; |
| 54 | + |
| 55 | + // Delays and amps correspond to dipoles in the "M&C order". See |
| 56 | + // https://wiki.mwatelescope.org/pages/viewpage.action?pageId=48005139) for |
| 57 | + // more info. Here, each row of the 2D array corresponds to a tile. In this |
| 58 | + // example, all delays and amps are the same, but they are allowed to vary |
| 59 | + // between tiles. |
| 60 | + let num_tiles = 1; |
| 61 | + let delays = Array2::zeros((num_tiles, 16)); |
| 62 | + let amps = Array2::ones((num_tiles, 16)); |
| 63 | + let norm_to_zenith = true; |
| 64 | + |
| 65 | + // Set up the directions to test. The type depends on the GPU precision. |
| 66 | + let (az, za): (Vec<_>, Vec<_>) = (0..num_directions) |
| 67 | + .map(|i| { |
| 68 | + ( |
| 69 | + 0.4 + 0.3 * PI * (i as f64 / num_directions as f64), |
| 70 | + 0.3 + 0.4 * FRAC_PI_2 * (i as f64 / num_directions as f64), |
| 71 | + ) |
| 72 | + }) |
| 73 | + .unzip(); |
| 74 | + |
| 75 | + // for ((az, za), i) in az.iter().zip(za.iter()).zip(0..) { |
| 76 | + // println!("i {:3} az={:.3} za={:.3}", i, az, za); |
| 77 | + // } |
| 78 | + |
| 79 | + // compute on CPU for comparison. |
| 80 | + let mut cpu_jones = |
| 81 | + Array3::from_elem((delays.dim().0, freqs_hz.len(), az.len()), Jones::default()); |
| 82 | + |
| 83 | + for ((mut out, delays), amps) in cpu_jones |
| 84 | + .outer_iter_mut() |
| 85 | + .zip(delays.outer_iter()) |
| 86 | + .zip(amps.outer_iter()) |
| 87 | + { |
| 88 | + for (mut out, &freq) in out.outer_iter_mut().zip(freqs_hz.iter()) { |
| 89 | + let cpu_results = beam |
| 90 | + .calc_jones_array_pair( |
| 91 | + &az, |
| 92 | + &za, |
| 93 | + freq, |
| 94 | + delays.as_slice().unwrap(), |
| 95 | + amps.as_slice().unwrap(), |
| 96 | + norm_to_zenith, |
| 97 | + latitude_rad, |
| 98 | + iau_order, |
| 99 | + ) |
| 100 | + .unwrap(); |
| 101 | + |
| 102 | + // Demote the CPU results if we have to. |
| 103 | + #[cfg(feature = "gpu-single")] |
| 104 | + let cpu_results: Vec<Jones<f32>> = cpu_results.into_iter().map(|j| j.into()).collect(); |
| 105 | + |
| 106 | + out.assign(&Array1::from(cpu_results)); |
| 107 | + } |
| 108 | + } |
| 109 | + |
| 110 | + // let gpu_az = az.iter().map(|&a| a as GpuFloat).collect::<Vec<_>>(); |
| 111 | + // let gpu_za = za.iter().map(|&a| a as GpuFloat).collect::<Vec<_>>(); |
| 112 | + let azels: Vec<_> = az.iter().zip(za.iter()).map(|(&az, &za)| AzEl { az, el: FRAC_PI_2 - za }).collect(); |
| 113 | + |
| 114 | + let num_attempts = 9999; |
| 115 | + (0..num_attempts).into_par_iter().for_each(|i| { |
| 116 | + let gpu_beam = |
| 117 | + unsafe { beam.gpu_prepare(freqs_hz.as_slice(), delays.view(), amps.view(), norm_to_zenith).expect("beam.gpu_prepare") }; |
| 118 | + |
| 119 | + // Call hyperbeam GPU code. |
| 120 | + let gpu_jones = gpu_beam.calc_jones(azels.as_slice(), latitude_rad, iau_order).expect("gpu_beam.calc_jones"); |
| 121 | + |
| 122 | + // assert_eq!(gpu_jones.dim(), cpu_jones.dim()); |
| 123 | + |
| 124 | + // Compare the differences with the CPU-generated Jones matrices |
| 125 | + let mut pass = true; |
| 126 | + for (&cpu, &gpu) in cpu_jones |
| 127 | + .iter() |
| 128 | + .zip(gpu_jones.iter()) |
| 129 | + { |
| 130 | + let norm = (cpu - gpu).norm_sqr(); |
| 131 | + // #[cfg(feature = "gpu-single")] |
| 132 | + // if norm.iter().sum::<f32>() > 1e-6_f32 { pass = false; break } |
| 133 | + // #[cfg(not(feature = "gpu-single"))] |
| 134 | + if norm.iter().sum::<f64>() > 1e-12_f64 { pass = false; paniq_(); break; } |
| 135 | + } |
| 136 | + let (min_norm, max_norm) = cpu_jones.iter().zip(gpu_jones.iter()).map(|(&cpu, &gpu)| |
| 137 | + (cpu - gpu).norm_sqr() |
| 138 | + ).fold((f64::MAX, f64::MIN), |
| 139 | + |(min, max), norm| |
| 140 | + // #[cfg(feature = "gpu-single")] |
| 141 | + // {let s:f32=norm.iter().sum(); (min.min(s), max.max(s))} |
| 142 | + // #[cfg(not(feature = "gpu-single"))] |
| 143 | + {let s:f64=norm.iter().sum(); (min.min(s), max.max(s))} |
| 144 | + ); |
| 145 | + if pass { |
| 146 | + eprintln!("attempt {:4} passed, min_norm={:?} max_norm={:?}", i, min_norm, max_norm); |
| 147 | + } else { |
| 148 | + eprintln!("attempt {:4} failed, min_norm={:?} max_norm={:?}", i, min_norm, max_norm); |
| 149 | + } |
| 150 | + |
| 151 | + }); |
| 152 | + |
| 153 | + Ok(()) |
| 154 | +} |
| 155 | + |
| 156 | +/* |
| 157 | +DEBUG=1 cargo build --example gpu_destroyer --features=hip,hdf5-static --profile dev |
| 158 | +RAYON_NUM_THREADS=3 target/debug/examples/gpu_destroyer 9999 |
| 159 | +cat > rocgdbinit <<EOF |
| 160 | +set auto-load safe-path / |
| 161 | +dir $PYTHONPATH |
| 162 | +set amdgpu precise-memory on |
| 163 | +set breakpoint pending on |
| 164 | +set disassemble-next-line on |
| 165 | +
|
| 166 | +break gpu_destroyer::paniq_ |
| 167 | +commands |
| 168 | + up |
| 169 | + info locals |
| 170 | + thread apply all backtrace 5 |
| 171 | + quit 1 |
| 172 | +end |
| 173 | +break mwa_hyperbeam::fee::FEEBeam::gpu_prepare |
| 174 | +commands |
| 175 | + info args |
| 176 | + continue |
| 177 | +end |
| 178 | +break mwa_hyperbeam::fee::gpu::FEEBeamGpu::calc_jones |
| 179 | +commands |
| 180 | + info args |
| 181 | + continue |
| 182 | +end |
| 183 | +
|
| 184 | +run |
| 185 | +EOF |
| 186 | +# info reg |
| 187 | +# thread apply all backtrace |
| 188 | +# info threads |
| 189 | +# quit 1 |
| 190 | +# EOF |
| 191 | +export MWA_BEAM_FILE="mwa_full_embedded_element_pattern.h5" |
| 192 | +[ -f $MWA_BEAM_FILE ] || wget -O "$MWA_BEAM_FILE" $'http://ws.mwatelescope.org/static/'$MWA_BEAM_FILE |
| 193 | +export ROCM_VER=5.4.6 RUST_VER=stable |
| 194 | +for ROCM_VER in system 5.4.6 5.5.3 5.6.1 5.7.0 5.7.1; do |
| 195 | + load_rocm $ROCM_VER |
| 196 | + cargo +$RUST_VER build --features=hdf5-static,hip --example gpu_destroyer && \ |
| 197 | + RAYON_NUM_THREADS=2 rocgdb -x rocgdbinit --args target/debug/examples/gpu_destroyer 1 \ |
| 198 | + | tee gpu_destroyer-$ROCM_VER.log 2>&1 |
| 199 | +done |
| 200 | +
|
| 201 | +
|
| 202 | +load_rocm $ROCM_VER |
| 203 | +cargo +$RUST_VER build --features=hdf5-static,hip --example gpu_destroyer |
| 204 | +RAYON_NUM_THREADS=3 target/debug/examples/gpu_destroyer 9999 |
| 205 | +RAYON_NUM_THREADS=4 rocgdb -x rocgdbinit --args target/debug/examples/gpu_destroyer 9999 |
| 206 | +*/ |
| 207 | + |
| 208 | +/* |
| 209 | +attempt 676 passed, min_norm=[0.0, 0.0, 0.0, 0.0] [17/1978] |
| 210 | +attempt 2524 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 211 | +attempt 343 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 212 | +thread '<unnamed>' panicked at examples/gpu_destroyer.rs:146:13: |
| 213 | +attempt 1341 failed az=0.400 za=0.300 norm=[1.7982786046614562e-6, 0.0022572476886556668, 0.0023788106642785314, 9.57502419389 |
| 214 | +7969e-8] |
| 215 | +attempt 5026 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 216 | +attempt 3769 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 217 | +attempt 677 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 218 | +attempt 5027 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 219 | +attempt 1346 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 220 | +attempt 2525 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 221 | +attempt 344 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 222 | +attempt 3770 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 223 | +attempt 678 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 224 | +thread '<unnamed>' panicked at examples/gpu_destroyer.rs:146:13: |
| 225 | +attempt 345 failed az=0.400 za=0.300 norm=[1.7982786046614562e-6, 0.0022572476886556668, 0.0023788106642785314, 9.575024193897 |
| 226 | +969e-8] |
| 227 | +attempt 1347 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 228 | +attempt 3771 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 229 | +attempt 2526 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 230 | +attempt 1348 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 231 | +attempt 5028 passed, min_norm=[1.4095292761800317e-18, 4.554040319507578e-16, 4.808375260085222e-16, 2.1365382960150389e-19] |
| 232 | +attempt 679 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 233 | +attempt 346 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 234 | +attempt 2527 passed, min_norm=[0.0, 0.0, 0.0, 0.0] |
| 235 | +
|
| 236 | +*/ |
| 237 | + |
0 commit comments