-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
271 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
// This Source Code Form is subject to the terms of the Mozilla Public | ||
// License, v. 2.0. If a copy of the MPL was not distributed with this | ||
// file, You can obtain one at http://mozilla.org/MPL/2.0/. | ||
|
||
//! Example code using hyperbeam's GPU code with Rust. | ||
//! | ||
//! Build and run with something like: | ||
//! `cargo run --release --features=hip --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5` | ||
//! `cargo run --release --features=hip,gpu-single --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5` | ||
//! | ||
//! If the "gpu-single" feature is given, then single-precision floats are used | ||
//! on the GPU. This trades precision for speed. The speed gain is considerable if | ||
//! using a desktop GPU. | ||
//! | ||
//! If you want to use hyperbeam in your own Rust crate, then check out the latest | ||
//! version on crates.io: | ||
//! | ||
//! https://crates.io/crates/mwa_hyperbeam | ||
use std::f64::consts::{FRAC_PI_2, PI}; | ||
|
||
use mwa_hyperbeam::{fee::FEEBeam, AzEl, GpuFloat, Jones}; | ||
use ndarray::prelude::*; | ||
use rayon::prelude::*; | ||
|
||
fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
let mut args = std::env::args().skip(1); | ||
let num_directions: usize = args | ||
.next() | ||
.expect("number of directions supplied") | ||
.parse() | ||
.expect("number of directions is a number"); | ||
|
||
println!( | ||
"GPU float precision is {} bits", | ||
std::mem::size_of::<GpuFloat>() * 8 | ||
); | ||
|
||
let beam_file = args.next(); | ||
// If we were given a file, use it. Otherwise, fall back on MWA_BEAM_FILE. | ||
let beam = match beam_file { | ||
Some(f) => FEEBeam::new(f)?, | ||
None => FEEBeam::new_from_env()?, | ||
}; | ||
|
||
// Set up our "GPU beam". | ||
let num_freqs = 1; | ||
let freqs_hz = (0..num_freqs).map(|i| (150_000 + 1000 * i) as u32).collect::<Vec<_>>(); | ||
let latitude_rad = Some(-0.4660608448386394); // MWA | ||
let iau_order = true; | ||
|
||
// Delays and amps correspond to dipoles in the "M&C order". See | ||
// https://wiki.mwatelescope.org/pages/viewpage.action?pageId=48005139) for | ||
// more info. Here, each row of the 2D array corresponds to a tile. In this | ||
// example, all delays and amps are the same, but they are allowed to vary | ||
// between tiles. | ||
let num_tiles = 1; | ||
let delays = Array2::zeros((num_tiles, 16)); | ||
let amps = Array2::ones((num_tiles, 16)); | ||
let norm_to_zenith = true; | ||
|
||
// Set up the directions to test. The type depends on the GPU precision. | ||
let (az, za): (Vec<_>, Vec<_>) = (0..num_directions) | ||
.map(|i| { | ||
( | ||
0.4 + 0.3 * PI * (i as f64 / num_directions as f64), | ||
0.3 + 0.4 * FRAC_PI_2 * (i as f64 / num_directions as f64), | ||
) | ||
}) | ||
.unzip(); | ||
|
||
for ((az, za), i) in az.iter().zip(za.iter()).zip(0..) { | ||
println!("i {:3} az={:.3} za={:.3}", i, az, za); | ||
} | ||
|
||
// compute on CPU for comparison. | ||
let mut cpu_jones = | ||
Array3::from_elem((delays.dim().0, freqs_hz.len(), az.len()), Jones::default()); | ||
|
||
for ((mut out, delays), amps) in cpu_jones | ||
.outer_iter_mut() | ||
.zip(delays.outer_iter()) | ||
.zip(amps.outer_iter()) | ||
{ | ||
for (mut out, &freq) in out.outer_iter_mut().zip(freqs_hz.iter()) { | ||
let cpu_results = beam | ||
.calc_jones_array_pair( | ||
&az, | ||
&za, | ||
freq, | ||
delays.as_slice().unwrap(), | ||
amps.as_slice().unwrap(), | ||
norm_to_zenith, | ||
latitude_rad, | ||
iau_order, | ||
) | ||
.unwrap(); | ||
|
||
// Demote the CPU results if we have to. | ||
#[cfg(feature = "gpu-single")] | ||
let cpu_results: Vec<Jones<f32>> = cpu_results.into_iter().map(|j| j.into()).collect(); | ||
|
||
out.assign(&Array1::from(cpu_results)); | ||
} | ||
} | ||
|
||
// let gpu_az = az.iter().map(|&a| a as GpuFloat).collect::<Vec<_>>(); | ||
// let gpu_za = za.iter().map(|&a| a as GpuFloat).collect::<Vec<_>>(); | ||
let azels = az.iter().zip(za.iter()).map(|(&az, &za)| AzEl { az, el: FRAC_PI_2 - za }).collect::<Vec<_>>(); | ||
|
||
let num_attempts = 9999; | ||
(0..num_attempts).into_par_iter().for_each(|i| { | ||
let gpu_beam = | ||
unsafe { beam.gpu_prepare(freqs_hz.as_slice(), delays.view(), amps.view(), norm_to_zenith).expect("beam.gpu_prepare") }; | ||
|
||
// Call hyperbeam GPU code. | ||
let gpu_jones = gpu_beam.calc_jones(azels.as_slice(), latitude_rad, iau_order).expect("gpu_beam.calc_jones"); | ||
|
||
assert_eq!(gpu_jones.dim(), cpu_jones.dim()); | ||
|
||
// Compare the differences with the CPU-generated Jones matrices | ||
#[cfg(not(feature = "gpu-single"))] | ||
let mut min_norm = [f64::MAX; 4]; | ||
#[cfg(feature = "gpu-single")] | ||
let mut min_norm = [f32::MAX; 4]; | ||
for (((&cpu, &gpu), az), za) in cpu_jones | ||
.iter() | ||
.zip(gpu_jones.iter()) | ||
.zip(az.iter()) | ||
.zip(za.iter()) | ||
{ | ||
let norm = (cpu - gpu).norm_sqr(); | ||
#[cfg(not(feature = "gpu-single"))] | ||
let norm_sum: f64 = norm.iter().sum(); | ||
#[cfg(feature = "gpu-single")] | ||
let norm_sum: f32 = norm.iter().sum(); | ||
|
||
if norm_sum < min_norm.iter().sum() { | ||
min_norm = norm; | ||
} | ||
#[cfg(not(feature = "gpu-single"))] | ||
if norm_sum < 1e-12_f64 { continue } | ||
#[cfg(feature = "gpu-single")] | ||
if norm_sum < 1e-6_f32 { continue } | ||
|
||
panic!("attempt {i} failed az={az:.3} za={za:.3} norm={norm:?}") | ||
} | ||
|
||
println!("attempt {i} passed, min_norm={min_norm:?}"); | ||
}); | ||
|
||
Ok(()) | ||
} | ||
|
||
/* | ||
DEBUG=1 cargo build --example gpu_destroyer --features=hip,hdf5-static --profile dev | ||
RAYON_NUM_THREADS=3 target/debug/examples/gpu_destroyer 9999 | ||
cat > rocgdbinit <<EOF | ||
set auto-load safe-path / | ||
dir $PYTHONPATH | ||
set amdgpu precise-memory on | ||
set breakpoint pending on | ||
set disassemble-next-line on | ||
break examples/gpu_destroyer.rs:133 | ||
run | ||
info reg | ||
thread apply all backtrace | ||
EOF | ||
RAYON_NUM_THREADS=3 rocgdb -x rocgdbinit --args target/debug/examples/gpu_destroyer 9999 | ||
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.