enter the gpu destroyer

MWATelescope · Jan 11, 2024 · 2de4d8b · 2de4d8b
1 parent 6100a73
commit 2de4d8b
Show file tree

Hide file tree

Showing 5 changed files with 271 additions and 28 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -5,6 +5,7 @@ authors = [
     "Christopher H. Jordan <christopherjordan87@gmail.com>",
     "Jack L. B. Line <jack.line@curtin.edu.au>",
     "Marcin Sokolowski <marcin.sokolowski@curtin.edu.au>",
+    "Dev Null <dev.null@curtin.edu.au>",
 ]
 edition = "2021"
 rust-version = "1.64"
@@ -14,6 +15,7 @@ description = "Primary beam code for the Murchison Widefield Array (MWA) radio t
 repository = "https://github.com/MWATelescope/mwa_hyperbeam"
 homepage = "https://github.com/MWATelescope/mwa_hyperbeam"
 exclude = [".github/*", "fee_pols.pdf"]
+autotests = true
 
 # Make a rust library, as well as static and C-compatible dynamic libraries
 # available as "libmwa_hyperbeam.a" and "libmwa_hyperbeam.so".
@@ -36,8 +38,8 @@ hip = ["hip-sys", "cc"]
 gpu-single = []
 
 [profile.release]
-lto = "thin"
-codegen-units = 1 # Set this to 1 in Cargo.toml to allow for maximum size reduction optimizations
+# lto = "thin"
+# codegen-units = 1 # Set this to 1 in Cargo.toml to allow for maximum size reduction optimizations
 
 [dependencies]
 cfg-if = "1.0.0"
@@ -100,6 +102,9 @@ required-features = ["cuda"]
 name = "analytic_hip"
 required-features = ["hip"]
 
+[[example]]
+name = "gpu_destroyer"
+
 [patch.crates-io]
 # marlu = { path = "../Marlu" }
 # marlu = { git = "https://github.com/MWATelescope/Marlu", branch = "DUT1" }
diff --git a/build.rs b/build.rs
@@ -215,14 +215,30 @@ mod gpu {
             #[cfg(feature = "cuda-static")]
             println!("cargo:rustc-link-lib=static=cudart_static");
 
+            match env::var("DEBUG").as_deref() {
+                Ok("false") => (),
+                _ => {cuda_target.flag("-G");},
+            };
+
             cuda_target
         };
 
         #[cfg(feature = "hip")]
         let mut gpu_target = {
             const DEFAULT_HIP_ARCHES: &[&str] = &["gfx90a"];
 
-            let hip_path = hip_sys::hiprt::get_hip_path();
+            println!("cargo:rerun-if-env-changed=HIP_PATH");
+            let hip_path = match env::var_os("HIP_PATH") {
+                Some(p) => {
+                    println!("cargo:warning=HIP_PATH set from env {}", p.to_string_lossy());
+                    std::path::PathBuf::from(p)
+                }
+                None => {
+                    let hip_path = hip_sys::hiprt::get_hip_path();
+                    println!("cargo:warning=HIP_PATH set from hip_sys {}", hip_path.display());
+                    hip_path
+                },
+            };
             if !hip_path.exists() {
                 panic!("Couldn't find HIP path at {}", hip_path.display());
             }
@@ -243,13 +259,15 @@ mod gpu {
             let mut hip_target = cc::Build::new();
             hip_target
                 .compiler(compiler)
-                .include(hip_path.join("include/hip"))
+                // .include(hip_path.join("include/hip"))
                 .include("src/gpu_common/")
                 .file("src/fee/gpu/fee.cu")
                 .file("src/analytic/gpu/analytic.cu");
 
             hip_target.flag("-O0"); // <- hip can't handle optimizations
 
+            println!("cargo:rerun-if-env-changed=ROCM_VER");
+            println!("cargo:rerun-if-env-changed=ROCM_PATH");
             println!("cargo:rerun-if-env-changed=HYPERBEAM_HIP_ARCH");
             println!("cargo:rerun-if-env-changed=HYPERDRIVE_HIP_ARCH");
             let arches: Vec<String> = match (
@@ -276,6 +294,11 @@ mod gpu {
                 hip_target.flag(&format!("--offload-arch={arch}"));
             }
 
+            // Add debug symbols to the device code unless DEBUG is "false",
+            // mirroring the CUDA branch of this build script.
+            match env::var("DEBUG").as_deref() {
+                Ok("false") => (),
+                // `flag` returns `&mut Build`; discard it so both arms have
+                // type `()` and the match type-checks.
+                _ => {
+                    hip_target.flag("-ggdb");
+                }
+            };
+
             hip_target
         };
 

diff --git a/examples/gpu_destroyer.rs b/examples/gpu_destroyer.rs
@@ -0,0 +1,170 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+//! Example code using hyperbeam's GPU code with Rust.
+//!
+//! Build and run with something like:
+//! `cargo run --release --features=hip --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5`
+//! `cargo run --release --features=hip,gpu-single --example gpu_destroyer -- 1000000 mwa_full_embedded_element_pattern.h5`
+//!
+//! If the "gpu-single" feature is given, then single-precision floats are used
+//! on the GPU. This trades precision for speed. The speed gain is considerable if
+//! using a desktop GPU.
+//!
+//! If you want to use hyperbeam in your own Rust crate, then check out the latest
+//! version on crates.io:
+//!
+//! https://crates.io/crates/mwa_hyperbeam
+
+use std::f64::consts::{FRAC_PI_2, PI};
+
+use mwa_hyperbeam::{fee::FEEBeam, AzEl, GpuFloat, Jones};
+use ndarray::prelude::*;
+use rayon::prelude::*;
+
+/// Stress-test hyperbeam's GPU FEE beam against the CPU implementation.
+///
+/// Command-line arguments:
+/// 1. the number of directions to evaluate (required);
+/// 2. the path to the FEE beam HDF5 file (optional; `FEEBeam::new_from_env`
+///    is used when it is omitted).
+///
+/// The CPU Jones matrices are computed once. The GPU beam is then prepared and
+/// evaluated `num_attempts` times from rayon's thread pool, and every GPU
+/// result is compared against the CPU reference.
+///
+/// # Errors
+///
+/// Returns an error if the beam cannot be created or if the CPU beam
+/// calculation fails.
+///
+/// # Panics
+///
+/// Panics if the number of directions is missing or not a number, if a GPU
+/// call fails, or if any GPU Jones matrix differs from its CPU counterpart by
+/// at least the precision-dependent tolerance.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let mut args = std::env::args().skip(1);
+    let num_directions: usize = args
+        .next()
+        .expect("number of directions supplied")
+        .parse()
+        .expect("number of directions is a number");
+
+    println!(
+        "GPU float precision is {} bits",
+        std::mem::size_of::<GpuFloat>() * 8
+    );
+
+    // If we were given a file, use it. Otherwise, fall back on MWA_BEAM_FILE.
+    let beam = match args.next() {
+        Some(f) => FEEBeam::new(f)?,
+        None => FEEBeam::new_from_env()?,
+    };
+
+    // Set up our "GPU beam".
+    // NOTE(review): the name says Hz, which makes the first frequency 150 kHz
+    // rather than 150 MHz -- confirm this is intended.
+    let num_freqs = 1;
+    let freqs_hz = (0..num_freqs)
+        .map(|i| (150_000 + 1000 * i) as u32)
+        .collect::<Vec<_>>();
+    let latitude_rad = Some(-0.4660608448386394); // MWA
+    let iau_order = true;
+
+    // Delays and amps correspond to dipoles in the "M&C order". See
+    // https://wiki.mwatelescope.org/pages/viewpage.action?pageId=48005139 for
+    // more info. Here, each row of the 2D array corresponds to a tile. In this
+    // example, all delays and amps are the same, but they are allowed to vary
+    // between tiles.
+    let num_tiles = 1;
+    let delays = Array2::zeros((num_tiles, 16));
+    let amps = Array2::ones((num_tiles, 16));
+    let norm_to_zenith = true;
+
+    // Set up the directions to test, spread evenly over a fixed az/za range.
+    let (az, za): (Vec<_>, Vec<_>) = (0..num_directions)
+        .map(|i| {
+            let frac = i as f64 / num_directions as f64;
+            (0.4 + 0.3 * PI * frac, 0.3 + 0.4 * FRAC_PI_2 * frac)
+        })
+        .unzip();
+
+    for (i, (az, za)) in az.iter().zip(za.iter()).enumerate() {
+        println!("i {:3} az={:.3} za={:.3}", i, az, za);
+    }
+
+    // Compute on the CPU for comparison. The array is indexed by
+    // (tile, frequency, direction).
+    let mut cpu_jones =
+        Array3::from_elem((delays.dim().0, freqs_hz.len(), az.len()), Jones::default());
+
+    for ((mut out, delays), amps) in cpu_jones
+        .outer_iter_mut()
+        .zip(delays.outer_iter())
+        .zip(amps.outer_iter())
+    {
+        for (mut out, &freq) in out.outer_iter_mut().zip(freqs_hz.iter()) {
+            let cpu_results = beam.calc_jones_array_pair(
+                &az,
+                &za,
+                freq,
+                delays.as_slice().expect("delays row is contiguous"),
+                amps.as_slice().expect("amps row is contiguous"),
+                norm_to_zenith,
+                latitude_rad,
+                iau_order,
+            )?;
+
+            // Demote the CPU results if we have to.
+            #[cfg(feature = "gpu-single")]
+            let cpu_results: Vec<Jones<f32>> = cpu_results.into_iter().map(|j| j.into()).collect();
+
+            out.assign(&Array1::from(cpu_results));
+        }
+    }
+
+    // The GPU code takes azimuth/elevation pairs rather than zenith angles.
+    let azels = az
+        .iter()
+        .zip(za.iter())
+        .map(|(&az, &za)| AzEl {
+            az,
+            el: FRAC_PI_2 - za,
+        })
+        .collect::<Vec<_>>();
+
+    // Largest acceptable summed squared difference between a CPU and a GPU
+    // Jones matrix; looser when the GPU works in single precision.
+    #[cfg(not(feature = "gpu-single"))]
+    let tolerance: GpuFloat = 1e-12;
+    #[cfg(feature = "gpu-single")]
+    let tolerance: GpuFloat = 1e-6;
+
+    let num_attempts = 9999;
+    (0..num_attempts).into_par_iter().for_each(|i| {
+        // NOTE(review): `gpu_prepare` must be called in an `unsafe` block; its
+        // safety contract is not visible from this file -- confirm that
+        // nothing beyond well-formed inputs is required.
+        let gpu_beam = unsafe {
+            beam.gpu_prepare(
+                freqs_hz.as_slice(),
+                delays.view(),
+                amps.view(),
+                norm_to_zenith,
+            )
+        }
+        .expect("beam.gpu_prepare");
+
+        // Call hyperbeam GPU code.
+        let gpu_jones = gpu_beam
+            .calc_jones(azels.as_slice(), latitude_rad, iau_order)
+            .expect("gpu_beam.calc_jones");
+
+        assert_eq!(gpu_jones.dim(), cpu_jones.dim());
+
+        // Compare the differences with the CPU-generated Jones matrices. The
+        // direction axis varies fastest, so cycle the directions to keep them
+        // aligned with every (tile, frequency) pair instead of stopping after
+        // the first one.
+        let mut min_norm = [GpuFloat::MAX; 4];
+        for (((&cpu, &gpu), az), za) in cpu_jones
+            .iter()
+            .zip(gpu_jones.iter())
+            .zip(az.iter().cycle())
+            .zip(za.iter().cycle())
+        {
+            let norm = (cpu - gpu).norm_sqr();
+            let norm_sum: GpuFloat = norm.iter().sum();
+
+            // Remember the best-matching element for the summary line.
+            if norm_sum < min_norm.iter().sum::<GpuFloat>() {
+                min_norm = norm;
+            }
+
+            // Written as `<` so that a NaN difference also fails.
+            assert!(
+                norm_sum < tolerance,
+                "attempt {i} failed az={az:.3} za={za:.3} norm={norm:?}"
+            );
+        }
+
+        println!("attempt {i} passed, min_norm={min_norm:?}");
+    });
+
+    Ok(())
+}
+
+/*
+DEBUG=1 cargo build --example gpu_destroyer --features=hip,hdf5-static --profile dev
+RAYON_NUM_THREADS=3 target/debug/examples/gpu_destroyer 9999
+cat > rocgdbinit <<EOF
+set auto-load safe-path /
+dir $PYTHONPATH
+set amdgpu precise-memory on
+set breakpoint pending on
+set disassemble-next-line on
+break examples/gpu_destroyer.rs:133
+run
+info reg
+thread apply all backtrace
+EOF
+RAYON_NUM_THREADS=3 rocgdb -x rocgdbinit --args target/debug/examples/gpu_destroyer 9999
+ */
diff --git a/src/fee/ffi/tests.rs b/src/fee/ffi/tests.rs
@@ -295,7 +295,12 @@ fn test_calc_jones_gpu_via_ffi() {
     let delays = array![[3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]];
     let amps =
         array![[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]];
-    let (az, za): (Vec<_>, Vec<_>) = (0..1025)
+    let n_dirs = std::env::var("N_DIRS")
+        .unwrap_or_else(|_| "1025".to_string()) // 192 passes, 193 fails on rocm-5.7.1
+        .parse::<usize>()
+        .unwrap();
+    assert!(n_dirs < 26904);
+    let (az, za): (Vec<_>, Vec<_>) = (0..n_dirs)
         .map(|i| {
             (
                 0.45 + i as GpuFloat / 10000.0,
@@ -350,7 +355,7 @@ fn test_calc_jones_gpu_via_ffi() {
     // Compare with CPU results.
     let mut jones_cpu = Array3::zeros((delays.dim().0, freqs.len(), az.len()));
     // Maybe need to regenerate the directions, depending on the GPU precision.
-    let (az, za): (Vec<_>, Vec<_>) = (0..1025)
+    let (az, za): (Vec<_>, Vec<_>) = (0..n_dirs)
         .map(|i| (0.45 + i as f64 / 10000.0, 0.45 + i as f64 / 10000.0))
         .unzip();
     for ((mut out, delays), amps) in jones_cpu
@@ -387,12 +392,19 @@ fn test_calc_jones_gpu_via_ffi() {
         free_fee_beam(beam);
     }
 
-    #[cfg(not(feature = "gpu-single"))]
-    assert_abs_diff_eq!(jones_gpu, jones_cpu, epsilon = 1e-15);
-
-    #[cfg(feature = "gpu-single")]
     // The errors are heavily dependent on the directions.
-    assert_abs_diff_eq!(jones_gpu, jones_cpu, epsilon = 1e-6);
+    for ((gpu, cpu), az) in jones_gpu
+        .iter()
+        .zip(jones_cpu.iter())
+        .zip(az.iter())
+    {
+        #[cfg(not(feature = "gpu-single"))]
+        assert!(abs_diff_eq!(gpu, cpu, epsilon = 1e-15), "az: {az} cpu: {cpu} gpu: {gpu}");
+
+        #[cfg(feature = "gpu-single")]
+        assert!(abs_diff_eq!(gpu, cpu, epsilon = 1e-6), "az: {az} cpu: {cpu} gpu: {gpu}");
+    }
+
 }
 
 // Tests to expose errors follow.