[cpp] Use std::vector for sim3d arrays

tobanteAudio · Sep 24, 2024 · 31da921 · 31da921
1 parent 52946b4
commit 31da921
Show file tree

Hide file tree

Showing 6 changed files with 199 additions and 217 deletions.
diff --git a/src/cpp/pffdtd/engine_cpu_3d.cpp b/src/cpp/pffdtd/engine_cpu_3d.cpp
@@ -91,24 +91,24 @@ auto run(Simulation3D<Real> const& sd) -> void {
   int64_t const Nb   = sd.Nb;
   int64_t const Nbl  = sd.Nbl;
   int64_t const Nba  = sd.Nba;
-  int8_t* Mb         = sd.Mb;
 
   // keep local copies of pointers (style choice)
-  int64_t* bn_ixyz         = sd.bn_ixyz;
-  int64_t* bnl_ixyz        = sd.bnl_ixyz;
-  int64_t* bna_ixyz        = sd.bna_ixyz;
-  int64_t* in_ixyz         = sd.in_ixyz;
-  int64_t* out_ixyz        = sd.out_ixyz;
-  uint16_t* adj_bn         = sd.adj_bn;
-  uint8_t* bn_mask         = sd.bn_mask;
-  int8_t* mat_bnl          = sd.mat_bnl;
-  int8_t* Q_bna            = sd.Q_bna;
-  double* in_sigs          = sd.in_sigs;
-  double* u_out            = sd.u_out;
-  int8_t const fcc_flag    = sd.fcc_flag;
-  Real* ssaf_bnl           = sd.ssaf_bnl;
-  Real* mat_beta           = sd.mat_beta;
-  MatQuad<Real>* mat_quads = sd.mat_quads;
+  int8_t const* Mb               = sd.Mb.data();
+  int64_t const* bn_ixyz         = sd.bn_ixyz.data();
+  int64_t const* bnl_ixyz        = sd.bnl_ixyz.data();
+  int64_t const* bna_ixyz        = sd.bna_ixyz.data();
+  int64_t const* in_ixyz         = sd.in_ixyz.data();
+  int64_t const* out_ixyz        = sd.out_ixyz.data();
+  uint16_t const* adj_bn         = sd.adj_bn.data();
+  uint8_t const* bn_mask         = sd.bn_mask.data();
+  int8_t const* mat_bnl          = sd.mat_bnl.data();
+  int8_t* Q_bna                  = sd.Q_bna;
+  double const* in_sigs          = sd.in_sigs.data();
+  int8_t const fcc_flag          = sd.fcc_flag;
+  Real const* ssaf_bnl           = sd.ssaf_bnl.data();
+  Real const* mat_beta           = sd.mat_beta.data();
+  MatQuad<Real> const* mat_quads = sd.mat_quads.data();
+  double* u_out                  = sd.u_out.get();
 
   // allocate memory
   auto u0_buf   = std::vector<Real>(static_cast<size_t>(Npts));

diff --git a/src/cpp/pffdtd/engine_cuda_3d.cu b/src/cpp/pffdtd/engine_cuda_3d.cu
@@ -106,21 +106,21 @@ __constant__ int8_t cuMb[MNm]; // to store Mb per mat
 // this is data on host, sometimes copied and recomputed for copy to GPU devices
 // (indices), sometimes just aliased pointers (scalar arrays)
 template<typename Real>
-struct HostData {      // arrays on host (for copy), mirrors gpu local data
-  double* in_sigs{};   // aliased
-  Real* u_out_buf{};   // aliased
-  double* u_out{};     // aliased
-  Real* ssaf_bnl{};    // aliased
-  int64_t* in_ixyz{};  // recomputed
-  int64_t* out_ixyz{}; // recomputed
-  int64_t* bn_ixyz{};  // recomputed
-  int64_t* bnl_ixyz{}; // recomputed
-  int64_t* bna_ixyz{}; // recomputed
-  int8_t* Q_bna{};     // aliased
-  uint16_t* adj_bn{};  // aliased
-  int8_t* mat_bnl{};   // aliased
-  uint8_t* bn_mask{};  // recomputed
-  int8_t* K_bn{};      // aliased
+struct HostData {           // arrays on host (for copy), mirrors gpu local data
+  double const* in_sigs{};  // aliased
+  Real* u_out_buf{};        // aliased
+  double* u_out{};          // aliased
+  Real const* ssaf_bnl{};   // aliased
+  int64_t* in_ixyz{};       // recomputed
+  int64_t* out_ixyz{};      // recomputed
+  int64_t* bn_ixyz{};       // recomputed
+  int64_t* bnl_ixyz{};      // recomputed
+  int64_t* bna_ixyz{};      // recomputed
+  int8_t* Q_bna{};          // aliased
+  uint16_t const* adj_bn{}; // aliased
+  int8_t const* mat_bnl{};  // aliased
+  uint8_t* bn_mask{};       // recomputed
+  int8_t const* K_bn{};     // aliased
   int64_t Ns{};
   int64_t Nr{};
   int64_t Npts{};
@@ -546,16 +546,16 @@ auto print_gpu_details(int i) -> uint64_t {
 // input indices need to be sorted for multi-device allocation
 template<typename Real>
 void checkSorted(Simulation3D<Real> const& sim) {
-  int64_t* bn_ixyz  = sim.bn_ixyz;
-  int64_t* bnl_ixyz = sim.bnl_ixyz;
-  int64_t* bna_ixyz = sim.bna_ixyz;
-  int64_t* in_ixyz  = sim.in_ixyz;
-  int64_t* out_ixyz = sim.out_ixyz;
-  int64_t const Nb  = sim.Nb;
-  int64_t const Nbl = sim.Nbl;
-  int64_t const Nba = sim.Nba;
-  int64_t const Ns  = sim.Ns;
-  int64_t const Nr  = sim.Nr;
+  int64_t const* bn_ixyz  = sim.bn_ixyz.data();
+  int64_t const* bnl_ixyz = sim.bnl_ixyz.data();
+  int64_t const* bna_ixyz = sim.bna_ixyz.data();
+  int64_t const* in_ixyz  = sim.in_ixyz.data();
+  int64_t const* out_ixyz = sim.out_ixyz.data();
+  int64_t const Nb        = sim.Nb;
+  int64_t const Nbl       = sim.Nbl;
+  int64_t const Nba       = sim.Nba;
+  int64_t const Ns        = sim.Ns;
+  int64_t const Nr        = sim.Nr;
   for (int64_t i = 1; i < Nb; i++) {
     PFFDTD_ASSERT(bn_ixyz[i] > bn_ixyz[i - 1]); // check save_gpu_folder
   }
@@ -624,8 +624,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   }
 
   // bn_ixyz - Nb
-  int64_t* bn_ixyz = sim.bn_ixyz;
-  int64_t const Nb = sim.Nb;
+  int64_t const* bn_ixyz = sim.bn_ixyz.data();
+  int64_t const Nb       = sim.Nb;
   {
     int gid = 0;
     for (int64_t i = 0; i < Nb; i++) {
@@ -644,8 +644,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   PFFDTD_ASSERT(Nb_check == Nb);
 
   // bnl_ixyz - Nbl
-  int64_t* bnl_ixyz = sim.bnl_ixyz;
-  int64_t const Nbl = sim.Nbl;
+  int64_t const* bnl_ixyz = sim.bnl_ixyz.data();
+  int64_t const Nbl       = sim.Nbl;
   {
     int gid = 0;
     for (int64_t i = 0; i < Nbl; i++) {
@@ -664,8 +664,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   PFFDTD_ASSERT(Nbl_check == Nbl);
 
   // bna_ixyz - Nba
-  int64_t* bna_ixyz = sim.bna_ixyz;
-  int64_t const Nba = sim.Nba;
+  int64_t const* bna_ixyz = sim.bna_ixyz.data();
+  int64_t const Nba       = sim.Nba;
   {
     int gid = 0;
     for (int64_t i = 0; i < Nba; i++) {
@@ -684,8 +684,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   PFFDTD_ASSERT(Nba_check == Nba);
 
   // in_ixyz - Ns
-  int64_t* in_ixyz = sim.in_ixyz;
-  int64_t const Ns = sim.Ns;
+  int64_t const* in_ixyz = sim.in_ixyz.data();
+  int64_t const Ns       = sim.Ns;
   {
     int gid = 0;
     for (int64_t i = 0; i < Ns; i++) {
@@ -704,8 +704,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   PFFDTD_ASSERT(Ns_check == Ns);
 
   // out_ixyz - Nr
-  int64_t* out_ixyz = sim.out_ixyz;
-  int64_t const Nr  = sim.Nr;
+  int64_t const* out_ixyz = sim.out_ixyz.data();
+  int64_t const Nr        = sim.Nr;
   {
     int gid = 0;
     for (int64_t i = 0; i < Nr; i++) {
@@ -829,13 +829,13 @@ auto run(Simulation3D<Real> const& sim) -> void {
     std::printf("Nx=%ld Ns=%ld Nr=%ld Nb=%ld, Npts=%ld\n", host.Nx, host.Ns, host.Nr, host.Nb, host.Npts);
 
     // aliased pointers (to memory already allocated)
-    host.in_sigs   = sim.in_sigs + Ns_read * sim.Nt;
-    host.ssaf_bnl  = sim.ssaf_bnl + Nbl_read;
-    host.adj_bn    = sim.adj_bn + Nb_read;
-    host.mat_bnl   = sim.mat_bnl + Nbl_read;
-    host.K_bn      = sim.K_bn + Nb_read;
+    host.in_sigs   = sim.in_sigs.data() + Ns_read * sim.Nt;
+    host.ssaf_bnl  = sim.ssaf_bnl.data() + Nbl_read;
+    host.adj_bn    = sim.adj_bn.data() + Nb_read;
+    host.mat_bnl   = sim.mat_bnl.data() + Nbl_read;
+    host.K_bn      = sim.K_bn.data() + Nb_read;
     host.Q_bna     = sim.Q_bna + Nba_read;
-    host.u_out     = sim.u_out + Nr_read * sim.Nt;
+    host.u_out     = sim.u_out.get() + Nr_read * sim.Nt;
     host.u_out_buf = u_out_buf + Nr_read;
 
     // recalculate indices, these are associated host versions to copy over to devices
@@ -941,12 +941,15 @@ auto run(Simulation3D<Real> const& sim) -> void {
     gpuErrchk(cudaMemcpy(gpu.mat_bnl, host.mat_bnl, (size_t)host.Nbl * sizeof(int8_t), cudaMemcpyHostToDevice));
 
     gpuErrchk(cudaMalloc(&(gpu.mat_beta), (size_t)sim.Nm * sizeof(Real)));
-    gpuErrchk(cudaMemcpy(gpu.mat_beta, sim.mat_beta, (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));
+    gpuErrchk(cudaMemcpy(gpu.mat_beta, sim.mat_beta.data(), (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));
 
     gpuErrchk(cudaMalloc(&(gpu.mat_quads), (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>)));
-    gpuErrchk(
-        cudaMemcpy(gpu.mat_quads, sim.mat_quads, (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>), cudaMemcpyHostToDevice)
-    );
+    gpuErrchk(cudaMemcpy(
+        gpu.mat_quads,
+        sim.mat_quads.data(),
+        (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>),
+        cudaMemcpyHostToDevice
+    ));
 
     gpuErrchk(cudaMalloc(&(gpu.bn_mask), (size_t)(host.Nbm * sizeof(uint8_t))));
     gpuErrchk(cudaMemcpy(gpu.bn_mask, host.bn_mask, (size_t)host.Nbm * sizeof(uint8_t), cudaMemcpyHostToDevice));
@@ -969,15 +972,15 @@ auto run(Simulation3D<Real> const& sim) -> void {
     std::printf("\n");
 
     // swapping x and z here (CUDA has first dim contiguous)
+    // same for all devices
     gpuErrchk(cudaMemcpyToSymbol(cuNx, &(sim.Nz), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNy, &(sim.Ny), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNz, &(host.Nxh), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNb, &(host.Nb), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNbl, &(host.Nbl), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNba, &(host.Nba), sizeof(int64_t)));
-    gpuErrchk(cudaMemcpyToSymbol(cuMb, sim.Mb, sim.Nm * sizeof(int8_t)));
-    gpuErrchk(cudaMemcpyToSymbol(cuNxNy, &Nzy,
-                                 sizeof(int64_t))); // same for all devices
+    gpuErrchk(cudaMemcpyToSymbol(cuMb, sim.Mb.data(), sim.Nm * sizeof(int8_t)));
+    gpuErrchk(cudaMemcpyToSymbol(cuNxNy, &Nzy, sizeof(int64_t)));
 
     if constexpr (std::is_same_v<Real, float>) {
       gpuErrchk(cudaMemcpyToSymbol(c1_f32, &a1, sizeof(float)));

diff --git a/src/cpp/pffdtd/engine_sycl_3d.cpp b/src/cpp/pffdtd/engine_sycl_3d.cpp
@@ -54,7 +54,6 @@ auto run(Simulation3D<Real> const& sim) -> void {
   auto const Nb   = sim.Nb;
   auto const Nbl  = sim.Nbl;
   auto const Nba  = sim.Nba;
-  auto const Nm   = sim.Nm;
   auto const Nr   = sim.Nr;
   auto const Ns   = sim.Ns;
   auto const Nt   = sim.Nt;
@@ -66,29 +65,29 @@ auto run(Simulation3D<Real> const& sim) -> void {
   auto const a2  = static_cast<Real>(sim.a2);
 
   auto Q_bna_buf     = sycl::buffer{sim.Q_bna, sycl::range(size_t(Nba))};
-  auto bn_mask_buf   = sycl::buffer{sim.bn_mask, sycl::range(size_t(Npts))};
-  auto adj_bn_buf    = sycl::buffer{sim.adj_bn, sycl::range(size_t(Nb))};
-  auto bn_ixyz_buf   = sycl::buffer{sim.bn_ixyz, sycl::range(size_t(Nb))};
-  auto bnl_ixyz_buf  = sycl::buffer{sim.bnl_ixyz, sycl::range(size_t(Nb))};
-  auto bna_ixyz_buf  = sycl::buffer{sim.bna_ixyz, sycl::range(size_t(Nba))};
-  auto in_ixyz_buf   = sycl::buffer{sim.in_ixyz, sycl::range(size_t(Ns))};
-  auto out_ixyz_buf  = sycl::buffer{sim.out_ixyz, sycl::range(size_t(Nr))};
-  auto in_sigs_buf   = sycl::buffer{sim.in_sigs, sycl::range(size_t(Ns * Nt))};
-  auto mat_beta_buf  = sycl::buffer{sim.mat_beta, sycl::range(size_t(Nm))};
-  auto mat_bnl_buf   = sycl::buffer{sim.mat_bnl, sycl::range(size_t(Nbl))};
-  auto mat_quads_buf = sycl::buffer{sim.mat_quads, sycl::range(size_t(Nm * MMb))};
-  auto Mb_buf        = sycl::buffer{sim.Mb, sycl::range(size_t(Nm))};
-  auto ssaf_bnl_buf  = sycl::buffer{sim.ssaf_bnl, sycl::range(size_t(Nbl))};
-
-  auto u0_buf    = sycl::buffer<Real>(size_t(Npts));
-  auto u1_buf    = sycl::buffer<Real>(size_t(Npts));
-  auto u0b_buf   = sycl::buffer<Real>(size_t(Nbl));
-  auto u1b_buf   = sycl::buffer<Real>(size_t(Nbl));
-  auto u2b_buf   = sycl::buffer<Real>(size_t(Nbl));
-  auto u2ba_buf  = sycl::buffer<Real>(size_t(Nba));
-  auto vh1_buf   = sycl::buffer<Real>(size_t(Nbl * MMb));
-  auto gh1_buf   = sycl::buffer<Real>(size_t(Nbl * MMb));
-  auto u_out_buf = sycl::buffer<Real>(size_t(Nr * Nt));
+  auto bn_mask_buf   = sycl::buffer{sim.bn_mask};
+  auto adj_bn_buf    = sycl::buffer{sim.adj_bn};
+  auto bn_ixyz_buf   = sycl::buffer{sim.bn_ixyz};
+  auto bnl_ixyz_buf  = sycl::buffer{sim.bnl_ixyz};
+  auto bna_ixyz_buf  = sycl::buffer{sim.bna_ixyz};
+  auto in_ixyz_buf   = sycl::buffer{sim.in_ixyz};
+  auto out_ixyz_buf  = sycl::buffer{sim.out_ixyz};
+  auto in_sigs_buf   = sycl::buffer{sim.in_sigs};
+  auto mat_beta_buf  = sycl::buffer{sim.mat_beta};
+  auto mat_bnl_buf   = sycl::buffer{sim.mat_bnl};
+  auto mat_quads_buf = sycl::buffer{sim.mat_quads};
+  auto Mb_buf        = sycl::buffer{sim.Mb};
+  auto ssaf_bnl_buf  = sycl::buffer{sim.ssaf_bnl};
+
+  auto u0_buf    = sycl::buffer<Real>(static_cast<size_t>(Npts));
+  auto u1_buf    = sycl::buffer<Real>(static_cast<size_t>(Npts));
+  auto u0b_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl));
+  auto u1b_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl));
+  auto u2b_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl));
+  auto u2ba_buf  = sycl::buffer<Real>(static_cast<size_t>(Nba));
+  auto vh1_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl * MMb));
+  auto gh1_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl * MMb));
+  auto u_out_buf = sycl::buffer<Real>(static_cast<size_t>(Nr * Nt));
 
   auto elapsedAir      = std::chrono::nanoseconds{0};
   auto elapsedBoundary = std::chrono::nanoseconds{0};

diff --git a/src/cpp/pffdtd/hdf.hpp b/src/cpp/pffdtd/hdf.hpp
@@ -79,35 +79,41 @@ struct H5FReader {
     auto set   = H5Dopen(_handle, dataset, H5P_DEFAULT);
     auto space = H5Dget_space(set);
 
-    auto ndims = 1;
+    // auto ndims = 1;
+    // PFFDTD_ASSERT(H5Sget_simple_extent_ndims(space) == ndims);
+
+    auto ndims = H5Sget_simple_extent_ndims(space);
     auto dims  = std::array<hsize_t, 3>{};
-    PFFDTD_ASSERT(H5Sget_simple_extent_ndims(space) == ndims);
     H5Sget_simple_extent_dims(space, dims.data(), nullptr);
 
     auto size = ndims == 1 ? dims[0] : dims[0] * dims[1];
 
-    if constexpr (std::is_same_v<T, uint8_t>) {
+    if constexpr (std::is_same_v<T, int64_t>) {
+      auto type = H5T_NATIVE_INT64;
+      auto buf  = std::vector<T>(size);
+      auto err  = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
+      checkErrorAndCloseDataset(dataset, set, err);
+      return buf;
+    } else if constexpr (std::is_same_v<T, uint8_t>) {
       auto type = H5T_NATIVE_UINT8;
       auto buf  = std::vector<T>(size);
       auto err  = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
       checkErrorAndCloseDataset(dataset, set, err);
       return buf;
-    }
-
-    if constexpr (std::is_same_v<T, int64_t>) {
-      auto type = H5T_NATIVE_INT64;
+    } else if constexpr (std::is_same_v<T, int8_t>) {
+      auto type = H5T_NATIVE_INT8;
       auto buf  = std::vector<T>(size);
       auto err  = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
       checkErrorAndCloseDataset(dataset, set, err);
       return buf;
-    }
-
-    if constexpr (std::is_same_v<T, double>) {
+    } else if constexpr (std::is_same_v<T, double>) {
       auto type = H5T_NATIVE_DOUBLE;
       auto buf  = std::vector<T>(size);
       auto err  = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
       checkErrorAndCloseDataset(dataset, set, err);
       return buf;
+    } else {
+      static_assert(always_false<T>);
     }
   }