Skip to content

Commit

Permalink
[cpp] Use std::vector for sim3d arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
tobiashienzsch committed Sep 24, 2024
1 parent 52946b4 commit 31da921
Show file tree
Hide file tree
Showing 6 changed files with 199 additions and 217 deletions.
32 changes: 16 additions & 16 deletions src/cpp/pffdtd/engine_cpu_3d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,24 +91,24 @@ auto run(Simulation3D<Real> const& sd) -> void {
int64_t const Nb = sd.Nb;
int64_t const Nbl = sd.Nbl;
int64_t const Nba = sd.Nba;
int8_t* Mb = sd.Mb;

// keep local copies of pointers (style choice)
int64_t* bn_ixyz = sd.bn_ixyz;
int64_t* bnl_ixyz = sd.bnl_ixyz;
int64_t* bna_ixyz = sd.bna_ixyz;
int64_t* in_ixyz = sd.in_ixyz;
int64_t* out_ixyz = sd.out_ixyz;
uint16_t* adj_bn = sd.adj_bn;
uint8_t* bn_mask = sd.bn_mask;
int8_t* mat_bnl = sd.mat_bnl;
int8_t* Q_bna = sd.Q_bna;
double* in_sigs = sd.in_sigs;
double* u_out = sd.u_out;
int8_t const fcc_flag = sd.fcc_flag;
Real* ssaf_bnl = sd.ssaf_bnl;
Real* mat_beta = sd.mat_beta;
MatQuad<Real>* mat_quads = sd.mat_quads;
int8_t const* Mb = sd.Mb.data();
int64_t const* bn_ixyz = sd.bn_ixyz.data();
int64_t const* bnl_ixyz = sd.bnl_ixyz.data();
int64_t const* bna_ixyz = sd.bna_ixyz.data();
int64_t const* in_ixyz = sd.in_ixyz.data();
int64_t const* out_ixyz = sd.out_ixyz.data();
uint16_t const* adj_bn = sd.adj_bn.data();
uint8_t const* bn_mask = sd.bn_mask.data();
int8_t const* mat_bnl = sd.mat_bnl.data();
int8_t* Q_bna = sd.Q_bna;
double const* in_sigs = sd.in_sigs.data();
int8_t const fcc_flag = sd.fcc_flag;
Real const* ssaf_bnl = sd.ssaf_bnl.data();
Real const* mat_beta = sd.mat_beta.data();
MatQuad<Real> const* mat_quads = sd.mat_quads.data();
double* u_out = sd.u_out.get();

// allocate memory
auto u0_buf = std::vector<Real>(static_cast<size_t>(Npts));
Expand Down
99 changes: 51 additions & 48 deletions src/cpp/pffdtd/engine_cuda_3d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,21 @@ __constant__ int8_t cuMb[MNm]; // to store Mb per mat
// this is data on host, sometimes copied and recomputed for copy to GPU devices
// (indices), sometimes just aliased pointers (scalar arrays)
template<typename Real>
struct HostData { // arrays on host (for copy), mirrors gpu local data
double* in_sigs{}; // aliased
Real* u_out_buf{}; // aliased
double* u_out{}; // aliased
Real* ssaf_bnl{}; // aliased
int64_t* in_ixyz{}; // recomputed
int64_t* out_ixyz{}; // recomputed
int64_t* bn_ixyz{}; // recomputed
int64_t* bnl_ixyz{}; // recomputed
int64_t* bna_ixyz{}; // recomputed
int8_t* Q_bna{}; // aliased
uint16_t* adj_bn{}; // aliased
int8_t* mat_bnl{}; // aliased
uint8_t* bn_mask{}; // recomputed
int8_t* K_bn{}; // aliased
struct HostData { // arrays on host (for copy), mirrors gpu local data
double const* in_sigs{}; // aliased
Real* u_out_buf{}; // aliased
double* u_out{}; // aliased
Real const* ssaf_bnl{}; // aliased
int64_t* in_ixyz{}; // recomputed
int64_t* out_ixyz{}; // recomputed
int64_t* bn_ixyz{}; // recomputed
int64_t* bnl_ixyz{}; // recomputed
int64_t* bna_ixyz{}; // recomputed
int8_t* Q_bna{}; // aliased
uint16_t const* adj_bn{}; // aliased
int8_t const* mat_bnl{}; // aliased
uint8_t* bn_mask{}; // recomputed
int8_t const* K_bn{}; // aliased
int64_t Ns{};
int64_t Nr{};
int64_t Npts{};
Expand Down Expand Up @@ -546,16 +546,16 @@ auto print_gpu_details(int i) -> uint64_t {
// input indices need to be sorted for multi-device allocation
template<typename Real>
void checkSorted(Simulation3D<Real> const& sim) {
int64_t* bn_ixyz = sim.bn_ixyz;
int64_t* bnl_ixyz = sim.bnl_ixyz;
int64_t* bna_ixyz = sim.bna_ixyz;
int64_t* in_ixyz = sim.in_ixyz;
int64_t* out_ixyz = sim.out_ixyz;
int64_t const Nb = sim.Nb;
int64_t const Nbl = sim.Nbl;
int64_t const Nba = sim.Nba;
int64_t const Ns = sim.Ns;
int64_t const Nr = sim.Nr;
int64_t const* bn_ixyz = sim.bn_ixyz.data();
int64_t const* bnl_ixyz = sim.bnl_ixyz.data();
int64_t const* bna_ixyz = sim.bna_ixyz.data();
int64_t const* in_ixyz = sim.in_ixyz.data();
int64_t const* out_ixyz = sim.out_ixyz.data();
int64_t const Nb = sim.Nb;
int64_t const Nbl = sim.Nbl;
int64_t const Nba = sim.Nba;
int64_t const Ns = sim.Ns;
int64_t const Nr = sim.Nr;
for (int64_t i = 1; i < Nb; i++) {
PFFDTD_ASSERT(bn_ixyz[i] > bn_ixyz[i - 1]); // check save_gpu_folder
}
Expand Down Expand Up @@ -624,8 +624,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
}

// bn_ixyz - Nb
int64_t* bn_ixyz = sim.bn_ixyz;
int64_t const Nb = sim.Nb;
int64_t const* bn_ixyz = sim.bn_ixyz.data();
int64_t const Nb = sim.Nb;
{
int gid = 0;
for (int64_t i = 0; i < Nb; i++) {
Expand All @@ -644,8 +644,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
PFFDTD_ASSERT(Nb_check == Nb);

// bnl_ixyz - Nbl
int64_t* bnl_ixyz = sim.bnl_ixyz;
int64_t const Nbl = sim.Nbl;
int64_t const* bnl_ixyz = sim.bnl_ixyz.data();
int64_t const Nbl = sim.Nbl;
{
int gid = 0;
for (int64_t i = 0; i < Nbl; i++) {
Expand All @@ -664,8 +664,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
PFFDTD_ASSERT(Nbl_check == Nbl);

// bna_ixyz - Nba
int64_t* bna_ixyz = sim.bna_ixyz;
int64_t const Nba = sim.Nba;
int64_t const* bna_ixyz = sim.bna_ixyz.data();
int64_t const Nba = sim.Nba;
{
int gid = 0;
for (int64_t i = 0; i < Nba; i++) {
Expand All @@ -684,8 +684,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
PFFDTD_ASSERT(Nba_check == Nba);

// in_ixyz - Ns
int64_t* in_ixyz = sim.in_ixyz;
int64_t const Ns = sim.Ns;
int64_t const* in_ixyz = sim.in_ixyz.data();
int64_t const Ns = sim.Ns;
{
int gid = 0;
for (int64_t i = 0; i < Ns; i++) {
Expand All @@ -704,8 +704,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
PFFDTD_ASSERT(Ns_check == Ns);

// out_ixyz - Nr
int64_t* out_ixyz = sim.out_ixyz;
int64_t const Nr = sim.Nr;
int64_t const* out_ixyz = sim.out_ixyz.data();
int64_t const Nr = sim.Nr;
{
int gid = 0;
for (int64_t i = 0; i < Nr; i++) {
Expand Down Expand Up @@ -829,13 +829,13 @@ auto run(Simulation3D<Real> const& sim) -> void {
std::printf("Nx=%ld Ns=%ld Nr=%ld Nb=%ld, Npts=%ld\n", host.Nx, host.Ns, host.Nr, host.Nb, host.Npts);

// aliased pointers (to memory already allocated)
host.in_sigs = sim.in_sigs + Ns_read * sim.Nt;
host.ssaf_bnl = sim.ssaf_bnl + Nbl_read;
host.adj_bn = sim.adj_bn + Nb_read;
host.mat_bnl = sim.mat_bnl + Nbl_read;
host.K_bn = sim.K_bn + Nb_read;
host.in_sigs = sim.in_sigs.data() + Ns_read * sim.Nt;
host.ssaf_bnl = sim.ssaf_bnl.data() + Nbl_read;
host.adj_bn = sim.adj_bn.data() + Nb_read;
host.mat_bnl = sim.mat_bnl.data() + Nbl_read;
host.K_bn = sim.K_bn.data() + Nb_read;
host.Q_bna = sim.Q_bna + Nba_read;
host.u_out = sim.u_out + Nr_read * sim.Nt;
host.u_out = sim.u_out.get() + Nr_read * sim.Nt;
host.u_out_buf = u_out_buf + Nr_read;

// recalculate indices, these are associated host versions to copy over to devices
Expand Down Expand Up @@ -941,12 +941,15 @@ auto run(Simulation3D<Real> const& sim) -> void {
gpuErrchk(cudaMemcpy(gpu.mat_bnl, host.mat_bnl, (size_t)host.Nbl * sizeof(int8_t), cudaMemcpyHostToDevice));

gpuErrchk(cudaMalloc(&(gpu.mat_beta), (size_t)sim.Nm * sizeof(Real)));
gpuErrchk(cudaMemcpy(gpu.mat_beta, sim.mat_beta, (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu.mat_beta, sim.mat_beta.data(), (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));

gpuErrchk(cudaMalloc(&(gpu.mat_quads), (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>)));
gpuErrchk(
cudaMemcpy(gpu.mat_quads, sim.mat_quads, (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>), cudaMemcpyHostToDevice)
);
gpuErrchk(cudaMemcpy(
gpu.mat_quads,
sim.mat_quads.data(),
(size_t)sim.Nm * MMb * sizeof(MatQuad<Real>),
cudaMemcpyHostToDevice
));

gpuErrchk(cudaMalloc(&(gpu.bn_mask), (size_t)(host.Nbm * sizeof(uint8_t))));
gpuErrchk(cudaMemcpy(gpu.bn_mask, host.bn_mask, (size_t)host.Nbm * sizeof(uint8_t), cudaMemcpyHostToDevice));
Expand All @@ -969,15 +972,15 @@ auto run(Simulation3D<Real> const& sim) -> void {
std::printf("\n");

// swapping x and z here (CUDA has first dim contiguous)
// same for all devices
gpuErrchk(cudaMemcpyToSymbol(cuNx, &(sim.Nz), sizeof(int64_t)));
gpuErrchk(cudaMemcpyToSymbol(cuNy, &(sim.Ny), sizeof(int64_t)));
gpuErrchk(cudaMemcpyToSymbol(cuNz, &(host.Nxh), sizeof(int64_t)));
gpuErrchk(cudaMemcpyToSymbol(cuNb, &(host.Nb), sizeof(int64_t)));
gpuErrchk(cudaMemcpyToSymbol(cuNbl, &(host.Nbl), sizeof(int64_t)));
gpuErrchk(cudaMemcpyToSymbol(cuNba, &(host.Nba), sizeof(int64_t)));
gpuErrchk(cudaMemcpyToSymbol(cuMb, sim.Mb, sim.Nm * sizeof(int8_t)));
gpuErrchk(cudaMemcpyToSymbol(cuNxNy, &Nzy,
sizeof(int64_t))); // same for all devices
gpuErrchk(cudaMemcpyToSymbol(cuMb, sim.Mb.data(), sim.Nm * sizeof(int8_t)));
gpuErrchk(cudaMemcpyToSymbol(cuNxNy, &Nzy, sizeof(int64_t)));

if constexpr (std::is_same_v<Real, float>) {
gpuErrchk(cudaMemcpyToSymbol(c1_f32, &a1, sizeof(float)));
Expand Down
47 changes: 23 additions & 24 deletions src/cpp/pffdtd/engine_sycl_3d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ auto run(Simulation3D<Real> const& sim) -> void {
auto const Nb = sim.Nb;
auto const Nbl = sim.Nbl;
auto const Nba = sim.Nba;
auto const Nm = sim.Nm;
auto const Nr = sim.Nr;
auto const Ns = sim.Ns;
auto const Nt = sim.Nt;
Expand All @@ -66,29 +65,29 @@ auto run(Simulation3D<Real> const& sim) -> void {
auto const a2 = static_cast<Real>(sim.a2);

auto Q_bna_buf = sycl::buffer{sim.Q_bna, sycl::range(size_t(Nba))};
auto bn_mask_buf = sycl::buffer{sim.bn_mask, sycl::range(size_t(Npts))};
auto adj_bn_buf = sycl::buffer{sim.adj_bn, sycl::range(size_t(Nb))};
auto bn_ixyz_buf = sycl::buffer{sim.bn_ixyz, sycl::range(size_t(Nb))};
auto bnl_ixyz_buf = sycl::buffer{sim.bnl_ixyz, sycl::range(size_t(Nb))};
auto bna_ixyz_buf = sycl::buffer{sim.bna_ixyz, sycl::range(size_t(Nba))};
auto in_ixyz_buf = sycl::buffer{sim.in_ixyz, sycl::range(size_t(Ns))};
auto out_ixyz_buf = sycl::buffer{sim.out_ixyz, sycl::range(size_t(Nr))};
auto in_sigs_buf = sycl::buffer{sim.in_sigs, sycl::range(size_t(Ns * Nt))};
auto mat_beta_buf = sycl::buffer{sim.mat_beta, sycl::range(size_t(Nm))};
auto mat_bnl_buf = sycl::buffer{sim.mat_bnl, sycl::range(size_t(Nbl))};
auto mat_quads_buf = sycl::buffer{sim.mat_quads, sycl::range(size_t(Nm * MMb))};
auto Mb_buf = sycl::buffer{sim.Mb, sycl::range(size_t(Nm))};
auto ssaf_bnl_buf = sycl::buffer{sim.ssaf_bnl, sycl::range(size_t(Nbl))};

auto u0_buf = sycl::buffer<Real>(size_t(Npts));
auto u1_buf = sycl::buffer<Real>(size_t(Npts));
auto u0b_buf = sycl::buffer<Real>(size_t(Nbl));
auto u1b_buf = sycl::buffer<Real>(size_t(Nbl));
auto u2b_buf = sycl::buffer<Real>(size_t(Nbl));
auto u2ba_buf = sycl::buffer<Real>(size_t(Nba));
auto vh1_buf = sycl::buffer<Real>(size_t(Nbl * MMb));
auto gh1_buf = sycl::buffer<Real>(size_t(Nbl * MMb));
auto u_out_buf = sycl::buffer<Real>(size_t(Nr * Nt));
auto bn_mask_buf = sycl::buffer{sim.bn_mask};
auto adj_bn_buf = sycl::buffer{sim.adj_bn};
auto bn_ixyz_buf = sycl::buffer{sim.bn_ixyz};
auto bnl_ixyz_buf = sycl::buffer{sim.bnl_ixyz};
auto bna_ixyz_buf = sycl::buffer{sim.bna_ixyz};
auto in_ixyz_buf = sycl::buffer{sim.in_ixyz};
auto out_ixyz_buf = sycl::buffer{sim.out_ixyz};
auto in_sigs_buf = sycl::buffer{sim.in_sigs};
auto mat_beta_buf = sycl::buffer{sim.mat_beta};
auto mat_bnl_buf = sycl::buffer{sim.mat_bnl};
auto mat_quads_buf = sycl::buffer{sim.mat_quads};
auto Mb_buf = sycl::buffer{sim.Mb};
auto ssaf_bnl_buf = sycl::buffer{sim.ssaf_bnl};

auto u0_buf = sycl::buffer<Real>(static_cast<size_t>(Npts));
auto u1_buf = sycl::buffer<Real>(static_cast<size_t>(Npts));
auto u0b_buf = sycl::buffer<Real>(static_cast<size_t>(Nbl));
auto u1b_buf = sycl::buffer<Real>(static_cast<size_t>(Nbl));
auto u2b_buf = sycl::buffer<Real>(static_cast<size_t>(Nbl));
auto u2ba_buf = sycl::buffer<Real>(static_cast<size_t>(Nba));
auto vh1_buf = sycl::buffer<Real>(static_cast<size_t>(Nbl * MMb));
auto gh1_buf = sycl::buffer<Real>(static_cast<size_t>(Nbl * MMb));
auto u_out_buf = sycl::buffer<Real>(static_cast<size_t>(Nr * Nt));

auto elapsedAir = std::chrono::nanoseconds{0};
auto elapsedBoundary = std::chrono::nanoseconds{0};
Expand Down
26 changes: 16 additions & 10 deletions src/cpp/pffdtd/hdf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,35 +79,41 @@ struct H5FReader {
auto set = H5Dopen(_handle, dataset, H5P_DEFAULT);
auto space = H5Dget_space(set);

auto ndims = 1;
// auto ndims = 1;
// PFFDTD_ASSERT(H5Sget_simple_extent_ndims(space) == ndims);

auto ndims = H5Sget_simple_extent_ndims(space);
auto dims = std::array<hsize_t, 3>{};
PFFDTD_ASSERT(H5Sget_simple_extent_ndims(space) == ndims);
H5Sget_simple_extent_dims(space, dims.data(), nullptr);

auto size = ndims == 1 ? dims[0] : dims[0] * dims[1];

if constexpr (std::is_same_v<T, uint8_t>) {
if constexpr (std::is_same_v<T, int64_t>) {
auto type = H5T_NATIVE_INT64;
auto buf = std::vector<T>(size);
auto err = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
checkErrorAndCloseDataset(dataset, set, err);
return buf;
} else if constexpr (std::is_same_v<T, uint8_t>) {
auto type = H5T_NATIVE_UINT8;
auto buf = std::vector<T>(size);
auto err = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
checkErrorAndCloseDataset(dataset, set, err);
return buf;
}

if constexpr (std::is_same_v<T, int64_t>) {
auto type = H5T_NATIVE_INT64;
} else if constexpr (std::is_same_v<T, int8_t>) {
auto type = H5T_NATIVE_INT8;
auto buf = std::vector<T>(size);
auto err = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
checkErrorAndCloseDataset(dataset, set, err);
return buf;
}

if constexpr (std::is_same_v<T, double>) {
} else if constexpr (std::is_same_v<T, double>) {
auto type = H5T_NATIVE_DOUBLE;
auto buf = std::vector<T>(size);
auto err = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
checkErrorAndCloseDataset(dataset, set, err);
return buf;
} else {
static_assert(always_false<T>);
}
}

Expand Down
Loading

0 comments on commit 31da921

Please sign in to comment.