Skip to content

Commit f1be291

Browse files
authored
Merge pull request #406 from STEllAR-GROUP/fix_kokkos_reconstruct_tiling
Fix Kokkos reconstruct tiling and add no-amc optimization
2 parents d734bb7 + 5b75fc2 commit f1be291

File tree

1 file changed

+60
-21
lines changed

1 file changed

+60
-21
lines changed

octotiger/unitiger/hydro_impl/hydro_kokkos_kernel.hpp

Lines changed: 60 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ void flux_impl_teamless(hpx::kokkos::executor<kokkos_backend_t>& executor,
5151
// Supported team_sizes need to be a power of two! Team size of 1 is a special case for usage
5252
// with the serial kokkos backend:
5353
assert((team_size == 1));
54-
auto policy = Kokkos::Experimental::require(Kokkos::RangePolicy<decltype(executor.instance())>(
55-
executor.instance(), 0, number_blocks),
54+
auto policy = Kokkos::Experimental::require(
55+
Kokkos::RangePolicy<decltype(executor.instance())>(executor.instance(), 0, number_blocks),
5656
Kokkos::Experimental::WorkItemProperty::HintLightWeight);
5757

5858
// Start kernel using policy (and through it the passed executor):
@@ -324,6 +324,7 @@ void flux_impl(hpx::kokkos::executor<kokkos_backend_t>& executor, const kokkos_b
324324
});
325325
}
326326

327+
/// Reconstruct kernel launcher usable with or without am (angular momentum) correction
327328
template <typename kokkos_backend_t, typename kokkos_buffer_t, typename kokkos_int_buffer_t>
328329
void reconstruct_impl(hpx::kokkos::executor<kokkos_backend_t>& executor, const double omega,
329330
const int nf_, const int angmom_index_, const kokkos_int_buffer_t& smooth_field_,
@@ -361,6 +362,38 @@ void reconstruct_impl(hpx::kokkos::executor<kokkos_backend_t>& executor, const d
361362
});
362363
}
363364

365+
/// Reconstruct kernel launcher optimized for runs without am correction
/// (am presumably stands for angular momentum -- TODO confirm).
///
/// Launches a rank-3 Kokkos::MDRangePolicy over {blocks, 8, 8} on the executor's instance,
/// using the caller-supplied tiling_config and the HintLightWeight work-item property.
/// Each work item (idx, idy, idz) handles the flat cell index q_i = idx * 64 + idy * 8 + idz:
/// it first calls cell_reconstruct_inner_loop_p1 for every direction d in [0, ndir) and then
/// cell_reconstruct_inner_loop_p2 for every direction d in [0, ndir).
/// All arguments other than executor and tiling_config are forwarded to those two helpers.
366+
template <typename kokkos_backend_t, typename kokkos_buffer_t, typename kokkos_int_buffer_t>
367+
void reconstruct_no_amc_impl(hpx::kokkos::executor<kokkos_backend_t>& executor, const double omega,
368+
const int nf_, const int angmom_index_, const kokkos_int_buffer_t& smooth_field_,
369+
const kokkos_int_buffer_t& disc_detect_, kokkos_buffer_t& combined_q,
370+
const kokkos_buffer_t& combined_x, kokkos_buffer_t& combined_u, kokkos_buffer_t& AM,
371+
const double dx, const kokkos_buffer_t& cdiscs, const int n_species_, const int ndir,
372+
const int nangmom, const Kokkos::Array<long, 3>&& tiling_config) {
373+
// Number of 64-item blocks (8 x 8 work items each) used to cover q_inx3 cells. One block is
// always added, even if q_inx3 is a multiple of 64; surplus work items are masked out below.
const int blocks = q_inx3 / 64 + 1;
374+
auto policy = Kokkos::Experimental::require(
375+
Kokkos::MDRangePolicy<decltype(executor.instance()), Kokkos::Rank<3>>(
376+
executor.instance(), {0, 0, 0}, {blocks, 8, 8}, tiling_config),
377+
Kokkos::Experimental::WorkItemProperty::HintLightWeight);
378+
Kokkos::parallel_for(
379+
"kernel hydro reconstruct", policy, KOKKOS_LAMBDA(int idx, int idy, int idz) {
380+
// Flatten the 3D work-item coordinates into a single cell index.
const int q_i = (idx) *64 + (idy) *8 + (idz);
381+
// Matching index into the inx_large^3 layout: q_i is split into three coordinates
// (assuming q_inx2 == q_inx * q_inx -- TODO confirm) and each one is shifted by 2.
const int i = ((q_i / q_inx2) + 2) * inx_large * inx_large +
382+
(((q_i % q_inx2) / q_inx) + 2) * inx_large + (((q_i % q_inx2) % q_inx) + 2);
383+
// Skip the padding work items introduced by rounding the block count up.
if (q_i < q_inx3) {
384+
// Phase 1
for (int d = 0; d < ndir; d++) {
385+
cell_reconstruct_inner_loop_p1(nf_, angmom_index_, smooth_field_, disc_detect_,
386+
combined_q, combined_u, AM, dx, cdiscs, d, i, q_i, ndir, nangmom);
387+
}
388+
// Phase 2
389+
for (int d = 0; d < ndir; d++) {
390+
cell_reconstruct_inner_loop_p2(omega, angmom_index_, combined_q, combined_x,
391+
combined_u, AM, dx, d, i, q_i, ndir, nangmom, n_species_);
392+
}
393+
}
394+
});
395+
}
396+
364397
template <typename kokkos_backend_t, typename kokkos_buffer_t>
365398
void hydro_pre_recon_impl(hpx::kokkos::executor<kokkos_backend_t>& executor,
366399
const kokkos_buffer_t& large_x, const double omega, const bool angmom, kokkos_buffer_t& u,
@@ -431,8 +464,14 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
431464
Kokkos::deep_copy(exec.instance(), device_smooth_field, smooth_field);
432465
device_buffer<double> q(nf * 27 * q_inx3 + padding);
433466
device_buffer<double> AM(NDIM * q_inx3 + padding);
434-
reconstruct_impl(exec, omega, nf, angmom_index, device_smooth_field, device_disc_detect, q, x,
435-
u, AM, dx, disc, n_species, ndir, nangmom, {1, 8, 8});
467+
468+
if (angmom_index > -1) {
469+
reconstruct_impl(exec, omega, nf, angmom_index, device_smooth_field, device_disc_detect, q,
470+
x, u, AM, dx, disc, n_species, ndir, nangmom, {1, 8, 8});
471+
} else {
472+
reconstruct_no_amc_impl(exec, omega, nf, angmom_index, device_smooth_field,
473+
device_disc_detect, q, x, u, AM, dx, disc, n_species, ndir, nangmom, {1, 8, 8});
474+
}
436475

437476
// Flux
438477
const device_buffer<bool>& masks =
@@ -461,7 +500,7 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
461500
current_max_slot = dim_i;
462501
} else if (host_amax[dim_i] == host_amax[current_max_slot]) {
463502
if (host_amax_indices[dim_i] < host_amax_indices[current_max_slot])
464-
current_max_slot = dim_i;
503+
current_max_slot = dim_i;
465504
}
466505
}
467506

@@ -486,13 +525,13 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
486525
ts.ul = std::move(URs);
487526
ts.ur = std::move(ULs);
488527
ts.dim = current_dim;
489-
/* int ix = current_max_index / (10 * 10);
490-
int iy = (current_max_index % (10 * 10)) / 10;
491-
int iz = (current_max_index % (10 * 10)) % 10;
492-
std::cout << "xzy" << ix << " " << iy << " " << iz << std::endl;
493-
std::cout << "kokkos_cuda Max index: " << current_max_index << " Max dim: " << current_dim <<
494-
std::endl;
495-
std::cout << ts.x << " " << ts.y << " " << ts.z << std::endl;*/
528+
/* int ix = current_max_index / (10 * 10);
529+
int iy = (current_max_index % (10 * 10)) / 10;
530+
int iz = (current_max_index % (10 * 10)) % 10;
531+
std::cout << "xzy" << ix << " " << iy << " " << iz << std::endl;
532+
std::cout << "kokkos_cuda Max index: " << current_max_index << " Max dim: " << current_dim <<
533+
std::endl;
534+
std::cout << ts.x << " " << ts.y << " " << ts.z << std::endl;*/
496535
return ts;
497536
}
498537

@@ -540,7 +579,7 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
540579
current_max_slot = dim_i;
541580
} else if (amax[dim_i] == amax[current_max_slot]) {
542581
if (amax_indices[dim_i] < amax_indices[current_max_slot])
543-
current_max_slot = dim_i;
582+
current_max_slot = dim_i;
544583
}
545584
}
546585

@@ -564,14 +603,14 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
564603
ts.ul = std::move(URs);
565604
ts.ur = std::move(ULs);
566605
ts.dim = current_dim;
567-
int x = current_max_index / (10 * 10);
568-
int y = (current_max_index % (10 * 10)) / 10;
569-
int z = (current_max_index % (10 * 10)) % 10;
570-
/*std::cout << "xzy" << x << " " << y << " " << z << std::endl;
571-
std::cout << "Max index: " << current_max_index << " Max dim: " << current_dim <<
572-
std::endl;
573-
std::cout << ts.x << " " << ts.y << " " << ts.z << std::endl;*/
574-
// std::cin.get();
606+
int x = current_max_index / (10 * 10);
607+
int y = (current_max_index % (10 * 10)) / 10;
608+
int z = (current_max_index % (10 * 10)) % 10;
609+
/*std::cout << "xzy" << x << " " << y << " " << z << std::endl;
610+
std::cout << "Max index: " << current_max_index << " Max dim: " << current_dim <<
611+
std::endl;
612+
std::cout << ts.x << " " << ts.y << " " << ts.z << std::endl;*/
613+
// std::cin.get();
575614
// std::cout << ts.a << std::endl;
576615
return ts;
577616
}

0 commit comments

Comments
 (0)