@@ -51,8 +51,8 @@ void flux_impl_teamless(hpx::kokkos::executor<kokkos_backend_t>& executor,
51
51
// Supported team sizes need to be a power of two! Team size of 1 is a special case for usage
52
52
// with the serial kokkos backend:
53
53
assert ((team_size == 1 ));
54
- auto policy = Kokkos::Experimental::require (Kokkos::RangePolicy< decltype (executor. instance ())>(
55
- executor.instance (), 0 , number_blocks),
54
+ auto policy = Kokkos::Experimental::require (
55
+ Kokkos::RangePolicy< decltype (executor. instance ())>( executor.instance (), 0 , number_blocks),
56
56
Kokkos::Experimental::WorkItemProperty::HintLightWeight);
57
57
58
58
// Start kernel using policy (and through it the passed executor):
@@ -324,6 +324,7 @@ void flux_impl(hpx::kokkos::executor<kokkos_backend_t>& executor, const kokkos_b
324
324
});
325
325
}
326
326
327
+ /// Reconstruct with or without am (angular momentum) correction
327
328
template <typename kokkos_backend_t , typename kokkos_buffer_t , typename kokkos_int_buffer_t >
328
329
void reconstruct_impl (hpx::kokkos::executor<kokkos_backend_t >& executor, const double omega,
329
330
const int nf_, const int angmom_index_, const kokkos_int_buffer_t & smooth_field_,
@@ -361,6 +362,38 @@ void reconstruct_impl(hpx::kokkos::executor<kokkos_backend_t>& executor, const d
361
362
});
362
363
}
363
364
365
+ /// Optimized for reconstruct without am (angular momentum) correction.
// Launches a rank-3 Kokkos MDRange kernel over {blocks, 8, 8} on the executor's instance.
// Each work item maps its (idx, idy, idz) triple to one flat cell index q_i and, if that index
// is in range, runs reconstruction phase 1 for every direction d in [0, ndir) followed by
// phase 2 for every direction.
// In the visible caller this variant is chosen when angmom_index is not greater than -1.
// Arguments are forwarded unchanged to cell_reconstruct_inner_loop_p1 / _p2 (see the calls
// below); tiling_config is passed to the MDRangePolicy constructor.
// NOTE(review): angmom_index_, AM and nangmom are still forwarded to both helpers here; how
// this differs from reconstruct_impl is not visible from this chunk -- confirm.
366
+ template <typename kokkos_backend_t , typename kokkos_buffer_t , typename kokkos_int_buffer_t >
367
+ void reconstruct_no_amc_impl (hpx::kokkos::executor<kokkos_backend_t >& executor, const double omega,
368
+ const int nf_, const int angmom_index_, const kokkos_int_buffer_t & smooth_field_,
369
+ const kokkos_int_buffer_t & disc_detect_, kokkos_buffer_t & combined_q,
370
+ const kokkos_buffer_t & combined_x, kokkos_buffer_t & combined_u, kokkos_buffer_t & AM,
371
+ const double dx, const kokkos_buffer_t & cdiscs, const int n_species_, const int ndir,
372
+ const int nangmom, const Kokkos::Array<long , 3 >&& tiling_config) {
373
// Number of 64-cell chunks needed to cover q_inx3 flat indices. The "+ 1" always adds a
// chunk (even when q_inx3 is a multiple of 64); surplus indices are filtered in the kernel.
+ const int blocks = q_inx3 / 64 + 1 ;
374
// 3D iteration space {0,0,0}..{blocks,8,8} with the caller-provided tiling, tagged with the
// HintLightWeight work-item property.
+ auto policy = Kokkos::Experimental::require (
375
+ Kokkos::MDRangePolicy<decltype (executor.instance ()), Kokkos::Rank<3 >>(
376
+ executor.instance (), {0 , 0 , 0 }, {blocks, 8 , 8 }, tiling_config),
377
+ Kokkos::Experimental::WorkItemProperty::HintLightWeight);
378
+ Kokkos::parallel_for (
379
+ " kernel hydro reconstruct" , policy, KOKKOS_LAMBDA (int idx, int idy, int idz) {
380
// Flatten the 3D work-item id: 64 indices per idx, 8 per idy.
+ const int q_i = (idx) *64 + (idy) *8 + (idz);
381
// Split q_i into three coordinates via q_inx2 and q_inx, shift each by 2, and
// re-linearize with stride inx_large.
// (presumably q_inx2 == q_inx * q_inx and the +2 skips a ghost layer -- TODO confirm)
+ const int i = ((q_i / q_inx2) + 2 ) * inx_large * inx_large +
382
+ (((q_i % q_inx2) / q_inx) + 2 ) * inx_large + (((q_i % q_inx2) % q_inx) + 2 );
383
// Guard against the surplus indices introduced by rounding blocks up.
+ if (q_i < q_inx3) {
384
// Phase 1: run for all directions before phase 2 starts for this cell.
+ for (int d = 0 ; d < ndir; d++) {
385
+ cell_reconstruct_inner_loop_p1 (nf_, angmom_index_, smooth_field_, disc_detect_,
386
+ combined_q, combined_u, AM, dx, cdiscs, d, i, q_i, ndir, nangmom);
387
+ }
388
+ // Phase 2: second pass over all directions for the same cell.
389
+ for (int d = 0 ; d < ndir; d++) {
390
+ cell_reconstruct_inner_loop_p2 (omega, angmom_index_, combined_q, combined_x,
391
+ combined_u, AM, dx, d, i, q_i, ndir, nangmom, n_species_);
392
+ }
393
+ }
394
+ });
395
+ }
396
+
364
397
template <typename kokkos_backend_t , typename kokkos_buffer_t >
365
398
void hydro_pre_recon_impl (hpx::kokkos::executor<kokkos_backend_t >& executor,
366
399
const kokkos_buffer_t & large_x, const double omega, const bool angmom, kokkos_buffer_t & u,
@@ -431,8 +464,14 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
431
464
Kokkos::deep_copy (exec.instance (), device_smooth_field, smooth_field);
432
465
device_buffer<double > q (nf * 27 * q_inx3 + padding);
433
466
device_buffer<double > AM (NDIM * q_inx3 + padding);
434
- reconstruct_impl (exec, omega, nf, angmom_index, device_smooth_field, device_disc_detect, q, x,
435
- u, AM, dx, disc, n_species, ndir, nangmom, {1 , 8 , 8 });
467
+
468
+ if (angmom_index > -1 ) {
469
+ reconstruct_impl (exec, omega, nf, angmom_index, device_smooth_field, device_disc_detect, q,
470
+ x, u, AM, dx, disc, n_species, ndir, nangmom, {1 , 8 , 8 });
471
+ } else {
472
+ reconstruct_no_amc_impl (exec, omega, nf, angmom_index, device_smooth_field,
473
+ device_disc_detect, q, x, u, AM, dx, disc, n_species, ndir, nangmom, {1 , 8 , 8 });
474
+ }
436
475
437
476
// Flux
438
477
const device_buffer<bool >& masks =
@@ -461,7 +500,7 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
461
500
current_max_slot = dim_i;
462
501
} else if (host_amax[dim_i] == host_amax[current_max_slot]) {
463
502
if (host_amax_indices[dim_i] < host_amax_indices[current_max_slot])
464
- current_max_slot = dim_i;
503
+ current_max_slot = dim_i;
465
504
}
466
505
}
467
506
@@ -486,13 +525,13 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
486
525
ts.ul = std::move (URs);
487
526
ts.ur = std::move (ULs);
488
527
ts.dim = current_dim;
489
- /* int ix = current_max_index / (10 * 10);
490
- int iy = (current_max_index % (10 * 10)) / 10;
491
- int iz = (current_max_index % (10 * 10)) % 10;
492
- std::cout << "xzy" << ix << " " << iy << " " << iz << std::endl;
493
- std::cout << "kokkos_cuda Max index: " << current_max_index << " Max dim: " << current_dim <<
494
- std::endl;
495
- std::cout << ts.x << " " << ts.y << " " << ts.z << std::endl;*/
528
+ /* int ix = current_max_index / (10 * 10);
529
+ int iy = (current_max_index % (10 * 10)) / 10;
530
+ int iz = (current_max_index % (10 * 10)) % 10;
531
+ std::cout << "xzy" << ix << " " << iy << " " << iz << std::endl;
532
+ std::cout << "kokkos_cuda Max index: " << current_max_index << " Max dim: " << current_dim <<
533
+ std::endl;
534
+ std::cout << ts.x << " " << ts.y << " " << ts.z << std::endl;*/
496
535
return ts;
497
536
}
498
537
@@ -540,7 +579,7 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
540
579
current_max_slot = dim_i;
541
580
} else if (amax[dim_i] == amax[current_max_slot]) {
542
581
if (amax_indices[dim_i] < amax_indices[current_max_slot])
543
- current_max_slot = dim_i;
582
+ current_max_slot = dim_i;
544
583
}
545
584
}
546
585
@@ -564,14 +603,14 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer<dou
564
603
ts.ul = std::move (URs);
565
604
ts.ur = std::move (ULs);
566
605
ts.dim = current_dim;
567
- int x = current_max_index / (10 * 10 );
568
- int y = (current_max_index % (10 * 10 )) / 10 ;
569
- int z = (current_max_index % (10 * 10 )) % 10 ;
570
- /* std::cout << "xzy" << x << " " << y << " " << z << std::endl;
571
- std::cout << "Max index: " << current_max_index << " Max dim: " << current_dim <<
572
- std::endl;
573
- std::cout << ts.x << " " << ts.y << " " << ts.z << std::endl;*/
574
- // std::cin.get();
606
+ int x = current_max_index / (10 * 10 );
607
+ int y = (current_max_index % (10 * 10 )) / 10 ;
608
+ int z = (current_max_index % (10 * 10 )) % 10 ;
609
+ /* std::cout << "xzy" << x << " " << y << " " << z << std::endl;
610
+ std::cout << "Max index: " << current_max_index << " Max dim: " << current_dim <<
611
+ std::endl;
612
+ std::cout << ts.x << " " << ts.y << " " << ts.z << std::endl;*/
613
+ // std::cin.get();
575
614
// std::cout << ts.a << std::endl;
576
615
return ts;
577
616
}
0 commit comments