diff --git a/docs/documentation/expectedPerformance.md b/docs/documentation/expectedPerformance.md index 3c3501b72b..d0dca521e0 100644 --- a/docs/documentation/expectedPerformance.md +++ b/docs/documentation/expectedPerformance.md @@ -5,21 +5,22 @@ This page shows a summary of these results. ## Expected time-steps/hour -The following table outlines observed performance as nanoseconds per grid point (ns/GP) per right-hand side evaluation (lower is better). +The following table outlines observed performance as nanoseconds per grid point (ns/GP) per equation (eq) per right-hand side (rhs) evaluation (lower is better). We solve an example 3D, inviscid, 5-equation model problem with two advected species (a total of 8 PDEs). The numerics are WENO5 and the HLLC approximate Riemann solver. +This case is located in `examples/3D_performance_test`. We report results for various numbers of grid points per CPU die (or GPU device) and hardware. | Hardware | | 1M GPs | 4M GPs | 8M GPs | Compiler | Computer | | ---: | :----: | :----: | :---: | :---: | :----: | :--- | -| NVIDIA V100 | 1 device | 96 | 104 | 104 | NVHPC 22.11 | PACE Phoenix | -| NVIDIA V100 | 1 device | 101 | 104 | 104 | NVHPC 22.11 | OLCF Summit | -| NVIDIA A100 | 1 device | 71 | 56 | 59 | NVHPC 23.5 | Wingtip | -| AMD MI250X | 1 GCD | 108 | 90 | 96 | CCE 16.0.1 | OLCF Frontier | -| Intel Xeon Gold 6226 | 12 cores | 1963 | 1688 | 1686 | GNU 10.3.0 | PACE Phoenix | -| Apple M2 | 6 cores | 2919 | 245 | 4500 | GNU 13.2.0 | N/A | - -__All results are in nanoseconds (ns) per grid point (gp) per right-hand side (rhs) evaluation. 
Lower is better.__ +| NVIDIA V100 | 1 device | 12.0 | 13.0 | 13.0 | NVHPC 22.11 | PACE Phoenix | +| NVIDIA V100 | 1 device | 12.6 | 13.0 | 13.0 | NVHPC 22.11 | OLCF Summit | +| NVIDIA A100 | 1 device | 8.9 | 7.0 | 7.4 | NVHPC 23.5 | Wingtip | +| AMD MI250X | 1 GCD | 13.5 | 11.3 | 12.0 | CCE 16.0.1 | OLCF Frontier | +| Intel Xeon Gold 6226 | 12 cores | 245 | 211 | 211 | GNU 10.3.0 | PACE Phoenix | +| Apple M2 | 6 cores | 365 | 306 | 563 | GNU 13.2.0 | N/A | + +__All results are in nanoseconds (ns) per grid point (gp) per equation (eq) per right-hand side (rhs) evaluation, so X ns/gp/eq/rhs. Lower is better.__ ## Weak scaling diff --git a/examples/3D_performance_test/case.py b/examples/3D_performance_test/case.py new file mode 100644 index 0000000000..1af220748e --- /dev/null +++ b/examples/3D_performance_test/case.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +import json + +# Configuring case dictionary +print(json.dumps({ + # Logistics ================================================================ + 'run_time_info' : 'T', + # ========================================================================== + + # Computational Domain Parameters ========================================== + 'x_domain%beg' : 0.E+00, + 'x_domain%end' : 4.E-03/1.E-03, + 'y_domain%beg' : 0.E+00, + 'y_domain%end' : 4.E-03/1.E-03, + 'z_domain%beg' : 0.E+00, + 'z_domain%end' : 4.E-03/1.E-03, + 'stretch_x' : 'T', + 'a_x' : 4.E+00, + 'x_a' : -1.5E-03/1.E-03, + 'x_b' : 1.5E-03/1.E-03, + 'stretch_y' : 'T', + 'a_y' : 4.E+00, + 'y_a' : -1.5E-03/1.E-03, + 'y_b' : 1.5E-03/1.E-03, + 'stretch_z' : 'T', + 'a_z' : 4.E+00, + 'z_a' : -1.5E-03/1.E-03, + 'z_b' : 1.5E-03/1.E-03, + 'cyl_coord' : 'F', + 'm' : 200, + 'n' : 200, + 'p' : 200, + 'dt' : 0.2E-09/1.E-03, + 't_step_start' : 0, + 't_step_stop' : 30, + 't_step_save' : 30, + # ========================================================================== + + # Simulation Algorithm Parameters ========================================== + 'num_patches' : 2, + 
'model_eqns' : 2, + 'alt_soundspeed' : 'F', + 'num_fluids' : 2, + 'adv_alphan' : 'T', + 'mpp_lim' : 'T', + 'mixture_err' : 'T', + 'time_stepper' : 3, + 'weno_order' : 5, + 'weno_eps' : 1.E-16, + 'weno_Re_flux' : 'F', + 'weno_avg' : 'F', + 'avg_state' : 2, + 'mapped_weno' : 'T', + 'null_weights' : 'F', + 'mp_weno' : 'F', + 'riemann_solver' : 2, + 'wave_speeds' : 1, + 'bc_x%beg' : -2, + 'bc_x%end' : -6, + 'bc_y%beg' : -2, + 'bc_y%end' : -6, + 'bc_z%beg' : -2, + 'bc_z%end' : -6, + # ========================================================================== + + # Formatted Database Files Structure Parameters ============================ + 'format' : 1, + 'precision' : 2, + 'prim_vars_wrt' :'T', + 'parallel_io' :'T', + # ========================================================================== + + # Patch 1: High pressured water ============================================ + 'patch_icpp(1)%geometry' : 9, + 'patch_icpp(1)%x_centroid' : 80.E-03/1.E-03, + 'patch_icpp(1)%y_centroid' : 80.E-03/1.E-03, + 'patch_icpp(1)%z_centroid' : 80.E-03/1.E-03, + 'patch_icpp(1)%length_x' : 160.E-03/1.E-03, + 'patch_icpp(1)%length_y' : 160.E-03/1.E-03, + 'patch_icpp(1)%length_z' : 160.E-03/1.E-03, + 'patch_icpp(1)%vel(1)' : 0.E+00, + 'patch_icpp(1)%vel(2)' : 0.E+00, + 'patch_icpp(1)%vel(3)' : 0.E+00, + 'patch_icpp(1)%pres' : 1.E+05, + 'patch_icpp(1)%alpha_rho(1)' : 1000.E+00, + 'patch_icpp(1)%alpha_rho(2)' : 0.1E+00, + 'patch_icpp(1)%alpha(1)' : 0.9E+00, + 'patch_icpp(1)%alpha(2)' : 0.1E+00, + # ========================================================================== + + # Patch 2: Air bubble ====================================================== + 'patch_icpp(2)%geometry' : 8, + 'patch_icpp(2)%smoothen' : 'T', + 'patch_icpp(2)%smooth_patch_id' : 1, + 'patch_icpp(2)%smooth_coeff' : 0.5E+00, + 'patch_icpp(2)%x_centroid' : 0.E+00, + 'patch_icpp(2)%y_centroid' : 0.E+00, + 'patch_icpp(2)%z_centroid' : 0.E+00, + 'patch_icpp(2)%radius' : 1.E-03/1.E-03, + 'patch_icpp(2)%alter_patch(1)' : 'T', 
+ 'patch_icpp(2)%vel(1)' : 0.E+00, + 'patch_icpp(2)%vel(2)' : 0.E+00, + 'patch_icpp(2)%vel(3)' : 0.E+00, + 'patch_icpp(2)%pres' : 1.E+03, + 'patch_icpp(2)%alpha_rho(1)' : 100.E+00, + 'patch_icpp(2)%alpha_rho(2)' : 0.9E+00, + 'patch_icpp(2)%alpha(1)' : 0.1E+00, + 'patch_icpp(2)%alpha(2)' : 0.9E+00, + # ========================================================================== + + # Fluids Physical Parameters =============================================== + 'fluid_pp(1)%gamma' : 1.E+00/(4.4E+00-1.E+00), + 'fluid_pp(1)%pi_inf' : 4.4E+00*6.E+08/(4.4E+00-1.E+00), + 'fluid_pp(2)%gamma' : 1.E+00/(1.4E+00-1.E+00), + 'fluid_pp(2)%pi_inf' : 0.E+00, + # ========================================================================== +})) + +# ============================================================================== diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp index 783709fae8..54eb438840 100644 --- a/src/pre_process/m_patches.fpp +++ b/src/pre_process/m_patches.fpp @@ -1544,8 +1544,6 @@ contains radius = patch_ib(patch_id)%radius end if - print *, x_centroid, y_centroid, z_centroid, radius - ! Initializing the pseudo volume fraction value to 1. The value will ! be modified as the patch is laid out on the grid, but only in the ! case that smoothing of the spherical patch's boundary is enabled. 
diff --git a/src/pre_process/m_start_up.fpp b/src/pre_process/m_start_up.fpp index 38339dadbc..aa2172c5be 100644 --- a/src/pre_process/m_start_up.fpp +++ b/src/pre_process/m_start_up.fpp @@ -861,10 +861,10 @@ contains time_final = 0d0 if (num_procs == 1) then time_final = time_avg - print *, "Final Time", time_final + print *, "Elapsed Time", time_final else time_final = maxval(proc_time) - print *, "Final Time", time_final + print *, "Elapsed Time", time_final end if inquire (FILE='pre_time_data.dat', EXIST=file_exists) if (file_exists) then diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index 9f67297029..ea668c224c 100644 --- a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -636,12 +636,15 @@ contains end subroutine s_initialize_rhs_module ! ------------------------------- - subroutine s_compute_rhs(q_cons_vf, q_prim_vf, rhs_vf, pb, rhs_pb, mv, rhs_mv, t_step) ! ------- + subroutine s_compute_rhs(q_cons_vf, q_prim_vf, rhs_vf, pb, rhs_pb, mv, rhs_mv, t_step, time_avg) ! ------- type(scalar_field), dimension(sys_size), intent(INOUT) :: q_cons_vf type(scalar_field), dimension(sys_size), intent(INOUT) :: q_prim_vf type(scalar_field), dimension(sys_size), intent(INOUT) :: rhs_vf real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(INOUT) :: pb, mv + real(kind(0d0)), intent(INOUT) :: time_avg + real(kind(0d0)) :: t_start, t_finish + real(kind(0d0)) :: gp_sum real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(INOUT) :: rhs_pb, rhs_mv integer, intent(IN) :: t_step @@ -676,7 +679,7 @@ contains ! ================================================================== !$acc update device(ix, iy, iz) - + call cpu_time(t_start) ! Association/Population of Working Variables ====================== !$acc parallel loop collapse(4) gang vector default(present) do i = 1, sys_size @@ -919,7 +922,6 @@ contains ! 
END: Additional physics and source terms ========================= end do - if (ib) then !$acc parallel loop collapse(3) gang vector default(present) do l = 0, p @@ -975,9 +977,14 @@ contains end do end do end do - end if - + call cpu_time(t_finish) + ! Normalize by the inclusive cell count (end minus beg plus one per direction) in double precision: nonzero for collapsed 1D/2D directions and free of default-integer overflow + if (t_step >= 4) then + time_avg = (abs(t_finish - t_start)/(real(ix%end - ix%beg + 1, kind(0d0))*real(iy%end - iy%beg + 1, kind(0d0))*real(iz%end - iz%beg + 1, kind(0d0))) + (t_step - 4)*time_avg)/(t_step - 3) + else + time_avg = 0d0 + end if ! ================================================================== end subroutine s_compute_rhs ! ----------------------------------------- diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index ee80251911..7248483ed1 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -1128,12 +1128,11 @@ contains if (num_procs == 1) then time_final = time_avg io_time_final = io_time_avg - print *, "Final Time", time_final else time_final = maxval(proc_time) io_time_final = maxval(io_proc_time) - print *, "Final Time", time_final end if + print *, "Performance: ", time_final*1.0d9/sys_size, " ns/gp/eq/rhs" inquire (FILE='time_data.dat', EXIST=file_exists) if (file_exists) then open (1, file='time_data.dat', position='append', status='old') diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 41c308ee4f..8a8150e317 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -286,16 +286,13 @@ contains real(kind(0d0)), intent(INOUT) :: time_avg integer :: i, j, k, l, q!< Generic loop iterator - real(kind(0d0)) :: start, finish real(kind(0d0)) :: nR3bar ! Stage 1 of 1 ===================================================== - call cpu_time(start) - call nvtxStartRange("Time_Step") - call s_compute_rhs(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step) + call s_compute_rhs(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg) if (ib .and. 
t_step == 1) then if (qbmm .and. .not. polytropic) then @@ -387,14 +384,6 @@ contains call nvtxEndRange - call cpu_time(finish) - - if (t_step >= 4) then - time_avg = (abs(finish - start) + (t_step - 4)*time_avg)/(t_step - 3) - else - time_avg = 0d0 - end if - ! ================================================================== end subroutine s_1st_order_tvd_rk ! ------------------------------------ @@ -416,7 +405,7 @@ contains call nvtxStartRange("Time_Step") - call s_compute_rhs(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step) + call s_compute_rhs(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg) if (ib .and. t_step == 1) then if (qbmm .and. .not. polytropic) then @@ -503,7 +492,7 @@ contains ! Stage 2 of 2 ===================================================== - call s_compute_rhs(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step) + call s_compute_rhs(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg) !$acc parallel loop collapse(4) gang vector default(present) do i = 1, sys_size @@ -574,13 +563,6 @@ contains call nvtxEndRange call cpu_time(finish) - - if (t_step >= 4) then - time_avg = (abs(finish - start) + (t_step - 4)*time_avg)/(t_step - 3) - else - time_avg = 0d0 - end if - ! ================================================================== end subroutine s_2nd_order_tvd_rk ! ------------------------------------ @@ -605,7 +587,7 @@ contains call nvtxStartRange("Time_Step") end if - call s_compute_rhs(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step) + call s_compute_rhs(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg) if (ib .and. t_step == 1) then if (qbmm .and. .not. polytropic) then @@ -693,7 +675,7 @@ contains ! 
Stage 2 of 3 ===================================================== - call s_compute_rhs(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step) + call s_compute_rhs(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg) !$acc parallel loop collapse(4) gang vector default(present) do i = 1, sys_size @@ -764,7 +746,7 @@ contains ! ================================================================== ! Stage 3 of 3 ===================================================== - call s_compute_rhs(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step) + call s_compute_rhs(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg) !$acc parallel loop collapse(4) gang vector default(present) do i = 1, sys_size @@ -837,12 +819,6 @@ contains call cpu_time(finish) time = time + (finish - start) - - if (t_step >= 4) then - time_avg = (abs(finish - start) + (t_step - 4)*time_avg)/(t_step - 3) - else - time_avg = 0d0 - end if end if ! ================================================================== @@ -879,12 +855,6 @@ contains time = time + (finish - start) - if (t_step >= 4) then - time_avg = (abs(finish - start) + (t_step - 4)*time_avg)/(t_step - 3) - else - time_avg = 0d0 - end if - ! ================================================================== end subroutine s_strang_splitting ! ------------------------------------ diff --git a/toolchain/bench.yaml b/toolchain/bench.yaml index bd682819cd..3e3a177237 100644 --- a/toolchain/bench.yaml +++ b/toolchain/bench.yaml @@ -35,4 +35,3 @@ path: benchmarks/hypo_hll/case.py args: [] -