From 2b3d35d7ab47c84290094d3efa6811423e8a8f46 Mon Sep 17 00:00:00 2001 From: anandrdbz <62814442+anandrdbz@users.noreply.github.com> Date: Fri, 5 Apr 2024 23:18:05 -0400 Subject: [PATCH] OpenACC + Cray CCE + AMD MI200+ (#368) Co-authored-by: Henry Le Berre Co-authored-by: Steve Abbott Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: wilfonba Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Steve Abbott Co-authored-by: Abbott, Stephen R Co-authored-by: Spencer Bryngelson Co-authored-by: Anand Radhakrishnan Co-authored-by: Ben Wilfong <48168887+wilfonba@users.noreply.github.com> Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Co-authored-by: Anand Radhakrishnan Co-authored-by: Anand Co-authored-by: Spencer Bryngelson --- .github/workflows/frontier/build.sh | 4 + .github/workflows/frontier/submit.sh | 43 ++ .github/workflows/frontier/test.sh | 3 + .github/workflows/test.yml | 17 +- .typos.toml | 4 +- CMakeLists.txt | 65 ++- docs/Doxyfile.in | 2 +- examples/3D_weak_scaling/analyze.sh | 5 + misc/profiling_amdgpu.txt | 23 ++ src/common/include/inline_conversions.fpp | 8 +- src/common/include/macros.fpp | 102 ++++- src/common/m_derived_types.fpp | 2 +- src/common/m_helper.fpp | 7 +- src/common/m_mpi_common.fpp | 2 +- src/common/m_phase_change.fpp | 24 +- src/common/m_variables_conversion.fpp | 59 ++- src/post_process/m_start_up.f90 | 2 +- src/pre_process/m_checker.f90 | 35 +- src/pre_process/m_start_up.fpp | 2 +- src/simulation/include/case.fpp | 5 +- src/simulation/m_bubbles.fpp | 42 +- src/simulation/m_cbc.fpp | 236 ++++++----- src/simulation/m_compute_cbc.fpp | 48 ++- src/simulation/m_data_output.fpp | 56 +-- src/simulation/m_derived_variables.f90 | 3 +- src/simulation/m_fftw.fpp | 114 ++++-- src/simulation/m_global_parameters.fpp | 155 ++++++-- src/simulation/m_hypoelastic.fpp | 28 +- src/simulation/m_ibm.fpp | 58 +-- src/simulation/m_monopole.fpp | 35 +- src/simulation/m_mpi_proxy.fpp | 178 ++++----- src/simulation/m_qbmm.fpp | 67 +++- src/simulation/m_rhs.fpp | 323 ++++++++------- src/simulation/m_riemann_solvers.fpp | 222 +++++++---- src/simulation/m_start_up.fpp | 39 +- src/simulation/m_time_steppers.fpp | 114 ++++-- src/simulation/m_viscous.fpp | 459 +++++++++++----------- src/simulation/m_weno.fpp | 253 +++++++----- toolchain/bootstrap/format.sh | 10 +- toolchain/bootstrap/format_file.sh | 37 ++ toolchain/bootstrap/modules.sh | 8 +- toolchain/cce_simulation_workgroup_256.sh | 48 +++ toolchain/dependencies/CMakeLists.txt | 77 ++-- toolchain/mfc/args.py | 8 +- toolchain/mfc/build.py | 27 +- toolchain/mfc/common.py | 1 - toolchain/mfc/run/input.py | 2 +- toolchain/mfc/state.py | 9 +- toolchain/mfc/test/test.py | 4 +- toolchain/modules | 12 +- toolchain/requirements.txt | 2 + toolchain/templates/frontier.mako | 54 +++ toolchain/templates/include/helpers.mako | 8 +- 53 files changed, 2064 insertions(+), 1087 deletions(-) create mode 100644 .github/workflows/frontier/build.sh create mode 100644 .github/workflows/frontier/submit.sh create mode 100644 .github/workflows/frontier/test.sh create mode 100755 examples/3D_weak_scaling/analyze.sh create mode 100644 misc/profiling_amdgpu.txt create mode 100644 toolchain/bootstrap/format_file.sh create mode 100755 toolchain/cce_simulation_workgroup_256.sh create mode 100644 toolchain/templates/frontier.mako diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh new file mode 100644 index 000000000..a6a51b65f --- /dev/null +++ b/.github/workflows/frontier/build.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +. ./mfc.sh load -c f -m g +./mfc.sh build -j 8 --gpu diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh new file mode 100644 index 000000000..706d7e468 --- /dev/null +++ b/.github/workflows/frontier/submit.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -e + +usage() { + echo "Usage: $0 [script.sh] [cpu|gpu]" +} + +if [ ! -z "$1" ]; then + sbatch_script_contents=`cat $1` +else + usage + exit 1 +fi + +job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" + +sbatch <): # # * Locate all source files for of the type # # src/[,common]/[.,include]/*.[f90,fpp]. -# -# * For each .fpp file found with filepath /.fpp, using a +# +# * For each .fpp file found with filepath /.fpp, using a # custom command, instruct CMake how to generate a file with path # # src//fypp/.f90 @@ -224,7 +235,7 @@ endif() # file is modified, but also when any file with filepath of the form # # src/[,common]/include/*.fpp -# +# # is modified. This is a reasonable compromise as modifications to .fpp files # in the include directories will be rare - by design. Other approaches would # have required a more complex CMakeLists.txt file (perhaps parsing the .fpp @@ -333,11 +344,11 @@ function(MFC_SETUP_TARGET) set_target_properties(${ARGS_TARGET} PROPERTIES Fortran_PREPROCESS ON) - target_include_directories(${ARGS_TARGET} PRIVATE + target_include_directories(${ARGS_TARGET} PRIVATE "${CMAKE_SOURCE_DIR}/src/common" "${CMAKE_SOURCE_DIR}/src/common/include" "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}") - + if (EXISTS "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}/include") target_include_directories(${ARGS_TARGET} PRIVATE "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}/include") @@ -367,9 +378,14 @@ function(MFC_SETUP_TARGET) endif() if (ARGS_FFTW) - if (MFC_OpenACC AND ARGS_OpenACC AND (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")) - find_package(CUDAToolkit REQUIRED) - target_link_libraries(${ARGS_TARGET} PRIVATE CUDA::cudart CUDA::cufft) + if (MFC_OpenACC AND ARGS_OpenACC) + if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") + find_package(CUDAToolkit REQUIRED) + target_link_libraries(${ARGS_TARGET} PRIVATE CUDA::cudart CUDA::cufft) + else() + find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED) + target_link_libraries(${ARGS_TARGET} PRIVATE hipfort::hipfft) + endif() else() find_package(FFTW REQUIRED) target_link_libraries(${ARGS_TARGET} PRIVATE FFTW::FFTW) @@ -420,6 +436,9 @@ function(MFC_SETUP_TARGET) PRIVATE -gpu=autocompare,debug ) endif() + elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") + find_package(hipfort COMPONENTS hip CONFIG REQUIRED) + target_link_libraries(${ARGS_TARGET} PRIVATE hipfort::hip hipfort::hipfort-amdgcn) endif() elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") target_compile_options(${ARGS_TARGET} PRIVATE "SHELL:-h noacc" "SHELL:-x acc") @@ -438,19 +457,31 @@ if (MFC_PRE_PROCESS) MFC_SETUP_TARGET(TARGET pre_process SOURCES "${pre_process_SRCs}" MPI) + if(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") + target_compile_options(pre_process PRIVATE -hfp0) + endif() endif() if (MFC_SIMULATION) MFC_SETUP_TARGET(TARGET simulation SOURCES "${simulation_SRCs}" MPI OpenACC FFTW) + + if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND MFC_OpenACC) + add_custom_command(TARGET simulation POST_BUILD + COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/toolchain/cce_simulation_workgroup_256.sh" + "${CMAKE_CURRENT_BINARY_DIR}" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + COMMENT "Patching & Rebuilding with Cray hacks" + ) + endif() endif() if (MFC_POST_PROCESS) MFC_SETUP_TARGET(TARGET post_process SOURCES "${post_process_SRCs}" MPI SILO HDF5 FFTW) - + # -O0 is in response to https://github.com/MFlowCode/MFC-develop/issues/95 target_compile_options(post_process PRIVATE -O0) endif() @@ -468,7 +499,7 @@ if (MFC_DOCUMENTATION) add_custom_command( OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/docs/documentation/examples.md" DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/docs/examples.sh;${examples_DOCs}" - COMMAND "bash" "${CMAKE_CURRENT_SOURCE_DIR}/docs/examples.sh" + COMMAND "bash" "${CMAKE_CURRENT_SOURCE_DIR}/docs/examples.sh" "${CMAKE_CURRENT_SOURCE_DIR}" COMMENT "Generating examples.md" VERBATIM @@ -500,7 +531,7 @@ if (MFC_DOCUMENTATION) set(DOXYGEN_IMAGE_PATH "\"${CMAKE_CURRENT_SOURCE_DIR}/docs/res\"\ \"${CMAKE_CURRENT_SOURCE_DIR}/docs/${target}\"") - file(MAKE_DIRECTORY "${DOXYGEN_OUTPUT_DIRECTORY}") + file(MAKE_DIRECTORY "${DOXYGEN_OUTPUT_DIRECTORY}") configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/docs/Doxyfile.in" @@ -537,7 +568,7 @@ if (MFC_DOCUMENTATION) endmacro() add_custom_target(documentation) - + find_package(Doxygen REQUIRED dot REQUIRED) # > Fetch CSS Theme diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index 67b87a704..26e1a0156 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -548,7 +548,7 @@ INTERNAL_DOCS = YES # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows -# (including Cygwin) ands Mac users are advised to set this option to NO. +# (including Cygwin) and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES diff --git a/examples/3D_weak_scaling/analyze.sh b/examples/3D_weak_scaling/analyze.sh new file mode 100755 index 000000000..a6cd72428 --- /dev/null +++ b/examples/3D_weak_scaling/analyze.sh @@ -0,0 +1,5 @@ +# This script is ran from the 3D_weak_scaling case directory after running +# MFC with --omni -n . To analyze, run chmod u+x ./analyze.sh followed +# by ./analyze.sh + +omniperf analyze -p workloads/$1/mi200 --metric 0 7.1.5 7.1.6 7.1.7 7.1.8 7.1.9 16.3.1 16.3.2 16.3.7 17.3.2 17.3.3 17.3.8 diff --git a/misc/profiling_amdgpu.txt b/misc/profiling_amdgpu.txt new file mode 100644 index 000000000..7421ac514 --- /dev/null +++ b/misc/profiling_amdgpu.txt @@ -0,0 +1,23 @@ +Profile MFC using omniperf + +0) Start an interactive session with the desired number of nodes and total tasks using `salloc -A [account] -J interactive -t 2:00:00 -p batch -N [nnodes] -n [total tasks]` + +1) Generate MFC input files by running `./mfc.sh run [path to casefile] -N [nnodes] -n [total tasks] --gpu -t pre_process simulation --case-optimization` + +2) Move to the simulation directory using `cd [path to casefile]` + +3) - `module load` the following modules: + - rocm/5.5.1 + - cray-python + - omniperf + These must be loaded in the order that they are listed. + +4) Run `omniperf profile -n [profile name] -- [path to MFC beginning with /]/build/install/bin/simulation` + +5) Run `omniperf analyze --gui -p [path to casefile]/workloads/[profile name]/mi200` + +6) Determine what login node you're on, call it [node name] + +7) Open a new terminal window and log into Frontier using `ssh -L8050:localhost:8050 username@[node name].frontier.olcf.ornl.gov` + +8) Open a web browser and navigate to `http://localhost:8050/` diff --git a/src/common/include/inline_conversions.fpp b/src/common/include/inline_conversions.fpp index 9811290a8..81ce0fa9b 100644 --- a/src/common/include/inline_conversions.fpp +++ b/src/common/include/inline_conversions.fpp @@ -1,6 +1,10 @@ #:def s_compute_speed_of_sound() subroutine s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, adv, vel_sum, c) - +#ifdef CRAY_ACC_WAR +!DIR$ INLINEALWAYS s_compute_speed_of_sound +#else +!$acc routine seq +#endif real(kind(0d0)), intent(IN) :: pres real(kind(0d0)), intent(IN) :: rho, gamma, pi_inf real(kind(0d0)), intent(IN) :: H @@ -39,7 +43,6 @@ (pres + pi_inf/(gamma + 1d0))/ & (rho*(1d0 - adv(num_fluids))) end if - else c = ((H - 5d-1*vel_sum)/gamma) end if @@ -49,7 +52,6 @@ else c = sqrt(c) end if - end subroutine s_compute_speed_of_sound #:enddef diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index 2f2d2fcf6..10f1a678f 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -4,6 +4,7 @@ use iso_fortran_env, only: output_unit print *, '${_FILE_.split('/')[-1]}$:${_LINE_}$: ', ${expr}$ + call flush (output_unit) end block #endif @@ -12,18 +13,107 @@ #:def ALLOCATE(*args) @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'}) allocate (${', '.join(args)}$) - #:if MFC_COMPILER == 'Cray' - !$acc enter data create(${', '.join([ arg.split('(')[0] for arg in args ])}$) - #:else - !$acc enter data create(${', '.join(args)}$) - #:endif +#ifndef CRAY_ACC_WAR +!$acc enter data create(${', '.join(args)}$) +#endif #:enddef ALLOCATE #:def DEALLOCATE(*args) @:LOG({'@:DEALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'}) deallocate (${', '.join(args)}$) - !$acc exit data delete(${', '.join(args)}$) +#ifndef CRAY_ACC_WAR +!$acc exit data delete(${', '.join(args)}$) +#endif #:enddef DEALLOCATE +#:def ALLOCATE_GLOBAL(*args) + @:LOG({'@:ALLOCATE_GLOBAL(${re.sub(' +', ' ', ', '.join(args))}$)'}) +#ifdef CRAY_ACC_WAR + allocate (${', '.join(('p_' + arg.strip() for arg in args))}$) + #:for arg in args + ${re.sub('\(.*\)','',arg)}$ => ${ 'p_' + re.sub('\(.*\)','',arg.strip()) }$ + #:endfor +!$acc enter data create(${', '.join(('p_' + re.sub('\(.*\)','',arg.strip()) for arg in args))}$) & +!$acc& attach(${', '.join(map(lambda x: re.sub('\(.*\)','',x), args))}$) +#else + allocate (${', '.join(args)}$) +!$acc enter data create(${', '.join(args)}$) +#endif + +#:enddef ALLOCATE_GLOBAL + +#:def DEALLOCATE_GLOBAL(*args) + @:LOG({'@:DEALLOCATE_GLOBAL(${re.sub(' +', ' ', ', '.join(args))}$)'}) +#ifdef CRAY_ACC_WAR + !$acc exit data delete(${', '.join(('p_' + arg.strip() for arg in args))}$) & + !$acc& detach(${', '.join(args)}$) + #:for arg in args + nullify (${arg}$) + #:endfor + deallocate (${', '.join(('p_' + arg.strip() for arg in args))}$) +#else + deallocate (${', '.join(args)}$) +!$acc exit data delete(${', '.join(args)}$) +#endif + +#:enddef DEALLOCATE_GLOBAL + +#:def CRAY_DECLARE_GLOBAL(intype, dim, *args) +#ifdef CRAY_ACC_WAR + ${intype}$, ${dim}$, allocatable, target :: ${', '.join(('p_' + arg.strip() for arg in args))}$ + ${intype}$, ${dim}$, pointer :: ${', '.join(args)}$ +#else + ${intype}$, ${dim}$, allocatable :: ${', '.join(args)}$ +#endif +#:enddef CRAY_DECLARE_GLOBAL + +#:def CRAY_DECLARE_GLOBAL_SCALAR(intype, *args) +#ifdef CRAY_ACC_WAR + ${intype}$, target :: ${', '.join(('p_' + arg.strip() for arg in args))}$ + ${intype}$, pointer :: ${', '.join(args)}$ +#else + ${intype}$::${', '.join(args)}$ +#endif +#:enddef CRAY_DECLARE_GLOBAL_SCALAR + +#:def ACC_SETUP_VFs(*args) +#ifdef CRAY_ACC_WAR + block + integer :: macros_setup_vfs_i + + @:LOG({'@:ACC_SETUP_VFs(${', '.join(args)}$)'}) + + #:for arg in args + !$acc enter data copyin(${arg}$) + !$acc enter data copyin(${arg}$%vf) + if (allocated(${arg}$%vf)) then + do macros_setup_vfs_i = lbound(${arg}$%vf, 1), ubound(${arg}$%vf, 1) + if (associated(${arg}$%vf(macros_setup_vfs_i)%sf)) then + !$acc enter data copyin(${arg}$%vf(macros_setup_vfs_i)) + !$acc enter data create(${arg}$%vf(macros_setup_vfs_i)%sf) + end if + end do + end if + #:endfor + end block +#endif +#:enddef + +#:def ACC_SETUP_SFs(*args) +#ifdef CRAY_ACC_WAR + block + + @:LOG({'@:ACC_SETUP_SFs(${', '.join(args)}$)'}) + + #:for arg in args + !$acc enter data copyin(${arg}$) + if (associated(${arg}$%sf)) then + !$acc enter data create(${arg}$%sf) + end if + #:endfor + end block +#endif +#:enddef + #define t_vec3 real(kind(0d0)), dimension(1:3) #define t_mat4x4 real(kind(0d0)), dimension(1:4,1:4) diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp index 05346f69f..5dfa90dd7 100644 --- a/src/common/m_derived_types.fpp +++ b/src/common/m_derived_types.fpp @@ -272,7 +272,7 @@ module m_derived_types real(kind(0d0)), dimension(3) :: loc !< Physical location of the ghost point real(kind(0d0)), dimension(3) :: ip_loc !< Physical location of the image point - real(kind(0d0)), dimension(3) :: ip_grid !< Top left grid point of IP + integer, dimension(3) :: ip_grid !< Top left grid point of IP real(kind(0d0)), dimension(2, 2, 2) :: interp_coeffs !< Interpolation Coefficients of image point integer :: ib_patch_id !< ID of the IB Patch the ghost point is part of logical :: slip diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp index 528d79a9d..1fb72389a 100644 --- a/src/common/m_helper.fpp +++ b/src/common/m_helper.fpp @@ -184,10 +184,15 @@ contains rhol0 = rhoref pl0 = pref - +#ifdef MFC_SIMULATION + @:ALLOCATE_GLOBAL(pb0(nb), mass_n0(nb), mass_v0(nb), Pe_T(nb)) + @:ALLOCATE_GLOBAL(k_n(nb), k_v(nb), omegaN(nb)) + @:ALLOCATE_GLOBAL(Re_trans_T(nb), Re_trans_c(nb), Im_trans_T(nb), Im_trans_c(nb)) +#else @:ALLOCATE(pb0(nb), mass_n0(nb), mass_v0(nb), Pe_T(nb)) @:ALLOCATE(k_n(nb), k_v(nb), omegaN(nb)) @:ALLOCATE(Re_trans_T(nb), Re_trans_c(nb), Im_trans_T(nb), Im_trans_c(nb)) +#endif pb0(:) = dflt_real mass_n0(:) = dflt_real diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 3c5596edb..348d6b903 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -134,7 +134,7 @@ contains #ifndef MFC_POST_PROCESS if (present(ib_markers)) then -#ifdef MPI_SIMULATION +#ifdef MFC_PRE_PROCESS MPI_IO_IB_DATA%var%sf => ib_markers%sf #else MPI_IO_IB_DATA%var%sf => ib_markers%sf(0:m, 0:n, 0:p) diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp index 0c636c799..1dada8cb4 100644 --- a/src/common/m_phase_change.fpp +++ b/src/common/m_phase_change.fpp @@ -74,7 +74,6 @@ contains !! selecting the phase change module that will be used !! (pT- or pTg-equilibrium) subroutine s_initialize_phasechange_module() - ! variables used in the calculation of the saturation curves for fluids 1 and 2 A = (gs_min(lp)*cvs(lp) - gs_min(vp)*cvs(vp) & + qvps(vp) - qvps(lp))/((gs_min(vp) - 1.0d0)*cvs(vp)) @@ -87,14 +86,6 @@ contains D = ((gs_min(lp) - 1.0d0)*cvs(lp)) & /((gs_min(vp) - 1.0d0)*cvs(vp)) - ! Associating procedural pointer to the subroutine that will be - ! utilized to calculate the solution to the selected relaxation system - if ((relax_model == 5) .or. (relax_model == 6)) then - s_relaxation_solver => s_infinite_relaxation_k - else - call s_mpi_abort('relaxation solver was not set!') - end if - end subroutine s_initialize_phasechange_module !------------------------------- !> This subroutine is created to activate either the pT- (N fluids) or the @@ -103,6 +94,7 @@ contains !! state conditions. !! @param q_cons_vf Cell-average conservative variables subroutine s_infinite_relaxation_k(q_cons_vf) ! ---------------- + type(scalar_field), dimension(sys_size), intent(INOUT) :: q_cons_vf real(kind(0.0d0)) :: pS, pSOV, pSSL !< equilibrium pressure for mixture, overheated vapor, and subcooled liquid real(kind(0.0d0)) :: TS, TSOV, TSSL, TSatOV, TSatSL !< equilibrium temperature for mixture, overheated vapor, and subcooled liquid. Saturation Temperatures at overheated vapor and subcooled liquid @@ -305,6 +297,7 @@ contains !! @param rhoe mixture energy !! @param TS equilibrium temperature at the interface subroutine s_infinite_pt_relaxation_k(j, k, l, MFL, pS, p_infpT, rM, q_cons_vf, rhoe, TS) + !$acc routine seq ! initializing variables @@ -321,10 +314,7 @@ contains integer :: i, ns !< generic loop iterators ! auxiliary variables for the pT-equilibrium solver - mCP = 0.0d0; mQ = 0.0d0; p_infpT = ps_inf; pk(1:num_fluids) = 0.0d0 - - ig(1:num_fluids) = 0 - + mCP = 0.0d0; mQ = 0.0d0; p_infpT = ps_inf; ! Performing tests before initializing the pT-equilibrium !$acc loop seq do i = 1, num_fluids @@ -409,6 +399,7 @@ contains !! @param q_cons_vf Cell-average conservative variables !! @param TS equilibrium temperature at the interface subroutine s_infinite_ptg_relaxation_k(j, k, l, pS, p_infpT, rhoe, q_cons_vf, TS) + !$acc routine seq type(scalar_field), dimension(sys_size), intent(INOUT) :: q_cons_vf @@ -424,7 +415,6 @@ contains !< Generic loop iterators integer :: i, ns - ! pTg-equilibrium solution procedure ! Newton Solver parameters ! counter @@ -517,7 +507,6 @@ contains ! common temperature TS = (rhoe + pS - mQ)/mCP - end subroutine s_infinite_ptg_relaxation_k ! ----------------------- !> This auxiliary subroutine corrects the partial densities of the REACTING fluids in case one of them is negative @@ -538,7 +527,6 @@ contains real(kind(0.0d0)), intent(OUT) :: MCT integer, intent(IN) :: j, k, l !> @} - if (rM < 0.0d0) then if ((q_cons_vf(lp + contxb - 1)%sf(j, k, l) >= -1.0d0*mixM) .and. & @@ -572,7 +560,6 @@ contains q_cons_vf(vp + contxb - 1)%sf(j, k, l) = MCT*rM end if - end subroutine s_correct_partial_densities !> This auxiliary subroutine calculates the 2 x 2 Jacobian and, its inverse and transpose @@ -785,9 +772,6 @@ contains !> This subroutine finalizes the phase change module subroutine s_finalize_relaxation_solver_module() - - s_relaxation_solver => null() - end subroutine #endif diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp index bc7fa2911..d1b9a008f 100644 --- a/src/common/m_variables_conversion.fpp +++ b/src/common/m_variables_conversion.fpp @@ -89,11 +89,17 @@ module m_variables_conversion !$acc declare create(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps) #endif +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), Gs) + @:CRAY_DECLARE_GLOBAL(integer, dimension(:), bubrs) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), Res) +!$acc declare link(bubrs, Gs, Res) +#else real(kind(0d0)), allocatable, dimension(:) :: Gs integer, allocatable, dimension(:) :: bubrs real(kind(0d0)), allocatable, dimension(:, :) :: Res - !$acc declare create(bubrs, Gs, Res) - +!$acc declare create(bubrs, Gs, Res) +#endif integer :: is1b, is2b, is3b, is1e, is2e, is3e !$acc declare create(is1b, is2b, is3b, is1e, is2e, is3e) @@ -449,7 +455,11 @@ contains gamma_K, pi_inf_K, qv_K, & alpha_K, alpha_rho_K, Re_K, k, l, r, & G_K, G) - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_acc +#else +!$acc routine seq +#endif real(kind(0d0)), intent(OUT) :: rho_K, gamma_K, pi_inf_K, qv_K @@ -527,7 +537,11 @@ contains subroutine s_convert_species_to_mixture_variables_bubbles_acc(rho_K, & gamma_K, pi_inf_K, qv_K, & alpha_K, alpha_rho_K, Re_K, k, l, r) - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_bubbles_acc +#else +!$acc routine seq +#endif real(kind(0d0)), intent(INOUT) :: rho_K, gamma_K, pi_inf_K, qv_K @@ -612,8 +626,20 @@ contains end if #endif - !$acc update device(ixb, ixe, iyb, iye, izb, ize) +!$acc enter data copyin(ixb, ixe, iyb, iye, izb, ize) +!$acc enter data copyin(is1b, is1e, is2b, is2e, is3b, is3e) +!$acc update device(ixb, ixe, iyb, iye, izb, ize) +#ifdef MFC_SIMULATION + @:ALLOCATE_GLOBAL(gammas (1:num_fluids)) + @:ALLOCATE_GLOBAL(gs_min (1:num_fluids)) + @:ALLOCATE_GLOBAL(pi_infs(1:num_fluids)) + @:ALLOCATE_GLOBAL(ps_inf(1:num_fluids)) + @:ALLOCATE_GLOBAL(cvs (1:num_fluids)) + @:ALLOCATE_GLOBAL(qvs (1:num_fluids)) + @:ALLOCATE_GLOBAL(qvps (1:num_fluids)) + @:ALLOCATE_GLOBAL(Gs (1:num_fluids)) +#else @:ALLOCATE(gammas (1:num_fluids)) @:ALLOCATE(gs_min (1:num_fluids)) @:ALLOCATE(pi_infs(1:num_fluids)) @@ -622,6 +648,7 @@ contains @:ALLOCATE(qvs (1:num_fluids)) @:ALLOCATE(qvps (1:num_fluids)) @:ALLOCATE(Gs (1:num_fluids)) +#endif do i = 1, num_fluids gammas(i) = fluid_pp(i)%gamma @@ -638,8 +665,7 @@ contains #ifdef MFC_SIMULATION if (any(Re_size > 0)) then - @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) - + @:ALLOCATE_GLOBAL(Res(1:2, 1:maxval(Re_size))) do i = 1, 2 do j = 1, Re_size(i) Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i) @@ -651,7 +677,11 @@ contains #endif if (bubbles) then +#ifdef MFC_SIMULATION + @:ALLOCATE_GLOBAL(bubrs(1:nb)) +#else @:ALLOCATE(bubrs(1:nb)) +#endif do i = 1, nb bubrs(i) = bub_idx%rs(i) @@ -837,7 +867,7 @@ contains real(kind(0d0)) :: pres - integer :: i, j, k, l !< Generic loop iterators + integer :: i, j, k, l, q !< Generic loop iterators real(kind(0.d0)) :: ntmp @@ -978,6 +1008,7 @@ contains do i = advxb, advxe qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) end do + end do end do end do @@ -1014,7 +1045,7 @@ contains real(kind(0d0)) :: dyn_pres real(kind(0d0)) :: nbub, R3, vftmp, R3tmp real(kind(0d0)), dimension(nb) :: Rtmp - real(kind(0d0)) :: G + real(kind(0d0)) :: G = 0d0 real(kind(0d0)), dimension(2) :: Re_K integer :: i, j, k, l, q !< Generic loop iterators @@ -1245,7 +1276,6 @@ contains ! energy flux, u(E+p) FK_vf(j, k, l, E_idx) = vel_K(dir_idx(1))*(E_K + pres_K) - ! have been using == 2 if (riemann_solver == 1) then !$acc loop seq do i = advxb, advxe @@ -1266,6 +1296,7 @@ contains end do end if + end do end do end do @@ -1281,11 +1312,17 @@ contains deallocate (rho_sf, gamma_sf, pi_inf_sf, qv_sf) #endif +#ifdef MFC_SIMULATION + @:DEALLOCATE_GLOBAL(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps, Gs) + if (bubbles) then + @:DEALLOCATE_GLOBAL(bubrs) + end if +#else @:DEALLOCATE(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps, Gs) - if (bubbles) then @:DEALLOCATE(bubrs) end if +#endif ! Nullifying the procedure pointer to the subroutine transferring/ ! computing the mixture/species variables to the mixture variables diff --git a/src/post_process/m_start_up.f90 b/src/post_process/m_start_up.f90 index db871c955..01320658d 100644 --- a/src/post_process/m_start_up.f90 +++ b/src/post_process/m_start_up.f90 @@ -25,7 +25,7 @@ module m_start_up use m_data_output !< Procedures that write the grid and chosen flow !! variable(s) to the formatted database file(s) - use m_derived_variables !< Procedures used to compute quantites derived + use m_derived_variables !< Procedures used to compute quantities derived !! from the conservative and primitive variables use m_helper diff --git a/src/pre_process/m_checker.f90 b/src/pre_process/m_checker.f90 index cc728e313..7d18b3020 100644 --- a/src/pre_process/m_checker.f90 +++ b/src/pre_process/m_checker.f90 @@ -165,16 +165,31 @@ subroutine s_check_inputs() if (cyl_coord) then ! Cartesian coordinates - ! Constraints on domain boundaries for cylindrical coordinates - if (n == 0 & - .or. & - y_domain%beg /= 0d0 & - .or. & - y_domain%end == dflt_real & - .or. & - y_domain%end < 0d0 & - .or. & - y_domain%beg >= y_domain%end) then + ! in case restart of a simulation + if (old_grid .and. old_ic) then + ! checking of there is any input to the domains + if ((x_domain%beg /= dflt_real .or. x_domain%end /= dflt_real) & + .or. & + (y_domain%beg /= dflt_real .or. y_domain%end /= dflt_real) & + .or. & + (y_domain%beg /= dflt_real .or. y_domain%end /= dflt_real)) then + call s_mpi_abort('domain are not dflt_real.'// & + 'Please, correct them') + elseif (m == dflt_int .or. n == dflt_int .or. p == dflt_int) then + call s_mpi_abort('m, n, and/or p are set to dflt_int.'// & + 'Please, correct them') + end if + ! in case it is NOT restart + ! Constraints on domain boundaries for cylindrical coordinates + elseif (n == 0 & + .or. & + y_domain%beg /= 0d0 & + .or. & + y_domain%end == dflt_real & + .or. & + y_domain%end < 0d0 & + .or. & + y_domain%beg >= y_domain%end) then call s_mpi_abort('Unsupported choice of the combination of '// & 'cyl_coord and n, y_domain%beg, or '// & 'y_domain%end. Exiting ...') diff --git a/src/pre_process/m_start_up.fpp b/src/pre_process/m_start_up.fpp index 5d4746b81..14a7f2a4d 100644 --- a/src/pre_process/m_start_up.fpp +++ b/src/pre_process/m_start_up.fpp @@ -837,7 +837,7 @@ contains & pTg-equilirium (relax = "T" activated)' end if - call s_relaxation_solver(q_cons_vf) + call s_infinite_relaxation_k(q_cons_vf) end if call s_write_data_files(q_cons_vf, ib_markers) diff --git a/src/simulation/include/case.fpp b/src/simulation/include/case.fpp index 2fb832db3..b75ab9152 100644 --- a/src/simulation/include/case.fpp +++ b/src/simulation/include/case.fpp @@ -1,2 +1,3 @@ -! This file is purposefully empty. It is only important for builds that make use -! of --case-optimization. +! This file was generated by MFC. It is only used if the --case-optimization +! option is passed to ./mfc.sh run or test, enabling a GPU-oriented optimization +! that hard-codes certain case parameters from the input file. diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp index 47f195b62..971541770 100644 --- a/src/simulation/m_bubbles.fpp +++ b/src/simulation/m_bubbles.fpp @@ -24,10 +24,24 @@ module m_bubbles real(kind(0.d0)) :: chi_vw !< Bubble wall properties (Ando 2010) real(kind(0.d0)) :: k_mw !< Bubble wall properties (Ando 2010) real(kind(0.d0)) :: rho_mw !< Bubble wall properties (Ando 2010) - !$acc declare create(chi_vw, k_mw, rho_mw) +!$acc declare create(chi_vw, k_mw, rho_mw) +#ifdef CRAY_ACC_WAR !> @name Bubble dynamic source terms !> @{ + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), bub_adv_src) + !$acc declare link(bub_adv_src) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), bub_r_src, bub_v_src, bub_p_src, bub_m_src) + !$acc declare link(bub_r_src, bub_v_src, bub_p_src, bub_m_src) + + type(scalar_field) :: divu !< matrix for div(u) + !$acc declare create(divu) + + @:CRAY_DECLARE_GLOBAL(integer, dimension(:), rs, vs, ms, ps) +!$acc declare link(rs, vs, ms, ps) +#else real(kind(0d0)), allocatable, dimension(:, :, :) :: bub_adv_src real(kind(0d0)), allocatable, dimension(:, :, :, :) :: bub_r_src, bub_v_src, bub_p_src, bub_m_src !$acc declare create(bub_adv_src, bub_r_src, bub_v_src, bub_p_src, bub_m_src) @@ -36,7 +50,8 @@ module m_bubbles !$acc declare create(divu) integer, allocatable, dimension(:) :: rs, vs, ms, ps - !$acc declare create(rs, vs, ms, ps) +!$acc declare create(rs, vs, ms, ps) +#endif contains @@ -53,11 +68,11 @@ contains ix%end = m - ix%beg; iy%end = n - iy%beg; iz%end = p - iz%beg ! ================================================================== - @:ALLOCATE(rs(1:nb)) - @:ALLOCATE(vs(1:nb)) + @:ALLOCATE_GLOBAL(rs(1:nb)) + @:ALLOCATE_GLOBAL(vs(1:nb)) if (.not. polytropic) then - @:ALLOCATE(ps(1:nb)) - @:ALLOCATE(ms(1:nb)) + @:ALLOCATE_GLOBAL(ps(1:nb)) + @:ALLOCATE_GLOBAL(ms(1:nb)) end if do l = 1, nb @@ -75,14 +90,15 @@ contains end if @:ALLOCATE(divu%sf(ix%beg:ix%end, iy%beg:iy%end, iz%beg:iz%end)) + @:ACC_SETUP_SFs(divu) - @:ALLOCATE(bub_adv_src(0:m, 0:n, 0:p)) - @:ALLOCATE(bub_r_src(0:m, 0:n, 0:p, 1:nb)) - @:ALLOCATE(bub_v_src(0:m, 0:n, 0:p, 1:nb)) - @:ALLOCATE(bub_p_src(0:m, 0:n, 0:p, 1:nb)) - @:ALLOCATE(bub_m_src(0:m, 0:n, 0:p, 1:nb)) + @:ALLOCATE_GLOBAL(bub_adv_src(0:m, 0:n, 0:p)) + @:ALLOCATE_GLOBAL(bub_r_src(0:m, 0:n, 0:p, 1:nb)) + @:ALLOCATE_GLOBAL(bub_v_src(0:m, 0:n, 0:p, 1:nb)) + @:ALLOCATE_GLOBAL(bub_p_src(0:m, 0:n, 0:p, 1:nb)) + @:ALLOCATE_GLOBAL(bub_m_src(0:m, 0:n, 0:p, 1:nb)) - end subroutine + end subroutine s_initialize_bubbles_module subroutine s_compute_bubbles_rhs(idir, q_prim_vf) @@ -137,7 +153,7 @@ contains end if - end subroutine + end subroutine s_compute_bubbles_rhs !> The purpose of this procedure is to compute the source terms !! that are needed for the bubble modeling diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp index 822e66dfd..febf24695 100644 --- a/src/simulation/m_cbc.fpp +++ b/src/simulation/m_cbc.fpp @@ -18,6 +18,7 @@ !! 8) Supersonic Outflow !! Please refer to Thompson (1987, 1990) for detailed descriptions. +#:include 'macros.fpp' #:include 'inline_conversions.fpp' module m_cbc @@ -39,42 +40,87 @@ module m_cbc !! The cell-average primitive variables. They are obtained by reshaping (RS) !! q_prim_vf in the coordinate direction normal to the domain boundary along !! which the CBC is applied. - +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), q_prim_rsx_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), q_prim_rsy_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), q_prim_rsz_vf) +!$acc declare link(q_prim_rsx_vf, q_prim_rsy_vf, q_prim_rsz_vf) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :) :: q_prim_rsx_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: q_prim_rsy_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: q_prim_rsz_vf +#endif +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(scalar_field), dimension(:), F_rs_vf, F_src_rs_vf) +!$acc declare link(F_rs_vf, F_src_rs_vf) +#else type(scalar_field), allocatable, dimension(:) :: F_rs_vf, F_src_rs_vf !< +#endif !! Cell-average fluxes (src - source). These are directly determined from the !! cell-average primitive variables, q_prims_rs_vf, and not a Riemann solver. +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), F_rsx_vf, F_src_rsx_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), F_rsy_vf, F_src_rsy_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), F_rsz_vf, F_src_rsz_vf) +!$acc declare link(F_rsx_vf, F_src_rsx_vf, F_rsy_vf, F_src_rsy_vf, F_rsz_vf, F_src_rsz_vf) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :) :: F_rsx_vf, F_src_rsx_vf !< real(kind(0d0)), allocatable, dimension(:, :, :, :) :: F_rsy_vf, F_src_rsy_vf !< real(kind(0d0)), allocatable, dimension(:, :, :, :) :: F_rsz_vf, F_src_rsz_vf !< - +#endif + +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), flux_rsx_vf, flux_src_rsx_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), flux_rsy_vf, flux_src_rsy_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), flux_rsz_vf, flux_src_rsz_vf) +!$acc declare link(flux_rsx_vf, flux_src_rsx_vf, flux_rsy_vf, flux_src_rsy_vf, flux_rsz_vf, flux_src_rsz_vf) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :) :: flux_rsx_vf, flux_src_rsx_vf !< real(kind(0d0)), allocatable, dimension(:, :, :, :) :: flux_rsy_vf, flux_src_rsy_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: flux_rsz_vf, flux_src_rsz_vf +#endif real(kind(0d0)) :: c !< Cell averaged speed of sound real(kind(0d0)), dimension(2) :: Re !< Cell averaged Reynolds numbers + !$acc declare create(c, Re) real(kind(0d0)) :: dpres_ds !< Spatial derivatives in s-dir of pressure +!$acc declare create(dpres_ds) +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), ds) +!$acc declare link(ds) +#else real(kind(0d0)), allocatable, dimension(:) :: ds !< Cell-width distribution in the s-direction +#endif ! CBC Coefficients ========================================================= +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), fd_coef_x) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), fd_coef_y) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), fd_coef_z) +!$acc declare link(fd_coef_x, fd_coef_y, fd_coef_z) +#else real(kind(0d0)), allocatable, dimension(:, :) :: fd_coef_x !< Finite diff. coefficients x-dir real(kind(0d0)), allocatable, dimension(:, :) :: fd_coef_y !< Finite diff. coefficients y-dir real(kind(0d0)), allocatable, dimension(:, :) :: fd_coef_z !< Finite diff. coefficients z-dir +#endif !! The first dimension identifies the location of a coefficient in the FD !! formula, while the last dimension denotes the location of the CBC. ! Bug with NVHPC when using nullified pointers in a declare create ! real(kind(0d0)), pointer, dimension(:, :) :: fd_coef => null() - +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), pi_coef_x) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), pi_coef_y) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), pi_coef_z) +!$acc declare link(pi_coef_x, pi_coef_y, pi_coef_z) +#else real(kind(0d0)), allocatable, dimension(:, :, :) :: pi_coef_x !< Polynominal interpolant coefficients in x-dir real(kind(0d0)), allocatable, dimension(:, :, :) :: pi_coef_y !< Polynominal interpolant coefficients in y-dir real(kind(0d0)), allocatable, dimension(:, :, :) :: pi_coef_z !< Polynominal interpolant coefficients in z-dir +#endif !! The first dimension of the array identifies the polynomial, the !! second dimension identifies the position of its coefficients and the last !! dimension denotes the location of the CBC. @@ -82,17 +128,19 @@ module m_cbc ! ========================================================================== type(int_bounds_info) :: is1, is2, is3 !< Indical bounds in the s1-, s2- and s3-directions + !$acc declare create(is1, is2, is3) integer :: dj - integer :: bcxb, bcxe, bcyb, bcye, bczb, bcze - integer :: cbc_dir, cbc_loc +!$acc declare create(dj, bcxb, bcxe, bcyb, bcye, bczb, bcze, cbc_dir, cbc_loc) - !$acc declare create(q_prim_rsx_vf, q_prim_rsy_vf, q_prim_rsz_vf, F_rsx_vf, F_src_rsx_vf,flux_rsx_vf, flux_src_rsx_vf, & - !$acc F_rsy_vf, F_src_rsy_vf,flux_rsy_vf, flux_src_rsy_vf, F_rsz_vf, F_src_rsz_vf,flux_rsz_vf, flux_src_rsz_vf,Re, & - !$acc ds,fd_coef_x,fd_coef_y,fd_coef_z, & - !$acc pi_coef_x,pi_coef_y,pi_coef_z, bcxb, bcxe, bcyb, bcye, bczb, bcze, is1, is2, is3, dj, cbc_dir, cbc_loc) +#ifndef CRAY_ACC_WAR +!$acc declare create(q_prim_rsx_vf, q_prim_rsy_vf, q_prim_rsz_vf, F_rsx_vf, F_src_rsx_vf,flux_rsx_vf, flux_src_rsx_vf, & +!$acc F_rsy_vf, F_src_rsy_vf,flux_rsy_vf, flux_src_rsy_vf, F_rsz_vf, F_src_rsz_vf,flux_rsz_vf, flux_src_rsz_vf,Re, & +!$acc ds,fd_coef_x,fd_coef_y,fd_coef_z, & +!$acc pi_coef_x,pi_coef_y,pi_coef_z) +#endif contains @@ -127,29 +175,29 @@ contains end if is3%end = p - is3%beg - allocate (q_prim_rsx_vf(0:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(q_prim_rsx_vf(0:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:sys_size)) if (weno_order > 1) then - allocate (F_rsx_vf(0:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:adv_idx%end)) + @:ALLOCATE_GLOBAL(F_rsx_vf(0:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:adv_idx%end)) - allocate (F_src_rsx_vf(0:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, adv_idx%beg:adv_idx%end)) + @:ALLOCATE_GLOBAL(F_src_rsx_vf(0:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, adv_idx%beg:adv_idx%end)) end if - allocate (flux_rsx_vf(-1:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:adv_idx%end)) + @:ALLOCATE_GLOBAL(flux_rsx_vf(-1:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:adv_idx%end)) - allocate (flux_src_rsx_vf(-1:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, adv_idx%beg:adv_idx%end)) + @:ALLOCATE_GLOBAL(flux_src_rsx_vf(-1:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, adv_idx%beg:adv_idx%end)) if (n > 0) then @@ -170,29 +218,29 @@ contains end if is3%end = p - is3%beg - allocate (q_prim_rsy_vf(0:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(q_prim_rsy_vf(0:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:sys_size)) if (weno_order > 1) then - allocate (F_rsy_vf(0:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:adv_idx%end)) + @:ALLOCATE_GLOBAL(F_rsy_vf(0:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:adv_idx%end)) - allocate (F_src_rsy_vf(0:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, adv_idx%beg:adv_idx%end)) + @:ALLOCATE_GLOBAL(F_src_rsy_vf(0:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, adv_idx%beg:adv_idx%end)) end if - allocate (flux_rsy_vf(-1:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:adv_idx%end)) + @:ALLOCATE_GLOBAL(flux_rsy_vf(-1:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:adv_idx%end)) - allocate (flux_src_rsy_vf(-1:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, adv_idx%beg:adv_idx%end)) + @:ALLOCATE_GLOBAL(flux_src_rsy_vf(-1:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, adv_idx%beg:adv_idx%end)) end if @@ -215,42 +263,42 @@ contains end if is3%end = m - is3%beg - allocate (q_prim_rsz_vf(0:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(q_prim_rsz_vf(0:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:sys_size)) if (weno_order > 1) then - allocate (F_rsz_vf(0:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:adv_idx%end)) + @:ALLOCATE_GLOBAL(F_rsz_vf(0:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:adv_idx%end)) - allocate (F_src_rsz_vf(0:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, adv_idx%beg:adv_idx%end)) + @:ALLOCATE_GLOBAL(F_src_rsz_vf(0:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, adv_idx%beg:adv_idx%end)) end if - allocate (flux_rsz_vf(-1:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:adv_idx%end)) + @:ALLOCATE_GLOBAL(flux_rsz_vf(-1:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:adv_idx%end)) - allocate (flux_src_rsz_vf(-1:buff_size, & - is2%beg:is2%end, & - is3%beg:is3%end, adv_idx%beg:adv_idx%end)) + @:ALLOCATE_GLOBAL(flux_src_rsz_vf(-1:buff_size, & + is2%beg:is2%end, & + is3%beg:is3%end, adv_idx%beg:adv_idx%end)) end if ! Allocating the cell-width distribution in the s-direction - allocate (ds(0:buff_size)) + @:ALLOCATE_GLOBAL(ds(0:buff_size)) ! Allocating/Computing CBC Coefficients in x-direction ============= if (all((/bc_x%beg, bc_x%end/) <= -5) .and. all((/bc_x%beg, bc_x%end/) >= -13)) then - allocate (fd_coef_x(0:buff_size, -1:1)) + @:ALLOCATE_GLOBAL(fd_coef_x(0:buff_size, -1:1)) if (weno_order > 1) then - allocate (pi_coef_x(0:weno_polyn - 1, 0:weno_order - 3, -1:1)) + @:ALLOCATE_GLOBAL(pi_coef_x(0:weno_polyn - 1, 0:weno_order - 3, -1:1)) end if call s_compute_cbc_coefficients(1, -1) @@ -258,20 +306,20 @@ contains elseif (bc_x%beg <= -5 .and. bc_x%beg >= -13) then - allocate (fd_coef_x(0:buff_size, -1:-1)) + @:ALLOCATE_GLOBAL(fd_coef_x(0:buff_size, -1:-1)) if (weno_order > 1) then - allocate (pi_coef_x(0:weno_polyn - 1, 0:weno_order - 3, -1:-1)) + @:ALLOCATE_GLOBAL(pi_coef_x(0:weno_polyn - 1, 0:weno_order - 3, -1:-1)) end if call s_compute_cbc_coefficients(1, -1) elseif (bc_x%end <= -5 .and. bc_x%end >= -13) then - allocate (fd_coef_x(0:buff_size, 1:1)) + @:ALLOCATE_GLOBAL(fd_coef_x(0:buff_size, 1:1)) if (weno_order > 1) then - allocate (pi_coef_x(0:weno_polyn - 1, 0:weno_order - 3, 1:1)) + @:ALLOCATE_GLOBAL(pi_coef_x(0:weno_polyn - 1, 0:weno_order - 3, 1:1)) end if call s_compute_cbc_coefficients(1, 1) @@ -284,10 +332,10 @@ contains if (all((/bc_y%beg, bc_y%end/) <= -5) .and. all((/bc_y%beg, bc_y%end/) >= -13)) then - allocate (fd_coef_y(0:buff_size, -1:1)) + @:ALLOCATE_GLOBAL(fd_coef_y(0:buff_size, -1:1)) if (weno_order > 1) then - allocate (pi_coef_y(0:weno_polyn - 1, 0:weno_order - 3, -1:1)) + @:ALLOCATE_GLOBAL(pi_coef_y(0:weno_polyn - 1, 0:weno_order - 3, -1:1)) end if call s_compute_cbc_coefficients(2, -1) @@ -295,20 +343,20 @@ contains elseif (bc_y%beg <= -5 .and. bc_y%beg >= -13) then - allocate (fd_coef_y(0:buff_size, -1:-1)) + @:ALLOCATE_GLOBAL(fd_coef_y(0:buff_size, -1:-1)) if (weno_order > 1) then - allocate (pi_coef_y(0:weno_polyn - 1, 0:weno_order - 3, -1:-1)) + @:ALLOCATE_GLOBAL(pi_coef_y(0:weno_polyn - 1, 0:weno_order - 3, -1:-1)) end if call s_compute_cbc_coefficients(2, -1) elseif (bc_y%end <= -5 .and. bc_y%end >= -13) then - allocate (fd_coef_y(0:buff_size, 1:1)) + @:ALLOCATE_GLOBAL(fd_coef_y(0:buff_size, 1:1)) if (weno_order > 1) then - allocate (pi_coef_y(0:weno_polyn - 1, 0:weno_order - 3, 1:1)) + @:ALLOCATE_GLOBAL(pi_coef_y(0:weno_polyn - 1, 0:weno_order - 3, 1:1)) end if call s_compute_cbc_coefficients(2, 1) @@ -323,10 +371,10 @@ contains if (all((/bc_z%beg, bc_z%end/) <= -5) .and. all((/bc_z%beg, bc_z%end/) >= -13)) then - allocate (fd_coef_z(0:buff_size, -1:1)) + @:ALLOCATE_GLOBAL(fd_coef_z(0:buff_size, -1:1)) if (weno_order > 1) then - allocate (pi_coef_z(0:weno_polyn - 1, 0:weno_order - 3, -1:1)) + @:ALLOCATE_GLOBAL(pi_coef_z(0:weno_polyn - 1, 0:weno_order - 3, -1:1)) end if call s_compute_cbc_coefficients(3, -1) @@ -334,20 +382,20 @@ contains elseif (bc_z%beg <= -5 .and. bc_z%beg >= -13) then - allocate (fd_coef_z(0:buff_size, -1:-1)) + @:ALLOCATE_GLOBAL(fd_coef_z(0:buff_size, -1:-1)) if (weno_order > 1) then - allocate (pi_coef_z(0:weno_polyn - 1, 0:weno_order - 3, -1:-1)) + @:ALLOCATE_GLOBAL(pi_coef_z(0:weno_polyn - 1, 0:weno_order - 3, -1:-1)) end if call s_compute_cbc_coefficients(3, -1) elseif (bc_z%end <= -5 .and. bc_z%end >= -13) then - allocate (fd_coef_z(0:buff_size, 1:1)) + @:ALLOCATE_GLOBAL(fd_coef_z(0:buff_size, 1:1)) if (weno_order > 1) then - allocate (pi_coef_z(0:weno_polyn - 1, 0:weno_order - 3, 1:1)) + @:ALLOCATE_GLOBAL(pi_coef_z(0:weno_polyn - 1, 0:weno_order - 3, 1:1)) end if call s_compute_cbc_coefficients(3, 1) @@ -748,7 +796,6 @@ contains if (bubbles) then call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc, 0, k, r) - else call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc, 0, k, r) end if @@ -958,10 +1005,8 @@ contains ! The reshaping of outputted data and disssociation of the FD and PI ! coefficients, or CBC coefficients, respectively, based on selected ! CBC coordinate direction. - call s_finalize_cbc(flux_vf, flux_src_vf, & ix, iy, iz) - end subroutine s_cbc ! ------------------------------------------------- !> The computation of parameters, the allocation of memory, @@ -1007,8 +1052,8 @@ contains end if dj = max(0, cbc_loc) - - !$acc update device(is1, is2, is3, dir_idx, dir_flg, dj) + !$acc update device(is1, is2, is3, dj) + !$acc update device( dir_idx, dir_flg) ! Reshaping Inputted Data in x-direction =========================== if (cbc_dir == 1) then @@ -1461,46 +1506,55 @@ contains if (is_cbc .eqv. .false.) return ! Deallocating the cell-average primitive variables - deallocate (q_prim_rsx_vf) + @:DEALLOCATE_GLOBAL(q_prim_rsx_vf) if (weno_order > 1) then - deallocate (F_rsx_vf, F_src_rsx_vf) + @:DEALLOCATE_GLOBAL(F_rsx_vf, F_src_rsx_vf) end if - deallocate (flux_rsx_vf, flux_src_rsx_vf) + @:DEALLOCATE_GLOBAL(flux_rsx_vf, flux_src_rsx_vf) if (n > 0) then - deallocate (q_prim_rsy_vf) + @:DEALLOCATE_GLOBAL(q_prim_rsy_vf) if (weno_order > 1) then - deallocate (F_rsy_vf, F_src_rsy_vf) + @:DEALLOCATE_GLOBAL(F_rsy_vf, F_src_rsy_vf) end if - deallocate (flux_rsy_vf, flux_src_rsy_vf) + @:DEALLOCATE_GLOBAL(flux_rsy_vf, flux_src_rsy_vf) end if if (p > 0) then - deallocate (q_prim_rsz_vf) + @:DEALLOCATE_GLOBAL(q_prim_rsz_vf) if (weno_order > 1) then - deallocate (F_rsz_vf, F_src_rsz_vf) + @:DEALLOCATE_GLOBAL(F_rsz_vf, F_src_rsz_vf) end if - deallocate (flux_rsz_vf, flux_src_rsz_vf) + @:DEALLOCATE_GLOBAL(flux_rsz_vf, flux_src_rsz_vf) end if ! Deallocating the cell-width distribution in the s-direction - deallocate (ds) + @:DEALLOCATE_GLOBAL(ds) ! Deallocating CBC Coefficients in x-direction ===================== if (any((/bc_x%beg, bc_x%end/) <= -5) .and. any((/bc_x%beg, bc_x%end/) >= -13)) then - deallocate (fd_coef_x); if (weno_order > 1) deallocate (pi_coef_x) + @:DEALLOCATE_GLOBAL(fd_coef_x) + if (weno_order > 1) then + @:DEALLOCATE_GLOBAL(pi_coef_x) + end if end if ! ================================================================== ! Deallocating CBC Coefficients in y-direction ===================== if (n > 0 .and. any((/bc_y%beg, bc_y%end/) <= -5) .and. & any((/bc_y%beg, bc_y%end/) >= -13 .and. bc_y%beg /= -14)) then - deallocate (fd_coef_y); if (weno_order > 1) deallocate (pi_coef_y) + @:DEALLOCATE_GLOBAL(fd_coef_y) + if (weno_order > 1) then + @:DEALLOCATE_GLOBAL(pi_coef_y) + end if end if ! ================================================================== ! Deallocating CBC Coefficients in z-direction ===================== if (p > 0 .and. any((/bc_z%beg, bc_z%end/) <= -5) .and. any((/bc_z%beg, bc_z%end/) >= -13)) then - deallocate (fd_coef_z); if (weno_order > 1) deallocate (pi_coef_z) + @:DEALLOCATE_GLOBAL(fd_coef_z) + if (weno_order > 1) then + @:DEALLOCATE_GLOBAL(pi_coef_z) + end if end if ! ================================================================== diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp index 4b3a98528..0b893d34f 100644 --- a/src/simulation/m_compute_cbc.fpp +++ b/src/simulation/m_compute_cbc.fpp @@ -28,7 +28,11 @@ contains !! the normal component of velocity is zero at all times, !! while the transverse velocities may be nonzero. subroutine s_compute_slip_wall_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) ! -------------- - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_compute_slip_wall_L +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(3), intent(IN) :: lambda real(kind(0d0)), dimension(num_fluids), intent(IN) :: mf, dalpha_rho_ds, dadv_ds real(kind(0d0)), dimension(num_dims), intent(IN) :: dvel_ds @@ -52,7 +56,11 @@ contains !! buffer reduces the amplitude of any reflections caused by !! outgoing waves. subroutine s_compute_nonreflecting_subsonic_buffer_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) ! -------------- - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_buffer_L +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(3), intent(IN) :: lambda real(kind(0d0)), dimension(num_fluids), intent(IN) :: mf, dalpha_rho_ds, dadv_ds real(kind(0d0)), dimension(num_dims), intent(IN) :: dvel_ds @@ -89,7 +97,11 @@ contains !! CBC assumes an incoming flow and reduces the amplitude of !! any reflections caused by outgoing waves. subroutine s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) ! -------------- - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS ss_compute_nonreflecting_subsonic_inflow_L +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(3), intent(IN) :: lambda real(kind(0d0)), dimension(num_fluids), intent(IN) :: mf, dalpha_rho_ds, dadv_ds real(kind(0d0)), dimension(num_dims), intent(IN) :: dvel_ds @@ -111,7 +123,11 @@ contains !! subsonic CBC presumes an outgoing flow and reduces the !! amplitude of any reflections caused by outgoing waves. subroutine s_compute_nonreflecting_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) ! -------------- - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_outflow_L +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(3), intent(IN) :: lambda real(kind(0d0)), dimension(num_fluids), intent(IN) :: mf, dalpha_rho_ds, dadv_ds real(kind(0d0)), dimension(num_dims), intent(IN) :: dvel_ds @@ -147,7 +163,11 @@ contains !! at the boundary is simply advected outward at the fluid !! velocity. subroutine s_compute_force_free_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) ! -------------- - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_compute_force_free_subsonic_outflow_L +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(3), intent(IN) :: lambda real(kind(0d0)), dimension(num_fluids), intent(IN) :: mf, dalpha_rho_ds, dadv_ds real(kind(0d0)), dimension(num_dims), intent(IN) :: dvel_ds @@ -179,7 +199,11 @@ contains !! subsonic outflow maintains a fixed pressure at the CBC !! boundary in absence of any transverse effects. subroutine s_compute_constant_pressure_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) ! -------------- - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_compute_constant_pressure_subsonic_outflow_L +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(3), intent(IN) :: lambda real(kind(0d0)), dimension(num_fluids), intent(IN) :: mf, dalpha_rho_ds, dadv_ds real(kind(0d0)), dimension(num_dims), intent(IN) :: dvel_ds @@ -212,7 +236,11 @@ contains !! transverse terms may generate a time dependence at the !! inflow boundary. subroutine s_compute_supersonic_inflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) ! -------------- - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_compute_supersonic_inflow_L +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(3), intent(IN) :: lambda real(kind(0d0)), dimension(num_fluids), intent(IN) :: mf, dalpha_rho_ds, dadv_ds real(kind(0d0)), dimension(num_dims), intent(IN) :: dvel_ds @@ -232,7 +260,11 @@ contains !! flow evolution at the boundary is determined completely !! by the interior data. subroutine s_compute_supersonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) ! -------------- - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_compute_supersonic_outflow_L +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(3), intent(IN) :: lambda real(kind(0d0)), dimension(num_fluids), intent(IN) :: mf, dalpha_rho_ds, dadv_ds real(kind(0d0)), dimension(num_dims), intent(IN) :: dvel_ds diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index 9e525f0a2..2eca61fbb 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -67,19 +67,24 @@ module m_data_output end subroutine s_write_abstract_data_files ! ------------------- end interface ! ======================================================== - +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), icfl_sf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), vcfl_sf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), ccfl_sf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), Rc_sf) +!$acc declare link(icfl_sf, vcfl_sf, ccfl_sf, Rc_sf) +#else real(kind(0d0)), allocatable, dimension(:, :, :) :: icfl_sf !< ICFL stability criterion real(kind(0d0)), allocatable, dimension(:, :, :) :: vcfl_sf !< VCFL stability criterion real(kind(0d0)), allocatable, dimension(:, :, :) :: ccfl_sf !< CCFL stability criterion real(kind(0d0)), allocatable, dimension(:, :, :) :: Rc_sf !< Rc stability criterion - - !$acc declare create(icfl_sf, vcfl_sf, ccfl_sf, Rc_sf) +!$acc declare create(icfl_sf, vcfl_sf, ccfl_sf, Rc_sf) +#endif real(kind(0d0)) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids real(kind(0d0)) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids real(kind(0d0)) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids real(kind(0d0)) :: Rc_min_loc, Rc_min_glb !< Rc stability extrema on local and global grids - !$acc declare create(icfl_max_loc, icfl_max_glb, vcfl_max_loc, vcfl_max_glb, ccfl_max_loc, ccfl_max_glb, Rc_min_loc, Rc_min_glb) !> @name ICFL, VCFL, CCFL and Rc stability criteria extrema over all the time-steps @@ -94,6 +99,8 @@ module m_data_output contains + @:s_compute_speed_of_sound() + !> The purpose of this subroutine is to open a new or pre- !! existing run-time information file and append to it the !! basic header information relevant to current simulation. @@ -249,7 +256,7 @@ contains !! Modified dtheta accounting for Fourier filtering in azimuthal direction. ! Computing Stability Criteria at Current Time-step ================ - !$acc parallel loop collapse(3) gang vector default(present) private(alpha_rho, vel, alpha, Re) + !$acc parallel loop collapse(3) gang vector default(present) private(alpha_rho, vel, alpha, Re, fltr_dtheta, Nfq) do l = 0, p do k = 0, n do j = 0, m @@ -364,6 +371,20 @@ contains ! Determining local stability criteria extrema at current time-step +#ifdef CRAY_ACC_WAR + !$acc update host(icfl_sf) + + if (any(Re_size > 0)) then + !$acc update host(vcfl_sf, Rc_sf) + end if + + icfl_max_loc = maxval(icfl_sf) + + if (any(Re_size > 0)) then + vcfl_max_loc = maxval(vcfl_sf) + Rc_min_loc = minval(Rc_sf) + end if +#else !$acc kernels icfl_max_loc = maxval(icfl_sf) !$acc end kernels @@ -374,6 +395,7 @@ contains Rc_min_loc = minval(Rc_sf) !$acc end kernels end if +#endif ! Determining global stability criteria extrema at current time-step if (num_procs > 1) then @@ -980,18 +1002,6 @@ contains call MPI_FILE_CLOSE(ifile, ierr) end if - if (ib) then - var_MOK = int(sys_size + 1, MPI_OFFSET_KIND) - - ! Initial displacement to skip at beginning of file - disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1) - - call MPI_FILE_SET_VIEW(ifile, disp, MPI_INTEGER, MPI_IO_IB_DATA%view, & - 'native', mpi_info_int, ierr) - call MPI_FILE_WRITE_ALL(ifile, MPI_IO_IB_DATA%var%sf, data_size, & - MPI_DOUBLE_PRECISION, status, ierr) - end if - call MPI_FILE_CLOSE(ifile, ierr) #endif @@ -1598,8 +1608,6 @@ contains end subroutine s_write_probe_files ! ----------------------------------- - @:s_compute_speed_of_sound() - !> The goal of this subroutine is to write to the run-time !! information file basic footer information applicable to !! the current computation and to close the file when done. @@ -1649,12 +1657,12 @@ contains integer :: i !< Generic loop iterator ! Allocating/initializing ICFL, VCFL, CCFL and Rc stability criteria - @:ALLOCATE(icfl_sf(0:m, 0:n, 0:p)) + @:ALLOCATE_GLOBAL(icfl_sf(0:m, 0:n, 0:p)) icfl_max = 0d0 if (any(Re_size > 0)) then - @:ALLOCATE(vcfl_sf(0:m, 0:n, 0:p)) - @:ALLOCATE(Rc_sf (0:m, 0:n, 0:p)) + @:ALLOCATE_GLOBAL(vcfl_sf(0:m, 0:n, 0:p)) + @:ALLOCATE_GLOBAL(Rc_sf (0:m, 0:n, 0:p)) vcfl_max = 0d0 Rc_min = 1d3 @@ -1688,9 +1696,9 @@ contains integer :: i !< Generic loop iterator ! Deallocating the ICFL, VCFL, CCFL, and Rc stability criteria - @:DEALLOCATE(icfl_sf) + @:DEALLOCATE_GLOBAL(icfl_sf) if (any(Re_size > 0)) then - @:DEALLOCATE(vcfl_sf, Rc_sf) + @:DEALLOCATE_GLOBAL(vcfl_sf, Rc_sf) end if ! Disassociating the pointer to the procedure that was utilized to diff --git a/src/simulation/m_derived_variables.f90 b/src/simulation/m_derived_variables.f90 index 459193a5b..ce615ea80 100644 --- a/src/simulation/m_derived_variables.f90 +++ b/src/simulation/m_derived_variables.f90 @@ -177,7 +177,7 @@ end subroutine s_compute_derived_variables ! --------------------------- !! @param q_sf Acceleration component subroutine s_derive_acceleration_component(i, q_prim_vf0, q_prim_vf1, & q_prim_vf2, q_prim_vf3, q_sf) ! ---------- - +!DIR$ INLINEALWAYS s_derive_acceleration_component integer, intent(IN) :: i type(scalar_field), dimension(sys_size), intent(IN) :: q_prim_vf0 @@ -202,6 +202,7 @@ subroutine s_derive_acceleration_component(i, q_prim_vf0, q_prim_vf1, & do r = -fd_number, fd_number if (n == 0) then ! 1D simulation + print *, q_sf(j, k, l), q_prim_vf0(mom_idx%beg)%sf(j, k, l), fd_coeff_x(r, j), q_prim_vf0(mom_idx%beg)%sf(r + j, k, l) q_sf(j, k, l) = q_sf(j, k, l) & + q_prim_vf0(mom_idx%beg)%sf(j, k, l)*fd_coeff_x(r, j)* & q_prim_vf0(mom_idx%beg)%sf(r + j, k, l) diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp index db45a2993..9dd955c22 100644 --- a/src/simulation/m_fftw.fpp +++ b/src/simulation/m_fftw.fpp @@ -18,7 +18,12 @@ module m_fftw #if defined(MFC_OpenACC) && defined(__PGI) use cufft +#elif defined(MFC_OpenACC) + use hipfort + use hipfort_check + use hipfort_hipfft #endif + ! ========================================================================== implicit none @@ -27,13 +32,13 @@ module m_fftw s_apply_fourier_filter, & s_finalize_fftw_module -#if !(defined(MFC_OpenACC) && defined(__PGI)) +#if !defined(MFC_OpenACC) include 'fftw3.f03' #endif type(c_ptr) :: fwd_plan, bwd_plan type(c_ptr) :: fftw_real_data, fftw_cmplx_data, fftw_fltr_cmplx_data - integer :: real_size, cmplx_size, x_size, batch_size + integer :: real_size, cmplx_size, x_size, batch_size, Nfq real(c_double), pointer :: data_real(:) !< Real data @@ -43,17 +48,29 @@ module m_fftw complex(c_double_complex), pointer :: data_fltr_cmplx(:) !< !! Filtered complex data in Fourier space -#if defined(MFC_OpenACC) && defined(__PGI) - !$acc declare create(real_size, cmplx_size, x_size, batch_size) +#if defined(MFC_OpenACC) +!$acc declare create(real_size, cmplx_size, x_size, batch_size, Nfq) +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), data_real_gpu) + @:CRAY_DECLARE_GLOBAL(complex(kind(0d0)), dimension(:), data_cmplx_gpu) + @:CRAY_DECLARE_GLOBAL(complex(kind(0d0)), dimension(:), data_fltr_cmplx_gpu) +!$acc declare link(data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu) +#else real(kind(0d0)), allocatable :: data_real_gpu(:) complex(kind(0d0)), allocatable :: data_cmplx_gpu(:) complex(kind(0d0)), allocatable :: data_fltr_cmplx_gpu(:) - !$acc declare create(data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu) +!$acc declare create(data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu) +#endif - integer :: fwd_plan_gpu, bwd_plan_gpu, ierr +#if defined(__PGI) + integer :: fwd_plan_gpu, bwd_plan_gpu +#else + type(c_ptr) :: fwd_plan_gpu, bwd_plan_gpu +#endif + integer :: ierr - integer, allocatable :: cufft_size(:), iembed(:), oembed(:) + integer, allocatable :: gpu_fft_size(:), iembed(:), oembed(:) integer :: istride, ostride, idist, odist, rank #endif @@ -65,13 +82,6 @@ contains !! applying the Fourier filter in the azimuthal direction. subroutine s_initialize_fftw_module() ! ---------------------------------- -#if defined(MFC_OpenACC) && !defined(__PGI) - - print *, "The FFTW module is not supported when using OpenACC with a compiler other than NVHPC/PGI." - stop 1 - -#endif - ! Size of input array going into DFT real_size = p + 1 ! Size of output array coming out of DFT @@ -81,15 +91,15 @@ contains batch_size = x_size*sys_size -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) rank = 1; istride = 1; ostride = 1 - allocate (cufft_size(1:rank), iembed(1:rank), oembed(1:rank)) + allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank)) - cufft_size(1) = real_size; + gpu_fft_size(1) = real_size; iembed(1) = 0 oembed(1) = 0 - +!$acc enter data copyin(real_size, cmplx_size, x_size, sys_size, batch_size, Nfq) !$acc update device(real_size, cmplx_size, x_size, sys_size, batch_size) #else ! Allocate input and output DFT data sizes @@ -106,13 +116,19 @@ contains bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE) #endif -#if defined(MFC_OpenACC) && defined(__PGI) - @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size)) - @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size)) - @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size)) +#if defined(MFC_OpenACC) + @:ALLOCATE_GLOBAL(data_real_gpu(1:real_size*x_size*sys_size)) + @:ALLOCATE_GLOBAL(data_cmplx_gpu(1:cmplx_size*x_size*sys_size)) + @:ALLOCATE_GLOBAL(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size)) + +#if defined(__PGI) + ierr = cufftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, CUFFT_D2Z, batch_size) + ierr = cufftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, CUFFT_Z2D, batch_size) +#else + ierr = hipfftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, HIPFFT_D2Z, batch_size) + ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, HIPFFT_Z2D, batch_size) +#endif - ierr = cufftPlanMany(fwd_plan_gpu, rank, cufft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, CUFFT_D2Z, batch_size) - ierr = cufftPlanMany(bwd_plan_gpu, rank, cufft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, CUFFT_Z2D, batch_size) #endif end subroutine s_initialize_fftw_module ! ------------------------------ @@ -125,14 +141,11 @@ contains type(scalar_field), dimension(sys_size), intent(INOUT) :: q_cons_vf - integer :: Nfq !< Number of kept modes - integer :: i, j, k, l !< Generic loop iterators ! Restrict filter to processors that have cells adjacent to axis if (bc_y%beg >= 0) return - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) !$acc parallel loop collapse(3) gang vector default(present) do k = 1, sys_size @@ -152,13 +165,18 @@ contains end do end do - !$acc host_data use_device(data_real_gpu, data_cmplx_gpu) +!$acc host_data use_device(data_real_gpu, data_cmplx_gpu) +#if defined(__PGI) ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) +#else + ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(data_real_gpu), c_loc(data_cmplx_gpu)) + call hipCheck(hipDeviceSynchronize()) +#endif !$acc end host_data - Nfq = 3 + !$acc update device(Nfq) - !$acc parallel loop collapse(3) gang vector default(present) firstprivate(Nfq) + !$acc parallel loop collapse(3) gang vector default(present) do k = 1, sys_size do j = 0, m do l = 1, Nfq @@ -167,8 +185,13 @@ contains end do end do - !$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu) +!$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu) +#if defined(__PGI) ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) +#else + ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(data_fltr_cmplx_gpu), c_loc(data_real_gpu)) + call hipCheck(hipDeviceSynchronize()) +#endif !$acc end host_data !$acc parallel loop collapse(3) gang vector default(present) @@ -201,13 +224,19 @@ contains end do end do - !$acc host_data use_device(data_real_gpu, data_cmplx_gpu) +!$acc host_data use_device(data_real_gpu, data_cmplx_gpu) +#if defined(__PGI) ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) +#else + ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(data_real_gpu), c_loc(data_cmplx_gpu)) + call hipCheck(hipDeviceSynchronize()) +#endif !$acc end host_data Nfq = min(floor(2d0*real(i, kind(0d0))*pi), cmplx_size) + !$acc update device(Nfq) - !$acc parallel loop collapse(3) gang vector default(present) firstprivate(Nfq) + !$acc parallel loop collapse(3) gang vector default(present) do k = 1, sys_size do j = 0, m do l = 1, Nfq @@ -216,8 +245,13 @@ contains end do end do - !$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu) +!$acc host_data use_device(data_real_gpu, data_fltr_cmplx_gpu) +#if defined(__PGI) ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) +#else + ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(data_fltr_cmplx_gpu), c_loc(data_real_gpu)) + call hipCheck(hipDeviceSynchronize()) +#endif !$acc end host_data !$acc parallel loop collapse(3) gang vector default(present) firstprivate(i) @@ -270,10 +304,16 @@ contains !! applying the Fourier filter in the azimuthal direction. subroutine s_finalize_fftw_module() ! ------------------------------------ -#if defined(MFC_OpenACC) && defined(__PGI) - @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu) +#if defined(MFC_OpenACC) + @:DEALLOCATE_GLOBAL(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu) +#if defined(__PGI) + ierr = cufftDestroy(fwd_plan_gpu) ierr = cufftDestroy(bwd_plan_gpu) +#else + ierr = hipfftDestroy(fwd_plan_gpu) + ierr = hipfftDestroy(bwd_plan_gpu) +#endif #else call fftw_free(fftw_real_data) call fftw_free(fftw_cmplx_data) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index e9d7e81b0..64a56c5e5 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -59,24 +59,40 @@ module m_global_parameters !> @name Cell-boundary (CB) locations in the x-, y- and z-directions, respectively !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), x_cb, y_cb, z_cb) +#else real(kind(0d0)), target, allocatable, dimension(:) :: x_cb, y_cb, z_cb +#endif !> @} !> @name Cell-center (CC) locations in the x-, y- and z-directions, respectively !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), x_cc, y_cc, z_cc) +#else real(kind(0d0)), target, allocatable, dimension(:) :: x_cc, y_cc, z_cc +#endif !> @} !type(bounds_info) :: x_domain, y_domain, z_domain !< !! Locations of the domain bounds in the x-, y- and z-coordinate directions !> @name Cell-width distributions in the x-, y- and z-directions, respectively !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), dx, dy, dz) +#else real(kind(0d0)), target, allocatable, dimension(:) :: dx, dy, dz +#endif !> @} real(kind(0d0)) :: dt !< Size of the time-step - !$acc declare create(x_cb, y_cb, z_cb, x_cc, y_cc, z_cc, dx, dy, dz, dt, m, n, p) - +#ifdef CRAY_ACC_WAR +!$acc declare link(x_cb, y_cb, z_cb, x_cc, y_cc, z_cc, dx, dy, dz) +!$acc declare create(m, n, p, dt) +#else +!$acc declare create(x_cb, y_cb, z_cb, x_cc, y_cc, z_cc, dx, dy, dz, dt, m, n, p) +#endif !> @name Starting time-step iteration, stopping time-step iteration and the number !! of time-step iterations between successive solution backups, respectively !> @{ @@ -94,7 +110,7 @@ module m_global_parameters #:else integer :: num_dims !< Number of spatial dimensions #:endif - integer :: num_fluids !< Number of fluids in the flow + integer :: num_fluids logical :: adv_alphan !< Advection of the last volume fraction logical :: mpp_lim !< Mixture physical parameters (MPP) limits integer :: time_stepper !< Time-stepper algorithm @@ -116,10 +132,6 @@ module m_global_parameters integer :: riemann_solver !< Riemann solver algorithm integer :: wave_speeds !< Wave speeds estimation method integer :: avg_state !< Average state evaluation method - logical :: relax !< activate phase change - integer :: relax_model !< Relaxation model - real(kind(0d0)) :: palpha_eps !< trigger parameter for the p relaxation procedure, phase change model - real(kind(0d0)) :: ptgalpha_eps !< trigger parameter for the pTg relaxation procedure, phase change model logical :: alt_soundspeed !< Alternate mixture sound speed logical :: null_weights !< Null undesired WENO weights logical :: mixture_err !< Mixture properties correction @@ -132,7 +144,16 @@ module m_global_parameters !$acc declare create(num_dims, weno_polyn, weno_order) #:endif - !$acc declare create(mpp_lim, num_fluids, model_eqns, mixture_err, alt_soundspeed, avg_state, mapped_weno, mp_weno, weno_eps, hypoelasticity, relax, palpha_eps,ptgalpha_eps) + !$acc declare create(mpp_lim, num_fluids, model_eqns, mixture_err, alt_soundspeed, avg_state, mapped_weno, mp_weno, weno_eps, hypoelasticity) + + logical :: relax !< activate phase change + integer :: relax_model !< Relaxation model + real(kind(0d0)) :: palpha_eps !< trigger parameter for the p relaxation procedure, phase change model + real(kind(0d0)) :: ptgalpha_eps !< trigger parameter for the pTg relaxation procedure, phase change model + +!#ifndef _CRAYFTN +!$acc declare create(relax, relax_model, palpha_eps,ptgalpha_eps) +!#endif !> @name Boundary conditions (BC) in the x-, y- and z-directions, respectively !> @{ @@ -184,10 +205,18 @@ module m_global_parameters !! numbers, will be non-negligible. !> @{ integer, dimension(2) :: Re_size +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(integer, dimension(:, :), Re_idx) +#else integer, allocatable, dimension(:, :) :: Re_idx - !> @{ - - !$acc declare create(Re_size, Re_idx) +#endif + !> @} +#ifdef CRAY_ACC_WAR +!$acc declare create(Re_size) +!$acc declare link(Re_idx) +#else +!$acc declare create(Re_size, Re_idx) +#endif ! The WENO average (WA) flag regulates whether the calculation of any cell- ! average spatial derivatives is carried out in each cell by utilizing the @@ -288,16 +317,30 @@ module m_global_parameters real(kind(0d0)) :: Ca !< Cavitation number real(kind(0d0)) :: Web !< Weber number real(kind(0d0)) :: Re_inv !< Inverse Reynolds number +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), weight) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), R0) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), V0) +!$acc declare link(weight, R0, V0) +#else real(kind(0d0)), dimension(:), allocatable :: weight !< Simpson quadrature weights real(kind(0d0)), dimension(:), allocatable :: R0 !< Bubble sizes real(kind(0d0)), dimension(:), allocatable :: V0 !< Bubble velocities +!$acc declare create(weight, R0, V0) +#endif logical :: bubbles !< Bubbles on/off logical :: polytropic !< Polytropic switch logical :: polydisperse !< Polydisperse bubbles integer :: bubble_model !< Gilmore or Keller--Miksis bubble model integer :: thermal !< Thermal behavior. 1 = adiabatic, 2 = isotherm, 3 = transfer +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), ptil) +!$acc declare link(ptil) +#else real(kind(0d0)), allocatable, dimension(:, :, :) :: ptil !< Pressure modification +!$acc declare create(ptil) +#endif real(kind(0d0)) :: poly_sigma !< log normal sigma for polydisperse PDF logical :: qbmm !< Quadrature moment method @@ -310,23 +353,39 @@ module m_global_parameters !$acc declare create(nb) #:endif - !$acc declare create(R0ref, Ca, Web, Re_inv, weight, R0, V0, bubbles, polytropic, polydisperse, qbmm, nmomsp, nmomtot, R0_type, ptil, bubble_model, thermal, poly_sigma) +!$acc declare create(R0ref, Ca, Web, Re_inv, bubbles, polytropic, polydisperse, qbmm, nmomsp, nmomtot, R0_type, bubble_model, thermal, poly_sigma) +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(scalar_field), dimension(:), mom_sp) + @:CRAY_DECLARE_GLOBAL(type(scalar_field), dimension(:, :, :), mom_3d) +!$acc declare link(mom_sp, mom_3d) +#else type(scalar_field), allocatable, dimension(:) :: mom_sp type(scalar_field), allocatable, dimension(:, :, :) :: mom_3d +!$acc declare create(mom_sp, mom_3d) +#endif !> @} - !$acc declare create(mom_sp, mom_3d) !> @name Physical bubble parameters (see Ando 2010, Preston 2007) !> @{ real(kind(0d0)) :: R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v +!$acc declare create(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v) +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), k_n, k_v, pb0, mass_n0, mass_v0, Pe_T) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN) +!$acc declare link( k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN) +#else real(kind(0d0)), dimension(:), allocatable :: k_n, k_v, pb0, mass_n0, mass_v0, Pe_T real(kind(0d0)), dimension(:), allocatable :: Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN +!$acc declare create( k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN) +#endif real(kind(0d0)) :: mul0, ss, gamma_v, mu_v real(kind(0d0)) :: gamma_m, gamma_n, mu_n real(kind(0d0)) :: gam !> @} - !$acc declare create(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN , mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam) + + !$acc declare create(mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam) + !> @name Acoustic monopole parameters !> @{ logical :: monopole !< Monopole switch @@ -341,19 +400,33 @@ module m_global_parameters integer :: intxb, intxe integer :: bubxb, bubxe integer :: strxb, strxe - !$acc declare create(momxb, momxe, advxb, advxe, contxb, contxe, intxb, intxe, bubxb, bubxe, strxb, strxe) +!$acc declare create(momxb, momxe, advxb, advxe, contxb, contxe, intxb, intxe, bubxb, bubxe, strxb, strxe) +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps) +!$acc declare link(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps) +#else real(kind(0d0)), allocatable, dimension(:) :: gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps - !$acc declare create(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps) +!$acc declare create(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps) +#endif real(kind(0d0)) :: mytime !< Current simulation time real(kind(0d0)) :: finaltime !< Final simulation time logical :: weno_flat, riemann_flat, cu_mpi +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(pres_field), dimension(:), pb_ts) + @:CRAY_DECLARE_GLOBAL(type(pres_field), dimension(:), mv_ts) + +!$acc declare link(pb_ts, mv_ts) +#else type(pres_field), allocatable, dimension(:) :: pb_ts + type(pres_field), allocatable, dimension(:) :: mv_ts - !$acc declare create(pb_ts, mv_ts) + +!$acc declare create(pb_ts, mv_ts) +#endif ! ====================================================================== contains @@ -538,6 +611,7 @@ contains weno_polyn = (weno_order - 1)/2 !$acc update device(weno_polyn) !$acc update device(nb) + !$acc update device(num_dims, num_fluids) #:endif ! Initializing the number of fluids for which viscous effects will @@ -612,7 +686,7 @@ contains ! print*, 'alf idx', alf_idx ! print*, 'bub -idx beg end', bub_idx%beg, bub_idx%end - @:ALLOCATE(weight(nb), R0(nb), V0(nb)) + @:ALLOCATE_GLOBAL(weight(nb), R0(nb), V0(nb)) @:ALLOCATE(bub_idx%rs(nb), bub_idx%vs(nb)) @:ALLOCATE(bub_idx%ps(nb), bub_idx%ms(nb)) @@ -674,7 +748,7 @@ contains if (polytropic) then pv = fluid_pp(1)%pv pv = pv/pref - @:ALLOCATE(pb0(nb)) + @:ALLOCATE_GLOBAL(pb0(nb)) if (Web == dflt_real) then pb0 = pref pb0 = pb0/pref @@ -683,7 +757,6 @@ contains rhoref = 1d0 end if end if - end if if (hypoelasticity) then @@ -726,7 +799,7 @@ contains @:ALLOCATE(bub_idx%rs(nb), bub_idx%vs(nb)) @:ALLOCATE(bub_idx%ps(nb), bub_idx%ms(nb)) - @:ALLOCATE(weight(nb), R0(nb), V0(nb)) + @:ALLOCATE_GLOBAL(weight(nb), R0(nb), V0(nb)) do i = 1, nb if (polytropic) then @@ -773,7 +846,7 @@ contains ! fluids whose interface will support effects of surface tension if (any(Re_size > 0)) then - @:ALLOCATE(Re_idx(1:2, 1:maxval(Re_size))) + @:ALLOCATE_GLOBAL(Re_idx(1:2, 1:maxval(Re_size))) k = 0 do i = 1, num_fluids @@ -849,7 +922,7 @@ contains ix%end = m - ix%beg; iy%end = n - iy%beg; iz%end = p - iz%beg - @:ALLOCATE(ptil(ix%beg:ix%end, iy%beg:iy%end, iz%beg:iz%end)) + @:ALLOCATE_GLOBAL(ptil(ix%beg:ix%end, iy%beg:iy%end, iz%beg:iz%end)) end if if (probe_wrt) then @@ -891,21 +964,31 @@ contains intxe = internalEnergies_idx%end !$acc update device(momxb, momxe, advxb, advxe, contxb, contxe, bubxb, bubxe, intxb, intxe, sys_size, buff_size, E_idx, alf_idx, strxb, strxe) + !$acc update device(m, n, p) + + !$acc update device(alt_soundspeed, monopole, num_mono) + !$acc update device(dt, sys_size, buff_size, pref, rhoref, gamma_idx, pi_inf_idx, E_idx, alf_idx, stress_idx, mpp_lim, bubbles, hypoelasticity, alt_soundspeed, avg_state, num_fluids, model_eqns, num_dims, mixture_err, grid_geometry, cyl_coord, mapped_weno, mp_weno, weno_eps) + + !$acc enter data copyin(nb, R0ref, Ca, Web, Re_inv, weight, R0, V0, bubbles, polytropic, polydisperse, qbmm, R0_type, ptil, bubble_model, thermal, poly_sigma) + !$acc enter data copyin(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN , mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam) + !$acc enter data copyin(dir_idx, dir_flg, dir_idx_tau) + + !$acc enter data copyin(relax, relax_model, palpha_eps,ptgalpha_eps) ! Allocating grid variables for the x-, y- and z-directions - @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size)) - @:ALLOCATE(x_cc(-buff_size:m + buff_size)) - @:ALLOCATE(dx(-buff_size:m + buff_size)) + @:ALLOCATE_GLOBAL(x_cb(-1 - buff_size:m + buff_size)) + @:ALLOCATE_GLOBAL(x_cc(-buff_size:m + buff_size)) + @:ALLOCATE_GLOBAL(dx(-buff_size:m + buff_size)) if (n == 0) return; - @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size)) - @:ALLOCATE(y_cc(-buff_size:n + buff_size)) - @:ALLOCATE(dy(-buff_size:n + buff_size)) + @:ALLOCATE_GLOBAL(y_cb(-1 - buff_size:n + buff_size)) + @:ALLOCATE_GLOBAL(y_cc(-buff_size:n + buff_size)) + @:ALLOCATE_GLOBAL(dy(-buff_size:n + buff_size)) if (p == 0) return; - @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size)) - @:ALLOCATE(z_cc(-buff_size:p + buff_size)) - @:ALLOCATE(dz(-buff_size:p + buff_size)) + @:ALLOCATE_GLOBAL(z_cb(-1 - buff_size:p + buff_size)) + @:ALLOCATE_GLOBAL(z_cc(-buff_size:p + buff_size)) + @:ALLOCATE_GLOBAL(dz(-buff_size:p + buff_size)) end subroutine s_initialize_global_parameters_module ! ----------------- @@ -949,17 +1032,17 @@ contains ! fluids and any pairs of fluids whose interfaces supported effects ! of surface tension if (any(Re_size > 0)) then - @:DEALLOCATE(Re_idx) + @:DEALLOCATE_GLOBAL(Re_idx) end if ! Deallocating grid variables for the x-, y- and z-directions - @:DEALLOCATE(x_cb, x_cc, dx) + @:DEALLOCATE_GLOBAL(x_cb, x_cc, dx) if (n == 0) return; - @:DEALLOCATE(y_cb, y_cc, dy) + @:DEALLOCATE_GLOBAL(y_cb, y_cc, dy) if (p == 0) return; - @:DEALLOCATE(z_cb, z_cc, dz) + @:DEALLOCATE_GLOBAL(z_cb, z_cc, dz) deallocate (proc_coords) if (parallel_io) then diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp index c799cb367..5e8f9d2e9 100644 --- a/src/simulation/m_hypoelastic.fpp +++ b/src/simulation/m_hypoelastic.fpp @@ -22,6 +22,18 @@ module m_hypoelastic private; public :: s_initialize_hypoelastic_module, & s_compute_hypoelastic_rhs +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), Gs) + !$acc declare link(Gs) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), du_dx, du_dy, du_dz) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), dv_dx, dv_dy, dv_dz) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), dw_dx, dw_dy, dw_dz) + !$acc declare link(du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), rho_K_field, G_K_field) +!$acc declare link(rho_K_field, G_K_field) +#else real(kind(0d0)), allocatable, dimension(:) :: Gs !$acc declare create(Gs) @@ -31,7 +43,9 @@ module m_hypoelastic !$acc declare create(du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz) real(kind(0d0)), allocatable, dimension(:, :, :) :: rho_K_field, G_K_field - !$acc declare create(rho_K_field, G_K_field) +!$acc declare create(rho_K_field, G_K_field) + +#endif contains @@ -39,14 +53,14 @@ contains integer :: i - @:ALLOCATE(Gs(1:num_fluids)) - @:ALLOCATE(rho_K_field(0:m,0:n,0:p), G_K_field(0:m,0:n,0:p)) - @:ALLOCATE(du_dx(0:m,0:n,0:p)) + @:ALLOCATE_GLOBAL(Gs(1:num_fluids)) + @:ALLOCATE_GLOBAL(rho_K_field(0:m,0:n,0:p), G_K_field(0:m,0:n,0:p)) + @:ALLOCATE_GLOBAL(du_dx(0:m,0:n,0:p)) if (n > 0) then - @:ALLOCATE(du_dy(0:m,0:n,0:p), dv_dx(0:m,0:n,0:p), dv_dy(0:m,0:n,0:p)) + @:ALLOCATE_GLOBAL(du_dy(0:m,0:n,0:p), dv_dx(0:m,0:n,0:p), dv_dy(0:m,0:n,0:p)) if (p > 0) then - @:ALLOCATE(du_dz(0:m,0:n,0:p), dv_dz(0:m,0:n,0:p)) - @:ALLOCATE(dw_dx(0:m,0:n,0:p), dw_dy(0:m,0:n,0:p), dw_dz(0:m,0:n,0:p)) + @:ALLOCATE_GLOBAL(du_dz(0:m,0:n,0:p), dv_dz(0:m,0:n,0:p)) + @:ALLOCATE_GLOBAL(dw_dx(0:m,0:n,0:p), dw_dy(0:m,0:n,0:p), dw_dz(0:m,0:n,0:p)) end if end if diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index 342a1de63..e9620352b 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -7,7 +7,6 @@ !> @brief This module is used to handle all operations related to immersed !! boundary methods (IBMs) module m_ibm - ! Dependencies ============================================================= use m_derived_types !< Definitions of the derived types @@ -38,22 +37,30 @@ module m_ibm s_finalize_ibm_module type(integer_field), public :: ib_markers - !! Marker for solid cells. 0 if liquid, the patch id of its IB if solid +!$acc declare create(ib_markers) + +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), levelset) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :, :), levelset_norm) + @:CRAY_DECLARE_GLOBAL(type(ghost_point), dimension(:), ghost_points) +!$acc declare link(levelset, levelset_norm, ghost_points) +#else + + !! Marker for solid cells. 0 if liquid, the patch id of its IB if solid real(kind(0d0)), dimension(:, :, :, :), allocatable :: levelset !! Matrix of distance to IB - real(kind(0d0)), dimension(:, :, :, :, :), allocatable :: levelset_norm !! Matrix of normal vector to IB - type(ghost_point), dimension(:), allocatable :: ghost_points !! Matrix of normal vector to IB +!$acc declare create(levelset, levelset_norm, ghost_points) +#endif + integer :: gp_layers !< Number of ghost point layers integer :: num_gps !< Number of ghost points - - !$acc declare create(ib_markers, levelset, levelset_norm) - !$acc declare create(ghost_points, gp_layers, num_gps) + !$acc declare create(gp_layers, num_gps) contains @@ -69,16 +76,21 @@ contains @:ALLOCATE(ib_markers%sf(-gp_layers:m+gp_layers, & -gp_layers:n+gp_layers, 0:0)) end if + @:ACC_SETUP_SFs(ib_markers) ! @:ALLOCATE(ib_markers%sf(0:m, 0:n, 0:p)) - @:ALLOCATE(levelset(0:m, 0:n, 0:p, num_ibs)) - @:ALLOCATE(levelset_norm(0:m, 0:n, 0:p, num_ibs, 3)) + @:ALLOCATE_GLOBAL(levelset(0:m, 0:n, 0:p, num_ibs)) + @:ALLOCATE_GLOBAL(levelset_norm(0:m, 0:n, 0:p, num_ibs, 3)) + + !$acc enter data copyin(gp_layers, num_gps) end subroutine s_initialize_ibm_module subroutine s_ibm_setup() - integer :: i, j + integer :: i, j, k + + !$acc update device(ib_markers%sf) ! Get neighboring IB variables from other processors call s_mpi_sendrecv_ib_buffers(ib_markers, gp_layers) @@ -86,7 +98,8 @@ contains call s_find_num_ghost_points() !$acc update device(num_gps) - @:ALLOCATE(ghost_points(num_gps)) + @:ALLOCATE_GLOBAL(ghost_points(num_gps)) + !$acc enter data copyin(ghost_points) call s_find_ghost_points(ghost_points) !$acc update device(ghost_points) @@ -116,7 +129,7 @@ contains real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), optional, intent(INOUT) :: pb, mv - integer :: i, j, k, l, q, r !< Iterator variables + integer :: i, j, k, l, q, r!< Iterator variables integer :: patch_id !< Patch ID of ghost point real(kind(0d0)) :: rho, gamma, pi_inf, dyn_pres !< Mixture variables real(kind(0d0)), dimension(2) :: Re_K @@ -124,7 +137,8 @@ contains real(kind(0d0)) :: qv_K real(kind(0d0)), dimension(num_fluids) :: Gs - real(kind(0d0)) :: pres_IP, vel_IP(3), vel_norm_IP(3) + real(kind(0d0)) :: pres_IP, coeff + real(kind(0d0)), dimension(3) :: vel_IP, vel_norm_IP real(kind(0d0)), dimension(num_fluids) :: alpha_rho_IP, alpha_IP real(kind(0d0)), dimension(nb) :: r_IP, v_IP, pb_IP, mv_IP real(kind(0d0)), dimension(nb*nmom) :: nmom_IP @@ -140,14 +154,14 @@ contains real(kind(0d0)) :: buf type(ghost_point) :: gp - !$acc parallel loop gang vector private(physical_loc, dyn_pres, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, vel_g, vel_norm_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, presb_IP, massv_IP, rho, gamma, pi_inf, Re_K, G_K, Gs, gp, norm, buf) + !$acc parallel loop gang vector private(physical_loc, dyn_pres, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, vel_g, vel_norm_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, presb_IP, massv_IP, rho, gamma, pi_inf, Re_K, G_K, Gs, gp, norm, buf, j, k, l, q, coeff) do i = 1, num_gps gp = ghost_points(i) j = gp%loc(1) k = gp%loc(2) l = gp%loc(3) - patch_id = gp%ib_patch_id + patch_id = ghost_points(i)%ib_patch_id ! Calculate physical location of GP if (p > 0) then @@ -156,7 +170,7 @@ contains physical_loc = [x_cc(j), y_cc(k), 0d0] end if - ! Interpolate primitive variables at image point associated w/ GP + !Interpolate primitive variables at image point associated w/ GP if (bubbles .and. .not. qbmm) then call s_interpolate_image_point(q_prim_vf, gp, & alpha_rho_IP, alpha_IP, pres_IP, vel_IP, & @@ -174,9 +188,10 @@ contains alpha_rho_IP, alpha_IP, pres_IP, vel_IP) end if - dyn_pres = 0 + dyn_pres = 0d0 ! Set q_prim_vf params at GP so that mixture vars calculated properly + !$acc loop seq do q = 1, num_fluids q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q) q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q) @@ -383,7 +398,6 @@ contains end subroutine s_compute_image_points subroutine s_find_num_ghost_points() - integer, dimension(2*gp_layers + 1, 2*gp_layers + 1) & :: subsection_2D integer, dimension(2*gp_layers + 1, 2*gp_layers + 1, 2*gp_layers + 1) & @@ -629,7 +643,6 @@ contains subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb, mv, presb_IP, massv_IP) !$acc routine seq - type(scalar_field), & dimension(sys_size), & intent(IN) :: q_prim_vf !< Primitive Variables @@ -777,9 +790,10 @@ contains !! @param fV Current bubble velocity !! @param fpb Internal bubble pressure subroutine s_finalize_ibm_module() - deallocate (ib_markers%sf) - deallocate (levelset) - deallocate (levelset_norm) + + @:DEALLOCATE(ib_markers%sf) + @:DEALLOCATE_GLOBAL(levelset) + @:DEALLOCATE_GLOBAL(levelset_norm) end subroutine s_finalize_ibm_module end module m_ibm diff --git a/src/simulation/m_monopole.fpp b/src/simulation/m_monopole.fpp index 519ff38cf..431e14a18 100644 --- a/src/simulation/m_monopole.fpp +++ b/src/simulation/m_monopole.fpp @@ -20,10 +20,30 @@ module m_monopole private; public :: s_initialize_monopole_module, s_monopole_calculations, & s_compute_monopole_rhs +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(integer, dimension(:), pulse, support) + !$acc declare link(pulse, support) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), loc_mono) + !$acc declare link(loc_mono) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), foc_length, aperture, support_width) + !$acc declare link(foc_length, aperture, support_width) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), mag, length, npulse, dir, delay) + !$acc declare link(mag, length, npulse, dir, delay) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), mono_mass_src, mono_e_src) + !$acc declare link(mono_mass_src, mono_e_src) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), mono_mom_src) +!$acc declare link(mono_mom_src) + +#else integer, allocatable, dimension(:) :: pulse, support !$acc declare create(pulse, support) - real(kind(0d0)), allocatable, dimension(:, :) :: loc_mono + real(kind(0d0)), allocatable, target, dimension(:, :) :: loc_mono !$acc declare create(loc_mono) real(kind(0d0)), allocatable, dimension(:) :: foc_length, aperture, support_width @@ -37,14 +57,16 @@ module m_monopole real(kind(0d0)), allocatable, dimension(:, :, :) :: mono_mass_src, mono_e_src real(kind(0d0)), allocatable, dimension(:, :, :, :) :: mono_mom_src !> @} - !$acc declare create(mono_mass_src, mono_e_src, mono_mom_src) +!$acc declare create(mono_mass_src, mono_e_src, mono_mom_src) + +#endif contains subroutine s_initialize_monopole_module() integer :: i, j !< generic loop variables - @:ALLOCATE(mag(1:num_mono), support(1:num_mono), length(1:num_mono), npulse(1:num_mono), pulse(1:num_mono), dir(1:num_mono), delay(1:num_mono), loc_mono(1:3, 1:num_mono), foc_length(1:num_mono), aperture(1:num_mono), support_width(1:num_mono)) + @:ALLOCATE_GLOBAL(mag(1:num_mono), support(1:num_mono), length(1:num_mono), npulse(1:num_mono), pulse(1:num_mono), dir(1:num_mono), delay(1:num_mono), loc_mono(1:3, 1:num_mono), foc_length(1:num_mono), aperture(1:num_mono), support_width(1:num_mono)) do i = 1, num_mono mag(i) = mono(i)%mag @@ -63,9 +85,9 @@ contains end do !$acc update device(mag, support, length, npulse, pulse, dir, delay, foc_length, aperture, loc_mono, support_width) - @:ALLOCATE(mono_mass_src(0:m, 0:n, 0:p)) - @:ALLOCATE(mono_mom_src(1:num_dims, 0:m, 0:n, 0:p)) - @:ALLOCATE(mono_E_src(0:m, 0:n, 0:p)) + @:ALLOCATE_GLOBAL(mono_mass_src(0:m, 0:n, 0:p)) + @:ALLOCATE_GLOBAL(mono_mom_src(1:num_dims, 0:m, 0:n, 0:p)) + @:ALLOCATE_GLOBAL(mono_E_src(0:m, 0:n, 0:p)) end subroutine @@ -321,6 +343,7 @@ contains !! @param mono_loc Nominal source term location !! @param mono_leng Length of source term in space function f_delta(j, k, l, mono_loc, mono_leng, nm, angle, angle_z) + !$acc routine seq real(kind(0d0)), dimension(3), intent(IN) :: mono_loc integer, intent(IN) :: nm diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index 969d59fc6..2ca830bad 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -28,6 +28,14 @@ module m_mpi_proxy implicit none +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), q_cons_buff_send) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), q_cons_buff_recv) + @:CRAY_DECLARE_GLOBAL(integer, dimension(:), ib_buff_send) + @:CRAY_DECLARE_GLOBAL(integer, dimension(:), ib_buff_recv) + +!$acc declare link(q_cons_buff_recv, q_cons_buff_send, ib_buff_send, ib_buff_recv) +#else real(kind(0d0)), private, allocatable, dimension(:) :: q_cons_buff_send !< !! This variable is utilized to pack and send the buffer of the cell-average !! conservative variables, for a single computational domain boundary at the @@ -48,13 +56,13 @@ module m_mpi_proxy !! immersed boundary markers, for a single computational domain boundary !! at the time, from the relevant neighboring processor. +!$acc declare create(q_cons_buff_send, q_cons_buff_recv, ib_buff_send, ib_buff_recv) +#endif !> @name Generic flags used to identify and report MPI errors !> @{ integer, private :: err_code, ierr, v_size + !$acc declare create(v_size) !> @} - - !$acc declare create(q_cons_buff_send, q_cons_buff_recv, v_size) - !real :: s_time, e_time !real :: compress_time, mpi_time, decompress_time !integer :: nCalls_time = 0 @@ -75,43 +83,45 @@ contains if (qbmm .and. .not. polytropic) then if (n > 0) then if (p > 0) then - @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)* & + @:ALLOCATE_GLOBAL(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)* & & (m + 2*buff_size + 1)* & & (n + 2*buff_size + 1)* & & (p + 2*buff_size + 1)/ & & (min(m, n, p) + 2*buff_size + 1))) else - @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)* & + @:ALLOCATE_GLOBAL(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)* & & (max(m, n) + 2*buff_size + 1))) end if else - @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4))) + @:ALLOCATE_GLOBAL(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4))) end if - @:ALLOCATE(q_cons_buff_recv(0:ubound(q_cons_buff_send, 1))) + @:ALLOCATE_GLOBAL(q_cons_buff_recv(0:ubound(q_cons_buff_send, 1))) v_size = sys_size + 2*nb*4 else if (n > 0) then if (p > 0) then - @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*sys_size* & + @:ALLOCATE_GLOBAL(q_cons_buff_send(0:-1 + buff_size*sys_size* & & (m + 2*buff_size + 1)* & & (n + 2*buff_size + 1)* & & (p + 2*buff_size + 1)/ & & (min(m, n, p) + 2*buff_size + 1))) else - @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*sys_size* & + @:ALLOCATE_GLOBAL(q_cons_buff_send(0:-1 + buff_size*sys_size* & & (max(m, n) + 2*buff_size + 1))) end if else - @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*sys_size)) + @:ALLOCATE_GLOBAL(q_cons_buff_send(0:-1 + buff_size*sys_size)) end if - @:ALLOCATE(q_cons_buff_recv(0:ubound(q_cons_buff_send, 1))) + @:ALLOCATE_GLOBAL(q_cons_buff_recv(0:ubound(q_cons_buff_send, 1))) v_size = sys_size end if +!$acc update device(v_size) + #endif end subroutine s_initialize_mpi_proxy_module ! ------------------------- @@ -840,7 +850,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send, ib_buff_recv, ib_buff_send) @@ -896,7 +906,7 @@ contains MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -953,7 +963,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -1005,13 +1015,12 @@ contains MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(q_cons_buff_recv) end if @@ -1127,7 +1136,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -1156,7 +1165,6 @@ contains !$acc wait else #endif - !$acc update host(q_cons_buff_send) if (qbmm .and. .not. polytropic) then call MPI_SENDRECV( & @@ -1178,7 +1186,7 @@ contains MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -1234,8 +1242,7 @@ contains end if !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -1264,7 +1271,6 @@ contains !$acc wait else #endif - !$acc update host(q_cons_buff_send) if (qbmm .and. .not. polytropic) then @@ -1287,7 +1293,7 @@ contains MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -1411,8 +1417,7 @@ contains end if !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -1464,8 +1469,7 @@ contains MPI_DOUBLE_PRECISION, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -1520,8 +1524,7 @@ contains end if !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -1573,14 +1576,12 @@ contains MPI_DOUBLE_PRECISION, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(q_cons_buff_recv) end if @@ -1694,8 +1695,7 @@ contains end if !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -1747,8 +1747,7 @@ contains MPI_DOUBLE_PRECISION, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -1804,7 +1803,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -1856,14 +1855,12 @@ contains MPI_DOUBLE_PRECISION, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(q_cons_buff_recv) end if @@ -1986,8 +1983,7 @@ contains end if !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -2017,8 +2013,7 @@ contains buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & MPI_DOUBLE_PRECISION, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -2075,8 +2070,7 @@ contains end if !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -2107,13 +2101,12 @@ contains MPI_DOUBLE_PRECISION, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(q_cons_buff_recv) end if @@ -2232,7 +2225,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -2262,7 +2255,7 @@ contains MPI_DOUBLE_PRECISION, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -2322,7 +2315,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) @@ -2351,14 +2344,13 @@ contains buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & MPI_DOUBLE_PRECISION, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(q_cons_buff_recv) end if @@ -2449,19 +2441,19 @@ contains if (n > 0) then if (p > 0) then - @:ALLOCATE(ib_buff_send(0:-1 + gp_layers * & + @:ALLOCATE_GLOBAL(ib_buff_send(0:-1 + gp_layers * & & (m + 2*gp_layers + 1)* & & (n + 2*gp_layers + 1)* & & (p + 2*gp_layers + 1)/ & & (min(m, n, p) + 2*gp_layers + 1))) else - @:ALLOCATE(ib_buff_send(0:-1 + gp_layers* & + @:ALLOCATE_GLOBAL(ib_buff_send(0:-1 + gp_layers* & & (max(m, n) + 2*gp_layers + 1))) end if else - @:ALLOCATE(ib_buff_send(0:-1 + gp_layers)) + @:ALLOCATE_GLOBAL(ib_buff_send(0:-1 + gp_layers)) end if - @:ALLOCATE(ib_buff_recv(0:ubound(ib_buff_send, 1))) + @:ALLOCATE_GLOBAL(ib_buff_recv(0:ubound(ib_buff_send, 1))) !nCalls_time = nCalls_time + 1 @@ -2483,7 +2475,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send, ib_buff_recv, ib_buff_send) @@ -2514,7 +2506,7 @@ contains MPI_INTEGER, bc_x%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -2533,7 +2525,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -2563,13 +2555,13 @@ contains MPI_INTEGER, bc_x%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if @@ -2605,7 +2597,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -2634,7 +2626,7 @@ contains MPI_INTEGER, bc_x%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -2653,7 +2645,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -2683,7 +2675,7 @@ contains MPI_INTEGER, bc_x%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -2727,7 +2719,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -2758,7 +2750,7 @@ contains MPI_INTEGER, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -2778,7 +2770,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -2809,13 +2801,13 @@ contains MPI_INTEGER, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if @@ -2853,7 +2845,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -2884,7 +2876,7 @@ contains MPI_INTEGER, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -2904,7 +2896,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -2935,13 +2927,13 @@ contains MPI_INTEGER, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if @@ -2982,7 +2974,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -3013,7 +3005,7 @@ contains MPI_INTEGER, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -3033,7 +3025,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -3064,13 +3056,13 @@ contains MPI_INTEGER, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if @@ -3109,7 +3101,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -3139,7 +3131,7 @@ contains MPI_INTEGER, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif @@ -3160,7 +3152,7 @@ contains !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi) then !$acc host_data use_device( ib_buff_recv, ib_buff_send ) @@ -3190,13 +3182,13 @@ contains MPI_INTEGER, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) end if #endif end if -#if defined(_OPENACC) && defined(__PGI) +#if defined(MFC_OpenACC) if (cu_mpi .eqv. .false.) then !$acc update device(ib_buff_recv) end if @@ -3229,9 +3221,9 @@ contains #ifdef MFC_MPI ! Deallocating q_cons_buff_send and q_cons_buff_recv - @:DEALLOCATE(q_cons_buff_send, q_cons_buff_recv) + @:DEALLOCATE_GLOBAL(q_cons_buff_send, q_cons_buff_recv) if (ib) then - @:DEALLOCATE(ib_buff_send, ib_buff_recv) + @:DEALLOCATE_GLOBAL(ib_buff_send, ib_buff_recv) end if #endif diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp index b916688b6..02b9f2075 100644 --- a/src/simulation/m_qbmm.fpp +++ b/src/simulation/m_qbmm.fpp @@ -26,21 +26,32 @@ module m_qbmm private; public :: s_initialize_qbmm_module, s_mom_inv, s_coeff, s_compute_qbmm_rhs +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :, :), momrhs) +!$acc declare link(momrhs) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :, :) :: momrhs - +!$acc declare create(momrhs) +#endif #:if MFC_CASE_OPTIMIZATION integer, parameter :: nterms = ${nterms}$ #:else integer :: nterms + !$acc declare create(nterms) #:endif - type(int_bounds_info) :: is1, is2, is3 + type(int_bounds_info) :: is1_qbmm, is2_qbmm, is3_qbmm +!$acc declare create(is1_qbmm, is2_qbmm, is3_qbmm) +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(integer, dimension(:), bubrs) + @:CRAY_DECLARE_GLOBAL(integer, dimension(:, :), bubmoms) +!$acc declare link(bubrs, bubmoms) +#else integer, allocatable, dimension(:) :: bubrs integer, allocatable, dimension(:, :) :: bubmoms - - !$acc declare create(momrhs, nterms, is1, is2, is3) - !$acc declare create(bubrs, bubmoms) +!$acc declare create(bubrs, bubmoms) +#endif contains @@ -58,11 +69,12 @@ contains nterms = 7 end if + !$acc enter data copyin(nterms) !$acc update device(nterms) #:endif - @:ALLOCATE(momrhs(3, 0:2, 0:2, nterms, nb)) + @:ALLOCATE_GLOBAL(momrhs(3, 0:2, 0:2, nterms, nb)) momrhs = 0d0 ! Assigns the required RHS moments for moment transport equations @@ -394,8 +406,8 @@ contains !$acc update device(momrhs) - @:ALLOCATE(bubrs(1:nb)) - @:ALLOCATE(bubmoms(1:nb, 1:nmom)) + @:ALLOCATE_GLOBAL(bubrs(1:nb)) + @:ALLOCATE_GLOBAL(bubmoms(1:nb, 1:nmom)) do i = 1, nb bubrs(i) = bub_idx%rs(i) @@ -651,9 +663,14 @@ contains end subroutine !Coefficient array for non-polytropic model (pb and mv values are accounted in wght_pb and wght_mv) + subroutine s_coeff_nonpoly(pres, rho, c, coeffs) - !$acc routine seq - real(kind(0.d0)), intent(INOUT) :: pres, rho, c +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_coeff_nonpoly +#else +!$acc routine seq +#endif + real(kind(0.d0)), intent(IN) :: pres, rho, c real(kind(0.d0)), dimension(nterms, 0:2, 0:2), intent(OUT) :: coeffs integer :: i1, i2, q @@ -720,7 +737,12 @@ contains !Coefficient array for polytropic model (pb for each R0 bin accounted for in wght_pb) subroutine s_coeff(pres, rho, c, coeffs) - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_coeff +#else +!$acc routine seq +#endif + real(kind(0.d0)), intent(INOUT) :: pres, rho, c real(kind(0.d0)), dimension(nterms, 0:2, 0:2), intent(OUT) :: coeffs integer :: i1, i2, q @@ -797,10 +819,14 @@ contains integer :: id1, id2, id3 integer :: i1, i2 + is1_qbmm = ix; is2_qbmm = iy; is3_qbmm = iz + + !$acc update device(is1_qbmm, is2_qbmm, is3_qbmm) + !$acc parallel loop collapse(3) gang vector default(present) private(moms, msum, wght, abscX, abscY, wght_pb, wght_mv, wght_ht, coeff, ht, r, q, n_tait, B_tait, pres, rho, nbub, c, alf, R3, momsum, drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, T_bar, grad_T) - do id3 = iz%beg, iz%end - do id2 = iy%beg, iy%end - do id1 = ix%beg, ix%end + do id3 = is3_qbmm%beg, is3_qbmm%end + do id2 = is2_qbmm%beg, is2_qbmm%end + do id1 = is1_qbmm%beg, is1_qbmm%end alf = q_prim_vf(alf_idx)%sf(id1, id2, id3) pres = q_prim_vf(E_idx)%sf(id1, id2, id3) @@ -989,7 +1015,11 @@ contains end subroutine s_mom_inv subroutine s_chyqmom(momin, wght, abscX, abscY) - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_chyqmom +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(nnode), intent(INOUT) :: wght, abscX, abscY real(kind(0d0)), dimension(nmom), intent(IN) :: momin @@ -1052,7 +1082,11 @@ contains end subroutine s_chyqmom subroutine s_hyqmom(frho, fup, fmom) - !$acc routine seq +#ifdef CRAY_ACC_WAR + !DIR$ INLINEALWAYS s_hyqmom +#else +!$acc routine seq +#endif real(kind(0d0)), dimension(2), intent(INOUT) :: frho, fup real(kind(0d0)), dimension(3), intent(IN) :: fmom real(kind(0d0)) :: bu, d2, c2 @@ -1086,6 +1120,7 @@ contains function f_quad2D(abscX, abscY, wght_in, pow) !$acc routine seq real(kind(0.d0)), dimension(nnode), intent(IN) :: abscX, abscY, wght_in + real(kind(0.d0)), dimension(3), intent(IN) :: pow real(kind(0.d0)) :: f_quad2D diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index e7cbeb1d7..d2f2c9032 100644 --- a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -59,15 +59,17 @@ module m_rhs s_pressure_relaxation_procedure, & s_finalize_rhs_module - type(vector_field) :: q_cons_qp !< !! This variable contains the WENO-reconstructed values of the cell-average !! conservative variables, which are located in q_cons_vf, at cell-interior !! Gaussian quadrature points (QP). + type(vector_field) :: q_cons_qp !< + !$acc declare create(q_cons_qp) - type(vector_field) :: q_prim_qp !< !! The primitive variables at cell-interior Gaussian quadrature points. These !! are calculated from the conservative variables and gradient magnitude (GM) !! of the volume fractions, q_cons_qp and gm_alpha_qp, respectively. + type(vector_field) :: q_prim_qp !< + !$acc declare create(q_prim_qp) !> @name The first-order spatial derivatives of the primitive variables at cell- !! interior Gaussian quadrature points. These are WENO-reconstructed from @@ -75,22 +77,30 @@ module m_rhs !! of the divergence theorem on the integral-average cell-boundary values !! of the primitive variables, located in qK_prim_n, where K = L or R. !> @{ - type(vector_field) :: dq_prim_dx_qp - type(vector_field) :: dq_prim_dy_qp - type(vector_field) :: dq_prim_dz_qp - !> @} +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp) +!$acc declare link(dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp) +#else + type(vector_field), allocatable, dimension(:) :: dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp +!$acc declare create(dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp) +#endif !> @name The left and right WENO-reconstructed cell-boundary values of the cell- !! average first-order spatial derivatives of the primitive variables. The !! cell-average of the first-order spatial derivatives may be found in the !! variables dq_prim_ds_qp, where s = x, y or z. !> @{ - type(vector_field), allocatable, dimension(:) :: dqL_prim_dx_n - type(vector_field), allocatable, dimension(:) :: dqL_prim_dy_n - type(vector_field), allocatable, dimension(:) :: dqL_prim_dz_n - type(vector_field), allocatable, dimension(:) :: dqR_prim_dx_n - type(vector_field), allocatable, dimension(:) :: dqR_prim_dy_n - type(vector_field), allocatable, dimension(:) :: dqR_prim_dz_n +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n) + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n) +!$acc declare link(dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n) +!$acc declare link(dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n) +#else + type(vector_field), allocatable, dimension(:) :: dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n + type(vector_field), allocatable, dimension(:) :: dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n +!$acc declare create(dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n) +!$acc declare create(dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n) +#endif !> @} type(vector_field) :: gm_alpha_qp !< @@ -98,64 +108,108 @@ module m_rhs !! quadrature points. gm_alpha_qp is calculated from individual first-order !! spatial derivatives located in dq_prim_ds_qp. + !$acc declare create(gm_alpha_qp) + !> @name The left and right WENO-reconstructed cell-boundary values of the cell- !! average gradient magnitude of volume fractions, located in gm_alpha_qp. !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), gm_alphaL_n) + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), gm_alphaR_n) +!$acc declare link(gm_alphaL_n, gm_alphaR_n) +#else type(vector_field), allocatable, dimension(:) :: gm_alphaL_n type(vector_field), allocatable, dimension(:) :: gm_alphaR_n +!$acc declare create(gm_alphaL_n, gm_alphaR_n) +#endif !> @} !> @name The cell-boundary values of the fluxes (src - source, gsrc - geometrical !! source). These are computed by applying the chosen Riemann problem solver !! .on the left and right cell-boundary values of the primitive variables !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), flux_n) + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), flux_src_n) + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), flux_gsrc_n) +!$acc declare link(flux_n, flux_src_n, flux_gsrc_n) +#else type(vector_field), allocatable, dimension(:) :: flux_n type(vector_field), allocatable, dimension(:) :: flux_src_n type(vector_field), allocatable, dimension(:) :: flux_gsrc_n +!$acc declare create(flux_n, flux_src_n, flux_gsrc_n) +#endif !> @} +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), qL_prim, qR_prim) +!$acc declare link(qL_prim, qR_prim) +#else type(vector_field), allocatable, dimension(:) :: qL_prim, qR_prim +!$acc declare create(qL_prim, qR_prim) +#endif type(int_bounds_info) :: iv !< Vector field indical bounds + !$acc declare create(iv) !> @name Indical bounds in the x-, y- and z-directions !> @{ type(int_bounds_info) :: ix, iy, iz - !> @} + !$acc declare create(ix, iy, iz) type(int_bounds_info) :: is1, is2, is3 + !$acc declare create(is1, is2, is3) type(int_bounds_info) :: ixt, iyt, izt - - real(kind(0d0)), allocatable, dimension(:, :, :, :, :) :: bub_mom_src - !$acc declare create(bub_mom_src) + !$acc declare create(ixt, iyt, izt) !> @name Saved fluxes for testing !> @{ type(scalar_field) :: alf_sum !> @} +!$acc declare create(alf_sum) + +#ifdef CRAY_ACC_WAR + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), blkmod1, blkmod2, alpha1, alpha2, Kterm) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf) +!$acc declare link(blkmod1, blkmod2, alpha1, alpha2, Kterm) +!$acc declare link(qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf) +!$acc declare link(dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf) +#else real(kind(0d0)), allocatable, dimension(:, :, :) :: blkmod1, blkmod2, alpha1, alpha2, Kterm real(kind(0d0)), allocatable, dimension(:, :, :, :) :: qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf - +!$acc declare create(blkmod1, blkmod2, alpha1, alpha2, Kterm) +!$acc declare create(qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf) +!$acc declare create(dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf) +#endif + +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), gamma_min, pres_inf) +!$acc declare link(gamma_min, pres_inf) +#else real(kind(0d0)), allocatable, dimension(:) :: gamma_min, pres_inf - !$acc declare create(gamma_min, pres_inf) +!$acc declare create(gamma_min, pres_inf) +#endif +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), Res) +!$acc declare link(Res) +#else real(kind(0d0)), allocatable, dimension(:, :) :: Res - !$acc declare create(Res) - - !$acc declare create(q_cons_qp,q_prim_qp, & - !$acc dq_prim_dx_qp,dq_prim_dy_qp,dq_prim_dz_qp,dqL_prim_dx_n,dqL_prim_dy_n, & - !$acc dqL_prim_dz_n,dqR_prim_dx_n,dqR_prim_dy_n,dqR_prim_dz_n,gm_alpha_qp, & - !$acc gm_alphaL_n,gm_alphaR_n,flux_n,flux_src_n,flux_gsrc_n, & - !$acc qL_prim, qR_prim, iv,ix, iy, iz,is1,is2,is3,alf_sum, & - !$acc blkmod1, blkmod2, alpha1, alpha2, Kterm, qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf, & - !$acc dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf, & - !$acc ixt, iyt, izt) +!$acc declare create(Res) +#endif +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), nbub) +!$acc declare link(nbub) +#else real(kind(0d0)), allocatable, dimension(:, :, :) :: nbub !< Bubble number density - !$acc declare create(nbub) +!$acc declare create(nbub) +#endif contains @@ -174,6 +228,7 @@ contains ix%end = m - ix%beg; iy%end = n - iy%beg; iz%end = p - iz%beg ! ================================================================== + !$acc enter data copyin(ix, iy, iz) !$acc update device(ix, iy, iz) ixt = ix; iyt = iy; izt = iz @@ -193,22 +248,24 @@ contains @:ALLOCATE(q_prim_qp%vf(l)%sf(ix%beg:ix%end, iy%beg:iy%end, iz%beg:iz%end)) end do + @:ACC_SETUP_VFs(q_cons_qp, q_prim_qp) + do l = 1, cont_idx%end - q_prim_qp%vf(l)%sf => & - q_cons_qp%vf(l)%sf + q_prim_qp%vf(l)%sf => q_cons_qp%vf(l)%sf + !$acc enter data copyin(q_prim_qp%vf(l)%sf) !$acc enter data attach(q_prim_qp%vf(l)%sf) end do do l = adv_idx%beg, adv_idx%end - q_prim_qp%vf(l)%sf => & - q_cons_qp%vf(l)%sf + q_prim_qp%vf(l)%sf => q_cons_qp%vf(l)%sf + !$acc enter data copyin(q_prim_qp%vf(l)%sf) !$acc enter data attach(q_prim_qp%vf(l)%sf) end do ! ================================================================== if (qbmm) then - @:ALLOCATE(mom_sp(1:nmomsp), mom_3d(0:2, 0:2, nb)) + @:ALLOCATE_GLOBAL(mom_sp(1:nmomsp), mom_3d(0:2, 0:2, nb)) do i = 0, 2 do j = 0, 2 @@ -217,105 +274,111 @@ contains & ix%beg:ix%end, & & iy%beg:iy%end, & & iz%beg:iz%end)) + @:ACC_SETUP_SFs(mom_3d(i, j, k)) end do end do end do + do i = 1, nmomsp @:ALLOCATE(mom_sp(i)%sf( & & ix%beg:ix%end, & & iy%beg:iy%end, & & iz%beg:iz%end)) + @:ACC_SETUP_SFs(mom_sp(i)) end do end if ! Allocation/Association of qK_cons_n and qK_prim_n ========== - @:ALLOCATE(qL_prim(1:num_dims)) - @:ALLOCATE(qR_prim(1:num_dims)) + @:ALLOCATE_GLOBAL(qL_prim(1:num_dims)) + @:ALLOCATE_GLOBAL(qR_prim(1:num_dims)) do i = 1, num_dims @:ALLOCATE(qL_prim(i)%vf(1:sys_size)) @:ALLOCATE(qR_prim(i)%vf(1:sys_size)) - end do - - if (weno_Re_flux) then - - do i = 1, num_dims - do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(qL_prim(i)%vf(l)%sf(ix%beg:ix%end, iy%beg:iy%end, iz%beg:iz%end)) - @:ALLOCATE(qR_prim(i)%vf(l)%sf(ix%beg:ix%end, iy%beg:iy%end, iz%beg:iz%end)) - end do + do l = mom_idx%beg, mom_idx%end + @:ALLOCATE(qL_prim(i)%vf(l)%sf(ix%beg:ix%end, iy%beg:iy%end, iz%beg:iz%end)) + @:ALLOCATE(qR_prim(i)%vf(l)%sf(ix%beg:ix%end, iy%beg:iy%end, iz%beg:iz%end)) end do - end if + @:ACC_SETUP_VFs(qL_prim(i), qR_prim(i)) + end do if (mpp_lim .and. bubbles) then @:ALLOCATE(alf_sum%sf(ix%beg:ix%end, iy%beg:iy%end, iz%beg:iz%end)) end if ! END: Allocation/Association of qK_cons_n and qK_prim_n ====== - @:ALLOCATE(qL_rsx_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(qL_rsx_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, 1:sys_size)) - @:ALLOCATE(qR_rsx_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(qR_rsx_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, 1:sys_size)) if (n > 0) then - @:ALLOCATE(qL_rsy_vf(iy%beg:iy%end, & + @:ALLOCATE_GLOBAL(qL_rsy_vf(iy%beg:iy%end, & ix%beg:ix%end, iz%beg:iz%end, 1:sys_size)) - @:ALLOCATE(qR_rsy_vf(iy%beg:iy%end, & + @:ALLOCATE_GLOBAL(qR_rsy_vf(iy%beg:iy%end, & ix%beg:ix%end, iz%beg:iz%end, 1:sys_size)) else - @:ALLOCATE(qL_rsy_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(qL_rsy_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, 1:sys_size)) - @:ALLOCATE(qR_rsy_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(qR_rsy_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, 1:sys_size)) end if if (p > 0) then - @:ALLOCATE(qL_rsz_vf(iz%beg:iz%end, & + @:ALLOCATE_GLOBAL(qL_rsz_vf(iz%beg:iz%end, & iy%beg:iy%end, ix%beg:ix%end, 1:sys_size)) - @:ALLOCATE(qR_rsz_vf(iz%beg:iz%end, & + @:ALLOCATE_GLOBAL(qR_rsz_vf(iz%beg:iz%end, & iy%beg:iy%end, ix%beg:ix%end, 1:sys_size)) else - @:ALLOCATE(qL_rsz_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(qL_rsz_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, 1:sys_size)) - @:ALLOCATE(qR_rsz_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(qR_rsz_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, 1:sys_size)) end if ! Allocation of dq_prim_ds_qp ====================================== - if (any(Re_size > 0)) then - - @:ALLOCATE(dq_prim_dx_qp%vf(1:sys_size)) - @:ALLOCATE(dq_prim_dy_qp%vf(1:sys_size)) - @:ALLOCATE(dq_prim_dz_qp%vf(1:sys_size)) + @:ALLOCATE_GLOBAL(dq_prim_dx_qp(1:1)) + @:ALLOCATE_GLOBAL(dq_prim_dy_qp(1:1)) + @:ALLOCATE_GLOBAL(dq_prim_dz_qp(1:1)) + if (any(Re_size > 0)) then + @:ALLOCATE(dq_prim_dx_qp(1)%vf(1:sys_size)) + @:ALLOCATE(dq_prim_dy_qp(1)%vf(1:sys_size)) + @:ALLOCATE(dq_prim_dz_qp(1)%vf(1:sys_size)) if (any(Re_size > 0)) then do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(dq_prim_dx_qp%vf(l)%sf( & + @:ALLOCATE(dq_prim_dx_qp(1)%vf(l)%sf( & & ix%beg:ix%end, & & iy%beg:iy%end, & & iz%beg:iz%end)) end do + @:ACC_SETUP_VFs(dq_prim_dx_qp(1)) + if (n > 0) then do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(dq_prim_dy_qp%vf(l)%sf( & + @:ALLOCATE(dq_prim_dy_qp(1)%vf(l)%sf( & & ix%beg:ix%end, & & iy%beg:iy%end, & & iz%beg:iz%end)) end do + @:ACC_SETUP_VFs(dq_prim_dy_qp(1)) + if (p > 0) then + do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(dq_prim_dz_qp%vf(l)%sf( & + @:ALLOCATE(dq_prim_dz_qp(1)%vf(l)%sf( & & ix%beg:ix%end, & & iy%beg:iy%end, & & iz%beg:iz%end)) end do + @:ACC_SETUP_VFs(dq_prim_dz_qp(1)) end if end if @@ -326,12 +389,12 @@ contains ! END: Allocation of dq_prim_ds_qp ================================= ! Allocation/Association of dqK_prim_ds_n ======================= - @:ALLOCATE(dqL_prim_dx_n(1:num_dims)) - @:ALLOCATE(dqL_prim_dy_n(1:num_dims)) - @:ALLOCATE(dqL_prim_dz_n(1:num_dims)) - @:ALLOCATE(dqR_prim_dx_n(1:num_dims)) - @:ALLOCATE(dqR_prim_dy_n(1:num_dims)) - @:ALLOCATE(dqR_prim_dz_n(1:num_dims)) + @:ALLOCATE_GLOBAL(dqL_prim_dx_n(1:num_dims)) + @:ALLOCATE_GLOBAL(dqL_prim_dy_n(1:num_dims)) + @:ALLOCATE_GLOBAL(dqL_prim_dz_n(1:num_dims)) + @:ALLOCATE_GLOBAL(dqR_prim_dx_n(1:num_dims)) + @:ALLOCATE_GLOBAL(dqR_prim_dy_n(1:num_dims)) + @:ALLOCATE_GLOBAL(dqR_prim_dz_n(1:num_dims)) if (any(Re_size > 0)) then do i = 1, num_dims @@ -383,40 +446,42 @@ contains end if + @:ACC_SETUP_VFs(dqL_prim_dx_n(i), dqL_prim_dy_n(i), dqL_prim_dz_n(i)) + @:ACC_SETUP_VFs(dqR_prim_dx_n(i), dqR_prim_dy_n(i), dqR_prim_dz_n(i)) end do end if ! END: Allocation/Association of d K_prim_ds_n ================== if (any(Re_size > 0)) then if (weno_Re_flux) then - @:ALLOCATE(dqL_rsx_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(dqL_rsx_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, mom_idx%beg:mom_idx%end)) - @:ALLOCATE(dqR_rsx_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(dqR_rsx_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, mom_idx%beg:mom_idx%end)) if (n > 0) then - @:ALLOCATE(dqL_rsy_vf(iy%beg:iy%end, & + @:ALLOCATE_GLOBAL(dqL_rsy_vf(iy%beg:iy%end, & ix%beg:ix%end, iz%beg:iz%end, mom_idx%beg:mom_idx%end)) - @:ALLOCATE(dqR_rsy_vf(iy%beg:iy%end, & + @:ALLOCATE_GLOBAL(dqR_rsy_vf(iy%beg:iy%end, & ix%beg:ix%end, iz%beg:iz%end, mom_idx%beg:mom_idx%end)) else - @:ALLOCATE(dqL_rsy_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(dqL_rsy_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, mom_idx%beg:mom_idx%end)) - @:ALLOCATE(dqR_rsy_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(dqR_rsy_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, mom_idx%beg:mom_idx%end)) end if if (p > 0) then - @:ALLOCATE(dqL_rsz_vf(iz%beg:iz%end, & + @:ALLOCATE_GLOBAL(dqL_rsz_vf(iz%beg:iz%end, & iy%beg:iy%end, ix%beg:ix%end, mom_idx%beg:mom_idx%end)) - @:ALLOCATE(dqR_rsz_vf(iz%beg:iz%end, & + @:ALLOCATE_GLOBAL(dqR_rsz_vf(iz%beg:iz%end, & iy%beg:iy%end, ix%beg:ix%end, mom_idx%beg:mom_idx%end)) else - @:ALLOCATE(dqL_rsz_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(dqL_rsz_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, mom_idx%beg:mom_idx%end)) - @:ALLOCATE(dqR_rsz_vf(ix%beg:ix%end, & + @:ALLOCATE_GLOBAL(dqR_rsz_vf(ix%beg:ix%end, & iy%beg:iy%end, iz%beg:iz%end, mom_idx%beg:mom_idx%end)) end if @@ -426,20 +491,14 @@ contains ! ================================================================== ! Allocation of gm_alphaK_n ===================================== - @:ALLOCATE(gm_alphaL_n(1:num_dims)) - @:ALLOCATE(gm_alphaR_n(1:num_dims)) + @:ALLOCATE_GLOBAL(gm_alphaL_n(1:num_dims)) + @:ALLOCATE_GLOBAL(gm_alphaR_n(1:num_dims)) ! ================================================================== - if (bubbles) then - if (qbmm) then - @:ALLOCATE(bub_mom_src(1:nmom, 0:m, 0:n, 0:p, 1:nb)) - end if - end if - ! Allocation/Association of flux_n, flux_src_n, and flux_gsrc_n === - @:ALLOCATE(flux_n(1:num_dims)) - @:ALLOCATE(flux_src_n(1:num_dims)) - @:ALLOCATE(flux_gsrc_n(1:num_dims)) + @:ALLOCATE_GLOBAL(flux_n(1:num_dims)) + @:ALLOCATE_GLOBAL(flux_src_n(1:num_dims)) + @:ALLOCATE_GLOBAL(flux_gsrc_n(1:num_dims)) do i = 1, num_dims @@ -448,7 +507,6 @@ contains @:ALLOCATE(flux_gsrc_n(i)%vf(1:sys_size)) if (i == 1) then - do l = 1, sys_size @:ALLOCATE(flux_n(i)%vf(l)%sf( & & ix%beg:ix%end, & @@ -481,14 +539,7 @@ contains & iy%beg:iy%end, & & iz%beg:iz%end)) end do - else - do l = adv_idx%beg + 1, adv_idx%end - flux_src_n(i)%vf(l)%sf => & - flux_src_n(i)%vf(adv_idx%beg)%sf - !$acc enter data attach(flux_src_n(i)%vf(l)%sf(ix%beg:ix%end,iy%beg:iy%end,iz%beg:iz%end)) - end do end if - else do l = 1, sys_size @:ALLOCATE(flux_gsrc_n(i)%vf(l)%sf( & @@ -496,25 +547,35 @@ contains iy%beg:iy%end, & iz%beg:iz%end)) end do + end if + + @:ACC_SETUP_VFs(flux_n(i), flux_src_n(i), flux_gsrc_n(i)) + + if (i == 1) then + if (riemann_solver /= 1) then + do l = adv_idx%beg + 1, adv_idx%end + flux_src_n(i)%vf(l)%sf => flux_src_n(i)%vf(adv_idx%beg)%sf + + !$acc enter data attach(flux_src_n(i)%vf(l)%sf) + end do + end if + else do l = 1, sys_size - flux_n(i)%vf(l)%sf => & - flux_n(1)%vf(l)%sf - flux_src_n(i)%vf(l)%sf => & - flux_src_n(1)%vf(l)%sf + flux_n(i)%vf(l)%sf => flux_n(1)%vf(l)%sf + flux_src_n(i)%vf(l)%sf => flux_src_n(1)%vf(l)%sf !$acc enter data attach(flux_n(i)%vf(l)%sf,flux_src_n(i)%vf(l)%sf) end do - end if end do ! END: Allocation/Association of flux_n, flux_src_n, and flux_gsrc_n === if (alt_soundspeed) then - @:ALLOCATE(blkmod1(0:m, 0:n, 0:p), blkmod2(0:m, 0:n, 0:p), alpha1(0:m, 0:n, 0:p), alpha2(0:m, 0:n, 0:p), Kterm(0:m, 0:n, 0:p)) + @:ALLOCATE_GLOBAL(blkmod1(0:m, 0:n, 0:p), blkmod2(0:m, 0:n, 0:p), alpha1(0:m, 0:n, 0:p), alpha2(0:m, 0:n, 0:p), Kterm(0:m, 0:n, 0:p)) end if - @:ALLOCATE(gamma_min(1:num_fluids), pres_inf(1:num_fluids)) + @:ALLOCATE_GLOBAL(gamma_min(1:num_fluids), pres_inf(1:num_fluids)) do i = 1, num_fluids gamma_min(i) = 1d0/fluid_pp(i)%gamma + 1d0 @@ -523,7 +584,7 @@ contains !$acc update device(gamma_min, pres_inf) if (any(Re_size > 0)) then - @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) + @:ALLOCATE_GLOBAL(Res(1:2, 1:maxval(Re_size))) end if if (any(Re_size > 0)) then @@ -570,7 +631,7 @@ contains end do if (bubbles) then - @:ALLOCATE(nbub(0:m, 0:n, 0:p)) + @:ALLOCATE_GLOBAL(nbub(0:m, 0:n, 0:p)) end if end subroutine s_initialize_rhs_module ! ------------------------------- @@ -738,21 +799,21 @@ contains iv%beg = mom_idx%beg; iv%end = mom_idx%end if (weno_Re_flux) then call s_reconstruct_cell_boundary_values_visc_deriv( & - dq_prim_dx_qp%vf(iv%beg:iv%end), & + dq_prim_dx_qp(1)%vf(iv%beg:iv%end), & dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, & dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf, & id, dqL_prim_dx_n(id)%vf(iv%beg:iv%end), dqR_prim_dx_n(id)%vf(iv%beg:iv%end), & ix, iy, iz) if (n > 0) then call s_reconstruct_cell_boundary_values_visc_deriv( & - dq_prim_dy_qp%vf(iv%beg:iv%end), & + dq_prim_dy_qp(1)%vf(iv%beg:iv%end), & dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, & dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf, & id, dqL_prim_dy_n(id)%vf(iv%beg:iv%end), dqR_prim_dy_n(id)%vf(iv%beg:iv%end), & ix, iy, iz) if (p > 0) then call s_reconstruct_cell_boundary_values_visc_deriv( & - dq_prim_dz_qp%vf(iv%beg:iv%end), & + dq_prim_dz_qp(1)%vf(iv%beg:iv%end), & dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, & dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf, & id, dqL_prim_dz_n(id)%vf(iv%beg:iv%end), dqR_prim_dz_n(id)%vf(iv%beg:iv%end), & @@ -773,7 +834,6 @@ contains end if ix%end = m; iy%end = n; iz%end = p ! =============================================================== - call nvtxStartRange("RHS-Riemann") ! Computing Riemann Solver Flux and Source Flux ================= @@ -1339,7 +1399,6 @@ contains end if end if end if - end if ! id loop call nvtxEndRange @@ -1367,9 +1426,9 @@ contains q_prim_qp%vf, & rhs_vf, & flux_src_n(id)%vf, & - dq_prim_dx_qp%vf, & - dq_prim_dy_qp%vf, & - dq_prim_dz_qp%vf, & + dq_prim_dx_qp(1)%vf, & + dq_prim_dy_qp(1)%vf, & + dq_prim_dz_qp(1)%vf, & ixt, iyt, izt) call nvtxEndRange @@ -1392,7 +1451,6 @@ contains rhs_mv) call nvtxEndRange ! END: Additional physics and source terms ========================= - end do if (ib) then @@ -1439,6 +1497,7 @@ contains if (n > 0) iy%beg = -buff_size; if (p > 0) iz%beg = -buff_size; ix%end = m - ix%beg; iy%end = n - iy%beg; iz%end = p - iz%beg + !$acc update device(ix, iy, iz) !$acc parallel loop collapse(4) gang vector default(present) @@ -1817,25 +1876,25 @@ contains end do @:DEALLOCATE(q_cons_qp%vf, q_prim_qp%vf) - @:DEALLOCATE(qL_rsx_vf, qR_rsx_vf) + @:DEALLOCATE_GLOBAL(qL_rsx_vf, qR_rsx_vf) if (n > 0) then - @:DEALLOCATE(qL_rsy_vf, qR_rsy_vf) + @:DEALLOCATE_GLOBAL(qL_rsy_vf, qR_rsy_vf) end if if (p > 0) then - @:DEALLOCATE(qL_rsz_vf, qR_rsz_vf) + @:DEALLOCATE_GLOBAL(qL_rsz_vf, qR_rsz_vf) end if if (any(Re_size > 0) .and. weno_Re_flux) then - @:DEALLOCATE(dqL_rsx_vf, dqR_rsx_vf) + @:DEALLOCATE_GLOBAL(dqL_rsx_vf, dqR_rsx_vf) if (n > 0) then - @:DEALLOCATE(dqL_rsy_vf, dqR_rsy_vf) + @:DEALLOCATE_GLOBAL(dqL_rsy_vf, dqR_rsy_vf) end if if (p > 0) then - @:DEALLOCATE(dqL_rsz_vf, dqR_rsz_vf) + @:DEALLOCATE_GLOBAL(dqL_rsz_vf, dqR_rsz_vf) end if end if @@ -1846,26 +1905,26 @@ contains if (any(Re_size > 0)) then do l = mom_idx%beg, mom_idx%end - @:DEALLOCATE(dq_prim_dx_qp%vf(l)%sf) + @:DEALLOCATE(dq_prim_dx_qp(1)%vf(l)%sf) end do if (n > 0) then do l = mom_idx%beg, mom_idx%end - @:DEALLOCATE(dq_prim_dy_qp%vf(l)%sf) + @:DEALLOCATE(dq_prim_dy_qp(1)%vf(l)%sf) end do if (p > 0) then do l = mom_idx%beg, mom_idx%end - @:DEALLOCATE(dq_prim_dz_qp%vf(l)%sf) + @:DEALLOCATE(dq_prim_dz_qp(1)%vf(l)%sf) end do end if end if - @:DEALLOCATE(dq_prim_dx_qp%vf) - @:DEALLOCATE(dq_prim_dy_qp%vf) - @:DEALLOCATE(dq_prim_dz_qp%vf) + @:DEALLOCATE(dq_prim_dx_qp(1)%vf) + @:DEALLOCATE(dq_prim_dy_qp(1)%vf) + @:DEALLOCATE(dq_prim_dz_qp(1)%vf) end if if (any(Re_size > 0)) then @@ -1902,8 +1961,8 @@ contains end do end if - @:DEALLOCATE(dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n) - @:DEALLOCATE(dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n) + @:DEALLOCATE_GLOBAL(dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n) + @:DEALLOCATE_GLOBAL(dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n) do i = num_dims, 1, -1 if (i /= 1) then @@ -1940,7 +1999,7 @@ contains @:DEALLOCATE(flux_n(i)%vf, flux_src_n(i)%vf, flux_gsrc_n(i)%vf) end do - @:DEALLOCATE(flux_n, flux_src_n, flux_gsrc_n) + @:DEALLOCATE_GLOBAL(flux_n, flux_src_n, flux_gsrc_n) s_riemann_solver => null() s_convert_to_mixture_variables => null() diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp index 1d3fc7c88..34268775b 100644 --- a/src/simulation/m_riemann_solvers.fpp +++ b/src/simulation/m_riemann_solvers.fpp @@ -17,6 +17,8 @@ !! 1) Harten-Lax-van Leer (HLL) !! 2) Harten-Lax-van Leer-Contact (HLLC) !! 3) Exact + +#:include 'macros.fpp' #:include 'inline_riemann.fpp' #:include 'inline_conversions.fpp' @@ -154,39 +156,75 @@ module m_riemann_solvers !! source terms, by using the left and right states given in qK_prim_rs_vf, !! dqK_prim_ds_vf where ds = dx, dy or dz. !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), flux_rsx_vf, flux_src_rsx_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), flux_rsy_vf, flux_src_rsy_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), flux_rsz_vf, flux_src_rsz_vf) +!$acc declare link( flux_rsx_vf, flux_src_rsx_vf, flux_rsy_vf, & +!$acc flux_src_rsy_vf, flux_rsz_vf, flux_src_rsz_vf ) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :) :: flux_rsx_vf, flux_src_rsx_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: flux_rsy_vf, flux_src_rsy_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: flux_rsz_vf, flux_src_rsz_vf +!$acc declare create( flux_rsx_vf, flux_src_rsx_vf, flux_rsy_vf, & +!$acc flux_src_rsy_vf, flux_rsz_vf, flux_src_rsz_vf ) +#endif !> @} - !$acc declare create( flux_rsx_vf, flux_src_rsx_vf, flux_rsy_vf, & - !$acc flux_src_rsy_vf, flux_rsz_vf, flux_src_rsz_vf ) !> The cell-boundary values of the geometrical source flux that are computed !! through the chosen Riemann problem solver by using the left and right !! states given in qK_prim_rs_vf. Currently 2D axisymmetric for inviscid only. !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), flux_gsrc_rsx_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), flux_gsrc_rsy_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), flux_gsrc_rsz_vf) +!$acc declare link( flux_gsrc_rsx_vf, flux_gsrc_rsy_vf, flux_gsrc_rsz_vf ) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsx_vf !< real(kind(0d0)), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsy_vf !< real(kind(0d0)), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsz_vf !< +!$acc declare create( flux_gsrc_rsx_vf, flux_gsrc_rsy_vf, flux_gsrc_rsz_vf ) +#endif !> @} - !$acc declare create( flux_gsrc_rsx_vf, flux_gsrc_rsy_vf, flux_gsrc_rsz_vf ) ! The cell-boundary values of the velocity. vel_src_rs_vf is determined as ! part of Riemann problem solution and is used to evaluate the source flux. +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), vel_src_rsx_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), vel_src_rsy_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), vel_src_rsz_vf) +!$acc declare link(vel_src_rsx_vf, vel_src_rsy_vf, vel_src_rsz_vf) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :) :: vel_src_rsx_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: vel_src_rsy_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: vel_src_rsz_vf - !$acc declare create(vel_src_rsx_vf, vel_src_rsy_vf, vel_src_rsz_vf) - +!$acc declare create(vel_src_rsx_vf, vel_src_rsy_vf, vel_src_rsz_vf) +#endif + +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), mom_sp_rsx_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), mom_sp_rsy_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), mom_sp_rsz_vf) +!$acc declare link(mom_sp_rsx_vf, mom_sp_rsy_vf, mom_sp_rsz_vf) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :) :: mom_sp_rsx_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: mom_sp_rsy_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: mom_sp_rsz_vf - !$acc declare create(mom_sp_rsx_vf, mom_sp_rsy_vf, mom_sp_rsz_vf) - +!$acc declare create(mom_sp_rsx_vf, mom_sp_rsy_vf, mom_sp_rsz_vf) +#endif + +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), Re_avg_rsx_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), Re_avg_rsy_vf) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), Re_avg_rsz_vf) +!$acc declare link(Re_avg_rsx_vf, Re_avg_rsy_vf, Re_avg_rsz_vf) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :) :: Re_avg_rsx_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: Re_avg_rsy_vf real(kind(0d0)), allocatable, dimension(:, :, :, :) :: Re_avg_rsz_vf - !$acc declare create(Re_avg_rsx_vf, Re_avg_rsy_vf, Re_avg_rsz_vf) +!$acc declare link(Re_avg_rsx_vf, Re_avg_rsy_vf, Re_avg_rsz_vf) +#endif procedure(s_abstract_riemann_solver), & pointer :: s_riemann_solver => null() !< @@ -203,16 +241,29 @@ module m_riemann_solvers type(int_bounds_info) :: is1, is2, is3 type(int_bounds_info) :: isx, isy, isz !> @} - !$acc declare create(is1, is2, is3, isx, isy, isz) +!$acc declare create(is1, is2, is3, isx, isy, isz) + +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:), Gs) +!$acc declare link(Gs) +#else real(kind(0d0)), allocatable, dimension(:) :: Gs - !$acc declare create(Gs) +!$acc declare create(Gs) +#endif +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), Res) +!$acc declare link(Res) +#else real(kind(0d0)), allocatable, dimension(:, :) :: Res - !$acc declare create(Res) +!$acc declare create(Res) +#endif contains + @:s_compute_speed_of_sound() + subroutine s_hll_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, & ! ------- dqL_prim_dy_vf, & dqL_prim_dz_vf, & @@ -1881,7 +1932,7 @@ contains !$acc end parallel loop else !$acc parallel loop collapse(3) gang vector default(present) private(vel_L, vel_R, Re_L, Re_R, & - !$acc rho_avg, h_avg, gamma_avg, alpha_L, alpha_R, s_L, s_R, s_S, vel_avg_rms) + !$acc rho_avg, h_avg, gamma_avg, alpha_L, alpha_R, s_L, s_R, s_S, vel_avg_rms) copyin(is1,is2,is3) do l = is3%beg, is3%end do k = is2%beg, is2%end do j = is1%beg, is1%end @@ -2185,6 +2236,7 @@ contains end if #:endif + end do end do end do @@ -2235,7 +2287,7 @@ contains ! the Riemann problem solution integer :: i, j - allocate (Gs(1:num_fluids)) + @:ALLOCATE_GLOBAL(Gs(1:num_fluids)) do i = 1, num_fluids Gs(i) = fluid_pp(i)%G @@ -2243,7 +2295,7 @@ contains !$acc update device(Gs) if (any(Re_size > 0)) then - allocate (Res(1:2, 1:maxval(Re_size))) + @:ALLOCATE_GLOBAL(Res(1:2, 1:maxval(Re_size))) end if if (any(Re_size > 0)) then @@ -2255,6 +2307,8 @@ contains !$acc update device(Res, Re_idx, Re_size) end if + !$acc enter data copyin(is1, is2, is3, isx, isy, isz) + ! Associating procedural pointer to the subroutine that will be ! utilized to calculate the solution of a given Riemann problem if (riemann_solver == 1) then @@ -2274,26 +2328,26 @@ contains is1%beg = -1; is2%beg = 0; is3%beg = 0 is1%end = m; is2%end = n; is3%end = p - allocate (flux_rsx_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:sys_size)) - allocate (flux_gsrc_rsx_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:sys_size)) - allocate (flux_src_rsx_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, advxb:sys_size)) - allocate (vel_src_rsx_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:num_dims)) + @:ALLOCATE_GLOBAL(flux_rsx_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(flux_gsrc_rsx_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(flux_src_rsx_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, advxb:sys_size)) + @:ALLOCATE_GLOBAL(vel_src_rsx_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:num_dims)) if (qbmm) then - allocate (mom_sp_rsx_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4)) + @:ALLOCATE_GLOBAL(mom_sp_rsx_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4)) end if if (any(Re_size > 0)) then - allocate (Re_avg_rsx_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:2)) + @:ALLOCATE_GLOBAL(Re_avg_rsx_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:2)) end if if (n == 0) return @@ -2301,27 +2355,27 @@ contains is1%beg = -1; is2%beg = 0; is3%beg = 0 is1%end = n; is2%end = m; is3%end = p - allocate (flux_rsy_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:sys_size)) - allocate (flux_gsrc_rsy_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:sys_size)) - allocate (flux_src_rsy_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, advxb:sys_size)) - allocate (vel_src_rsy_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:num_dims)) + @:ALLOCATE_GLOBAL(flux_rsy_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(flux_gsrc_rsy_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(flux_src_rsy_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, advxb:sys_size)) + @:ALLOCATE_GLOBAL(vel_src_rsy_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:num_dims)) if (qbmm) then - allocate (mom_sp_rsy_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4)) + @:ALLOCATE_GLOBAL(mom_sp_rsy_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4)) end if if (any(Re_size > 0)) then - allocate (Re_avg_rsy_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:2)) + @:ALLOCATE_GLOBAL(Re_avg_rsy_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:2)) end if if (p == 0) return @@ -2329,27 +2383,27 @@ contains is1%beg = -1; is2%beg = 0; is3%beg = 0 is1%end = p; is2%end = n; is3%end = m - allocate (flux_rsz_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:sys_size)) - allocate (flux_gsrc_rsz_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:sys_size)) - allocate (flux_src_rsz_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, advxb:sys_size)) - allocate (vel_src_rsz_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:num_dims)) + @:ALLOCATE_GLOBAL(flux_rsz_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(flux_gsrc_rsz_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(flux_src_rsz_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, advxb:sys_size)) + @:ALLOCATE_GLOBAL(vel_src_rsz_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:num_dims)) if (qbmm) then - allocate (mom_sp_rsz_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4)) + @:ALLOCATE_GLOBAL(mom_sp_rsz_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4)) end if if (any(Re_size > 0)) then - allocate (Re_avg_rsz_vf(is1%beg:is1%end, & - is2%beg:is2%end, & - is3%beg:is3%end, 1:2)) + @:ALLOCATE_GLOBAL(Re_avg_rsz_vf(is1%beg:is1%end, & + is2%beg:is2%end, & + is3%beg:is3%end, 1:2)) end if end subroutine s_initialize_riemann_solvers_module ! ------------------- @@ -2416,6 +2470,8 @@ contains dir_idx = (/3, 1, 2/); dir_flg = (/0d0, 0d0, 1d0/) end if + !$acc update device(is1, is2, is3) + if (hypoelasticity) then if (norm_dir == 1) then dir_idx_tau = (/1, 2, 4/) @@ -2427,8 +2483,8 @@ contains end if isx = ix; isy = iy; isz = iz - - !$acc update device(is1, is2, is3, dir_idx, dir_flg, isx, isy, isz, dir_idx_tau) + !$acc update device(isx, isy, isz) ! for stuff in the same module + !$acc update device(dir_idx, dir_flg, dir_idx_tau) ! for stuff in different modules ! Population of Buffers in x-direction ============================= if (norm_dir == 1) then @@ -3898,8 +3954,6 @@ contains end subroutine s_compute_cartesian_viscous_source_flux ! ------------------------- - @:s_compute_speed_of_sound() - !> Deallocation and/or disassociation procedures that are !! needed to finalize the selected Riemann problem solver !! @param flux_vf Intercell fluxes @@ -4089,40 +4143,40 @@ contains ! s_convert_to_mixture_variables => null() if (Re_size(1) > 0) then - deallocate (Re_avg_rsx_vf) + @:DEALLOCATE_GLOBAL(Re_avg_rsx_vf) end if - deallocate (vel_src_rsx_vf) - deallocate (flux_rsx_vf) - deallocate (flux_src_rsx_vf) - deallocate (flux_gsrc_rsx_vf) + @:DEALLOCATE_GLOBAL(vel_src_rsx_vf) + @:DEALLOCATE_GLOBAL(flux_rsx_vf) + @:DEALLOCATE_GLOBAL(flux_src_rsx_vf) + @:DEALLOCATE_GLOBAL(flux_gsrc_rsx_vf) if (qbmm) then - deallocate (mom_sp_rsx_vf) + @:DEALLOCATE_GLOBAL(mom_sp_rsx_vf) end if if (n == 0) return if (Re_size(1) > 0) then - deallocate (Re_avg_rsy_vf) + @:DEALLOCATE_GLOBAL(Re_avg_rsy_vf) end if - deallocate (vel_src_rsy_vf) - deallocate (flux_rsy_vf) - deallocate (flux_src_rsy_vf) - deallocate (flux_gsrc_rsy_vf) + @:DEALLOCATE_GLOBAL(vel_src_rsy_vf) + @:DEALLOCATE_GLOBAL(flux_rsy_vf) + @:DEALLOCATE_GLOBAL(flux_src_rsy_vf) + @:DEALLOCATE_GLOBAL(flux_gsrc_rsy_vf) if (qbmm) then - deallocate (mom_sp_rsy_vf) + @:DEALLOCATE_GLOBAL(mom_sp_rsy_vf) end if if (p == 0) return if (Re_size(1) > 0) then - deallocate (Re_avg_rsz_vf) + @:DEALLOCATE_GLOBAL(Re_avg_rsz_vf) end if - deallocate (vel_src_rsz_vf) - deallocate (flux_rsz_vf) - deallocate (flux_src_rsz_vf) - deallocate (flux_gsrc_rsz_vf) + @:DEALLOCATE_GLOBAL(vel_src_rsz_vf) + @:DEALLOCATE_GLOBAL(flux_rsz_vf) + @:DEALLOCATE_GLOBAL(flux_src_rsz_vf) + @:DEALLOCATE_GLOBAL(flux_gsrc_rsz_vf) if (qbmm) then - deallocate (mom_sp_rsz_vf) + @:DEALLOCATE_GLOBAL(mom_sp_rsz_vf) end if end subroutine s_finalize_riemann_solvers_module ! --------------------- diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 11ecaeee9..ecf633a83 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -44,7 +44,7 @@ module m_start_up use m_qbmm !< Quadrature MOM - use m_derived_variables !< Procedures used to compute quantites derived + use m_derived_variables !< Procedures used to compute quantities derived !! from the conservative and primitive variables use m_hypoelastic @@ -121,6 +121,8 @@ contains integer :: iostatus !! Integer to check iostat of file read + CHARACTER(len=511) :: CRAY_ACC_MODULE + character(len=1000) :: line ! Namelist of the global parameters which may be specified by user @@ -148,8 +150,9 @@ contains polytropic, thermal, & integral, integral_wrt, num_integrals, & polydisperse, poly_sigma, qbmm, & - R0_type, file_per_process, relax, relax_model, & - palpha_eps, ptgalpha_eps + relax, relax_model, & + palpha_eps, ptgalpha_eps, & + R0_type, file_per_process ! Checking that an input file has been provided by the user. If it ! has, then the input file is read in, otherwise, simulation exits. @@ -181,6 +184,16 @@ contains call s_mpi_abort(trim(file_path)//' is missing. Exiting ...') end if +#ifdef _CRAYFTN +#ifdef MFC_OpenACC + call get_environment_variable("CRAY_ACC_MODULE", CRAY_ACC_MODULE) + + if (CRAY_ACC_MODULE == "") then + call s_mpi_abort("CRAY_ACC_MODULE is not set. Exiting...") + end if +#endif +#endif + end subroutine s_read_input_file ! ------------------------------------- !> The goal of this procedure is to verify that each of the @@ -468,7 +481,7 @@ contains else call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting...') end if - + ! Assigning local cell boundary locations x_cb(-1:m) = x_cb_glb((start_idx(1) - 1):(start_idx(1) + m)) ! Computing the cell width distribution @@ -1081,9 +1094,7 @@ contains elseif (time_stepper == 3) then call s_3rd_order_tvd_rk(t_step, time_avg) end if - - if (relax) call s_relaxation_solver(q_cons_ts(1)%vf) - + if (relax) call s_infinite_relaxation_k(q_cons_ts(1)%vf) ! Time-stepping loop controls if ((mytime + dt) >= finaltime) dt = finaltime - mytime t_step = t_step + 1 @@ -1234,7 +1245,7 @@ contains call acc_present_dump() #endif - if (hypoelasticity) call s_initialize_hypoelastic_module() + if (hypoelasticity) call s_initialize_hypoelastic_module() if (relax) call s_initialize_phasechange_module() call s_initialize_data_output_module() call s_initialize_derived_variables_module() @@ -1350,30 +1361,25 @@ contains subroutine s_initialize_gpu_vars() integer :: i !Update GPU DATA - !$acc update device(dt, dx, dy, dz, x_cc, y_cc, z_cc, x_cb, y_cb, z_cb) - !$acc update device(sys_size, buff_size) - !$acc update device(m, n, p) - !$acc update device(momxb, momxe, bubxb, bubxe, advxb, advxe, contxb, contxe, strxb, strxe) do i = 1, sys_size !$acc update device(q_cons_ts(1)%vf(i)%sf) end do if (qbmm .and. .not. polytropic) then !$acc update device(pb_ts(1)%sf, mv_ts(1)%sf) end if - !$acc update device(dt, sys_size, pref, rhoref, gamma_idx, pi_inf_idx, E_idx, alf_idx, stress_idx, mpp_lim, bubbles, hypoelasticity, alt_soundspeed, avg_state, num_fluids, model_eqns, num_dims, mixture_err, nb, weight, grid_geometry, cyl_coord, mapped_weno, mp_weno, weno_eps) !$acc update device(nb, R0ref, Ca, Web, Re_inv, weight, R0, V0, bubbles, polytropic, polydisperse, qbmm, R0_type, ptil, bubble_model, thermal, poly_sigma) !$acc update device(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN , mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam) - !$acc update device(monopole, num_mono) - + !$acc update device(dx, dy, dz, x_cb, x_cc, y_cb, y_cc, z_cb, z_cc) !$acc update device(bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3) !$acc update device(bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3) !$acc update device(bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3) - !$acc update device(relax) + !$acc update device(relax, relax_model) if (relax) then !$acc update device(palpha_eps, ptgalpha_eps) end if + end subroutine s_initialize_gpu_vars subroutine s_finalize_modules() @@ -1393,7 +1399,6 @@ contains call s_finalize_mpi_proxy_module() call s_finalize_global_parameters_module() if (relax) call s_finalize_relaxation_solver_module() - if (any(Re_size > 0)) then call s_finalize_viscous_module() end if diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index d482d0ab5..7d820a026 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -34,6 +34,28 @@ module m_time_steppers implicit none +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), q_cons_ts) + !! Cell-average conservative variables at each time-stage (TS) + + @:CRAY_DECLARE_GLOBAL(type(scalar_field), dimension(:), q_prim_vf) + !! Cell-average primitive variables at the current time-stage + + @:CRAY_DECLARE_GLOBAL(type(scalar_field), dimension(:), rhs_vf) + !! Cell-average RHS variables at the current time-stage + + @:CRAY_DECLARE_GLOBAL(type(vector_field), dimension(:), q_prim_ts) + !! Cell-average primitive variables at consecutive TIMESTEPS + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :, :), rhs_pb) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :, :), rhs_mv) + + integer, private :: num_ts !< + !! Number of time stages in the time-stepping scheme + +!$acc declare link(q_cons_ts,q_prim_vf,rhs_vf,q_prim_ts, rhs_mv, rhs_pb) +#else type(vector_field), allocatable, dimension(:) :: q_cons_ts !< !! Cell-average conservative variables at each time-stage (TS) @@ -53,7 +75,8 @@ module m_time_steppers integer, private :: num_ts !< !! Number of time stages in the time-stepping scheme - !$acc declare create(q_cons_ts,q_prim_vf,rhs_vf,q_prim_ts, rhs_mv, rhs_pb) +!$acc declare create(q_cons_ts,q_prim_vf,rhs_vf,q_prim_ts, rhs_mv, rhs_pb) +#endif contains @@ -91,7 +114,7 @@ contains end if ! Allocating the cell-average conservative variables - @:ALLOCATE(q_cons_ts(1:num_ts)) + @:ALLOCATE_GLOBAL(q_cons_ts(1:num_ts)) do i = 1, num_ts @:ALLOCATE(q_cons_ts(i)%vf(1:sys_size)) @@ -103,11 +126,12 @@ contains iy_t%beg:iy_t%end, & iz_t%beg:iz_t%end)) end do + @:ACC_SETUP_VFs(q_cons_ts(i)) end do ! Allocating the cell-average primitive ts variables if (probe_wrt) then - @:ALLOCATE(q_prim_ts(0:3)) + @:ALLOCATE_GLOBAL(q_prim_ts(0:3)) do i = 0, 3 @:ALLOCATE(q_prim_ts(i)%vf(1:sys_size)) @@ -120,15 +144,20 @@ contains iz_t%beg:iz_t%end)) end do end do + + do i = 0, 3 + @:ACC_SETUP_VFs(q_prim_ts(i)) + end do end if ! Allocating the cell-average primitive variables - @:ALLOCATE(q_prim_vf(1:sys_size)) + @:ALLOCATE_GLOBAL(q_prim_vf(1:sys_size)) do i = 1, adv_idx%end @:ALLOCATE(q_prim_vf(i)%sf(ix_t%beg:ix_t%end, & iy_t%beg:iy_t%end, & iz_t%beg:iz_t%end)) + @:ACC_SETUP_SFs(q_prim_vf(i)) end do if (bubbles) then @@ -136,79 +165,100 @@ contains @:ALLOCATE(q_prim_vf(i)%sf(ix_t%beg:ix_t%end, & iy_t%beg:iy_t%end, & iz_t%beg:iz_t%end)) + @:ACC_SETUP_SFs(q_prim_vf(i)) + end do + end if + + if (hypoelasticity) then + + do i = stress_idx%beg, stress_idx%end + @:ALLOCATE(q_prim_vf(i)%sf(ix_t%beg:ix_t%end, & + iy_t%beg:iy_t%end, & + iz_t%beg:iz_t%end)) + @:ACC_SETUP_SFs(q_prim_vf(i)) + end do + end if + + if (model_eqns == 3) then + do i = internalEnergies_idx%beg, internalEnergies_idx%end + @:ALLOCATE(q_prim_vf(i)%sf(ix_t%beg:ix_t%end, & + iy_t%beg:iy_t%end, & + iz_t%beg:iz_t%end)) + @:ACC_SETUP_SFs(q_prim_vf(i)) end do end if - @:ALLOCATE(pb_ts(1:2)) + @:ALLOCATE_GLOBAL(pb_ts(1:2)) !Initialize bubble variables pb and mv at all quadrature nodes for all R0 bins if (qbmm .and. (.not. polytropic)) then @:ALLOCATE(pb_ts(1)%sf(ix_t%beg:ix_t%end, & iy_t%beg:iy_t%end, & iz_t%beg:iz_t%end, 1:nnode, 1:nb)) + @:ACC_SETUP_SFs(pb_ts(1)) + @:ALLOCATE(pb_ts(2)%sf(ix_t%beg:ix_t%end, & iy_t%beg:iy_t%end, & iz_t%beg:iz_t%end, 1:nnode, 1:nb)) - @:ALLOCATE(rhs_pb(ix_t%beg:ix_t%end, & + @:ACC_SETUP_SFs(pb_ts(2)) + + @:ALLOCATE_GLOBAL(rhs_pb(ix_t%beg:ix_t%end, & iy_t%beg:iy_t%end, & iz_t%beg:iz_t%end, 1:nnode, 1:nb)) else if (qbmm .and. polytropic) then @:ALLOCATE(pb_ts(1)%sf(ix_t%beg:ix_t%beg + 1, & iy_t%beg:iy_t%beg + 1, & iz_t%beg:iz_t%beg + 1, 1:nnode, 1:nb)) + @:ACC_SETUP_SFs(pb_ts(1)) + @:ALLOCATE(pb_ts(2)%sf(ix_t%beg:ix_t%beg + 1, & iy_t%beg:iy_t%beg + 1, & iz_t%beg:iz_t%beg + 1, 1:nnode, 1:nb)) - @:ALLOCATE(rhs_pb(ix_t%beg:ix_t%beg + 1, & + @:ACC_SETUP_SFs(pb_ts(2)) + + @:ALLOCATE_GLOBAL(rhs_pb(ix_t%beg:ix_t%beg + 1, & iy_t%beg:iy_t%beg + 1, & iz_t%beg:iz_t%beg + 1, 1:nnode, 1:nb)) end if - @:ALLOCATE(mv_ts(1:2)) + @:ALLOCATE_GLOBAL(mv_ts(1:2)) if (qbmm .and. (.not. polytropic)) then @:ALLOCATE(mv_ts(1)%sf(ix_t%beg:ix_t%end, & iy_t%beg:iy_t%end, & iz_t%beg:iz_t%end, 1:nnode, 1:nb)) + @:ACC_SETUP_SFs(mv_ts(1)) + @:ALLOCATE(mv_ts(2)%sf(ix_t%beg:ix_t%end, & iy_t%beg:iy_t%end, & iz_t%beg:iz_t%end, 1:nnode, 1:nb)) - @:ALLOCATE(rhs_mv(ix_t%beg:ix_t%end, & + @:ACC_SETUP_SFs(mv_ts(2)) + + @:ALLOCATE_GLOBAL(rhs_mv(ix_t%beg:ix_t%end, & iy_t%beg:iy_t%end, & iz_t%beg:iz_t%end, 1:nnode, 1:nb)) + else if (qbmm .and. polytropic) then @:ALLOCATE(mv_ts(1)%sf(ix_t%beg:ix_t%beg + 1, & iy_t%beg:iy_t%beg + 1, & iz_t%beg:iz_t%beg + 1, 1:nnode, 1:nb)) + @:ACC_SETUP_SFs(mv_ts(1)) + @:ALLOCATE(mv_ts(2)%sf(ix_t%beg:ix_t%beg + 1, & iy_t%beg:iy_t%beg + 1, & iz_t%beg:iz_t%beg + 1, 1:nnode, 1:nb)) - @:ALLOCATE(rhs_mv(ix_t%beg:ix_t%beg + 1, & + @:ACC_SETUP_SFs(mv_ts(2)) + + @:ALLOCATE_GLOBAL(rhs_mv(ix_t%beg:ix_t%beg + 1, & iy_t%beg:iy_t%beg + 1, & iz_t%beg:iz_t%beg + 1, 1:nnode, 1:nb)) end if - if (hypoelasticity) then - - do i = stress_idx%beg, stress_idx%end - @:ALLOCATE(q_prim_vf(i)%sf(ix_t%beg:ix_t%end, & - iy_t%beg:iy_t%end, & - iz_t%beg:iz_t%end)) - end do - end if - - if (model_eqns == 3) then - do i = internalEnergies_idx%beg, internalEnergies_idx%end - @:ALLOCATE(q_prim_vf(i)%sf(ix_t%beg:ix_t%end, & - iy_t%beg:iy_t%end, & - iz_t%beg:iz_t%end)) - end do - end if - ! Allocating the cell-average RHS variables - @:ALLOCATE(rhs_vf(1:sys_size)) + @:ALLOCATE_GLOBAL(rhs_vf(1:sys_size)) do i = 1, sys_size @:ALLOCATE(rhs_vf(i)%sf(0:m, 0:n, 0:p)) + @:ACC_SETUP_SFs(rhs_vf(i)) end do ! Opening and writing the header of the run-time information file @@ -803,7 +853,7 @@ contains end do - @:DEALLOCATE(q_cons_ts) + @:DEALLOCATE_GLOBAL(q_cons_ts) ! Deallocating the cell-average primitive ts variables if (probe_wrt) then @@ -813,7 +863,7 @@ contains end do @:DEALLOCATE(q_prim_ts(i)%vf) end do - @:DEALLOCATE(q_prim_ts) + @:DEALLOCATE_GLOBAL(q_prim_ts) end if ! Deallocating the cell-average primitive variables @@ -839,14 +889,14 @@ contains end do end if - @:DEALLOCATE(q_prim_vf) + @:DEALLOCATE_GLOBAL(q_prim_vf) ! Deallocating the cell-average RHS variables do i = 1, sys_size @:DEALLOCATE(rhs_vf(i)%sf) end do - @:DEALLOCATE(rhs_vf) + @:DEALLOCATE_GLOBAL(rhs_vf) ! Writing the footer of and closing the run-time information file if (proc_rank == 0 .and. run_time_info) then diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp index cd573c2ac..bce3fa8df 100644 --- a/src/simulation/m_viscous.fpp +++ b/src/simulation/m_viscous.fpp @@ -24,16 +24,24 @@ module m_viscous s_finalize_viscous_module type(int_bounds_info) :: iv - type(int_bounds_info) :: is1, is2, is3 - !$acc declare create(is1, is2, is3, iv) - - real(kind(0d0)), allocatable, dimension(:, :) :: Res - !$acc declare create(Res) - - !> @name Additional field for capillary source terms - !> @{ + type(int_bounds_info) :: is1_viscous, is2_viscous, is3_viscous +!$acc declare create(is1_viscous, is2_viscous, is3_viscous, iv) + +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), Res_viscous) +!$acc declare link(Res_viscous) +#else + real(kind(0d0)), allocatable, dimension(:, :) :: Res_viscous +!$acc declare create(Re_viscous) +#endif + +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(type(scalar_field), dimension(:), tau_Re_vf) +!$acc declare link(tau_Re_vf) +#else type(scalar_field), allocatable, dimension(:) :: tau_Re_vf - !> @} +!$acc declare create(tau_Re_vf) +#endif contains @@ -50,26 +58,27 @@ contains ix%end = m - ix%beg; iy%end = n - iy%beg; iz%end = p - iz%beg ! ================================================================== - @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) + @:ALLOCATE_GLOBAL(Res_viscous(1:2, 1:maxval(Re_size))) do i = 1, 2 do j = 1, Re_size(i) - Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i) + Res_viscous(i, j) = fluid_pp(Re_idx(i, j))%Re(i) end do end do - !$acc update device(Res, Re_idx, Re_size) + !$acc update device(Res_viscous, Re_idx, Re_size) + !$acc enter data copyin(is1_viscous, is2_viscous, is3_viscous, iv) - if (cyl_coord) then - @:ALLOCATE(tau_Re_vf(1:sys_size)) - do i = 1, num_dims - @:ALLOCATE(tau_Re_vf(cont_idx%end + i)%sf(ix%beg:ix%end, & - & iy%beg:iy%end, & - & iz%beg:iz%end)) - end do - @:ALLOCATE(tau_Re_vf(E_idx)%sf(ix%beg:ix%end, & - & iy%beg:iy%end, & - & iz%beg:iz%end)) - end if + @:ALLOCATE_GLOBAL(tau_Re_vf(1:sys_size)) + do i = 1, num_dims + @:ALLOCATE(tau_Re_vf(cont_idx%end + i)%sf(ix%beg:ix%end, & + & iy%beg:iy%end, & + & iz%beg:iz%end)) + @:ACC_SETUP_SFs(tau_Re_vf(cont_idx%end + i)) + end do + @:ALLOCATE(tau_Re_vf(E_idx)%sf(ix%beg:ix%end, & + & iy%beg:iy%end, & + & iz%beg:iz%end)) + @:ACC_SETUP_SFs(tau_Re_vf(E_idx)) end subroutine s_initialize_viscous_module @@ -101,12 +110,14 @@ contains type(int_bounds_info) :: ix, iy, iz - !$acc update device(ix, iy, iz) + is1_viscous = ix; is2_viscous = iy; is3_viscous = iz + + !$acc update device(is1_viscous, is2_viscous, is3_viscous) !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do k = iy%beg, iy%end - do j = ix%beg, ix%end + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = momxb, E_idx tau_Re_vf(i)%sf(j, k, l) = 0d0 @@ -116,9 +127,9 @@ contains end do if (Re_size(1) > 0) then ! Shear stresses !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re ) - do l = iz%beg, iz%end + do l = is3_viscous%beg, is3_viscous%end do k = -1, 1 - do j = ix%beg, ix%end + do j = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = 1, num_fluids @@ -188,7 +199,7 @@ contains if (Re_size(i) > 0) Re_visc(i) = 0d0 !$acc loop seq do q = 1, Re_size(i) - Re_visc(i) = alpha_visc(Re_idx(i, q))/Res(i, q) & + Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + Re_visc(i) end do @@ -223,9 +234,9 @@ contains if (Re_size(2) > 0) then ! Bulk stresses !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re ) - do l = iz%beg, iz%end + do l = is3_viscous%beg, is3_viscous%end do k = -1, 1 - do j = ix%beg, ix%end + do j = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = 1, num_fluids @@ -295,7 +306,7 @@ contains if (Re_size(i) > 0) Re_visc(i) = 0d0 !$acc loop seq do q = 1, Re_size(i) - Re_visc(i) = alpha_visc(Re_idx(i, q))/Res(i, q) & + Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + Re_visc(i) end do @@ -327,9 +338,9 @@ contains if (Re_size(1) > 0) then ! Shear stresses !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re ) - do l = iz%beg, iz%end + do l = is3_viscous%beg, is3_viscous%end do k = -1, 1 - do j = ix%beg, ix%end + do j = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = 1, num_fluids @@ -399,7 +410,7 @@ contains if (Re_size(i) > 0) Re_visc(i) = 0d0 !$acc loop seq do q = 1, Re_size(i) - Re_visc(i) = alpha_visc(Re_idx(i, q))/Res(i, q) & + Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + Re_visc(i) end do @@ -435,9 +446,9 @@ contains if (Re_size(2) > 0) then ! Bulk stresses !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re ) - do l = iz%beg, iz%end + do l = is3_viscous%beg, is3_viscous%end do k = -1, 1 - do j = ix%beg, ix%end + do j = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = 1, num_fluids @@ -507,7 +518,7 @@ contains if (Re_size(i) > 0) Re_visc(i) = 0d0 !$acc loop seq do q = 1, Re_size(i) - Re_visc(i) = alpha_visc(Re_idx(i, q))/Res(i, q) & + Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + Re_visc(i) end do @@ -553,7 +564,7 @@ contains qL_prim_rsy_vf, qR_prim_rsy_vf, & qL_prim_rsz_vf, qR_prim_rsz_vf - type(vector_field), dimension(1:num_dims) :: qL_prim, qR_prim + type(vector_field), dimension(num_dims), intent(INOUT) :: qL_prim, qR_prim type(vector_field) :: q_prim_qp @@ -562,7 +573,7 @@ contains dqL_prim_dy_n, dqR_prim_dy_n, & dqL_prim_dz_n, dqR_prim_dz_n - type(vector_field) :: dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp + type(vector_field), dimension(1), intent(INOUT) :: dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp type(int_bounds_info), intent(IN) :: ix, iy, iz @@ -590,19 +601,19 @@ contains call s_apply_scalar_divergence_theorem( & qL_prim(i)%vf(iv%beg:iv%end), & qR_prim(i)%vf(iv%beg:iv%end), & - dq_prim_dx_qp%vf(iv%beg:iv%end), i, & + dq_prim_dx_qp(1)%vf(iv%beg:iv%end), i, & ix, iy, iz, iv, dx, m, buff_size) elseif (i == 2) then call s_apply_scalar_divergence_theorem( & qL_prim(i)%vf(iv%beg:iv%end), & qR_prim(i)%vf(iv%beg:iv%end), & - dq_prim_dy_qp%vf(iv%beg:iv%end), i, & + dq_prim_dy_qp(1)%vf(iv%beg:iv%end), i, & ix, iy, iz, iv, dy, n, buff_size) else call s_apply_scalar_divergence_theorem( & qL_prim(i)%vf(iv%beg:iv%end), & qR_prim(i)%vf(iv%beg:iv%end), & - dq_prim_dz_qp%vf(iv%beg:iv%end), i, & + dq_prim_dz_qp(1)%vf(iv%beg:iv%end), i, & ix, iy, iz, iv, dz, p, buff_size) end if end do @@ -612,10 +623,14 @@ contains iv%beg = mom_idx%beg; iv%end = mom_idx%end !$acc update device(iv) + is1_viscous = ix; is2_viscous = iy; is3_viscous = iz + + !$acc update device(is1_viscous, is2_viscous, is3_viscous) + !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end + do l = is3_viscous%beg, is3_viscous%end do k = iy%beg, iy%end - do j = ix%beg + 1, ix%end + do j = is1_viscous%beg + 1, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end dqL_prim_dx_n(1)%vf(i)%sf(j, k, l) = & @@ -628,9 +643,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do k = iy%beg, iy%end - do j = ix%beg, ix%end - 1 + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end - 1 !$acc loop seq do i = iv%beg, iv%end dqR_prim_dx_n(1)%vf(i)%sf(j, k, l) = & @@ -645,9 +660,9 @@ contains if (n > 0) then !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do j = iy%beg + 1, iy%end - do k = ix%beg, ix%end + do l = is3_viscous%beg, is3_viscous%end + do j = is2_viscous%beg + 1, is2_viscous%end + do k = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = & @@ -660,9 +675,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do j = iy%beg, iy%end - 1 - do k = ix%beg, ix%end + do l = is3_viscous%beg, is3_viscous%end + do j = is2_viscous%beg, is2_viscous%end - 1 + do k = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = & @@ -675,9 +690,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do j = iy%beg + 1, iy%end - do k = ix%beg + 1, ix%end - 1 + do l = is3_viscous%beg, is3_viscous%end + do j = is2_viscous%beg + 1, is2_viscous%end + do k = is1_viscous%beg + 1, is1_viscous%end - 1 !$acc loop seq do i = iv%beg, iv%end dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = & @@ -694,9 +709,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do j = iy%beg, iy%end - 1 - do k = ix%beg + 1, ix%end - 1 + do l = is3_viscous%beg, is3_viscous%end + do j = is2_viscous%beg, is2_viscous%end - 1 + do k = is1_viscous%beg + 1, is1_viscous%end - 1 !$acc loop seq do i = iv%beg, iv%end dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = & @@ -714,9 +729,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do k = iy%beg + 1, iy%end - 1 - do j = ix%beg + 1, ix%end + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = & @@ -734,9 +749,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do k = iy%beg + 1, iy%end - 1 - do j = ix%beg, ix%end - 1 + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg, is1_viscous%end - 1 !$acc loop seq do i = iv%beg, iv%end dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = & @@ -756,9 +771,9 @@ contains if (p > 0) then !$acc parallel loop collapse(3) gang vector default(present) - do j = iz%beg + 1, iz%end - do l = iy%beg, iy%end - do k = ix%beg, ix%end + do j = is3_viscous%beg + 1, is3_viscous%end + do l = is2_viscous%beg, is2_viscous%end + do k = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end @@ -772,9 +787,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do j = iz%beg, iz%end - 1 - do l = iy%beg, iy%end - do k = ix%beg, ix%end + do j = is3_viscous%beg, is3_viscous%end - 1 + do l = is2_viscous%beg, is2_viscous%end + do k = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end @@ -788,9 +803,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg + 1, iz%end - 1 - do k = iy%beg, iy%end - do j = ix%beg + 1, ix%end + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg + 1, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end @@ -809,9 +824,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg + 1, iz%end - 1 - do k = iy%beg, iy%end - do j = ix%beg, ix%end - 1 + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end - 1 !$acc loop seq do i = iv%beg, iv%end @@ -830,9 +845,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg + 1, iz%end - 1 - do j = iy%beg + 1, iy%end - do k = ix%beg, ix%end + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do j = is2_viscous%beg + 1, is2_viscous%end + do k = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end @@ -851,9 +866,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg + 1, iz%end - 1 - do j = iy%beg, iy%end - 1 - do k = ix%beg, ix%end + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do j = is2_viscous%beg, is2_viscous%end - 1 + do k = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end @@ -872,9 +887,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do j = iz%beg + 1, iz%end - do l = iy%beg + 1, iy%end - 1 - do k = ix%beg, ix%end + do j = is3_viscous%beg + 1, is3_viscous%end + do l = is2_viscous%beg + 1, is2_viscous%end - 1 + do k = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end @@ -893,9 +908,9 @@ contains end do !$acc parallel loop collapse(3) gang vector default(present) - do j = iz%beg, iz%end - 1 - do l = iy%beg + 1, iy%end - 1 - do k = ix%beg, ix%end + do j = is3_viscous%beg, is3_viscous%end - 1 + do l = is2_viscous%beg + 1, is2_viscous%end - 1 + do k = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end @@ -912,11 +927,10 @@ contains end do end do end do - !$acc parallel loop collapse(3) gang vector default(present) - do j = iz%beg + 1, iz%end - do l = iy%beg, iy%end - do k = ix%beg + 1, ix%end - 1 + do j = is3_viscous%beg + 1, is3_viscous%end + do l = is2_viscous%beg, is2_viscous%end + do k = is1_viscous%beg + 1, is1_viscous%end - 1 !$acc loop seq do i = iv%beg, iv%end @@ -933,11 +947,10 @@ contains end do end do end do - !$acc parallel loop collapse(3) gang vector default(present) - do j = iz%beg, iz%end - 1 - do l = iy%beg, iy%end - do k = ix%beg + 1, ix%end - 1 + do j = is3_viscous%beg, is3_viscous%end - 1 + do l = is2_viscous%beg, is2_viscous%end + do k = is1_viscous%beg + 1, is1_viscous%end - 1 !$acc loop seq do i = iv%beg, iv%end dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = & @@ -956,9 +969,9 @@ contains do i = iv%beg, iv%end call s_compute_fd_gradient(q_prim_qp%vf(i), & - dq_prim_dx_qp%vf(i), & - dq_prim_dy_qp%vf(i), & - dq_prim_dz_qp%vf(i), & + dq_prim_dx_qp(1)%vf(i), & + dq_prim_dy_qp(1)%vf(i), & + dq_prim_dz_qp(1)%vf(i), & ix, iy, iz, buff_size) end do @@ -966,9 +979,9 @@ contains do i = iv%beg, iv%end call s_compute_fd_gradient(q_prim_qp%vf(i), & - dq_prim_dx_qp%vf(i), & - dq_prim_dy_qp%vf(i), & - dq_prim_dy_qp%vf(i), & + dq_prim_dx_qp(1)%vf(i), & + dq_prim_dy_qp(1)%vf(i), & + dq_prim_dy_qp(1)%vf(i), & ix, iy, iz, buff_size) end do @@ -977,9 +990,9 @@ contains else do i = iv%beg, iv%end call s_compute_fd_gradient(q_prim_qp%vf(i), & - dq_prim_dx_qp%vf(i), & - dq_prim_dx_qp%vf(i), & - dq_prim_dx_qp%vf(i), & + dq_prim_dx_qp(1)%vf(i), & + dq_prim_dx_qp(1)%vf(i), & + dq_prim_dx_qp(1)%vf(i), & ix, iy, iz, buff_size) end do @@ -1191,23 +1204,23 @@ contains ! Reconstruction in s1-direction =================================== if (norm_dir == 1) then - is1 = ix; is2 = iy; is3 = iz - weno_dir = 1; is1%beg = is1%beg + weno_polyn - is1%end = is1%end - weno_polyn + is1_viscous = ix; is2_viscous = iy; is3_viscous = iz + weno_dir = 1; is1_viscous%beg = is1_viscous%beg + weno_polyn + is1_viscous%end = is1_viscous%end - weno_polyn elseif (norm_dir == 2) then - is1 = iy; is2 = ix; is3 = iz - weno_dir = 2; is1%beg = is1%beg + weno_polyn - is1%end = is1%end - weno_polyn + is1_viscous = iy; is2_viscous = ix; is3_viscous = iz + weno_dir = 2; is1_viscous%beg = is1_viscous%beg + weno_polyn + is1_viscous%end = is1_viscous%end - weno_polyn else - is1 = iz; is2 = iy; is3 = ix - weno_dir = 3; is1%beg = is1%beg + weno_polyn - is1%end = is1%end - weno_polyn + is1_viscous = iz; is2_viscous = iy; is3_viscous = ix + weno_dir = 3; is1_viscous%beg = is1_viscous%beg + weno_polyn + is1_viscous%end = is1_viscous%end - weno_polyn end if - !$acc update device(is1, is2, is3, iv) + !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv) if (n > 0) then if (p > 0) then @@ -1215,19 +1228,19 @@ contains call s_weno(v_vf(iv%beg:iv%end), & vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, iv%beg:iv%end), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, iv%beg:iv%end), & norm_dir, weno_dir, & - is1, is2, is3) + is1_viscous, is2_viscous, is3_viscous) else call s_weno(v_vf(iv%beg:iv%end), & vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, :), & norm_dir, weno_dir, & - is1, is2, is3) + is1_viscous, is2_viscous, is3_viscous) end if else call s_weno(v_vf(iv%beg:iv%end), & vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, :), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, :), vR_z(:, :, :, :), & norm_dir, weno_dir, & - is1, is2, is3) + is1_viscous, is2_viscous, is3_viscous) end if if (any(Re_size > 0)) then @@ -1235,9 +1248,9 @@ contains if (norm_dir == 2) then !$acc parallel loop collapse(4) gang vector default(present) do i = iv%beg, iv%end - do l = is3%beg, is3%end - do j = is1%beg, is1%end - do k = is2%beg, is2%end + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + do k = is2_viscous%beg, is2_viscous%end vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i) vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i) end do @@ -1247,9 +1260,9 @@ contains elseif (norm_dir == 3) then !$acc parallel loop collapse(4) gang vector default(present) do i = iv%beg, iv%end - do j = is1%beg, is1%end - do k = is2%beg, is2%end - do l = is3%beg, is3%end + do j = is1_viscous%beg, is1_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do l = is3_viscous%beg, is3_viscous%end vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i) vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i) end do @@ -1259,9 +1272,9 @@ contains elseif (norm_dir == 1) then !$acc parallel loop collapse(4) gang vector default(present) do i = iv%beg, iv%end - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i) vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i) end do @@ -1294,23 +1307,23 @@ contains ! Reconstruction in s1-direction =================================== if (norm_dir == 1) then - is1 = ix; is2 = iy; is3 = iz - weno_dir = 1; is1%beg = is1%beg + weno_polyn - is1%end = is1%end - weno_polyn + is1_viscous = ix; is2_viscous = iy; is3_viscous = iz + weno_dir = 1; is1_viscous%beg = is1_viscous%beg + weno_polyn + is1_viscous%end = is1_viscous%end - weno_polyn elseif (norm_dir == 2) then - is1 = iy; is2 = ix; is3 = iz - weno_dir = 2; is1%beg = is1%beg + weno_polyn - is1%end = is1%end - weno_polyn + is1_viscous = iy; is2_viscous = ix; is3_viscous = iz + weno_dir = 2; is1_viscous%beg = is1_viscous%beg + weno_polyn + is1_viscous%end = is1_viscous%end - weno_polyn else - is1 = iz; is2 = iy; is3 = ix - weno_dir = 3; is1%beg = is1%beg + weno_polyn - is1%end = is1%end - weno_polyn + is1_viscous = iz; is2_viscous = iy; is3_viscous = ix + weno_dir = 3; is1_viscous%beg = is1_viscous%beg + weno_polyn + is1_viscous%end = is1_viscous%end - weno_polyn end if - !$acc update device(is1, is2, is3, iv) + !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv) if (n > 0) then if (p > 0) then @@ -1318,19 +1331,19 @@ contains call s_weno(v_vf(iv%beg:iv%end), & vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, iv%beg:iv%end), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, iv%beg:iv%end), & norm_dir, weno_dir, & - is1, is2, is3) + is1_viscous, is2_viscous, is3_viscous) else call s_weno(v_vf(iv%beg:iv%end), & vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, :), & norm_dir, weno_dir, & - is1, is2, is3) + is1_viscous, is2_viscous, is3_viscous) end if else call s_weno(v_vf(iv%beg:iv%end), & vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, :), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, :), vR_z(:, :, :, :), & norm_dir, weno_dir, & - is1, is2, is3) + is1_viscous, is2_viscous, is3_viscous) end if if (any(Re_size > 0)) then @@ -1338,9 +1351,9 @@ contains if (norm_dir == 2) then !$acc parallel loop collapse(4) gang vector default(present) do i = iv%beg, iv%end - do l = is3%beg, is3%end - do j = is1%beg, is1%end - do k = is2%beg, is2%end + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + do k = is2_viscous%beg, is2_viscous%end vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i) vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i) end do @@ -1350,9 +1363,9 @@ contains elseif (norm_dir == 3) then !$acc parallel loop collapse(4) gang vector default(present) do i = iv%beg, iv%end - do j = is1%beg, is1%end - do k = is2%beg, is2%end - do l = is3%beg, is3%end + do j = is1_viscous%beg, is1_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do l = is3_viscous%beg, is3_viscous%end vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i) vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i) end do @@ -1362,9 +1375,9 @@ contains elseif (norm_dir == 1) then !$acc parallel loop collapse(4) gang vector default(present) do i = iv%beg, iv%end - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i) vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i) end do @@ -1390,11 +1403,10 @@ contains subroutine s_apply_scalar_divergence_theorem(vL_vf, vR_vf, & ! -------- dv_ds_vf, & norm_dir, & - ix, iy, iz, iv, & + ix, iy, iz, iv_in, & dL, dim, buff_size_in) - type(int_bounds_info) :: ix, iy, iz, iv - + type(int_bounds_info), intent(IN) :: ix, iy, iz, iv_in integer :: buff_size_in, dim real(kind(0d0)), dimension(-buff_size_in:dim + buff_size_in) :: dL @@ -1412,7 +1424,12 @@ contains integer :: i, j, k, l !< Generic loop iterators - !$acc update device(ix, iy, iz, iv) + is1_viscous = ix + is2_viscous = iy + is3_viscous = iz + iv = iv_in + + !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv) ! First-Order Spatial Derivatives in x-direction =================== if (norm_dir == 1) then @@ -1424,9 +1441,9 @@ contains ! spatial derivatives inside the cell. !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do k = iy%beg, iy%end - do j = ix%beg + 1, ix%end - 1 + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg + 1, is1_viscous%end - 1 !$acc loop seq do i = iv%beg, iv%end dv_ds_vf(i)%sf(j, k, l) = & @@ -1452,9 +1469,9 @@ contains ! spatial derivatives inside the cell. !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg, iz%end - do k = iy%beg + 1, iy%end - 1 - do j = ix%beg, ix%end + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end dv_ds_vf(i)%sf(j, k, l) = & @@ -1480,9 +1497,9 @@ contains ! spatial derivatives inside the cell. !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg + 1, iz%end - 1 - do k = iy%beg, iy%end - do j = ix%beg, ix%end + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end !$acc loop seq do i = iv%beg, iv%end dv_ds_vf(i)%sf(j, k, l) = & @@ -1534,12 +1551,14 @@ contains iz%beg = -1; iz%end = 1 end if - !$acc update device(ix, iy, iz) + is1_viscous = ix; is2_viscous = iy; is3_viscous = iz + + !$acc update device(is1_viscous, is2_viscous, is3_viscous) !$acc parallel loop collapse(3) gang vector default(present) - do l = iz%beg + 1, iz%end - 1 - do k = iy%beg + 1, iy%end - 1 - do j = ix%beg + 1, ix%end - 1 + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end - 1 grad_x%sf(j, k, l) = & (var%sf(j + 1, k, l) - var%sf(j - 1, k, l))/ & (x_cc(j + 1) - x_cc(j - 1)) @@ -1549,9 +1568,9 @@ contains if (n > 0) then !$acc parallel loop collapse(3) gang vector - do l = iz%beg + 1, iz%end - 1 - do k = iy%beg + 1, iy%end - 1 - do j = ix%beg + 1, ix%end - 1 + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end - 1 grad_y%sf(j, k, l) = & (var%sf(j, k + 1, l) - var%sf(j, k - 1, l))/ & (y_cc(k + 1) - y_cc(k - 1)) @@ -1562,9 +1581,9 @@ contains if (p > 0) then !$acc parallel loop collapse(3) gang vector - do l = iz%beg + 1, iz%end - 1 - do k = iy%beg + 1, iy%end - 1 - do j = ix%beg + 1, ix%end - 1 + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end - 1 grad_z%sf(j, k, l) = & (var%sf(j, k, l + 1) - var%sf(j, k, l - 1))/ & (z_cc(l + 1) - z_cc(l - 1)) @@ -1573,54 +1592,54 @@ contains end do end if - ix%beg = -buff_size_in; ix%end = m + buff_size_in; + is1_viscous%beg = -buff_size_in; is1_viscous%end = m + buff_size_in; if (n > 0) then - iy%beg = -buff_size_in; iy%end = n + buff_size_in + is2_viscous%beg = -buff_size_in; is2_viscous%end = n + buff_size_in else - iy%beg = 0; iy%end = 0 + is2_viscous%beg = 0; is2_viscous%end = 0 end if if (p > 0) then - iz%beg = -buff_size_in; iz%end = p + buff_size_in + is3_viscous%beg = -buff_size_in; is3_viscous%end = p + buff_size_in else - iz%beg = 0; iz%end = 0 + is3_viscous%beg = 0; is3_viscous%end = 0 end if - !$acc update device(ix, iy, iz) + !$acc update device(is1_viscous, is2_viscous, is3_viscous) !$acc parallel loop collapse(2) gang vector default(present) - do l = iz%beg, iz%end - do k = iy%beg, iy%end - grad_x%sf(ix%beg, k, l) = & - (-3d0*var%sf(ix%beg, k, l) + 4d0*var%sf(ix%beg + 1, k, l) - var%sf(ix%beg + 2, k, l))/ & - (x_cc(ix%beg + 2) - x_cc(ix%beg)) - grad_x%sf(ix%end, k, l) = & - (3d0*var%sf(ix%end, k, l) - 4d0*var%sf(ix%end - 1, k, l) + var%sf(ix%end - 2, k, l))/ & - (x_cc(ix%end) - x_cc(ix%end - 2)) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + grad_x%sf(is1_viscous%beg, k, l) = & + (-3d0*var%sf(is1_viscous%beg, k, l) + 4d0*var%sf(is1_viscous%beg + 1, k, l) - var%sf(is1_viscous%beg + 2, k, l))/ & + (x_cc(is1_viscous%beg + 2) - x_cc(is1_viscous%beg)) + grad_x%sf(is1_viscous%end, k, l) = & + (3d0*var%sf(is1_viscous%end, k, l) - 4d0*var%sf(is1_viscous%end - 1, k, l) + var%sf(is1_viscous%end - 2, k, l))/ & + (x_cc(is1_viscous%end) - x_cc(is1_viscous%end - 2)) end do end do if (n > 0) then !$acc parallel loop collapse(2) gang vector default(present) - do l = iz%beg, iz%end - do j = ix%beg, ix%end - grad_y%sf(j, iy%beg, l) = & - (-3d0*var%sf(j, iy%beg, l) + 4d0*var%sf(j, iy%beg + 1, l) - var%sf(j, iy%beg + 2, l))/ & - (y_cc(iy%beg + 2) - y_cc(iy%beg)) - grad_y%sf(j, iy%end, l) = & - (3d0*var%sf(j, iy%end, l) - 4d0*var%sf(j, iy%end - 1, l) + var%sf(j, iy%end - 2, l))/ & - (y_cc(iy%end) - y_cc(iy%end - 2)) + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_y%sf(j, is2_viscous%beg, l) = & + (-3d0*var%sf(j, is2_viscous%beg, l) + 4d0*var%sf(j, is2_viscous%beg + 1, l) - var%sf(j, is2_viscous%beg + 2, l))/ & + (y_cc(is2_viscous%beg + 2) - y_cc(is2_viscous%beg)) + grad_y%sf(j, is2_viscous%end, l) = & + (3d0*var%sf(j, is2_viscous%end, l) - 4d0*var%sf(j, is2_viscous%end - 1, l) + var%sf(j, is2_viscous%end - 2, l))/ & + (y_cc(is2_viscous%end) - y_cc(is2_viscous%end - 2)) end do end do if (p > 0) then !$acc parallel loop collapse(2) gang vector default(present) - do k = iy%beg, iy%end - do j = ix%beg, ix%end - grad_z%sf(j, k, iz%beg) = & - (-3d0*var%sf(j, k, iz%beg) + 4d0*var%sf(j, k, iz%beg + 1) - var%sf(j, k, iz%beg + 2))/ & - (z_cc(iz%beg + 2) - z_cc(iz%beg)) - grad_z%sf(j, k, iz%end) = & - (3d0*var%sf(j, k, iz%end) - 4d0*var%sf(j, k, iz%end - 1) + var%sf(j, k, iz%end - 2))/ & - (z_cc(iz%end) - z_cc(iz%end - 2)) + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_z%sf(j, k, is3_viscous%beg) = & + (-3d0*var%sf(j, k, is3_viscous%beg) + 4d0*var%sf(j, k, is3_viscous%beg + 1) - var%sf(j, k, is3_viscous%beg + 2))/ & + (z_cc(is3_viscous%beg + 2) - z_cc(is3_viscous%beg)) + grad_z%sf(j, k, is3_viscous%end) = & + (3d0*var%sf(j, k, is3_viscous%end) - 4d0*var%sf(j, k, is3_viscous%end - 1) + var%sf(j, k, is3_viscous%end - 2))/ & + (z_cc(is3_viscous%end) - z_cc(is3_viscous%end - 2)) end do end do end if @@ -1628,8 +1647,8 @@ contains if (bc_x%beg <= -3) then !$acc parallel loop collapse(2) gang vector default(present) - do l = iz%beg, iz%end - do k = iy%beg, iy%end + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end grad_x%sf(0, k, l) = (-3d0*var%sf(0, k, l) + 4d0*var%sf(1, k, l) - var%sf(2, k, l))/ & (x_cc(2) - x_cc(0)) end do @@ -1637,18 +1656,18 @@ contains end if if (bc_x%end <= -3) then !$acc parallel loop collapse(2) gang vector default(present) - do l = iz%beg, iz%end - do k = iy%beg, iy%end + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end grad_x%sf(m, k, l) = (3d0*var%sf(m, k, l) - 4d0*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ & (x_cc(m) - x_cc(m - 2)) end do end do end if if (n > 0) then - if (bc_y%beg <= -3 .and. bc_y%beg /= -14) then + if (bc_y%beg <= -3 .and. bc_y%beg /= -13) then !$acc parallel loop collapse(2) gang vector default(present) - do l = iz%beg, iz%end - do j = ix%beg, ix%end + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end grad_y%sf(j, 0, l) = (-3d0*var%sf(j, 0, l) + 4d0*var%sf(j, 1, l) - var%sf(j, 2, l))/ & (y_cc(2) - y_cc(0)) end do @@ -1656,8 +1675,8 @@ contains end if if (bc_y%end <= -3) then !$acc parallel loop collapse(2) gang vector default(present) - do l = iz%beg, iz%end - do j = ix%beg, ix%end + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end grad_y%sf(j, n, l) = (3d0*var%sf(j, n, l) - 4d0*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ & (y_cc(n) - y_cc(n - 2)) end do @@ -1666,8 +1685,8 @@ contains if (p > 0) then if (bc_z%beg <= -3) then !$acc parallel loop collapse(2) gang vector default(present) - do k = iy%beg, iy%end - do j = ix%beg, ix%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end grad_z%sf(j, k, 0) = & (-3d0*var%sf(j, k, 0) + 4d0*var%sf(j, k, 1) - var%sf(j, k, 2))/ & (z_cc(2) - z_cc(0)) @@ -1676,8 +1695,8 @@ contains end if if (bc_z%end <= -3) then !$acc parallel loop collapse(2) gang vector default(present) - do k = iy%beg, iy%end - do j = ix%beg, ix%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end grad_z%sf(j, k, p) = & (3d0*var%sf(j, k, p) - 4d0*var%sf(j, k, p - 1) + var%sf(j, k, p - 2))/ & (z_cc(p) - z_cc(p - 2)) @@ -1693,14 +1712,14 @@ contains integer :: i - @:DEALLOCATE(Res) + @:DEALLOCATE_GLOBAL(Res_viscous) if (cyl_coord) then do i = 1, num_dims @:DEALLOCATE(tau_Re_vf(cont_idx%end + i)%sf) end do @:DEALLOCATE(tau_Re_vf(E_idx)%sf) - @:DEALLOCATE(tau_Re_vf) + @:DEALLOCATE_GLOBAL(tau_Re_vf) end if end subroutine s_finalize_viscous_module diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp index f1facc15e..21b0ea1e1 100644 --- a/src/simulation/m_weno.fpp +++ b/src/simulation/m_weno.fpp @@ -20,7 +20,7 @@ module m_weno use m_variables_conversion !< State variables type conversion procedures -#ifdef MFC_OpenACC +#ifdef MFC_OPENACC use openacc #endif @@ -39,7 +39,13 @@ module m_weno !! of the characteristic decomposition are stored in custom-constructed WENO- !! stencils (WS) that are annexed to each position of a given scalar field. !> @{ + +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :, :), v_rs_ws_x, v_rs_ws_y, v_rs_ws_z) +!$acc declare link(v_rs_ws_x, v_rs_ws_y, v_rs_ws_z) +#else real(kind(0d0)), allocatable, dimension(:, :, :, :) :: v_rs_ws_x, v_rs_ws_y, v_rs_ws_z +#endif !> @} ! WENO Coefficients ======================================================== @@ -50,6 +56,17 @@ module m_weno !! second dimension identifies the position of its coefficients and the last !! dimension denotes the cell-location in the relevant coordinate direction. !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), poly_coef_cbL_x) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), poly_coef_cbL_y) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), poly_coef_cbL_z) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), poly_coef_cbR_x) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), poly_coef_cbR_y) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), poly_coef_cbR_z) +!$acc declare link(poly_coef_cbL_x, poly_coef_cbL_y, poly_coef_cbL_z) +!$acc declare link(poly_coef_cbR_x, poly_coef_cbR_y, poly_coef_cbR_z) +#else real(kind(0d0)), target, allocatable, dimension(:, :, :) :: poly_coef_cbL_x real(kind(0d0)), target, allocatable, dimension(:, :, :) :: poly_coef_cbL_y real(kind(0d0)), target, allocatable, dimension(:, :, :) :: poly_coef_cbL_z @@ -57,10 +74,10 @@ module m_weno real(kind(0d0)), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_x real(kind(0d0)), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_y real(kind(0d0)), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_z - real(kind(0d0)), pointer, dimension(:, :, :) :: poly_coef_L - real(kind(0d0)), pointer, dimension(:, :, :) :: poly_coef_R -! real(kind(0d0)), pointer, dimension(:, :, :) :: poly_coef_L => null() -! real(kind(0d0)), pointer, dimension(:, :, :) :: poly_coef_R => null() +#endif + + ! real(kind(0d0)), pointer, dimension(:, :, :) :: poly_coef_L => null() + ! real(kind(0d0)), pointer, dimension(:, :, :) :: poly_coef_R => null() !> @} !> @name The ideal weights at the left and the right cell-boundaries and at the @@ -68,6 +85,16 @@ module m_weno !! that the first dimension of the array identifies the weight, while the !! last denotes the cell-location in the relevant coordinate direction. !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), d_cbL_y) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), d_cbL_x) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), d_cbL_z) + + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), d_cbR_x) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), d_cbR_y) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :), d_cbR_z) +!$acc declare link(d_cbL_x, d_cbL_y, d_cbL_z, d_cbR_x, d_cbR_y, d_cbR_z) +#else real(kind(0d0)), target, allocatable, dimension(:, :) :: d_cbL_x real(kind(0d0)), target, allocatable, dimension(:, :) :: d_cbL_y real(kind(0d0)), target, allocatable, dimension(:, :) :: d_cbL_z @@ -75,9 +102,7 @@ module m_weno real(kind(0d0)), target, allocatable, dimension(:, :) :: d_cbR_x real(kind(0d0)), target, allocatable, dimension(:, :) :: d_cbR_y real(kind(0d0)), target, allocatable, dimension(:, :) :: d_cbR_z - - real(kind(0d0)), pointer, dimension(:, :) :: d_L - real(kind(0d0)), pointer, dimension(:, :) :: d_R +#endif ! real(kind(0d0)), pointer, dimension(:, :) :: d_L => null() ! real(kind(0d0)), pointer, dimension(:, :) :: d_R => null() !> @} @@ -87,11 +112,16 @@ module m_weno !! second identifies the position of its coefficients and the last denotes !! the cell-location in the relevant coordinate direction. !> @{ +#ifdef CRAY_ACC_WAR + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), beta_coef_x) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), beta_coef_y) + @:CRAY_DECLARE_GLOBAL(real(kind(0d0)), dimension(:, :, :), beta_coef_z) +!$acc declare link(beta_coef_x, beta_coef_y, beta_coef_z) +#else real(kind(0d0)), target, allocatable, dimension(:, :, :) :: beta_coef_x real(kind(0d0)), target, allocatable, dimension(:, :, :) :: beta_coef_y real(kind(0d0)), target, allocatable, dimension(:, :, :) :: beta_coef_z - - real(kind(0d0)), pointer, dimension(:, :, :) :: beta_coef +#endif ! real(kind(0d0)), pointer, dimension(:, :, :) :: beta_coef => null() !> @} @@ -99,19 +129,25 @@ module m_weno integer :: v_size !< Number of WENO-reconstructed cell-average variables + !$acc declare create(v_size) + !> @name Indical bounds in the s1-, s2- and s3-directions !> @{ - type(int_bounds_info) :: is1, is2, is3 + type(int_bounds_info) :: is1_weno, is2_weno, is3_weno + !$acc declare create(is1_weno, is2_weno, is3_weno) + ! !> @} real(kind(0d0)) :: test - - !$acc declare create( & - !$acc v_rs_ws_x, v_rs_ws_y, v_rs_ws_z, & - !$acc poly_coef_cbL_x,poly_coef_cbL_y,poly_coef_cbL_z, & - !$acc poly_coef_cbR_x,poly_coef_cbR_y,poly_coef_cbR_z,d_cbL_x, & - !$acc d_cbL_y,d_cbL_z,d_cbR_x,d_cbR_y,d_cbR_z,beta_coef_x,beta_coef_y,beta_coef_z, & - !$acc v_size, is1, is2, is3, test) +!$acc declare create(test) + +#ifndef CRAY_ACC_WAR +!$acc declare create( & +!$acc v_rs_ws_x, v_rs_ws_y, v_rs_ws_z, & +!$acc poly_coef_cbL_x,poly_coef_cbL_y,poly_coef_cbL_z, & +!$acc poly_coef_cbR_x,poly_coef_cbR_y,poly_coef_cbR_z,d_cbL_x, & +!$acc d_cbL_y,d_cbL_z,d_cbR_x,d_cbR_y,d_cbR_z,beta_coef_x,beta_coef_y,beta_coef_z) +#endif contains @@ -124,95 +160,95 @@ contains if (weno_order == 1) return ! Allocating/Computing WENO Coefficients in x-direction ============ - is1%beg = -buff_size; is1%end = m - is1%beg + is1_weno%beg = -buff_size; is1_weno%end = m - is1_weno%beg if (n == 0) then - is2%beg = 0 + is2_weno%beg = 0 else - is2%beg = -buff_size; + is2_weno%beg = -buff_size; end if - is2%end = n - is2%beg + is2_weno%end = n - is2_weno%beg if (p == 0) then - is3%beg = 0 + is3_weno%beg = 0 else - is3%beg = -buff_size + is3_weno%beg = -buff_size end if - is3%end = p - is3%beg + is3_weno%end = p - is3_weno%beg - @:ALLOCATE(poly_coef_cbL_x(is1%beg + weno_polyn:is1%end - weno_polyn, 0:weno_polyn, & + @:ALLOCATE_GLOBAL(poly_coef_cbL_x(is1_weno%beg + weno_polyn:is1_weno%end - weno_polyn, 0:weno_polyn, & 0:weno_polyn - 1)) - @:ALLOCATE(poly_coef_cbR_x(is1%beg + weno_polyn:is1%end - weno_polyn, 0:weno_polyn, & + @:ALLOCATE_GLOBAL(poly_coef_cbR_x(is1_weno%beg + weno_polyn:is1_weno%end - weno_polyn, 0:weno_polyn, & 0:weno_polyn - 1)) - @:ALLOCATE(d_cbL_x(0:weno_polyn, is1%beg + weno_polyn:is1%end - weno_polyn)) - @:ALLOCATE(d_cbR_x(0:weno_polyn, is1%beg + weno_polyn:is1%end - weno_polyn)) + @:ALLOCATE_GLOBAL(d_cbL_x(0:weno_polyn, is1_weno%beg + weno_polyn:is1_weno%end - weno_polyn)) + @:ALLOCATE_GLOBAL(d_cbR_x(0:weno_polyn, is1_weno%beg + weno_polyn:is1_weno%end - weno_polyn)) - @:ALLOCATE(beta_coef_x(is1%beg + weno_polyn:is1%end - weno_polyn, 0:weno_polyn, & + @:ALLOCATE_GLOBAL(beta_coef_x(is1_weno%beg + weno_polyn:is1_weno%end - weno_polyn, 0:weno_polyn, & 0:2*(weno_polyn - 1))) - call s_compute_weno_coefficients(1, is1) + call s_compute_weno_coefficients(1, is1_weno) - @:ALLOCATE(v_rs_ws_x(is1%beg:is1%end, & - is2%beg:is2%end, is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(v_rs_ws_x(is1_weno%beg:is1_weno%end, & + is2_weno%beg:is2_weno%end, is3_weno%beg:is3_weno%end, 1:sys_size)) ! ================================================================== ! Allocating/Computing WENO Coefficients in y-direction ============ if (n == 0) return - is2%beg = -buff_size; is2%end = n - is2%beg - is1%beg = -buff_size; is1%end = m - is1%beg + is2_weno%beg = -buff_size; is2_weno%end = n - is2_weno%beg + is1_weno%beg = -buff_size; is1_weno%end = m - is1_weno%beg if (p == 0) then - is3%beg = 0 + is3_weno%beg = 0 else - is3%beg = -buff_size + is3_weno%beg = -buff_size end if - is3%end = p - is3%beg + is3_weno%end = p - is3_weno%beg - @:ALLOCATE(poly_coef_cbL_y(is2%beg + weno_polyn:is2%end - weno_polyn, 0:weno_polyn, & + @:ALLOCATE_GLOBAL(poly_coef_cbL_y(is2_weno%beg + weno_polyn:is2_weno%end - weno_polyn, 0:weno_polyn, & 0:weno_polyn - 1)) - @:ALLOCATE(poly_coef_cbR_y(is2%beg + weno_polyn:is2%end - weno_polyn, 0:weno_polyn, & + @:ALLOCATE_GLOBAL(poly_coef_cbR_y(is2_weno%beg + weno_polyn:is2_weno%end - weno_polyn, 0:weno_polyn, & 0:weno_polyn - 1)) - @:ALLOCATE(d_cbL_y(0:weno_polyn, is2%beg + weno_polyn:is2%end - weno_polyn)) - @:ALLOCATE(d_cbR_y(0:weno_polyn, is2%beg + weno_polyn:is2%end - weno_polyn)) + @:ALLOCATE_GLOBAL(d_cbL_y(0:weno_polyn, is2_weno%beg + weno_polyn:is2_weno%end - weno_polyn)) + @:ALLOCATE_GLOBAL(d_cbR_y(0:weno_polyn, is2_weno%beg + weno_polyn:is2_weno%end - weno_polyn)) - @:ALLOCATE(beta_coef_y(is2%beg + weno_polyn:is2%end - weno_polyn, 0:weno_polyn, & + @:ALLOCATE_GLOBAL(beta_coef_y(is2_weno%beg + weno_polyn:is2_weno%end - weno_polyn, 0:weno_polyn, & 0:2*(weno_polyn - 1))) - call s_compute_weno_coefficients(2, is2) + call s_compute_weno_coefficients(2, is2_weno) - @:ALLOCATE(v_rs_ws_y(is2%beg:is2%end, & - is1%beg:is1%end, is3%beg:is3%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(v_rs_ws_y(is2_weno%beg:is2_weno%end, & + is1_weno%beg:is1_weno%end, is3_weno%beg:is3_weno%end, 1:sys_size)) ! ================================================================== ! Allocating/Computing WENO Coefficients in z-direction ============ if (p == 0) return - is2%beg = -buff_size; is2%end = n - is2%beg - is1%beg = -buff_size; is1%end = m - is1%beg - is3%beg = -buff_size; is3%end = p - is3%beg + is2_weno%beg = -buff_size; is2_weno%end = n - is2_weno%beg + is1_weno%beg = -buff_size; is1_weno%end = m - is1_weno%beg + is3_weno%beg = -buff_size; is3_weno%end = p - is3_weno%beg - @:ALLOCATE(poly_coef_cbL_z(is3%beg + weno_polyn:is3%end - weno_polyn, 0:weno_polyn, & + @:ALLOCATE_GLOBAL(poly_coef_cbL_z(is3_weno%beg + weno_polyn:is3_weno%end - weno_polyn, 0:weno_polyn, & 0:weno_polyn - 1)) - @:ALLOCATE(poly_coef_cbR_z(is3%beg + weno_polyn:is3%end - weno_polyn, 0:weno_polyn, & + @:ALLOCATE_GLOBAL(poly_coef_cbR_z(is3_weno%beg + weno_polyn:is3_weno%end - weno_polyn, 0:weno_polyn, & 0:weno_polyn - 1)) - @:ALLOCATE(d_cbL_z(0:weno_polyn, is3%beg + weno_polyn:is3%end - weno_polyn)) - @:ALLOCATE(d_cbR_z(0:weno_polyn, is3%beg + weno_polyn:is3%end - weno_polyn)) + @:ALLOCATE_GLOBAL(d_cbL_z(0:weno_polyn, is3_weno%beg + weno_polyn:is3_weno%end - weno_polyn)) + @:ALLOCATE_GLOBAL(d_cbR_z(0:weno_polyn, is3_weno%beg + weno_polyn:is3_weno%end - weno_polyn)) - @:ALLOCATE(beta_coef_z(is3%beg + weno_polyn:is3%end - weno_polyn, 0:weno_polyn, & + @:ALLOCATE_GLOBAL(beta_coef_z(is3_weno%beg + weno_polyn:is3_weno%end - weno_polyn, 0:weno_polyn, & 0:2*(weno_polyn - 1))) - call s_compute_weno_coefficients(3, is3) + call s_compute_weno_coefficients(3, is3_weno) - @:ALLOCATE(v_rs_ws_z(is3%beg:is3%end, & - is2%beg:is2%end, is1%beg:is1%end, 1:sys_size)) + @:ALLOCATE_GLOBAL(v_rs_ws_z(is3_weno%beg:is3_weno%end, & + is2_weno%beg:is2_weno%end, is1_weno%beg:is1_weno%end, 1:sys_size)) ! ================================================================== @@ -464,13 +500,13 @@ contains subroutine s_weno(v_vf, vL_rs_vf_x, vL_rs_vf_y, vL_rs_vf_z, vR_rs_vf_x, vR_rs_vf_y, vR_rs_vf_z, & ! ------------------- norm_dir, weno_dir, & - is1_d, is2_d, is3_d) + is1_weno_d, is2_weno_d, is3_weno_d) type(scalar_field), dimension(1:), intent(IN) :: v_vf real(kind(0d0)), dimension(startx:, starty:, startz:, 1:), intent(INOUT) :: vL_rs_vf_x, vL_rs_vf_y, vL_rs_vf_z, vR_rs_vf_x, vR_rs_vf_y, vR_rs_vf_z integer, intent(IN) :: norm_dir integer, intent(IN) :: weno_dir - type(int_bounds_info), intent(IN) :: is1_d, is2_d, is3_d + type(int_bounds_info), intent(IN) :: is1_weno_d, is2_weno_d, is3_weno_d real(kind(0d0)), dimension(-weno_polyn:weno_polyn - 1) :: dvd real(kind(0d0)), dimension(0:weno_polyn) :: poly @@ -479,15 +515,17 @@ contains real(kind(0d0)), dimension(0:weno_polyn) :: beta real(kind(0d0)), pointer :: beta_p(:) + real(kind(0d0)) :: v_rs1, v_rs2, v_rs3, v_rs4, v_rs5 + integer :: i, j, k, l, r, s, w integer :: t1, t2, c_rate, c_max - is1 = is1_d - is2 = is2_d - is3 = is3_d + is1_weno = is1_weno_d + is2_weno = is2_weno_d + is3_weno = is3_weno_d - !$acc update device(is1, is2, is3) + !$acc update device(is1_weno, is2_weno, is3_weno) if (weno_order /= 1) then call s_initialize_weno(v_vf, & @@ -498,9 +536,9 @@ contains if (weno_dir == 1) then !$acc parallel loop collapse(4) default(present) do i = 1, ubound(v_vf, 1) - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end vL_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) vR_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) end do @@ -511,9 +549,9 @@ contains else if (weno_dir == 2) then !$acc parallel loop collapse(4) default(present) do i = 1, ubound(v_vf, 1) - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end vL_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) vR_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) end do @@ -524,9 +562,9 @@ contains else if (weno_dir == 3) then !$acc parallel loop collapse(4) default(present) do i = 1, ubound(v_vf, 1) - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end vL_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) vR_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) end do @@ -539,9 +577,9 @@ contains #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (weno_dir == ${WENO_DIR}$) then !$acc parallel loop collapse(4) gang vector default(present) private(beta,dvd,poly,omega,alpha) - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end do i = 1, v_size ! reconstruct from left side @@ -607,13 +645,12 @@ contains else #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (weno_dir == ${WENO_DIR}$) then - !$acc parallel loop gang vector collapse (3) default(present) private(dvd, poly, beta, alpha, omega) - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + !$acc parallel loop vector gang collapse(3) default(present) private(dvd, poly, beta, alpha, omega) + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end !$acc loop seq do i = 1, v_size - dvd(1) = v_rs_ws_${XYZ}$ (j + 2, k, l, i) & - v_rs_ws_${XYZ}$ (j + 1, k, l, i) dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) & @@ -696,7 +733,6 @@ contains call s_preserve_monotonicity(v_rs_ws_${XYZ}$, vL_rs_vf_${XYZ}$, & vR_rs_vf_${XYZ}$) end if - end if #:endfor end if @@ -712,9 +748,9 @@ contains !! @param vR_vf Right WENO reconstructed cell-boundary values !! @param norm_dir Characteristic decommposition coordinate direction !! @param weno_dir Coordinate direction of the WENO reconstruction - !! @param is1 Index bounds in first coordinate direction - !! @param is2 Index bounds in second coordinate direction - !! @param is3 Index bounds in third coordinate direction + !! @param is1_weno Index bounds in first coordinate direction + !! @param is2_weno Index bounds in second coordinate direction + !! @param is3_weno Index bounds in third coordinate direction subroutine s_initialize_weno(v_vf, & ! --------- norm_dir, weno_dir) @@ -731,15 +767,14 @@ contains ! as to reshape the inputted data in the coordinate direction of ! the WENO reconstruction v_size = ubound(v_vf, 1) - !$acc update device(v_size) if (weno_dir == 1) then !$acc parallel loop collapse(4) gang vector default(present) do j = 1, v_size - do q = is3%beg, is3%end - do l = is2%beg, is2%end - do k = is1%beg - weno_polyn, is1%end + weno_polyn + do q = is3_weno%beg, is3_weno%end + do l = is2_weno%beg, is2_weno%end + do k = is1_weno%beg - weno_polyn, is1_weno%end + weno_polyn v_rs_ws_x(k, l, q, j) = v_vf(j)%sf(k, l, q) end do end do @@ -776,9 +811,9 @@ contains #endif !$acc parallel loop collapse(4) gang vector default(present) do j = 1, v_size - do q = is3%beg, is3%end - do l = is2%beg, is2%end - do k = is1%beg - weno_polyn, is1%end + weno_polyn + do q = is3_weno%beg, is3_weno%end + do l = is2_weno%beg, is2_weno%end + do k = is1_weno%beg - weno_polyn, is1_weno%end + weno_polyn v_rs_ws_y(k, l, q, j) = v_vf(j)%sf(l, k, q) end do end do @@ -808,9 +843,9 @@ contains #endif !$acc parallel loop collapse(4) gang vector default(present) do j = 1, v_size - do q = is3%beg, is3%end - do l = is2%beg, is2%end - do k = is1%beg - weno_polyn, is1%end + weno_polyn + do q = is3_weno%beg, is3_weno%end + do l = is2_weno%beg, is2_weno%end + do k = is1_weno%beg - weno_polyn, is1_weno%end + weno_polyn v_rs_ws_z(k, l, q, j) = v_vf(j)%sf(q, l, k) end do end do @@ -872,9 +907,9 @@ contains real(kind(0d0)), parameter :: beta_mp = 4d0/3d0 !$acc parallel loop gang vector collapse (4) default(present) private(d) - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end do i = 1, v_size d(-1) = v_rs_ws(j, k, l, i) & + v_rs_ws(j - 2, k, l, i) & @@ -1010,34 +1045,34 @@ contains ! Deallocating the WENO-stencil of the WENO-reconstructed variables !deallocate(vL_rs_vf_x, vR_rs_vf_x) - @:DEALLOCATE(v_rs_ws_x) + @:DEALLOCATE_GLOBAL(v_rs_ws_x) ! Deallocating WENO coefficients in x-direction ==================== - @:DEALLOCATE(poly_coef_cbL_x, poly_coef_cbR_x) - @:DEALLOCATE(d_cbL_x, d_cbR_x) - @:DEALLOCATE(beta_coef_x) + @:DEALLOCATE_GLOBAL(poly_coef_cbL_x, poly_coef_cbR_x) + @:DEALLOCATE_GLOBAL(d_cbL_x, d_cbR_x) + @:DEALLOCATE_GLOBAL(beta_coef_x) ! ================================================================== ! Deallocating WENO coefficients in y-direction ==================== if (n == 0) return !deallocate(vL_rs_vf_y, vR_rs_vf_y) - @:DEALLOCATE(v_rs_ws_y) + @:DEALLOCATE_GLOBAL(v_rs_ws_y) - @:DEALLOCATE(poly_coef_cbL_y, poly_coef_cbR_y) - @:DEALLOCATE(d_cbL_y, d_cbR_y) - @:DEALLOCATE(beta_coef_y) + @:DEALLOCATE_GLOBAL(poly_coef_cbL_y, poly_coef_cbR_y) + @:DEALLOCATE_GLOBAL(d_cbL_y, d_cbR_y) + @:DEALLOCATE_GLOBAL(beta_coef_y) ! ================================================================== ! Deallocating WENO coefficients in z-direction ==================== if (p == 0) return !deallocate(vL_rs_vf_z, vR_rs_vf_z) - @:DEALLOCATE(v_rs_ws_z) + @:DEALLOCATE_GLOBAL(v_rs_ws_z) - @:DEALLOCATE(poly_coef_cbL_z, poly_coef_cbR_z) - @:DEALLOCATE(d_cbL_z, d_cbR_z) - @:DEALLOCATE(beta_coef_z) + @:DEALLOCATE_GLOBAL(poly_coef_cbL_z, poly_coef_cbR_z) + @:DEALLOCATE_GLOBAL(d_cbL_z, d_cbR_z) + @:DEALLOCATE_GLOBAL(beta_coef_z) ! ================================================================== end subroutine s_finalize_weno_module ! -------------------------------- diff --git a/toolchain/bootstrap/format.sh b/toolchain/bootstrap/format.sh index 0a2ec24a1..a2ca71248 100644 --- a/toolchain/bootstrap/format.sh +++ b/toolchain/bootstrap/format.sh @@ -17,14 +17,8 @@ done log "Formatting MFC:" -if ! find ${@:-src} -type f | grep -Ev 'autogen' | grep -E '\.(f90|fpp)$' | \ - parallel --jobs ${JOBS:-1} -- \ - echo "\> {}" \&\& \ - python3 toolchain/indenter.py "{}" \&\& \ - fprettify "{}" --silent --indent 4 --c-relations --enable-replacements \ - --enable-decl --whitespace-comma 1 --whitespace-multdiv 0 \ - --whitespace-plusminus 1 --case 1 1 1 1 --strict-indent \ - --line-length 1000\;; then +if ! find ${@:-src} -type f | grep -Ev 'autogen' | grep -E '\.(f90|fpp)$' \ + | xargs -L 1 -P ${JOBS:-1} $SHELL toolchain/bootstrap/format_file.sh; then error "Formatting MFC failed." exit 1 fi diff --git a/toolchain/bootstrap/format_file.sh b/toolchain/bootstrap/format_file.sh new file mode 100644 index 000000000..5f086e17b --- /dev/null +++ b/toolchain/bootstrap/format_file.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +. toolchain/util.sh + +echo "> $1" + +niter=0 +old_file="" + +while : +do + niter=$((niter+1)) + new_file=`cat "$1"` + if [[ "$new_file" == "$old_file" ]]; then + break + fi + old_file="$new_file" + + if [[ "$niter" -gt 4 ]]; then + error "Failed to format $1: No steady-state (after $niter iterations)." + exit 1 + fi + + if ! python3 toolchain/indenter.py "$1"; then + error "Failed to format $1: indenter.py." + exit 1 + fi + + if ! fprettify "$1" --silent --indent 4 --c-relations --enable-replacements \ + --enable-decl --whitespace-comma 1 --whitespace-multdiv 0 \ + --whitespace-plusminus 1 --case 1 1 1 1 --strict-indent \ + --line-length 1000; then + error "Failed to format $1: fprettify." + exit 1 + fi +done + diff --git a/toolchain/bootstrap/modules.sh b/toolchain/bootstrap/modules.sh index 022529548..e864d1faf 100644 --- a/toolchain/bootstrap/modules.sh +++ b/toolchain/bootstrap/modules.sh @@ -19,11 +19,11 @@ done # Get computer (if not supplied in command-line) if [ -v $u_c ]; then log "Select a system:" - log "$G""ORNL$W: Ascent (a) | Crusher (c) | Summit (s) | Wombat (w)" + log "$G""ORNL$W: Ascent (a) | Frontier (f) | Summit (s) | Wombat (w)" log "$C""ACCESS$W: Bridges2 (b) | Expanse (e) | Delta (d)" - log "$Y""GaTech$W: Phoenix (p)" - log "$R""CALTECH$W: Richardson (r)" - log_n "($G""a$W/$G""c$W/$G""s$W/$G""w$W/$C""b$W/$C""e$CR/$C""d$CR/$Y""p$CR/$R""r$CR): " + log "$Y""Gatech$W: Phoenix (p)" + log "$R""Caltech$W: Richardson (r)" + log_n "($G""a$W/$G""f$W/$G""s$W/$G""w$W/$C""b$W/$C""e$CR/$C""d$CR/$Y""p$CR/$R""r$CR): " read u_c log fi diff --git a/toolchain/cce_simulation_workgroup_256.sh b/toolchain/cce_simulation_workgroup_256.sh new file mode 100755 index 000000000..9137871d5 --- /dev/null +++ b/toolchain/cce_simulation_workgroup_256.sh @@ -0,0 +1,48 @@ +#! /bin/bash + +# On Frontier, after building simulation run the script +# ./cce_simulation_workgroup_256.sh to edit the workgroup +# size in the generated code and recompile, then set the recommended +# environment variable to use the new code. + +# Using the script requires `-hkeepfiles` at link! + +# This script needs to be run after every time simulation +# is built to generate a correct offload binary. +# This include case-optimization builds! + +CCE_LLVM_PATH=${CRAY_CCE_CLANGSHARE}/../ + +## This was errantly 1024 by default in the generated binary +WGSIZE=256 + +## Turning bitcode into human-readable IR +echo "Disassembling" +${CCE_LLVM_PATH}/bin/llvm-dis "$1/simulation-cce-openmp-pre-llc.bc" + +## Find/replace the workgroup size to what it _should be_ in the (human readable) IR +echo "Globally setting amdgpu-flat-work-group-size size to 1,$WGSIZE" +sed "s/\"amdgpu-flat-work-group-size\"\=\"1,1024\"/\"amdgpu-flat-work-group-size\"\=\"1,${WGSIZE}\"/g" "$1/simulation-cce-openmp-pre-llc.ll" > "$1/simulation-cce-openmp-pre-llc-wg${WGSIZE}.ll" + +## This is building to an AMD GPU code object, using internal copy/pasted Cray Fortran flags. +## The flags may need to be adjusted for future compiler versions +echo "Invoking LLC to compile" +${CCE_LLVM_PATH}/bin/llc -mtriple=amdgcn-amd-amdhsa -disable-promote-alloca-to-lds -mcpu=gfx90a -amdgpu-dump-hsa-metadata "$1/simulation-cce-openmp-pre-llc-wg${WGSIZE}.ll" -filetype=obj -o "$1/simulation-cce-openmp__llc_wg${WGSIZE}.amdgpu" + +## Wrapping AMD GPU code object in an object that CCE OpenACC runtime understands +echo "Linking to a CCE Offload module" +${CCE_LLVM_PATH}/bin/lld -flavor gnu --no-undefined -shared -o "$1/simulation-wg${WGSIZE}.lld.exe" "$1/simulation-cce-openmp__llc_wg${WGSIZE}.amdgpu" + +## Backend/hidden env var that tells the runtime where to find the offload object +echo "Now " +echo "export CRAY_ACC_MODULE=${PWD}/build/simulation/simulation-wg${WGSIZE}.lld.exe" +echo "to use the new GPU offload code." +echo "To use the original build" +echo "unset CRAY_ACC_MODULE" + +# # This goes inside an sbatch script for multi-node submissions (> 1000 nodes important) +# # Requet an nvme via +# #SBATCH -C nvme +# # Put this in the sbatch script before execution via srun +# sbcast -pf ${PWD}/build/simulation/simulation-wg${WGSIZE}.lld.exe /mnt/bb/$USER/simulation-wg${WGSIZE}.lld.exe +# export CRAY_ACC_MODULE=/mnt/bb/$USER/simulation-wg${WGSIZE}.lld.exe" diff --git a/toolchain/dependencies/CMakeLists.txt b/toolchain/dependencies/CMakeLists.txt index 200f69979..69454adb3 100644 --- a/toolchain/dependencies/CMakeLists.txt +++ b/toolchain/dependencies/CMakeLists.txt @@ -9,15 +9,18 @@ if (MFC_SILO OR NOT CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") enable_language(C CXX) endif() +set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON) + # Imports include(GNUInstallDirs) include(ExternalProject) # Options -option(MFC_FFTW "Build the FFTW3 dependency" OFF) -option(MFC_HDF5 "Build the HDF5 dependency" OFF) -option(MFC_SILO "Build the SILO dependency" OFF) +option(MFC_FFTW "Build the FFTW3 dependency" OFF) +option(MFC_HDF5 "Build the HDF5 dependency" OFF) +option(MFC_SILO "Build the SILO dependency" OFF) +option(MFC_HIPFORT "Build the HIPFORT dependency" OFF) # FFTW3 @@ -38,44 +41,62 @@ endif() # HDF5 if (MFC_HDF5) - if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") - ExternalProject_Add(hdf5 - GIT_REPOSITORY "https://github.com/HDFGroup/hdf5" - GIT_TAG hdf5-1_12_2 - GIT_SHALLOW ON - GIT_PROGRESS ON - CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" - -DBUILD_SHARED_LIBS=OFF - -DFORTRAN_LIBRARIES=ON - -DBUILD_TESTING=OFF - -DHDF5_BUILD_UTILS=OFF - -DHDF5_BUILD_TOOLS=ON - -DHDF5_BUILD_EXAMPLES=OFF - ) - else() - message(WARNING "The Fortran compiler vendor is Cray so HDF5 will not be built. We will use cray-hdf5 instead.") - add_custom_target(hdf5) - endif() + ExternalProject_Add(hdf5 + GIT_REPOSITORY "https://github.com/HDFGroup/hdf5" + GIT_TAG hdf5-1_12_2 + GIT_SHALLOW ON + GIT_PROGRESS ON + CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" + -DBUILD_SHARED_LIBS=OFF + -DFORTRAN_LIBRARIES=ON + -DBUILD_TESTING=OFF + -DHDF5_BUILD_UTILS=OFF + -DHDF5_BUILD_TOOLS=ON + -DHDF5_BUILD_EXAMPLES=OFF + ) endif() # SILO if (MFC_SILO) + # If we are using the CCE, HDF5 is not built, and we wish to find + # the system's cray-hdf5. Otherwise, we point SILO to find HDF5 in + # our common install directory using SILO_HDF5_DIR. ExternalProject_Add(silo GIT_REPOSITORY "https://github.com/henryleberre/Silo" - GIT_TAG 964d993039b467e3d472d3a70afd03380fe2c320 + GIT_TAG af955eb5dd009caf00c41ca51611b37c052b042c GIT_SHALLOW ON GIT_PROGRESS ON - CMAKE_ARGS -DSILO_ENABLE_SHARED=OFF - -DSILO_ENABLE_SILOCK=OFF - -DSILO_ENABLE_BROWSER=OFF - -DHDF5_USE_STATIC_LIBRARIES=ON + CMAKE_ARGS -DSILO_ENABLE_SHARED=OFF + -DSILO_ENABLE_SILOCK=OFF + -DSILO_ENABLE_BROWSER=OFF + -DHDF5_USE_STATIC_LIBRARIES=ON "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" - $<$:"-DSILO_HDF5_DIR=\"${CMAKE_INSTALL_PREFIX}\""> + "-DSILO_HDF5_DIR=${CMAKE_INSTALL_PREFIX}" + "$<$:-DCMAKE_MODULE_PATH=${CMAKE_SOURCE_DIR}/../cmake>" ) - if (MFC_HDF5 AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") + if (MFC_HDF5) add_dependencies(silo hdf5) endif() endif() +# HIPFORT +if (MFC_HIPFORT) + if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") + ExternalProject_Add(hipfort + GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/hipfort" + GIT_TAG develop + GIT_SHALLOW ON + GIT_PROGRESS ON + CMAKE_ARGS "-DHIPFORT_COMPILER=${CMAKE_Fortran_COMPILER}" + "-DHIPFORT_AR=${CMAKE_AR}" + "-DHIPFORT_RANLIB=${CMAKE_RANLIB}" + "-DHIPFORT_COMPILER_FLAGS=-f free -e F -O0 -h ipa0" + "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" + ) + else() + message(WARNING "The Fortran compiler vendor is not Cray so HIPFORT will not be built.") + add_custom_target(hipfort) + endif() +endif() diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index 3eb109f0a..70c1af39f 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -63,7 +63,7 @@ def add_common_arguments(p, mask = None): if "n" not in mask: for target in DEPENDENCY_TARGETS: - p.add_argument(f"--no-{target.name}", action="store_true", help=f"Do not build the {target.name} dependency. Use the system's instead.") + p.add_argument(f"--sys-{target.name}", action="store_true", help=f"Do not build the {target.name} dependency. Use the system's instead.") if "g" not in mask: p.add_argument("-g", "--gpus", nargs="+", type=int, default=None, help="(Optional GPU override) List of GPU #s to use (environment default if unspecified).") @@ -88,8 +88,8 @@ def add_common_arguments(p, mask = None): test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.") test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.") test.add_argument("-m", "--max-attempts", type=int, default=3, help="Maximum number of attempts to run a test.") - - test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") + test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") + test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") test_meg = test.add_mutually_exclusive_group() test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.") @@ -115,6 +115,8 @@ def add_common_arguments(p, mask = None): run.add_argument("-b", "--binary", choices=["mpirun", "jsrun", "srun", "mpiexec"], type=str, default=None, help="(Interactive) Override MPI execution binary") run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.") run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.") + run.add_argument("--omni", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM omniperf.") + run.add_argument("--roc", nargs=argparse.REMAINDER, type=str, help="Profile with ROCM rocprof.") run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.") run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 34d3cf9f1..067204f29 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -95,7 +95,7 @@ def is_buildable(self) -> bool: if ARG("no_build"): return False - if self.isDependency and ARG(f"no_{self.name}"): + if self.isDependency and ARG(f"sys_{self.name}", False): return False return True @@ -107,6 +107,8 @@ def configure(self): install_prefixes = ';'.join([install_dirpath, get_dependency_install_dirpath()]) + mod_dirs = ';'.join(['build/install/dependencies/include/hipfort/amdgcn']) + flags: list = self.flags.copy() + [ # Disable CMake warnings intended for developers (us). # See: https://cmake.org/cmake/help/latest/manual/cmake.1.html. @@ -133,8 +135,14 @@ def configure(self): # Location prefix to install bin/, lib/, include/, etc. # See: https://cmake.org/cmake/help/latest/command/install.html. f"-DCMAKE_INSTALL_PREFIX={install_dirpath}", + # Fortran .mod include directories. Currently used for the HIPFORT + # dependency that has this missing from its config files. + f"-DCMAKE_Fortran_MODULE_DIRECTORY={mod_dirs}", ] + if ARG("verbose"): + flags.append('--debug-find') + if not self.isDependency: flags.append(f"-DMFC_MPI={ 'ON' if ARG('mpi') else 'OFF'}") flags.append(f"-DMFC_OpenACC={'ON' if ARG('gpu') else 'OFF'}") @@ -189,17 +197,17 @@ def clean(self): if system(command).returncode != 0: raise MFCException(f"Failed to clean the [bold magenta]{self.name}[/bold magenta] target.") - FFTW = MFCTarget('fftw', ['-DMFC_FFTW=ON'], True, False, False, MFCTarget.Dependencies([], [], []), -1) HDF5 = MFCTarget('hdf5', ['-DMFC_HDF5=ON'], True, False, False, MFCTarget.Dependencies([], [], []), -1) SILO = MFCTarget('silo', ['-DMFC_SILO=ON'], True, False, False, MFCTarget.Dependencies([HDF5], [], []), -1) +HIPFORT = MFCTarget('hipfort', ['-DMFC_HIPFORT=ON'], True, False, False, MFCTarget.Dependencies([], [], []), -1) PRE_PROCESS = MFCTarget('pre_process', ['-DMFC_PRE_PROCESS=ON'], False, True, False, MFCTarget.Dependencies([], [], []), 0) -SIMULATION = MFCTarget('simulation', ['-DMFC_SIMULATION=ON'], False, True, False, MFCTarget.Dependencies([], [FFTW], []), 1) +SIMULATION = MFCTarget('simulation', ['-DMFC_SIMULATION=ON'], False, True, False, MFCTarget.Dependencies([], [FFTW], [HIPFORT]), 1) POST_PROCESS = MFCTarget('post_process', ['-DMFC_POST_PROCESS=ON'], False, True, False, MFCTarget.Dependencies([FFTW, SILO], [], []), 2) -SYSCHECK = MFCTarget('syscheck', ['-DMFC_SYSCHECK=ON'], False, False, True, MFCTarget.Dependencies([], [], []), -1) +SYSCHECK = MFCTarget('syscheck', ['-DMFC_SYSCHECK=ON'], False, False, True, MFCTarget.Dependencies([], [], [HIPFORT]), -1) DOCUMENTATION = MFCTarget('documentation', ['-DMFC_DOCUMENTATION=ON'], False, False, False, MFCTarget.Dependencies([], [], []), -1) -TARGETS = { FFTW, HDF5, SILO, PRE_PROCESS, SIMULATION, POST_PROCESS, SYSCHECK, DOCUMENTATION } +TARGETS = { FFTW, HDF5, SILO, HIPFORT, PRE_PROCESS, SIMULATION, POST_PROCESS, SYSCHECK, DOCUMENTATION } DEFAULT_TARGETS = { target for target in TARGETS if target.isDefault } REQUIRED_TARGETS = { target for target in TARGETS if target.isRequired } @@ -242,7 +250,14 @@ def __build_target(target: typing.Union[MFCTarget, str], history: typing.Set[str history.add(target.name) - build(target.requires.compute(), history) + for dep in target.requires.compute(): + # If we have already built and installed this target, + # do not do so again. This can be inferred by whether + # the target requesting this dependency is already configured. + if dep.isDependency and target.is_configured(): + continue + + build([dep], history) if not target.is_configured(): target.configure() diff --git a/toolchain/mfc/common.py b/toolchain/mfc/common.py index 3d6af7f27..c0c518594 100644 --- a/toolchain/mfc/common.py +++ b/toolchain/mfc/common.py @@ -28,7 +28,6 @@ class MFCException(Exception): pass - def system(command: typing.List[str], print_cmd = None, **kwargs) -> subprocess.CompletedProcess: cmd = [ str(x) for x in command if not isspace(str(x)) ] diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index 3a00e4752..751beedcd 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -158,7 +158,7 @@ def __get_sim_fpp(self, print: bool) -> str: if print: cons.print("Case optimization is enabled.") - nterms = -100 + nterms = 1 bubble_model = int(self.case_dict.get("bubble_model", "-100")) diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py index aacf4fc0e..1b06ed4cb 100644 --- a/toolchain/mfc/state.py +++ b/toolchain/mfc/state.py @@ -49,10 +49,15 @@ def __str__(self) -> str: gARG: dict = {} -def ARG(arg: str) -> typing.Any: +def ARG(arg: str, dflt = None) -> typing.Any: # pylint: disable=global-variable-not-assigned global gARG - return gARG[arg] + if arg in gARG: + return gARG[arg] + if dflt is not None: + return dflt + + raise KeyError(f"{arg} is not an argument.") def ARGS() -> dict: # pylint: disable=global-variable-not-assigned diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index adca675da..1c5e6de74 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -189,9 +189,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): h5dump = f"{HDF5.get_install_dirpath()}/bin/h5dump" - if ARG("no_hdf5"): + if ARG("sys_hdf5"): if not does_command_exist("h5dump"): - raise MFCException("--no-hdf5 was specified and h5dump couldn't be found.") + raise MFCException("--sys-hdf5 was specified and h5dump couldn't be found.") h5dump = shutil.which("h5dump") diff --git a/toolchain/modules b/toolchain/modules index ea62a0841..51b710162 100644 --- a/toolchain/modules +++ b/toolchain/modules @@ -47,11 +47,13 @@ p-cpu gcc/10.3.0-o57x6h openmpi/4.1.4 p-gpu cuda/11.7.0-7sdye3 nvhpc/22.11 p-gpu MFC_CUDA_CC=70,80 CC=nvc CXX=nvc++ FC=nvfortran -c OLCF Crusher -c-all cmake/3.23.2 cray-fftw/3.3.10.2 hdf5/1.12.1 cray-python/3.9.13.1 -c-all ninja/1.10.2 cray-mpich/8.1.23 -c-cpu -c-gpu rocm/5.1.0 craype-accel-amd-gfx90a +f OLCF Frontier +f-gpu rocm/5.5.1 craype-accel-amd-gfx90a +f-all cpe/23.09 +f-all cray-fftw cray-hdf5 cray-mpich cce/16.0.1 +f-all rocm/5.5.1 cray-python omniperf +f-cpu + d NCSA Delta d-all python/3.11.6 diff --git a/toolchain/requirements.txt b/toolchain/requirements.txt index fd2377a6d..9c6a914e4 100644 --- a/toolchain/requirements.txt +++ b/toolchain/requirements.txt @@ -1,3 +1,5 @@ +numpy +pandas rich fypp mako diff --git a/toolchain/templates/frontier.mako b/toolchain/templates/frontier.mako new file mode 100644 index 000000000..2668ba82f --- /dev/null +++ b/toolchain/templates/frontier.mako @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +<%namespace name="helpers" file="helpers.mako"/> + +% if engine == 'batch': +#SBATCH --nodes=${nodes} +#SBATCH --ntasks-per-node=${tasks_per_node} +#SBATCH --job-name="${name}" +#SBATCH --output="${name}.out" +#SBATCH --time=${walltime} +% if account: +#SBATCH --account=${account} +% endif +% if partition: +#SBATCH --partition=${partition} +% endif +% if quality_of_service: +#SBATCH --qos=${quality_of_service} +% endif +% if email: +#SBATCH --mail-user=${email} +#SBATCH --mail-type="BEGIN, END, FAIL" +% endif +% endif + +${helpers.template_prologue()} + +ok ":) Loading modules:\n" +cd "${MFC_ROOTDIR}" +% if engine == 'batch': +. ./mfc.sh load -c f -m ${'g' if gpu else 'c'} +% endif +cd - > /dev/null +echo + +% for target in targets: + ${helpers.run_prologue(target)} + + % if not mpi: + (set -x; ${' '.join([f"'{x}'" for x in profiler ])} "${target.get_install_binpath()}") + % else: + (set -x; ${' '.join([f"'{x}'" for x in profiler ])} \ + srun -N ${nodes} \ + -n ${tasks_per_node} \ + ${' '.join([f"'{x}'" for x in ARG('--') ])} \ + "${target.get_install_binpath()}") + % endif + + ${helpers.run_epilogue(target)} + + echo +% endfor + +${helpers.template_epilogue()} diff --git a/toolchain/templates/include/helpers.mako b/toolchain/templates/include/helpers.mako index 262fc5917..8f4d8fefd 100644 --- a/toolchain/templates/include/helpers.mako +++ b/toolchain/templates/include/helpers.mako @@ -58,6 +58,10 @@ exit $code <%def name="run_prologue(target)"> ok ":) Running$MAGENTA ${target.name}$COLOR_RESET:\n" +if [ '${target.name}' == 'simulation' ]; then + export CRAY_ACC_MODULE='${target.get_staging_dirpath()}/simulation-wg256.lld.exe' +fi + cd "${os.path.dirname(input)}" t_${target.name}_start=$(date +%s) @@ -75,6 +79,8 @@ if [ $code -ne 0 ]; then exit 1 fi +unset CRAY_ACC_MODULE + % if output_summary: cd "${MFC_ROOTDIR}" @@ -86,4 +92,4 @@ EOL cd - > /dev/null % endif - \ No newline at end of file +