From 64d9677e58f26a157e207afbdcc705e5cab90c7f Mon Sep 17 00:00:00 2001 From: Henry Le Berre Date: Mon, 13 May 2024 20:48:33 +0200 Subject: [PATCH] -1.2k LOC in s_mpi_sendrecv_variables_buffers (#409) --- src/common/m_helper.fpp | 22 +- src/simulation/m_boundary_conditions.fpp | 12 +- src/simulation/m_mpi_proxy.fpp | 1686 +++------------------- 3 files changed, 256 insertions(+), 1464 deletions(-) diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp index 021b0acb6f..6aa8fef1dc 100644 --- a/src/common/m_helper.fpp +++ b/src/common/m_helper.fpp @@ -33,7 +33,9 @@ module m_helper f_cross, & f_create_transform_matrix, & f_create_bbox, & - s_print_2D_array + s_print_2D_array, & + f_xor, & + f_logical_to_int contains @@ -516,4 +518,22 @@ contains end function f_create_bbox + function f_xor(lhs, rhs) result(res) + logical, intent(in) :: lhs, rhs + logical :: res + + res = (lhs .and. .not. rhs) .or. (.not. lhs .and. rhs) + end function f_xor + + function f_logical_to_int(predicate) result(int) + logical, intent(in) :: predicate + integer :: int + + if (predicate) then + int = 1 + else + int = 0 + end if + end function f_logical_to_int + end module m_helper diff --git a/src/simulation/m_boundary_conditions.fpp b/src/simulation/m_boundary_conditions.fpp index f99c0114a0..f2db79ff43 100644 --- a/src/simulation/m_boundary_conditions.fpp +++ b/src/simulation/m_boundary_conditions.fpp @@ -45,7 +45,7 @@ contains case (-16) ! No-slip wall BC at beginning call s_no_slip_wall(q_prim_vf, pb, mv, 1, -1) case default ! Processor BC at beginning - call s_mpi_sendrecv_conservative_variables_buffers( & + call s_mpi_sendrecv_variables_buffers( & q_prim_vf, pb, mv, 1, -1) end select @@ -61,7 +61,7 @@ contains case (-16) ! No-slip wall bc at end call s_no_slip_wall(q_prim_vf, pb, mv, 1, 1) case default ! Processor BC at end - call s_mpi_sendrecv_conservative_variables_buffers( & + call s_mpi_sendrecv_variables_buffers( & q_prim_vf, pb, mv, 1, 1) end select @@ -105,7 +105,7 @@ contains case (-16) ! No-slip wall BC at beginning call s_no_slip_wall(q_prim_vf, pb, mv, 2, -1) case default ! Processor BC at beginning - call s_mpi_sendrecv_conservative_variables_buffers( & + call s_mpi_sendrecv_variables_buffers( & q_prim_vf, pb, mv, 2, -1) end select @@ -121,7 +121,7 @@ contains case (-16) ! No-slip wall BC at end call s_no_slip_wall(q_prim_vf, pb, mv, 2, 1) case default ! Processor BC at end - call s_mpi_sendrecv_conservative_variables_buffers( & + call s_mpi_sendrecv_variables_buffers( & q_prim_vf, pb, mv, 2, 1) end select @@ -165,7 +165,7 @@ contains case (-16) ! No-slip wall BC at beginning call s_no_slip_wall(q_prim_vf, pb, mv, 3, -1) case default ! Processor BC at beginning - call s_mpi_sendrecv_conservative_variables_buffers( & + call s_mpi_sendrecv_variables_buffers( & q_prim_vf, pb, mv, 3, -1) end select @@ -181,7 +181,7 @@ contains case (-16) ! No-slip wall BC at end call s_no_slip_wall(q_prim_vf, pb, mv, 3, 1) case default ! Processor BC at end - call s_mpi_sendrecv_conservative_variables_buffers( & + call s_mpi_sendrecv_variables_buffers( & q_prim_vf, pb, mv, 3, 1) end select diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index 3c4d9ae5b8..5b2af08324 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -17,6 +17,8 @@ module m_mpi_proxy use mpi !< Message passing interface (MPI) module #endif + use m_helper + use m_derived_types !< Definitions of the derived types use m_global_parameters !< Definitions of the global parameters @@ -600,6 +602,8 @@ contains integer, intent(IN) :: mpi_dir integer, intent(IN) :: pbc_loc + integer :: dst_proc(1:3) + #ifdef MFC_MPI ! MPI Communication in x-direction ================================= @@ -776,41 +780,88 @@ contains !! @param q_cons_vf Cell-average conservative variables !! @param mpi_dir MPI communication coordinate direction !! @param pbc_loc Processor boundary condition (PBC) location - subroutine s_mpi_sendrecv_conservative_variables_buffers(q_cons_vf, & - pb, mv, & - mpi_dir, & - pbc_loc) + subroutine s_mpi_sendrecv_variables_buffers(q_cons_vf, & + pb, mv, & + mpi_dir, & + pbc_loc) type(scalar_field), dimension(sys_size), intent(INOUT) :: q_cons_vf real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(INOUT) :: pb, mv - integer, intent(IN) :: mpi_dir - integer, intent(IN) :: pbc_loc + integer, intent(IN) :: mpi_dir, pbc_loc integer :: i, j, k, l, r, q !< Generic loop iterators -!$acc update device(v_size) + integer :: buffer_counts(1:3), buffer_count + + type(int_bounds_info) :: boundary_conditions(1:3) + integer :: beg_end(1:2), grid_dims(1:3) + integer :: dst_proc, src_proc, recv_tag, send_tag + + logical :: beg_end_geq_0 + + integer :: pack_offsets(1:3), unpack_offsets(1:3) + integer :: pack_offset, unpack_offset #ifdef MFC_MPI - !nCalls_time = nCalls_time + 1 + !$acc update device(v_size) - ! MPI Communication in x-direction ================================= - if (mpi_dir == 1) then + if (qbmm .and. .not. polytropic) then + buffer_counts = (/ & + buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & + buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & + buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) & + /) + else + buffer_counts = (/ & + buff_size*sys_size*(n + 1)*(p + 1), & + buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & + buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) & + /) + end if - if (pbc_loc == -1) then ! PBC at the beginning + buffer_count = buffer_counts(mpi_dir) + boundary_conditions = (/bc_x, bc_y, bc_z/) + beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/) + beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0 - if (bc_x%end >= 0) then ! PBC at the beginning and end + ! Implements: + ! pbc_loc bc_x >= 0 -> [send/recv]_tag [dst/src]_proc + ! -1 (=0) 0 -> [1,0] [0,0] | 0 0 [1,0] [beg,beg] + ! -1 (=0) 1 -> [0,0] [1,0] | 0 1 [0,0] [end,beg] + ! +1 (=1) 0 -> [0,1] [1,1] | 1 0 [0,1] [end,end] + ! +1 (=1) 1 -> [1,1] [0,1] | 1 1 [1,1] [beg,end] - ! Packing buffer to be sent to bc_x%end + send_tag = f_logical_to_int(.not. f_xor(beg_end_geq_0, pbc_loc == 1)) + recv_tag = f_logical_to_int(pbc_loc == 1) + + dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0))) + src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1)) + + grid_dims = (/m, n, p/) + + pack_offset = 0 + if (f_xor(pbc_loc == 1, beg_end_geq_0)) then + pack_offset = grid_dims(mpi_dir) - buff_size + 1 + end if + + unpack_offset = 0 + if (pbc_loc == 1) then + unpack_offset = grid_dims(mpi_dir) + buff_size + 1 + end if + + ! Pack Buffer to Send + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 !$acc parallel loop collapse(4) gang vector default(present) private(r) do l = 0, p do k = 0, n - do j = m - buff_size + 1, m + do j = 0, buff_size - 1 do i = 1, sys_size - r = (i - 1) + v_size* & - ((j - m - 1) + buff_size*((k + 1) + (n + 1)*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) + r = (i - 1) + v_size*(j + buff_size*(k + (n + 1)*l)) + q_cons_buff_send(r) = q_cons_vf(i)%sf(j + pack_offset, k, l) end do end do end do @@ -820,107 +871,43 @@ contains !$acc parallel loop collapse(4) gang vector default(present) private(r) do l = 0, p do k = 0, n - do j = m - buff_size + 1, m + do j = 0, buff_size - 1 do i = sys_size + 1, sys_size + 4 do q = 1, nb r = (i - 1) + (q - 1)*4 + v_size* & - ((j - m - 1) + buff_size*((k + 1) + (n + 1)*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) + (j + buff_size*(k + (n + 1)*l)) + q_cons_buff_send(r) = pb(j + pack_offset, k, l, i - sys_size, q) end do end do end do end do end do - !$acc parallel loop collapse(4) gang vector default(present) private(r) + !$acc parallel loop collapse(5) gang vector default(present) private(r) do l = 0, p do k = 0, n - do j = m - buff_size + 1, m + do j = 0, buff_size - 1 do i = sys_size + 1, sys_size + 4 do q = 1, nb r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j - m - 1) + buff_size*((k + 1) + (n + 1)*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) + (j + buff_size*(k + (n + 1)*l)) + q_cons_buff_send(r) = mv(j + pack_offset, k, l, i - sys_size, q) end do end do end do end do end do end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send, ib_buff_recv, ib_buff_send) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 0, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 0, & - q_cons_buff_recv(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - - !$acc end host_data - !$acc wait - else -#endif - - !$acc update host(q_cons_buff_send, ib_buff_send) - - if (qbmm .and. .not. polytropic) then - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 0, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 0, & - q_cons_buff_recv(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - -#if defined(MFC_OpenACC) - end if -#endif - - else ! PBC at the beginning only - - ! Packing buffer to be sent to bc_x%beg + #:elif mpi_dir == 2 !$acc parallel loop collapse(4) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = 0, buff_size - 1 - do i = 1, sys_size + do i = 1, sys_size + do l = 0, p + do k = 0, buff_size - 1 + do j = -buff_size, m + buff_size r = (i - 1) + v_size* & - (j + buff_size*(k + (n + 1)*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) + ((j + buff_size) + (m + 2*buff_size + 1)* & + (k + buff_size*l)) + q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k + pack_offset, l) end do end do end do @@ -928,15 +915,15 @@ contains if (qbmm .and. .not. polytropic) then !$acc parallel loop collapse(5) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = 0, buff_size - 1 - do i = sys_size + 1, sys_size + 4 + do i = sys_size + 1, sys_size + 4 + do l = 0, p + do k = 0, buff_size - 1 + do j = -buff_size, m + buff_size do q = 1, nb r = (i - 1) + (q - 1)*4 + v_size* & - (j + buff_size*(k + (n + 1)*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - + ((j + buff_size) + (m + 2*buff_size + 1)* & + (k + buff_size*l)) + q_cons_buff_send(r) = pb(j, k + pack_offset, l, i - sys_size, q) end do end do end do @@ -944,173 +931,129 @@ contains end do !$acc parallel loop collapse(5) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = 0, buff_size - 1 - do i = sys_size + 1, sys_size + 4 + do i = sys_size + 1, sys_size + 4 + do l = 0, p + do k = 0, buff_size - 1 + do j = -buff_size, m + buff_size do q = 1, nb r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - (j + buff_size*(k + (n + 1)*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) - + ((j + buff_size) + (m + 2*buff_size + 1)* & + (k + buff_size*l)) + q_cons_buff_send(r) = mv(j, k + pack_offset, l, i - sys_size, q) end do end do end do end do end do - - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 1, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 1, & - q_cons_buff_recv(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - - !$acc end host_data - !$acc wait - else -#endif - !$acc update host(q_cons_buff_send) - - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 1, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 1, & - q_cons_buff_recv(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - -#if defined(MFC_OpenACC) end if -#endif - - end if -#if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then - !$acc update device(q_cons_buff_recv) - end if -#endif - - ! Unpacking buffer received from bc_x%beg - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = 1, sys_size - r = (i - 1) + v_size* & - (j + buff_size*((k + 1) + (n + 1)*l)) - q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) -#if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if -#endif + #:else + !$acc parallel loop collapse(4) gang vector default(present) private(r) + do i = 1, sys_size + do l = 0, buff_size - 1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = (i - 1) + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)*l)) + q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l + pack_offset) + end do end do end do end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = sys_size + 1, sys_size + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - (j + buff_size*((k + 1) + (n + 1)*l)) - pb(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) + if (qbmm .and. .not. polytropic) then + !$acc parallel loop collapse(5) gang vector default(present) private(r) + do i = sys_size + 1, sys_size + 4 + do l = 0, buff_size - 1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)*l)) + q_cons_buff_send(r) = pb(j, k, l + pack_offset, i - sys_size, q) + end do end do end do end do end do - end do - - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = sys_size + 1, sys_size + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - (j + buff_size*((k + 1) + (n + 1)*l)) - mv(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) + !$acc parallel loop collapse(5) gang vector default(present) private(r) + do i = sys_size + 1, sys_size + 4 + do l = 0, buff_size - 1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)*l)) + q_cons_buff_send(r) = mv(j, k, l + pack_offset, i - sys_size, q) + end do end do end do end do end do - end do - - end if - - else ! PBC at the end + end if + #:endif + end if + #:endfor - if (bc_x%beg >= 0) then ! PBC at the end and beginning + ! Send/Recv + #:for cu_mpi in [False, True] + if (cu_mpi .eqv. ${'.true.' if cu_mpi else '.false.'}$) then + #:if cu_mpi + !$acc host_data use_device(q_cons_buff_recv, q_cons_buff_send, ib_buff_recv, ib_buff_send) + #:else + !$acc update host(q_cons_buff_send, ib_buff_send) + #:endif + + call MPI_SENDRECV( & + q_cons_buff_send(0), buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, & + q_cons_buff_recv(0), buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + + #:if cu_mpi + !$acc end host_data + !$acc wait + #:else + !$acc update device(q_cons_buff_recv) + #:endif + end if + #:endfor + ! Unpack Received Buffer + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 !$acc parallel loop collapse(4) gang vector default(present) private(r) - ! Packing buffer to be sent to bc_x%beg do l = 0, p do k = 0, n - do j = 0, buff_size - 1 + do j = -buff_size, -1 do i = 1, sys_size r = (i - 1) + v_size* & - (j + buff_size*(k + (n + 1)*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) + (j + buff_size*((k + 1) + (n + 1)*l)) + q_cons_vf(i)%sf(j + unpack_offset, k, l) = q_cons_buff_recv(r) +#if defined(__INTEL_COMPILER) + if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then + print *, "Error", j, k, l, i + error stop "NaN(s) in recv" + end if +#endif end do end do end do end do if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - ! Packing buffer to be sent to bc_x%beg do l = 0, p do k = 0, n - do j = 0, buff_size - 1 + do j = -buff_size, -1 do i = sys_size + 1, sys_size + 4 do q = 1, nb r = (i - 1) + (q - 1)*4 + v_size* & - (j + buff_size*(k + (n + 1)*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) + (j + buff_size*((k + 1) + (n + 1)*l)) + pb(j + unpack_offset, k, l, i - sys_size, q) = q_cons_buff_recv(r) end do end do end do @@ -1118,89 +1061,36 @@ contains end do !$acc parallel loop collapse(5) gang vector default(present) private(r) - ! Packing buffer to be sent to bc_x%beg do l = 0, p do k = 0, n - do j = 0, buff_size - 1 + do j = -buff_size, -1 do i = sys_size + 1, sys_size + 4 do q = 1, nb r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - (j + buff_size*(k + (n + 1)*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) + (j + buff_size*((k + 1) + (n + 1)*l)) + mv(j + unpack_offset, k, l, i - sys_size, q) = q_cons_buff_recv(r) end do end do end do end do end do end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 1, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 1, & - q_cons_buff_recv(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - - !$acc end host_data - !$acc wait - else -#endif - !$acc update host(q_cons_buff_send) - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 1, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%beg, 1, & - q_cons_buff_recv(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - -#if defined(MFC_OpenACC) - end if -#endif - - else ! PBC at the end only - - ! Packing buffer to be sent to bc_x%end + #:elif mpi_dir == 2 !$acc parallel loop collapse(4) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = m - buff_size + 1, m - do i = 1, sys_size + do i = 1, sys_size + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size r = (i - 1) + v_size* & - ((j - m - 1) + buff_size*((k + 1) + (n + 1)*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + q_cons_vf(i)%sf(j, k + unpack_offset, l) = q_cons_buff_recv(r) +#if defined(__INTEL_COMPILER) + if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then + print *, "Error", j, k, l, i + error stop "NaN(s) in recv" + end if +#endif end do end do end do @@ -1208,15 +1098,15 @@ contains if (qbmm .and. .not. polytropic) then !$acc parallel loop collapse(5) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = m - buff_size + 1, m - do i = sys_size + 1, sys_size + 4 + do i = sys_size + 1, sys_size + 4 + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size do q = 1, nb r = (i - 1) + (q - 1)*4 + v_size* & - ((j - m - 1) + buff_size*((k + 1) + (n + 1)*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + pb(j, k + unpack_offset, l, i - sys_size, q) = q_cons_buff_recv(r) end do end do end do @@ -1224,724 +1114,39 @@ contains end do !$acc parallel loop collapse(5) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = m - buff_size + 1, m - do i = sys_size + 1, sys_size + 4 + do i = sys_size + 1, sys_size + 4 + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size do q = 1, nb r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j - m - 1) + buff_size*((k + 1) + (n + 1)*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) - + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + mv(j, k + unpack_offset, l, i - sys_size, q) = q_cons_buff_recv(r) end do end do end do end do end do - - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 0, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 0, & - q_cons_buff_recv(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - - !$acc end host_data - !$acc wait - else -#endif - !$acc update host(q_cons_buff_send) - - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 0, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 0, & - q_cons_buff_recv(0), & - buff_size*sys_size*(n + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_x%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - -#if defined(MFC_OpenACC) end if -#endif - - end if - - if (cu_mpi .eqv. .false.) then - !$acc update device(q_cons_buff_recv) - end if - - ! Unpacking buffer received from bc_x%end - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = m + 1, m + buff_size - do i = 1, sys_size - r = (i - 1) + v_size* & - ((j - m - 1) + buff_size*(k + (n + 1)*l)) - q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) + #:else + ! Unpacking buffer from bc_z%beg + !$acc parallel loop collapse(4) gang vector default(present) private(r) + do i = 1, sys_size + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = (i - 1) + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)* & + (l + buff_size))) + q_cons_vf(i)%sf(j, k, l + unpack_offset) = q_cons_buff_recv(r) #if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if + if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then + print *, "Error", j, k, l, i + error stop "NaN(s) in recv" + end if #endif - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = m + 1, m + buff_size - do i = sys_size + 1, sys_size + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j - m - 1) + buff_size*(k + (n + 1)*l)) - pb(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - - end do - end do - end do - end do - end do - - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do l = 0, p - do k = 0, n - do j = m + 1, m + buff_size - do i = sys_size + 1, sys_size + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j - m - 1) + buff_size*(k + (n + 1)*l)) - mv(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - - end do - end do - end do - end do - end do - - end if - - end if - ! END: MPI Communication in x-direction ============================ - - ! MPI Communication in y-direction ================================= - elseif (mpi_dir == 2) then - - if (pbc_loc == -1) then ! PBC at the beginning - - if (bc_y%end >= 0) then ! PBC at the beginning and end - - ! Packing buffer to be sent to bc_y%end - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = 0, p - do k = n - buff_size + 1, n - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k - n + buff_size - 1) + buff_size*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = n - buff_size + 1, n - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k - n + buff_size - 1) + buff_size*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = n - buff_size + 1, n - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k - n + buff_size - 1) + buff_size*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 0, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 0, & - q_cons_buff_recv(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - - !$acc end host_data - !$acc wait - else -#endif - - !$acc update host(q_cons_buff_send) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 0, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 0, & - q_cons_buff_recv(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if -#if defined(MFC_OpenACC) - end if -#endif - - else ! PBC at the beginning only - - ! Packing buffer to be sent to bc_y%beg - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 1, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 1, & - q_cons_buff_recv(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - - !$acc end host_data - !$acc wait - else -#endif - - !$acc update host(q_cons_buff_send) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 1, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 1, & - q_cons_buff_recv(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if -#if defined(MFC_OpenACC) - end if -#endif - - end if -#if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then - !$acc update device(q_cons_buff_recv) - end if -#endif - - ! Unpacking buffer received from bc_y%beg - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) -#if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if -#endif - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - pb(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - end do - end do - end do - end do - end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - mv(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - end do - end do - end do - end do - end do - end if - - else ! PBC at the end - - if (bc_y%beg >= 0) then ! PBC at the end and beginning - - ! Packing buffer to be sent to bc_y%beg - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 1, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 1, & - q_cons_buff_recv(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - - !$acc end host_data - !$acc wait - else -#endif - - !$acc update host(q_cons_buff_send) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 1, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%beg, 1, & - q_cons_buff_recv(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if -#if defined(MFC_OpenACC) - end if -#endif - - else ! PBC at the end only - - ! Packing buffer to be sent to bc_y%end - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = 0, p - do k = n - buff_size + 1, n - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k - n + buff_size - 1) + buff_size*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = n - buff_size + 1, n - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k - n + buff_size - 1) + buff_size*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = n - buff_size + 1, n - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k - n + buff_size - 1) + buff_size*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 0, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 0, & - q_cons_buff_recv(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if - - !$acc end host_data - !$acc wait - else -#endif - - !$acc update host(q_cons_buff_send) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - if (qbmm .and. .not. polytropic) then - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 0, & - q_cons_buff_recv(0), & - buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 0, & - q_cons_buff_recv(0), & - buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), & - MPI_DOUBLE_PRECISION, bc_y%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - end if -#if defined(MFC_OpenACC) - end if -#endif - - end if -#if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then - !$acc update device(q_cons_buff_recv) - end if -#endif - - ! Unpacking buffer received form bc_y%end - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = 0, p - do k = n + 1, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k - n - 1) + buff_size*l)) - q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) -#if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if -#endif - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = n + 1, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k - n - 1) + buff_size*l)) - pb(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - end do - end do - end do - end do - end do - - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, p - do k = n + 1, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k - n - 1) + buff_size*l)) - mv(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - end do - end do - end do - end do - end do - end if - - end if - ! END: MPI Communication in y-direction ============================ - - ! MPI Communication in z-direction ================================= - else - - if (pbc_loc == -1) then ! PBC at the beginning - - if (bc_z%end >= 0) then ! PBC at the beginning and end - - ! Packing buffer to be sent to bc_z%end - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = p - buff_size + 1, p - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l - p + buff_size - 1))) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do @@ -1950,478 +1155,45 @@ contains if (qbmm .and. .not. polytropic) then !$acc parallel loop collapse(5) gang vector default(present) private(r) do i = sys_size + 1, sys_size + 4 - do l = p - buff_size + 1, p + do l = -buff_size, -1 do k = -buff_size, n + buff_size do j = -buff_size, m + buff_size do q = 1, nb r = (i - 1) + (q - 1)*4 + v_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k + buff_size) + (n + 2*buff_size + 1)* & - (l - p + buff_size - 1))) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = p - buff_size + 1, p - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l - p + buff_size - 1))) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%end, 0, & - q_cons_buff_recv(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - - !$acc end host_data - !$acc wait - else -#endif - - !$acc update host(q_cons_buff_send) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%end, 0, & - q_cons_buff_recv(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(MFC_OpenACC) - end if -#endif - - else ! PBC at the beginning only - - ! Packing buffer to be sent to bc_z%beg - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%beg, 1, & - q_cons_buff_recv(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - - !$acc end host_data - !$acc wait - else -#endif - - !$acc update host(q_cons_buff_send) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%beg, 1, & - q_cons_buff_recv(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%beg, 0, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - -#if defined(MFC_OpenACC) - end if -#endif - - end if -#if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then - !$acc update device(q_cons_buff_recv) - end if -#endif - - ! Unpacking buffer from bc_z%beg - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) -#if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if -#endif - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - pb(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - end do - end do - end do - end do - end do - - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - mv(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - end do - end do - end do - end do - end do - end if - - else ! PBC at the end - - if (bc_z%beg >= 0) then ! PBC at the end and beginning - - ! Packing buffer to be sent to bc_z%beg - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do - - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) + (l + buff_size))) + pb(j, k, l + unpack_offset, i - sys_size, q) = q_cons_buff_recv(r) end do end do end do end do end do - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%beg, 1, & - q_cons_buff_recv(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - - !$acc end host_data - !$acc wait - else -#endif - !$acc update host(q_cons_buff_send) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%beg, 1, & - q_cons_buff_recv(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - -#if defined(MFC_OpenACC) - end if -#endif - - else ! PBC at the end only - - ! Packing buffer to be sent to bc_z%end - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = p - buff_size + 1, p - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l - p + buff_size - 1))) - q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) - end do - end do - end do - end do - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = p - buff_size + 1, p - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l - p + buff_size - 1))) - q_cons_buff_send(r) = pb(j, k, l, i - sys_size, q) - end do - end do - end do - end do - end do !$acc parallel loop collapse(5) gang vector default(present) private(r) do i = sys_size + 1, sys_size + 4 - do l = p - buff_size + 1, p + do l = -buff_size, -1 do k = -buff_size, n + buff_size do j = -buff_size, m + buff_size do q = 1, nb r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k + buff_size) + (n + 2*buff_size + 1)* & - (l - p + buff_size - 1))) - q_cons_buff_send(r) = mv(j, k, l, i - sys_size, q) + (l + buff_size))) + mv(j, k, l + unpack_offset, i - sys_size, q) = q_cons_buff_recv(r) end do end do end do end do end do - - end if - - !call MPI_Barrier(MPI_COMM_WORLD, ierr) - -#if defined(MFC_OpenACC) - if (cu_mpi) then - !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%end, 0, & - q_cons_buff_recv(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - - !$acc end host_data - !$acc wait - else -#endif - !$acc update host(q_cons_buff_send) - - ! Send/receive buffer to/from bc_x%end/bc_x%beg - call MPI_SENDRECV( & - q_cons_buff_send(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%end, 0, & - q_cons_buff_recv(0), & - buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1), & - MPI_DOUBLE_PRECISION, bc_z%end, 1, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) -#if defined(MFC_OpenACC) end if -#endif - - end if - -#if defined(MFC_OpenACC) - if (cu_mpi .eqv. .false.) then - !$acc update device(q_cons_buff_recv) - end if -#endif - - ! Unpacking buffer received from bc_z%end - !$acc parallel loop collapse(4) gang vector default(present) private(r) - do i = 1, sys_size - do l = p + 1, p + buff_size - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l - p - 1))) - q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) -#if defined(__INTEL_COMPILER) - - if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if -#endif - end do - end do - end do - end do - - if (qbmm .and. .not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = p + 1, p + buff_size - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l - p - 1))) - pb(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - end do - end do - end do - end do - end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) - do i = sys_size + 1, sys_size + 4 - do l = p + 1, p + buff_size - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l - p - 1))) - mv(j, k, l, i - sys_size, q) = q_cons_buff_recv(r) - end do - end do - end do - end do - end do - end if - + #:endif end if - - end if - ! END: MPI Communication in z-direction ============================ + #:endfor #endif - end subroutine s_mpi_sendrecv_conservative_variables_buffers ! --------- + end subroutine s_mpi_sendrecv_variables_buffers ! --------- !> The goal of this procedure is to populate the buffers of !! the cell-average conservative variables by communicating