From 9e15b167a2f41efa1a5341aebbea727e253c50d6 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Fri, 6 Oct 2023 16:30:57 +0100 Subject: [PATCH 01/12] Refactor communications into its own subroutine in multiply_module --- src/multiply_module.f90 | 167 ++++++++++++++++++++++++++-------------- 1 file changed, 111 insertions(+), 56 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 268e9e5f5..2561d9a89 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -155,13 +155,10 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer(integ),pointer :: ibseq_rem(:) integer(integ),pointer :: ibind_rem(:) integer(integ),pointer :: ib_nd_acc_rem(:) - integer(integ),pointer :: npxyz_rem(:) integer(integ),pointer :: ibndimj_rem(:) ! Arrays for remote variables to point to - integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) integer, dimension(:), allocatable :: nreqs - integer :: offset,sends,i,j + integer :: sends,i,j integer, dimension(MPI_STATUS_SIZE) :: mpi_stat integer, allocatable, dimension(:) :: recv_part real(double) :: t0,t1 @@ -212,61 +209,13 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call Mquest_start_send(a_b_c,b,nreqs,myid,a_b_c%prim%mx_ngonn,sends) !write(io_lun,*) 'Returned ',a_b_c%ahalo%np_in_halo,myid ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz - + !$omp parallel default(shared) main_loop: do kpart = 1,a_b_c%ahalo%np_in_halo - - !$omp master - icall=1 - ind_part = a_b_c%ahalo%lab_hcell(kpart) - new_partition = .true. - - ! Check if this is a periodic image of the previous partition - if(kpart>1) then - if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then - new_partition = .false. - end if - end if - - if(new_partition) then - ! Get the data - ipart = a_b_c%parts%i_cc2seq(ind_part) - nnode = a_b_c%comms%neigh_node_list(kpart) - recv_part(nnode) = recv_part(nnode)+1 - if(allocated(b_rem)) deallocate(b_rem) - if(a_b_c%parts%i_cc2node(ind_part)==myid+1) then - lenb_rem = a_b_c%bmat(ipart)%part_nd_nabs - else - lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) - end if - allocate(b_rem(lenb_rem)) - call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& - n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& - mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) - ! Now point the _rem variables at the appropriate parts of - ! the array where we will receive the data - offset = 0 - nbnab_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibind_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ib_nd_acc_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibseq_rem => part_array(offset+1:offset+ilen2) - offset = offset+ilen2 - npxyz_rem => part_array(offset+1:offset+3*ilen2) - offset = offset+3*ilen2 - ibndimj_rem => part_array(offset+1:offset+ilen2) - if(offset+ilen2>3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('mat_mult: error pointing to part_array ',kpart) - end if - ! Create ibpart_rem - call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,& - ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) - end if - k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs + !$omp master + call do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & + ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz) ! Omp master doesn't include a implicit barrier. We want master ! 
to be finished with comms before calling the multiply kernels ! hence the explicit barrier @@ -523,6 +472,112 @@ end subroutine loc_trans !!*** + !!****f* multiply_module/do_comms * + !! + !! NAME + !! do_comms + !! USAGE + !! + !! PURPOSE + !! Prefetch data from local or remote partitions, and calculate offsets, + !! ready to call the kernel afterwards + !! INPUTS + !! + !! + !! USES + !! + !! AUTHOR + !! I. Christidi + !! CREATION DATE + !! 5/10/2023 + !! Refactored from mat_mult + !! MODIFICATION HISTORY + !! SOURCE + !! + subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & + ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz) + + use matrix_module + use matrix_comms_module + + integer, intent(out) :: k_off + integer, intent(in) :: kpart + ! Remote variables which will point to part_array + integer(integ), pointer, intent(out) :: nbnab_rem(:) + integer(integ), pointer, intent(out) :: ibseq_rem(:) + integer(integ), pointer, intent(out) :: ibind_rem(:) + integer(integ), pointer, intent(out) :: ib_nd_acc_rem(:) + integer(integ), pointer, intent(out) :: ibndimj_rem(:) + !!!! + integer(integ), allocatable, intent(out) :: ibpart_rem(:) + type(matrix_mult), intent(in) :: a_b_c + real(double), intent(out) :: b(:) + integer, allocatable, dimension(:), intent(inout) :: recv_part + real(double), allocatable, intent(inout) :: b_rem(:) + integer, intent(inout) :: lenb_rem + integer, intent(in) :: myid, ncover_yz + + integer(integ), pointer :: npxyz_rem(:) + integer :: icall, ind_part, ipart, nnode, n_cont, ilen2, offset + logical :: new_partition + ! Array for remote variables to point to + integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) + + if(.not.allocated(recv_part)) allocate(recv_part(0:a_b_c%comms%inode)) + + icall=1 + ind_part = a_b_c%ahalo%lab_hcell(kpart) + new_partition = .true. + + ! Check if this is a periodic image of the previous partition + if(kpart>1) then + if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then + new_partition = .false. + end if + end if + + if(new_partition) then + ! Get the data + ipart = a_b_c%parts%i_cc2seq(ind_part) + nnode = a_b_c%comms%neigh_node_list(kpart) + recv_part(nnode) = recv_part(nnode)+1 + if(allocated(b_rem)) deallocate(b_rem) + if(a_b_c%parts%i_cc2node(ind_part)==myid+1) then + lenb_rem = a_b_c%bmat(ipart)%part_nd_nabs + else + lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) + end if + allocate(b_rem(lenb_rem)) + call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& + n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& + mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) + ! Now point the _rem variables at the appropriate parts of + ! the array where we will receive the data + offset = 0 + nbnab_rem => part_array(offset+1:offset+n_cont) + offset = offset+n_cont + ibind_rem => part_array(offset+1:offset+n_cont) + offset = offset+n_cont + ib_nd_acc_rem => part_array(offset+1:offset+n_cont) + offset = offset+n_cont + ibseq_rem => part_array(offset+1:offset+ilen2) + offset = offset+ilen2 + npxyz_rem => part_array(offset+1:offset+3*ilen2) + offset = offset+3*ilen2 + ibndimj_rem => part_array(offset+1:offset+ilen2) + if(offset+ilen2>3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then + call cq_abort('mat_mult: error pointing to part_array ',kpart) + end if + ! 
Create ibpart_rem + call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,& + ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) + end if + + k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs + end subroutine do_comms + !!****f* multiply_module/prefetch * !! !! NAME From 05317daced7f16c8e091d6cca0b6f9cf60cfc2a9 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Mon, 27 Nov 2023 18:13:48 +0000 Subject: [PATCH 02/12] Add non-blocking Mquest_get, use it to overlap comms with comps. --- src/comms_module.f90 | 76 ++++++++++++++++++++- src/multiply_module.f90 | 148 +++++++++++++++++++++++++++------------- 2 files changed, 173 insertions(+), 51 deletions(-) diff --git a/src/comms_module.f90 b/src/comms_module.f90 index 9284f1780..59237adae 100644 --- a/src/comms_module.f90 +++ b/src/comms_module.f90 @@ -177,7 +177,7 @@ end subroutine Mquest_start_send !! SOURCE !! subroutine Mquest_get( mx_nponn, ilen2,ilen3,nc_part,send_node,sent_part,myid,& - bind_rem,b_rem,lenb_rem,bind,b,istart,mx_babs,mx_part,tag) + bind_rem,b_rem,lenb_rem,bind,istart,mx_babs,mx_part,tag) ! Module usage use mpi @@ -194,7 +194,6 @@ subroutine Mquest_get( mx_nponn, ilen2,ilen3,nc_part,send_node,sent_part,myid,& integer :: bind(:) integer :: lenb_rem real(double) :: b_rem(lenb_rem) - real(double) :: b(:) ! Miscellaneous data integer :: nc_part,send_node,sent_part,myid,ilen2,ilen3,size,istart,offset integer :: nrstat(MPI_STATUS_SIZE) @@ -222,6 +221,79 @@ subroutine Mquest_get( mx_nponn, ilen2,ilen3,nc_part,send_node,sent_part,myid,& end subroutine Mquest_get !!*** +!!*** + +! --------------------------------------------------------------------- +! subroutine Mquest_get_nonb +! --------------------------------------------------------------------- + +!!****f* comms_module/Mquest_get_nonb * +!! +!! NAME +!! Mquest_get_nonb +!! USAGE +!! +!! PURPOSE +!! Calls non-blocking receives to get data for matrix multiplication +!! INPUTS +!! +!! +!! USES +!! +!! AUTHOR +!! I. Christidi +!! CREATION DATE +!! 2023/10/06 +!! +!! MODIFICATION HISTORY +!! +!! SOURCE +!! + subroutine Mquest_get_nonb( mx_nponn, ilen2,ilen3,nc_part,send_node,sent_part,myid,& + bind_rem,b_rem,lenb_rem,bind,istart,mx_babs,mx_part,tag,request) + + ! Module usage + use mpi + use datatypes + use matrix_comms_module, ONLY: mx_msg_per_part + use GenComms, ONLY: cq_abort + + implicit none + + ! Maxima + integer :: mx_nponn,mx_babs,mx_part + ! Arrays for receiving data + integer :: bind_rem(:) + integer :: bind(:) + integer :: lenb_rem + real(double) :: b_rem(lenb_rem) + ! Miscellaneous data + integer :: nc_part,send_node,sent_part,myid,ilen2,ilen3,size,istart,offset + integer :: tag + integer :: request(2) + + ! Local variables + integer :: ilen1,ierr,lenbind_rem + + !lenb_rem = size(b_rem) + !lenbind_rem = size(bind_rem) + ierr = 0 + ilen1 = nc_part + !if(3*ilen1+5*ilen2>lenbind_rem) call cq_abort('Get error ',3*ilen1+5*ilen2,lenbind_rem) + if(ilen3>lenb_rem) call cq_abort('Get error 2 ',ilen3,lenb_rem) + call MPI_Irecv(bind_rem,3*ilen1+5*ilen2,MPI_INTEGER, & + send_node-1,tag+1,MPI_COMM_WORLD,request(1),ierr) + if(ierr/=0) call cq_abort('Error receiving indices !',ierr) + if(ilen3.gt.0)then ! Get xyz, sequence list and elements + ierr = 0 + call MPI_Irecv(b_rem,ilen3, MPI_DOUBLE_PRECISION,send_node-1,& + tag+2,MPI_COMM_WORLD,request(2),ierr) + if(ierr/=0) call cq_abort('Error receiving data !',ierr) + endif + return + end subroutine Mquest_get_nonb +!!*** + ! --------------------------------------------------------------------- ! subroutine send_trans_data ! 
--------------------------------------------------------------------- diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 2561d9a89..e673699da 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -145,23 +145,33 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) real(double), allocatable, dimension(:) :: atrans integer :: lab_const integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont,kpart_next,ind_partN,k_off - integer :: stat,ilen2,lenb_rem + integer :: icall,n_cont,kpart_next,ind_partN,k_off(2) + integer :: stat,ilen2,lenb_rem(2) ! Remote variables to be allocated - integer(integ),allocatable :: ibpart_rem(:) - real(double),allocatable :: b_rem(:) + integer(integ),allocatable :: ibpart_rem(:,:) + type jagged_array_r + real(double), allocatable :: values(:) + end type jagged_array_r + type(jagged_array_r) :: b_rem(2) ! Remote variables which will point to part_array - integer(integ),pointer :: nbnab_rem(:) - integer(integ),pointer :: ibseq_rem(:) - integer(integ),pointer :: ibind_rem(:) - integer(integ),pointer :: ib_nd_acc_rem(:) - integer(integ),pointer :: ibndimj_rem(:) + type jagged_pointer_array_i + integer(integ),pointer :: values(:) + end type jagged_pointer_array_i + type(jagged_pointer_array_i) :: nbnab_rem(2) + type(jagged_pointer_array_i) :: ibseq_rem(2) + type(jagged_pointer_array_i) :: ibind_rem(2) + type(jagged_pointer_array_i) :: ib_nd_acc_rem(2) + type(jagged_pointer_array_i) :: ibndimj_rem(2) ! Arrays for remote variables to point to integer, dimension(:), allocatable :: nreqs integer :: sends,i,j integer, dimension(MPI_STATUS_SIZE) :: mpi_stat - integer, allocatable, dimension(:) :: recv_part + type jagged_array_i + integer, allocatable :: values(:) + end type jagged_array_i + type(jagged_array_i) :: recv_part(2) real(double) :: t0,t1 + integer :: request(2,2), index_rec, index_wait logical :: new_partition @@ -170,14 +180,16 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) if(iprint_mat>3.AND.myid==0) t0 = mtime() ! Allocate memory for the elements - allocate(ibpart_rem(a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs),STAT=stat) + allocate(ibpart_rem(a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,2),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating ibpart_rem') !allocate(atrans(a_b_c%amat(1)%length),STAT=stat) allocate(atrans(lena),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating atrans') - allocate(recv_part(0:a_b_c%comms%inode),STAT=stat) + allocate(recv_part(1)%values(0:a_b_c%comms%inode),STAT=stat) + allocate(recv_part(2)%values(0:a_b_c%comms%inode),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating recv_part') - recv_part = zero + recv_part(1)%values = zero + recv_part(2)%values = zero call stop_timer(tmr_std_allocation) !write(io_lun,*) 'Sizes: ',a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim2*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,& ! a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim1* & @@ -209,37 +221,61 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call Mquest_start_send(a_b_c,b,nreqs,myid,a_b_c%prim%mx_ngonn,sends) !write(io_lun,*) 'Returned ',a_b_c%ahalo%np_in_halo,myid ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz - + + ! 
Receive the data from the first partition - blocking + call do_comms(k_off(2), 1, nbnab_rem(2)%values, ibseq_rem(2)%values, ibind_rem(2)%values, & + ib_nd_acc_rem(2)%values, ibndimj_rem(2)%values, ibpart_rem(:,2), a_b_c, b, recv_part(2)%values, & + b_rem(2)%values, lenb_rem(2), myid, ncover_yz) + !$omp parallel default(shared) - main_loop: do kpart = 1,a_b_c%ahalo%np_in_halo - + main_loop: do kpart = 2,a_b_c%ahalo%np_in_halo + + ! These indices point to elements of all the 2-element vectors of the variables needed + ! for the do_comms and m_kern_min/max calls. They alternate between the values of + ! (index_rec,index_wait)=(1,2) and (2,1) from iteration to iteration. + ! index_rec points to the values being received in the current iteration in do_comms, + ! and index_wait points to the values received in the previous iteration, thus computation + ! can start on them in m_kern_min/max + ! These indices are also used to point to elements of the 2x2-element request() array, + ! that contains the MPI request numbers for the non-blocking data receives. There are 2 + ! MPI_Irecv calls per call of do_comms, and request() keeps track of 2 sets of those calls, + ! thus it's of size 2x2. + ! request(:,index_rec) points to the requests from the current iteration MPI_Irecv, + ! and request(:,index_wait) points to the requests from the previous iteration, that have + ! to complete in order for the computation to start (thus the MPI_Wait). + index_rec = mod(kpart,2) + 1 + index_wait = mod(kpart+1,2) + 1 + + ! Receive the data from the current partition - non-blocking !$omp master - call do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & - ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz) - ! Omp master doesn't include a implicit barrier. We want master - ! to be finished with comms before calling the multiply kernels - ! hence the explicit barrier + call do_comms(k_off(index_rec), kpart, nbnab_rem(index_rec)%values, ibseq_rem(index_rec)%values, & + ibind_rem(index_rec)%values, ib_nd_acc_rem(index_rec)%values, ibndimj_rem(index_rec)%values, & + ibpart_rem(:,index_rec), a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & + lenb_rem(index_rec), myid, ncover_yz, request(:,index_rec)) + ! Omp master doesn't include an implicit barrier, so this is fine for non-blocking comms !$omp end master - !$omp barrier - + + ! Call the computation kernel on the previous partition + if (kpart.gt.2) call MPI_Waitall(request(:,index_wait)) if(a_b_c%mult_type.eq.1) then ! C is full mult - call m_kern_max( k_off,kpart,ib_nd_acc_rem, ibind_rem,nbnab_rem,& - ibpart_rem,ibseq_rem,ibndimj_rem,& - atrans,b_rem,c,a_b_c%ahalo,a_b_c%chalo,a_b_c%ltrans,& - a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem, lenc) + call m_kern_max( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & + ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & + a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) else if(a_b_c%mult_type.eq.2) then ! 
A is partial mult - call m_kern_min( k_off,kpart,ib_nd_acc_rem, ibind_rem,nbnab_rem,& - ibpart_rem,ibseq_rem,ibndimj_rem,& - atrans,b_rem,c,a_b_c%ahalo,a_b_c%chalo,a_b_c%ltrans,& - a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem, lenc) + call m_kern_min( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & + ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & + a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) end if - !$omp barrier +!!! !$omp barrier end do main_loop !$omp end parallel call start_timer(tmr_std_allocation) - if(allocated(b_rem)) deallocate(b_rem) + if(allocated(b_rem(1)%values)) deallocate(b_rem(1)%values) + if(allocated(b_rem(2)%values)) deallocate(b_rem(2)%values) call stop_timer(tmr_std_allocation) ! -------------------------------------------------- ! End of the main loop over partitions in the A-halo @@ -273,7 +309,9 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') - deallocate(recv_part,STAT=stat) + deallocate(recv_part(1)%values,STAT=stat) + if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') + deallocate(recv_part(2)%values,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) call my_barrier @@ -495,7 +533,7 @@ end subroutine loc_trans !! SOURCE !! subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & - ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz) + ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz,request) use matrix_module use matrix_comms_module @@ -509,13 +547,14 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem integer(integ), pointer, intent(out) :: ib_nd_acc_rem(:) integer(integ), pointer, intent(out) :: ibndimj_rem(:) !!!! - integer(integ), allocatable, intent(out) :: ibpart_rem(:) + integer(integ), intent(out) :: ibpart_rem(:) type(matrix_mult), intent(in) :: a_b_c - real(double), intent(out) :: b(:) + real(double), intent(in) :: b(:) integer, allocatable, dimension(:), intent(inout) :: recv_part real(double), allocatable, intent(inout) :: b_rem(:) - integer, intent(inout) :: lenb_rem + integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz + integer, intent(out), optional :: request(2) integer(integ), pointer :: npxyz_rem(:) integer :: icall, ind_part, ipart, nnode, n_cont, ilen2, offset @@ -551,7 +590,7 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem allocate(b_rem(lenb_rem)) call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& - mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) + mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,request) ! Now point the _rem variables at the appropriate parts of ! the array where we will receive the data offset = 0 @@ -605,7 +644,7 @@ end subroutine do_comms !! subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& n_cont,bind_rem,bind,b_rem,lenb_rem,b,myid,ilen2,mx_mpp, & - parts,prim,gcs,tag) + parts,prim,gcs,tag,request) ! 
Module usage use datatypes @@ -628,6 +667,7 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& type(comms_data) :: a_b_c integer(integ), dimension(:) :: bind_rem,bind integer :: lenb_rem, tag + integer, optional :: request(2) real(double), dimension(lenb_rem) :: b_rem real(double) :: b(:) ! Local variables @@ -649,13 +689,23 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& end if if(icall.eq.1) then ! Else fetch the data ilen2 = a_b_c%ilen2rec(ipart,nnode) - call Mquest_get( prim%mx_ngonn, & - a_b_c%ilen2rec(ipart,nnode),& - a_b_c%ilen3rec(ipart,nnode),& - n_cont,inode,ipart,myid,& - bind_rem,b_rem,lenb_rem,bind,b,& - a_b_c%istart(ipart,nnode), & - bmat(1)%mx_abs,parts%mx_mem_grp,tag) + if(this_part.eq.1) then + call Mquest_get( prim%mx_ngonn, & + a_b_c%ilen2rec(ipart,nnode),& + a_b_c%ilen3rec(ipart,nnode),& + n_cont,inode,ipart,myid,& + bind_rem,b_rem,lenb_rem,bind,& + a_b_c%istart(ipart,nnode), & + bmat(1)%mx_abs,parts%mx_mem_grp,tag) + else + call Mquest_get_nonb( prim%mx_ngonn, & + a_b_c%ilen2rec(ipart,nnode),& + a_b_c%ilen3rec(ipart,nnode),& + n_cont,inode,ipart,myid,& + bind_rem,b_rem,lenb_rem,bind,& + a_b_c%istart(ipart,nnode), & + bmat(1)%mx_abs,parts%mx_mem_grp,tag,request) + end if end if return end subroutine prefetch From 45015e9d336c437dcb9935dc5afc04cf17ee2e0f Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 29 Nov 2023 10:22:39 +0000 Subject: [PATCH 03/12] Some debugging: - Refactor do_comms/prefetch to use flag for non-blocking comms - Add omp barrier after calculating indices in loop - Take into account local comms - they do not produce an MPI_request --- src/multiply_module.f90 | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index e673699da..a5051386c 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -245,18 +245,21 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! to complete in order for the computation to start (thus the MPI_Wait). index_rec = mod(kpart,2) + 1 index_wait = mod(kpart+1,2) + 1 - + !$omp barrier + ! Receive the data from the current partition - non-blocking !$omp master + request(:,index_rec) = [-1,-1] call do_comms(k_off(index_rec), kpart, nbnab_rem(index_rec)%values, ibseq_rem(index_rec)%values, & ibind_rem(index_rec)%values, ib_nd_acc_rem(index_rec)%values, ibndimj_rem(index_rec)%values, & ibpart_rem(:,index_rec), a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & - lenb_rem(index_rec), myid, ncover_yz, request(:,index_rec)) + lenb_rem(index_rec), myid, ncover_yz, .true., request(:,index_rec)) ! Omp master doesn't include an implicit barrier, so this is fine for non-blocking comms !$omp end master ! Call the computation kernel on the previous partition - if (kpart.gt.2) call MPI_Waitall(request(:,index_wait)) + if (kpart.gt.2 .and. all(request(:,index_wait).ne.[-1,-1])) & + call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) if(a_b_c%mult_type.eq.1) then ! C is full mult call m_kern_max( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & @@ -533,7 +536,7 @@ end subroutine loc_trans !! SOURCE !! 
subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & - ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz,request) + ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz,do_nonb,request) use matrix_module use matrix_comms_module @@ -554,6 +557,7 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem real(double), allocatable, intent(inout) :: b_rem(:) integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz + logical, intent(in), optional :: do_nonb integer, intent(out), optional :: request(2) integer(integ), pointer :: npxyz_rem(:) @@ -562,7 +566,12 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem ! Array for remote variables to point to integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) + logical :: do_nonb_local + ! Set non-blocking receive flag + do_nonb_local = .false. + if (present(do_nonb)) do_nonb_local = do_nonb + if(.not.allocated(recv_part)) allocate(recv_part(0:a_b_c%comms%inode)) icall=1 @@ -590,7 +599,7 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem allocate(b_rem(lenb_rem)) call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& - mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,request) + mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,do_nonb,request) ! Now point the _rem variables at the appropriate parts of ! the array where we will receive the data offset = 0 @@ -644,7 +653,7 @@ end subroutine do_comms !! subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& n_cont,bind_rem,bind,b_rem,lenb_rem,b,myid,ilen2,mx_mpp, & - parts,prim,gcs,tag,request) + parts,prim,gcs,tag,do_nonb,request) ! Module usage use datatypes @@ -667,13 +676,19 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& type(comms_data) :: a_b_c integer(integ), dimension(:) :: bind_rem,bind integer :: lenb_rem, tag + logical, intent(in), optional :: do_nonb integer, optional :: request(2) real(double), dimension(lenb_rem) :: b_rem real(double) :: b(:) ! Local variables integer :: ncover_yz,ind_part,iskip,ind_last integer :: inode,ipart,nnode + logical :: do_nonb_local + ! Set non-blocking receive flag + do_nonb_local = .false. + if (present(do_nonb)) do_nonb_local = do_nonb + ind_part = ahalo%lab_hcell(this_part) n_cont=parts%nm_group(ind_part) ipart = parts%i_cc2seq(ind_part) @@ -689,7 +704,7 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& end if if(icall.eq.1) then ! Else fetch the data ilen2 = a_b_c%ilen2rec(ipart,nnode) - if(this_part.eq.1) then + if(.not.do_nonb_local) then ! Use blocking receive call Mquest_get( prim%mx_ngonn, & a_b_c%ilen2rec(ipart,nnode),& a_b_c%ilen3rec(ipart,nnode),& @@ -697,7 +712,8 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& bind_rem,b_rem,lenb_rem,bind,& a_b_c%istart(ipart,nnode), & bmat(1)%mx_abs,parts%mx_mem_grp,tag) - else + else ! Use non-blocking receive + if (.not.present(request)) call cq_abort('Need to provide MPI request argument for non-blocking receive.') call Mquest_get_nonb( prim%mx_ngonn, & a_b_c%ilen2rec(ipart,nnode),& a_b_c%ilen3rec(ipart,nnode),& From 4e2d38c26ba030f95129324f632a59de2c0d8071 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 29 Nov 2023 17:35:29 +0000 Subject: [PATCH 04/12] More debugging: move pointers out of functions, correct barriers. 
Tests run but produce wrong results. --- src/multiply_module.f90 | 119 +++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 57 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index a5051386c..6089b157d 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -145,8 +145,8 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) real(double), allocatable, dimension(:) :: atrans integer :: lab_const integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont,kpart_next,ind_partN,k_off(2) - integer :: stat,ilen2,lenb_rem(2) + integer :: icall,n_cont(2),kpart_next,ind_partN,k_off(2) + integer :: stat,ilen2(2),lenb_rem(2) ! Remote variables to be allocated integer(integ),allocatable :: ibpart_rem(:,:) type jagged_array_r @@ -162,7 +162,11 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) type(jagged_pointer_array_i) :: ibind_rem(2) type(jagged_pointer_array_i) :: ib_nd_acc_rem(2) type(jagged_pointer_array_i) :: ibndimj_rem(2) + type(jagged_pointer_array_i) :: npxyz_rem(2) ! Arrays for remote variables to point to + integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs, 2) + integer :: offset integer, dimension(:), allocatable :: nreqs integer :: sends,i,j integer, dimension(MPI_STATUS_SIZE) :: mpi_stat @@ -173,7 +177,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) real(double) :: t0,t1 integer :: request(2,2), index_rec, index_wait - logical :: new_partition + logical :: new_partition(2) call start_timer(tmr_std_matmult) @@ -223,13 +227,16 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz ! Receive the data from the first partition - blocking - call do_comms(k_off(2), 1, nbnab_rem(2)%values, ibseq_rem(2)%values, ibind_rem(2)%values, & - ib_nd_acc_rem(2)%values, ibndimj_rem(2)%values, ibpart_rem(:,2), a_b_c, b, recv_part(2)%values, & - b_rem(2)%values, lenb_rem(2), myid, ncover_yz) + call do_comms(k_off(2), 1, part_array(:,2), n_cont(2), ilen2(2), a_b_c, b, recv_part(2)%values, & + b_rem(2)%values, lenb_rem(2), myid, ncover_yz, new_partition(2)) + request = MPI_REQUEST_NULL !$omp parallel default(shared) main_loop: do kpart = 2,a_b_c%ahalo%np_in_halo + ! The following include MPI operations, so they have to be on the master thread only. + !$omp master + ! These indices point to elements of all the 2-element vectors of the variables needed ! for the do_comms and m_kern_min/max calls. They alternate between the values of ! (index_rec,index_wait)=(1,2) and (2,1) from iteration to iteration. @@ -245,21 +252,47 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! to complete in order for the computation to start (thus the MPI_Wait). index_rec = mod(kpart,2) + 1 index_wait = mod(kpart+1,2) + 1 - !$omp barrier ! Receive the data from the current partition - non-blocking - !$omp master - request(:,index_rec) = [-1,-1] - call do_comms(k_off(index_rec), kpart, nbnab_rem(index_rec)%values, ibseq_rem(index_rec)%values, & - ibind_rem(index_rec)%values, ib_nd_acc_rem(index_rec)%values, ibndimj_rem(index_rec)%values, & - ibpart_rem(:,index_rec), a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & - lenb_rem(index_rec), myid, ncover_yz, .true., request(:,index_rec)) - ! 
Omp master doesn't include an implicit barrier, so this is fine for non-blocking comms + call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & + a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & + lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) + + ! Check that previous partition data have been received before starting computation + if (kpart.gt.2 .and. all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & + call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) + if(new_partition(index_wait)) then + ! Now point the _rem variables at the appropriate parts of + ! the array where we will receive the data + offset = 0 + nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + offset = offset+ilen2(index_wait) + npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) + offset = offset+3*ilen2(index_wait) + ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then + call cq_abort('mat_mult: error pointing to part_array ',kpart-1) + end if + ! Create ibpart_rem + call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & + ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& + ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) + end if + + ! Omp master doesn't include an implicit barrier. We want master + ! to be finished with comms before calling the multiply kernels + ! hence the explicit barrier !$omp end master + !$omp barrier ! Call the computation kernel on the previous partition - if (kpart.gt.2 .and. all(request(:,index_wait).ne.[-1,-1])) & - call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) if(a_b_c%mult_type.eq.1) then ! C is full mult call m_kern_max( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & @@ -273,7 +306,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) end if -!!! !$omp barrier + !$omp barrier end do main_loop !$omp end parallel call start_timer(tmr_std_allocation) @@ -535,37 +568,30 @@ end subroutine loc_trans !! MODIFICATION HISTORY !! SOURCE !! - subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & - ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz,do_nonb,request) + subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part, & + b_rem, lenb_rem, myid, ncover_yz, new_partition, do_nonb, request) use matrix_module use matrix_comms_module integer, intent(out) :: k_off integer, intent(in) :: kpart - ! 
Remote variables which will point to part_array - integer(integ), pointer, intent(out) :: nbnab_rem(:) - integer(integ), pointer, intent(out) :: ibseq_rem(:) - integer(integ), pointer, intent(out) :: ibind_rem(:) - integer(integ), pointer, intent(out) :: ib_nd_acc_rem(:) - integer(integ), pointer, intent(out) :: ibndimj_rem(:) - !!!! - integer(integ), intent(out) :: ibpart_rem(:) type(matrix_mult), intent(in) :: a_b_c real(double), intent(in) :: b(:) integer, allocatable, dimension(:), intent(inout) :: recv_part real(double), allocatable, intent(inout) :: b_rem(:) integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz + ! Array for remote variables to point to and indices to unpack it + integer, target, intent(out) :: part_array(3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) + integer, intent(out) :: n_cont, ilen2 + !!! + logical, intent(out) :: new_partition logical, intent(in), optional :: do_nonb integer, intent(out), optional :: request(2) - integer(integ), pointer :: npxyz_rem(:) - integer :: icall, ind_part, ipart, nnode, n_cont, ilen2, offset - logical :: new_partition - ! Array for remote variables to point to - integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) + integer :: icall, ind_part, ipart, nnode, offset logical :: do_nonb_local ! Set non-blocking receive flag @@ -579,8 +605,8 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem new_partition = .true. ! Check if this is a periodic image of the previous partition - if(kpart>1) then - if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then + if(kpart>2) then + if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-2)) then new_partition = .false. end if end if @@ -600,28 +626,7 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,do_nonb,request) - ! Now point the _rem variables at the appropriate parts of - ! the array where we will receive the data - offset = 0 - nbnab_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibind_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ib_nd_acc_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibseq_rem => part_array(offset+1:offset+ilen2) - offset = offset+ilen2 - npxyz_rem => part_array(offset+1:offset+3*ilen2) - offset = offset+3*ilen2 - ibndimj_rem => part_array(offset+1:offset+ilen2) - if(offset+ilen2>3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('mat_mult: error pointing to part_array ',kpart) - end if - ! Create ibpart_rem - call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,& - ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) - end if + end if k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs end subroutine do_comms From 6be243ca60f7298eed62bcea89647a9a951bc5a5 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Fri, 15 Dec 2023 12:33:46 +0000 Subject: [PATCH 05/12] Fix bug with computation only being called with the wrong kpart, and for only kpart=[2,end-1]. Get rid of not needed variable icall of prefetch, in various places. Code now produces correct results for MPI nprocs=1 and nthreads=any, but deadlocks for more than 1 MPI procs. Separated the MPI_waits, but was not fixed. 
--- src/exx_kernel_default.f90 | 13 +++--- src/multiply_module.f90 | 85 +++++++++++++++++++++++++++++++------- src/multisiteSF_module.f90 | 11 ++--- 3 files changed, 80 insertions(+), 29 deletions(-) diff --git a/src/exx_kernel_default.f90 b/src/exx_kernel_default.f90 index c5bed2b8c..950b7798e 100644 --- a/src/exx_kernel_default.f90 +++ b/src/exx_kernel_default.f90 @@ -146,7 +146,7 @@ subroutine get_X_matrix( exxspin, level ) integer :: exxspin integer :: lab_const integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont,kpart_next,ind_partN,k_off + integer :: n_cont,kpart_next,ind_partN,k_off integer :: icall2,stat,ilen2,lenb_rem ! Remote variables to be allocated integer(integ),allocatable :: ibpart_rem(:) @@ -356,7 +356,7 @@ subroutine get_X_matrix( exxspin, level ) ! !$omp parallel default(none) & ! !$omp shared(a, b, c, a_b_c, myid, lena, lenc, tmr_std_allocation, & ! !$omp ncover_yz, ibpart_rem, atrans, usegemm) & - ! !$omp private(kpart, icall, ind_part, ipart, nnode, b_rem, & + ! !$omp private(kpart, ind_part, ipart, nnode, b_rem, & ! !$omp lenb_rem, n_cont, part_array, ilen2, offset, & ! !$omp nbnab_rem, ibind_rem, ib_nd_acc_rem, ibseq_rem, & ! !$omp npxyz_rem, ibndimj_rem, k_off, icall2) @@ -375,7 +375,6 @@ subroutine get_X_matrix( exxspin, level ) xyz_ghost = zero r_ghost = zero do kpart = 1,mult(S_X_SX)%ahalo%np_in_halo ! Main loop - icall=1 ind_part = mult(S_X_SX)%ahalo%lab_hcell(kpart) ! !print*, 'inode', inode,'kpart', kpart, ind_part @@ -385,9 +384,7 @@ subroutine get_X_matrix( exxspin, level ) end if ! if(kpart>1) then ! Is it a periodic image of the previous partition ? - if(ind_part.eq.mult(S_X_SX)%ahalo%lab_hcell(kpart-1)) then - icall=0 - else ! Get the data + if(ind_part.ne.mult(S_X_SX)%ahalo%lab_hcell(kpart-1)) then ! Get the data ipart = mult(S_X_SX)%parts%i_cc2seq(ind_part) nnode = mult(S_X_SX)%comms%neigh_node_list(kpart) recv_part(nnode) = recv_part(nnode)+1 @@ -401,7 +398,7 @@ subroutine get_X_matrix( exxspin, level ) ! allocate(b_rem(lenb_rem)) ! - call prefetch(kpart,mult(S_X_SX)%ahalo,mult(S_X_SX)%comms,mult(S_X_SX)%bmat,icall, & + call prefetch(kpart,mult(S_X_SX)%ahalo,mult(S_X_SX)%comms,mult(S_X_SX)%bmat, & n_cont,part_array,mult(S_X_SX)%bindex,b_rem,lenb_rem,mat_p(matK( exxspin ))%matrix, & myid,ilen2,mx_msg_per_part,mult(S_X_SX)%parts,mult(S_X_SX)%prim,mult(S_X_SX)%gcs,& (recv_part(nnode)-1)*2) @@ -447,7 +444,7 @@ subroutine get_X_matrix( exxspin, level ) call stop_timer(tmr_std_exx_allocat,.true.) ! ! - call prefetch(kpart,mult(S_X_SX)%ahalo,mult(S_X_SX)%comms,mult(S_X_SX)%bmat,icall, & + call prefetch(kpart,mult(S_X_SX)%ahalo,mult(S_X_SX)%comms,mult(S_X_SX)%bmat, & n_cont,part_array,mult(S_X_SX)%bindex,b_rem,lenb_rem,mat_p(matK( exxspin ))%matrix, & myid,ilen2,mx_msg_per_part,mult(S_X_SX)%parts,mult(S_X_SX)%prim,mult(S_X_SX)%gcs,& (recv_part(nnode)-1)*2) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 6089b157d..1d8670f9f 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -145,7 +145,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) real(double), allocatable, dimension(:) :: atrans integer :: lab_const integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont(2),kpart_next,ind_partN,k_off(2) + integer :: n_cont(2),kpart_next,ind_partN,k_off(2) integer :: stat,ilen2(2),lenb_rem(2) ! 
Remote variables to be allocated integer(integ),allocatable :: ibpart_rem(:,:) @@ -259,9 +259,16 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) ! Check that previous partition data have been received before starting computation - if (kpart.gt.2 .and. all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & - call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) - if(new_partition(index_wait)) then + ! if (kpart.gt.2 .and. all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & + ! call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) + if (kpart.gt.2) then + if (request(1,index_wait).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) + if (request(2,index_wait).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) + end if + + if(new_partition(index_wait)) then ! Now point the _rem variables at the appropriate parts of ! the array where we will receive the data offset = 0 @@ -294,13 +301,13 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! Call the computation kernel on the previous partition if(a_b_c%mult_type.eq.1) then ! C is full mult - call m_kern_max( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + call m_kern_max( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) else if(a_b_c%mult_type.eq.2) then ! A is partial mult - call m_kern_min( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + call m_kern_min( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & @@ -308,7 +315,60 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) end if !$omp barrier end do main_loop + + ! Do the computation on the last partition + !$omp master + ! Check that the last partition data have been received before starting computation + index_wait = mod(kpart+1,2) + 1 +! if (all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & +! call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) + if (request(1,index_wait).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) + if (request(2,index_wait).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) + if(new_partition(index_wait)) then + ! Now point the _rem variables at the appropriate parts of + ! 
the array where we will receive the data + offset = 0 + nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + offset = offset+ilen2(index_wait) + npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) + offset = offset+3*ilen2(index_wait) + ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then + call cq_abort('mat_mult: error pointing to part_array ',kpart-1) + end if + ! Create ibpart_rem + call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & + ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& + ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) + end if + !$omp end master + !$omp barrier + ! Call the computation kernel on the last partition + if(a_b_c%mult_type.eq.1) then ! C is full mult + call m_kern_max( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & + ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & + a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) + else if(a_b_c%mult_type.eq.2) then ! A is partial mult + call m_kern_min( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & + ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & + a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) + end if + !$omp end parallel + call start_timer(tmr_std_allocation) if(allocated(b_rem(1)%values)) deallocate(b_rem(1)%values) if(allocated(b_rem(2)%values)) deallocate(b_rem(2)%values) @@ -591,7 +651,7 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part logical, intent(in), optional :: do_nonb integer, intent(out), optional :: request(2) - integer :: icall, ind_part, ipart, nnode, offset + integer :: ind_part, ipart, nnode, offset logical :: do_nonb_local ! Set non-blocking receive flag @@ -600,7 +660,6 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part if(.not.allocated(recv_part)) allocate(recv_part(0:a_b_c%comms%inode)) - icall=1 ind_part = a_b_c%ahalo%lab_hcell(kpart) new_partition = .true. @@ -623,7 +682,7 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) end if allocate(b_rem(lenb_rem)) - call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& + call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,do_nonb,request) end if @@ -656,7 +715,7 @@ end subroutine do_comms !! Adding tag for MPI compliance !! SOURCE !! 
- subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& + subroutine prefetch(this_part,ahalo,a_b_c,bmat,& n_cont,bind_rem,bind,b_rem,lenb_rem,b,myid,ilen2,mx_mpp, & parts,prim,gcs,tag,do_nonb,request) @@ -675,7 +734,7 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& type(primary_set) :: prim type(cover_set) :: gcs integer :: mx_mpp - integer :: this_part,icall,n_cont,myid,ilen2 + integer :: this_part,n_cont,myid,ilen2 type(matrix), dimension(:) :: bmat type(matrix_halo) :: ahalo type(comms_data) :: a_b_c @@ -700,14 +759,12 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& inode = parts%i_cc2node(ind_part) nnode = a_b_c%neigh_node_list(this_part) if(inode.eq.myid+1) then ! If this is local, then copy - icall = 0 ncover_yz=gcs%ncovery*gcs%ncoverz ilen2 = bmat(ipart)%part_nabs call Mquest_get_local(ipart,& bind_rem,b_rem,lenb_rem,bind,bmat,& ind_part,b,myid) - end if - if(icall.eq.1) then ! Else fetch the data + else ! Else fetch the data ilen2 = a_b_c%ilen2rec(ipart,nnode) if(.not.do_nonb_local) then ! Use blocking receive call Mquest_get( prim%mx_ngonn, & diff --git a/src/multisiteSF_module.f90 b/src/multisiteSF_module.f90 index d2c9792c8..b8c4bb972 100644 --- a/src/multisiteSF_module.f90 +++ b/src/multisiteSF_module.f90 @@ -1238,7 +1238,7 @@ subroutine LFD_make_Subspace_halo(myid,b,lenb,bsub,len_bsub,label_kj_sub,len_kj_ ! This will be dynamically allocated/deallocated by the system integer :: lab_const integer :: ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont,kpart_next,ind_partN,k_off + integer :: n_cont,kpart_next,ind_partN,k_off integer :: stat,ilen2,lenb_rem ! Remote variables to be allocated integer(integ),allocatable :: ibpart_rem(:) @@ -1305,14 +1305,11 @@ subroutine LFD_make_Subspace_halo(myid,b,lenb,bsub,len_bsub,label_kj_sub,len_kj_ do kpart = 1,a_b_c%ahalo%np_in_halo ! Main loop. Loop for halo-partition kpart !write(io_lun,*) 'Part: ',kpart,myid - icall=1 ind_part = a_b_c%ahalo%lab_hcell(kpart) !write(io_lun,*) 'ind_part: ',ind_part if(kpart>1) then ! Is it a periodic image of the previous partition ? - if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then - icall=0 - else ! Get the data + if(ind_part.ne.a_b_c%ahalo%lab_hcell(kpart-1)) then ! 
Get the data !write(io_lun,*) myid,' seq: ',size(a_b_c%parts%i_cc2seq) ipart = a_b_c%parts%i_cc2seq(ind_part) !write(io_lun,*) myid,' Alloc b_rem part: ',ipart @@ -1328,7 +1325,7 @@ subroutine LFD_make_Subspace_halo(myid,b,lenb,bsub,len_bsub,label_kj_sub,len_kj_ lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) end if allocate(b_rem(lenb_rem)) - call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& + call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) !write(io_lun,*) 'b_rem: ',lenb_rem @@ -1372,7 +1369,7 @@ subroutine LFD_make_Subspace_halo(myid,b,lenb,bsub,len_bsub,label_kj_sub,len_kj_ call start_timer(tmr_std_allocation) allocate(b_rem(lenb_rem)) call stop_timer(tmr_std_allocation) - call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& + call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) lenb_rem = size(b_rem) From eca1b1f142ec2cd179577a8be30a274f6edaaeb0 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 20 Dec 2023 14:52:00 +0000 Subject: [PATCH 06/12] Solve deadlock: - recv_part should be stored in one sequential list throughout the loop, not two, since it is used for the MPI_Irecv tags that have to match the MPI_Issend ones. - the periodicity check has to check the previous partition (kpart-1), not kpart-2. Adapt a bunch of stuff accordingly. Simplify the way the computation is taken care of for the last partition (now inside the loop). --- src/multiply_module.f90 | 150 +++++++++++++--------------------------- 1 file changed, 48 insertions(+), 102 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 1d8670f9f..8ab580b12 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -170,10 +170,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer, dimension(:), allocatable :: nreqs integer :: sends,i,j integer, dimension(MPI_STATUS_SIZE) :: mpi_stat - type jagged_array_i - integer, allocatable :: values(:) - end type jagged_array_i - type(jagged_array_i) :: recv_part(2) + integer, allocatable :: recv_part(:) real(double) :: t0,t1 integer :: request(2,2), index_rec, index_wait @@ -189,11 +186,9 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) !allocate(atrans(a_b_c%amat(1)%length),STAT=stat) allocate(atrans(lena),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating atrans') - allocate(recv_part(1)%values(0:a_b_c%comms%inode),STAT=stat) - allocate(recv_part(2)%values(0:a_b_c%comms%inode),STAT=stat) + allocate(recv_part(0:a_b_c%comms%inode),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating recv_part') - recv_part(1)%values = zero - recv_part(2)%values = zero + recv_part = zero call stop_timer(tmr_std_allocation) !write(io_lun,*) 'Sizes: ',a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim2*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,& ! a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim1* & @@ -227,12 +222,12 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz ! 
Receive the data from the first partition - blocking - call do_comms(k_off(2), 1, part_array(:,2), n_cont(2), ilen2(2), a_b_c, b, recv_part(2)%values, & + call do_comms(k_off(2), 1, part_array(:,2), n_cont(2), ilen2(2), a_b_c, b, recv_part, & b_rem(2)%values, lenb_rem(2), myid, ncover_yz, new_partition(2)) request = MPI_REQUEST_NULL !$omp parallel default(shared) - main_loop: do kpart = 2,a_b_c%ahalo%np_in_halo + main_loop: do kpart = 2,a_b_c%ahalo%np_in_halo+1 ! The following include MPI operations, so they have to be on the master thread only. !$omp master @@ -253,46 +248,53 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) index_rec = mod(kpart,2) + 1 index_wait = mod(kpart+1,2) + 1 - ! Receive the data from the current partition - non-blocking - call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & - a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & - lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) - ! Check that previous partition data have been received before starting computation - ! if (kpart.gt.2 .and. all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & - ! call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) if (kpart.gt.2) then if (request(1,index_wait).ne.MPI_REQUEST_NULL) & call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) if (request(2,index_wait).ne.MPI_REQUEST_NULL) & call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) - end if - - if(new_partition(index_wait)) then - ! Now point the _rem variables at the appropriate parts of - ! the array where we will receive the data - offset = 0 - nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - offset = offset+ilen2(index_wait) - npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) - offset = offset+3*ilen2(index_wait) - ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('mat_mult: error pointing to part_array ',kpart-1) - end if - ! Create ibpart_rem - call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & - ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& - ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) end if + ! If that previous partition was a periodic one, copy over arrays from previous index + if(.not.new_partition(index_wait)) then + part_array(:,index_wait) = part_array(:,index_rec) + n_cont(index_wait) = n_cont(index_rec) + ilen2(index_wait) = ilen2(index_rec) + b_rem(index_wait) = b_rem(index_rec) + lenb_rem(index_wait) = lenb_rem(index_rec) + end if + + ! Now point the _rem variables at the appropriate parts of + ! 
the array where we have received the data + offset = 0 + nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + offset = offset+ilen2(index_wait) + npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) + offset = offset+3*ilen2(index_wait) + ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then + call cq_abort('mat_mult: error pointing to part_array ',kpart-1) + end if + ! Create ibpart_rem + call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & + ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& + ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) + + ! Receive the data from the current partition - non-blocking + if (kpart.lt.a_b_c%ahalo%np_in_halo+1) then + call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & + a_b_c, b, recv_part, b_rem(index_rec)%values, & + lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) + end if + ! Omp master doesn't include an implicit barrier. We want master ! to be finished with comms before calling the multiply kernels ! hence the explicit barrier @@ -314,59 +316,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) end if !$omp barrier - end do main_loop - - ! Do the computation on the last partition - !$omp master - ! Check that the last partition data have been received before starting computation - index_wait = mod(kpart+1,2) + 1 -! if (all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & -! call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) - if (request(1,index_wait).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) - if (request(2,index_wait).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) - if(new_partition(index_wait)) then - ! Now point the _rem variables at the appropriate parts of - ! 
the array where we will receive the data - offset = 0 - nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - offset = offset+ilen2(index_wait) - npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) - offset = offset+3*ilen2(index_wait) - ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('mat_mult: error pointing to part_array ',kpart-1) - end if - ! Create ibpart_rem - call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & - ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& - ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) - end if - !$omp end master - !$omp barrier - ! Call the computation kernel on the last partition - if(a_b_c%mult_type.eq.1) then ! C is full mult - call m_kern_max( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & - nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & - ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & - a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) - else if(a_b_c%mult_type.eq.2) then ! A is partial mult - call m_kern_min( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & - nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & - ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & - a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) - end if - + end do main_loop !$omp end parallel call start_timer(tmr_std_allocation) @@ -405,9 +355,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') - deallocate(recv_part(1)%values,STAT=stat) - if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') - deallocate(recv_part(2)%values,STAT=stat) + deallocate(recv_part,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) call my_barrier @@ -658,14 +606,12 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part do_nonb_local = .false. if (present(do_nonb)) do_nonb_local = do_nonb - if(.not.allocated(recv_part)) allocate(recv_part(0:a_b_c%comms%inode)) - ind_part = a_b_c%ahalo%lab_hcell(kpart) new_partition = .true. ! Check if this is a periodic image of the previous partition - if(kpart>2) then - if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-2)) then + if(kpart>1) then + if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then new_partition = .false. 
end if end if From bd1ef120e91ba715aed6a7227ba61c56944a3b83 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 20 Dec 2023 15:26:52 +0000 Subject: [PATCH 07/12] Some clean up of variables from PR 295 --- src/multiply_module.f90 | 111 +++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 58 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 8ab580b12..f6e250e45 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -148,21 +148,18 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer :: n_cont(2),kpart_next,ind_partN,k_off(2) integer :: stat,ilen2(2),lenb_rem(2) ! Remote variables to be allocated - integer(integ),allocatable :: ibpart_rem(:,:) + integer(integ),allocatable :: ibpart_rem(:) type jagged_array_r real(double), allocatable :: values(:) end type jagged_array_r type(jagged_array_r) :: b_rem(2) ! Remote variables which will point to part_array - type jagged_pointer_array_i - integer(integ),pointer :: values(:) - end type jagged_pointer_array_i - type(jagged_pointer_array_i) :: nbnab_rem(2) - type(jagged_pointer_array_i) :: ibseq_rem(2) - type(jagged_pointer_array_i) :: ibind_rem(2) - type(jagged_pointer_array_i) :: ib_nd_acc_rem(2) - type(jagged_pointer_array_i) :: ibndimj_rem(2) - type(jagged_pointer_array_i) :: npxyz_rem(2) + integer(integ), dimension(:), pointer :: nbnab_rem + integer(integ), dimension(:), pointer :: ibseq_rem + integer(integ), dimension(:), pointer :: ibind_rem + integer(integ), dimension(:), pointer :: ib_nd_acc_rem + integer(integ), dimension(:), pointer :: ibndimj_rem + integer(integ), dimension(:), pointer :: npxyz_rem ! Arrays for remote variables to point to integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs, 2) @@ -172,7 +169,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer, dimension(MPI_STATUS_SIZE) :: mpi_stat integer, allocatable :: recv_part(:) real(double) :: t0,t1 - integer :: request(2,2), index_rec, index_wait + integer :: request(2,2), index_rec, index_comp logical :: new_partition(2) @@ -181,9 +178,8 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) if(iprint_mat>3.AND.myid==0) t0 = mtime() ! Allocate memory for the elements - allocate(ibpart_rem(a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,2),STAT=stat) + allocate(ibpart_rem(a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating ibpart_rem') - !allocate(atrans(a_b_c%amat(1)%length),STAT=stat) allocate(atrans(lena),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating atrans') allocate(recv_part(0:a_b_c%comms%inode),STAT=stat) @@ -234,59 +230,58 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! These indices point to elements of all the 2-element vectors of the variables needed ! for the do_comms and m_kern_min/max calls. They alternate between the values of - ! (index_rec,index_wait)=(1,2) and (2,1) from iteration to iteration. + ! (index_rec,index_comp)=(1,2) and (2,1) from iteration to iteration. ! index_rec points to the values being received in the current iteration in do_comms, - ! and index_wait points to the values received in the previous iteration, thus computation + ! and index_comp points to the values received in the previous iteration, thus computation ! can start on them in m_kern_min/max ! These indices are also used to point to elements of the 2x2-element request() array, ! 
that contains the MPI request numbers for the non-blocking data receives. There are 2 ! MPI_Irecv calls per call of do_comms, and request() keeps track of 2 sets of those calls, ! thus it's of size 2x2. ! request(:,index_rec) points to the requests from the current iteration MPI_Irecv, - ! and request(:,index_wait) points to the requests from the previous iteration, that have + ! and request(:,index_comp) points to the requests from the previous iteration, that have ! to complete in order for the computation to start (thus the MPI_Wait). index_rec = mod(kpart,2) + 1 - index_wait = mod(kpart+1,2) + 1 + index_comp = mod(kpart+1,2) + 1 ! Check that previous partition data have been received before starting computation if (kpart.gt.2) then - if (request(1,index_wait).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) - if (request(2,index_wait).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) + if (request(1,index_comp).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(1,index_comp),MPI_STATUSES_IGNORE,ierr) + if (request(2,index_comp).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(2,index_comp),MPI_STATUSES_IGNORE,ierr) end if ! If that previous partition was a periodic one, copy over arrays from previous index - if(.not.new_partition(index_wait)) then - part_array(:,index_wait) = part_array(:,index_rec) - n_cont(index_wait) = n_cont(index_rec) - ilen2(index_wait) = ilen2(index_rec) - b_rem(index_wait) = b_rem(index_rec) - lenb_rem(index_wait) = lenb_rem(index_rec) + if(.not.new_partition(index_comp)) then + part_array(:,index_comp) = part_array(:,index_rec) + n_cont(index_comp) = n_cont(index_rec) + ilen2(index_comp) = ilen2(index_rec) + b_rem(index_comp) = b_rem(index_rec) + lenb_rem(index_comp) = lenb_rem(index_rec) end if ! Now point the _rem variables at the appropriate parts of ! the array where we have received the data offset = 0 - nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - offset = offset+ilen2(index_wait) - npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) - offset = offset+3*ilen2(index_wait) - ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & + nbnab_rem => part_array(offset+1:offset+n_cont(index_comp),index_comp) + offset = offset+n_cont(index_comp) + ibind_rem => part_array(offset+1:offset+n_cont(index_comp),index_comp) + offset = offset+n_cont(index_comp) + ib_nd_acc_rem => part_array(offset+1:offset+n_cont(index_comp),index_comp) + offset = offset+n_cont(index_comp) + ibseq_rem => part_array(offset+1:offset+ilen2(index_comp),index_comp) + offset = offset+ilen2(index_comp) + npxyz_rem => part_array(offset+1:offset+3*ilen2(index_comp),index_comp) + offset = offset+3*ilen2(index_comp) + ibndimj_rem => part_array(offset+1:offset+ilen2(index_comp),index_comp) + if(offset+ilen2(index_comp)>3*a_b_c%parts%mx_mem_grp+ & 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then call cq_abort('mat_mult: error pointing to part_array ',kpart-1) end if ! 
Create ibpart_rem - call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & - ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& - ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) + call end_part_comms(myid,n_cont(index_comp),nbnab_rem, & + ibind_rem,npxyz_rem,ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) ! Receive the data from the current partition - non-blocking if (kpart.lt.a_b_c%ahalo%np_in_halo+1) then @@ -303,17 +298,17 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! Call the computation kernel on the previous partition if(a_b_c%mult_type.eq.1) then ! C is full mult - call m_kern_max( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & - nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & - ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + call m_kern_max( k_off(index_comp),kpart-1,ib_nd_acc_rem, ibind_rem, & + nbnab_rem,ibpart_rem,ibseq_rem, & + ibndimj_rem, atrans,b_rem(index_comp)%values,c,a_b_c%ahalo,a_b_c%chalo, & a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) + a_b_c%prim%mx_iprim, lena, lenb_rem(index_comp), lenc) else if(a_b_c%mult_type.eq.2) then ! A is partial mult - call m_kern_min( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & - nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & - ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + call m_kern_min( k_off(index_comp),kpart-1,ib_nd_acc_rem, ibind_rem, & + nbnab_rem,ibpart_rem,ibseq_rem, & + ibndimj_rem, atrans,b_rem(index_comp)%values,c,a_b_c%ahalo,a_b_c%chalo, & a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) + a_b_c%prim%mx_iprim, lena, lenb_rem(index_comp), lenc) end if !$omp barrier end do main_loop @@ -586,7 +581,7 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part integer, intent(in) :: kpart type(matrix_mult), intent(in) :: a_b_c real(double), intent(in) :: b(:) - integer, allocatable, dimension(:), intent(inout) :: recv_part + integer, dimension(:), intent(inout) :: recv_part real(double), allocatable, intent(inout) :: b_rem(:) integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz @@ -712,23 +707,23 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,& ind_part,b,myid) else ! Else fetch the data ilen2 = a_b_c%ilen2rec(ipart,nnode) - if(.not.do_nonb_local) then ! Use blocking receive - call Mquest_get( prim%mx_ngonn, & + if(do_nonb_local) then ! Use non-blocking receive + if (.not.present(request)) call cq_abort('Need to provide MPI request argument for non-blocking receive.') + call Mquest_get_nonb( prim%mx_ngonn, & a_b_c%ilen2rec(ipart,nnode),& a_b_c%ilen3rec(ipart,nnode),& n_cont,inode,ipart,myid,& bind_rem,b_rem,lenb_rem,bind,& a_b_c%istart(ipart,nnode), & - bmat(1)%mx_abs,parts%mx_mem_grp,tag) - else ! Use non-blocking receive - if (.not.present(request)) call cq_abort('Need to provide MPI request argument for non-blocking receive.') - call Mquest_get_nonb( prim%mx_ngonn, & + bmat(1)%mx_abs,parts%mx_mem_grp,tag,request) + else ! 
Use blocking receive + call Mquest_get( prim%mx_ngonn, & a_b_c%ilen2rec(ipart,nnode),& a_b_c%ilen3rec(ipart,nnode),& n_cont,inode,ipart,myid,& bind_rem,b_rem,lenb_rem,bind,& a_b_c%istart(ipart,nnode), & - bmat(1)%mx_abs,parts%mx_mem_grp,tag,request) + bmat(1)%mx_abs,parts%mx_mem_grp,tag) end if end if return From 26b2585a42ccfa19c53653d453c4ae7e52f0a136 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 20 Dec 2023 15:43:11 +0000 Subject: [PATCH 08/12] Attempt to fix failing test in CI for "double free or corruption" - passes locally though.... --- src/multiply_module.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index f6e250e45..fa1948bdf 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -350,7 +350,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') - deallocate(recv_part,STAT=stat) + if(allocated(recv_part)) deallocate(recv_part,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) call my_barrier From a0b6b2e125ac2a8d2d5f0a0c005d5aa08d950840 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 20 Dec 2023 16:56:39 +0000 Subject: [PATCH 09/12] Further attempt to fix failing CI test. Using \"-check bounds\" when compiling, showed that the index of recv_part did not start at 0 when it was not an allocatable in a function that used it. --- src/multiply_module.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index fa1948bdf..7d5fb0c93 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -350,7 +350,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') - if(allocated(recv_part)) deallocate(recv_part,STAT=stat) + deallocate(recv_part,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) call my_barrier @@ -581,7 +581,7 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part integer, intent(in) :: kpart type(matrix_mult), intent(in) :: a_b_c real(double), intent(in) :: b(:) - integer, dimension(:), intent(inout) :: recv_part + integer, allocatable, dimension(:), intent(inout) :: recv_part real(double), allocatable, intent(inout) :: b_rem(:) integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz From 4d77876157d134bd750c2f2a97e0cc203ad332cc Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 10 Jan 2024 11:20:11 +0000 Subject: [PATCH 10/12] Update makefile for kathleen. 
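
A note related to PATCH 09 above: recv_part is allocated in mat_mult with a lower
bound of 0, but when it is passed to an assumed-shape dummy declared recv_part(:),
Fortran remaps the dummy's bounds to start at 1, so its 0-based indexing no longer
lines up inside that routine (which is what "-check bounds" flagged). An allocatable
dummy, by contrast, inherits the actual argument's bounds, which is why do_comms
declares it allocatable again. A minimal stand-alone sketch of the behaviour
(illustrative only, not Conquest code):

  program bounds_demo
    implicit none
    integer, allocatable :: recv_part(:)
    allocate(recv_part(0:4))
    recv_part = 0
    call assumed_shape(recv_part)   ! dummy is remapped to bounds 1:5
    call alloc_dummy(recv_part)     ! dummy keeps bounds 0:4
  contains
    subroutine assumed_shape(a)
      integer, intent(in) :: a(:)
      write(*,*) 'assumed-shape bounds:    ', lbound(a,1), ubound(a,1)
    end subroutine assumed_shape
    subroutine alloc_dummy(a)
      integer, allocatable, intent(in) :: a(:)
      write(*,*) 'allocatable dummy bounds:', lbound(a,1), ubound(a,1)
    end subroutine alloc_dummy
  end program bounds_demo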
--- src/system/system.kathleen.ucl.ac.uk.make | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/system/system.kathleen.ucl.ac.uk.make b/src/system/system.kathleen.ucl.ac.uk.make index ae8a383a0..e9e093596 100644 --- a/src/system/system.kathleen.ucl.ac.uk.make +++ b/src/system/system.kathleen.ucl.ac.uk.make @@ -26,16 +26,16 @@ COMPFLAGS_F77= $(COMPFLAGS) # LibXC: choose between LibXC compatibility below or Conquest XC library # Conquest XC library -XC_LIBRARY = CQ -XC_LIB = -XC_COMPFLAGS = +#XC_LIBRARY = CQ +#XC_LIB = +#XC_COMPFLAGS = # LibXC compatibility # Choose LibXC version: v4 (deprecated) or v5/6 (v5 and v6 have the same interface) -# XC_LIBRARY = LibXC_v4 +XC_LIBRARY = LibXC_v4 #XC_LIBRARY = LibXC_v5 -#XC_LIB = -lxcf90 -lxc -#XC_COMPFLAGS = -I/usr/local/include +XC_LIB = -L/shared/ucl/apps/libxc/4.2.3/intel-2018/lib -lxcf90 -lxc +XC_COMPFLAGS = -I/shared/ucl/apps/libxc/4.2.3/intel-2018/include # Set FFT library FFT_LIB=-lmkl_rt From c72017285852d1c3552bbdef7f95d9a210cf86a6 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 24 Jan 2024 11:58:28 +0000 Subject: [PATCH 11/12] Add .gitignore and needed housekeeping, to make benchmark folders manageable. --- benchmarks/.gitignore | 12 ++++++++++++ ...22_G200_8proc => Conquest_output_K222_G200_8proc} | 0 benchmarks/water_64mols/Conquest_input | 2 +- .../water_64mols/{H2O_coord.in => H2O_coord.dat} | 0 4 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 benchmarks/.gitignore rename benchmarks/K222_G200/{output_K222_G200_8proc => Conquest_output_K222_G200_8proc} (100%) rename benchmarks/water_64mols/{H2O_coord.in => H2O_coord.dat} (100%) diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 000000000..a5a7a0c06 --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,12 @@ +# Ignore everything +* +# Don't ignore directories, so we can recurse into them +!*/ +# Don't ignore .gitignore and specific files in all subdirectories +!.gitignore +!*/Conquest_input +!*/Conquest_output* +!*/README.md +!*/*.ion +!*/*.dat +!*/*.xtl diff --git a/benchmarks/K222_G200/output_K222_G200_8proc b/benchmarks/K222_G200/Conquest_output_K222_G200_8proc similarity index 100% rename from benchmarks/K222_G200/output_K222_G200_8proc rename to benchmarks/K222_G200/Conquest_output_K222_G200_8proc diff --git a/benchmarks/water_64mols/Conquest_input b/benchmarks/water_64mols/Conquest_input index 9d456a900..353816b1b 100644 --- a/benchmarks/water_64mols/Conquest_input +++ b/benchmarks/water_64mols/Conquest_input @@ -1,5 +1,5 @@ IO.Title Water static test, DZ, GridCutoff=50Ha -IO.Coordinates H2O_coord.in +IO.Coordinates H2O_coord.dat IO.FractionalAtomicCoords F IO.Iprint 1 IO.WriteOutToFile F diff --git a/benchmarks/water_64mols/H2O_coord.in b/benchmarks/water_64mols/H2O_coord.dat similarity index 100% rename from benchmarks/water_64mols/H2O_coord.in rename to benchmarks/water_64mols/H2O_coord.dat From 6ab122feba1ee4bf006e8f0bc4f440c5c211e36c Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 13 Mar 2024 21:33:04 +0000 Subject: [PATCH 12/12] Remove unneeded barriers. 
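
Besides removing the MPI_Barrier at the end of Mquest_start_send, several my_barrier
calls after the main loop and the trailing !$omp barrier, this patch posts the
non-blocking receive for partition kpart before waiting on the requests for partition
kpart-1, so the next receive is already in flight while the previous partition is
being multiplied. A stand-alone sketch of that double-buffered pattern (illustrative
only: the routine name, MPI_ANY_SOURCE and the integer tags are placeholders, not
Conquest's actual communication scheme):

  subroutine overlap_sketch(npart, maxlen, comm)
    use mpi
    implicit none
    integer, intent(in) :: npart, maxlen, comm
    double precision :: buf(maxlen,2)
    integer :: req(2), kpart, irec, icomp, ierr
    req = MPI_REQUEST_NULL
    ! Partition 1 would be received (blocking) into buf(:,2) before the loop.
    do kpart = 2, npart+1
       irec  = mod(kpart,2)   + 1   ! buffer being filled this iteration
       icomp = mod(kpart+1,2) + 1   ! buffer holding partition kpart-1
       if (kpart <= npart) then     ! prefetch the next partition
          call MPI_Irecv(buf(:,irec), maxlen, MPI_DOUBLE_PRECISION, &
                         MPI_ANY_SOURCE, kpart, comm, req(irec), ierr)
       end if
       if (req(icomp) /= MPI_REQUEST_NULL) &  ! partition kpart-1 must have arrived
            call MPI_Wait(req(icomp), MPI_STATUS_IGNORE, ierr)
       ! ... multiply kernel acts on buf(:,icomp) for partition kpart-1 ...
    end do
  end subroutine overlap_sketch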
--- src/comms_module.f90 | 2 -- src/generic_comms.f90 | 4 ++++ src/multiply_module.f90 | 47 +++++++++++++++++++---------------------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/comms_module.f90 b/src/comms_module.f90 index 59237adae..e97ae6d2e 100644 --- a/src/comms_module.f90 +++ b/src/comms_module.f90 @@ -142,8 +142,6 @@ subroutine Mquest_start_send(a_b_c,b,nreq,myid,mx_nponn,sends) endif enddo ! Partitions to send enddo ! Nodes to send to - ! Synchronise with other nodes - call MPI_Barrier(MPI_COMM_WORLD,ierr) return end subroutine Mquest_start_send !!*** diff --git a/src/generic_comms.f90 b/src/generic_comms.f90 index 245d63eed..bd934aaf0 100644 --- a/src/generic_comms.f90 +++ b/src/generic_comms.f90 @@ -287,13 +287,17 @@ subroutine init_comms(myid,number_of_procs) ! Local variables integer :: ierr + character(len=MPI_MAX_PROCESSOR_NAME) :: pname + integer :: rlen call MPI_INIT(ierr) if(ierr.ne.0) write(io_lun,*) 'ierr is ',ierr call MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr ) call MPI_COMM_SIZE( MPI_COMM_WORLD, number_of_procs, ierr ) + call MPI_GET_PROCESSOR_NAME(pname, rlen, ierr) inode = myid+1 ionode = 1 + write(io_lun,*) 'Process ',myid,' is running on node ',pname if(inode==ionode) open(unit=warning_lun, file='Conquest_warnings') call mtmini() return diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 7d5fb0c93..08ad45fec 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -244,15 +244,8 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) index_rec = mod(kpart,2) + 1 index_comp = mod(kpart+1,2) + 1 - ! Check that previous partition data have been received before starting computation - if (kpart.gt.2) then - if (request(1,index_comp).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(1,index_comp),MPI_STATUSES_IGNORE,ierr) - if (request(2,index_comp).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(2,index_comp),MPI_STATUSES_IGNORE,ierr) - end if - - ! If that previous partition was a periodic one, copy over arrays from previous index + ! If the previous partition was a periodic one, copy over arrays from previous index. + ! No need to wait for communication to finish. if(.not.new_partition(index_comp)) then part_array(:,index_comp) = part_array(:,index_rec) n_cont(index_comp) = n_cont(index_rec) @@ -261,6 +254,23 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) lenb_rem(index_comp) = lenb_rem(index_rec) end if + ! Receive the data from the current partition - non-blocking + if (kpart.lt.a_b_c%ahalo%np_in_halo+1) then + call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & + a_b_c, b, recv_part, b_rem(index_rec)%values, & + lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) + end if + + ! Check that previous partition data have been received before starting computation + if (kpart.gt.2) then + if (request(1,index_comp).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(1,index_comp),MPI_STATUSES_IGNORE,ierr) + if (request(2,index_comp).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(2,index_comp),MPI_STATUSES_IGNORE,ierr) + if (request(1,index_comp).ne.MPI_REQUEST_NULL .or. request(2,index_comp).ne.MPI_REQUEST_NULL) & + call cq_abort('mat_mult: error freeing MPI_request after MPI_Wait') + end if + ! Now point the _rem variables at the appropriate parts of ! 
the array where we have received the data offset = 0 @@ -283,13 +293,6 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call end_part_comms(myid,n_cont(index_comp),nbnab_rem, & ibind_rem,npxyz_rem,ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) - ! Receive the data from the current partition - non-blocking - if (kpart.lt.a_b_c%ahalo%np_in_halo+1) then - call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & - a_b_c, b, recv_part, b_rem(index_rec)%values, & - lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) - end if - ! Omp master doesn't include an implicit barrier. We want master ! to be finished with comms before calling the multiply kernels ! hence the explicit barrier @@ -310,7 +313,6 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & a_b_c%prim%mx_iprim, lena, lenb_rem(index_comp), lenc) end if - !$omp barrier end do main_loop !$omp end parallel @@ -330,30 +332,25 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) !write(io_lun,*) 'Send done ',i,myid end do end if - call my_barrier call start_timer(tmr_std_allocation) deallocate(nreqs,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating nreqs') call stop_timer(tmr_std_allocation) - call my_barrier ! --- for type 2, make backward local transpose of A-matrix ----------- if(a_b_c%mult_type.eq.2) then invdir=1 call loc_trans( a_b_c%ltrans, a_b_c%ahalo,a,lena,atrans,lena,invdir) end if - call my_barrier call start_timer(tmr_std_allocation) deallocate(atrans,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating atrans') call stop_timer(tmr_std_allocation) - call my_barrier call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') deallocate(recv_part,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) - call my_barrier !deallocate(b_rem,STAT=stat) !if(stat/=0) call cq_abort('mat_mult: error deallocating b_rem') !call my_barrier @@ -627,9 +624,9 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,do_nonb,request) end if - - k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs - end subroutine do_comms + + k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs + end subroutine do_comms !!****f* multiply_module/prefetch * !!