From 9e15b167a2f41efa1a5341aebbea727e253c50d6 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Fri, 6 Oct 2023 16:30:57 +0100 Subject: [PATCH 01/12] Refactor communications into its own subroutine in multiply_module --- src/multiply_module.f90 | 167 ++++++++++++++++++++++++++-------------- 1 file changed, 111 insertions(+), 56 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 268e9e5f5..2561d9a89 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -155,13 +155,10 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer(integ),pointer :: ibseq_rem(:) integer(integ),pointer :: ibind_rem(:) integer(integ),pointer :: ib_nd_acc_rem(:) - integer(integ),pointer :: npxyz_rem(:) integer(integ),pointer :: ibndimj_rem(:) ! Arrays for remote variables to point to - integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) integer, dimension(:), allocatable :: nreqs - integer :: offset,sends,i,j + integer :: sends,i,j integer, dimension(MPI_STATUS_SIZE) :: mpi_stat integer, allocatable, dimension(:) :: recv_part real(double) :: t0,t1 @@ -212,61 +209,13 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call Mquest_start_send(a_b_c,b,nreqs,myid,a_b_c%prim%mx_ngonn,sends) !write(io_lun,*) 'Returned ',a_b_c%ahalo%np_in_halo,myid ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz - + !$omp parallel default(shared) main_loop: do kpart = 1,a_b_c%ahalo%np_in_halo - - !$omp master - icall=1 - ind_part = a_b_c%ahalo%lab_hcell(kpart) - new_partition = .true. - - ! Check if this is a periodic image of the previous partition - if(kpart>1) then - if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then - new_partition = .false. - end if - end if - - if(new_partition) then - ! Get the data - ipart = a_b_c%parts%i_cc2seq(ind_part) - nnode = a_b_c%comms%neigh_node_list(kpart) - recv_part(nnode) = recv_part(nnode)+1 - if(allocated(b_rem)) deallocate(b_rem) - if(a_b_c%parts%i_cc2node(ind_part)==myid+1) then - lenb_rem = a_b_c%bmat(ipart)%part_nd_nabs - else - lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) - end if - allocate(b_rem(lenb_rem)) - call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& - n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& - mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) - ! Now point the _rem variables at the appropriate parts of - ! the array where we will receive the data - offset = 0 - nbnab_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibind_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ib_nd_acc_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibseq_rem => part_array(offset+1:offset+ilen2) - offset = offset+ilen2 - npxyz_rem => part_array(offset+1:offset+3*ilen2) - offset = offset+3*ilen2 - ibndimj_rem => part_array(offset+1:offset+ilen2) - if(offset+ilen2>3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('mat_mult: error pointing to part_array ',kpart) - end if - ! Create ibpart_rem - call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,& - ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) - end if - k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs + !$omp master + call do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & + ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz) ! Omp master doesn't include a implicit barrier. We want master ! 
to be finished with comms before calling the multiply kernels ! hence the explicit barrier @@ -523,6 +472,112 @@ end subroutine loc_trans !!*** + !!****f* multiply_module/do_comms * + !! + !! NAME + !! do_comms + !! USAGE + !! + !! PURPOSE + !! Prefetch data from local or remote partitions, and calculate offsets, + !! ready to call the kernel afterwards + !! INPUTS + !! + !! + !! USES + !! + !! AUTHOR + !! I. Christidi + !! CREATION DATE + !! 5/10/2023 + !! Refactored from mat_mult + !! MODIFICATION HISTORY + !! SOURCE + !! + subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & + ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz) + + use matrix_module + use matrix_comms_module + + integer, intent(out) :: k_off + integer, intent(in) :: kpart + ! Remote variables which will point to part_array + integer(integ), pointer, intent(out) :: nbnab_rem(:) + integer(integ), pointer, intent(out) :: ibseq_rem(:) + integer(integ), pointer, intent(out) :: ibind_rem(:) + integer(integ), pointer, intent(out) :: ib_nd_acc_rem(:) + integer(integ), pointer, intent(out) :: ibndimj_rem(:) + !!!! + integer(integ), allocatable, intent(out) :: ibpart_rem(:) + type(matrix_mult), intent(in) :: a_b_c + real(double), intent(out) :: b(:) + integer, allocatable, dimension(:), intent(inout) :: recv_part + real(double), allocatable, intent(inout) :: b_rem(:) + integer, intent(inout) :: lenb_rem + integer, intent(in) :: myid, ncover_yz + + integer(integ), pointer :: npxyz_rem(:) + integer :: icall, ind_part, ipart, nnode, n_cont, ilen2, offset + logical :: new_partition + ! Array for remote variables to point to + integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) + + if(.not.allocated(recv_part)) allocate(recv_part(0:a_b_c%comms%inode)) + + icall=1 + ind_part = a_b_c%ahalo%lab_hcell(kpart) + new_partition = .true. + + ! Check if this is a periodic image of the previous partition + if(kpart>1) then + if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then + new_partition = .false. + end if + end if + + if(new_partition) then + ! Get the data + ipart = a_b_c%parts%i_cc2seq(ind_part) + nnode = a_b_c%comms%neigh_node_list(kpart) + recv_part(nnode) = recv_part(nnode)+1 + if(allocated(b_rem)) deallocate(b_rem) + if(a_b_c%parts%i_cc2node(ind_part)==myid+1) then + lenb_rem = a_b_c%bmat(ipart)%part_nd_nabs + else + lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) + end if + allocate(b_rem(lenb_rem)) + call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& + n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& + mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) + ! Now point the _rem variables at the appropriate parts of + ! the array where we will receive the data + offset = 0 + nbnab_rem => part_array(offset+1:offset+n_cont) + offset = offset+n_cont + ibind_rem => part_array(offset+1:offset+n_cont) + offset = offset+n_cont + ib_nd_acc_rem => part_array(offset+1:offset+n_cont) + offset = offset+n_cont + ibseq_rem => part_array(offset+1:offset+ilen2) + offset = offset+ilen2 + npxyz_rem => part_array(offset+1:offset+3*ilen2) + offset = offset+3*ilen2 + ibndimj_rem => part_array(offset+1:offset+ilen2) + if(offset+ilen2>3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then + call cq_abort('mat_mult: error pointing to part_array ',kpart) + end if + ! 
Create ibpart_rem + call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,& + ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) + end if + + k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs + end subroutine do_comms + !!****f* multiply_module/prefetch * !! !! NAME From 05317daced7f16c8e091d6cca0b6f9cf60cfc2a9 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Mon, 27 Nov 2023 18:13:48 +0000 Subject: [PATCH 02/12] Add non-blocking Mquest_get, use it to overlap comms with comps. --- src/comms_module.f90 | 76 ++++++++++++++++++++- src/multiply_module.f90 | 148 +++++++++++++++++++++++++++------------- 2 files changed, 173 insertions(+), 51 deletions(-) diff --git a/src/comms_module.f90 b/src/comms_module.f90 index 9284f1780..59237adae 100644 --- a/src/comms_module.f90 +++ b/src/comms_module.f90 @@ -177,7 +177,7 @@ end subroutine Mquest_start_send !! SOURCE !! subroutine Mquest_get( mx_nponn, ilen2,ilen3,nc_part,send_node,sent_part,myid,& - bind_rem,b_rem,lenb_rem,bind,b,istart,mx_babs,mx_part,tag) + bind_rem,b_rem,lenb_rem,bind,istart,mx_babs,mx_part,tag) ! Module usage use mpi @@ -194,7 +194,6 @@ subroutine Mquest_get( mx_nponn, ilen2,ilen3,nc_part,send_node,sent_part,myid,& integer :: bind(:) integer :: lenb_rem real(double) :: b_rem(lenb_rem) - real(double) :: b(:) ! Miscellaneous data integer :: nc_part,send_node,sent_part,myid,ilen2,ilen3,size,istart,offset integer :: nrstat(MPI_STATUS_SIZE) @@ -222,6 +221,79 @@ subroutine Mquest_get( mx_nponn, ilen2,ilen3,nc_part,send_node,sent_part,myid,& end subroutine Mquest_get !!*** +!!*** + +! --------------------------------------------------------------------- +! subroutine Mquest_get_nonb +! --------------------------------------------------------------------- + +!!****f* comms_module/Mquest_get_nonb * +!! +!! NAME +!! Mquest_get_nonb +!! USAGE +!! +!! PURPOSE +!! Calls non-blocking receives to get data for matrix multiplication +!! INPUTS +!! +!! +!! USES +!! +!! AUTHOR +!! I. Christidi +!! CREATION DATE +!! 2023/10/06 +!! +!! MODIFICATION HISTORY +!! +!! SOURCE +!! + subroutine Mquest_get_nonb( mx_nponn, ilen2,ilen3,nc_part,send_node,sent_part,myid,& + bind_rem,b_rem,lenb_rem,bind,istart,mx_babs,mx_part,tag,request) + + ! Module usage + use mpi + use datatypes + use matrix_comms_module, ONLY: mx_msg_per_part + use GenComms, ONLY: cq_abort + + implicit none + + ! Maxima + integer :: mx_nponn,mx_babs,mx_part + ! Arrays for receiving data + integer :: bind_rem(:) + integer :: bind(:) + integer :: lenb_rem + real(double) :: b_rem(lenb_rem) + ! Miscellaneous data + integer :: nc_part,send_node,sent_part,myid,ilen2,ilen3,size,istart,offset + integer :: tag + integer :: request(2) + + ! Local variables + integer :: ilen1,ierr,lenbind_rem + + !lenb_rem = size(b_rem) + !lenbind_rem = size(bind_rem) + ierr = 0 + ilen1 = nc_part + !if(3*ilen1+5*ilen2>lenbind_rem) call cq_abort('Get error ',3*ilen1+5*ilen2,lenbind_rem) + if(ilen3>lenb_rem) call cq_abort('Get error 2 ',ilen3,lenb_rem) + call MPI_Irecv(bind_rem,3*ilen1+5*ilen2,MPI_INTEGER, & + send_node-1,tag+1,MPI_COMM_WORLD,request(1),ierr) + if(ierr/=0) call cq_abort('Error receiving indices !',ierr) + if(ilen3.gt.0)then ! Get xyz, sequence list and elements + ierr = 0 + call MPI_Irecv(b_rem,ilen3, MPI_DOUBLE_PRECISION,send_node-1,& + tag+2,MPI_COMM_WORLD,request(2),ierr) + if(ierr/=0) call cq_abort('Error receiving data !',ierr) + endif + return + end subroutine Mquest_get_nonb +!!*** + ! --------------------------------------------------------------------- ! subroutine send_trans_data ! 
--------------------------------------------------------------------- diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 2561d9a89..e673699da 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -145,23 +145,33 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) real(double), allocatable, dimension(:) :: atrans integer :: lab_const integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont,kpart_next,ind_partN,k_off - integer :: stat,ilen2,lenb_rem + integer :: icall,n_cont,kpart_next,ind_partN,k_off(2) + integer :: stat,ilen2,lenb_rem(2) ! Remote variables to be allocated - integer(integ),allocatable :: ibpart_rem(:) - real(double),allocatable :: b_rem(:) + integer(integ),allocatable :: ibpart_rem(:,:) + type jagged_array_r + real(double), allocatable :: values(:) + end type jagged_array_r + type(jagged_array_r) :: b_rem(2) ! Remote variables which will point to part_array - integer(integ),pointer :: nbnab_rem(:) - integer(integ),pointer :: ibseq_rem(:) - integer(integ),pointer :: ibind_rem(:) - integer(integ),pointer :: ib_nd_acc_rem(:) - integer(integ),pointer :: ibndimj_rem(:) + type jagged_pointer_array_i + integer(integ),pointer :: values(:) + end type jagged_pointer_array_i + type(jagged_pointer_array_i) :: nbnab_rem(2) + type(jagged_pointer_array_i) :: ibseq_rem(2) + type(jagged_pointer_array_i) :: ibind_rem(2) + type(jagged_pointer_array_i) :: ib_nd_acc_rem(2) + type(jagged_pointer_array_i) :: ibndimj_rem(2) ! Arrays for remote variables to point to integer, dimension(:), allocatable :: nreqs integer :: sends,i,j integer, dimension(MPI_STATUS_SIZE) :: mpi_stat - integer, allocatable, dimension(:) :: recv_part + type jagged_array_i + integer, allocatable :: values(:) + end type jagged_array_i + type(jagged_array_i) :: recv_part(2) real(double) :: t0,t1 + integer :: request(2,2), index_rec, index_wait logical :: new_partition @@ -170,14 +180,16 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) if(iprint_mat>3.AND.myid==0) t0 = mtime() ! Allocate memory for the elements - allocate(ibpart_rem(a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs),STAT=stat) + allocate(ibpart_rem(a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,2),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating ibpart_rem') !allocate(atrans(a_b_c%amat(1)%length),STAT=stat) allocate(atrans(lena),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating atrans') - allocate(recv_part(0:a_b_c%comms%inode),STAT=stat) + allocate(recv_part(1)%values(0:a_b_c%comms%inode),STAT=stat) + allocate(recv_part(2)%values(0:a_b_c%comms%inode),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating recv_part') - recv_part = zero + recv_part(1)%values = zero + recv_part(2)%values = zero call stop_timer(tmr_std_allocation) !write(io_lun,*) 'Sizes: ',a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim2*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,& ! a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim1* & @@ -209,37 +221,61 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call Mquest_start_send(a_b_c,b,nreqs,myid,a_b_c%prim%mx_ngonn,sends) !write(io_lun,*) 'Returned ',a_b_c%ahalo%np_in_halo,myid ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz - + + ! 
Receive the data from the first partition - blocking + call do_comms(k_off(2), 1, nbnab_rem(2)%values, ibseq_rem(2)%values, ibind_rem(2)%values, & + ib_nd_acc_rem(2)%values, ibndimj_rem(2)%values, ibpart_rem(:,2), a_b_c, b, recv_part(2)%values, & + b_rem(2)%values, lenb_rem(2), myid, ncover_yz) + !$omp parallel default(shared) - main_loop: do kpart = 1,a_b_c%ahalo%np_in_halo - + main_loop: do kpart = 2,a_b_c%ahalo%np_in_halo + + ! These indices point to elements of all the 2-element vectors of the variables needed + ! for the do_comms and m_kern_min/max calls. They alternate between the values of + ! (index_rec,index_wait)=(1,2) and (2,1) from iteration to iteration. + ! index_rec points to the values being received in the current iteration in do_comms, + ! and index_wait points to the values received in the previous iteration, thus computation + ! can start on them in m_kern_min/max + ! These indices are also used to point to elements of the 2x2-element request() array, + ! that contains the MPI request numbers for the non-blocking data receives. There are 2 + ! MPI_Irecv calls per call of do_comms, and request() keeps track of 2 sets of those calls, + ! thus it's of size 2x2. + ! request(:,index_rec) points to the requests from the current iteration MPI_Irecv, + ! and request(:,index_wait) points to the requests from the previous iteration, that have + ! to complete in order for the computation to start (thus the MPI_Wait). + index_rec = mod(kpart,2) + 1 + index_wait = mod(kpart+1,2) + 1 + + ! Receive the data from the current partition - non-blocking !$omp master - call do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & - ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz) - ! Omp master doesn't include a implicit barrier. We want master - ! to be finished with comms before calling the multiply kernels - ! hence the explicit barrier + call do_comms(k_off(index_rec), kpart, nbnab_rem(index_rec)%values, ibseq_rem(index_rec)%values, & + ibind_rem(index_rec)%values, ib_nd_acc_rem(index_rec)%values, ibndimj_rem(index_rec)%values, & + ibpart_rem(:,index_rec), a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & + lenb_rem(index_rec), myid, ncover_yz, request(:,index_rec)) + ! Omp master doesn't include an implicit barrier, so this is fine for non-blocking comms !$omp end master - !$omp barrier - + + ! Call the computation kernel on the previous partition + if (kpart.gt.2) call MPI_Waitall(request(:,index_wait)) if(a_b_c%mult_type.eq.1) then ! C is full mult - call m_kern_max( k_off,kpart,ib_nd_acc_rem, ibind_rem,nbnab_rem,& - ibpart_rem,ibseq_rem,ibndimj_rem,& - atrans,b_rem,c,a_b_c%ahalo,a_b_c%chalo,a_b_c%ltrans,& - a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem, lenc) + call m_kern_max( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & + ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & + a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) else if(a_b_c%mult_type.eq.2) then ! 
A is partial mult - call m_kern_min( k_off,kpart,ib_nd_acc_rem, ibind_rem,nbnab_rem,& - ibpart_rem,ibseq_rem,ibndimj_rem,& - atrans,b_rem,c,a_b_c%ahalo,a_b_c%chalo,a_b_c%ltrans,& - a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem, lenc) + call m_kern_min( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & + ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & + a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) end if - !$omp barrier +!!! !$omp barrier end do main_loop !$omp end parallel call start_timer(tmr_std_allocation) - if(allocated(b_rem)) deallocate(b_rem) + if(allocated(b_rem(1)%values)) deallocate(b_rem(1)%values) + if(allocated(b_rem(2)%values)) deallocate(b_rem(2)%values) call stop_timer(tmr_std_allocation) ! -------------------------------------------------- ! End of the main loop over partitions in the A-halo @@ -273,7 +309,9 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') - deallocate(recv_part,STAT=stat) + deallocate(recv_part(1)%values,STAT=stat) + if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') + deallocate(recv_part(2)%values,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) call my_barrier @@ -495,7 +533,7 @@ end subroutine loc_trans !! SOURCE !! subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & - ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz) + ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz,request) use matrix_module use matrix_comms_module @@ -509,13 +547,14 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem integer(integ), pointer, intent(out) :: ib_nd_acc_rem(:) integer(integ), pointer, intent(out) :: ibndimj_rem(:) !!!! - integer(integ), allocatable, intent(out) :: ibpart_rem(:) + integer(integ), intent(out) :: ibpart_rem(:) type(matrix_mult), intent(in) :: a_b_c - real(double), intent(out) :: b(:) + real(double), intent(in) :: b(:) integer, allocatable, dimension(:), intent(inout) :: recv_part real(double), allocatable, intent(inout) :: b_rem(:) - integer, intent(inout) :: lenb_rem + integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz + integer, intent(out), optional :: request(2) integer(integ), pointer :: npxyz_rem(:) integer :: icall, ind_part, ipart, nnode, n_cont, ilen2, offset @@ -551,7 +590,7 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem allocate(b_rem(lenb_rem)) call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& - mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) + mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,request) ! Now point the _rem variables at the appropriate parts of ! the array where we will receive the data offset = 0 @@ -605,7 +644,7 @@ end subroutine do_comms !! subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& n_cont,bind_rem,bind,b_rem,lenb_rem,b,myid,ilen2,mx_mpp, & - parts,prim,gcs,tag) + parts,prim,gcs,tag,request) ! 
Module usage use datatypes @@ -628,6 +667,7 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& type(comms_data) :: a_b_c integer(integ), dimension(:) :: bind_rem,bind integer :: lenb_rem, tag + integer, optional :: request(2) real(double), dimension(lenb_rem) :: b_rem real(double) :: b(:) ! Local variables @@ -649,13 +689,23 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& end if if(icall.eq.1) then ! Else fetch the data ilen2 = a_b_c%ilen2rec(ipart,nnode) - call Mquest_get( prim%mx_ngonn, & - a_b_c%ilen2rec(ipart,nnode),& - a_b_c%ilen3rec(ipart,nnode),& - n_cont,inode,ipart,myid,& - bind_rem,b_rem,lenb_rem,bind,b,& - a_b_c%istart(ipart,nnode), & - bmat(1)%mx_abs,parts%mx_mem_grp,tag) + if(this_part.eq.1) then + call Mquest_get( prim%mx_ngonn, & + a_b_c%ilen2rec(ipart,nnode),& + a_b_c%ilen3rec(ipart,nnode),& + n_cont,inode,ipart,myid,& + bind_rem,b_rem,lenb_rem,bind,& + a_b_c%istart(ipart,nnode), & + bmat(1)%mx_abs,parts%mx_mem_grp,tag) + else + call Mquest_get_nonb( prim%mx_ngonn, & + a_b_c%ilen2rec(ipart,nnode),& + a_b_c%ilen3rec(ipart,nnode),& + n_cont,inode,ipart,myid,& + bind_rem,b_rem,lenb_rem,bind,& + a_b_c%istart(ipart,nnode), & + bmat(1)%mx_abs,parts%mx_mem_grp,tag,request) + end if end if return end subroutine prefetch From 45015e9d336c437dcb9935dc5afc04cf17ee2e0f Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 29 Nov 2023 10:22:39 +0000 Subject: [PATCH 03/12] Some debugging: - Refactor do_comms/prefetch to use flag for non-blocking comms - Add omp barrier after calculating indices in loop - Take into account local comms - they do not produce an MPI_request --- src/multiply_module.f90 | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index e673699da..a5051386c 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -245,18 +245,21 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! to complete in order for the computation to start (thus the MPI_Wait). index_rec = mod(kpart,2) + 1 index_wait = mod(kpart+1,2) + 1 - + !$omp barrier + ! Receive the data from the current partition - non-blocking !$omp master + request(:,index_rec) = [-1,-1] call do_comms(k_off(index_rec), kpart, nbnab_rem(index_rec)%values, ibseq_rem(index_rec)%values, & ibind_rem(index_rec)%values, ib_nd_acc_rem(index_rec)%values, ibndimj_rem(index_rec)%values, & ibpart_rem(:,index_rec), a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & - lenb_rem(index_rec), myid, ncover_yz, request(:,index_rec)) + lenb_rem(index_rec), myid, ncover_yz, .true., request(:,index_rec)) ! Omp master doesn't include an implicit barrier, so this is fine for non-blocking comms !$omp end master ! Call the computation kernel on the previous partition - if (kpart.gt.2) call MPI_Waitall(request(:,index_wait)) + if (kpart.gt.2 .and. all(request(:,index_wait).ne.[-1,-1])) & + call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) if(a_b_c%mult_type.eq.1) then ! C is full mult call m_kern_max( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & @@ -533,7 +536,7 @@ end subroutine loc_trans !! SOURCE !! 
subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & - ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz,request) + ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz,do_nonb,request) use matrix_module use matrix_comms_module @@ -554,6 +557,7 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem real(double), allocatable, intent(inout) :: b_rem(:) integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz + logical, intent(in), optional :: do_nonb integer, intent(out), optional :: request(2) integer(integ), pointer :: npxyz_rem(:) @@ -562,7 +566,12 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem ! Array for remote variables to point to integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) + logical :: do_nonb_local + ! Set non-blocking receive flag + do_nonb_local = .false. + if (present(do_nonb)) do_nonb_local = do_nonb + if(.not.allocated(recv_part)) allocate(recv_part(0:a_b_c%comms%inode)) icall=1 @@ -590,7 +599,7 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem allocate(b_rem(lenb_rem)) call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& - mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,request) + mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,do_nonb,request) ! Now point the _rem variables at the appropriate parts of ! the array where we will receive the data offset = 0 @@ -644,7 +653,7 @@ end subroutine do_comms !! subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& n_cont,bind_rem,bind,b_rem,lenb_rem,b,myid,ilen2,mx_mpp, & - parts,prim,gcs,tag,request) + parts,prim,gcs,tag,do_nonb,request) ! Module usage use datatypes @@ -667,13 +676,19 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& type(comms_data) :: a_b_c integer(integ), dimension(:) :: bind_rem,bind integer :: lenb_rem, tag + logical, intent(in), optional :: do_nonb integer, optional :: request(2) real(double), dimension(lenb_rem) :: b_rem real(double) :: b(:) ! Local variables integer :: ncover_yz,ind_part,iskip,ind_last integer :: inode,ipart,nnode + logical :: do_nonb_local + ! Set non-blocking receive flag + do_nonb_local = .false. + if (present(do_nonb)) do_nonb_local = do_nonb + ind_part = ahalo%lab_hcell(this_part) n_cont=parts%nm_group(ind_part) ipart = parts%i_cc2seq(ind_part) @@ -689,7 +704,7 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& end if if(icall.eq.1) then ! Else fetch the data ilen2 = a_b_c%ilen2rec(ipart,nnode) - if(this_part.eq.1) then + if(.not.do_nonb_local) then ! Use blocking receive call Mquest_get( prim%mx_ngonn, & a_b_c%ilen2rec(ipart,nnode),& a_b_c%ilen3rec(ipart,nnode),& @@ -697,7 +712,8 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& bind_rem,b_rem,lenb_rem,bind,& a_b_c%istart(ipart,nnode), & bmat(1)%mx_abs,parts%mx_mem_grp,tag) - else + else ! Use non-blocking receive + if (.not.present(request)) call cq_abort('Need to provide MPI request argument for non-blocking receive.') call Mquest_get_nonb( prim%mx_ngonn, & a_b_c%ilen2rec(ipart,nnode),& a_b_c%ilen3rec(ipart,nnode),& From 4e2d38c26ba030f95129324f632a59de2c0d8071 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 29 Nov 2023 17:35:29 +0000 Subject: [PATCH 04/12] More debugging: move pointers out of functions, correct barriers. 
Tests run but produce wrong results. --- src/multiply_module.f90 | 119 +++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 57 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index a5051386c..6089b157d 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -145,8 +145,8 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) real(double), allocatable, dimension(:) :: atrans integer :: lab_const integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont,kpart_next,ind_partN,k_off(2) - integer :: stat,ilen2,lenb_rem(2) + integer :: icall,n_cont(2),kpart_next,ind_partN,k_off(2) + integer :: stat,ilen2(2),lenb_rem(2) ! Remote variables to be allocated integer(integ),allocatable :: ibpart_rem(:,:) type jagged_array_r @@ -162,7 +162,11 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) type(jagged_pointer_array_i) :: ibind_rem(2) type(jagged_pointer_array_i) :: ib_nd_acc_rem(2) type(jagged_pointer_array_i) :: ibndimj_rem(2) + type(jagged_pointer_array_i) :: npxyz_rem(2) ! Arrays for remote variables to point to + integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs, 2) + integer :: offset integer, dimension(:), allocatable :: nreqs integer :: sends,i,j integer, dimension(MPI_STATUS_SIZE) :: mpi_stat @@ -173,7 +177,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) real(double) :: t0,t1 integer :: request(2,2), index_rec, index_wait - logical :: new_partition + logical :: new_partition(2) call start_timer(tmr_std_matmult) @@ -223,13 +227,16 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz ! Receive the data from the first partition - blocking - call do_comms(k_off(2), 1, nbnab_rem(2)%values, ibseq_rem(2)%values, ibind_rem(2)%values, & - ib_nd_acc_rem(2)%values, ibndimj_rem(2)%values, ibpart_rem(:,2), a_b_c, b, recv_part(2)%values, & - b_rem(2)%values, lenb_rem(2), myid, ncover_yz) + call do_comms(k_off(2), 1, part_array(:,2), n_cont(2), ilen2(2), a_b_c, b, recv_part(2)%values, & + b_rem(2)%values, lenb_rem(2), myid, ncover_yz, new_partition(2)) + request = MPI_REQUEST_NULL !$omp parallel default(shared) main_loop: do kpart = 2,a_b_c%ahalo%np_in_halo + ! The following include MPI operations, so they have to be on the master thread only. + !$omp master + ! These indices point to elements of all the 2-element vectors of the variables needed ! for the do_comms and m_kern_min/max calls. They alternate between the values of ! (index_rec,index_wait)=(1,2) and (2,1) from iteration to iteration. @@ -245,21 +252,47 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! to complete in order for the computation to start (thus the MPI_Wait). index_rec = mod(kpart,2) + 1 index_wait = mod(kpart+1,2) + 1 - !$omp barrier ! Receive the data from the current partition - non-blocking - !$omp master - request(:,index_rec) = [-1,-1] - call do_comms(k_off(index_rec), kpart, nbnab_rem(index_rec)%values, ibseq_rem(index_rec)%values, & - ibind_rem(index_rec)%values, ib_nd_acc_rem(index_rec)%values, ibndimj_rem(index_rec)%values, & - ibpart_rem(:,index_rec), a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & - lenb_rem(index_rec), myid, ncover_yz, .true., request(:,index_rec)) - ! 
Omp master doesn't include an implicit barrier, so this is fine for non-blocking comms + call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & + a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & + lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) + + ! Check that previous partition data have been received before starting computation + if (kpart.gt.2 .and. all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & + call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) + if(new_partition(index_wait)) then + ! Now point the _rem variables at the appropriate parts of + ! the array where we will receive the data + offset = 0 + nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + offset = offset+ilen2(index_wait) + npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) + offset = offset+3*ilen2(index_wait) + ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then + call cq_abort('mat_mult: error pointing to part_array ',kpart-1) + end if + ! Create ibpart_rem + call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & + ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& + ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) + end if + + ! Omp master doesn't include an implicit barrier. We want master + ! to be finished with comms before calling the multiply kernels + ! hence the explicit barrier !$omp end master + !$omp barrier ! Call the computation kernel on the previous partition - if (kpart.gt.2 .and. all(request(:,index_wait).ne.[-1,-1])) & - call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) if(a_b_c%mult_type.eq.1) then ! C is full mult call m_kern_max( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & @@ -273,7 +306,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) end if -!!! !$omp barrier + !$omp barrier end do main_loop !$omp end parallel call start_timer(tmr_std_allocation) @@ -535,37 +568,30 @@ end subroutine loc_trans !! MODIFICATION HISTORY !! SOURCE !! - subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem, & - ibndimj_rem, ibpart_rem, a_b_c, b, recv_part, b_rem, lenb_rem, myid, ncover_yz,do_nonb,request) + subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part, & + b_rem, lenb_rem, myid, ncover_yz, new_partition, do_nonb, request) use matrix_module use matrix_comms_module integer, intent(out) :: k_off integer, intent(in) :: kpart - ! 
Remote variables which will point to part_array - integer(integ), pointer, intent(out) :: nbnab_rem(:) - integer(integ), pointer, intent(out) :: ibseq_rem(:) - integer(integ), pointer, intent(out) :: ibind_rem(:) - integer(integ), pointer, intent(out) :: ib_nd_acc_rem(:) - integer(integ), pointer, intent(out) :: ibndimj_rem(:) - !!!! - integer(integ), intent(out) :: ibpart_rem(:) type(matrix_mult), intent(in) :: a_b_c real(double), intent(in) :: b(:) integer, allocatable, dimension(:), intent(inout) :: recv_part real(double), allocatable, intent(inout) :: b_rem(:) integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz + ! Array for remote variables to point to and indices to unpack it + integer, target, intent(out) :: part_array(3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) + integer, intent(out) :: n_cont, ilen2 + !!! + logical, intent(out) :: new_partition logical, intent(in), optional :: do_nonb integer, intent(out), optional :: request(2) - integer(integ), pointer :: npxyz_rem(:) - integer :: icall, ind_part, ipart, nnode, n_cont, ilen2, offset - logical :: new_partition - ! Array for remote variables to point to - integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) + integer :: icall, ind_part, ipart, nnode, offset logical :: do_nonb_local ! Set non-blocking receive flag @@ -579,8 +605,8 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem new_partition = .true. ! Check if this is a periodic image of the previous partition - if(kpart>1) then - if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then + if(kpart>2) then + if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-2)) then new_partition = .false. end if end if @@ -600,28 +626,7 @@ subroutine do_comms(k_off, kpart, nbnab_rem, ibseq_rem, ibind_rem, ib_nd_acc_rem call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,do_nonb,request) - ! Now point the _rem variables at the appropriate parts of - ! the array where we will receive the data - offset = 0 - nbnab_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibind_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ib_nd_acc_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibseq_rem => part_array(offset+1:offset+ilen2) - offset = offset+ilen2 - npxyz_rem => part_array(offset+1:offset+3*ilen2) - offset = offset+3*ilen2 - ibndimj_rem => part_array(offset+1:offset+ilen2) - if(offset+ilen2>3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('mat_mult: error pointing to part_array ',kpart) - end if - ! Create ibpart_rem - call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,& - ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) - end if + end if k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs end subroutine do_comms From 6be243ca60f7298eed62bcea89647a9a951bc5a5 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Fri, 15 Dec 2023 12:33:46 +0000 Subject: [PATCH 05/12] Fix bug with computation only being called with the wrong kpart, and for only kpart=[2,end-1]. Get rid of not needed variable icall of prefetch, in various places. Code now produces correct results for MPI nprocs=1 and nthreads=any, but deadlocks for more than 1 MPI procs. Separated the MPI_waits, but was not fixed. 
--- src/exx_kernel_default.f90 | 13 +++--- src/multiply_module.f90 | 85 +++++++++++++++++++++++++++++++------- src/multisiteSF_module.f90 | 11 ++--- 3 files changed, 80 insertions(+), 29 deletions(-) diff --git a/src/exx_kernel_default.f90 b/src/exx_kernel_default.f90 index c5bed2b8c..950b7798e 100644 --- a/src/exx_kernel_default.f90 +++ b/src/exx_kernel_default.f90 @@ -146,7 +146,7 @@ subroutine get_X_matrix( exxspin, level ) integer :: exxspin integer :: lab_const integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont,kpart_next,ind_partN,k_off + integer :: n_cont,kpart_next,ind_partN,k_off integer :: icall2,stat,ilen2,lenb_rem ! Remote variables to be allocated integer(integ),allocatable :: ibpart_rem(:) @@ -356,7 +356,7 @@ subroutine get_X_matrix( exxspin, level ) ! !$omp parallel default(none) & ! !$omp shared(a, b, c, a_b_c, myid, lena, lenc, tmr_std_allocation, & ! !$omp ncover_yz, ibpart_rem, atrans, usegemm) & - ! !$omp private(kpart, icall, ind_part, ipart, nnode, b_rem, & + ! !$omp private(kpart, ind_part, ipart, nnode, b_rem, & ! !$omp lenb_rem, n_cont, part_array, ilen2, offset, & ! !$omp nbnab_rem, ibind_rem, ib_nd_acc_rem, ibseq_rem, & ! !$omp npxyz_rem, ibndimj_rem, k_off, icall2) @@ -375,7 +375,6 @@ subroutine get_X_matrix( exxspin, level ) xyz_ghost = zero r_ghost = zero do kpart = 1,mult(S_X_SX)%ahalo%np_in_halo ! Main loop - icall=1 ind_part = mult(S_X_SX)%ahalo%lab_hcell(kpart) ! !print*, 'inode', inode,'kpart', kpart, ind_part @@ -385,9 +384,7 @@ subroutine get_X_matrix( exxspin, level ) end if ! if(kpart>1) then ! Is it a periodic image of the previous partition ? - if(ind_part.eq.mult(S_X_SX)%ahalo%lab_hcell(kpart-1)) then - icall=0 - else ! Get the data + if(ind_part.ne.mult(S_X_SX)%ahalo%lab_hcell(kpart-1)) then ! Get the data ipart = mult(S_X_SX)%parts%i_cc2seq(ind_part) nnode = mult(S_X_SX)%comms%neigh_node_list(kpart) recv_part(nnode) = recv_part(nnode)+1 @@ -401,7 +398,7 @@ subroutine get_X_matrix( exxspin, level ) ! allocate(b_rem(lenb_rem)) ! - call prefetch(kpart,mult(S_X_SX)%ahalo,mult(S_X_SX)%comms,mult(S_X_SX)%bmat,icall, & + call prefetch(kpart,mult(S_X_SX)%ahalo,mult(S_X_SX)%comms,mult(S_X_SX)%bmat, & n_cont,part_array,mult(S_X_SX)%bindex,b_rem,lenb_rem,mat_p(matK( exxspin ))%matrix, & myid,ilen2,mx_msg_per_part,mult(S_X_SX)%parts,mult(S_X_SX)%prim,mult(S_X_SX)%gcs,& (recv_part(nnode)-1)*2) @@ -447,7 +444,7 @@ subroutine get_X_matrix( exxspin, level ) call stop_timer(tmr_std_exx_allocat,.true.) ! ! - call prefetch(kpart,mult(S_X_SX)%ahalo,mult(S_X_SX)%comms,mult(S_X_SX)%bmat,icall, & + call prefetch(kpart,mult(S_X_SX)%ahalo,mult(S_X_SX)%comms,mult(S_X_SX)%bmat, & n_cont,part_array,mult(S_X_SX)%bindex,b_rem,lenb_rem,mat_p(matK( exxspin ))%matrix, & myid,ilen2,mx_msg_per_part,mult(S_X_SX)%parts,mult(S_X_SX)%prim,mult(S_X_SX)%gcs,& (recv_part(nnode)-1)*2) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 6089b157d..1d8670f9f 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -145,7 +145,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) real(double), allocatable, dimension(:) :: atrans integer :: lab_const integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont(2),kpart_next,ind_partN,k_off(2) + integer :: n_cont(2),kpart_next,ind_partN,k_off(2) integer :: stat,ilen2(2),lenb_rem(2) ! 
Remote variables to be allocated integer(integ),allocatable :: ibpart_rem(:,:) @@ -259,9 +259,16 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) ! Check that previous partition data have been received before starting computation - if (kpart.gt.2 .and. all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & - call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) - if(new_partition(index_wait)) then + ! if (kpart.gt.2 .and. all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & + ! call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) + if (kpart.gt.2) then + if (request(1,index_wait).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) + if (request(2,index_wait).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) + end if + + if(new_partition(index_wait)) then ! Now point the _rem variables at the appropriate parts of ! the array where we will receive the data offset = 0 @@ -294,13 +301,13 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! Call the computation kernel on the previous partition if(a_b_c%mult_type.eq.1) then ! C is full mult - call m_kern_max( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + call m_kern_max( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) else if(a_b_c%mult_type.eq.2) then ! A is partial mult - call m_kern_min( k_off(index_wait),kpart,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + call m_kern_min( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & @@ -308,7 +315,60 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) end if !$omp barrier end do main_loop + + ! Do the computation on the last partition + !$omp master + ! Check that the last partition data have been received before starting computation + index_wait = mod(kpart+1,2) + 1 +! if (all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & +! call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) + if (request(1,index_wait).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) + if (request(2,index_wait).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) + if(new_partition(index_wait)) then + ! Now point the _rem variables at the appropriate parts of + ! 
the array where we will receive the data + offset = 0 + nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + offset = offset+ilen2(index_wait) + npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) + offset = offset+3*ilen2(index_wait) + ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then + call cq_abort('mat_mult: error pointing to part_array ',kpart-1) + end if + ! Create ibpart_rem + call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & + ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& + ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) + end if + !$omp end master + !$omp barrier + ! Call the computation kernel on the last partition + if(a_b_c%mult_type.eq.1) then ! C is full mult + call m_kern_max( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & + ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & + a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) + else if(a_b_c%mult_type.eq.2) then ! A is partial mult + call m_kern_min( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & + nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & + ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & + a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) + end if + !$omp end parallel + call start_timer(tmr_std_allocation) if(allocated(b_rem(1)%values)) deallocate(b_rem(1)%values) if(allocated(b_rem(2)%values)) deallocate(b_rem(2)%values) @@ -591,7 +651,7 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part logical, intent(in), optional :: do_nonb integer, intent(out), optional :: request(2) - integer :: icall, ind_part, ipart, nnode, offset + integer :: ind_part, ipart, nnode, offset logical :: do_nonb_local ! Set non-blocking receive flag @@ -600,7 +660,6 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part if(.not.allocated(recv_part)) allocate(recv_part(0:a_b_c%comms%inode)) - icall=1 ind_part = a_b_c%ahalo%lab_hcell(kpart) new_partition = .true. @@ -623,7 +682,7 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) end if allocate(b_rem(lenb_rem)) - call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& + call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,do_nonb,request) end if @@ -656,7 +715,7 @@ end subroutine do_comms !! Adding tag for MPI compliance !! SOURCE !! 
- subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& + subroutine prefetch(this_part,ahalo,a_b_c,bmat,& n_cont,bind_rem,bind,b_rem,lenb_rem,b,myid,ilen2,mx_mpp, & parts,prim,gcs,tag,do_nonb,request) @@ -675,7 +734,7 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& type(primary_set) :: prim type(cover_set) :: gcs integer :: mx_mpp - integer :: this_part,icall,n_cont,myid,ilen2 + integer :: this_part,n_cont,myid,ilen2 type(matrix), dimension(:) :: bmat type(matrix_halo) :: ahalo type(comms_data) :: a_b_c @@ -700,14 +759,12 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,& inode = parts%i_cc2node(ind_part) nnode = a_b_c%neigh_node_list(this_part) if(inode.eq.myid+1) then ! If this is local, then copy - icall = 0 ncover_yz=gcs%ncovery*gcs%ncoverz ilen2 = bmat(ipart)%part_nabs call Mquest_get_local(ipart,& bind_rem,b_rem,lenb_rem,bind,bmat,& ind_part,b,myid) - end if - if(icall.eq.1) then ! Else fetch the data + else ! Else fetch the data ilen2 = a_b_c%ilen2rec(ipart,nnode) if(.not.do_nonb_local) then ! Use blocking receive call Mquest_get( prim%mx_ngonn, & diff --git a/src/multisiteSF_module.f90 b/src/multisiteSF_module.f90 index d2c9792c8..b8c4bb972 100644 --- a/src/multisiteSF_module.f90 +++ b/src/multisiteSF_module.f90 @@ -1238,7 +1238,7 @@ subroutine LFD_make_Subspace_halo(myid,b,lenb,bsub,len_bsub,label_kj_sub,len_kj_ ! This will be dynamically allocated/deallocated by the system integer :: lab_const integer :: ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode - integer :: icall,n_cont,kpart_next,ind_partN,k_off + integer :: n_cont,kpart_next,ind_partN,k_off integer :: stat,ilen2,lenb_rem ! Remote variables to be allocated integer(integ),allocatable :: ibpart_rem(:) @@ -1305,14 +1305,11 @@ subroutine LFD_make_Subspace_halo(myid,b,lenb,bsub,len_bsub,label_kj_sub,len_kj_ do kpart = 1,a_b_c%ahalo%np_in_halo ! Main loop. Loop for halo-partition kpart !write(io_lun,*) 'Part: ',kpart,myid - icall=1 ind_part = a_b_c%ahalo%lab_hcell(kpart) !write(io_lun,*) 'ind_part: ',ind_part if(kpart>1) then ! Is it a periodic image of the previous partition ? - if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then - icall=0 - else ! Get the data + if(ind_part.ne.a_b_c%ahalo%lab_hcell(kpart-1)) then ! 
Get the data !write(io_lun,*) myid,' seq: ',size(a_b_c%parts%i_cc2seq) ipart = a_b_c%parts%i_cc2seq(ind_part) !write(io_lun,*) myid,' Alloc b_rem part: ',ipart @@ -1328,7 +1325,7 @@ subroutine LFD_make_Subspace_halo(myid,b,lenb,bsub,len_bsub,label_kj_sub,len_kj_ lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) end if allocate(b_rem(lenb_rem)) - call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& + call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) !write(io_lun,*) 'b_rem: ',lenb_rem @@ -1372,7 +1369,7 @@ subroutine LFD_make_Subspace_halo(myid,b,lenb,bsub,len_bsub,label_kj_sub,len_kj_ call start_timer(tmr_std_allocation) allocate(b_rem(lenb_rem)) call stop_timer(tmr_std_allocation) - call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& + call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) lenb_rem = size(b_rem) From eca1b1f142ec2cd179577a8be30a274f6edaaeb0 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 20 Dec 2023 14:52:00 +0000 Subject: [PATCH 06/12] Solve deadlock: - recv_part should be stored in one sequential list throughout the loop, not two, since it is used for the MPI_Irecv tags that have to match the MPI_Issend ones. - the periodicity check has to check the previous partition (kpart-1), not kpart-2. Adapt a bunch of stuff accordingly. Simplify the way the computation is taken care of for the last partition (now inside the loop). --- src/multiply_module.f90 | 150 +++++++++++++--------------------------- 1 file changed, 48 insertions(+), 102 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 1d8670f9f..8ab580b12 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -170,10 +170,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer, dimension(:), allocatable :: nreqs integer :: sends,i,j integer, dimension(MPI_STATUS_SIZE) :: mpi_stat - type jagged_array_i - integer, allocatable :: values(:) - end type jagged_array_i - type(jagged_array_i) :: recv_part(2) + integer, allocatable :: recv_part(:) real(double) :: t0,t1 integer :: request(2,2), index_rec, index_wait @@ -189,11 +186,9 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) !allocate(atrans(a_b_c%amat(1)%length),STAT=stat) allocate(atrans(lena),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating atrans') - allocate(recv_part(1)%values(0:a_b_c%comms%inode),STAT=stat) - allocate(recv_part(2)%values(0:a_b_c%comms%inode),STAT=stat) + allocate(recv_part(0:a_b_c%comms%inode),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating recv_part') - recv_part(1)%values = zero - recv_part(2)%values = zero + recv_part = zero call stop_timer(tmr_std_allocation) !write(io_lun,*) 'Sizes: ',a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim2*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,& ! a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim1* & @@ -227,12 +222,12 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz ! 
Receive the data from the first partition - blocking - call do_comms(k_off(2), 1, part_array(:,2), n_cont(2), ilen2(2), a_b_c, b, recv_part(2)%values, & + call do_comms(k_off(2), 1, part_array(:,2), n_cont(2), ilen2(2), a_b_c, b, recv_part, & b_rem(2)%values, lenb_rem(2), myid, ncover_yz, new_partition(2)) request = MPI_REQUEST_NULL !$omp parallel default(shared) - main_loop: do kpart = 2,a_b_c%ahalo%np_in_halo + main_loop: do kpart = 2,a_b_c%ahalo%np_in_halo+1 ! The following include MPI operations, so they have to be on the master thread only. !$omp master @@ -253,46 +248,53 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) index_rec = mod(kpart,2) + 1 index_wait = mod(kpart+1,2) + 1 - ! Receive the data from the current partition - non-blocking - call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & - a_b_c, b, recv_part(index_rec)%values, b_rem(index_rec)%values, & - lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) - ! Check that previous partition data have been received before starting computation - ! if (kpart.gt.2 .and. all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & - ! call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) if (kpart.gt.2) then if (request(1,index_wait).ne.MPI_REQUEST_NULL) & call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) if (request(2,index_wait).ne.MPI_REQUEST_NULL) & call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) - end if - - if(new_partition(index_wait)) then - ! Now point the _rem variables at the appropriate parts of - ! the array where we will receive the data - offset = 0 - nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - offset = offset+ilen2(index_wait) - npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) - offset = offset+3*ilen2(index_wait) - ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('mat_mult: error pointing to part_array ',kpart-1) - end if - ! Create ibpart_rem - call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & - ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& - ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) end if + ! If that previous partition was a periodic one, copy over arrays from previous index + if(.not.new_partition(index_wait)) then + part_array(:,index_wait) = part_array(:,index_rec) + n_cont(index_wait) = n_cont(index_rec) + ilen2(index_wait) = ilen2(index_rec) + b_rem(index_wait) = b_rem(index_rec) + lenb_rem(index_wait) = lenb_rem(index_rec) + end if + + ! Now point the _rem variables at the appropriate parts of + ! 
the array where we have received the data + offset = 0 + nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) + offset = offset+n_cont(index_wait) + ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + offset = offset+ilen2(index_wait) + npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) + offset = offset+3*ilen2(index_wait) + ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) + if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & + 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then + call cq_abort('mat_mult: error pointing to part_array ',kpart-1) + end if + ! Create ibpart_rem + call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & + ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& + ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) + + ! Receive the data from the current partition - non-blocking + if (kpart.lt.a_b_c%ahalo%np_in_halo+1) then + call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & + a_b_c, b, recv_part, b_rem(index_rec)%values, & + lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) + end if + ! Omp master doesn't include an implicit barrier. We want master ! to be finished with comms before calling the multiply kernels ! hence the explicit barrier @@ -314,59 +316,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) end if !$omp barrier - end do main_loop - - ! Do the computation on the last partition - !$omp master - ! Check that the last partition data have been received before starting computation - index_wait = mod(kpart+1,2) + 1 -! if (all(request(:,index_wait).ne.[MPI_REQUEST_NULL,MPI_REQUEST_NULL])) & -! call MPI_Waitall(2,request(:,index_wait),MPI_STATUSES_IGNORE,ierr) - if (request(1,index_wait).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) - if (request(2,index_wait).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) - if(new_partition(index_wait)) then - ! Now point the _rem variables at the appropriate parts of - ! 
the array where we will receive the data - offset = 0 - nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - offset = offset+ilen2(index_wait) - npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) - offset = offset+3*ilen2(index_wait) - ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('mat_mult: error pointing to part_array ',kpart-1) - end if - ! Create ibpart_rem - call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & - ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& - ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) - end if - !$omp end master - !$omp barrier - ! Call the computation kernel on the last partition - if(a_b_c%mult_type.eq.1) then ! C is full mult - call m_kern_max( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & - nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & - ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & - a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) - else if(a_b_c%mult_type.eq.2) then ! A is partial mult - call m_kern_min( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & - nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & - ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & - a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) - end if - + end do main_loop !$omp end parallel call start_timer(tmr_std_allocation) @@ -405,9 +355,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') - deallocate(recv_part(1)%values,STAT=stat) - if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') - deallocate(recv_part(2)%values,STAT=stat) + deallocate(recv_part,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) call my_barrier @@ -658,14 +606,12 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part do_nonb_local = .false. if (present(do_nonb)) do_nonb_local = do_nonb - if(.not.allocated(recv_part)) allocate(recv_part(0:a_b_c%comms%inode)) - ind_part = a_b_c%ahalo%lab_hcell(kpart) new_partition = .true. ! Check if this is a periodic image of the previous partition - if(kpart>2) then - if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-2)) then + if(kpart>1) then + if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then new_partition = .false. 
end if end if From bd1ef120e91ba715aed6a7227ba61c56944a3b83 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 20 Dec 2023 15:26:52 +0000 Subject: [PATCH 07/12] Some clean up of variables from PR 295 --- src/multiply_module.f90 | 111 +++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 58 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 8ab580b12..f6e250e45 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -148,21 +148,18 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer :: n_cont(2),kpart_next,ind_partN,k_off(2) integer :: stat,ilen2(2),lenb_rem(2) ! Remote variables to be allocated - integer(integ),allocatable :: ibpart_rem(:,:) + integer(integ),allocatable :: ibpart_rem(:) type jagged_array_r real(double), allocatable :: values(:) end type jagged_array_r type(jagged_array_r) :: b_rem(2) ! Remote variables which will point to part_array - type jagged_pointer_array_i - integer(integ),pointer :: values(:) - end type jagged_pointer_array_i - type(jagged_pointer_array_i) :: nbnab_rem(2) - type(jagged_pointer_array_i) :: ibseq_rem(2) - type(jagged_pointer_array_i) :: ibind_rem(2) - type(jagged_pointer_array_i) :: ib_nd_acc_rem(2) - type(jagged_pointer_array_i) :: ibndimj_rem(2) - type(jagged_pointer_array_i) :: npxyz_rem(2) + integer(integ), dimension(:), pointer :: nbnab_rem + integer(integ), dimension(:), pointer :: ibseq_rem + integer(integ), dimension(:), pointer :: ibind_rem + integer(integ), dimension(:), pointer :: ib_nd_acc_rem + integer(integ), dimension(:), pointer :: ibndimj_rem + integer(integ), dimension(:), pointer :: npxyz_rem ! Arrays for remote variables to point to integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ & 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs, 2) @@ -172,7 +169,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer, dimension(MPI_STATUS_SIZE) :: mpi_stat integer, allocatable :: recv_part(:) real(double) :: t0,t1 - integer :: request(2,2), index_rec, index_wait + integer :: request(2,2), index_rec, index_comp logical :: new_partition(2) @@ -181,9 +178,8 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) if(iprint_mat>3.AND.myid==0) t0 = mtime() ! Allocate memory for the elements - allocate(ibpart_rem(a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,2),STAT=stat) + allocate(ibpart_rem(a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating ibpart_rem') - !allocate(atrans(a_b_c%amat(1)%length),STAT=stat) allocate(atrans(lena),STAT=stat) if(stat/=0) call cq_abort('mat_mult: error allocating atrans') allocate(recv_part(0:a_b_c%comms%inode),STAT=stat) @@ -234,59 +230,58 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! These indices point to elements of all the 2-element vectors of the variables needed ! for the do_comms and m_kern_min/max calls. They alternate between the values of - ! (index_rec,index_wait)=(1,2) and (2,1) from iteration to iteration. + ! (index_rec,index_comp)=(1,2) and (2,1) from iteration to iteration. ! index_rec points to the values being received in the current iteration in do_comms, - ! and index_wait points to the values received in the previous iteration, thus computation + ! and index_comp points to the values received in the previous iteration, thus computation ! can start on them in m_kern_min/max ! These indices are also used to point to elements of the 2x2-element request() array, ! 
that contains the MPI request numbers for the non-blocking data receives. There are 2 ! MPI_Irecv calls per call of do_comms, and request() keeps track of 2 sets of those calls, ! thus it's of size 2x2. ! request(:,index_rec) points to the requests from the current iteration MPI_Irecv, - ! and request(:,index_wait) points to the requests from the previous iteration, that have + ! and request(:,index_comp) points to the requests from the previous iteration, that have ! to complete in order for the computation to start (thus the MPI_Wait). index_rec = mod(kpart,2) + 1 - index_wait = mod(kpart+1,2) + 1 + index_comp = mod(kpart+1,2) + 1 ! Check that previous partition data have been received before starting computation if (kpart.gt.2) then - if (request(1,index_wait).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(1,index_wait),MPI_STATUSES_IGNORE,ierr) - if (request(2,index_wait).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(2,index_wait),MPI_STATUSES_IGNORE,ierr) + if (request(1,index_comp).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(1,index_comp),MPI_STATUSES_IGNORE,ierr) + if (request(2,index_comp).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(2,index_comp),MPI_STATUSES_IGNORE,ierr) end if ! If that previous partition was a periodic one, copy over arrays from previous index - if(.not.new_partition(index_wait)) then - part_array(:,index_wait) = part_array(:,index_rec) - n_cont(index_wait) = n_cont(index_rec) - ilen2(index_wait) = ilen2(index_rec) - b_rem(index_wait) = b_rem(index_rec) - lenb_rem(index_wait) = lenb_rem(index_rec) + if(.not.new_partition(index_comp)) then + part_array(:,index_comp) = part_array(:,index_rec) + n_cont(index_comp) = n_cont(index_rec) + ilen2(index_comp) = ilen2(index_rec) + b_rem(index_comp) = b_rem(index_rec) + lenb_rem(index_comp) = lenb_rem(index_rec) end if ! Now point the _rem variables at the appropriate parts of ! the array where we have received the data offset = 0 - nbnab_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibind_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ib_nd_acc_rem(index_wait)%values => part_array(offset+1:offset+n_cont(index_wait),index_wait) - offset = offset+n_cont(index_wait) - ibseq_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - offset = offset+ilen2(index_wait) - npxyz_rem(index_wait)%values => part_array(offset+1:offset+3*ilen2(index_wait),index_wait) - offset = offset+3*ilen2(index_wait) - ibndimj_rem(index_wait)%values => part_array(offset+1:offset+ilen2(index_wait),index_wait) - if(offset+ilen2(index_wait)>3*a_b_c%parts%mx_mem_grp+ & + nbnab_rem => part_array(offset+1:offset+n_cont(index_comp),index_comp) + offset = offset+n_cont(index_comp) + ibind_rem => part_array(offset+1:offset+n_cont(index_comp),index_comp) + offset = offset+n_cont(index_comp) + ib_nd_acc_rem => part_array(offset+1:offset+n_cont(index_comp),index_comp) + offset = offset+n_cont(index_comp) + ibseq_rem => part_array(offset+1:offset+ilen2(index_comp),index_comp) + offset = offset+ilen2(index_comp) + npxyz_rem => part_array(offset+1:offset+3*ilen2(index_comp),index_comp) + offset = offset+3*ilen2(index_comp) + ibndimj_rem => part_array(offset+1:offset+ilen2(index_comp),index_comp) + if(offset+ilen2(index_comp)>3*a_b_c%parts%mx_mem_grp+ & 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then call cq_abort('mat_mult: error pointing to part_array ',kpart-1) end if ! 
Create ibpart_rem - call end_part_comms(myid,n_cont(index_wait),nbnab_rem(index_wait)%values, & - ibind_rem(index_wait)%values,npxyz_rem(index_wait)%values,& - ibpart_rem(:,index_wait),ncover_yz,a_b_c%gcs%ncoverz) + call end_part_comms(myid,n_cont(index_comp),nbnab_rem, & + ibind_rem,npxyz_rem,ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) ! Receive the data from the current partition - non-blocking if (kpart.lt.a_b_c%ahalo%np_in_halo+1) then @@ -303,17 +298,17 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! Call the computation kernel on the previous partition if(a_b_c%mult_type.eq.1) then ! C is full mult - call m_kern_max( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & - nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & - ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + call m_kern_max( k_off(index_comp),kpart-1,ib_nd_acc_rem, ibind_rem, & + nbnab_rem,ibpart_rem,ibseq_rem, & + ibndimj_rem, atrans,b_rem(index_comp)%values,c,a_b_c%ahalo,a_b_c%chalo, & a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) + a_b_c%prim%mx_iprim, lena, lenb_rem(index_comp), lenc) else if(a_b_c%mult_type.eq.2) then ! A is partial mult - call m_kern_min( k_off(index_wait),kpart-1,ib_nd_acc_rem(index_wait)%values, ibind_rem(index_wait)%values, & - nbnab_rem(index_wait)%values,ibpart_rem(:,index_wait),ibseq_rem(index_wait)%values, & - ibndimj_rem(index_wait)%values, atrans,b_rem(index_wait)%values,c,a_b_c%ahalo,a_b_c%chalo, & + call m_kern_min( k_off(index_comp),kpart-1,ib_nd_acc_rem, ibind_rem, & + nbnab_rem,ibpart_rem,ibseq_rem, & + ibndimj_rem, atrans,b_rem(index_comp)%values,c,a_b_c%ahalo,a_b_c%chalo, & a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & - a_b_c%prim%mx_iprim, lena, lenb_rem(index_wait), lenc) + a_b_c%prim%mx_iprim, lena, lenb_rem(index_comp), lenc) end if !$omp barrier end do main_loop @@ -586,7 +581,7 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part integer, intent(in) :: kpart type(matrix_mult), intent(in) :: a_b_c real(double), intent(in) :: b(:) - integer, allocatable, dimension(:), intent(inout) :: recv_part + integer, dimension(:), intent(inout) :: recv_part real(double), allocatable, intent(inout) :: b_rem(:) integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz @@ -712,23 +707,23 @@ subroutine prefetch(this_part,ahalo,a_b_c,bmat,& ind_part,b,myid) else ! Else fetch the data ilen2 = a_b_c%ilen2rec(ipart,nnode) - if(.not.do_nonb_local) then ! Use blocking receive - call Mquest_get( prim%mx_ngonn, & + if(do_nonb_local) then ! Use non-blocking receive + if (.not.present(request)) call cq_abort('Need to provide MPI request argument for non-blocking receive.') + call Mquest_get_nonb( prim%mx_ngonn, & a_b_c%ilen2rec(ipart,nnode),& a_b_c%ilen3rec(ipart,nnode),& n_cont,inode,ipart,myid,& bind_rem,b_rem,lenb_rem,bind,& a_b_c%istart(ipart,nnode), & - bmat(1)%mx_abs,parts%mx_mem_grp,tag) - else ! Use non-blocking receive - if (.not.present(request)) call cq_abort('Need to provide MPI request argument for non-blocking receive.') - call Mquest_get_nonb( prim%mx_ngonn, & + bmat(1)%mx_abs,parts%mx_mem_grp,tag,request) + else ! 
Use blocking receive + call Mquest_get( prim%mx_ngonn, & a_b_c%ilen2rec(ipart,nnode),& a_b_c%ilen3rec(ipart,nnode),& n_cont,inode,ipart,myid,& bind_rem,b_rem,lenb_rem,bind,& a_b_c%istart(ipart,nnode), & - bmat(1)%mx_abs,parts%mx_mem_grp,tag,request) + bmat(1)%mx_abs,parts%mx_mem_grp,tag) end if end if return From 26b2585a42ccfa19c53653d453c4ae7e52f0a136 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 20 Dec 2023 15:43:11 +0000 Subject: [PATCH 08/12] Attempt to fix failing test in CI for "double free or corruption" - passes locally though.... --- src/multiply_module.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index f6e250e45..fa1948bdf 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -350,7 +350,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') - deallocate(recv_part,STAT=stat) + if(allocated(recv_part)) deallocate(recv_part,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) call my_barrier From a0b6b2e125ac2a8d2d5f0a0c005d5aa08d950840 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 20 Dec 2023 16:56:39 +0000 Subject: [PATCH 09/12] Further attempt to fix failing CI test. Using \"-check bounds\" when compiling, showed that the index of recv_part did not start at 0 when it was not an allocatable in a function that used it. --- src/multiply_module.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index fa1948bdf..7d5fb0c93 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -350,7 +350,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') - if(allocated(recv_part)) deallocate(recv_part,STAT=stat) + deallocate(recv_part,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) call my_barrier @@ -581,7 +581,7 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part integer, intent(in) :: kpart type(matrix_mult), intent(in) :: a_b_c real(double), intent(in) :: b(:) - integer, dimension(:), intent(inout) :: recv_part + integer, allocatable, dimension(:), intent(inout) :: recv_part real(double), allocatable, intent(inout) :: b_rem(:) integer, intent(out) :: lenb_rem integer, intent(in) :: myid, ncover_yz From 4d77876157d134bd750c2f2a97e0cc203ad332cc Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 10 Jan 2024 11:20:11 +0000 Subject: [PATCH 10/12] Update makefile for kathleen. 
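
A note related to PATCH 09 above: recv_part is allocated in mat_mult with a lower
bound of 0, but when it is passed to an assumed-shape dummy declared recv_part(:),
Fortran remaps the dummy's bounds to start at 1, so its 0-based indexing no longer
lines up inside that routine (which is what "-check bounds" flagged). An allocatable
dummy, by contrast, inherits the actual argument's bounds, which is why do_comms
declares it allocatable again. A minimal stand-alone sketch of the behaviour
(illustrative only, not Conquest code):

  program bounds_demo
    implicit none
    integer, allocatable :: recv_part(:)
    allocate(recv_part(0:4))
    recv_part = 0
    call assumed_shape(recv_part)   ! dummy is remapped to bounds 1:5
    call alloc_dummy(recv_part)     ! dummy keeps bounds 0:4
  contains
    subroutine assumed_shape(a)
      integer, intent(in) :: a(:)
      write(*,*) 'assumed-shape bounds:    ', lbound(a,1), ubound(a,1)
    end subroutine assumed_shape
    subroutine alloc_dummy(a)
      integer, allocatable, intent(in) :: a(:)
      write(*,*) 'allocatable dummy bounds:', lbound(a,1), ubound(a,1)
    end subroutine alloc_dummy
  end program bounds_demo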
--- src/system/system.kathleen.ucl.ac.uk.make | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/system/system.kathleen.ucl.ac.uk.make b/src/system/system.kathleen.ucl.ac.uk.make index ae8a383a0..e9e093596 100644 --- a/src/system/system.kathleen.ucl.ac.uk.make +++ b/src/system/system.kathleen.ucl.ac.uk.make @@ -26,16 +26,16 @@ COMPFLAGS_F77= $(COMPFLAGS) # LibXC: choose between LibXC compatibility below or Conquest XC library # Conquest XC library -XC_LIBRARY = CQ -XC_LIB = -XC_COMPFLAGS = +#XC_LIBRARY = CQ +#XC_LIB = +#XC_COMPFLAGS = # LibXC compatibility # Choose LibXC version: v4 (deprecated) or v5/6 (v5 and v6 have the same interface) -# XC_LIBRARY = LibXC_v4 +XC_LIBRARY = LibXC_v4 #XC_LIBRARY = LibXC_v5 -#XC_LIB = -lxcf90 -lxc -#XC_COMPFLAGS = -I/usr/local/include +XC_LIB = -L/shared/ucl/apps/libxc/4.2.3/intel-2018/lib -lxcf90 -lxc +XC_COMPFLAGS = -I/shared/ucl/apps/libxc/4.2.3/intel-2018/include # Set FFT library FFT_LIB=-lmkl_rt From c72017285852d1c3552bbdef7f95d9a210cf86a6 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 24 Jan 2024 11:58:28 +0000 Subject: [PATCH 11/12] Add .gitignore and needed housekeeping, to make benchmark folders manageable. --- benchmarks/.gitignore | 12 ++++++++++++ ...22_G200_8proc => Conquest_output_K222_G200_8proc} | 0 benchmarks/water_64mols/Conquest_input | 2 +- .../water_64mols/{H2O_coord.in => H2O_coord.dat} | 0 4 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 benchmarks/.gitignore rename benchmarks/K222_G200/{output_K222_G200_8proc => Conquest_output_K222_G200_8proc} (100%) rename benchmarks/water_64mols/{H2O_coord.in => H2O_coord.dat} (100%) diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 000000000..a5a7a0c06 --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,12 @@ +# Ignore everything +* +# Don't ignore directories, so we can recurse into them +!*/ +# Don't ignore .gitignore and specific files in all subdirectories +!.gitignore +!*/Conquest_input +!*/Conquest_output* +!*/README.md +!*/*.ion +!*/*.dat +!*/*.xtl diff --git a/benchmarks/K222_G200/output_K222_G200_8proc b/benchmarks/K222_G200/Conquest_output_K222_G200_8proc similarity index 100% rename from benchmarks/K222_G200/output_K222_G200_8proc rename to benchmarks/K222_G200/Conquest_output_K222_G200_8proc diff --git a/benchmarks/water_64mols/Conquest_input b/benchmarks/water_64mols/Conquest_input index 9d456a900..353816b1b 100644 --- a/benchmarks/water_64mols/Conquest_input +++ b/benchmarks/water_64mols/Conquest_input @@ -1,5 +1,5 @@ IO.Title Water static test, DZ, GridCutoff=50Ha -IO.Coordinates H2O_coord.in +IO.Coordinates H2O_coord.dat IO.FractionalAtomicCoords F IO.Iprint 1 IO.WriteOutToFile F diff --git a/benchmarks/water_64mols/H2O_coord.in b/benchmarks/water_64mols/H2O_coord.dat similarity index 100% rename from benchmarks/water_64mols/H2O_coord.in rename to benchmarks/water_64mols/H2O_coord.dat From 6ab122feba1ee4bf006e8f0bc4f440c5c211e36c Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Wed, 13 Mar 2024 21:33:04 +0000 Subject: [PATCH 12/12] Remove unneeded barriers. 
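
Besides removing the MPI_Barrier at the end of Mquest_start_send, several my_barrier
calls after the main loop and the trailing !$omp barrier, this patch posts the
non-blocking receive for partition kpart before waiting on the requests for partition
kpart-1, so the next receive is already in flight while the previous partition is
being multiplied. A stand-alone sketch of that double-buffered pattern (illustrative
only: the routine name, MPI_ANY_SOURCE and the integer tags are placeholders, not
Conquest's actual communication scheme):

  subroutine overlap_sketch(npart, maxlen, comm)
    use mpi
    implicit none
    integer, intent(in) :: npart, maxlen, comm
    double precision :: buf(maxlen,2)
    integer :: req(2), kpart, irec, icomp, ierr
    req = MPI_REQUEST_NULL
    ! Partition 1 would be received (blocking) into buf(:,2) before the loop.
    do kpart = 2, npart+1
       irec  = mod(kpart,2)   + 1   ! buffer being filled this iteration
       icomp = mod(kpart+1,2) + 1   ! buffer holding partition kpart-1
       if (kpart <= npart) then     ! prefetch the next partition
          call MPI_Irecv(buf(:,irec), maxlen, MPI_DOUBLE_PRECISION, &
                         MPI_ANY_SOURCE, kpart, comm, req(irec), ierr)
       end if
       if (req(icomp) /= MPI_REQUEST_NULL) &  ! partition kpart-1 must have arrived
            call MPI_Wait(req(icomp), MPI_STATUS_IGNORE, ierr)
       ! ... multiply kernel acts on buf(:,icomp) for partition kpart-1 ...
    end do
  end subroutine overlap_sketch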
--- src/comms_module.f90 | 2 -- src/generic_comms.f90 | 4 ++++ src/multiply_module.f90 | 47 +++++++++++++++++++---------------------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/comms_module.f90 b/src/comms_module.f90 index 59237adae..e97ae6d2e 100644 --- a/src/comms_module.f90 +++ b/src/comms_module.f90 @@ -142,8 +142,6 @@ subroutine Mquest_start_send(a_b_c,b,nreq,myid,mx_nponn,sends) endif enddo ! Partitions to send enddo ! Nodes to send to - ! Synchronise with other nodes - call MPI_Barrier(MPI_COMM_WORLD,ierr) return end subroutine Mquest_start_send !!*** diff --git a/src/generic_comms.f90 b/src/generic_comms.f90 index 245d63eed..bd934aaf0 100644 --- a/src/generic_comms.f90 +++ b/src/generic_comms.f90 @@ -287,13 +287,17 @@ subroutine init_comms(myid,number_of_procs) ! Local variables integer :: ierr + character(len=MPI_MAX_PROCESSOR_NAME) :: pname + integer :: rlen call MPI_INIT(ierr) if(ierr.ne.0) write(io_lun,*) 'ierr is ',ierr call MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr ) call MPI_COMM_SIZE( MPI_COMM_WORLD, number_of_procs, ierr ) + call MPI_GET_PROCESSOR_NAME(pname, rlen, ierr) inode = myid+1 ionode = 1 + write(io_lun,*) 'Process ',myid,' is running on node ',pname if(inode==ionode) open(unit=warning_lun, file='Conquest_warnings') call mtmini() return diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 7d5fb0c93..08ad45fec 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -244,15 +244,8 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) index_rec = mod(kpart,2) + 1 index_comp = mod(kpart+1,2) + 1 - ! Check that previous partition data have been received before starting computation - if (kpart.gt.2) then - if (request(1,index_comp).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(1,index_comp),MPI_STATUSES_IGNORE,ierr) - if (request(2,index_comp).ne.MPI_REQUEST_NULL) & - call MPI_Wait(request(2,index_comp),MPI_STATUSES_IGNORE,ierr) - end if - - ! If that previous partition was a periodic one, copy over arrays from previous index + ! If the previous partition was a periodic one, copy over arrays from previous index. + ! No need to wait for communication to finish. if(.not.new_partition(index_comp)) then part_array(:,index_comp) = part_array(:,index_rec) n_cont(index_comp) = n_cont(index_rec) @@ -261,6 +254,23 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) lenb_rem(index_comp) = lenb_rem(index_rec) end if + ! Receive the data from the current partition - non-blocking + if (kpart.lt.a_b_c%ahalo%np_in_halo+1) then + call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & + a_b_c, b, recv_part, b_rem(index_rec)%values, & + lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) + end if + + ! Check that previous partition data have been received before starting computation + if (kpart.gt.2) then + if (request(1,index_comp).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(1,index_comp),MPI_STATUSES_IGNORE,ierr) + if (request(2,index_comp).ne.MPI_REQUEST_NULL) & + call MPI_Wait(request(2,index_comp),MPI_STATUSES_IGNORE,ierr) + if (request(1,index_comp).ne.MPI_REQUEST_NULL .or. request(2,index_comp).ne.MPI_REQUEST_NULL) & + call cq_abort('mat_mult: error freeing MPI_request after MPI_Wait') + end if + ! Now point the _rem variables at the appropriate parts of ! 
the array where we have received the data offset = 0 @@ -283,13 +293,6 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call end_part_comms(myid,n_cont(index_comp),nbnab_rem, & ibind_rem,npxyz_rem,ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) - ! Receive the data from the current partition - non-blocking - if (kpart.lt.a_b_c%ahalo%np_in_halo+1) then - call do_comms(k_off(index_rec), kpart, part_array(:,index_rec), n_cont(index_rec), ilen2(index_rec), & - a_b_c, b, recv_part, b_rem(index_rec)%values, & - lenb_rem(index_rec), myid, ncover_yz, new_partition(index_rec), .true., request(:,index_rec)) - end if - ! Omp master doesn't include an implicit barrier. We want master ! to be finished with comms before calling the multiply kernels ! hence the explicit barrier @@ -310,7 +313,6 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) a_b_c%ltrans,a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & a_b_c%prim%mx_iprim, lena, lenb_rem(index_comp), lenc) end if - !$omp barrier end do main_loop !$omp end parallel @@ -330,30 +332,25 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) !write(io_lun,*) 'Send done ',i,myid end do end if - call my_barrier call start_timer(tmr_std_allocation) deallocate(nreqs,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating nreqs') call stop_timer(tmr_std_allocation) - call my_barrier ! --- for type 2, make backward local transpose of A-matrix ----------- if(a_b_c%mult_type.eq.2) then invdir=1 call loc_trans( a_b_c%ltrans, a_b_c%ahalo,a,lena,atrans,lena,invdir) end if - call my_barrier call start_timer(tmr_std_allocation) deallocate(atrans,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating atrans') call stop_timer(tmr_std_allocation) - call my_barrier call start_timer(tmr_std_allocation) deallocate(ibpart_rem,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem') deallocate(recv_part,STAT=stat) if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part') call stop_timer(tmr_std_allocation) - call my_barrier !deallocate(b_rem,STAT=stat) !if(stat/=0) call cq_abort('mat_mult: error deallocating b_rem') !call my_barrier @@ -627,9 +624,9 @@ subroutine do_comms(k_off, kpart, part_array, n_cont, ilen2, a_b_c, b, recv_part n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2,do_nonb,request) end if - - k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs - end subroutine do_comms + + k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs + end subroutine do_comms !!****f* multiply_module/prefetch * !!