Skip to content

Commit

Permalink
Improvements and bug fixes.
Browse files Browse the repository at this point in the history
- Use do concurent
- Update fpm.rsp
  • Loading branch information
gha3mi committed Jan 30, 2024
1 parent b52a99c commit e46256d
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 28 deletions.
4 changes: 2 additions & 2 deletions fpm.rsp
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ options --flag "-O3 -mtune=native -xHost -qmkl -qopenmp -ipo -coarray -coarray-n
options test
options --compiler ifx
options --profile release
options --flag "-O3 -mtune=native -xHost -qmkl -qopenmp"
options --flag "-O3 -mtune=native -xHost -qmkl -qopenmp -DUSE_DO_CONCURRENT"

@ifx-test-coarray
options test
options --compiler ifx
options --profile release
options --flag "-O3 -mtune=native -xHost -qmkl -qopenmp -coarray -coarray-num-images=4 -DUSE_COARRAY"
options --flag "-O3 -mtune=native -xHost -qmkl -qopenmp -DUSE_DO_CONCURRENT -coarray -coarray-num-images=4 -DUSE_COARRAY"

@nvfortran-test
options test
Expand Down
160 changes: 134 additions & 26 deletions src/formatmul.f90
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
call compute_block_ranges(size(B,2), nimg, block_size, start_elem, end_elem)
allocate(B_block(n, block_size(im))[*], C_block(m, block_size(im))[*])
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A, B_block(:,:)[im], transA, transB, option)
C_block(:,:)[im] = matmul(A, B_block(:,:)[im], transA=.false., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -64,7 +64,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], C_block(block_size(im), size(B,1))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:,:)[im], B, transA, transB, option)
C_block(:,:)[im] = matmul(A_block(:,:)[im], B, transA=.true., transB=.true., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -81,7 +81,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], C_block(block_size(im), size(B,2))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:, :)[im], B, transA, transB, option)
C_block(:,:)[im] = matmul(A_block(:, :)[im], B, transA=.true., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -100,7 +100,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
allocate(C_block(m, size(B,1))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:,:)[im], B_block(:,:)[im], transA, transB, option)
C_block(:,:)[im] = matmul(A_block(:,:)[im], B_block(:,:)[im], transA=.false., transB=.true., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -120,7 +120,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], C_block(block_size(im), size(B,2))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:, :)[im], B, transA, transB, option)
C_block(:,:)[im] = matmul(A_block(:, :)[im], B, transA=.true., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -139,7 +139,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
allocate(C_block(m, size(B,1))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:,:)[im], B_block(:,:)[im], transA, transB, option)
C_block(:,:)[im] = matmul(A_block(:,:)[im], B_block(:,:)[im], transA=.false., transB=.true., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -160,7 +160,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
allocate(C_block(m, size(B,1))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:,:)[im], B_block(:,:)[im], transA, transB, option)
C_block(:,:)[im] = matmul(A_block(:,:)[im], B_block(:,:)[im], transA=.false., transB=.true., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -177,7 +177,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], C_block(block_size(im), size(B,2))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:, :)[im], B, transA, transB, option)
C_block(:,:)[im] = matmul(A_block(:, :)[im], B, transA=.true., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -196,7 +196,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
call compute_block_ranges(size(B,2), nimg, block_size, start_elem, end_elem)
allocate(B_block(n, block_size(im))[*], C_block(m, block_size(im))[*])
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A, B_block(:,:)[im], transA, transB, option)
C_block(:,:)[im] = matmul(A, B_block(:,:)[im], transA=.false., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -206,7 +206,7 @@ impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) resul
end if

#else
C = matmul(A, B, transA, transB, option)
C = matmul(A, B, transA=transA, transB=transB, option=option)
#endif

end function mat_mat_coarray_rel
Expand Down Expand Up @@ -235,7 +235,7 @@ impure function mat_vec_coarray_rel(A, v, transA, option, coarray) result(w)
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(w_block(block_size(im))[*], A_block(size(A,1), block_size(im))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
w_block(:)[im] = matmul(A_block(:, :)[im], v, transA, option)
w_block(:)[im] = matmul(A_block(:, :)[im], v, transA=.true., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -251,7 +251,7 @@ impure function mat_vec_coarray_rel(A, v, transA, option, coarray) result(w)
allocate(w_block(size(A,1))[*], v_block(block_size(im))[*], A_block(size(A,1), block_size(im))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
v_block(:)[im] = v(start_elem(im):end_elem(im))
w_block(:)[im] = matmul(A_block(:,:)[im], v_block(:)[im], transA, option)
w_block(:)[im] = matmul(A_block(:,:)[im], v_block(:)[im], transA=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -268,7 +268,7 @@ impure function mat_vec_coarray_rel(A, v, transA, option, coarray) result(w)
allocate(w_block(size(A,1))[*], v_block(block_size(im))[*], A_block(size(A,1), block_size(im))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
v_block(:)[im] = v(start_elem(im):end_elem(im))
w_block(:)[im] = matmul(A_block(:,:)[im], v_block(:)[im], transA, option)
w_block(:)[im] = matmul(A_block(:,:)[im], v_block(:)[im], transA=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
Expand All @@ -278,7 +278,7 @@ impure function mat_vec_coarray_rel(A, v, transA, option, coarray) result(w)
end if

#else
w = matmul(A, v, transA, option)
w = matmul(A, v, transA=transA, option=option)
#endif

end function mat_vec_coarray_rel
Expand Down Expand Up @@ -323,99 +323,180 @@ pure function mat_mat_block_rel(a, b, transA, transB, option, nblock) result(c)
! AB
allocate(C(size(A,1), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(B,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:C)
se = start_elem(ib)
ee = end_elem(ib)
C(:, se:ee) = &
C(:, se:ee) + matmul(A, B(:,se:ee), transA=.false., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, se:ee) = &
C(:, se:ee) + matmul(A, B(:,se:ee), transA, transB, option)
C(:, se:ee) + matmul(A, B(:,se:ee), transA=.false., transB=.false., option=option)
end do
#endif
else if (transA .and. transB) then
! ATBT
allocate(C(size(A,2), size(B,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:C)
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.true., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA, transB, option)
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.true., option=option)
end do
#endif
else if (transA .and. .not.transB) then
! ATB
allocate(C(size(A,2), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:C)
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA, transB, option)
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#endif
else if (.not.transA .and. transB) then
! ABT
allocate(C(size(A,1), size(B,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:C)
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA, transB, option)
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#endif
end if
else if (present(transA) .or. present(transB)) then
if (present(transA)) then
if (transA) then
! ATB
allocate(C(size(A,2), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:C)
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA, transB, option)
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#endif
else if (.not.transA) then
! ABT
allocate(C(size(A,1), size(B,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:C)
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA, transB, option)
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#endif
end if
else if (present(transB)) then
if (transB) then
! ABT
allocate(C(size(A,1), size(B,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:C)
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA, transB, option)
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#endif
else if (.not.transB) then
! ATB
allocate(C(size(A,2), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:C)
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA, transB, option)
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#endif
end if
end if
else if (.not.present(transA) .and. .not.present(transB)) then
! AB
allocate(C(size(A,1), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(B,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:C)
se = start_elem(ib)
ee = end_elem(ib)
C(:, se:ee) = &
C(:, se:ee) + matmul(A, B(:,se:ee), transA=.false., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, se:ee) = &
C(:, se:ee) + matmul(A, B(:,se:ee), transA, transB, option)
C(:, se:ee) + matmul(A, B(:,se:ee), transA=.false., transB=.false., option=option)
end do
#endif
end if

end function mat_mat_block_rel
Expand All @@ -439,33 +520,60 @@ pure function mat_vec_block_rel(A, v, transA, option, nblock) result(w)
! ATv
allocate(w(size(A,2)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:w)
se = start_elem(ib)
ee = end_elem(ib)
w(se:ee) = &
w(se:ee) + matmul(A(:,se:ee), v, transA=.true., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
w(se:ee) = &
w(se:ee) + matmul(A(:,se:ee), v, transA, option)
w(se:ee) + matmul(A(:,se:ee), v, transA=.true., option=option)
end do
#endif
else if (.not. transA) then
! Av
allocate(w(size(A,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:w)
se = start_elem(ib)
ee = end_elem(ib)
w(:) = &
w(:) + matmul(A(:,se:ee), v(se:ee), transA=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
w(:) = &
w(:) + matmul(A(:,se:ee), v(se:ee), transA, option)
w(:) + matmul(A(:,se:ee), v(se:ee), transA=.false., option=option)
end do
#endif
end if
else if (.not. present(transA)) then
! Av
allocate(w(size(A,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock) reduce(+:w)
se = start_elem(ib)
ee = end_elem(ib)
w(:) = &
w(:) + matmul(A(:,se:ee), v(se:ee), transA=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
w(:) = &
w(:) + matmul(A(:,se:ee), v(se:ee), transA, option)
w(:) + matmul(A(:,se:ee), v(se:ee), transA=.false., option=option)
end do
#endif
end if

end function mat_vec_block_rel
Expand Down

0 comments on commit e46256d

Please sign in to comment.