From 1c44130b0b01a02e203e42670972b42877df5c27 Mon Sep 17 00:00:00 2001 From: "Auriane R." <48684432+aurianer@users.noreply.github.com> Date: Fri, 20 Sep 2024 10:57:24 +0200 Subject: [PATCH] Move stdexec compilation to c++ files + add CI for cuda stdexec configuration (#1188) --- ci/.gitlab-ci.yml | 1 + ci/cuda/gcc13_release_stdexec.yml | 21 ++++++++ ci/docker/release-cuda-stdexec.yaml | 32 +++++++++++ .../kernels/internal/all_reduce.h | 6 ++- include/dlaf/permutations/general/impl.h | 53 +++++++++++++++++-- include/dlaf/permutations/general/perms.h | 31 ++++++----- spack/packages/dla-future/package.py | 4 +- src/permutations/general/perms.cu | 46 ++-------------- 8 files changed, 128 insertions(+), 66 deletions(-) create mode 100644 ci/cuda/gcc13_release_stdexec.yml create mode 100644 ci/docker/release-cuda-stdexec.yaml diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml index 5fcbba6f0b..dc5820f10b 100644 --- a/ci/.gitlab-ci.yml +++ b/ci/.gitlab-ci.yml @@ -15,5 +15,6 @@ include: - local: 'ci/cuda/gcc11_release_scalapack.yml' - local: 'ci/cuda/gcc11_codecov.yml' - local: 'ci/cuda/gcc11_debug_scalapack.yml' + - local: 'ci/cuda/gcc13_release_stdexec.yml' - local: 'ci/rocm/clang14_release.yml' - local: 'ci/rocm/clang14_release_stdexec.yml' diff --git a/ci/cuda/gcc13_release_stdexec.yml b/ci/cuda/gcc13_release_stdexec.yml new file mode 100644 index 0000000000..24dccbdf1a --- /dev/null +++ b/ci/cuda/gcc13_release_stdexec.yml @@ -0,0 +1,21 @@ +include: + - local: 'ci/common-ci.yml' + +cuda gcc13 stdexec release deps: + extends: .build_deps_common + variables: + BASE_IMAGE: docker.io/nvidia/cuda:12.6.1-devel-ubuntu24.04 + COMPILER: gcc@13 + CXXSTD: 20 + SPACK_ENVIRONMENT: ci/docker/release-cuda-stdexec.yaml + USE_MKL: "ON" + BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/build + +cuda gcc13 stdexec release build: + extends: + - .build_common + - .build_for_daint-gpu + needs: + - cuda gcc13 stdexec release deps + variables: + DEPLOY_IMAGE: 
$CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/deploy:$CI_COMMIT_SHA diff --git a/ci/docker/release-cuda-stdexec.yaml b/ci/docker/release-cuda-stdexec.yaml new file mode 100644 index 0000000000..366ff247ef --- /dev/null +++ b/ci/docker/release-cuda-stdexec.yaml @@ -0,0 +1,32 @@ +# +# Distributed Linear Algebra with Future (DLAF) +# +# Copyright (c) 2018-2024, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause +# + +spack: + include: + - /spack_environment/common.yaml + + view: false + concretizer: + unify: + true + + specs: + - dla-future@master +cuda +miniapps +ci-test + + packages: + all: + variants: + - 'build_type=Release' + pika: + require: + - '+stdexec' + stdexec: + require: + - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main' diff --git a/include/dlaf/communication/kernels/internal/all_reduce.h b/include/dlaf/communication/kernels/internal/all_reduce.h index f4dff2f1e5..767c79d903 100644 --- a/include/dlaf/communication/kernels/internal/all_reduce.h +++ b/include/dlaf/communication/kernels/internal/all_reduce.h @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -80,8 +81,9 @@ template // written into the output tile instead). The reduction is explicitly done // on CPU memory so that we can manage potential asynchronous copies between // CPU and GPU. A reduction requires contiguous memory. 
- return withTemporaryTile(std::move(tile_in), std::move(all_reduce)); + return pika::execution::experimental::make_unique_any_sender( + withTemporaryTile(std::move(tile_in), std::move(all_reduce))); }; #if !defined(DLAF_WITH_MPI_GPU_AWARE) diff --git a/include/dlaf/permutations/general/impl.h b/include/dlaf/permutations/general/impl.h index 8e4e6b958b..8cd94800de 100644 --- a/include/dlaf/permutations/general/impl.h +++ b/include/dlaf/permutations/general/impl.h @@ -131,6 +131,51 @@ void applyPermutationOnCPU( } } +#if defined(DLAF_WITH_GPU) + +template +MatrixLayout getMatrixLayout(const matrix::Distribution& distr, + const std::vector>& tiles) { + LocalTileSize tile_sz = distr.localNrTiles(); + MatrixLayout layout; + layout.nb = distr.blockSize().rows(); + layout.ld = tiles[0].ld(); + layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].ptr() - tiles[0].ptr() : 0; + layout.col_offset = (tile_sz.cols() > 1) ? tiles[to_sizet(tile_sz.rows())].ptr() - tiles[0].ptr() : 0; + return layout; +} + +template +MatrixLayout getMatrixLayout( + const matrix::Distribution& distr, + const std::vector>& tiles) { + const LocalTileSize tile_sz = distr.localNrTiles(); + MatrixLayout layout; + layout.nb = distr.blockSize().rows(); + layout.ld = tiles[0].get().ld(); + layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].get().ptr() - tiles[0].get().ptr() : 0; + layout.col_offset = + (tile_sz.cols() > 1) ? 
tiles[to_sizet(tile_sz.rows())].get().ptr() - tiles[0].get().ptr() : 0; + return layout; +} + +template +void applyPermutationsOnDeviceWrapper( + GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, + const matrix::Distribution& distr, const SizeType* perms, + const std::vector>& in_tiles, + const std::vector>& out_tiles, whip::stream_t stream) { + MatrixLayout in_layout = getMatrixLayout(distr, in_tiles); + MatrixLayout out_layout = getMatrixLayout(distr, out_tiles); + const T* in = in_tiles[0].get().ptr(); + T* out = out_tiles[0].ptr(); + + applyPermutationsOnDevice(out_begin, sz, in_offset, perms, in_layout, in, out_layout, out, + stream); +} + +#endif + template void Permutations::call(const SizeType i_begin, const SizeType i_end, Matrix& perms, Matrix& mat_in, @@ -188,8 +233,8 @@ void Permutations::call(const SizeType i_begin, const SizeType i_end TileElementIndex zero(0, 0); const SizeType* i_ptr = index_tile_futs[0].get().ptr(zero); - applyPermutationsOnDevice(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, i_ptr, - mat_in_tiles, mat_out_tiles, stream); + applyPermutationsOnDeviceWrapper(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, + i_ptr, mat_in_tiles, mat_out_tiles, stream); }; ex::start_detached(std::move(sender) | @@ -408,8 +453,8 @@ void applyPackingIndex(const matrix::Distribution& subm_dist, IndexMapSender&& i TileElementIndex zero(0, 0); const SizeType* i_ptr = index_tile_futs[0].get().ptr(zero); - applyPermutationsOnDevice(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, i_ptr, - mat_in_tiles, mat_out_tiles, stream); + applyPermutationsOnDeviceWrapper(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, + i_ptr, mat_in_tiles, mat_out_tiles, stream); }; ex::start_detached(std::move(sender) | diff --git a/include/dlaf/permutations/general/perms.h b/include/dlaf/permutations/general/perms.h index 21fadb9f81..149b736fc0 100644 --- a/include/dlaf/permutations/general/perms.h +++ 
b/include/dlaf/permutations/general/perms.h @@ -16,25 +16,28 @@ #include -#include -#include +#include #include namespace dlaf::permutations::internal { +struct MatrixLayout { + SizeType nb; // square tile size + SizeType ld; // tile leading dimension + SizeType row_offset; // tile offset to first element of tile on the next row + SizeType col_offset; // tile offset to first element of tile on the next column +}; + template -void applyPermutationsOnDevice( - GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, - const matrix::Distribution& distr, const SizeType* perms, - const std::vector>& in_tiles, - const std::vector>& out_tiles, whip::stream_t stream); - -#define DLAF_CUDA_PERMUTE_ON_DEVICE(kword, Type, Coord) \ - kword template void applyPermutationsOnDevice( \ - GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, \ - const matrix::Distribution& distr, const SizeType* perms, \ - const std::vector>& in_tiles, \ - const std::vector>& out_tiles, whip::stream_t stream) +void applyPermutationsOnDevice(GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, + const SizeType* perms, MatrixLayout in_layout, const T* in, + MatrixLayout out_layout, T* out, whip::stream_t stream); + +#define DLAF_CUDA_PERMUTE_ON_DEVICE(kword, Type, Coord) \ + kword template void applyPermutationsOnDevice( \ + GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, const SizeType* perms, \ + MatrixLayout in_layout, const Type* in, MatrixLayout out_layout, Type* out, \ + whip::stream_t stream) DLAF_CUDA_PERMUTE_ON_DEVICE(extern, float, Coord::Col); DLAF_CUDA_PERMUTE_ON_DEVICE(extern, double, Coord::Col); diff --git a/spack/packages/dla-future/package.py b/spack/packages/dla-future/package.py index 27c12ce65b..927279f9ab 100644 --- a/spack/packages/dla-future/package.py +++ b/spack/packages/dla-future/package.py @@ -98,8 +98,6 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage): depends_on("pika +cuda", 
when="+cuda") depends_on("pika +rocm", when="+rocm") - conflicts("^pika cxxstd=20", when="+cuda") - depends_on("whip +cuda", when="+cuda") depends_on("whip +rocm", when="+rocm") @@ -161,7 +159,7 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage): values=cxxstds, description="Use the specified C++ standard when building", ) - conflicts("cxxstd=20", when="+cuda") + conflicts("cxxstd=20", when="+cuda ^cuda@:11") for cxxstd in cxxstds: depends_on(f"pika cxxstd={cxxstd}", when=f"cxxstd={cxxstd}") diff --git a/src/permutations/general/perms.cu b/src/permutations/general/perms.cu index c0d1aca3c1..72d5e3d988 100644 --- a/src/permutations/general/perms.cu +++ b/src/permutations/general/perms.cu @@ -18,13 +18,6 @@ namespace dlaf::permutations::internal { -struct MatrixLayout { - SizeType nb; // square tile size - SizeType ld; // tile leading dimension - SizeType row_offset; // tile offset to first element of tile on the next row - SizeType col_offset; // tile offset to first element of tile on the next column -}; - __device__ SizeType getIndex(const MatrixLayout& layout, SizeType row, SizeType col) { SizeType tile_row = row / layout.nb; SizeType tile_col = col / layout.nb; @@ -37,32 +30,6 @@ __device__ SizeType getIndex(const MatrixLayout& layout, SizeType row, SizeType return tile_offset + tile_el_offset; } -template -MatrixLayout getMatrixLayout(const matrix::Distribution& distr, - const std::vector>& tiles) { - LocalTileSize tile_sz = distr.localNrTiles(); - MatrixLayout layout; - layout.nb = distr.blockSize().rows(); - layout.ld = tiles[0].ld(); - layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].ptr() - tiles[0].ptr() : 0; - layout.col_offset = (tile_sz.cols() > 1) ? 
tiles[to_sizet(tile_sz.rows())].ptr() - tiles[0].ptr() : 0; - return layout; -} - -template -MatrixLayout getMatrixLayout( - const matrix::Distribution& distr, - const std::vector>& tiles) { - const LocalTileSize tile_sz = distr.localNrTiles(); - MatrixLayout layout; - layout.nb = distr.blockSize().rows(); - layout.ld = tiles[0].get().ld(); - layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].get().ptr() - tiles[0].get().ptr() : 0; - layout.col_offset = - (tile_sz.cols() > 1) ? tiles[to_sizet(tile_sz.rows())].get().ptr() - tiles[0].get().ptr() : 0; - return layout; -} - constexpr unsigned perms_kernel_sz = 32; __device__ void swapIndices(SizeType& a, SizeType& b) { @@ -100,16 +67,9 @@ __global__ void applyPermutationsOnDevice(SizeType out_begin_row, SizeType out_b } template -void applyPermutationsOnDevice( - GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, - const matrix::Distribution& distr, const SizeType* perms, - const std::vector>& in_tiles, - const std::vector>& out_tiles, whip::stream_t stream) { - MatrixLayout in_layout = getMatrixLayout(distr, in_tiles); - MatrixLayout out_layout = getMatrixLayout(distr, out_tiles); - const T* in = in_tiles[0].get().ptr(); - T* out = out_tiles[0].ptr(); - +void applyPermutationsOnDevice(GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, + const SizeType* perms, const MatrixLayout in_layout, const T* in, + const MatrixLayout out_layout, T* out, whip::stream_t stream) { constexpr Coord orth_coord = orthogonal(coord); SizeType nelems = sz.get(); // number of elements in each row or column SizeType nperms = sz.get(); // number of permuted rows or columns