From 1c44130b0b01a02e203e42670972b42877df5c27 Mon Sep 17 00:00:00 2001 From: "Auriane R." <48684432+aurianer@users.noreply.github.com> Date: Fri, 20 Sep 2024 10:57:24 +0200 Subject: [PATCH] Move stdexec compilation to c++ files + add CI for cuda stdexec configuration (#1188) --- ci/.gitlab-ci.yml | 1 + ci/cuda/gcc13_release_stdexec.yml | 21 ++++++++ ci/docker/release-cuda-stdexec.yaml | 32 +++++++++++ .../kernels/internal/all_reduce.h | 6 ++- include/dlaf/permutations/general/impl.h | 53 +++++++++++++++++-- include/dlaf/permutations/general/perms.h | 31 ++++++----- spack/packages/dla-future/package.py | 4 +- src/permutations/general/perms.cu | 46 ++-------------- 8 files changed, 128 insertions(+), 66 deletions(-) create mode 100644 ci/cuda/gcc13_release_stdexec.yml create mode 100644 ci/docker/release-cuda-stdexec.yaml diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml index 5fcbba6f0b..dc5820f10b 100644 --- a/ci/.gitlab-ci.yml +++ b/ci/.gitlab-ci.yml @@ -15,5 +15,6 @@ include: - local: 'ci/cuda/gcc11_release_scalapack.yml' - local: 'ci/cuda/gcc11_codecov.yml' - local: 'ci/cuda/gcc11_debug_scalapack.yml' + - local: 'ci/cuda/gcc13_release_stdexec.yml' - local: 'ci/rocm/clang14_release.yml' - local: 'ci/rocm/clang14_release_stdexec.yml' diff --git a/ci/cuda/gcc13_release_stdexec.yml b/ci/cuda/gcc13_release_stdexec.yml new file mode 100644 index 0000000000..24dccbdf1a --- /dev/null +++ b/ci/cuda/gcc13_release_stdexec.yml @@ -0,0 +1,21 @@ +include: + - local: 'ci/common-ci.yml' + +cuda gcc13 stdexec release deps: + extends: .build_deps_common + variables: + BASE_IMAGE: docker.io/nvidia/cuda:12.6.1-devel-ubuntu24.04 + COMPILER: gcc@13 + CXXSTD: 20 + SPACK_ENVIRONMENT: ci/docker/release-cuda-stdexec.yaml + USE_MKL: "ON" + BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/build + +cuda gcc13 stdexec release build: + extends: + - .build_common + - .build_for_daint-gpu + needs: + - cuda gcc13 stdexec release deps + variables: + DEPLOY_IMAGE: 
$CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/deploy:$CI_COMMIT_SHA diff --git a/ci/docker/release-cuda-stdexec.yaml b/ci/docker/release-cuda-stdexec.yaml new file mode 100644 index 0000000000..366ff247ef --- /dev/null +++ b/ci/docker/release-cuda-stdexec.yaml @@ -0,0 +1,32 @@ +# +# Distributed Linear Algebra with Future (DLAF) +# +# Copyright (c) 2018-2024, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause +# + +spack: + include: + - /spack_environment/common.yaml + + view: false + concretizer: + unify: + true + + specs: + - dla-future@master +cuda +miniapps +ci-test + + packages: + all: + variants: + - 'build_type=Release' + pika: + require: + - '+stdexec' + stdexec: + require: + - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main' diff --git a/include/dlaf/communication/kernels/internal/all_reduce.h b/include/dlaf/communication/kernels/internal/all_reduce.h index f4dff2f1e5..767c79d903 100644 --- a/include/dlaf/communication/kernels/internal/all_reduce.h +++ b/include/dlaf/communication/kernels/internal/all_reduce.h @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -80,8 +81,9 @@ template // written into the output tile instead). The reduction is explicitly done // on CPU memory so that we can manage potential asynchronous copies between // CPU and GPU. A reduction requires contiguous memory. 
- return withTemporaryTile(std::move(tile_in), std::move(all_reduce)); + return pika::execution::experimental::make_unique_any_sender( + withTemporaryTile(std::move(tile_in), std::move(all_reduce))); }; #if !defined(DLAF_WITH_MPI_GPU_AWARE) diff --git a/include/dlaf/permutations/general/impl.h b/include/dlaf/permutations/general/impl.h index 8e4e6b958b..8cd94800de 100644 --- a/include/dlaf/permutations/general/impl.h +++ b/include/dlaf/permutations/general/impl.h @@ -131,6 +131,51 @@ void applyPermutationOnCPU( } } +#if defined(DLAF_WITH_GPU) + +template +MatrixLayout getMatrixLayout(const matrix::Distribution& distr, + const std::vector>& tiles) { + LocalTileSize tile_sz = distr.localNrTiles(); + MatrixLayout layout; + layout.nb = distr.blockSize().rows(); + layout.ld = tiles[0].ld(); + layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].ptr() - tiles[0].ptr() : 0; + layout.col_offset = (tile_sz.cols() > 1) ? tiles[to_sizet(tile_sz.rows())].ptr() - tiles[0].ptr() : 0; + return layout; +} + +template +MatrixLayout getMatrixLayout( + const matrix::Distribution& distr, + const std::vector>& tiles) { + const LocalTileSize tile_sz = distr.localNrTiles(); + MatrixLayout layout; + layout.nb = distr.blockSize().rows(); + layout.ld = tiles[0].get().ld(); + layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].get().ptr() - tiles[0].get().ptr() : 0; + layout.col_offset = + (tile_sz.cols() > 1) ? 
tiles[to_sizet(tile_sz.rows())].get().ptr() - tiles[0].get().ptr() : 0; + return layout; +} + +template +void applyPermutationsOnDeviceWrapper( + GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, + const matrix::Distribution& distr, const SizeType* perms, + const std::vector>& in_tiles, + const std::vector>& out_tiles, whip::stream_t stream) { + MatrixLayout in_layout = getMatrixLayout(distr, in_tiles); + MatrixLayout out_layout = getMatrixLayout(distr, out_tiles); + const T* in = in_tiles[0].get().ptr(); + T* out = out_tiles[0].ptr(); + + applyPermutationsOnDevice(out_begin, sz, in_offset, perms, in_layout, in, out_layout, out, + stream); +} + +#endif + template void Permutations::call(const SizeType i_begin, const SizeType i_end, Matrix& perms, Matrix& mat_in, @@ -188,8 +233,8 @@ void Permutations::call(const SizeType i_begin, const SizeType i_end TileElementIndex zero(0, 0); const SizeType* i_ptr = index_tile_futs[0].get().ptr(zero); - applyPermutationsOnDevice(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, i_ptr, - mat_in_tiles, mat_out_tiles, stream); + applyPermutationsOnDeviceWrapper(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, + i_ptr, mat_in_tiles, mat_out_tiles, stream); }; ex::start_detached(std::move(sender) | @@ -408,8 +453,8 @@ void applyPackingIndex(const matrix::Distribution& subm_dist, IndexMapSender&& i TileElementIndex zero(0, 0); const SizeType* i_ptr = index_tile_futs[0].get().ptr(zero); - applyPermutationsOnDevice(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, i_ptr, - mat_in_tiles, mat_out_tiles, stream); + applyPermutationsOnDeviceWrapper(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, + i_ptr, mat_in_tiles, mat_out_tiles, stream); }; ex::start_detached(std::move(sender) | diff --git a/include/dlaf/permutations/general/perms.h b/include/dlaf/permutations/general/perms.h index 21fadb9f81..149b736fc0 100644 --- a/include/dlaf/permutations/general/perms.h +++ 
b/include/dlaf/permutations/general/perms.h @@ -16,25 +16,28 @@ #include -#include -#include +#include #include namespace dlaf::permutations::internal { +struct MatrixLayout { + SizeType nb; // square tile size + SizeType ld; // tile leading dimension + SizeType row_offset; // tile offset to first element of tile on the next row + SizeType col_offset; // tile offset to first element of tile on the next column +}; + template -void applyPermutationsOnDevice( - GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, - const matrix::Distribution& distr, const SizeType* perms, - const std::vector>& in_tiles, - const std::vector>& out_tiles, whip::stream_t stream); - -#define DLAF_CUDA_PERMUTE_ON_DEVICE(kword, Type, Coord) \ - kword template void applyPermutationsOnDevice( \ - GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, \ - const matrix::Distribution& distr, const SizeType* perms, \ - const std::vector>& in_tiles, \ - const std::vector>& out_tiles, whip::stream_t stream) +void applyPermutationsOnDevice(GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, + const SizeType* perms, MatrixLayout in_layout, const T* in, + MatrixLayout out_layout, T* out, whip::stream_t stream); + +#define DLAF_CUDA_PERMUTE_ON_DEVICE(kword, Type, Coord) \ + kword template void applyPermutationsOnDevice( \ + GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, const SizeType* perms, \ + MatrixLayout in_layout, const Type* in, MatrixLayout out_layout, Type* out, \ + whip::stream_t stream) DLAF_CUDA_PERMUTE_ON_DEVICE(extern, float, Coord::Col); DLAF_CUDA_PERMUTE_ON_DEVICE(extern, double, Coord::Col); diff --git a/spack/packages/dla-future/package.py b/spack/packages/dla-future/package.py index 27c12ce65b..927279f9ab 100644 --- a/spack/packages/dla-future/package.py +++ b/spack/packages/dla-future/package.py @@ -98,8 +98,6 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage): depends_on("pika +cuda", 
when="+cuda") depends_on("pika +rocm", when="+rocm") - conflicts("^pika cxxstd=20", when="+cuda") - depends_on("whip +cuda", when="+cuda") depends_on("whip +rocm", when="+rocm") @@ -161,7 +159,7 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage): values=cxxstds, description="Use the specified C++ standard when building", ) - conflicts("cxxstd=20", when="+cuda") + conflicts("cxxstd=20", when="+cuda ^cuda@:11") for cxxstd in cxxstds: depends_on(f"pika cxxstd={cxxstd}", when=f"cxxstd={cxxstd}") diff --git a/src/permutations/general/perms.cu b/src/permutations/general/perms.cu index c0d1aca3c1..72d5e3d988 100644 --- a/src/permutations/general/perms.cu +++ b/src/permutations/general/perms.cu @@ -18,13 +18,6 @@ namespace dlaf::permutations::internal { -struct MatrixLayout { - SizeType nb; // square tile size - SizeType ld; // tile leading dimension - SizeType row_offset; // tile offset to first element of tile on the next row - SizeType col_offset; // tile offset to first element of tile on the next column -}; - __device__ SizeType getIndex(const MatrixLayout& layout, SizeType row, SizeType col) { SizeType tile_row = row / layout.nb; SizeType tile_col = col / layout.nb; @@ -37,32 +30,6 @@ __device__ SizeType getIndex(const MatrixLayout& layout, SizeType row, SizeType return tile_offset + tile_el_offset; } -template -MatrixLayout getMatrixLayout(const matrix::Distribution& distr, - const std::vector>& tiles) { - LocalTileSize tile_sz = distr.localNrTiles(); - MatrixLayout layout; - layout.nb = distr.blockSize().rows(); - layout.ld = tiles[0].ld(); - layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].ptr() - tiles[0].ptr() : 0; - layout.col_offset = (tile_sz.cols() > 1) ? 
tiles[to_sizet(tile_sz.rows())].ptr() - tiles[0].ptr() : 0; - return layout; -} - -template -MatrixLayout getMatrixLayout( - const matrix::Distribution& distr, - const std::vector>& tiles) { - const LocalTileSize tile_sz = distr.localNrTiles(); - MatrixLayout layout; - layout.nb = distr.blockSize().rows(); - layout.ld = tiles[0].get().ld(); - layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].get().ptr() - tiles[0].get().ptr() : 0; - layout.col_offset = - (tile_sz.cols() > 1) ? tiles[to_sizet(tile_sz.rows())].get().ptr() - tiles[0].get().ptr() : 0; - return layout; -} - constexpr unsigned perms_kernel_sz = 32; __device__ void swapIndices(SizeType& a, SizeType& b) { @@ -100,16 +67,9 @@ __global__ void applyPermutationsOnDevice(SizeType out_begin_row, SizeType out_b } template -void applyPermutationsOnDevice( - GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, - const matrix::Distribution& distr, const SizeType* perms, - const std::vector>& in_tiles, - const std::vector>& out_tiles, whip::stream_t stream) { - MatrixLayout in_layout = getMatrixLayout(distr, in_tiles); - MatrixLayout out_layout = getMatrixLayout(distr, out_tiles); - const T* in = in_tiles[0].get().ptr(); - T* out = out_tiles[0].ptr(); - +void applyPermutationsOnDevice(GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, + const SizeType* perms, const MatrixLayout in_layout, const T* in, + const MatrixLayout out_layout, T* out, whip::stream_t stream) { constexpr Coord orth_coord = orthogonal(coord); SizeType nelems = sz.get(); // number of elements in each row or column SizeType nperms = sz.get(); // number of permuted rows or columns