Skip to content

Commit

Permalink
Move stdexec compilation to c++ files + add CI for cuda stdexec configuration (#1188)
Browse files Browse the repository at this point in the history
  • Loading branch information
aurianer authored Sep 20, 2024
1 parent a64c822 commit 1c44130
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 66 deletions.
1 change: 1 addition & 0 deletions ci/.gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ include:
- local: 'ci/cuda/gcc11_release_scalapack.yml'
- local: 'ci/cuda/gcc11_codecov.yml'
- local: 'ci/cuda/gcc11_debug_scalapack.yml'
- local: 'ci/cuda/gcc13_release_stdexec.yml'
- local: 'ci/rocm/clang14_release.yml'
- local: 'ci/rocm/clang14_release_stdexec.yml'
21 changes: 21 additions & 0 deletions ci/cuda/gcc13_release_stdexec.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
include:
  - local: 'ci/common-ci.yml'

# Dependency-build job: builds the spack environment image for the
# CUDA 12.6 + gcc 13 + stdexec configuration (C++20 required by stdexec).
cuda gcc13 stdexec release deps:
  extends: .build_deps_common
  variables:
    BASE_IMAGE: docker.io/nvidia/cuda:12.6.1-devel-ubuntu24.04
    COMPILER: gcc@13
    CXXSTD: 20
    SPACK_ENVIRONMENT: ci/docker/release-cuda-stdexec.yaml
    USE_MKL: "ON"
    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/build

# Main build job: consumes the deps image and produces the deployable image.
cuda gcc13 stdexec release build:
  extends:
    - .build_common
    - .build_for_daint-gpu
  needs:
    - cuda gcc13 stdexec release deps
  variables:
    # NOTE(review): use the stdexec-specific registry path so this deploy image
    # does not collide with the plain cuda-gcc13-release configuration.
    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/deploy:$CI_COMMIT_SHA
32 changes: 32 additions & 0 deletions ci/docker/release-cuda-stdexec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#
# Distributed Linear Algebra with Future (DLAF)
#
# Copyright (c) 2018-2024, ETH Zurich
# All rights reserved.
#
# Please, refer to the LICENSE file in the root directory.
# SPDX-License-Identifier: BSD-3-Clause
#

# Spack environment for the CUDA + stdexec CI configuration.
spack:
  include:
  # Shared package/compiler settings common to all CI environments.
  - /spack_environment/common.yaml

  view: false
  # Concretize all specs together so the whole environment shares one
  # consistent dependency graph.
  concretizer:
    unify:
      true

  specs:
  - dla-future@master +cuda +miniapps +ci-test

  packages:
    all:
      variants:
      - 'build_type=Release'
    # Build pika with the stdexec backend enabled.
    pika:
      require:
      - '+stdexec'
    # Pin stdexec to a specific main-branch commit for reproducible CI builds.
    stdexec:
      require:
      - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main'
6 changes: 4 additions & 2 deletions include/dlaf/communication/kernels/internal/all_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <mpi.h>

#include <pika/execution.hpp>
#include <pika/execution_base/any_sender.hpp>

#include <dlaf/common/callable_object.h>
#include <dlaf/common/eti.h>
Expand Down Expand Up @@ -80,8 +81,9 @@ template <Device DCommIn, Device DCommOut, class T, Device DIn, Device DOut>
// written into the output tile instead). The reduction is explicitly done
// on CPU memory so that we can manage potential asynchronous copies between
// CPU and GPU. A reduction requires contiguous memory.
return withTemporaryTile<DCommIn, CopyToDestination::Yes, CopyFromDestination::No,
RequireContiguous::Yes>(std::move(tile_in), std::move(all_reduce));
return pika::execution::experimental::make_unique_any_sender(
withTemporaryTile<DCommIn, CopyToDestination::Yes, CopyFromDestination::No,
RequireContiguous::Yes>(std::move(tile_in), std::move(all_reduce)));
};

#if !defined(DLAF_WITH_MPI_GPU_AWARE)
Expand Down
53 changes: 49 additions & 4 deletions include/dlaf/permutations/general/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,51 @@ void applyPermutationOnCPU(
}
}

#if defined(DLAF_WITH_GPU)

/// Builds a MatrixLayout describing the flat device-memory layout of the local
/// tile grid, so a GPU kernel can address elements without per-tile metadata.
///
/// @param distr distribution of the (sub)matrix the tiles belong to
/// @param tiles local tiles in column-major tile order; must be non-empty
/// @return layout with tile size, leading dimension and inter-tile offsets
template <class T>
MatrixLayout getMatrixLayout(const matrix::Distribution& distr,
                             const std::vector<matrix::Tile<T, Device::GPU>>& tiles) {
  // const for consistency with the TileAsyncRwMutexReadOnlyWrapper overload below.
  const LocalTileSize tile_sz = distr.localNrTiles();
  MatrixLayout layout;
  layout.nb = distr.blockSize().rows();
  layout.ld = tiles[0].ld();
  // Element distance between the first elements of adjacent tiles in a
  // column/row of the tile grid; 0 when there is a single tile in that direction.
  layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].ptr() - tiles[0].ptr() : 0;
  layout.col_offset = (tile_sz.cols() > 1) ? tiles[to_sizet(tile_sz.rows())].ptr() - tiles[0].ptr() : 0;
  return layout;
}

/// Overload of getMatrixLayout for read-only wrapped tiles: identical layout
/// extraction, but each tile is reached through its wrapper via .get().
template <class T>
MatrixLayout getMatrixLayout(
    const matrix::Distribution& distr,
    const std::vector<matrix::internal::TileAsyncRwMutexReadOnlyWrapper<T, Device::GPU>>& tiles) {
  const LocalTileSize grid = distr.localNrTiles();
  const auto* base = tiles[0].get().ptr();
  // Offsets between first elements of adjacent tiles; 0 if only one tile in
  // the corresponding direction.
  const SizeType row_off = (grid.rows() > 1) ? tiles[1].get().ptr() - base : 0;
  const SizeType col_off = (grid.cols() > 1) ? tiles[to_sizet(grid.rows())].get().ptr() - base : 0;
  return MatrixLayout{distr.blockSize().rows(), tiles[0].get().ld(), row_off, col_off};
}

/// Adapter between the tile-based host API and the layout-based device entry
/// point: flattens the tile collections into MatrixLayout descriptions plus
/// base pointers and forwards everything to applyPermutationsOnDevice.
template <class T, Coord coord>
void applyPermutationsOnDeviceWrapper(
    GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset,
    const matrix::Distribution& distr, const SizeType* perms,
    const std::vector<matrix::internal::TileAsyncRwMutexReadOnlyWrapper<T, Device::GPU>>& in_tiles,
    const std::vector<matrix::Tile<T, Device::GPU>>& out_tiles, whip::stream_t stream) {
  const MatrixLayout layout_in = getMatrixLayout(distr, in_tiles);
  const MatrixLayout layout_out = getMatrixLayout(distr, out_tiles);
  const T* base_in = in_tiles[0].get().ptr();
  T* base_out = out_tiles[0].ptr();

  applyPermutationsOnDevice<T, coord>(out_begin, sz, in_offset, perms, layout_in, base_in, layout_out,
                                      base_out, stream);
}

#endif

template <Backend B, Device D, class T, Coord C>
void Permutations<B, D, T, C>::call(const SizeType i_begin, const SizeType i_end,
Matrix<const SizeType, D>& perms, Matrix<const T, D>& mat_in,
Expand Down Expand Up @@ -188,8 +233,8 @@ void Permutations<B, D, T, C>::call(const SizeType i_begin, const SizeType i_end
TileElementIndex zero(0, 0);
const SizeType* i_ptr = index_tile_futs[0].get().ptr(zero);

applyPermutationsOnDevice<T, C>(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, i_ptr,
mat_in_tiles, mat_out_tiles, stream);
applyPermutationsOnDeviceWrapper<T, C>(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist,
i_ptr, mat_in_tiles, mat_out_tiles, stream);
};

ex::start_detached(std::move(sender) |
Expand Down Expand Up @@ -408,8 +453,8 @@ void applyPackingIndex(const matrix::Distribution& subm_dist, IndexMapSender&& i
TileElementIndex zero(0, 0);
const SizeType* i_ptr = index_tile_futs[0].get().ptr(zero);

applyPermutationsOnDevice<T, C>(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist, i_ptr,
mat_in_tiles, mat_out_tiles, stream);
applyPermutationsOnDeviceWrapper<T, C>(GlobalElementIndex(0, 0), subm_dist.size(), 0, subm_dist,
i_ptr, mat_in_tiles, mat_out_tiles, stream);
};

ex::start_detached(std::move(sender) |
Expand Down
31 changes: 17 additions & 14 deletions include/dlaf/permutations/general/perms.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,28 @@

#include <whip.hpp>

#include <dlaf/matrix/distribution.h>
#include <dlaf/matrix/tile.h>
#include <dlaf/matrix/index.h>
#include <dlaf/types.h>

namespace dlaf::permutations::internal {

// Flat description of how a local grid of equally-sized tiles is laid out in
// device memory; lets the permutation kernels compute element addresses from
// (row, col) without per-tile metadata.
struct MatrixLayout {
  SizeType nb;          // square tile size (edge length in elements)
  SizeType ld;          // tile leading dimension
  SizeType row_offset;  // element offset from a tile to the first element of the tile on the next row
  SizeType col_offset;  // element offset from a tile to the first element of the tile on the next column
};

template <class T, Coord coord>
void applyPermutationsOnDevice(
GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset,
const matrix::Distribution& distr, const SizeType* perms,
const std::vector<matrix::internal::TileAsyncRwMutexReadOnlyWrapper<T, Device::GPU>>& in_tiles,
const std::vector<matrix::Tile<T, Device::GPU>>& out_tiles, whip::stream_t stream);

#define DLAF_CUDA_PERMUTE_ON_DEVICE(kword, Type, Coord) \
kword template void applyPermutationsOnDevice<Type, Coord>( \
GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, \
const matrix::Distribution& distr, const SizeType* perms, \
const std::vector<matrix::internal::TileAsyncRwMutexReadOnlyWrapper<Type, Device::GPU>>& in_tiles, \
const std::vector<matrix::Tile<Type, Device::GPU>>& out_tiles, whip::stream_t stream)
void applyPermutationsOnDevice(GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset,
const SizeType* perms, MatrixLayout in_layout, const T* in,
MatrixLayout out_layout, T* out, whip::stream_t stream);

#define DLAF_CUDA_PERMUTE_ON_DEVICE(kword, Type, Coord) \
kword template void applyPermutationsOnDevice<Type, Coord>( \
GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset, const SizeType* perms, \
MatrixLayout in_layout, const Type* in, MatrixLayout out_layout, Type* out, \
whip::stream_t stream)

DLAF_CUDA_PERMUTE_ON_DEVICE(extern, float, Coord::Col);
DLAF_CUDA_PERMUTE_ON_DEVICE(extern, double, Coord::Col);
Expand Down
4 changes: 1 addition & 3 deletions spack/packages/dla-future/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,6 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage):
depends_on("pika +cuda", when="+cuda")
depends_on("pika +rocm", when="+rocm")

conflicts("^pika cxxstd=20", when="+cuda")

depends_on("whip +cuda", when="+cuda")
depends_on("whip +rocm", when="+rocm")

Expand Down Expand Up @@ -161,7 +159,7 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage):
values=cxxstds,
description="Use the specified C++ standard when building",
)
conflicts("cxxstd=20", when="+cuda")
conflicts("cxxstd=20", when="+cuda ^cuda@:11")

for cxxstd in cxxstds:
depends_on(f"pika cxxstd={cxxstd}", when=f"cxxstd={cxxstd}")
Expand Down
46 changes: 3 additions & 43 deletions src/permutations/general/perms.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,6 @@

namespace dlaf::permutations::internal {

struct MatrixLayout {
SizeType nb; // square tile size
SizeType ld; // tile leading dimension
SizeType row_offset; // tile offset to first element of tile on the next row
SizeType col_offset; // tile offset to first element of tile on the next column
};

__device__ SizeType getIndex(const MatrixLayout& layout, SizeType row, SizeType col) {
SizeType tile_row = row / layout.nb;
SizeType tile_col = col / layout.nb;
Expand All @@ -37,32 +30,6 @@ __device__ SizeType getIndex(const MatrixLayout& layout, SizeType row, SizeType
return tile_offset + tile_el_offset;
}

template <class T>
MatrixLayout getMatrixLayout(const matrix::Distribution& distr,
const std::vector<matrix::Tile<T, Device::GPU>>& tiles) {
LocalTileSize tile_sz = distr.localNrTiles();
MatrixLayout layout;
layout.nb = distr.blockSize().rows();
layout.ld = tiles[0].ld();
layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].ptr() - tiles[0].ptr() : 0;
layout.col_offset = (tile_sz.cols() > 1) ? tiles[to_sizet(tile_sz.rows())].ptr() - tiles[0].ptr() : 0;
return layout;
}

template <class T>
MatrixLayout getMatrixLayout(
const matrix::Distribution& distr,
const std::vector<matrix::internal::TileAsyncRwMutexReadOnlyWrapper<T, Device::GPU>>& tiles) {
const LocalTileSize tile_sz = distr.localNrTiles();
MatrixLayout layout;
layout.nb = distr.blockSize().rows();
layout.ld = tiles[0].get().ld();
layout.row_offset = (tile_sz.rows() > 1) ? tiles[1].get().ptr() - tiles[0].get().ptr() : 0;
layout.col_offset =
(tile_sz.cols() > 1) ? tiles[to_sizet(tile_sz.rows())].get().ptr() - tiles[0].get().ptr() : 0;
return layout;
}

constexpr unsigned perms_kernel_sz = 32;

__device__ void swapIndices(SizeType& a, SizeType& b) {
Expand Down Expand Up @@ -100,16 +67,9 @@ __global__ void applyPermutationsOnDevice(SizeType out_begin_row, SizeType out_b
}

template <class T, Coord coord>
void applyPermutationsOnDevice(
GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset,
const matrix::Distribution& distr, const SizeType* perms,
const std::vector<matrix::internal::TileAsyncRwMutexReadOnlyWrapper<T, Device::GPU>>& in_tiles,
const std::vector<matrix::Tile<T, Device::GPU>>& out_tiles, whip::stream_t stream) {
MatrixLayout in_layout = getMatrixLayout(distr, in_tiles);
MatrixLayout out_layout = getMatrixLayout(distr, out_tiles);
const T* in = in_tiles[0].get().ptr();
T* out = out_tiles[0].ptr();

void applyPermutationsOnDevice(GlobalElementIndex out_begin, GlobalElementSize sz, SizeType in_offset,
const SizeType* perms, const MatrixLayout in_layout, const T* in,
const MatrixLayout out_layout, T* out, whip::stream_t stream) {
constexpr Coord orth_coord = orthogonal(coord);
SizeType nelems = sz.get<orth_coord>(); // number of elements in each row or column
SizeType nperms = sz.get<coord>(); // number of permuted rows or columns
Expand Down

0 comments on commit 1c44130

Please sign in to comment.