diff --git a/ci/common-ci.yml b/ci/common-ci.yml index 97ae84ae0c..f070765240 100644 --- a/ci/common-ci.yml +++ b/ci/common-ci.yml @@ -24,10 +24,9 @@ stages: - TAG_REPO=`find $SPACK_DLAF_REPO -type f -exec sha256sum {} \; | sha256sum - | head -c 16` - TAG_ENVIRONMENT=`cat $SPACK_ENVIRONMENT $COMMON_SPACK_ENVIRONMENT | sha256sum | head -c 16` - TAG=${TAG_IMAGE}-${TAG_APTGET}-${TAG_COMPILER}-MKL${USE_MKL}-${TAG_DOCKERFILE}-${TAG_SPACK}-${TAG_REPO}-${TAG_ENVIRONMENT} - - export PERSIST_IMAGE_NAME=$BUILD_IMAGE:$TAG - - echo "BUILD_IMAGE=$PERSIST_IMAGE_NAME" > build.env + - export PERSIST_IMAGE_NAME=$DEPS_IMAGE:$TAG + - echo "DEPS_IMAGE=$PERSIST_IMAGE_NAME" > build.env - echo "USE_MKL=$USE_MKL" >> build.env - - echo "USE_ROCBLAS=$USE_ROCBLAS" >> build.env - echo "USE_CODECOV=$USE_CODECOV" >> build.env - 'echo "INFO: Building image $PERSIST_IMAGE_NAME"' - 'echo "INFO: Using NUM_CORES_BUILD_DEPS=$NUM_CORES_BUILD_DEPS"' @@ -35,11 +34,10 @@ stages: reports: dotenv: build.env variables: - SPACK_SHA: 0905edf592752742eb4ddab3a528d3aee8f92930 + SPACK_SHA: develop-2024-10-06 SPACK_DLAF_REPO: ./spack DOCKER_BUILD_ARGS: '[ "BASE_IMAGE", - "BUILDKIT_INLINE_CACHE=1", "SPACK_SHA", "EXTRA_APTGET", "COMPILER", @@ -57,7 +55,6 @@ stages: EXTRA_APTGET: "" CXXSTD: 17 USE_MKL: "OFF" - USE_ROCBLAS: "OFF" COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml USE_CODECOV: "false" @@ -69,23 +66,17 @@ stages: - 'echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin' - 'echo "INFO: Using NUM_CORES_BUILD_DLAF=$NUM_CORES_BUILD_DLAF"' after_script: - - podman run -v $PWD/ci/ctest_to_gitlab.sh:/ctest_to_gitlab.sh $DEPLOY_IMAGE /ctest_to_gitlab.sh "$DEPLOY_IMAGE" "$USE_CODECOV" "$THREADS_PER_NODE" "$SLURM_CONSTRAINT" > pipeline.yml + - podman run -v $PWD/ci/ctest_to_gitlab.sh:/ctest_to_gitlab.sh $DLAF_IMAGE /ctest_to_gitlab.sh "$DLAF_IMAGE" "$USE_CODECOV" "$THREADS_PER_NODE" "$SLURM_CONSTRAINT" > pipeline.yml variables: - PERSIST_IMAGE_NAME: $DEPLOY_IMAGE + 
PERSIST_IMAGE_NAME: $DLAF_IMAGE DOCKER_BUILD_ARGS: '[ - "BUILD_IMAGE", - "DEPLOY_BASE_IMAGE", - "EXTRA_APTGET_DEPLOY", + "DEPS_IMAGE", "PIP_OPTS", - "USE_MKL", - "USE_ROCBLAS", "NUM_PROCS=$NUM_CORES_BUILD_DLAF" ]' # default configuration variables # can be overwritten in the configuration as needed DOCKERFILE: ci/docker/deploy.Dockerfile - DEPLOY_BASE_IMAGE: docker.io/ubuntu:24.04 - EXTRA_APTGET_DEPLOY: "" PIP_OPTS: "" artifacts: paths: diff --git a/ci/cpu/asan_ubsan_lsan.yml b/ci/cpu/asan_ubsan_lsan.yml index f5cb1224fd..bcc987399a 100644 --- a/ci/cpu/asan_ubsan_lsan.yml +++ b/ci/cpu/asan_ubsan_lsan.yml @@ -6,9 +6,9 @@ cpu asan ubsan lsan deps: variables: EXTRA_APTGET: "clang-18 libclang-rt-18-dev libomp-18-dev" COMPILER: clang@18 - USE_MKL: "ON" SPACK_ENVIRONMENT: ci/docker/asan-ubsan-lsan.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/build + USE_MKL: "ON" + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/deps cpu asan ubsan lsan build: extends: @@ -17,9 +17,7 @@ cpu asan ubsan lsan build: needs: - cpu asan ubsan lsan deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/deploy:$CI_COMMIT_SHA - # For symbolizing stacktraces with llvm-symbolizer - EXTRA_APTGET_DEPLOY: "llvm-18" + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/dlaf:$CI_COMMIT_SHA cpu asan ubsan lsan test: extends: .run_common diff --git a/ci/cpu/clang15_release.yml b/ci/cpu/clang15_release.yml index 233f8ab185..36231c2979 100644 --- a/ci/cpu/clang15_release.yml +++ b/ci/cpu/clang15_release.yml @@ -6,9 +6,9 @@ cpu clang15 release deps: variables: EXTRA_APTGET: "clang-15" COMPILER: clang@15 - USE_MKL: "ON" SPACK_ENVIRONMENT: ci/docker/release-cpu-serial.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/build + USE_MKL: "ON" + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/deps cpu clang15 release build: extends: @@ -17,7 +17,7 @@ cpu clang15 release build: needs: - cpu clang15 release deps variables: - DEPLOY_IMAGE: 
$CSCS_REGISTRY_PATH/cpu-clang15-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/dlaf:$CI_COMMIT_SHA cpu clang15 release test: extends: .run_common diff --git a/ci/cpu/clang15_release_cxx20.yml b/ci/cpu/clang15_release_cxx20.yml index 2875e7a0c7..d045898e62 100644 --- a/ci/cpu/clang15_release_cxx20.yml +++ b/ci/cpu/clang15_release_cxx20.yml @@ -9,7 +9,7 @@ cpu clang15 cxx20 release deps: CXXSTD: 20 SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml USE_MKL: "ON" - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-20-release/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-20-release/deps cpu clang15 cxx20 release build: extends: @@ -18,7 +18,7 @@ cpu clang15 cxx20 release build: needs: - cpu clang15 cxx20 release deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-cxx20-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-cxx20-release/dlaf:$CI_COMMIT_SHA cpu clang15 cxx20 release test: extends: .run_common diff --git a/ci/cpu/clang15_release_stdexec.yml b/ci/cpu/clang15_release_stdexec.yml index da096bf204..bf1d8c73d1 100644 --- a/ci/cpu/clang15_release_stdexec.yml +++ b/ci/cpu/clang15_release_stdexec.yml @@ -7,9 +7,9 @@ cpu clang15 stdexec release deps: EXTRA_APTGET: "clang-15 libomp-15-dev" COMPILER: clang@15 CXXSTD: 20 - USE_MKL: "ON" SPACK_ENVIRONMENT: ci/docker/release-cpu-stdexec.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/build + USE_MKL: "ON" + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/deps cpu clang15 stdexec release build: extends: @@ -18,7 +18,7 @@ cpu clang15 stdexec release build: needs: - cpu clang15 stdexec release deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/dlaf:$CI_COMMIT_SHA cpu clang15 stdexec release test: extends: .run_common diff --git a/ci/cpu/clang16_release.yml b/ci/cpu/clang16_release.yml index ac650f90b5..af91352e3c 
100644 --- a/ci/cpu/clang16_release.yml +++ b/ci/cpu/clang16_release.yml @@ -6,9 +6,9 @@ cpu clang16 release deps: variables: EXTRA_APTGET: "clang-16 libomp-16-dev" COMPILER: clang@16 - USE_MKL: "ON" SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/build + USE_MKL: "ON" + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/deps cpu clang16 release build: extends: @@ -17,7 +17,7 @@ cpu clang16 release build: needs: - cpu clang16 release deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/dlaf:$CI_COMMIT_SHA cpu clang16 release test: extends: .run_common diff --git a/ci/cpu/clang18_release.yml b/ci/cpu/clang18_release.yml index 6a01bc5a01..db61babf2b 100644 --- a/ci/cpu/clang18_release.yml +++ b/ci/cpu/clang18_release.yml @@ -6,9 +6,9 @@ cpu clang18 release deps: variables: EXTRA_APTGET: "clang-18 libomp-18-dev" COMPILER: clang@18 - USE_MKL: "ON" SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/build + USE_MKL: "ON" + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/deps cpu clang18 release build: extends: @@ -17,7 +17,7 @@ cpu clang18 release build: needs: - cpu clang18 release deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/dlaf:$CI_COMMIT_SHA cpu clang18 release test: extends: .run_common diff --git a/ci/cpu/gcc11_debug_stdexec.yml b/ci/cpu/gcc11_debug_stdexec.yml index 5aedcc04c1..be5df32f0c 100644 --- a/ci/cpu/gcc11_debug_stdexec.yml +++ b/ci/cpu/gcc11_debug_stdexec.yml @@ -9,7 +9,7 @@ cpu gcc11 stdexec debug deps: CXXSTD: 20 SPACK_ENVIRONMENT: ci/docker/debug-cpu-stdexec.yaml USE_MKL: "ON" - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/deps cpu gcc11 stdexec debug build: extends: @@ -18,7 
+18,7 @@ cpu gcc11 stdexec debug build: needs: - cpu gcc11 stdexec debug deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/dlaf:$CI_COMMIT_SHA cpu gcc11 stdexec debug test: extends: .run_common diff --git a/ci/cpu/gcc11_release_stdexec.yml b/ci/cpu/gcc11_release_stdexec.yml index 0ba3660d10..f74112fa67 100644 --- a/ci/cpu/gcc11_release_stdexec.yml +++ b/ci/cpu/gcc11_release_stdexec.yml @@ -7,9 +7,9 @@ cpu gcc11 stdexec release deps: EXTRA_APTGET: "gcc-11 g++-11 gfortran-11" COMPILER: gcc@11 CXXSTD: 20 - USE_MKL: "ON" SPACK_ENVIRONMENT: ci/docker/release-cpu-stdexec.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/build + USE_MKL: "ON" + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/deps cpu gcc11 stdexec release build: extends: @@ -18,7 +18,7 @@ cpu gcc11 stdexec release build: needs: - cpu gcc11 stdexec release deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/dlaf:$CI_COMMIT_SHA cpu gcc11 stdexec release test: extends: .run_common diff --git a/ci/cpu/gcc12_release_cxx20.yml b/ci/cpu/gcc12_release_cxx20.yml index ced430f279..d1bf81503c 100644 --- a/ci/cpu/gcc12_release_cxx20.yml +++ b/ci/cpu/gcc12_release_cxx20.yml @@ -9,7 +9,7 @@ cpu gcc12 cxx20 release deps: CXXSTD: 20 SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml USE_MKL: "ON" - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-release/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-release/deps cpu gcc12 cxx20 release build: extends: @@ -18,7 +18,7 @@ cpu gcc12 cxx20 release build: needs: - cpu gcc12 cxx20 release deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-cxx20-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-cxx20-release/dlaf:$CI_COMMIT_SHA cpu gcc12 cxx20 release test: extends: .run_common diff --git a/ci/cpu/gcc13_codecov.yml 
b/ci/cpu/gcc13_codecov.yml index 326973e326..a83e9320ed 100644 --- a/ci/cpu/gcc13_codecov.yml +++ b/ci/cpu/gcc13_codecov.yml @@ -6,7 +6,7 @@ cpu gcc13 codecov deps: variables: COMPILER: gcc@13 SPACK_ENVIRONMENT: ci/docker/debug-cpu.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-codecov/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-codecov/deps USE_CODECOV: "true" cpu gcc13 codecov build: @@ -17,7 +17,7 @@ cpu gcc13 codecov build: - cpu gcc13 codecov deps variables: DOCKERFILE: ci/docker/codecov.Dockerfile - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-codecov/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-codecov/dlaf:$CI_COMMIT_SHA PIP_OPTS: "--break-system-packages" cpu gcc13 codecov test: diff --git a/ci/cpu/gcc13_release.yml b/ci/cpu/gcc13_release.yml index daa0d77951..6f95b9603a 100644 --- a/ci/cpu/gcc13_release.yml +++ b/ci/cpu/gcc13_release.yml @@ -5,9 +5,9 @@ cpu gcc13 release deps: extends: .build_deps_common variables: COMPILER: gcc@13 - USE_MKL: "ON" SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-release/build + USE_MKL: "ON" + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-release/deps cpu gcc13 release build: extends: @@ -16,7 +16,7 @@ cpu gcc13 release build: needs: - cpu gcc13 release deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-release/dlaf:$CI_COMMIT_SHA cpu gcc13 release test: extends: .run_common diff --git a/ci/ctest_to_gitlab.sh b/ci/ctest_to_gitlab.sh index fb5c6cf423..cedf894f80 100755 --- a/ci/ctest_to_gitlab.sh +++ b/ci/ctest_to_gitlab.sh @@ -16,6 +16,7 @@ THREADS_PER_NODE="$3" SLURM_CONSTRAINT="$4" if [ "$USE_CODECOV" = true ]; then +# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197) BASE_TEMPLATE=" include: - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.cscs.yml' @@ -30,7 +31,7 @@ variables: SLURM_EXCLUSIVE: '' SLURM_EXACT: '' 
SLURM_CONSTRAINT: $SLURM_CONSTRAINT - CRAY_CUDA_MPS: 1 + CRAY_CUDA_MPS: 0 MPICH_MAX_THREAD_SAFETY: multiple {{JOBS}} @@ -65,6 +66,7 @@ JOB_TEMPLATE=" paths: - codecov-reports/" else +# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197) BASE_TEMPLATE=" include: - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.cscs.yml' @@ -78,7 +80,7 @@ variables: SLURM_EXCLUSIVE: '' SLURM_EXACT: '' SLURM_CONSTRAINT: $SLURM_CONSTRAINT - CRAY_CUDA_MPS: 1 + CRAY_CUDA_MPS: 0 MPICH_MAX_THREAD_SAFETY: multiple {{JOBS}} diff --git a/ci/cuda/gcc11_codecov.yml b/ci/cuda/gcc11_codecov.yml index cd81d00fb1..31fcea1150 100644 --- a/ci/cuda/gcc11_codecov.yml +++ b/ci/cuda/gcc11_codecov.yml @@ -7,7 +7,7 @@ cuda gcc11 codecov deps: BASE_IMAGE: docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04 COMPILER: gcc@11 SPACK_ENVIRONMENT: ci/docker/debug-cuda.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-codecov/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-codecov/deps USE_CODECOV: "true" cuda gcc11 codecov build: @@ -18,8 +18,7 @@ cuda gcc11 codecov build: - cuda gcc11 codecov deps variables: DOCKERFILE: ci/docker/codecov.Dockerfile - DEPLOY_BASE_IMAGE: docker.io/ubuntu:22.04 - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-codecov/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-codecov/dlaf:$CI_COMMIT_SHA cuda gcc11 codecov test: extends: .run_common diff --git a/ci/cuda/gcc11_debug_scalapack.yml b/ci/cuda/gcc11_debug_scalapack.yml index 707ec2b51d..98b07a1d03 100644 --- a/ci/cuda/gcc11_debug_scalapack.yml +++ b/ci/cuda/gcc11_debug_scalapack.yml @@ -7,7 +7,7 @@ cuda gcc11 debug scalapack deps: BASE_IMAGE: docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04 COMPILER: gcc@11 SPACK_ENVIRONMENT: ci/docker/debug-cuda-scalapack.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-debug/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-debug/deps cuda gcc11 debug scalapack build: extends: @@ -16,8 +16,7 @@ cuda gcc11 debug scalapack 
build: needs: - cuda gcc11 debug scalapack deps variables: - DEPLOY_BASE_IMAGE: docker.io/ubuntu:22.04 - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-debug/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-debug/dlaf:$CI_COMMIT_SHA cuda gcc11 debug scalapack test: extends: .run_common diff --git a/ci/cuda/gcc11_release.yml b/ci/cuda/gcc11_release.yml index 7d47f16d09..40d2b20bf2 100644 --- a/ci/cuda/gcc11_release.yml +++ b/ci/cuda/gcc11_release.yml @@ -6,10 +6,9 @@ cuda gcc11 release deps: variables: BASE_IMAGE: docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04 COMPILER: gcc@11 - CXXSTD: 17 SPACK_ENVIRONMENT: ci/docker/release-cuda.yaml USE_MKL: "ON" - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/deps cuda gcc11 release build: extends: @@ -18,8 +17,7 @@ cuda gcc11 release build: needs: - cuda gcc11 release deps variables: - DEPLOY_BASE_IMAGE: docker.io/ubuntu:22.04 - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/dlaf:$CI_COMMIT_SHA cuda gcc11 release test: extends: .run_common diff --git a/ci/cuda/gcc11_release_scalapack.yml b/ci/cuda/gcc11_release_scalapack.yml index ca4cc45d3f..5a668e4439 100644 --- a/ci/cuda/gcc11_release_scalapack.yml +++ b/ci/cuda/gcc11_release_scalapack.yml @@ -8,7 +8,7 @@ cuda gcc11 release scalapack deps: COMPILER: gcc@11 SPACK_ENVIRONMENT: ci/docker/release-cuda-scalapack.yaml USE_MKL: "ON" - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-release/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-release/deps cuda gcc11 release scalapack build: extends: @@ -17,8 +17,7 @@ cuda gcc11 release scalapack build: needs: - cuda gcc11 release scalapack deps variables: - DEPLOY_BASE_IMAGE: docker.io/ubuntu:22.04 - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: 
$CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-release/dlaf:$CI_COMMIT_SHA cuda gcc11 release scalapack test: extends: .run_common diff --git a/ci/cuda/gcc13_release_stdexec.yml b/ci/cuda/gcc13_release_stdexec.yml index 24dccbdf1a..e3a3584aaa 100644 --- a/ci/cuda/gcc13_release_stdexec.yml +++ b/ci/cuda/gcc13_release_stdexec.yml @@ -9,7 +9,7 @@ cuda gcc13 stdexec release deps: CXXSTD: 20 SPACK_ENVIRONMENT: ci/docker/release-cuda-stdexec.yaml USE_MKL: "ON" - BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/deps cuda gcc13 stdexec release build: extends: @@ -18,4 +18,4 @@ cuda gcc13 stdexec release build: needs: - cuda gcc13 stdexec release deps variables: - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release/dlaf:$CI_COMMIT_SHA diff --git a/ci/docker/build.Dockerfile b/ci/docker/build.Dockerfile index fadb655dd0..d1f36e8be5 100644 --- a/ci/docker/build.Dockerfile +++ b/ci/docker/build.Dockerfile @@ -6,28 +6,33 @@ FROM $BASE_IMAGE LABEL com.jfrog.artifactory.retention.maxDays="21" ENV DEBIAN_FRONTEND=noninteractive \ - PATH="$PATH:/opt/spack/bin:/opt/libtree" \ + PATH="$PATH:/opt/spack/bin" \ SPACK_COLOR=always + +# Overwrite entrypoint as NVIDIA images set a script that clog the output. 
+ENTRYPOINT [] +CMD [ "/bin/bash" ] SHELL ["/bin/bash", "-c"] ARG EXTRA_APTGET +# python is needed for spack and fastcov +# codecov upload needs curl + ca-certificates +# glibc-tools is needed for libSegFault on ubuntu > 22.04 +# jq, strace are needed for check-threads +# tzdata is needed to print correct time RUN apt-get -yqq update && \ apt-get -yqq install --no-install-recommends \ software-properties-common \ build-essential gfortran \ autoconf automake libssl-dev ninja-build pkg-config \ - ${EXTRA_APTGET} \ - gawk \ + gawk git tar \ + wget curl ca-certificates gpg-agent tzdata \ python3 python3-setuptools \ - git tar wget curl ca-certificates gpg-agent jq tzdata \ - patchelf unzip file gnupg2 libncurses-dev && \ + glibc-tools jq strace \ + patchelf unzip file gnupg2 libncurses-dev \ + ${EXTRA_APTGET} && \ rm -rf /var/lib/apt/lists/* -# Install libtree for packaging -RUN mkdir -p /opt/libtree && \ - curl -Lfso /opt/libtree/libtree https://github.com/haampie/libtree/releases/download/v2.0.0/libtree_x86_64 && \ - chmod +x /opt/libtree/libtree - # Install MKL and remove static libs (to keep image smaller) ARG USE_MKL=ON ARG MKL_VERSION=2024.0 @@ -84,11 +89,13 @@ RUN spack repo add --scope site /user_repo # e.g. --build-arg SPACK_ENVIRONMENT=ci/spack/my-env.yaml ARG SPACK_ENVIRONMENT ARG COMMON_SPACK_ENVIRONMENT +ARG ENV_VIEW=/view + # Build dependencies # 1. Create a spack environment named `ci` from the input spack.yaml file COPY $SPACK_ENVIRONMENT /spack_environment/spack.yaml COPY $COMMON_SPACK_ENVIRONMENT /spack_environment/ -RUN spack env create --without-view ci /spack_environment/spack.yaml +RUN spack env create --with-view ${ENV_VIEW} ci /spack_environment/spack.yaml # 2. Set the C++ standard ARG CXXSTD=17 RUN spack -e ci config add "packages:dla-future:variants:cxxstd=${CXXSTD}" @@ -98,3 +105,5 @@ RUN spack -e ci install --jobs ${NUM_PROCS} --fail-fast --only=dependencies # make ctest executable available. 
RUN ln -s `spack -e ci location -i cmake`/bin/ctest /usr/bin/ctest + +RUN echo ${ENV_VIEW}/lib > /etc/ld.so.conf.d/dlaf.conf && ldconfig diff --git a/ci/docker/codecov.Dockerfile b/ci/docker/codecov.Dockerfile index 3673b680ac..a838f943cc 100644 --- a/ci/docker/codecov.Dockerfile +++ b/ci/docker/codecov.Dockerfile @@ -1,18 +1,17 @@ -ARG BUILD_IMAGE -ARG DEPLOY_BASE_IMAGE +ARG DEPS_IMAGE +FROM $DEPS_IMAGE -# This is the folder where the project is built +LABEL com.jfrog.artifactory.retention.maxDays="7" +LABEL com.jfrog.artifactory.retention.maxCount="10" + +# Directory where the project is built ARG BUILD=/DLA-Future-build -# This is where we copy the sources to +# Directory where the miniapps are built as separate project +ARG BUILD_MINIAPP=/DLA-Future-miniapp-build +# Directory where the sources are copied to ARG SOURCE=/DLA-Future -# Where a bunch of shared libs live -ARG DEPLOY=/root/DLA-Future.bundle - -FROM $BUILD_IMAGE as builder - -ARG BUILD -ARG SOURCE -ARG DEPLOY +# Directory for some helper executables +ARG BIN=/DLA-Future-build/bin # Build DLA-Future COPY . 
${SOURCE} @@ -30,81 +29,24 @@ RUN spack repo rm --scope site dlaf && \ spack -e ci concretize -f && \ spack -e ci --config "config:flags:keep_werror:all" install --jobs ${NUM_PROCS} --keep-stage --verbose -# Prune and bundle binaries -RUN mkdir ${BUILD}-tmp && cd ${BUILD} && \ - export TEST_BINARIES=`PATH=${SOURCE}/ci:$PATH ctest --show-only=json-v1 | jq '.tests | map(.command | .[] | select(contains("check-threads") | not)) | .[]' | tr -d \"` && \ - echo "Binary sizes:" && \ - ls -lh ${TEST_BINARIES} && \ - ls -lh src/lib* && \ - libtree -d ${DEPLOY} ${TEST_BINARIES} && \ - rm -rf ${DEPLOY}/usr/bin && \ - libtree -d ${DEPLOY} $(which ctest gcov addr2line) && \ - cp -L ${SOURCE}/ci/{mpi-ctest,check-threads,upload_codecov} ${DEPLOY}/usr/bin && \ - echo "$TEST_BINARIES" | xargs -I{file} find -samefile {file} -exec cp --parents '{}' ${BUILD}-tmp ';' && \ - find '(' -name CTestTestfile.cmake -o -iname "*.gcno" ')' -exec cp --parents '{}' ${BUILD}-tmp ';' && \ - rm -rf ${BUILD} && \ - mv ${BUILD}-tmp ${BUILD} && \ - rm -rf ${SOURCE}/.git - -# Deploy Extra RocBlas files separately. 
-ARG USE_ROCBLAS=OFF -RUN mkdir ${DEPLOY}/usr/lib/rocblas; \ - if [ "$USE_ROCBLAS" = "ON" ]; then \ - cp -r `spack -e ci location -i rocblas`/lib/rocblas/library ${DEPLOY}/usr/lib/rocblas ; \ - fi +RUN mkdir -p ${BIN} && cp -L ${SOURCE}/ci/{mpi-ctest,check-threads,upload_codecov} ${BIN} -# Multistage build, this is the final small image -FROM $DEPLOY_BASE_IMAGE - -# set jfrog autoclean policy -LABEL com.jfrog.artifactory.retention.maxDays="7" -LABEL com.jfrog.artifactory.retention.maxCount="10" - -ENV DEBIAN_FRONTEND noninteractive - -ARG BUILD -ARG SOURCE -ARG DEPLOY - -ARG EXTRA_APTGET_DEPLOY ARG PIP_OPTS -# python is needed for fastcov # pip is needed only to install fastcov (it is removed with # its dependencies after fastcov installation) -# codecov upload needs curl + ca-certificates -# glibc-tools is needed for libSegFault on ubuntu:22.04 -# jq, strace are needed for check-threads -# tzdata is needed to print correct time RUN apt-get update -qq && \ - apt-get install -qq -y --no-install-recommends \ - ${EXTRA_APTGET_DEPLOY} \ - python3 python3-pip \ - curl \ - ca-certificates \ - glibc-tools jq strace \ - tzdata && \ + apt-get install -qq -y --no-install-recommends python3-pip && \ pip install ${PIP_OPTS} fastcov && \ apt-get autoremove -qq -y python3-pip && \ apt-get clean -# Copy the executables and the codecov gcno files -COPY --from=builder ${BUILD} ${BUILD} -COPY --from=builder ${DEPLOY} ${DEPLOY} - -# Copy the source files into the image as well. -# This is necessary for code coverage of MPI tests: gcov has to have write temporary -# data into the source folder. In distributed applications we can therefore not mount -# the git repo folder at runtime in the container, because it is shared and would -# cause race conditions in gcov. 
-COPY --from=builder ${SOURCE} ${SOURCE} - RUN cd /usr/local/bin && \ curl -Ls https://codecov.io/bash > codecov.sh && \ echo "f0e7a3ee76a787c37aa400cf44aee0c9b473b2fa79092edfb36d1faa853bbe23 codecov.sh" | sha256sum --check --quiet && \ chmod +x codecov.sh # Make it easy to call our binaries. -ENV PATH="${DEPLOY}/usr/bin:$PATH" +ENV PATH="${BIN}:$PATH" ENV NVIDIA_VISIBLE_DEVICES all ENV NVIDIA_DRIVER_CAPABILITIES compute,utility ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2" @@ -115,6 +57,4 @@ ENV ENABLE_COVERAGE="YES" # Automatically print stacktraces on segfault ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so -RUN echo "${DEPLOY}/usr/lib/" > /etc/ld.so.conf.d/dlaf.conf && ldconfig - WORKDIR ${BUILD} diff --git a/ci/docker/debug-cpu-stdexec.yaml b/ci/docker/debug-cpu-stdexec.yaml index 03578ac143..f225d71995 100644 --- a/ci/docker/debug-cpu-stdexec.yaml +++ b/ci/docker/debug-cpu-stdexec.yaml @@ -29,7 +29,3 @@ spack: - '+stdexec' - 'build_type=Debug' - 'malloc=system' - stdexec: - require: - - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main' - - 'build_type=Debug' diff --git a/ci/docker/deploy.Dockerfile b/ci/docker/deploy.Dockerfile index a500dd918e..42a11b2264 100644 --- a/ci/docker/deploy.Dockerfile +++ b/ci/docker/deploy.Dockerfile @@ -1,18 +1,17 @@ -ARG BUILD_IMAGE -ARG DEPLOY_BASE_IMAGE +ARG DEPS_IMAGE +FROM $DEPS_IMAGE -# This is the folder where the project is built +LABEL com.jfrog.artifactory.retention.maxDays="7" +LABEL com.jfrog.artifactory.retention.maxCount="10" + +# Directory where the project is built ARG BUILD=/DLA-Future-build -# This is where we copy the sources to +# Directory where the miniapps are built as separate project +ARG BUILD_MINIAPP=/DLA-Future-miniapp-build +# Directory where the sources are copied to ARG SOURCE=/DLA-Future -# Where a bunch of shared libs live -ARG DEPLOY=/root/DLA-Future.bundle - -FROM $BUILD_IMAGE as builder - -ARG BUILD -ARG SOURCE -ARG DEPLOY +# Directory for some helper executables +ARG BIN=/DLA-Future-build/bin # 
Build DLA-Future COPY . ${SOURCE} @@ -25,87 +24,18 @@ RUN spack repo rm --scope site dlaf && \ spack repo add ${SOURCE}/spack && \ spack -e ci develop --no-clone --path ${SOURCE} --build-directory ${BUILD} dla-future@master && \ spack -e ci concretize -f && \ - spack -e ci --config "config:flags:keep_werror:all" install --jobs ${NUM_PROCS} --keep-stage --verbose + spack -e ci --config "config:flags:keep_werror:all" install --jobs ${NUM_PROCS} --keep-stage --verbose && \ + find ${BUILD} -name CMakeFiles -exec rm -rf {} + # Test deployment with miniapps as independent project -RUN pushd ${SOURCE}/miniapp && \ - mkdir build-miniapps && cd build-miniapps && \ +RUN mkdir ${BUILD_MINIAPP} && cd ${BUILD_MINIAPP} && \ spack -e ci build-env dla-future@master -- \ - bash -c "cmake -DCMAKE_PREFIX_PATH=`spack -e ci location -i dla-future` .. && make -j ${NUM_PROCS}" && \ - popd - -# Prune and bundle binaries -RUN mkdir ${BUILD}-tmp && cd ${BUILD} && \ - export TEST_BINARIES=`PATH=${SOURCE}/ci:$PATH ctest --show-only=json-v1 | jq '.tests | map(.command | .[] | select(contains("check-threads") | not)) | .[]' | tr -d \"` && \ - LIBASAN=$(find /usr/lib -name libclang_rt.asan-x86_64.so) && \ - if [[ -n "${LIBASAN}" ]]; then export LD_LIBRARY_PATH=$(dirname ${LIBASAN}):${LD_LIBRARY_PATH}; fi && \ - echo "Binary sizes:" && \ - ls -lh ${TEST_BINARIES} && \ - ls -lh src/lib* && \ - libtree -d ${DEPLOY} ${TEST_BINARIES} && \ - rm -rf ${DEPLOY}/usr/bin && \ - libtree -d ${DEPLOY} $(which ctest addr2line) && \ - cp -L ${SOURCE}/ci/{mpi-ctest,check-threads} ${DEPLOY}/usr/bin && \ - echo "$TEST_BINARIES" | xargs -I{file} find -samefile {file} -exec cp --parents '{}' ${BUILD}-tmp ';' && \ - find -name CTestTestfile.cmake -exec cp --parents '{}' ${BUILD}-tmp ';' && \ - rm -rf ${BUILD} && \ - mv ${BUILD}-tmp ${BUILD} - -# Deploy MKL separately, since it dlopen's some libs -ARG USE_MKL=ON -RUN if [ "$USE_MKL" = "ON" ]; then \ - export MKL_LIB=$(dirname $(find $(spack location -i 
intel-oneapi-mkl) -name libmkl_core.so)) && \ - libtree -d ${DEPLOY} \ - ${MKL_LIB}/libmkl_avx2.so.2 \ - ${MKL_LIB}/libmkl_avx512.so.2 \ - ${MKL_LIB}/libmkl_core.so \ - ${MKL_LIB}/libmkl_def.so.2 \ - ${MKL_LIB}/libmkl_intel_thread.so \ - ${MKL_LIB}/libmkl_mc3.so.2 \ - ${MKL_LIB}/libmkl_sequential.so \ - ${MKL_LIB}/libmkl_tbb_thread.so \ - ${MKL_LIB}/libmkl_vml_avx2.so.2 \ - ${MKL_LIB}/libmkl_vml_avx512.so.2 \ - ${MKL_LIB}/libmkl_vml_cmpt.so.2 \ - ${MKL_LIB}/libmkl_vml_def.so.2 \ - ${MKL_LIB}/libmkl_vml_mc3.so.2 ; \ - fi - -# Deploy Extra RocBlas files separately. -ARG USE_ROCBLAS=OFF -RUN mkdir ${DEPLOY}/usr/lib/rocblas; \ - if [ "$USE_ROCBLAS" = "ON" ]; then \ - cp -r `spack -e ci location -i rocblas`/lib/rocblas/library ${DEPLOY}/usr/lib/rocblas ; \ - fi + bash -c "cmake -DCMAKE_PREFIX_PATH=`spack -e ci location -i dla-future` ${SOURCE}/miniapp && make -j ${NUM_PROCS}" -# Multistage build, this is the final small image -FROM $DEPLOY_BASE_IMAGE - -# set jfrog autoclean policy -LABEL com.jfrog.artifactory.retention.maxDays="7" -LABEL com.jfrog.artifactory.retention.maxCount="10" - -ENV DEBIAN_FRONTEND noninteractive - -ARG BUILD -ARG DEPLOY - -ARG EXTRA_APTGET_DEPLOY -# glibc-tools is needed for libSegFault on ubuntu:22.04 -# jq, strace are needed for check-threads -# tzdata is needed to print correct time -RUN apt-get update -qq && \ - apt-get install -qq -y --no-install-recommends \ - ${EXTRA_APTGET_DEPLOY} \ - glibc-tools jq strace \ - tzdata && \ - rm -rf /var/lib/apt/lists/* - -COPY --from=builder ${BUILD} ${BUILD} -COPY --from=builder ${DEPLOY} ${DEPLOY} +RUN mkdir -p ${BIN} && cp -L ${SOURCE}/ci/{mpi-ctest,check-threads} ${BIN} # Make it easy to call our binaries. 
-ENV PATH="${DEPLOY}/usr/bin:$PATH" +ENV PATH="${BIN}:$PATH" ENV NVIDIA_VISIBLE_DEVICES all ENV NVIDIA_DRIVER_CAPABILITIES compute,utility ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2" @@ -113,6 +43,4 @@ ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2" # Automatically print stacktraces on segfault ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so -RUN echo "${DEPLOY}/usr/lib/" > /etc/ld.so.conf.d/dlaf.conf && ldconfig - WORKDIR ${BUILD} diff --git a/ci/docker/release-cpu-stdexec.yaml b/ci/docker/release-cpu-stdexec.yaml index 412b47530e..4d3bcd6594 100644 --- a/ci/docker/release-cpu-stdexec.yaml +++ b/ci/docker/release-cpu-stdexec.yaml @@ -27,6 +27,3 @@ spack: pika: require: - '+stdexec' - stdexec: - require: - - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main' diff --git a/ci/docker/release-cuda-stdexec.yaml b/ci/docker/release-cuda-stdexec.yaml index 366ff247ef..f4847574b4 100644 --- a/ci/docker/release-cuda-stdexec.yaml +++ b/ci/docker/release-cuda-stdexec.yaml @@ -27,6 +27,3 @@ spack: pika: require: - '+stdexec' - stdexec: - require: - - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main' diff --git a/ci/docker/release-rocm533-stdexec.yaml b/ci/docker/release-rocm533-stdexec.yaml index 6008d85780..8713afa6bc 100644 --- a/ci/docker/release-rocm533-stdexec.yaml +++ b/ci/docker/release-rocm533-stdexec.yaml @@ -29,9 +29,6 @@ spack: pika: require: - '+stdexec' - stdexec: - require: - - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main' blas: require:: openblas lapack: diff --git a/ci/mpi-ctest b/ci/mpi-ctest index f8ca1b14d8..d753fbcb18 100755 --- a/ci/mpi-ctest +++ b/ci/mpi-ctest @@ -9,6 +9,19 @@ fi; pushd /DLA-Future-build > /dev/null +export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps +export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log + +if which nvidia-cuda-mps-control && [ $SLURM_LOCALID = 0 ]; then START_MPS=1; else START_MPS=0; fi &> /dev/null + +# Workaround on daint to avoid test hanging (See PR #1197) +# Launch MPS from a single rank per node +if [ $START_MPS -eq 1 ]; then + 
nvidia-cuda-mps-control -d +fi +# Wait for MPS to start +sleep 5 + # Run the tests, only output on the first rank if [[ $SLURM_PROCID == "0" ]]; then TZ=CET date +"Run started at: %H:%M:%S %z" @@ -18,6 +31,10 @@ else ctest -Q $@ fi +if [ $START_MPS -eq 1 ]; then + echo quit | nvidia-cuda-mps-control +fi + # Create coverage reports for code run if [[ "$ENABLE_COVERAGE" == "YES" ]]; then # On daint-mc (XC40) reduce the number of tasks to avoid out-of-memory error diff --git a/ci/rocm/clang14_release.yml b/ci/rocm/clang14_release.yml index 1a52ca6321..21636e54d7 100644 --- a/ci/rocm/clang14_release.yml +++ b/ci/rocm/clang14_release.yml @@ -9,7 +9,7 @@ rocm clang14 release deps: COMPILER: clang@14 USE_ROCBLAS: "ON" SPACK_ENVIRONMENT: ci/docker/release-rocm533.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-release/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-release/deps rocm clang14 release build: extends: @@ -18,5 +18,4 @@ rocm clang14 release build: needs: - rocm clang14 release deps variables: - DEPLOY_BASE_IMAGE: $CSCS_REGISTRY_PATH/rocm-patched:5.3.3 - DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-release/dlaf:$CI_COMMIT_SHA diff --git a/ci/rocm/clang14_release_stdexec.yml b/ci/rocm/clang14_release_stdexec.yml index 6c0407a562..3b55caf93b 100644 --- a/ci/rocm/clang14_release_stdexec.yml +++ b/ci/rocm/clang14_release_stdexec.yml @@ -9,7 +9,7 @@ rocm clang14 stdexec release deps: COMPILER: clang@14 USE_ROCBLAS: "ON" SPACK_ENVIRONMENT: ci/docker/release-rocm533-stdexec.yaml - BUILD_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-stdexec-release/build + DEPS_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-stdexec-release/deps rocm clang14 stdexec release build: extends: @@ -18,5 +18,4 @@ rocm clang14 stdexec release build: needs: - rocm clang14 stdexec release deps variables: - DEPLOY_BASE_IMAGE: $CSCS_REGISTRY_PATH/rocm-patched:5.3.3 - DEPLOY_IMAGE: 
$CSCS_REGISTRY_PATH/rocm-clang14-stdexec-release/deploy:$CI_COMMIT_SHA + DLAF_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-stdexec-release/dlaf:$CI_COMMIT_SHA diff --git a/include/dlaf/eigensolver/band_to_tridiag.h b/include/dlaf/eigensolver/band_to_tridiag.h index be907da16c..273299d921 100644 --- a/include/dlaf/eigensolver/band_to_tridiag.h +++ b/include/dlaf/eigensolver/band_to_tridiag.h @@ -84,7 +84,6 @@ TridiagResult band_to_tridiagonal(blas::Uplo uplo, SizeType band switch (uplo) { case blas::Uplo::Lower: return BandToTridiag::call_L(band_size, mat_a); - break; case blas::Uplo::Upper: DLAF_UNIMPLEMENTED(uplo); break; @@ -161,7 +160,6 @@ TridiagResult band_to_tridiagonal(comm::CommunicatorGrid& grid, switch (uplo) { case blas::Uplo::Lower: return BandToTridiag::call_L(grid, band_size, mat_a); - break; case blas::Uplo::Upper: DLAF_UNIMPLEMENTED(uplo); break; diff --git a/include/dlaf/eigensolver/band_to_tridiag/mc.h b/include/dlaf/eigensolver/band_to_tridiag/mc.h index ca680884ac..ede258197d 100644 --- a/include/dlaf/eigensolver/band_to_tridiag/mc.h +++ b/include/dlaf/eigensolver/band_to_tridiag/mc.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/include/dlaf/eigensolver/gen_to_std/impl.h b/include/dlaf/eigensolver/gen_to_std/impl.h index b5cfd3e2d0..9202ae6d0d 100644 --- a/include/dlaf/eigensolver/gen_to_std/impl.h +++ b/include/dlaf/eigensolver/gen_to_std/impl.h @@ -487,7 +487,6 @@ void GenToStd::call_L(comm::CommunicatorGrid& grid, Matrix(k)); hemmPanelTile(thread_priority::high, a_diag, mat_l.read(ik), mat_a.readwrite(ik)); @@ -759,7 +758,6 @@ void GenToStd::call_U(comm::CommunicatorGrid& grid, Matrix(k), j_local); hemmPanelTile(thread_priority::high, a_diag, mat_u.read(ki), mat_a.readwrite(ki)); diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h index 2a7882fd82..e3f4befb0e 100644 --- a/include/dlaf/eigensolver/reduction_to_band/impl.h +++ 
b/include/dlaf/eigensolver/reduction_to_band/impl.h @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -316,7 +317,7 @@ void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus, const std::vector>{}), // w (internally required) mat_taus.readwrite(LocalTileIndex(j_sub, 0)), ex::when_all_vector(std::move(panel_tiles))) | - ex::transfer(di::getBackendScheduler(thread_priority::high)) | + di::continues_on(di::getBackendScheduler(thread_priority::high)) | ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr, auto& w, auto& taus, auto& tiles) { const auto barrier_busy_wait = getReductionToBandBarrierBusyWait(); @@ -638,7 +639,7 @@ void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, mat_taus.readwrite(GlobalTileIndex(j_sub, 0)), ex::when_all_vector(std::move(panel_tiles)), std::forward(mpi_col_chain_panel), std::forward(trigger)) | - ex::transfer(di::getBackendScheduler(pika::execution::thread_priority::high)) | + di::continues_on(di::getBackendScheduler(pika::execution::thread_priority::high)) | ex::bulk(nthreads, [nthreads, rank_v0, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr, auto& w, auto& taus, auto& tiles, auto&& pcomm) { diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index 69f424afbf..e4eed8ff34 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -823,7 +824,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k ex::when_all_vector(tc.readwrite(z)), ex::when_all_vector(tc.readwrite(evals)), ex::when_all_vector(tc.read(i2)), ex::when_all_vector(tc.readwrite(evecs)), ex::just(std::vector>())) | - ex::transfer(di::getBackendScheduler(thread_priority::high)) | + 
di::continues_on(di::getBackendScheduler(thread_priority::high)) | ex::bulk(nthreads, [nthreads, n, nb](std::size_t thread_idx, auto& barrier_ptr, auto& k, auto& rho, auto& d_tiles, auto& z_tiles, auto& eval_tiles, const auto& i2_tile_arr, auto& evec_tiles, auto& ws_vecs) { @@ -1031,11 +1032,12 @@ void multiplyEigenvectors(const SizeType sub_offset, const SizeType n, const Siz // └───┴────────┴────┘ └────────────┴────┘ namespace ex = pika::execution::experimental; + using dlaf::internal::continues_on; using pika::execution::thread_priority; ex::start_detached( ex::when_all(std::forward(k), std::forward(n_udl)) | - ex::transfer(dlaf::internal::getBackendScheduler(thread_priority::high)) | + continues_on(dlaf::internal::getBackendScheduler(thread_priority::high)) | ex::then([sub_offset, n, n_upper, n_lower, e0 = e0.subPipeline(), e1 = e1.subPipelineConst(), e2 = e2.subPipelineConst()](const SizeType k, std::array n_udl) mutable { using dlaf::matrix::internal::MatrixRef; @@ -1082,7 +1084,6 @@ void mergeSubproblems(const SizeType i_begin, const SizeType i_split, const Size namespace di = dlaf::internal; using pika::execution::thread_priority; - const GlobalTileIndex idx_gl_begin(i_begin, i_begin); const LocalTileIndex idx_loc_begin(i_begin, i_begin); const SizeType nrtiles = i_end - i_begin; const LocalTileSize sz_loc_tiles(nrtiles, nrtiles); @@ -1334,7 +1335,7 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S // additional workspaces ex::just(std::vector>()), ex::just(memory::MemoryView())) | - ex::transfer(hp_scheduler) | + di::continues_on(hp_scheduler) | ex::let_value([n, dist_sub, bcast_evals, all_reduce_in_place, hp_scheduler]( auto& row_comm_wrapper, auto& col_comm_wrapper, const SizeType k, const SizeType k_lc, const auto& rho, const auto& d_tiles, auto& z_tiles, @@ -1353,7 +1354,7 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S return std::clamp(ideal_workers, min_workers, 
available_workers); }(); - return ex::just(std::make_unique>(nthreads)) | ex::transfer(hp_scheduler) | + return ex::just(std::make_unique>(nthreads)) | di::continues_on(hp_scheduler) | ex::bulk(nthreads, [&row_comm_wrapper, &col_comm_wrapper, k, k_lc, &rho, &d_tiles, &z_tiles, &eval_tiles, &i4_tiles_arr, &i6_tiles_arr, &i2_tiles_arr, &evec_tiles, &ws_cols, &ws_row, nthreads, n, dist_sub, bcast_evals, @@ -1762,11 +1763,12 @@ void multiplyEigenvectors(const GlobalElementIndex sub_offset, const matrix::Dis // └───┴────────┴────┘ └────────────┴────┘ namespace ex = pika::execution::experimental; + using dlaf::internal::continues_on; using pika::execution::thread_priority; ex::start_detached( ex::when_all(std::forward(k_lc), std::forward(n_udl)) | - ex::transfer(dlaf::internal::getBackendScheduler(thread_priority::high)) | + continues_on(dlaf::internal::getBackendScheduler(thread_priority::high)) | ex::then([dist_sub, sub_offset, n_upper, n_lower, e0 = e0.subPipeline(), e1 = e1.subPipelineConst(), e2 = e2.subPipelineConst(), sub_comm_row = row_task_chain.sub_pipeline(), @@ -1834,7 +1836,6 @@ void mergeDistSubproblems(comm::CommunicatorPipeline(dist, i_split, i_end); // The local size of the subproblem - const GlobalTileIndex idx_gl_begin(i_begin, i_begin); const LocalTileIndex idx_loc_begin{dist.next_local_tile_from_global_tile(i_begin), dist.next_local_tile_from_global_tile(i_begin)}; const LocalTileIndex idx_loc_end{dist.next_local_tile_from_global_tile(i_end), diff --git a/include/dlaf/factorization/cholesky/impl.h b/include/dlaf/factorization/cholesky/impl.h index ec6aed659e..5f26f0020e 100644 --- a/include/dlaf/factorization/cholesky/impl.h +++ b/include/dlaf/factorization/cholesky/impl.h @@ -196,7 +196,7 @@ void Cholesky::call_L(comm::CommunicatorGrid& grid, Matrix num_cholesky_calls = 0; std::stringstream fname; - fname << "cholesky-facrorization-" << matrix::internal::TypeToString_v << "-" + fname << "cholesky-factorization-" << matrix::internal::TypeToString_v 
<< "-" << std::to_string(num_cholesky_calls) << ".h5"; std::optional file; diff --git a/include/dlaf/init.h b/include/dlaf/init.h index e63da1896f..fe21d84ac5 100644 --- a/include/dlaf/init.h +++ b/include/dlaf/init.h @@ -38,10 +38,18 @@ struct configuration { bool print_config = false; std::size_t num_np_gpu_streams_per_thread = 3; std::size_t num_hp_gpu_streams_per_thread = 3; + std::size_t umpire_host_memory_pool_initial_block_bytes = 1 << 30; + std::size_t umpire_host_memory_pool_next_block_bytes = 1 << 30; + std::size_t umpire_host_memory_pool_alignment_bytes = 16; + double umpire_host_memory_pool_coalescing_free_ratio = 1.0; + double umpire_host_memory_pool_coalescing_reallocation_ratio = 1.0; + std::size_t umpire_device_memory_pool_initial_block_bytes = 1 << 30; + std::size_t umpire_device_memory_pool_next_block_bytes = 1 << 30; + std::size_t umpire_device_memory_pool_alignment_bytes = 16; + double umpire_device_memory_pool_coalescing_free_ratio = 1.0; + double umpire_device_memory_pool_coalescing_reallocation_ratio = 1.0; std::size_t num_gpu_blas_handles = 16; std::size_t num_gpu_lapack_handles = 16; - std::size_t umpire_host_memory_pool_initial_bytes = 1 << 30; - std::size_t umpire_device_memory_pool_initial_bytes = 1 << 30; std::string mpi_pool = "mpi"; }; diff --git a/include/dlaf/matrix/panel.h b/include/dlaf/matrix/panel.h index 498ed60697..f20de36e6e 100644 --- a/include/dlaf/matrix/panel.h +++ b/include/dlaf/matrix/panel.h @@ -361,6 +361,8 @@ struct Panel { return {mat_size - i_tile * mb, nb}; case Coord::Row: return {mb, mat_size - i_tile * nb}; + default: + return DLAF_UNREACHABLE(LocalElementSize); } } diff --git a/include/dlaf/memory/memory_chunk.h b/include/dlaf/memory/memory_chunk.h index c837d54f49..ab5846c2f0 100644 --- a/include/dlaf/memory/memory_chunk.h +++ b/include/dlaf/memory/memory_chunk.h @@ -27,11 +27,15 @@ namespace memory { namespace internal { umpire::Allocator& getUmpireHostAllocator(); -void 
initializeUmpireHostAllocator(std::size_t initial_bytes); +void initializeUmpireHostAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes, + std::size_t alignment_bytes, double coalesce_free_ratio, + double coalesce_reallocation_ratio); void finalizeUmpireHostAllocator(); #ifdef DLAF_WITH_GPU -void initializeUmpireDeviceAllocator(std::size_t initial_bytes); +void initializeUmpireDeviceAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes, + std::size_t alignment_bytes, double coalesce_free_ratio, + double coalesce_reallocation_ratio); void finalizeUmpireDeviceAllocator(); umpire::Allocator& getUmpireDeviceAllocator(); #endif diff --git a/include/dlaf/multiplication/hermitian.h b/include/dlaf/multiplication/hermitian.h index 6c228cc2d2..38f5e99ac1 100644 --- a/include/dlaf/multiplication/hermitian.h +++ b/include/dlaf/multiplication/hermitian.h @@ -66,7 +66,6 @@ void hermitian_multiplication(blas::Side side, blas::Uplo uplo, const T alpha, M switch (uplo) { case blas::Uplo::Lower: return multiplication::internal::Hermitian::call_LL(alpha, mat_a, mat_b, beta, mat_c); - break; case blas::Uplo::Upper: DLAF_UNIMPLEMENTED(uplo); break; @@ -130,7 +129,6 @@ void hermitian_multiplication(comm::CommunicatorGrid& grid, blas::Side side, bla case blas::Uplo::Lower: return multiplication::internal::Hermitian::call_LL(grid, alpha, mat_a, mat_b, beta, mat_c); - break; case blas::Uplo::Upper: DLAF_UNIMPLEMENTED(uplo); break; diff --git a/include/dlaf/permutations/general/impl.h b/include/dlaf/permutations/general/impl.h index 8e643ea8c7..ef3f8686a6 100644 --- a/include/dlaf/permutations/general/impl.h +++ b/include/dlaf/permutations/general/impl.h @@ -170,6 +170,7 @@ void Permutations::call(const SizeType i_begin, const SizeType i_end namespace ex = pika::execution::experimental; namespace dist_extra = dlaf::matrix::internal::distribution; using dist_extra::local_element_distance_from_global_tile; + using dlaf::internal::continues_on; if (i_begin 
== i_end) return; @@ -210,7 +211,7 @@ void Permutations::call(const SizeType i_begin, const SizeType i_end applyPermutationOnCPU(i_perm, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles); }; - ex::start_detached(std::move(sender) | ex::transfer(dlaf::internal::getBackendScheduler()) | + ex::start_detached(std::move(sender) | continues_on(dlaf::internal::getBackendScheduler()) | ex::bulk(nperms, std::move(permute_fn))); } else { @@ -430,7 +431,7 @@ void applyPackingIndex(const matrix::Distribution& subm_dist, IndexMapSender&& i applyPermutationOnCPU(i_perm, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles); }; - ex::start_detached(std::move(sender) | ex::transfer(di::getBackendScheduler()) | + ex::start_detached(std::move(sender) | di::continues_on(di::getBackendScheduler()) | ex::bulk(nperms, std::move(permute_fn))); } else { diff --git a/include/dlaf/sender/continues_on.h b/include/dlaf/sender/continues_on.h new file mode 100644 index 0000000000..12aae7c725 --- /dev/null +++ b/include/dlaf/sender/continues_on.h @@ -0,0 +1,20 @@ +// +// Distributed Linear Algebra with Future (DLAF) +// +// Copyright (c) 2018-2024, ETH Zurich +// All rights reserved. +// +// Please, refer to the LICENSE file in the root directory. +// SPDX-License-Identifier: BSD-3-Clause +// +#pragma once + +#include + +namespace dlaf::internal { +#if PIKA_VERSION_FULL < 0x001D00 // < 0.29.0 +inline constexpr pika::execution::experimental::transfer_t continues_on{}; +#else +using pika::execution::experimental::continues_on; +#endif +} diff --git a/include/dlaf/sender/transform.h b/include/dlaf/sender/transform.h index 78b66eb001..4e391456b4 100644 --- a/include/dlaf/sender/transform.h +++ b/include/dlaf/sender/transform.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -46,19 +47,19 @@ enum class TransformDispatchType { Plain, Blas, Lapack }; // allows choosing the priority. 
// // At its core, transform is a convenience wrapper around -// sender | transfer(with_priority(scheduler, priority)) | then(ConsumeRvalues(unwrapping(f))). +// sender | continues_on(with_priority(scheduler, priority)) | then(ConsumeRvalues(unwrapping(f))). /// Lazy transform. This does not submit the work and returns a sender. template >> [[nodiscard]] decltype(auto) transform(const Policy policy, F&& f, Sender&& sender) { + using dlaf::internal::continues_on; using pika::execution::experimental::drop_operation_state; using pika::execution::experimental::then; - using pika::execution::experimental::transfer; auto scheduler = getBackendScheduler(policy.priority(), policy.stacksize()); - auto transfer_sender = transfer(std::forward(sender), std::move(scheduler)); + auto transfer_sender = continues_on(std::forward(sender), std::move(scheduler)); using dlaf::common::internal::ConsumeRvalues; using dlaf::common::internal::Unwrapping; diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index edfbd7d419..f5c2ac728a 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -18,8 +18,8 @@ #include #include #include +#include #include -#include namespace dlaf::comm::internal { @@ -89,38 +89,14 @@ MPICallHelper(F&&) -> MPICallHelper>; template >> [[nodiscard]] decltype(auto) transformMPI(F&& f, Sender&& sender) { + using dlaf::internal::continues_on; namespace ex = pika::execution::experimental; - return ex::transfer(std::forward(sender), dlaf::internal::getMPIScheduler()) | + return continues_on(std::forward(sender), dlaf::internal::getMPIScheduler()) | ex::then(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::forward(f)}}) | ex::drop_operation_state(); } -/// Fire-and-forget transformMPI. This submits the work and returns void. 
-template >> -void transformMPIDetach(F&& f, Sender&& sender) { - pika::execution::experimental::start_detached(transformMPI(std::forward(f), - std::forward(sender))); -} - -/// Lazy transformMPI. This does not submit the work and returns a sender. First -/// lifts non-senders into senders using just, and then calls transform with a -/// when_all sender of the lifted senders. -template -[[nodiscard]] decltype(auto) transformMPILift(F&& f, Ts&&... ts) { - return transformMPI(std::forward(f), dlaf::internal::whenAllLift(std::forward(ts)...)); -} - -/// Fire-and-forget transformMPI. This submits the work and returns void. First -/// lifts non-senders into senders using just, and then calls transform with a -/// when_all sender of the lifted senders. -template -void transformMPILiftDetach(F&& f, Ts&&... ts) { - pika::execution::experimental::start_detached(transformLift(std::forward(f), - std::forward(ts)...)); -} - template struct PartialTransformMPIBase { std::decay_t f_; @@ -148,29 +124,6 @@ class PartialTransformMPI : private PartialTransformMPIBase { template PartialTransformMPI(F&& f) -> PartialTransformMPI>; -/// A partially applied transformMPIDetach, with the callable object given, but -/// the predecessor sender missing. The predecessor sender is applied when -/// calling the operator| overload. 
-template -class PartialTransformMPIDetach : private PartialTransformMPIBase { -public: - template - PartialTransformMPIDetach(F_&& f) : PartialTransformMPIBase{std::forward(f)} {} - PartialTransformMPIDetach(PartialTransformMPIDetach&&) = default; - PartialTransformMPIDetach(const PartialTransformMPIDetach&) = default; - PartialTransformMPIDetach& operator=(PartialTransformMPIDetach&&) = default; - PartialTransformMPIDetach& operator=(const PartialTransformMPIDetach&) = default; - - template - friend auto operator|(Sender&& sender, PartialTransformMPIDetach pa) { - return pika::execution::experimental::start_detached(transformMPI(std::move(pa.f_), - std::forward(sender))); - } -}; - -template -PartialTransformMPIDetach(F&& f) -> PartialTransformMPIDetach>; - /// \overload transformMPI /// /// This overload partially applies the MPI transform for later use with @@ -179,13 +132,4 @@ template [[nodiscard]] decltype(auto) transformMPI(F&& f) { return PartialTransformMPI{std::forward(f)}; } - -/// \overload transformMPIDetach -/// -/// This overload partially applies transformMPIDetach for later use with -/// operator| with a sender on the left-hand side. 
-template -[[nodiscard]] decltype(auto) transformMPIDetach(F&& f) { - return PartialTransformMPIDetach{std::forward(f)}; -} } diff --git a/miniapp/include/dlaf/miniapp/options.h b/miniapp/include/dlaf/miniapp/options.h index fc2248ab20..5254ce9291 100644 --- a/miniapp/include/dlaf/miniapp/options.h +++ b/miniapp/include/dlaf/miniapp/options.h @@ -170,7 +170,7 @@ T stringToBlasEnum(const std::string& option_name, const std::string& x, } } std::string option_name_dashes = "--" + option_name; - DLAF_MINIAPP_INVALID_OPTION_VALUE(option_name, x, valid_values_stream.str()); + DLAF_MINIAPP_INVALID_OPTION_VALUE(option_name_dashes, x, valid_values_stream.str()); } return static_cast(std::toupper(x[0])); diff --git a/scripts/plot_strong.sh b/scripts/plot_strong.sh new file mode 100755 index 0000000000..9a85e274a2 --- /dev/null +++ b/scripts/plot_strong.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash + +# +# Distributed Linear Algebra with Future (DLAF) +# +# Copyright (c) 2018-2024, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause +# + +# This script helps to plot the results of different benchmarks. + +set -eu + +#################################################################################################### +# Variables to modify: +debug=0 +python_venv_path="" +# Path where to find the benchmarks data as the root of z/ and d/ (since they are appended later) +# (list containing several paths if we want to compare data on the same plots) +base_paths=( + "" \ + "" \ + ) +# Path where you want your plotting results +out_path="" +#################################################################################################### + +if [[ -z $out_path ]] || [[ -z ${base_paths[0]} ]]; then + echo "You need to set the variables in the beginning of the script" + exit 1 +fi + +if [[ ! 
-z "$python_venv_path" ]]; then + source $python_venv_path/bin/activate +fi + +complex_paths=(${base_paths[@]/%//z}) +double_paths=(${base_paths[@]/%//d}) +out_path_complex=${out_path/%//z} +out_path_double=${out_path/%//d} + +args_base="--distinguish-dir" +args_double="$args_base --out-path ${out_path_double}" +args_complex="$args_base --out-path ${out_path_complex}" + +idx=0 +for path in "${base_paths[@]}"; do + args_complex+=" --path ${complex_paths[$idx]}" + args_double+=" --path ${double_paths[$idx]}" + idx=$((idx+1)) +done + +if [[ "$debug" == 1 ]]; then + BOLD=$(tput bold) + NORMAL=$(tput sgr0) + echo "${BOLD}double_paths list:${NORMAL} ${double_paths[@]}" + echo "${BOLD}complex_paths list:${NORMAL} ${complex_paths[@]}" + echo "${BOLD}double_args:${NORMAL} $args_double" + echo "${BOLD}complex_args:${NORMAL} $args_complex" +else + set -x + # double + ./plot_chol_strong.py $args_double & + ./plot_band2trid_strong.py $args_double & + ./plot_hegst_strong.py $args_double & + ./plot_trmm_strong.py $args_double & + ./plot_bt_band2trid_strong.py $args_double & + ./plot_evp_strong.py $args_double & + ./plot_red2band_strong.py $args_double & + ./plot_trsm_strong.py $args_double & + ./plot_bt_red2band_strong.py $args_double & + ./plot_gevp_strong.py $args_double & + ./plot_tridiag_solver_strong.py $args_double & + + # complex & + ./plot_chol_strong.py $args_complex & + ./plot_band2trid_strong.py $args_complex & + ./plot_hegst_strong.py $args_complex & + ./plot_trmm_strong.py $args_complex & + ./plot_bt_band2trid_strong.py $args_complex & + ./plot_evp_strong.py $args_complex & + ./plot_red2band_strong.py $args_complex & + ./plot_trsm_strong.py $args_complex & + ./plot_bt_red2band_strong.py $args_complex & + ./plot_gevp_strong.py $args_complex & + ./plot_tridiag_solver_strong.py $args_complex & + + wait +fi diff --git a/scripts/systems.py b/scripts/systems.py index b28584a5bc..c87e79d760 100644 --- a/scripts/systems.py +++ b/scripts/systems.py @@ -49,6 +49,7 @@ #SBATCH 
--constraint=mc #SBATCH --output=output.txt #SBATCH --error=error.txt +#SBATCH --no-requeue # Env export MPICH_MAX_THREAD_SAFETY=multiple @@ -81,6 +82,7 @@ #SBATCH --constraint=gpu #SBATCH --output=output.txt #SBATCH --error=error.txt +#SBATCH --no-requeue # Env export MPICH_MAX_THREAD_SAFETY=multiple @@ -112,6 +114,7 @@ #SBATCH --constraint=mc #SBATCH --output=output.txt #SBATCH --error=error.txt +#SBATCH --no-requeue # Env export MPICH_MAX_THREAD_SAFETY=multiple @@ -146,6 +149,7 @@ #SBATCH --hint=multithread #SBATCH --output=output.txt #SBATCH --error=error.txt +#SBATCH --no-requeue # Env export MPICH_MAX_THREAD_SAFETY=multiple @@ -179,6 +183,7 @@ #SBATCH --hint=multithread #SBATCH --output=output.txt #SBATCH --error=error.txt +#SBATCH --no-requeue # Env export MPICH_MAX_THREAD_SAFETY=multiple @@ -211,6 +216,7 @@ #SBATCH --hint=multithread #SBATCH --output=output.txt #SBATCH --error=error.txt +#SBATCH --no-requeue # Env export MPICH_OPT_THREAD_SYNC=0 # Required to work around MPICH bug @@ -245,6 +251,7 @@ #SBATCH --hint=multithread #SBATCH --output=output.txt #SBATCH --error=error.txt +#SBATCH --no-requeue # Env export MPICH_MAX_THREAD_SAFETY=multiple @@ -283,6 +290,7 @@ #SBATCH --gpus-per-node=8 #SBATCH --output=output.txt #SBATCH --error=error.txt +#SBATCH --no-requeue # Env export MPICH_MAX_THREAD_SAFETY=multiple diff --git a/src/eigensolver/bt_band_to_tridiag/gpu.cpp b/src/eigensolver/bt_band_to_tridiag/gpu.cpp index bc806bc7e8..86884b7162 100644 --- a/src/eigensolver/bt_band_to_tridiag/gpu.cpp +++ b/src/eigensolver/bt_band_to_tridiag/gpu.cpp @@ -8,6 +8,8 @@ // SPDX-License-Identifier: BSD-3-Clause // +#include + #include namespace dlaf::eigensolver::internal { diff --git a/src/init.cpp b/src/init.cpp index b1985b80a6..0d4366e15e 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -29,15 +29,23 @@ namespace dlaf { std::ostream& operator<<(std::ostream& os, const configuration& cfg) { + // clang-format off os << " num_np_gpu_streams_per_thread = " << 
cfg.num_np_gpu_streams_per_thread << std::endl; os << " num_hp_gpu_streams_per_thread = " << cfg.num_hp_gpu_streams_per_thread << std::endl; + os << " umpire_host_memory_pool_initial_block_bytes = " << cfg.umpire_host_memory_pool_initial_block_bytes << std::endl; + os << " umpire_host_memory_pool_next_block_bytes = " << cfg.umpire_host_memory_pool_next_block_bytes << std::endl; + os << " umpire_host_memory_pool_alignment_bytes = " << cfg.umpire_host_memory_pool_alignment_bytes << std::endl; + os << " umpire_host_memory_pool_coalescing_free_ratio = " << cfg.umpire_host_memory_pool_coalescing_free_ratio << std::endl; + os << " umpire_host_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_host_memory_pool_coalescing_reallocation_ratio << std::endl; + os << " umpire_device_memory_pool_initial_block_bytes = " << cfg.umpire_device_memory_pool_initial_block_bytes << std::endl; + os << " umpire_device_memory_pool_next_block_bytes = " << cfg.umpire_device_memory_pool_next_block_bytes << std::endl; + os << " umpire_device_memory_pool_alignment_bytes = " << cfg.umpire_device_memory_pool_alignment_bytes << std::endl; + os << " umpire_device_memory_pool_coalescing_free_ratio = " << cfg.umpire_device_memory_pool_coalescing_free_ratio << std::endl; + os << " umpire_device_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_device_memory_pool_coalescing_reallocation_ratio << std::endl; os << " num_gpu_blas_handles = " << cfg.num_gpu_blas_handles << std::endl; os << " num_gpu_lapack_handles = " << cfg.num_gpu_lapack_handles << std::endl; - os << " umpire_host_memory_pool_initial_bytes = " << cfg.umpire_host_memory_pool_initial_bytes - << std::endl; - os << " umpire_device_memory_pool_initial_bytes = " << cfg.umpire_device_memory_pool_initial_bytes - << std::endl; os << " mpi_pool = " << cfg.mpi_pool << std::endl; + // clang-format on return os; } @@ -58,7 +66,10 @@ struct Init { template <> struct Init { static void initialize(const configuration& cfg) { - 
memory::internal::initializeUmpireHostAllocator(cfg.umpire_host_memory_pool_initial_bytes); + memory::internal::initializeUmpireHostAllocator( + cfg.umpire_host_memory_pool_initial_block_bytes, cfg.umpire_host_memory_pool_next_block_bytes, + cfg.umpire_host_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio, + cfg.umpire_host_memory_pool_coalescing_reallocation_ratio); } static void finalize() { @@ -106,7 +117,11 @@ template <> struct Init { static void initialize(const configuration& cfg) { const int device = 0; - memory::internal::initializeUmpireDeviceAllocator(cfg.umpire_device_memory_pool_initial_bytes); + memory::internal::initializeUmpireDeviceAllocator( + cfg.umpire_device_memory_pool_initial_block_bytes, + cfg.umpire_device_memory_pool_next_block_bytes, cfg.umpire_device_memory_pool_alignment_bytes, + cfg.umpire_device_memory_pool_coalescing_free_ratio, + cfg.umpire_device_memory_pool_coalescing_reallocation_ratio); initializeGpuPool(device, cfg.num_np_gpu_streams_per_thread, cfg.num_hp_gpu_streams_per_thread, cfg.num_gpu_blas_handles, cfg.num_gpu_lapack_handles); pika::cuda::experimental::detail::register_polling(pika::resource::get_thread_pool("default")); @@ -140,6 +155,13 @@ struct parseFromString { } }; +template <> +struct parseFromString { + static std::optional call(const std::string& var) { + return std::stod(var); + } +}; + template <> struct parseFromString { static std::optional call(const std::string& var) { @@ -216,26 +238,27 @@ void warnUnusedConfigurationOption(const pika::program_options::variables_map& v } void updateConfiguration(const pika::program_options::variables_map& vm, configuration& cfg) { + // clang-format off updateConfigurationValue(vm, cfg.print_config, "PRINT_CONFIG", "print-config"); - updateConfigurationValue(vm, cfg.num_np_gpu_streams_per_thread, "NUM_NP_GPU_STREAMS_PER_THREAD", - "num-np-gpu-streams-per-thread"); - updateConfigurationValue(vm, cfg.num_hp_gpu_streams_per_thread, 
"NUM_HP_GPU_STREAMS_PER_THREAD", - "num-hp-gpu-streams-per-thread"); + updateConfigurationValue(vm, cfg.num_np_gpu_streams_per_thread, "NUM_NP_GPU_STREAMS_PER_THREAD", "num-np-gpu-streams-per-thread"); + updateConfigurationValue(vm, cfg.num_hp_gpu_streams_per_thread, "NUM_HP_GPU_STREAMS_PER_THREAD", "num-hp-gpu-streams-per-thread"); + updateConfigurationValue(vm, cfg.umpire_host_memory_pool_initial_block_bytes, "UMPIRE_HOST_MEMORY_POOL_INITIAL_BLOCK_BYTES", "umpire-host-memory-pool-initial-block-bytes"); + updateConfigurationValue(vm, cfg.umpire_host_memory_pool_next_block_bytes, "UMPIRE_HOST_MEMORY_POOL_NEXT_BLOCK_BYTES", "umpire-host-memory-pool-next-block-bytes"); + updateConfigurationValue(vm, cfg.umpire_host_memory_pool_alignment_bytes, "UMPIRE_HOST_MEMORY_POOL_ALIGNMENT_BYTES", "umpire-host-memory-pool-alignment-bytes"); + updateConfigurationValue(vm, cfg.umpire_host_memory_pool_coalescing_free_ratio, "UMPIRE_HOST_MEMORY_POOL_COALESCING_FREE_RATIO", "umpire-host-memory-pool-coalescing-free-ratio"); + updateConfigurationValue(vm, cfg.umpire_host_memory_pool_coalescing_reallocation_ratio, "UMPIRE_HOST_MEMORY_POOL_COALESCING_REALLOCATION_RATIO", "umpire-host-memory-pool-coalescing-reallocation-ratio"); + updateConfigurationValue(vm, cfg.umpire_device_memory_pool_initial_block_bytes, "UMPIRE_DEVICE_MEMORY_POOL_INITIAL_BLOCK_BYTES", "umpire-device-memory-pool-initial-block-bytes"); + updateConfigurationValue(vm, cfg.umpire_device_memory_pool_next_block_bytes, "UMPIRE_DEVICE_MEMORY_POOL_NEXT_BLOCK_BYTES", "umpire-device-memory-pool-next-block-bytes"); + updateConfigurationValue(vm, cfg.umpire_device_memory_pool_alignment_bytes, "UMPIRE_DEVICE_MEMORY_POOL_ALIGNMENT_BYTES", "umpire-device-memory-pool-alignment-bytes"); + updateConfigurationValue(vm, cfg.umpire_device_memory_pool_coalescing_free_ratio, "UMPIRE_DEVICE_MEMORY_POOL_COALESCING_FREE_RATIO", "umpire-device-memory-pool-coalescing-free-ratio"); + updateConfigurationValue(vm, 
cfg.umpire_device_memory_pool_coalescing_reallocation_ratio, "UMPIRE_DEVICE_MEMORY_POOL_COALESCING_REALLOCATION_RATIO", "umpire-device-memory-pool-coalescing-reallocation-ratio"); updateConfigurationValue(vm, cfg.num_gpu_blas_handles, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles"); - updateConfigurationValue(vm, cfg.num_gpu_lapack_handles, "NUM_GPU_LAPACK_HANDLES", - "num-gpu-lapack-handles"); + updateConfigurationValue(vm, cfg.num_gpu_lapack_handles, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles"); #if PIKA_VERSION_FULL < 0x001D00 // < 0.29.0 - warnUnusedConfigurationOption(vm, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles", - "only supported with pika 0.29.0 or newer"); - warnUnusedConfigurationOption(vm, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles", - "only supported with pika 0.29.0 or newer"); + warnUnusedConfigurationOption(vm, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles", "only supported with pika 0.29.0 or newer"); + warnUnusedConfigurationOption(vm, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles", "only supported with pika 0.29.0 or newer"); #endif - updateConfigurationValue(vm, cfg.umpire_host_memory_pool_initial_bytes, - "UMPIRE_HOST_MEMORY_POOL_INITIAL_BYTES", - "umpire-host-memory-pool-initial-bytes"); - updateConfigurationValue(vm, cfg.umpire_device_memory_pool_initial_bytes, - "UMPIRE_DEVICE_MEMORY_POOL_INITIAL_BYTES", - "umpire-device-memory-pool-initial-bytes"); + // clang-format on cfg.mpi_pool = (pika::resource::pool_exists("mpi")) ? "mpi" : "default"; // Warn if not using MPI pool without --dlaf:no-mpi-pool @@ -257,44 +280,28 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu // NOTE: Environment variables should omit the DLAF_ prefix and command line options the dlaf: prefix. // These are added automatically by updateConfigurationValue. 
auto& param = getTuneParameters(); - updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS", - "red2band-panel-nworkers"); - - updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US", - "red2band-barrier-busy-wait-us"); - - updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND", - "eigensolver-min-band"); - - updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base, - "BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base"); - - updateConfigurationValue(vm, param.debug_dump_cholesky_factorization_data, - "DEBUG_DUMP_CHOLESKY_FACTORIZATION_DATA", ""); - updateConfigurationValue(vm, param.debug_dump_generalized_eigensolver_data, - "DEBUG_DUMP_GENERALIZED_EIGENSOLVER_DATA", ""); - updateConfigurationValue(vm, param.debug_dump_generalized_to_standard_data, - "DEBUG_DUMP_GENERALIZED_TO_STANDARD_DATA", ""); + // clang-format off + updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS", "red2band-panel-nworkers"); + updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US", "red2band-barrier-busy-wait-us"); + updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND", "eigensolver-min-band"); + updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base, "BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base"); + + updateConfigurationValue(vm, param.debug_dump_cholesky_factorization_data, "DEBUG_DUMP_CHOLESKY_FACTORIZATION_DATA", ""); + updateConfigurationValue(vm, param.debug_dump_generalized_eigensolver_data, "DEBUG_DUMP_GENERALIZED_EIGENSOLVER_DATA", ""); + updateConfigurationValue(vm, param.debug_dump_generalized_to_standard_data, "DEBUG_DUMP_GENERALIZED_TO_STANDARD_DATA", ""); updateConfigurationValue(vm, param.debug_dump_eigensolver_data, "DEBUG_DUMP_EIGENSOLVER_DATA", ""); - updateConfigurationValue(vm, 
param.debug_dump_reduction_to_band_data, - "DEBUG_DUMP_REDUCTION_TO_BAND_DATA", ""); - updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data, - "DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", ""); - updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA", - ""); + updateConfigurationValue(vm, param.debug_dump_reduction_to_band_data, "DEBUG_DUMP_REDUCTION_TO_BAND_DATA", ""); + updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data, "DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", ""); + updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA", ""); - updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS", - "tridiag-rank1-nworkers"); + updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS", "tridiag-rank1-nworkers"); - updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us, - "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us"); + updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us, "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us"); - updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size, - "BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE", - "bt-band-to-tridiag-hh-apply-group-size"); + updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size, "BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE", "bt-band-to-tridiag-hh-apply-group-size"); - updateConfigurationValue(vm, param.communicator_grid_num_pipelines, "COMMUNICATOR_GRID_NUM_PIPELINES", - "communicator-grid-num-pipelines"); + updateConfigurationValue(vm, param.communicator_grid_num_pipelines, "COMMUNICATOR_GRID_NUM_PIPELINES", "communicator-grid-num-pipelines"); + // clang-format on } configuration& getConfiguration() { @@ -306,49 +313,35 @@ configuration& getConfiguration() { pika::program_options::options_description getOptionsDescription() { 
pika::program_options::options_description desc("DLA-Future options"); + // clang-format off desc.add_options()("dlaf:help", "Print help message"); desc.add_options()("dlaf:print-config", "Print the DLA-Future configuration"); - desc.add_options()("dlaf:num-np-gpu-streams-per-thread", pika::program_options::value(), - "Number of normal priority GPU streams per worker thread"); - desc.add_options()("dlaf:num-hp-gpu-streams-per-thread", pika::program_options::value(), - "Number of high priority GPU streams per worker thread"); - desc.add_options()("dlaf:num-gpu-blas-handles", pika::program_options::value(), - "Number of GPU BLAS (cuBLAS/rocBLAS) handles"); - desc.add_options()("dlaf:num-gpu-lapack-handles", pika::program_options::value(), - "Number of GPU LAPACK (cuSOLVER/rocSOLVER) handles"); - desc.add_options()("dlaf:umpire-host-memory-pool-initial-bytes", - pika::program_options::value(), - "Number of bytes to preallocate for pinned host memory pool"); - desc.add_options()("dlaf:umpire-device-memory-pool-initial-bytes", - pika::program_options::value(), - "Number of bytes to preallocate for device memory pool"); + desc.add_options()("dlaf:num-np-gpu-streams-per-thread", pika::program_options::value(), "Number of normal priority GPU streams per worker thread"); + desc.add_options()("dlaf:num-hp-gpu-streams-per-thread", pika::program_options::value(), "Number of high priority GPU streams per worker thread"); + desc.add_options()("dlaf:umpire-host-memory-pool-initial-block-bytes", pika::program_options::value(), "Number of bytes to preallocate for pinned host memory pool"); + desc.add_options()("dlaf:umpire-host-memory-pool-next-block-bytes", pika::program_options::value(), "Number of bytes to allocate in blocks after the first block for pinned host memory pool"); + desc.add_options()("dlaf:umpire-host-memory-pool-alignment-bytes", pika::program_options::value(), "Alignment of allocations in bytes in pinned host memory pool"); + 
desc.add_options()("dlaf:umpire-host-memory-pool-coalescing-free-ratio", pika::program_options::value(), "Required ratio of free memory in pinned host memory pool before performing coalescing of free blocks"); + desc.add_options()("dlaf:umpire-host-memory-pool-coalescing-reallocation-ratio", pika::program_options::value(), "Ratio of current used memory in pinned host memory pool to use for reallocation of new blocks when coalescing free blocks"); + desc.add_options()("dlaf:umpire-device-memory-pool-initial-block-bytes", pika::program_options::value(), "Number of bytes to preallocate for device memory pool"); + desc.add_options()("dlaf:umpire-device-memory-pool-next-block-bytes", pika::program_options::value(), "Number of bytes to allocate in blocks after the first block for device memory pool"); + desc.add_options()("dlaf:umpire-device-memory-pool-alignment-bytes", pika::program_options::value(), "Alignment of allocations in bytes in device memory pool"); + desc.add_options()("dlaf:umpire-device-memory-pool-coalescing-free-ratio", pika::program_options::value(), "Required ratio of free memory in device memory pool before performing coalescing of free blocks"); + desc.add_options()("dlaf:umpire-device-memory-pool-coalescing-reallocation-ratio", pika::program_options::value(), "Ratio of current used memory in device memory pool to use for reallocation of new blocks when coalescing free blocks"); + desc.add_options()("dlaf:num-gpu-blas-handles", pika::program_options::value(), "Number of GPU BLAS (cuBLAS/rocBLAS) handles"); + desc.add_options()("dlaf:num-gpu-lapack-handles", pika::program_options::value(), "Number of GPU LAPACK (cuSOLVER/rocSOLVER) handles"); desc.add_options()("dlaf:no-mpi-pool", pika::program_options::bool_switch(), "Disable the MPI pool."); // Tune parameters command line options - desc.add_options()( - "dlaf:red2band-panel-nworkers", pika::program_options::value(), - "The maximum number of threads to use for computing the panel in the reduction to 
band algorithm."); - desc.add_options()( - "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), - "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); - desc.add_options()( - "dlaf:eigensolver-min-band", pika::program_options::value(), - "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); - desc.add_options()( - "dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), - "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. (The input matrix is distributed with a {nb x nb} block size.)"); - desc.add_options()( - "dlaf:tridiag-rank1-nworkers", pika::program_options::value(), - "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); - desc.add_options()( - "dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), - "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm."); - desc.add_options()( - "dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), - "The application of the HH reflector is splitted in smaller applications of group size reflectors."); - desc.add_options()( - "dlaf:communicator-grid-num-pipelines", pika::program_options::value(), - "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid."); + desc.add_options()( "dlaf:red2band-panel-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); + desc.add_options()( "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); + desc.add_options()( "dlaf:eigensolver-min-band", pika::program_options::value(), "The 
minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); + desc.add_options()( "dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. (The input matrix is distributed with a {nb x nb} block size.)"); + desc.add_options()( "dlaf:tridiag-rank1-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); + desc.add_options()( "dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm."); + desc.add_options()( "dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), "The application of the HH reflector is splitted in smaller applications of group size reflectors."); + desc.add_options()( "dlaf:communicator-grid-num-pipelines", pika::program_options::value(), "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid."); + // clang-format on return desc; } diff --git a/src/lapack/gpu/add.cu b/src/lapack/gpu/add.cu index 9e9f3e97e7..4ef3329b64 100644 --- a/src/lapack/gpu/add.cu +++ b/src/lapack/gpu/add.cu @@ -32,6 +32,20 @@ __device__ inline void addAlpha(const T& alpha, const T& a, T& b) { b = b + alpha * a; } +#ifdef DLAF_WITH_HIP +template <> +__device__ inline void addAlpha(const hipFloatComplex& alpha, const hipFloatComplex& a, + hipFloatComplex& b) { + b = b + hipCmulf(alpha, a); +} + +template <> +__device__ inline void addAlpha(const hipDoubleComplex& alpha, + const hipDoubleComplex& a, hipDoubleComplex& b) { + b = b + hipCmul(alpha, a); +} +#endif + template __device__ inline void sum(const T& /*alpha*/, const T& a, T& b) { b = b + a; diff --git a/src/memory/memory_chunk.cpp 
b/src/memory/memory_chunk.cpp index a2b0c33885..9880c1a036 100644 --- a/src/memory/memory_chunk.cpp +++ b/src/memory/memory_chunk.cpp @@ -11,9 +11,11 @@ #include #include +#include #include #include +#include #include namespace dlaf { @@ -36,7 +38,46 @@ umpire::Allocator& getUmpireHostAllocator() { } #endif -void initializeUmpireHostAllocator(std::size_t initial_bytes) { +using PoolType = umpire::strategy::QuickPool; +using CoalesceHeuristicType = umpire::strategy::PoolCoalesceHeuristic; + +#ifdef DLAF_WITH_GPU +// This is a modified version of the "percent_releasable" coalescing heuristic +// from Umpire. This version allows choosing what ratio of the actual size to +// reallocate when coalescing. +// +// A free ratio of 1.0 means that the pool will be coalesced only when all +// blocks are unused. A free ratio of 0.5 means that the pool will be coalesced +// when at least 50% of the pool's memory is unused. A ratio of 0.0 means that +// the pool will be coalesced as soon as any two free blocks are available. A +// ratio of more than 1.0 will make the pool never coalesce. +// +// A reallocation ratio of 1.0 simply coalesces all the free memory into a new +// block. A ratio of 0.5 will attempt to shrink the pool to half its previous +// size. A ratio of 1.5 will allocate 50% more than the previous pool size. +// +// A single free block is never "coalesced" to keep things simple. In theory a +// single block could be shrunk or grown to match the reallocation ratio but +// this can lead to strange reallocations, so we simply avoid that case. Two or +// more blocks are always coalesced to one block, so no reallocation will +// happen immediately after coalescing two or more blocks. 
+static CoalesceHeuristicType get_coalesce_heuristic(double coalesce_free_ratio, + double coalesce_reallocation_ratio) { + return [=](const PoolType& pool) { + std::size_t threshold = static_cast(coalesce_free_ratio * pool.getActualSize()); + if (pool.getReleasableBlocks() >= 2 && pool.getReleasableSize() >= threshold) { + return static_cast(coalesce_reallocation_ratio * pool.getActualSize()); + } + else { + return static_cast(0); + } + }; +} +#endif + +void initializeUmpireHostAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes, + std::size_t alignment_bytes, double coalesce_free_ratio, + double coalesce_reallocation_ratio) { #ifdef DLAF_WITH_GPU static bool initialized = false; @@ -45,26 +86,29 @@ void initializeUmpireHostAllocator(std::size_t initial_bytes) { if (!initialized) { auto host_allocator = umpire::ResourceManager::getInstance().getAllocator("PINNED"); auto pooled_host_allocator = - umpire::ResourceManager::getInstance().makeAllocator("PINNED_pool", - host_allocator, - initial_bytes); + umpire::ResourceManager::getInstance().makeAllocator( + "DLAF_PINNED_pool", host_allocator, initial_block_bytes, next_block_bytes, alignment_bytes, + get_coalesce_heuristic(coalesce_free_ratio, coalesce_reallocation_ratio)); auto thread_safe_pooled_host_allocator = umpire::ResourceManager::getInstance().makeAllocator( - "PINNED_thread_safe_pool", pooled_host_allocator); + "DLAF_PINNED_thread_safe_pool", pooled_host_allocator); memory::internal::getUmpireHostAllocator() = thread_safe_pooled_host_allocator; initialized = true; } #else - (void) initial_bytes; + dlaf::internal::silenceUnusedWarningFor(initial_block_bytes, next_block_bytes, alignment_bytes, + coalesce_free_ratio, coalesce_reallocation_ratio); #endif } void finalizeUmpireHostAllocator() {} #ifdef DLAF_WITH_GPU -void initializeUmpireDeviceAllocator(std::size_t initial_bytes) { +void initializeUmpireDeviceAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes, + std::size_t 
alignment_bytes, double coalesce_free_ratio, + double coalesce_reallocation_ratio) { static bool initialized = false; // Umpire pools cannot be released, so we keep the pools around even when @@ -73,10 +117,11 @@ void initializeUmpireDeviceAllocator(std::size_t initial_bytes) { auto device_allocator = umpire::ResourceManager::getInstance().getAllocator("DEVICE"); auto pooled_device_allocator = umpire::ResourceManager::getInstance().makeAllocator( - "DEVICE_pool", device_allocator, initial_bytes); + "DLAF_DEVICE_pool", device_allocator, initial_block_bytes, next_block_bytes, alignment_bytes, + get_coalesce_heuristic(coalesce_free_ratio, coalesce_reallocation_ratio)); auto thread_safe_pooled_device_allocator = umpire::ResourceManager::getInstance().makeAllocator( - "DEVICE_thread_safe_pool", pooled_device_allocator); + "DLAF_DEVICE_thread_safe_pool", pooled_device_allocator); memory::internal::getUmpireDeviceAllocator() = thread_safe_pooled_device_allocator; diff --git a/test/unit/c_api/eigensolver/test_eigensolver_c_api.cpp b/test/unit/c_api/eigensolver/test_eigensolver_c_api.cpp index 1ec8dcbfe4..ae86c21023 100644 --- a/test/unit/c_api/eigensolver/test_eigensolver_c_api.cpp +++ b/test/unit/c_api/eigensolver/test_eigensolver_c_api.cpp @@ -77,7 +77,6 @@ void testEigensolver(const blas::Uplo uplo, const SizeType m, const SizeType mb, // Here we need to resume it manually to build the matrices with DLA-Future pika::resume(); - const LocalElementSize size(m, m); const TileElementSize block_size(mb, mb); Matrix reference = [&]() {