diff --git a/ci/common-ci.yml b/ci/common-ci.yml
index 97ae84ae0c..f070765240 100644
--- a/ci/common-ci.yml
+++ b/ci/common-ci.yml
@@ -24,10 +24,9 @@ stages:
     - TAG_REPO=`find $SPACK_DLAF_REPO -type f -exec sha256sum {} \; | sha256sum - | head -c 16`
     - TAG_ENVIRONMENT=`cat $SPACK_ENVIRONMENT $COMMON_SPACK_ENVIRONMENT | sha256sum | head -c 16`
     - TAG=${TAG_IMAGE}-${TAG_APTGET}-${TAG_COMPILER}-MKL${USE_MKL}-${TAG_DOCKERFILE}-${TAG_SPACK}-${TAG_REPO}-${TAG_ENVIRONMENT}
-    - export PERSIST_IMAGE_NAME=$BUILD_IMAGE:$TAG
-    - echo "BUILD_IMAGE=$PERSIST_IMAGE_NAME" > build.env
+    - export PERSIST_IMAGE_NAME=$DEPS_IMAGE:$TAG
+    - echo "DEPS_IMAGE=$PERSIST_IMAGE_NAME" > build.env
     - echo "USE_MKL=$USE_MKL" >> build.env
-    - echo "USE_ROCBLAS=$USE_ROCBLAS" >> build.env
     - echo "USE_CODECOV=$USE_CODECOV" >> build.env
     - 'echo "INFO: Building image $PERSIST_IMAGE_NAME"'
     - 'echo "INFO: Using NUM_CORES_BUILD_DEPS=$NUM_CORES_BUILD_DEPS"'
@@ -35,11 +34,10 @@ stages:
     reports:
       dotenv: build.env
   variables:
-    SPACK_SHA: 0905edf592752742eb4ddab3a528d3aee8f92930
+    SPACK_SHA: develop-2024-10-06
     SPACK_DLAF_REPO: ./spack
     DOCKER_BUILD_ARGS: '[
         "BASE_IMAGE",
-        "BUILDKIT_INLINE_CACHE=1",
         "SPACK_SHA",
         "EXTRA_APTGET",
         "COMPILER",
@@ -57,7 +55,6 @@ stages:
     EXTRA_APTGET: ""
     CXXSTD: 17
     USE_MKL: "OFF"
-    USE_ROCBLAS: "OFF"
     COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
     USE_CODECOV: "false"
 
@@ -69,23 +66,17 @@ stages:
     - 'echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin'
     - 'echo "INFO: Using NUM_CORES_BUILD_DLAF=$NUM_CORES_BUILD_DLAF"'
   after_script:
-    - podman run -v $PWD/ci/ctest_to_gitlab.sh:/ctest_to_gitlab.sh $DEPLOY_IMAGE /ctest_to_gitlab.sh "$DEPLOY_IMAGE" "$USE_CODECOV" "$THREADS_PER_NODE" "$SLURM_CONSTRAINT" > pipeline.yml
+    - podman run -v $PWD/ci/ctest_to_gitlab.sh:/ctest_to_gitlab.sh $DLAF_IMAGE /ctest_to_gitlab.sh "$DLAF_IMAGE" "$USE_CODECOV" "$THREADS_PER_NODE" "$SLURM_CONSTRAINT" > pipeline.yml
   variables:
-    PERSIST_IMAGE_NAME: $DEPLOY_IMAGE
+    PERSIST_IMAGE_NAME: $DLAF_IMAGE
     DOCKER_BUILD_ARGS: '[
-        "BUILD_IMAGE",
-        "DEPLOY_BASE_IMAGE",
-        "EXTRA_APTGET_DEPLOY",
+        "DEPS_IMAGE",
         "PIP_OPTS",
-        "USE_MKL",
-        "USE_ROCBLAS",
         "NUM_PROCS=$NUM_CORES_BUILD_DLAF"
       ]'
     # default configuration variables
     # can be overwritten in the configuration as needed
     DOCKERFILE: ci/docker/deploy.Dockerfile
-    DEPLOY_BASE_IMAGE: docker.io/ubuntu:24.04
-    EXTRA_APTGET_DEPLOY: ""
     PIP_OPTS: ""
   artifacts:
     paths:
diff --git a/ci/cpu/asan_ubsan_lsan.yml b/ci/cpu/asan_ubsan_lsan.yml
index f5cb1224fd..bcc987399a 100644
--- a/ci/cpu/asan_ubsan_lsan.yml
+++ b/ci/cpu/asan_ubsan_lsan.yml
@@ -6,9 +6,9 @@ cpu asan ubsan lsan deps:
   variables:
     EXTRA_APTGET: "clang-18 libclang-rt-18-dev libomp-18-dev"
     COMPILER: clang@18
-    USE_MKL: "ON"
     SPACK_ENVIRONMENT: ci/docker/asan-ubsan-lsan.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/build
+    USE_MKL: "ON"
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/deps
 
 cpu asan ubsan lsan build:
   extends:
@@ -17,9 +17,7 @@ cpu asan ubsan lsan build:
   needs:
     - cpu asan ubsan lsan deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/deploy:$CI_COMMIT_SHA
-    # For symbolizing stacktraces with llvm-symbolizer
-    EXTRA_APTGET_DEPLOY: "llvm-18"
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/dlaf:$CI_COMMIT_SHA
 
 cpu asan ubsan lsan test:
   extends: .run_common
diff --git a/ci/cpu/clang15_release.yml b/ci/cpu/clang15_release.yml
index 233f8ab185..36231c2979 100644
--- a/ci/cpu/clang15_release.yml
+++ b/ci/cpu/clang15_release.yml
@@ -6,9 +6,9 @@ cpu clang15 release deps:
   variables:
     EXTRA_APTGET: "clang-15"
     COMPILER: clang@15
-    USE_MKL: "ON"
     SPACK_ENVIRONMENT: ci/docker/release-cpu-serial.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/build
+    USE_MKL: "ON"
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/deps
 
 cpu clang15 release build:
   extends:
@@ -17,7 +17,7 @@ cpu clang15 release build:
   needs:
     - cpu clang15 release deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/dlaf:$CI_COMMIT_SHA
 
 cpu clang15 release test:
   extends: .run_common
diff --git a/ci/cpu/clang15_release_cxx20.yml b/ci/cpu/clang15_release_cxx20.yml
index 2875e7a0c7..d045898e62 100644
--- a/ci/cpu/clang15_release_cxx20.yml
+++ b/ci/cpu/clang15_release_cxx20.yml
@@ -9,7 +9,7 @@ cpu clang15 cxx20 release deps:
     CXXSTD: 20
     SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml
     USE_MKL: "ON"
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-20-release/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-20-release/deps
 
 cpu clang15 cxx20 release build:
   extends:
@@ -18,7 +18,7 @@ cpu clang15 cxx20 release build:
   needs:
     - cpu clang15 cxx20 release deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-cxx20-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-cxx20-release/dlaf:$CI_COMMIT_SHA
 
 cpu clang15 cxx20 release test:
   extends: .run_common
diff --git a/ci/cpu/clang15_release_stdexec.yml b/ci/cpu/clang15_release_stdexec.yml
index da096bf204..bf1d8c73d1 100644
--- a/ci/cpu/clang15_release_stdexec.yml
+++ b/ci/cpu/clang15_release_stdexec.yml
@@ -7,9 +7,9 @@ cpu clang15 stdexec release deps:
     EXTRA_APTGET: "clang-15 libomp-15-dev"
     COMPILER: clang@15
     CXXSTD: 20
-    USE_MKL: "ON"
     SPACK_ENVIRONMENT: ci/docker/release-cpu-stdexec.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/build
+    USE_MKL: "ON"
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/deps
 
 cpu clang15 stdexec release build:
   extends:
@@ -18,7 +18,7 @@ cpu clang15 stdexec release build:
   needs:
     - cpu clang15 stdexec release deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/dlaf:$CI_COMMIT_SHA
 
 cpu clang15 stdexec release test:
   extends: .run_common
diff --git a/ci/cpu/clang16_release.yml b/ci/cpu/clang16_release.yml
index ac650f90b5..af91352e3c 100644
--- a/ci/cpu/clang16_release.yml
+++ b/ci/cpu/clang16_release.yml
@@ -6,9 +6,9 @@ cpu clang16 release deps:
   variables:
     EXTRA_APTGET: "clang-16 libomp-16-dev"
     COMPILER: clang@16
-    USE_MKL: "ON"
     SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/build
+    USE_MKL: "ON"
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/deps
 
 cpu clang16 release build:
   extends:
@@ -17,7 +17,7 @@ cpu clang16 release build:
   needs:
     - cpu clang16 release deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/dlaf:$CI_COMMIT_SHA
 
 cpu clang16 release test:
   extends: .run_common
diff --git a/ci/cpu/clang18_release.yml b/ci/cpu/clang18_release.yml
index 6a01bc5a01..db61babf2b 100644
--- a/ci/cpu/clang18_release.yml
+++ b/ci/cpu/clang18_release.yml
@@ -6,9 +6,9 @@ cpu clang18 release deps:
   variables:
     EXTRA_APTGET: "clang-18 libomp-18-dev"
     COMPILER: clang@18
-    USE_MKL: "ON"
     SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/build
+    USE_MKL: "ON"
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/deps
 
 cpu clang18 release build:
   extends:
@@ -17,7 +17,7 @@ cpu clang18 release build:
   needs:
     - cpu clang18 release deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/dlaf:$CI_COMMIT_SHA
 
 cpu clang18 release test:
   extends: .run_common
diff --git a/ci/cpu/gcc11_debug_stdexec.yml b/ci/cpu/gcc11_debug_stdexec.yml
index 5aedcc04c1..be5df32f0c 100644
--- a/ci/cpu/gcc11_debug_stdexec.yml
+++ b/ci/cpu/gcc11_debug_stdexec.yml
@@ -9,7 +9,7 @@ cpu gcc11 stdexec debug deps:
     CXXSTD: 20
     SPACK_ENVIRONMENT: ci/docker/debug-cpu-stdexec.yaml
     USE_MKL: "ON"
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/deps
 
 cpu gcc11 stdexec debug build:
   extends:
@@ -18,7 +18,7 @@ cpu gcc11 stdexec debug build:
   needs:
     - cpu gcc11 stdexec debug deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/dlaf:$CI_COMMIT_SHA
 
 cpu gcc11 stdexec debug test:
   extends: .run_common
diff --git a/ci/cpu/gcc11_release_stdexec.yml b/ci/cpu/gcc11_release_stdexec.yml
index 0ba3660d10..f74112fa67 100644
--- a/ci/cpu/gcc11_release_stdexec.yml
+++ b/ci/cpu/gcc11_release_stdexec.yml
@@ -7,9 +7,9 @@ cpu gcc11 stdexec release deps:
     EXTRA_APTGET: "gcc-11 g++-11 gfortran-11"
     COMPILER: gcc@11
     CXXSTD: 20
-    USE_MKL: "ON"
     SPACK_ENVIRONMENT: ci/docker/release-cpu-stdexec.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/build
+    USE_MKL: "ON"
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/deps
 
 cpu gcc11 stdexec release build:
   extends:
@@ -18,7 +18,7 @@ cpu gcc11 stdexec release build:
   needs:
     - cpu gcc11 stdexec release deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/dlaf:$CI_COMMIT_SHA
 
 cpu gcc11 stdexec release test:
   extends: .run_common
diff --git a/ci/cpu/gcc12_release_cxx20.yml b/ci/cpu/gcc12_release_cxx20.yml
index ced430f279..d1bf81503c 100644
--- a/ci/cpu/gcc12_release_cxx20.yml
+++ b/ci/cpu/gcc12_release_cxx20.yml
@@ -9,7 +9,7 @@ cpu gcc12 cxx20 release deps:
     CXXSTD: 20
     SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml
     USE_MKL: "ON"
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-release/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-release/deps
 
 cpu gcc12 cxx20 release build:
   extends:
@@ -18,7 +18,7 @@ cpu gcc12 cxx20 release build:
   needs:
     - cpu gcc12 cxx20 release deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-cxx20-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-cxx20-release/dlaf:$CI_COMMIT_SHA
 
 cpu gcc12 cxx20 release test:
   extends: .run_common
diff --git a/ci/cpu/gcc13_codecov.yml b/ci/cpu/gcc13_codecov.yml
index 326973e326..a83e9320ed 100644
--- a/ci/cpu/gcc13_codecov.yml
+++ b/ci/cpu/gcc13_codecov.yml
@@ -6,7 +6,7 @@ cpu gcc13 codecov deps:
   variables:
     COMPILER: gcc@13
     SPACK_ENVIRONMENT: ci/docker/debug-cpu.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-codecov/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-codecov/deps
     USE_CODECOV: "true"
 
 cpu gcc13 codecov build:
@@ -17,7 +17,7 @@ cpu gcc13 codecov build:
     - cpu gcc13 codecov deps
   variables:
     DOCKERFILE: ci/docker/codecov.Dockerfile
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-codecov/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-codecov/dlaf:$CI_COMMIT_SHA
     PIP_OPTS: "--break-system-packages"
 
 cpu gcc13 codecov test:
diff --git a/ci/cpu/gcc13_release.yml b/ci/cpu/gcc13_release.yml
index daa0d77951..6f95b9603a 100644
--- a/ci/cpu/gcc13_release.yml
+++ b/ci/cpu/gcc13_release.yml
@@ -5,9 +5,9 @@ cpu gcc13 release deps:
   extends: .build_deps_common
   variables:
     COMPILER: gcc@13
-    USE_MKL: "ON"
     SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-release/build
+    USE_MKL: "ON"
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-release/deps
 
 cpu gcc13 release build:
   extends:
@@ -16,7 +16,7 @@ cpu gcc13 release build:
   needs:
     - cpu gcc13 release deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc13-release/dlaf:$CI_COMMIT_SHA
 
 cpu gcc13 release test:
   extends: .run_common
diff --git a/ci/ctest_to_gitlab.sh b/ci/ctest_to_gitlab.sh
index fb5c6cf423..cedf894f80 100755
--- a/ci/ctest_to_gitlab.sh
+++ b/ci/ctest_to_gitlab.sh
@@ -16,6 +16,7 @@ THREADS_PER_NODE="$3"
 SLURM_CONSTRAINT="$4"
 
 if [ "$USE_CODECOV" = true ]; then
+# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197)
 BASE_TEMPLATE="
 include:
   - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.cscs.yml'
@@ -30,7 +31,7 @@ variables:
   SLURM_EXCLUSIVE: ''
   SLURM_EXACT: ''
   SLURM_CONSTRAINT: $SLURM_CONSTRAINT
-  CRAY_CUDA_MPS: 1
+  CRAY_CUDA_MPS: 0
   MPICH_MAX_THREAD_SAFETY: multiple
 
 {{JOBS}}
@@ -65,6 +66,7 @@ JOB_TEMPLATE="
     paths:
       - codecov-reports/"
 else
+# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197)
 BASE_TEMPLATE="
 include:
   - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.cscs.yml'
@@ -78,7 +80,7 @@ variables:
   SLURM_EXCLUSIVE: ''
   SLURM_EXACT: ''
   SLURM_CONSTRAINT: $SLURM_CONSTRAINT
-  CRAY_CUDA_MPS: 1
+  CRAY_CUDA_MPS: 0
   MPICH_MAX_THREAD_SAFETY: multiple
 
 {{JOBS}}
diff --git a/ci/cuda/gcc11_codecov.yml b/ci/cuda/gcc11_codecov.yml
index cd81d00fb1..31fcea1150 100644
--- a/ci/cuda/gcc11_codecov.yml
+++ b/ci/cuda/gcc11_codecov.yml
@@ -7,7 +7,7 @@ cuda gcc11 codecov deps:
     BASE_IMAGE: docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04
     COMPILER: gcc@11
     SPACK_ENVIRONMENT: ci/docker/debug-cuda.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-codecov/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-codecov/deps
     USE_CODECOV: "true"
 
 cuda gcc11 codecov build:
@@ -18,8 +18,7 @@ cuda gcc11 codecov build:
     - cuda gcc11 codecov deps
   variables:
     DOCKERFILE: ci/docker/codecov.Dockerfile
-    DEPLOY_BASE_IMAGE: docker.io/ubuntu:22.04
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-codecov/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-codecov/dlaf:$CI_COMMIT_SHA
 
 cuda gcc11 codecov test:
   extends: .run_common
diff --git a/ci/cuda/gcc11_debug_scalapack.yml b/ci/cuda/gcc11_debug_scalapack.yml
index 707ec2b51d..98b07a1d03 100644
--- a/ci/cuda/gcc11_debug_scalapack.yml
+++ b/ci/cuda/gcc11_debug_scalapack.yml
@@ -7,7 +7,7 @@ cuda gcc11 debug scalapack deps:
     BASE_IMAGE: docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04
     COMPILER: gcc@11
     SPACK_ENVIRONMENT: ci/docker/debug-cuda-scalapack.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-debug/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-debug/deps
 
 cuda gcc11 debug scalapack build:
   extends:
@@ -16,8 +16,7 @@ cuda gcc11 debug scalapack build:
   needs:
     - cuda gcc11 debug scalapack deps
   variables:
-    DEPLOY_BASE_IMAGE: docker.io/ubuntu:22.04
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-debug/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-debug/dlaf:$CI_COMMIT_SHA
 
 cuda gcc11 debug scalapack test:
   extends: .run_common
diff --git a/ci/cuda/gcc11_release.yml b/ci/cuda/gcc11_release.yml
index 7d47f16d09..40d2b20bf2 100644
--- a/ci/cuda/gcc11_release.yml
+++ b/ci/cuda/gcc11_release.yml
@@ -6,10 +6,9 @@ cuda gcc11 release deps:
   variables:
     BASE_IMAGE: docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04
     COMPILER: gcc@11
-    CXXSTD: 17
     SPACK_ENVIRONMENT: ci/docker/release-cuda.yaml
     USE_MKL: "ON"
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/deps
 
 cuda gcc11 release build:
   extends:
@@ -18,8 +17,7 @@ cuda gcc11 release build:
   needs:
     - cuda gcc11 release deps
   variables:
-    DEPLOY_BASE_IMAGE: docker.io/ubuntu:22.04
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-release/dlaf:$CI_COMMIT_SHA
 
 cuda gcc11 release test:
   extends: .run_common
diff --git a/ci/cuda/gcc11_release_scalapack.yml b/ci/cuda/gcc11_release_scalapack.yml
index ca4cc45d3f..5a668e4439 100644
--- a/ci/cuda/gcc11_release_scalapack.yml
+++ b/ci/cuda/gcc11_release_scalapack.yml
@@ -8,7 +8,7 @@ cuda gcc11 release scalapack deps:
     COMPILER: gcc@11
     SPACK_ENVIRONMENT: ci/docker/release-cuda-scalapack.yaml
     USE_MKL: "ON"
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-release/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-release/deps
 
 cuda gcc11 release scalapack build:
   extends:
@@ -17,8 +17,7 @@ cuda gcc11 release scalapack build:
   needs:
     - cuda gcc11 release scalapack deps
   variables:
-    DEPLOY_BASE_IMAGE: docker.io/ubuntu:22.04
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc11-scalapack-release/dlaf:$CI_COMMIT_SHA
 
 cuda gcc11 release scalapack test:
   extends: .run_common
diff --git a/ci/cuda/gcc13_release_stdexec.yml b/ci/cuda/gcc13_release_stdexec.yml
index 24dccbdf1a..e3a3584aaa 100644
--- a/ci/cuda/gcc13_release_stdexec.yml
+++ b/ci/cuda/gcc13_release_stdexec.yml
@@ -9,7 +9,7 @@ cuda gcc13 stdexec release deps:
     CXXSTD: 20
     SPACK_ENVIRONMENT: ci/docker/release-cuda-stdexec.yaml
     USE_MKL: "ON"
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release-stdexec/deps
 
 cuda gcc13 stdexec release build:
   extends:
@@ -18,4 +18,4 @@ cuda gcc13 stdexec release build:
   needs:
     - cuda gcc13 stdexec release deps
   variables:
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/cuda-gcc13-release/dlaf:$CI_COMMIT_SHA
diff --git a/ci/docker/build.Dockerfile b/ci/docker/build.Dockerfile
index fadb655dd0..d1f36e8be5 100644
--- a/ci/docker/build.Dockerfile
+++ b/ci/docker/build.Dockerfile
@@ -6,28 +6,33 @@ FROM $BASE_IMAGE
 LABEL com.jfrog.artifactory.retention.maxDays="21"
 
 ENV DEBIAN_FRONTEND=noninteractive \
-    PATH="$PATH:/opt/spack/bin:/opt/libtree" \
+    PATH="$PATH:/opt/spack/bin" \
     SPACK_COLOR=always
+
+# Overwrite the entrypoint, as NVIDIA images set a script that clogs the output.
+ENTRYPOINT []
+CMD [ "/bin/bash" ]
 SHELL ["/bin/bash", "-c"]
 
 ARG EXTRA_APTGET
+# python is needed for spack and fastcov
+# codecov upload needs curl + ca-certificates
+# glibc-tools is needed for libSegFault on ubuntu >= 22.04
+# jq, strace are needed for check-threads
+# tzdata is needed to print correct time
 RUN apt-get -yqq update && \
     apt-get -yqq install --no-install-recommends \
     software-properties-common \
     build-essential gfortran \
     autoconf automake libssl-dev ninja-build pkg-config \
-    ${EXTRA_APTGET} \
-    gawk \
+    gawk git tar \
+    wget curl ca-certificates gpg-agent tzdata \
     python3 python3-setuptools \
-    git tar wget curl ca-certificates gpg-agent jq tzdata \
-    patchelf unzip file gnupg2 libncurses-dev && \
+    glibc-tools jq strace \
+    patchelf unzip file gnupg2 libncurses-dev \
+    ${EXTRA_APTGET} && \
     rm -rf /var/lib/apt/lists/*
 
-# Install libtree for packaging
-RUN mkdir -p /opt/libtree && \
-    curl -Lfso /opt/libtree/libtree https://github.com/haampie/libtree/releases/download/v2.0.0/libtree_x86_64 && \
-    chmod +x /opt/libtree/libtree
-
 # Install MKL and remove static libs (to keep image smaller)
 ARG USE_MKL=ON
 ARG MKL_VERSION=2024.0
@@ -84,11 +89,13 @@ RUN spack repo add --scope site /user_repo
 # e.g. --build-arg SPACK_ENVIRONMENT=ci/spack/my-env.yaml
 ARG SPACK_ENVIRONMENT
 ARG COMMON_SPACK_ENVIRONMENT
+ARG ENV_VIEW=/view
+
 # Build dependencies
 # 1. Create a spack environment named `ci` from the input spack.yaml file
 COPY $SPACK_ENVIRONMENT /spack_environment/spack.yaml
 COPY $COMMON_SPACK_ENVIRONMENT /spack_environment/
-RUN spack env create --without-view ci /spack_environment/spack.yaml
+RUN spack env create --with-view ${ENV_VIEW} ci /spack_environment/spack.yaml
 # 2. Set the C++ standard
 ARG CXXSTD=17
 RUN spack -e ci config add "packages:dla-future:variants:cxxstd=${CXXSTD}"
@@ -98,3 +105,5 @@ RUN spack -e ci install --jobs ${NUM_PROCS} --fail-fast --only=dependencies
 
 # make ctest executable available.
 RUN ln -s `spack -e ci location -i cmake`/bin/ctest /usr/bin/ctest
+
+RUN echo ${ENV_VIEW}/lib > /etc/ld.so.conf.d/dlaf.conf && ldconfig
diff --git a/ci/docker/codecov.Dockerfile b/ci/docker/codecov.Dockerfile
index 3673b680ac..a838f943cc 100644
--- a/ci/docker/codecov.Dockerfile
+++ b/ci/docker/codecov.Dockerfile
@@ -1,18 +1,17 @@
-ARG BUILD_IMAGE
-ARG DEPLOY_BASE_IMAGE
+ARG DEPS_IMAGE
+FROM $DEPS_IMAGE
 
-# This is the folder where the project is built
+LABEL com.jfrog.artifactory.retention.maxDays="7"
+LABEL com.jfrog.artifactory.retention.maxCount="10"
+
+# Directory where the project is built
 ARG BUILD=/DLA-Future-build
-# This is where we copy the sources to
+# Directory where the miniapps are built as a separate project
+ARG BUILD_MINIAPP=/DLA-Future-miniapp-build
+# Directory where the sources are copied to
 ARG SOURCE=/DLA-Future
-# Where a bunch of shared libs live
-ARG DEPLOY=/root/DLA-Future.bundle
-
-FROM $BUILD_IMAGE as builder
-
-ARG BUILD
-ARG SOURCE
-ARG DEPLOY
+# Directory for some helper executables
+ARG BIN=/DLA-Future-build/bin
 
 # Build DLA-Future
 COPY . ${SOURCE}
@@ -30,81 +29,24 @@ RUN spack repo rm --scope site dlaf && \
     spack -e ci concretize -f && \
     spack -e ci --config "config:flags:keep_werror:all" install --jobs ${NUM_PROCS} --keep-stage --verbose
 
-# Prune and bundle binaries
-RUN mkdir ${BUILD}-tmp && cd ${BUILD} && \
-    export TEST_BINARIES=`PATH=${SOURCE}/ci:$PATH ctest --show-only=json-v1 | jq '.tests | map(.command | .[] | select(contains("check-threads") | not)) | .[]' | tr -d \"` && \
-    echo "Binary sizes:" && \
-    ls -lh ${TEST_BINARIES} && \
-    ls -lh src/lib* && \
-    libtree -d ${DEPLOY} ${TEST_BINARIES} && \
-    rm -rf ${DEPLOY}/usr/bin && \
-    libtree -d ${DEPLOY} $(which ctest gcov addr2line) && \
-    cp -L ${SOURCE}/ci/{mpi-ctest,check-threads,upload_codecov} ${DEPLOY}/usr/bin && \
-    echo "$TEST_BINARIES" | xargs -I{file} find -samefile {file} -exec cp --parents '{}' ${BUILD}-tmp ';' && \
-    find '(' -name CTestTestfile.cmake -o -iname "*.gcno" ')' -exec cp --parents '{}' ${BUILD}-tmp ';' && \
-    rm -rf ${BUILD} && \
-    mv ${BUILD}-tmp ${BUILD} && \
-    rm -rf ${SOURCE}/.git
-
-# Deploy Extra RocBlas files separately.
-ARG USE_ROCBLAS=OFF
-RUN mkdir ${DEPLOY}/usr/lib/rocblas; \
-    if [ "$USE_ROCBLAS" = "ON" ]; then \
-      cp -r `spack -e ci location -i rocblas`/lib/rocblas/library ${DEPLOY}/usr/lib/rocblas ; \
-    fi
+RUN mkdir -p ${BIN} && cp -L ${SOURCE}/ci/{mpi-ctest,check-threads,upload_codecov} ${BIN}
 
-# Multistage build, this is the final small image
-FROM $DEPLOY_BASE_IMAGE
-
-# set jfrog autoclean policy
-LABEL com.jfrog.artifactory.retention.maxDays="7"
-LABEL com.jfrog.artifactory.retention.maxCount="10"
-
-ENV DEBIAN_FRONTEND noninteractive
-
-ARG BUILD
-ARG SOURCE
-ARG DEPLOY
-
-ARG EXTRA_APTGET_DEPLOY
 ARG PIP_OPTS
-# python is needed for fastcov
 # pip is needed only to install fastcov (it is removed with
 #     its dependencies after fastcov installation)
-# codecov upload needs curl + ca-certificates
-# glibc-tools is needed for libSegFault on ubuntu:22.04
-# jq, strace are needed for check-threads
-# tzdata is needed to print correct time
 RUN apt-get update -qq && \
-    apt-get install -qq -y --no-install-recommends \
-      ${EXTRA_APTGET_DEPLOY} \
-      python3 python3-pip \
-      curl \
-      ca-certificates \
-      glibc-tools jq strace \
-      tzdata && \
+    apt-get install -qq -y --no-install-recommends python3-pip && \
     pip install ${PIP_OPTS} fastcov && \
     apt-get autoremove -qq -y python3-pip && \
     apt-get clean
 
-# Copy the executables and the codecov gcno files
-COPY --from=builder ${BUILD} ${BUILD}
-COPY --from=builder ${DEPLOY} ${DEPLOY}
-
-# Copy the source files into the image as well.
-# This is necessary for code coverage of MPI tests: gcov has to have write temporary
-# data into the source folder. In distributed applications we can therefore not mount
-# the git repo folder at runtime in the container, because it is shared and would
-# cause race conditions in gcov.
-COPY --from=builder ${SOURCE} ${SOURCE}
-
 RUN cd /usr/local/bin && \
   curl -Ls https://codecov.io/bash > codecov.sh && \
   echo "f0e7a3ee76a787c37aa400cf44aee0c9b473b2fa79092edfb36d1faa853bbe23 codecov.sh" | sha256sum --check --quiet && \
   chmod +x codecov.sh
 
 # Make it easy to call our binaries.
-ENV PATH="${DEPLOY}/usr/bin:$PATH"
+ENV PATH="${BIN}:$PATH"
 ENV NVIDIA_VISIBLE_DEVICES all
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2"
@@ -115,6 +57,4 @@ ENV ENABLE_COVERAGE="YES"
 # Automatically print stacktraces on segfault
 ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so
 
-RUN echo "${DEPLOY}/usr/lib/" > /etc/ld.so.conf.d/dlaf.conf && ldconfig
-
 WORKDIR ${BUILD}
diff --git a/ci/docker/debug-cpu-stdexec.yaml b/ci/docker/debug-cpu-stdexec.yaml
index 03578ac143..f225d71995 100644
--- a/ci/docker/debug-cpu-stdexec.yaml
+++ b/ci/docker/debug-cpu-stdexec.yaml
@@ -29,7 +29,3 @@ spack:
         - '+stdexec'
         - 'build_type=Debug'
         - 'malloc=system'
-    stdexec:
-      require:
-        - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main'
-        - 'build_type=Debug'
diff --git a/ci/docker/deploy.Dockerfile b/ci/docker/deploy.Dockerfile
index a500dd918e..42a11b2264 100644
--- a/ci/docker/deploy.Dockerfile
+++ b/ci/docker/deploy.Dockerfile
@@ -1,18 +1,17 @@
-ARG BUILD_IMAGE
-ARG DEPLOY_BASE_IMAGE
+ARG DEPS_IMAGE
+FROM $DEPS_IMAGE
 
-# This is the folder where the project is built
+LABEL com.jfrog.artifactory.retention.maxDays="7"
+LABEL com.jfrog.artifactory.retention.maxCount="10"
+
+# Directory where the project is built
 ARG BUILD=/DLA-Future-build
-# This is where we copy the sources to
+# Directory where the miniapps are built as a separate project
+ARG BUILD_MINIAPP=/DLA-Future-miniapp-build
+# Directory where the sources are copied to
 ARG SOURCE=/DLA-Future
-# Where a bunch of shared libs live
-ARG DEPLOY=/root/DLA-Future.bundle
-
-FROM $BUILD_IMAGE as builder
-
-ARG BUILD
-ARG SOURCE
-ARG DEPLOY
+# Directory for some helper executables
+ARG BIN=/DLA-Future-build/bin
 
 # Build DLA-Future
 COPY . ${SOURCE}
@@ -25,87 +24,18 @@ RUN spack repo rm --scope site dlaf && \
     spack repo add ${SOURCE}/spack && \
     spack -e ci develop --no-clone --path ${SOURCE} --build-directory ${BUILD} dla-future@master && \
     spack -e ci concretize -f && \
-    spack -e ci --config "config:flags:keep_werror:all" install --jobs ${NUM_PROCS} --keep-stage --verbose
+    spack -e ci --config "config:flags:keep_werror:all" install --jobs ${NUM_PROCS} --keep-stage --verbose && \
+    find ${BUILD} -type d -name CMakeFiles -prune -exec rm -rf {} +
 
 # Test deployment with miniapps as independent project
-RUN pushd ${SOURCE}/miniapp && \
-    mkdir build-miniapps && cd build-miniapps && \
+RUN mkdir ${BUILD_MINIAPP} && cd ${BUILD_MINIAPP} && \
     spack -e ci build-env dla-future@master -- \
-    bash -c "cmake -DCMAKE_PREFIX_PATH=`spack -e ci location -i dla-future` .. && make -j ${NUM_PROCS}" && \
-    popd
-
-# Prune and bundle binaries
-RUN mkdir ${BUILD}-tmp && cd ${BUILD} && \
-    export TEST_BINARIES=`PATH=${SOURCE}/ci:$PATH ctest --show-only=json-v1 | jq '.tests | map(.command | .[] | select(contains("check-threads") | not)) | .[]' | tr -d \"` && \
-    LIBASAN=$(find /usr/lib -name libclang_rt.asan-x86_64.so) && \
-    if [[ -n "${LIBASAN}" ]]; then export LD_LIBRARY_PATH=$(dirname ${LIBASAN}):${LD_LIBRARY_PATH}; fi && \
-    echo "Binary sizes:" && \
-    ls -lh ${TEST_BINARIES} && \
-    ls -lh src/lib* && \
-    libtree -d ${DEPLOY} ${TEST_BINARIES} && \
-    rm -rf ${DEPLOY}/usr/bin && \
-    libtree -d ${DEPLOY} $(which ctest addr2line) && \
-    cp -L ${SOURCE}/ci/{mpi-ctest,check-threads} ${DEPLOY}/usr/bin && \
-    echo "$TEST_BINARIES" | xargs -I{file} find -samefile {file} -exec cp --parents '{}' ${BUILD}-tmp ';' && \
-    find -name CTestTestfile.cmake -exec cp --parents '{}' ${BUILD}-tmp ';' && \
-    rm -rf ${BUILD} && \
-    mv ${BUILD}-tmp ${BUILD}
-
-# Deploy MKL separately, since it dlopen's some libs
-ARG USE_MKL=ON
-RUN if [ "$USE_MKL" = "ON" ]; then \
-      export MKL_LIB=$(dirname $(find $(spack location -i intel-oneapi-mkl) -name libmkl_core.so)) && \
-      libtree -d ${DEPLOY} \
-      ${MKL_LIB}/libmkl_avx2.so.2 \
-      ${MKL_LIB}/libmkl_avx512.so.2 \
-      ${MKL_LIB}/libmkl_core.so \
-      ${MKL_LIB}/libmkl_def.so.2 \
-      ${MKL_LIB}/libmkl_intel_thread.so \
-      ${MKL_LIB}/libmkl_mc3.so.2 \
-      ${MKL_LIB}/libmkl_sequential.so \
-      ${MKL_LIB}/libmkl_tbb_thread.so \
-      ${MKL_LIB}/libmkl_vml_avx2.so.2 \
-      ${MKL_LIB}/libmkl_vml_avx512.so.2 \
-      ${MKL_LIB}/libmkl_vml_cmpt.so.2 \
-      ${MKL_LIB}/libmkl_vml_def.so.2 \
-      ${MKL_LIB}/libmkl_vml_mc3.so.2 ; \
-    fi
-
-# Deploy Extra RocBlas files separately.
-ARG USE_ROCBLAS=OFF
-RUN mkdir ${DEPLOY}/usr/lib/rocblas; \
-    if [ "$USE_ROCBLAS" = "ON" ]; then \
-      cp -r `spack -e ci location -i rocblas`/lib/rocblas/library ${DEPLOY}/usr/lib/rocblas ; \
-    fi
+    bash -c "cmake -DCMAKE_PREFIX_PATH=`spack -e ci location -i dla-future` ${SOURCE}/miniapp && make -j ${NUM_PROCS}"
 
-# Multistage build, this is the final small image
-FROM $DEPLOY_BASE_IMAGE
-
-# set jfrog autoclean policy
-LABEL com.jfrog.artifactory.retention.maxDays="7"
-LABEL com.jfrog.artifactory.retention.maxCount="10"
-
-ENV DEBIAN_FRONTEND noninteractive
-
-ARG BUILD
-ARG DEPLOY
-
-ARG EXTRA_APTGET_DEPLOY
-# glibc-tools is needed for libSegFault on ubuntu:22.04
-# jq, strace are needed for check-threads
-# tzdata is needed to print correct time
-RUN apt-get update -qq && \
-    apt-get install -qq -y --no-install-recommends \
-      ${EXTRA_APTGET_DEPLOY} \
-      glibc-tools jq strace \
-      tzdata && \
-    rm -rf /var/lib/apt/lists/*
-
-COPY --from=builder ${BUILD} ${BUILD}
-COPY --from=builder ${DEPLOY} ${DEPLOY}
+RUN mkdir -p ${BIN} && cp -L ${SOURCE}/ci/{mpi-ctest,check-threads} ${BIN}
 
 # Make it easy to call our binaries.
-ENV PATH="${DEPLOY}/usr/bin:$PATH"
+ENV PATH="${BIN}:$PATH"
 ENV NVIDIA_VISIBLE_DEVICES all
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2"
@@ -113,6 +43,4 @@ ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2"
 # Automatically print stacktraces on segfault
 ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so
 
-RUN echo "${DEPLOY}/usr/lib/" > /etc/ld.so.conf.d/dlaf.conf && ldconfig
-
 WORKDIR ${BUILD}
diff --git a/ci/docker/release-cpu-stdexec.yaml b/ci/docker/release-cpu-stdexec.yaml
index 412b47530e..4d3bcd6594 100644
--- a/ci/docker/release-cpu-stdexec.yaml
+++ b/ci/docker/release-cpu-stdexec.yaml
@@ -27,6 +27,3 @@ spack:
     pika:
       require:
         - '+stdexec'
-    stdexec:
-      require:
-        - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main'
diff --git a/ci/docker/release-cuda-stdexec.yaml b/ci/docker/release-cuda-stdexec.yaml
index 366ff247ef..f4847574b4 100644
--- a/ci/docker/release-cuda-stdexec.yaml
+++ b/ci/docker/release-cuda-stdexec.yaml
@@ -27,6 +27,3 @@ spack:
     pika:
       require:
         - '+stdexec'
-    stdexec:
-      require:
-        - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main'
diff --git a/ci/docker/release-rocm533-stdexec.yaml b/ci/docker/release-rocm533-stdexec.yaml
index 6008d85780..8713afa6bc 100644
--- a/ci/docker/release-rocm533-stdexec.yaml
+++ b/ci/docker/release-rocm533-stdexec.yaml
@@ -29,9 +29,6 @@ spack:
     pika:
       require:
         - '+stdexec'
-    stdexec:
-      require:
-        - '@git.8bc7c7f06fe39831dea6852407ebe7f6be8fa9fd=main'
     blas:
       require:: openblas
     lapack:
diff --git a/ci/mpi-ctest b/ci/mpi-ctest
index f8ca1b14d8..d753fbcb18 100755
--- a/ci/mpi-ctest
+++ b/ci/mpi-ctest
@@ -9,6 +9,19 @@ fi;
 
 pushd /DLA-Future-build > /dev/null
 
+export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
+export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
+# START_MPS=1 only on local rank 0 (SLURM_LOCALID) and only when the MPS control binary is found
+if command -v nvidia-cuda-mps-control &> /dev/null && [ "${SLURM_LOCALID:-}" = 0 ]; then START_MPS=1; else START_MPS=0; fi
+
+# Workaround on daint to avoid test hanging (See PR #1197)
+# Launch MPS from a single rank per node
+if [ "$START_MPS" -eq 1 ]; then
+    nvidia-cuda-mps-control -d
+fi
+# Wait for MPS to start (every rank waits, not only the one that launched it)
+sleep 5
+
 # Run the tests, only output on the first rank
 if [[ $SLURM_PROCID == "0" ]]; then
     TZ=CET date +"Run started at: %H:%M:%S %z"
@@ -18,6 +31,10 @@ else
     ctest -Q $@
 fi
 
+if [ $START_MPS -eq 1 ]; then
+    echo quit | nvidia-cuda-mps-control
+fi
+
 # Create coverage reports for code run
 if [[ "$ENABLE_COVERAGE" == "YES" ]]; then
     # On daint-mc (XC40) reduce the number of tasks to avoid out-of-memory error
diff --git a/ci/rocm/clang14_release.yml b/ci/rocm/clang14_release.yml
index 1a52ca6321..21636e54d7 100644
--- a/ci/rocm/clang14_release.yml
+++ b/ci/rocm/clang14_release.yml
@@ -9,7 +9,7 @@ rocm clang14 release deps:
     COMPILER: clang@14
     USE_ROCBLAS: "ON"
     SPACK_ENVIRONMENT: ci/docker/release-rocm533.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-release/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-release/deps
 
 rocm clang14 release build:
   extends:
@@ -18,5 +18,4 @@ rocm clang14 release build:
   needs:
     - rocm clang14 release deps
   variables:
-    DEPLOY_BASE_IMAGE: $CSCS_REGISTRY_PATH/rocm-patched:5.3.3
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-release/dlaf:$CI_COMMIT_SHA
diff --git a/ci/rocm/clang14_release_stdexec.yml b/ci/rocm/clang14_release_stdexec.yml
index 6c0407a562..3b55caf93b 100644
--- a/ci/rocm/clang14_release_stdexec.yml
+++ b/ci/rocm/clang14_release_stdexec.yml
@@ -9,7 +9,7 @@ rocm clang14 stdexec release deps:
     COMPILER: clang@14
     USE_ROCBLAS: "ON"
     SPACK_ENVIRONMENT: ci/docker/release-rocm533-stdexec.yaml
-    BUILD_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-stdexec-release/build
+    DEPS_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-stdexec-release/deps
 
 rocm clang14 stdexec release build:
   extends:
@@ -18,5 +18,4 @@ rocm clang14 stdexec release build:
   needs:
     - rocm clang14 stdexec release deps
   variables:
-    DEPLOY_BASE_IMAGE: $CSCS_REGISTRY_PATH/rocm-patched:5.3.3
-    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-stdexec-release/deploy:$CI_COMMIT_SHA
+    DLAF_IMAGE: $CSCS_REGISTRY_PATH/rocm-clang14-stdexec-release/dlaf:$CI_COMMIT_SHA
diff --git a/include/dlaf/eigensolver/band_to_tridiag.h b/include/dlaf/eigensolver/band_to_tridiag.h
index be907da16c..273299d921 100644
--- a/include/dlaf/eigensolver/band_to_tridiag.h
+++ b/include/dlaf/eigensolver/band_to_tridiag.h
@@ -84,7 +84,6 @@ TridiagResult<T, Device::CPU> band_to_tridiagonal(blas::Uplo uplo, SizeType band
   switch (uplo) {
     case blas::Uplo::Lower:
       return BandToTridiag<B, D, T>::call_L(band_size, mat_a);
-      break;
     case blas::Uplo::Upper:
       DLAF_UNIMPLEMENTED(uplo);
       break;
@@ -161,7 +160,6 @@ TridiagResult<T, Device::CPU> band_to_tridiagonal(comm::CommunicatorGrid& grid,
   switch (uplo) {
     case blas::Uplo::Lower:
       return BandToTridiag<backend, device, T>::call_L(grid, band_size, mat_a);
-      break;
     case blas::Uplo::Upper:
       DLAF_UNIMPLEMENTED(uplo);
       break;
diff --git a/include/dlaf/eigensolver/band_to_tridiag/mc.h b/include/dlaf/eigensolver/band_to_tridiag/mc.h
index ca680884ac..ede258197d 100644
--- a/include/dlaf/eigensolver/band_to_tridiag/mc.h
+++ b/include/dlaf/eigensolver/band_to_tridiag/mc.h
@@ -28,6 +28,7 @@
 #include <dlaf/common/vector.h>
 #include <dlaf/communication/communicator.h>
 #include <dlaf/communication/communicator_grid.h>
+#include <dlaf/communication/datatypes.h>
 #include <dlaf/communication/index.h>
 #include <dlaf/communication/kernels.h>
 #include <dlaf/eigensolver/band_to_tridiag/api.h>
diff --git a/include/dlaf/eigensolver/gen_to_std/impl.h b/include/dlaf/eigensolver/gen_to_std/impl.h
index b5cfd3e2d0..9202ae6d0d 100644
--- a/include/dlaf/eigensolver/gen_to_std/impl.h
+++ b/include/dlaf/eigensolver/gen_to_std/impl.h
@@ -487,7 +487,6 @@ void GenToStd<backend, device, T>::call_L(comm::CommunicatorGrid& grid, Matrix<T
     if (kk_rank.col() == this_rank.col()) {
       // panel partial update
       for (SizeType i_local = at_offset.rows(); i_local < distr.localNrTiles().rows(); ++i_local) {
-        const LocalTileIndex local_idx(Coord::Row, i_local);
         const LocalTileIndex ik(i_local, distr.localTileFromGlobalTile<Coord::Col>(k));
 
         hemmPanelTile<backend>(thread_priority::high, a_diag, mat_l.read(ik), mat_a.readwrite(ik));
@@ -759,7 +758,6 @@ void GenToStd<backend, device, T>::call_U(comm::CommunicatorGrid& grid, Matrix<T
     if (kk_rank.row() == this_rank.row()) {
       // panel partial update
       for (SizeType j_local = at_offset.cols(); j_local < distr.localNrTiles().cols(); ++j_local) {
-        const LocalTileIndex local_idx(Coord::Col, j_local);
         const LocalTileIndex ki(distr.localTileFromGlobalTile<Coord::Row>(k), j_local);
 
         hemmPanelTile<backend>(thread_priority::high, a_diag, mat_u.read(ki), mat_a.readwrite(ki));
diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h
index 2a7882fd82..e3f4befb0e 100644
--- a/include/dlaf/eigensolver/reduction_to_band/impl.h
+++ b/include/dlaf/eigensolver/reduction_to_band/impl.h
@@ -52,6 +52,7 @@
 #include <dlaf/matrix/tile.h>
 #include <dlaf/matrix/views.h>
 #include <dlaf/schedulers.h>
+#include <dlaf/sender/continues_on.h>
 #include <dlaf/sender/traits.h>
 #include <dlaf/types.h>
 #include <dlaf/util_math.h>
@@ -316,7 +317,7 @@ void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus, const
                             std::vector<common::internal::vector<T>>{}),  // w (internally required)
                    mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
                    ex::when_all_vector(std::move(panel_tiles))) |
-      ex::transfer(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
+      di::continues_on(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
       ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
                                                               auto& w, auto& taus, auto& tiles) {
         const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
@@ -638,7 +639,7 @@ void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
                    mat_taus.readwrite(GlobalTileIndex(j_sub, 0)),
                    ex::when_all_vector(std::move(panel_tiles)),
                    std::forward<CommSender>(mpi_col_chain_panel), std::forward<TriggerSender>(trigger)) |
-      ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
+      di::continues_on(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
       ex::bulk(nthreads, [nthreads, rank_v0,
                           cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr, auto& w,
                                                     auto& taus, auto& tiles, auto&& pcomm) {
diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h
index 69f424afbf..e4eed8ff34 100644
--- a/include/dlaf/eigensolver/tridiag_solver/merge.h
+++ b/include/dlaf/eigensolver/tridiag_solver/merge.h
@@ -50,6 +50,7 @@
 #include <dlaf/permutations/general.h>
 #include <dlaf/permutations/general/impl.h>
 #include <dlaf/schedulers.h>
+#include <dlaf/sender/continues_on.h>
 #include <dlaf/sender/make_sender_algorithm_overloads.h>
 #include <dlaf/sender/policy.h>
 #include <dlaf/sender/transform.h>
@@ -823,7 +824,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k
                    ex::when_all_vector(tc.readwrite(z)), ex::when_all_vector(tc.readwrite(evals)),
                    ex::when_all_vector(tc.read(i2)), ex::when_all_vector(tc.readwrite(evecs)),
                    ex::just(std::vector<memory::MemoryView<T, Device::CPU>>())) |
-      ex::transfer(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
+      di::continues_on(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
       ex::bulk(nthreads, [nthreads, n, nb](std::size_t thread_idx, auto& barrier_ptr, auto& k, auto& rho,
                                            auto& d_tiles, auto& z_tiles, auto& eval_tiles,
                                            const auto& i2_tile_arr, auto& evec_tiles, auto& ws_vecs) {
@@ -1031,11 +1032,12 @@ void multiplyEigenvectors(const SizeType sub_offset, const SizeType n, const Siz
   // └───┴────────┴────┘  └────────────┴────┘
 
   namespace ex = pika::execution::experimental;
+  using dlaf::internal::continues_on;
   using pika::execution::thread_priority;
 
   ex::start_detached(
       ex::when_all(std::forward<KSender>(k), std::forward<UDLSenders>(n_udl)) |
-      ex::transfer(dlaf::internal::getBackendScheduler<Backend::MC>(thread_priority::high)) |
+      continues_on(dlaf::internal::getBackendScheduler<Backend::MC>(thread_priority::high)) |
       ex::then([sub_offset, n, n_upper, n_lower, e0 = e0.subPipeline(), e1 = e1.subPipelineConst(),
                 e2 = e2.subPipelineConst()](const SizeType k, std::array<std::size_t, 3> n_udl) mutable {
         using dlaf::matrix::internal::MatrixRef;
@@ -1082,7 +1084,6 @@ void mergeSubproblems(const SizeType i_begin, const SizeType i_split, const Size
   namespace di = dlaf::internal;
   using pika::execution::thread_priority;
 
-  const GlobalTileIndex idx_gl_begin(i_begin, i_begin);
   const LocalTileIndex idx_loc_begin(i_begin, i_begin);
   const SizeType nrtiles = i_end - i_begin;
   const LocalTileSize sz_loc_tiles(nrtiles, nrtiles);
@@ -1334,7 +1335,7 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
                    // additional workspaces
                    ex::just(std::vector<memory::MemoryView<T, Device::CPU>>()),
                    ex::just(memory::MemoryView<T, Device::CPU>())) |
-      ex::transfer(hp_scheduler) |
+      di::continues_on(hp_scheduler) |
       ex::let_value([n, dist_sub, bcast_evals, all_reduce_in_place, hp_scheduler](
                         auto& row_comm_wrapper, auto& col_comm_wrapper, const SizeType k,
                         const SizeType k_lc, const auto& rho, const auto& d_tiles, auto& z_tiles,
@@ -1353,7 +1354,7 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S
           return std::clamp(ideal_workers, min_workers, available_workers);
         }();
 
-        return ex::just(std::make_unique<pika::barrier<>>(nthreads)) | ex::transfer(hp_scheduler) |
+        return ex::just(std::make_unique<pika::barrier<>>(nthreads)) | di::continues_on(hp_scheduler) |
                ex::bulk(nthreads, [&row_comm_wrapper, &col_comm_wrapper, k, k_lc, &rho, &d_tiles,
                                    &z_tiles, &eval_tiles, &i4_tiles_arr, &i6_tiles_arr, &i2_tiles_arr,
                                    &evec_tiles, &ws_cols, &ws_row, nthreads, n, dist_sub, bcast_evals,
@@ -1762,11 +1763,12 @@ void multiplyEigenvectors(const GlobalElementIndex sub_offset, const matrix::Dis
   // └───┴────────┴────┘  └────────────┴────┘
 
   namespace ex = pika::execution::experimental;
+  using dlaf::internal::continues_on;
   using pika::execution::thread_priority;
 
   ex::start_detached(
       ex::when_all(std::forward<KLcSender>(k_lc), std::forward<UDLSenders>(n_udl)) |
-      ex::transfer(dlaf::internal::getBackendScheduler<Backend::MC>(thread_priority::high)) |
+      continues_on(dlaf::internal::getBackendScheduler<Backend::MC>(thread_priority::high)) |
       ex::then([dist_sub, sub_offset, n_upper, n_lower, e0 = e0.subPipeline(),
                 e1 = e1.subPipelineConst(), e2 = e2.subPipelineConst(),
                 sub_comm_row = row_task_chain.sub_pipeline(),
@@ -1834,7 +1836,6 @@ void mergeDistSubproblems(comm::CommunicatorPipeline<comm::CommunicatorType::Ful
   const SizeType n_lower = global_tile_element_distance<Coord::Row>(dist, i_split, i_end);
 
   // The local size of the subproblem
-  const GlobalTileIndex idx_gl_begin(i_begin, i_begin);
   const LocalTileIndex idx_loc_begin{dist.next_local_tile_from_global_tile<Coord::Row>(i_begin),
                                      dist.next_local_tile_from_global_tile<Coord::Col>(i_begin)};
   const LocalTileIndex idx_loc_end{dist.next_local_tile_from_global_tile<Coord::Row>(i_end),
diff --git a/include/dlaf/factorization/cholesky/impl.h b/include/dlaf/factorization/cholesky/impl.h
index ec6aed659e..5f26f0020e 100644
--- a/include/dlaf/factorization/cholesky/impl.h
+++ b/include/dlaf/factorization/cholesky/impl.h
@@ -196,7 +196,7 @@ void Cholesky<backend, device, T>::call_L(comm::CommunicatorGrid& grid, Matrix<T
 #ifdef DLAF_WITH_HDF5
   static std::atomic<size_t> num_cholesky_calls = 0;
   std::stringstream fname;
-  fname << "cholesky-facrorization-" << matrix::internal::TypeToString_v<T> << "-"
+  fname << "cholesky-factorization-" << matrix::internal::TypeToString_v<T> << "-"
         << std::to_string(num_cholesky_calls) << ".h5";
   std::optional<matrix::internal::FileHDF5> file;
 
diff --git a/include/dlaf/init.h b/include/dlaf/init.h
index e63da1896f..fe21d84ac5 100644
--- a/include/dlaf/init.h
+++ b/include/dlaf/init.h
@@ -38,10 +38,18 @@ struct configuration {
   bool print_config = false;
   std::size_t num_np_gpu_streams_per_thread = 3;
   std::size_t num_hp_gpu_streams_per_thread = 3;
+  std::size_t umpire_host_memory_pool_initial_block_bytes = 1 << 30;
+  std::size_t umpire_host_memory_pool_next_block_bytes = 1 << 30;
+  std::size_t umpire_host_memory_pool_alignment_bytes = 16;
+  double umpire_host_memory_pool_coalescing_free_ratio = 1.0;
+  double umpire_host_memory_pool_coalescing_reallocation_ratio = 1.0;
+  std::size_t umpire_device_memory_pool_initial_block_bytes = 1 << 30;
+  std::size_t umpire_device_memory_pool_next_block_bytes = 1 << 30;
+  std::size_t umpire_device_memory_pool_alignment_bytes = 16;
+  double umpire_device_memory_pool_coalescing_free_ratio = 1.0;
+  double umpire_device_memory_pool_coalescing_reallocation_ratio = 1.0;
   std::size_t num_gpu_blas_handles = 16;
   std::size_t num_gpu_lapack_handles = 16;
-  std::size_t umpire_host_memory_pool_initial_bytes = 1 << 30;
-  std::size_t umpire_device_memory_pool_initial_bytes = 1 << 30;
   std::string mpi_pool = "mpi";
 };
 
diff --git a/include/dlaf/matrix/panel.h b/include/dlaf/matrix/panel.h
index 498ed60697..f20de36e6e 100644
--- a/include/dlaf/matrix/panel.h
+++ b/include/dlaf/matrix/panel.h
@@ -361,6 +361,8 @@ struct Panel<axis, const T, D, StoreTransposed::No> {
         return {mat_size - i_tile * mb, nb};
       case Coord::Row:
         return {mb, mat_size - i_tile * nb};
+      default:
+        return DLAF_UNREACHABLE(LocalElementSize);
     }
   }
 
diff --git a/include/dlaf/memory/memory_chunk.h b/include/dlaf/memory/memory_chunk.h
index c837d54f49..ab5846c2f0 100644
--- a/include/dlaf/memory/memory_chunk.h
+++ b/include/dlaf/memory/memory_chunk.h
@@ -27,11 +27,15 @@ namespace memory {
 
 namespace internal {
 umpire::Allocator& getUmpireHostAllocator();
-void initializeUmpireHostAllocator(std::size_t initial_bytes);
+void initializeUmpireHostAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                   std::size_t alignment_bytes, double coalesce_free_ratio,
+                                   double coalesce_reallocation_ratio);
 void finalizeUmpireHostAllocator();
 
 #ifdef DLAF_WITH_GPU
-void initializeUmpireDeviceAllocator(std::size_t initial_bytes);
+void initializeUmpireDeviceAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                     std::size_t alignment_bytes, double coalesce_free_ratio,
+                                     double coalesce_reallocation_ratio);
 void finalizeUmpireDeviceAllocator();
 umpire::Allocator& getUmpireDeviceAllocator();
 #endif
diff --git a/include/dlaf/multiplication/hermitian.h b/include/dlaf/multiplication/hermitian.h
index 6c228cc2d2..38f5e99ac1 100644
--- a/include/dlaf/multiplication/hermitian.h
+++ b/include/dlaf/multiplication/hermitian.h
@@ -66,7 +66,6 @@ void hermitian_multiplication(blas::Side side, blas::Uplo uplo, const T alpha, M
     switch (uplo) {
       case blas::Uplo::Lower:
         return multiplication::internal::Hermitian<B, D, T>::call_LL(alpha, mat_a, mat_b, beta, mat_c);
-        break;
       case blas::Uplo::Upper:
         DLAF_UNIMPLEMENTED(uplo);
         break;
@@ -130,7 +129,6 @@ void hermitian_multiplication(comm::CommunicatorGrid& grid, blas::Side side, bla
       case blas::Uplo::Lower:
         return multiplication::internal::Hermitian<B, D, T>::call_LL(grid, alpha, mat_a, mat_b, beta,
                                                                      mat_c);
-        break;
       case blas::Uplo::Upper:
         DLAF_UNIMPLEMENTED(uplo);
         break;
diff --git a/include/dlaf/permutations/general/impl.h b/include/dlaf/permutations/general/impl.h
index 8e643ea8c7..ef3f8686a6 100644
--- a/include/dlaf/permutations/general/impl.h
+++ b/include/dlaf/permutations/general/impl.h
@@ -170,6 +170,7 @@ void Permutations<B, D, T, C>::call(const SizeType i_begin, const SizeType i_end
   namespace ex = pika::execution::experimental;
   namespace dist_extra = dlaf::matrix::internal::distribution;
   using dist_extra::local_element_distance_from_global_tile;
+  using dlaf::internal::continues_on;
 
   if (i_begin == i_end)
     return;
@@ -210,7 +211,7 @@ void Permutations<B, D, T, C>::call(const SizeType i_begin, const SizeType i_end
       applyPermutationOnCPU<T, C>(i_perm, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles);
     };
 
-    ex::start_detached(std::move(sender) | ex::transfer(dlaf::internal::getBackendScheduler<B>()) |
+    ex::start_detached(std::move(sender) | continues_on(dlaf::internal::getBackendScheduler<B>()) |
                        ex::bulk(nperms, std::move(permute_fn)));
   }
   else {
@@ -430,7 +431,7 @@ void applyPackingIndex(const matrix::Distribution& subm_dist, IndexMapSender&& i
       applyPermutationOnCPU<T, C>(i_perm, subm_dist, perm_arr, mat_in_tiles, mat_out_tiles);
     };
 
-    ex::start_detached(std::move(sender) | ex::transfer(di::getBackendScheduler<Backend::MC>()) |
+    ex::start_detached(std::move(sender) | di::continues_on(di::getBackendScheduler<Backend::MC>()) |
                        ex::bulk(nperms, std::move(permute_fn)));
   }
   else {
diff --git a/include/dlaf/sender/continues_on.h b/include/dlaf/sender/continues_on.h
new file mode 100644
index 0000000000..12aae7c725
--- /dev/null
+++ b/include/dlaf/sender/continues_on.h
@@ -0,0 +1,20 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2024, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+#include <pika/execution.hpp>
+
+namespace dlaf::internal {
+#if PIKA_VERSION_FULL >= 0x001D00  // pika >= 0.29.0: re-export pika's own continues_on
+using pika::execution::experimental::continues_on;
+#else  // older pika: expose an instance of transfer_t under the name continues_on
+inline constexpr pika::execution::experimental::transfer_t continues_on{};
+#endif
+}
diff --git a/include/dlaf/sender/transform.h b/include/dlaf/sender/transform.h
index 78b66eb001..4e391456b4 100644
--- a/include/dlaf/sender/transform.h
+++ b/include/dlaf/sender/transform.h
@@ -17,6 +17,7 @@
 #include <dlaf/common/unwrap.h>
 #include <dlaf/init.h>
 #include <dlaf/schedulers.h>
+#include <dlaf/sender/continues_on.h>
 #include <dlaf/sender/policy.h>
 #include <dlaf/sender/typelist.h>
 #include <dlaf/sender/when_all_lift.h>
@@ -46,19 +47,19 @@ enum class TransformDispatchType { Plain, Blas, Lapack };
 // allows choosing the priority.
 //
 // At its core, transform is a convenience wrapper around
-// sender | transfer(with_priority(scheduler, priority)) | then(ConsumeRvalues(unwrapping(f))).
+// sender | continues_on(with_priority(scheduler, priority)) | then(ConsumeRvalues(unwrapping(f))).
 
 /// Lazy transform. This does not submit the work and returns a sender.
 template <TransformDispatchType Tag = TransformDispatchType::Plain, Backend B = Backend::MC,
           typename F = void, typename Sender = void,
           typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
 [[nodiscard]] decltype(auto) transform(const Policy<B> policy, F&& f, Sender&& sender) {
+  using dlaf::internal::continues_on;
   using pika::execution::experimental::drop_operation_state;
   using pika::execution::experimental::then;
-  using pika::execution::experimental::transfer;
 
   auto scheduler = getBackendScheduler<B>(policy.priority(), policy.stacksize());
-  auto transfer_sender = transfer(std::forward<Sender>(sender), std::move(scheduler));
+  auto transfer_sender = continues_on(std::forward<Sender>(sender), std::move(scheduler));
 
   using dlaf::common::internal::ConsumeRvalues;
   using dlaf::common::internal::Unwrapping;
diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h
index edfbd7d419..f5c2ac728a 100644
--- a/include/dlaf/sender/transform_mpi.h
+++ b/include/dlaf/sender/transform_mpi.h
@@ -18,8 +18,8 @@
 #include <dlaf/common/unwrap.h>
 #include <dlaf/communication/communicator.h>
 #include <dlaf/communication/communicator_pipeline.h>
+#include <dlaf/sender/continues_on.h>
 #include <dlaf/sender/transform.h>
-#include <dlaf/sender/when_all_lift.h>
 
 namespace dlaf::comm::internal {
 
@@ -89,38 +89,14 @@ MPICallHelper(F&&) -> MPICallHelper<std::decay_t<F>>;
 template <typename F, typename Sender,
           typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
 [[nodiscard]] decltype(auto) transformMPI(F&& f, Sender&& sender) {
+  using dlaf::internal::continues_on;
   namespace ex = pika::execution::experimental;
 
-  return ex::transfer(std::forward<Sender>(sender), dlaf::internal::getMPIScheduler()) |
+  return continues_on(std::forward<Sender>(sender), dlaf::internal::getMPIScheduler()) |
          ex::then(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::forward<F>(f)}}) |
          ex::drop_operation_state();
 }
 
-/// Fire-and-forget transformMPI. This submits the work and returns void.
-template <typename F, typename Sender,
-          typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
-void transformMPIDetach(F&& f, Sender&& sender) {
-  pika::execution::experimental::start_detached(transformMPI(std::forward<F>(f),
-                                                             std::forward<Sender>(sender)));
-}
-
-/// Lazy transformMPI. This does not submit the work and returns a sender. First
-/// lifts non-senders into senders using just, and then calls transform with a
-/// when_all sender of the lifted senders.
-template <typename F, typename... Ts>
-[[nodiscard]] decltype(auto) transformMPILift(F&& f, Ts&&... ts) {
-  return transformMPI(std::forward<F>(f), dlaf::internal::whenAllLift(std::forward<Ts>(ts)...));
-}
-
-/// Fire-and-forget transformMPI. This submits the work and returns void. First
-/// lifts non-senders into senders using just, and then calls transform with a
-/// when_all sender of the lifted senders.
-template <typename F, typename... Ts>
-void transformMPILiftDetach(F&& f, Ts&&... ts) {
-  pika::execution::experimental::start_detached(transformLift(std::forward<F>(f),
-                                                              std::forward<Ts>(ts)...));
-}
-
 template <typename F>
 struct PartialTransformMPIBase {
   std::decay_t<F> f_;
@@ -148,29 +124,6 @@ class PartialTransformMPI : private PartialTransformMPIBase<F> {
 template <typename F>
 PartialTransformMPI(F&& f) -> PartialTransformMPI<std::decay_t<F>>;
 
-/// A partially applied transformMPIDetach, with the callable object given, but
-/// the predecessor sender missing. The predecessor sender is applied when
-/// calling the operator| overload.
-template <typename F>
-class PartialTransformMPIDetach : private PartialTransformMPIBase<F> {
-public:
-  template <typename F_>
-  PartialTransformMPIDetach(F_&& f) : PartialTransformMPIBase<F>{std::forward<F_>(f)} {}
-  PartialTransformMPIDetach(PartialTransformMPIDetach&&) = default;
-  PartialTransformMPIDetach(const PartialTransformMPIDetach&) = default;
-  PartialTransformMPIDetach& operator=(PartialTransformMPIDetach&&) = default;
-  PartialTransformMPIDetach& operator=(const PartialTransformMPIDetach&) = default;
-
-  template <typename Sender>
-  friend auto operator|(Sender&& sender, PartialTransformMPIDetach pa) {
-    return pika::execution::experimental::start_detached(transformMPI(std::move(pa.f_),
-                                                                      std::forward<Sender>(sender)));
-  }
-};
-
-template <typename F>
-PartialTransformMPIDetach(F&& f) -> PartialTransformMPIDetach<std::decay_t<F>>;
-
 /// \overload transformMPI
 ///
 /// This overload partially applies the MPI transform for later use with
@@ -179,13 +132,4 @@ template <typename F>
 [[nodiscard]] decltype(auto) transformMPI(F&& f) {
   return PartialTransformMPI{std::forward<F>(f)};
 }
-
-/// \overload transformMPIDetach
-///
-/// This overload partially applies transformMPIDetach for later use with
-/// operator| with a sender on the left-hand side.
-template <typename F>
-[[nodiscard]] decltype(auto) transformMPIDetach(F&& f) {
-  return PartialTransformMPIDetach{std::forward<F>(f)};
-}
 }
diff --git a/miniapp/include/dlaf/miniapp/options.h b/miniapp/include/dlaf/miniapp/options.h
index fc2248ab20..5254ce9291 100644
--- a/miniapp/include/dlaf/miniapp/options.h
+++ b/miniapp/include/dlaf/miniapp/options.h
@@ -170,7 +170,7 @@ T stringToBlasEnum(const std::string& option_name, const std::string& x,
       }
     }
     std::string option_name_dashes = "--" + option_name;
-    DLAF_MINIAPP_INVALID_OPTION_VALUE(option_name, x, valid_values_stream.str());
+    DLAF_MINIAPP_INVALID_OPTION_VALUE(option_name_dashes, x, valid_values_stream.str());
   }
 
   return static_cast<T>(std::toupper(x[0]));
diff --git a/scripts/plot_strong.sh b/scripts/plot_strong.sh
new file mode 100755
index 0000000000..9a85e274a2
--- /dev/null
+++ b/scripts/plot_strong.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+
+#
+# Distributed Linear Algebra with Future (DLAF)
+#
+# Copyright (c) 2018-2024, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+# This script helps to plot the results of different benchmarks.
+
+set -eu
+
+####################################################################################################
+# Variables to modify:
+debug=0
+python_venv_path=""
+# Path where to find the benchmarks data as the root of z/ and d/ (since they are appended later)
+# (list containing several paths if we want to compare data on the same plots)
+base_paths=(
+    "" \
+    "" \
+    )
+# Path where you want your plotting results
+out_path=""
+####################################################################################################
+
+if [[ -z $out_path ]] || [[ -z ${base_paths[0]} ]]; then
+    echo "You need to set the variables in the beginning of the script" >&2
+    exit 1
+fi
+
+if [[ -n "$python_venv_path" ]]; then
+    source "$python_venv_path/bin/activate"
+fi
+
+complex_paths=("${base_paths[@]/%//z}")
+double_paths=("${base_paths[@]/%//d}")
+out_path_complex=${out_path/%//z}
+out_path_double=${out_path/%//d}
+
+args_base=(--distinguish-dir)
+args_double=("${args_base[@]}" --out-path "$out_path_double")
+args_complex=("${args_base[@]}" --out-path "$out_path_complex")
+
+# Add one --path option per base path; arrays keep paths containing spaces intact.
+# complex_paths and double_paths share their indices with base_paths.
+for idx in "${!base_paths[@]}"; do
+    args_complex+=(--path "${complex_paths[$idx]}")
+    args_double+=(--path "${double_paths[$idx]}")
+done
+
+if [[ "$debug" == 1 ]]; then
+    BOLD=$(tput bold)
+    NORMAL=$(tput sgr0)
+    echo "${BOLD}double_paths list:${NORMAL} ${double_paths[*]}"
+    echo "${BOLD}complex_paths list:${NORMAL} ${complex_paths[*]}"
+    echo "${BOLD}double_args:${NORMAL} ${args_double[*]}"
+    echo "${BOLD}complex_args:${NORMAL} ${args_complex[*]}"
+else
+    set -x
+    # double
+    ./plot_chol_strong.py "${args_double[@]}" &
+    ./plot_band2trid_strong.py "${args_double[@]}" &
+    ./plot_hegst_strong.py "${args_double[@]}" &
+    ./plot_trmm_strong.py "${args_double[@]}" &
+    ./plot_bt_band2trid_strong.py "${args_double[@]}" &
+    ./plot_evp_strong.py "${args_double[@]}" &
+    ./plot_red2band_strong.py "${args_double[@]}" &
+    ./plot_trsm_strong.py "${args_double[@]}" &
+    ./plot_bt_red2band_strong.py "${args_double[@]}" &
+    ./plot_gevp_strong.py "${args_double[@]}" &
+    ./plot_tridiag_solver_strong.py "${args_double[@]}" &
+
+    # complex
+    ./plot_chol_strong.py "${args_complex[@]}" &
+    ./plot_band2trid_strong.py "${args_complex[@]}" &
+    ./plot_hegst_strong.py "${args_complex[@]}" &
+    ./plot_trmm_strong.py "${args_complex[@]}" &
+    ./plot_bt_band2trid_strong.py "${args_complex[@]}" &
+    ./plot_evp_strong.py "${args_complex[@]}" &
+    ./plot_red2band_strong.py "${args_complex[@]}" &
+    ./plot_trsm_strong.py "${args_complex[@]}" &
+    ./plot_bt_red2band_strong.py "${args_complex[@]}" &
+    ./plot_gevp_strong.py "${args_complex[@]}" &
+    ./plot_tridiag_solver_strong.py "${args_complex[@]}" &
+    # Block until every background plot job has finished (a bare wait does not report their failures)
+    wait
+fi
diff --git a/scripts/systems.py b/scripts/systems.py
index b28584a5bc..c87e79d760 100644
--- a/scripts/systems.py
+++ b/scripts/systems.py
@@ -49,6 +49,7 @@
 #SBATCH --constraint=mc
 #SBATCH --output=output.txt
 #SBATCH --error=error.txt
+#SBATCH --no-requeue
 
 # Env
 export MPICH_MAX_THREAD_SAFETY=multiple
@@ -81,6 +82,7 @@
 #SBATCH --constraint=gpu
 #SBATCH --output=output.txt
 #SBATCH --error=error.txt
+#SBATCH --no-requeue
 
 # Env
 export MPICH_MAX_THREAD_SAFETY=multiple
@@ -112,6 +114,7 @@
 #SBATCH --constraint=mc
 #SBATCH --output=output.txt
 #SBATCH --error=error.txt
+#SBATCH --no-requeue
 
 # Env
 export MPICH_MAX_THREAD_SAFETY=multiple
@@ -146,6 +149,7 @@
 #SBATCH --hint=multithread
 #SBATCH --output=output.txt
 #SBATCH --error=error.txt
+#SBATCH --no-requeue
 
 # Env
 export MPICH_MAX_THREAD_SAFETY=multiple
@@ -179,6 +183,7 @@
 #SBATCH --hint=multithread
 #SBATCH --output=output.txt
 #SBATCH --error=error.txt
+#SBATCH --no-requeue
 
 # Env
 export MPICH_MAX_THREAD_SAFETY=multiple
@@ -211,6 +216,7 @@
 #SBATCH --hint=multithread
 #SBATCH --output=output.txt
 #SBATCH --error=error.txt
+#SBATCH --no-requeue
 
 # Env
 export MPICH_OPT_THREAD_SYNC=0 # Required to work around MPICH bug
@@ -245,6 +251,7 @@
 #SBATCH --hint=multithread
 #SBATCH --output=output.txt
 #SBATCH --error=error.txt
+#SBATCH --no-requeue
 
 # Env
 export MPICH_MAX_THREAD_SAFETY=multiple
@@ -283,6 +290,7 @@
 #SBATCH --gpus-per-node=8
 #SBATCH --output=output.txt
 #SBATCH --error=error.txt
+#SBATCH --no-requeue
 
 # Env
 export MPICH_MAX_THREAD_SAFETY=multiple
diff --git a/src/eigensolver/bt_band_to_tridiag/gpu.cpp b/src/eigensolver/bt_band_to_tridiag/gpu.cpp
index bc806bc7e8..86884b7162 100644
--- a/src/eigensolver/bt_band_to_tridiag/gpu.cpp
+++ b/src/eigensolver/bt_band_to_tridiag/gpu.cpp
@@ -8,6 +8,8 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
 
+#include <complex>
+
 #include <dlaf/eigensolver/bt_band_to_tridiag/impl.h>
 
 namespace dlaf::eigensolver::internal {
diff --git a/src/init.cpp b/src/init.cpp
index b1985b80a6..0d4366e15e 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -29,15 +29,23 @@
 
 namespace dlaf {
 std::ostream& operator<<(std::ostream& os, const configuration& cfg) {
+  // clang-format off
   os << "  num_np_gpu_streams_per_thread = " << cfg.num_np_gpu_streams_per_thread << std::endl;
   os << "  num_hp_gpu_streams_per_thread = " << cfg.num_hp_gpu_streams_per_thread << std::endl;
+  os << "  umpire_host_memory_pool_initial_block_bytes = " << cfg.umpire_host_memory_pool_initial_block_bytes << std::endl;
+  os << "  umpire_host_memory_pool_next_block_bytes = " << cfg.umpire_host_memory_pool_next_block_bytes << std::endl;
+  os << "  umpire_host_memory_pool_alignment_bytes = " << cfg.umpire_host_memory_pool_alignment_bytes << std::endl;
+  os << "  umpire_host_memory_pool_coalescing_free_ratio = " << cfg.umpire_host_memory_pool_coalescing_free_ratio << std::endl;
+  os << "  umpire_host_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_host_memory_pool_coalescing_reallocation_ratio << std::endl;
+  os << "  umpire_device_memory_pool_initial_block_bytes = " << cfg.umpire_device_memory_pool_initial_block_bytes << std::endl;
+  os << "  umpire_device_memory_pool_next_block_bytes = " << cfg.umpire_device_memory_pool_next_block_bytes << std::endl;
+  os << "  umpire_device_memory_pool_alignment_bytes = " << cfg.umpire_device_memory_pool_alignment_bytes << std::endl;
+  os << "  umpire_device_memory_pool_coalescing_free_ratio = " << cfg.umpire_device_memory_pool_coalescing_free_ratio << std::endl;
+  os << "  umpire_device_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_device_memory_pool_coalescing_reallocation_ratio << std::endl;
   os << "  num_gpu_blas_handles = " << cfg.num_gpu_blas_handles << std::endl;
   os << "  num_gpu_lapack_handles = " << cfg.num_gpu_lapack_handles << std::endl;
-  os << "  umpire_host_memory_pool_initial_bytes = " << cfg.umpire_host_memory_pool_initial_bytes
-     << std::endl;
-  os << "  umpire_device_memory_pool_initial_bytes = " << cfg.umpire_device_memory_pool_initial_bytes
-     << std::endl;
   os << "  mpi_pool = " << cfg.mpi_pool << std::endl;
+  // clang-format on
   return os;
 }
 
@@ -58,7 +66,10 @@ struct Init {
 template <>
 struct Init<Backend::MC> {
   static void initialize(const configuration& cfg) {
-    memory::internal::initializeUmpireHostAllocator(cfg.umpire_host_memory_pool_initial_bytes);
+    memory::internal::initializeUmpireHostAllocator(
+        cfg.umpire_host_memory_pool_initial_block_bytes, cfg.umpire_host_memory_pool_next_block_bytes,
+        cfg.umpire_host_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio,
+        cfg.umpire_host_memory_pool_coalescing_reallocation_ratio);
   }
 
   static void finalize() {
@@ -106,7 +117,11 @@ template <>
 struct Init<Backend::GPU> {
   static void initialize(const configuration& cfg) {
     const int device = 0;
-    memory::internal::initializeUmpireDeviceAllocator(cfg.umpire_device_memory_pool_initial_bytes);
+    memory::internal::initializeUmpireDeviceAllocator(
+        cfg.umpire_device_memory_pool_initial_block_bytes,
+        cfg.umpire_device_memory_pool_next_block_bytes, cfg.umpire_device_memory_pool_alignment_bytes,
+        cfg.umpire_device_memory_pool_coalescing_free_ratio,
+        cfg.umpire_device_memory_pool_coalescing_reallocation_ratio);
     initializeGpuPool(device, cfg.num_np_gpu_streams_per_thread, cfg.num_hp_gpu_streams_per_thread,
                       cfg.num_gpu_blas_handles, cfg.num_gpu_lapack_handles);
     pika::cuda::experimental::detail::register_polling(pika::resource::get_thread_pool("default"));
@@ -140,6 +155,13 @@ struct parseFromString<SizeType> {
   }
 };
 
+template <>
+struct parseFromString<double> {
+  static std::optional<double> call(const std::string& var) {
+    return std::stod(var);
+  }
+};
+
 template <>
 struct parseFromString<bool> {
   static std::optional<bool> call(const std::string& var) {
@@ -216,26 +238,27 @@ void warnUnusedConfigurationOption(const pika::program_options::variables_map& v
 }
 
 void updateConfiguration(const pika::program_options::variables_map& vm, configuration& cfg) {
+  // clang-format off
   updateConfigurationValue(vm, cfg.print_config, "PRINT_CONFIG", "print-config");
-  updateConfigurationValue(vm, cfg.num_np_gpu_streams_per_thread, "NUM_NP_GPU_STREAMS_PER_THREAD",
-                           "num-np-gpu-streams-per-thread");
-  updateConfigurationValue(vm, cfg.num_hp_gpu_streams_per_thread, "NUM_HP_GPU_STREAMS_PER_THREAD",
-                           "num-hp-gpu-streams-per-thread");
+  updateConfigurationValue(vm, cfg.num_np_gpu_streams_per_thread, "NUM_NP_GPU_STREAMS_PER_THREAD", "num-np-gpu-streams-per-thread");
+  updateConfigurationValue(vm, cfg.num_hp_gpu_streams_per_thread, "NUM_HP_GPU_STREAMS_PER_THREAD", "num-hp-gpu-streams-per-thread");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_initial_block_bytes, "UMPIRE_HOST_MEMORY_POOL_INITIAL_BLOCK_BYTES", "umpire-host-memory-pool-initial-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_next_block_bytes, "UMPIRE_HOST_MEMORY_POOL_NEXT_BLOCK_BYTES", "umpire-host-memory-pool-next-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_alignment_bytes, "UMPIRE_HOST_MEMORY_POOL_ALIGNMENT_BYTES", "umpire-host-memory-pool-alignment-bytes");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_coalescing_free_ratio, "UMPIRE_HOST_MEMORY_POOL_COALESCING_FREE_RATIO", "umpire-host-memory-pool-coalescing-free-ratio");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_coalescing_reallocation_ratio, "UMPIRE_HOST_MEMORY_POOL_COALESCING_REALLOCATION_RATIO", "umpire-host-memory-pool-coalescing-reallocation-ratio");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_initial_block_bytes, "UMPIRE_DEVICE_MEMORY_POOL_INITIAL_BLOCK_BYTES", "umpire-device-memory-pool-initial-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_next_block_bytes, "UMPIRE_DEVICE_MEMORY_POOL_NEXT_BLOCK_BYTES", "umpire-device-memory-pool-next-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_alignment_bytes, "UMPIRE_DEVICE_MEMORY_POOL_ALIGNMENT_BYTES", "umpire-device-memory-pool-alignment-bytes");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_coalescing_free_ratio, "UMPIRE_DEVICE_MEMORY_POOL_COALESCING_FREE_RATIO", "umpire-device-memory-pool-coalescing-free-ratio");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_coalescing_reallocation_ratio, "UMPIRE_DEVICE_MEMORY_POOL_COALESCING_REALLOCATION_RATIO", "umpire-device-memory-pool-coalescing-reallocation-ratio");
   updateConfigurationValue(vm, cfg.num_gpu_blas_handles, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles");
-  updateConfigurationValue(vm, cfg.num_gpu_lapack_handles, "NUM_GPU_LAPACK_HANDLES",
-                           "num-gpu-lapack-handles");
+  updateConfigurationValue(vm, cfg.num_gpu_lapack_handles, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles");
 #if PIKA_VERSION_FULL < 0x001D00  // < 0.29.0
-  warnUnusedConfigurationOption(vm, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles",
-                                "only supported with pika 0.29.0 or newer");
-  warnUnusedConfigurationOption(vm, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles",
-                                "only supported with pika 0.29.0 or newer");
+  warnUnusedConfigurationOption(vm, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles", "only supported with pika 0.29.0 or newer");
+  warnUnusedConfigurationOption(vm, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles", "only supported with pika 0.29.0 or newer");
 #endif
-  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_initial_bytes,
-                           "UMPIRE_HOST_MEMORY_POOL_INITIAL_BYTES",
-                           "umpire-host-memory-pool-initial-bytes");
-  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_initial_bytes,
-                           "UMPIRE_DEVICE_MEMORY_POOL_INITIAL_BYTES",
-                           "umpire-device-memory-pool-initial-bytes");
+  // clang-format on
   cfg.mpi_pool = (pika::resource::pool_exists("mpi")) ? "mpi" : "default";
 
   // Warn if not using MPI pool without --dlaf:no-mpi-pool
@@ -257,44 +280,28 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu
   // NOTE: Environment variables should omit the DLAF_ prefix and command line options the dlaf: prefix.
   // These are added automatically by updateConfigurationValue.
   auto& param = getTuneParameters();
-  updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS",
-                           "red2band-panel-nworkers");
-
-  updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US",
-                           "red2band-barrier-busy-wait-us");
-
-  updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND",
-                           "eigensolver-min-band");
-
-  updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base,
-                           "BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base");
-
-  updateConfigurationValue(vm, param.debug_dump_cholesky_factorization_data,
-                           "DEBUG_DUMP_CHOLESKY_FACTORIZATION_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_generalized_eigensolver_data,
-                           "DEBUG_DUMP_GENERALIZED_EIGENSOLVER_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_generalized_to_standard_data,
-                           "DEBUG_DUMP_GENERALIZED_TO_STANDARD_DATA", "");
+  // clang-format off
+  updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS", "red2band-panel-nworkers");
+  updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US", "red2band-barrier-busy-wait-us");
+  updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND", "eigensolver-min-band");
+  updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base, "BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base");
+
+  updateConfigurationValue(vm, param.debug_dump_cholesky_factorization_data, "DEBUG_DUMP_CHOLESKY_FACTORIZATION_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_generalized_eigensolver_data, "DEBUG_DUMP_GENERALIZED_EIGENSOLVER_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_generalized_to_standard_data, "DEBUG_DUMP_GENERALIZED_TO_STANDARD_DATA", "");
   updateConfigurationValue(vm, param.debug_dump_eigensolver_data, "DEBUG_DUMP_EIGENSOLVER_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_reduction_to_band_data,
-                           "DEBUG_DUMP_REDUCTION_TO_BAND_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data,
-                           "DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA",
-                           "");
+  updateConfigurationValue(vm, param.debug_dump_reduction_to_band_data, "DEBUG_DUMP_REDUCTION_TO_BAND_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data, "DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA", "");
 
-  updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS",
-                           "tridiag-rank1-nworkers");
+  updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS", "tridiag-rank1-nworkers");
 
-  updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us,
-                           "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us");
+  updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us, "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us");
 
-  updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size,
-                           "BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE",
-                           "bt-band-to-tridiag-hh-apply-group-size");
+  updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size, "BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE", "bt-band-to-tridiag-hh-apply-group-size");
 
-  updateConfigurationValue(vm, param.communicator_grid_num_pipelines, "COMMUNICATOR_GRID_NUM_PIPELINES",
-                           "communicator-grid-num-pipelines");
+  updateConfigurationValue(vm, param.communicator_grid_num_pipelines, "COMMUNICATOR_GRID_NUM_PIPELINES", "communicator-grid-num-pipelines");
+  // clang-format on
 }
 
 configuration& getConfiguration() {
@@ -306,49 +313,35 @@ configuration& getConfiguration() {
 pika::program_options::options_description getOptionsDescription() {
   pika::program_options::options_description desc("DLA-Future options");
 
+  // clang-format off
   desc.add_options()("dlaf:help", "Print help message");
   desc.add_options()("dlaf:print-config", "Print the DLA-Future configuration");
-  desc.add_options()("dlaf:num-np-gpu-streams-per-thread", pika::program_options::value<std::size_t>(),
-                     "Number of normal priority GPU streams per worker thread");
-  desc.add_options()("dlaf:num-hp-gpu-streams-per-thread", pika::program_options::value<std::size_t>(),
-                     "Number of high priority GPU streams per worker thread");
-  desc.add_options()("dlaf:num-gpu-blas-handles", pika::program_options::value<std::size_t>(),
-                     "Number of GPU BLAS (cuBLAS/rocBLAS) handles");
-  desc.add_options()("dlaf:num-gpu-lapack-handles", pika::program_options::value<std::size_t>(),
-                     "Number of GPU LAPACK (cuSOLVER/rocSOLVER) handles");
-  desc.add_options()("dlaf:umpire-host-memory-pool-initial-bytes",
-                     pika::program_options::value<std::size_t>(),
-                     "Number of bytes to preallocate for pinned host memory pool");
-  desc.add_options()("dlaf:umpire-device-memory-pool-initial-bytes",
-                     pika::program_options::value<std::size_t>(),
-                     "Number of bytes to preallocate for device memory pool");
+  desc.add_options()("dlaf:num-np-gpu-streams-per-thread", pika::program_options::value<std::size_t>(), "Number of normal priority GPU streams per worker thread");
+  desc.add_options()("dlaf:num-hp-gpu-streams-per-thread", pika::program_options::value<std::size_t>(), "Number of high priority GPU streams per worker thread");
+  desc.add_options()("dlaf:umpire-host-memory-pool-initial-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to preallocate for pinned host memory pool");
+  desc.add_options()("dlaf:umpire-host-memory-pool-next-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to allocate in blocks after the first block for pinned host memory pool");
+  desc.add_options()("dlaf:umpire-host-memory-pool-alignment-bytes", pika::program_options::value<std::size_t>(), "Alignment of allocations in bytes in pinned host memory pool");
+  desc.add_options()("dlaf:umpire-host-memory-pool-coalescing-free-ratio", pika::program_options::value<double>(), "Required ratio of free memory in pinned host memory pool before performing coalescing of free blocks");
+  desc.add_options()("dlaf:umpire-host-memory-pool-coalescing-reallocation-ratio", pika::program_options::value<double>(), "Ratio of current used memory in pinned host memory pool to use for reallocation of new blocks when coalescing free blocks");
+  desc.add_options()("dlaf:umpire-device-memory-pool-initial-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to preallocate for device memory pool");
+  desc.add_options()("dlaf:umpire-device-memory-pool-next-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to allocate in blocks after the first block for device memory pool");
+  desc.add_options()("dlaf:umpire-device-memory-pool-alignment-bytes", pika::program_options::value<std::size_t>(), "Alignment of allocations in bytes in device memory pool");
+  desc.add_options()("dlaf:umpire-device-memory-pool-coalescing-free-ratio", pika::program_options::value<double>(), "Required ratio of free memory in device memory pool before performing coalescing of free blocks");
+  desc.add_options()("dlaf:umpire-device-memory-pool-coalescing-reallocation-ratio", pika::program_options::value<double>(), "Ratio of current used memory in device memory pool to use for reallocation of new blocks when coalescing free blocks");
+  desc.add_options()("dlaf:num-gpu-blas-handles", pika::program_options::value<std::size_t>(), "Number of GPU BLAS (cuBLAS/rocBLAS) handles");
+  desc.add_options()("dlaf:num-gpu-lapack-handles", pika::program_options::value<std::size_t>(), "Number of GPU LAPACK (cuSOLVER/rocSOLVER) handles");
   desc.add_options()("dlaf:no-mpi-pool", pika::program_options::bool_switch(), "Disable the MPI pool.");
 
   // Tune parameters command line options
-  desc.add_options()(
-      "dlaf:red2band-panel-nworkers", pika::program_options::value<std::size_t>(),
-      "The maximum number of threads to use for computing the panel in the reduction to band algorithm.");
-  desc.add_options()(
-      "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value<std::size_t>(),
-      "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm.");
-  desc.add_options()(
-      "dlaf:eigensolver-min-band", pika::program_options::value<SizeType>(),
-      "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead.");
-  desc.add_options()(
-      "dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value<SizeType>(),
-      "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. (The input matrix is distributed with a {nb x nb} block size.)");
-  desc.add_options()(
-      "dlaf:tridiag-rank1-nworkers", pika::program_options::value<std::size_t>(),
-      "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm.");
-  desc.add_options()(
-      "dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value<std::size_t>(),
-      "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm.");
-  desc.add_options()(
-      "dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value<SizeType>(),
-      "The application of the HH reflector is splitted in smaller applications of group size reflectors.");
-  desc.add_options()(
-      "dlaf:communicator-grid-num-pipelines", pika::program_options::value<std::size_t>(),
-      "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid.");
+  desc.add_options()("dlaf:red2band-panel-nworkers", pika::program_options::value<std::size_t>(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm.");
+  desc.add_options()("dlaf:red2band-barrier-busy-wait-us", pika::program_options::value<std::size_t>(), "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm.");
+  desc.add_options()("dlaf:eigensolver-min-band", pika::program_options::value<SizeType>(), "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead.");
+  desc.add_options()("dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value<SizeType>(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. (The input matrix is distributed with a {nb x nb} block size.)");
+  desc.add_options()("dlaf:tridiag-rank1-nworkers", pika::program_options::value<std::size_t>(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm.");
+  desc.add_options()("dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value<std::size_t>(), "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm.");
+  desc.add_options()("dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value<SizeType>(), "The application of the HH reflector is split in smaller applications of group size reflectors.");
+  desc.add_options()("dlaf:communicator-grid-num-pipelines", pika::program_options::value<std::size_t>(), "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid.");
+  // clang-format on
 
   return desc;
 }
diff --git a/src/lapack/gpu/add.cu b/src/lapack/gpu/add.cu
index 9e9f3e97e7..4ef3329b64 100644
--- a/src/lapack/gpu/add.cu
+++ b/src/lapack/gpu/add.cu
@@ -32,6 +32,20 @@ __device__ inline void addAlpha(const T& alpha, const T& a, T& b) {
   b = b + alpha * a;
 }
 
+#ifdef DLAF_WITH_HIP
+template <>
+__device__ inline void addAlpha<hipFloatComplex>(const hipFloatComplex& alpha, const hipFloatComplex& a,
+                                                 hipFloatComplex& b) {
+  b = b + hipCmulf(alpha, a);
+}
+
+template <>
+__device__ inline void addAlpha<hipDoubleComplex>(const hipDoubleComplex& alpha,
+                                                  const hipDoubleComplex& a, hipDoubleComplex& b) {
+  b = b + hipCmul(alpha, a);
+}
+#endif
+
 template <class T>
 __device__ inline void sum(const T& /*alpha*/, const T& a, T& b) {
   b = b + a;
diff --git a/src/memory/memory_chunk.cpp b/src/memory/memory_chunk.cpp
index a2b0c33885..9880c1a036 100644
--- a/src/memory/memory_chunk.cpp
+++ b/src/memory/memory_chunk.cpp
@@ -11,9 +11,11 @@
 #include <cstddef>
 
 #include <umpire/ResourceManager.hpp>
+#include <umpire/strategy/PoolCoalesceHeuristic.hpp>
 #include <umpire/strategy/QuickPool.hpp>
 #include <umpire/strategy/ThreadSafeAllocator.hpp>
 
+#include <dlaf/common/assert.h>
 #include <dlaf/memory/memory_chunk.h>
 
 namespace dlaf {
@@ -36,7 +38,46 @@ umpire::Allocator& getUmpireHostAllocator() {
 }
 #endif
 
-void initializeUmpireHostAllocator(std::size_t initial_bytes) {
+using PoolType = umpire::strategy::QuickPool;
+using CoalesceHeuristicType = umpire::strategy::PoolCoalesceHeuristic<PoolType>;
+
+#ifdef DLAF_WITH_GPU
+// This is a modified version of the "percent_releasable" coalescing heuristic
+// from Umpire. This version allows choosing what ratio of the actual size to
+// reallocate when coalescing.
+//
+// A free ratio of 1.0 means that the pool will be coalesced only when all
+// blocks are unused. A free ratio of 0.5 means that the pool will be coalesced
+// when at least 50% of the pool's memory is unused. A ratio of 0.0 means that
+// the pool will be coalesced as soon as any two free blocks are available. A
+// ratio of more than 1.0 will make the pool never coalesce.
+//
+// A reallocation ratio of 1.0 simply coalesces all the free memory into a new
+// block. A ratio of 0.5 will attempt to shrink the pool to half its previous
+// size. A ratio of 1.5 will allocate 50% more than the previous pool size.
+//
+// A single free block is never "coalesced" to keep things simple. In theory a
+// single block could be shrunk or grown to match the reallocation ratio but
+// this can lead to strange reallocations, so we simply avoid that case. Two or
+// more blocks are always coalesced to one block, so no reallocation will
+// happen immediately after coalescing two or more blocks.
+static CoalesceHeuristicType get_coalesce_heuristic(double coalesce_free_ratio,
+                                                    double coalesce_reallocation_ratio) {
+  return [=](const PoolType& pool) {
+    std::size_t threshold = static_cast<std::size_t>(coalesce_free_ratio * pool.getActualSize());
+    if (pool.getReleasableBlocks() >= 2 && pool.getReleasableSize() >= threshold) {
+      return static_cast<std::size_t>(coalesce_reallocation_ratio * pool.getActualSize());
+    }
+    else {
+      return static_cast<std::size_t>(0);
+    }
+  };
+}
+#endif
+
+void initializeUmpireHostAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                   std::size_t alignment_bytes, double coalesce_free_ratio,
+                                   double coalesce_reallocation_ratio) {
 #ifdef DLAF_WITH_GPU
   static bool initialized = false;
 
@@ -45,26 +86,29 @@ void initializeUmpireHostAllocator(std::size_t initial_bytes) {
   if (!initialized) {
     auto host_allocator = umpire::ResourceManager::getInstance().getAllocator("PINNED");
     auto pooled_host_allocator =
-        umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::QuickPool>("PINNED_pool",
-                                                                                          host_allocator,
-                                                                                          initial_bytes);
+        umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::QuickPool>(
+            "DLAF_PINNED_pool", host_allocator, initial_block_bytes, next_block_bytes, alignment_bytes,
+            get_coalesce_heuristic(coalesce_free_ratio, coalesce_reallocation_ratio));
     auto thread_safe_pooled_host_allocator =
         umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::ThreadSafeAllocator>(
-            "PINNED_thread_safe_pool", pooled_host_allocator);
+            "DLAF_PINNED_thread_safe_pool", pooled_host_allocator);
 
     memory::internal::getUmpireHostAllocator() = thread_safe_pooled_host_allocator;
 
     initialized = true;
   }
 #else
-  (void) initial_bytes;
+  dlaf::internal::silenceUnusedWarningFor(initial_block_bytes, next_block_bytes, alignment_bytes,
+                                          coalesce_free_ratio, coalesce_reallocation_ratio);
 #endif
 }
 
 void finalizeUmpireHostAllocator() {}
 
 #ifdef DLAF_WITH_GPU
-void initializeUmpireDeviceAllocator(std::size_t initial_bytes) {
+void initializeUmpireDeviceAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                     std::size_t alignment_bytes, double coalesce_free_ratio,
+                                     double coalesce_reallocation_ratio) {
   static bool initialized = false;
 
   // Umpire pools cannot be released, so we keep the pools around even when
@@ -73,10 +117,11 @@ void initializeUmpireDeviceAllocator(std::size_t initial_bytes) {
     auto device_allocator = umpire::ResourceManager::getInstance().getAllocator("DEVICE");
     auto pooled_device_allocator =
         umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::QuickPool>(
-            "DEVICE_pool", device_allocator, initial_bytes);
+            "DLAF_DEVICE_pool", device_allocator, initial_block_bytes, next_block_bytes, alignment_bytes,
+            get_coalesce_heuristic(coalesce_free_ratio, coalesce_reallocation_ratio));
     auto thread_safe_pooled_device_allocator =
         umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::ThreadSafeAllocator>(
-            "DEVICE_thread_safe_pool", pooled_device_allocator);
+            "DLAF_DEVICE_thread_safe_pool", pooled_device_allocator);
 
     memory::internal::getUmpireDeviceAllocator() = thread_safe_pooled_device_allocator;
 
diff --git a/test/unit/c_api/eigensolver/test_eigensolver_c_api.cpp b/test/unit/c_api/eigensolver/test_eigensolver_c_api.cpp
index 1ec8dcbfe4..ae86c21023 100644
--- a/test/unit/c_api/eigensolver/test_eigensolver_c_api.cpp
+++ b/test/unit/c_api/eigensolver/test_eigensolver_c_api.cpp
@@ -77,7 +77,6 @@ void testEigensolver(const blas::Uplo uplo, const SizeType m, const SizeType mb,
   // Here we need to resume it manually to build the matrices with DLA-Future
   pika::resume();
 
-  const LocalElementSize size(m, m);
   const TileElementSize block_size(mb, mb);
 
   Matrix<const T, Device::CPU> reference = [&]() {