diff --git a/include/dlaf/init.h b/include/dlaf/init.h
index 0e50ad7fec..07f8b26a22 100644
--- a/include/dlaf/init.h
+++ b/include/dlaf/init.h
@@ -37,10 +37,18 @@ struct configuration {
   // - getOptionsDescription to add a corresponding command line option
   std::size_t num_np_gpu_streams_per_thread = 3;
   std::size_t num_hp_gpu_streams_per_thread = 3;
+  std::size_t umpire_host_memory_pool_initial_block_bytes = 1 << 30;
+  std::size_t umpire_host_memory_pool_next_block_bytes = 1 << 30;
+  std::size_t umpire_host_memory_pool_alignment_bytes = 16;
+  double umpire_host_memory_pool_coalescing_free_ratio = 1.0;
+  double umpire_host_memory_pool_coalescing_reallocation_ratio = 1.0;
+  std::size_t umpire_device_memory_pool_initial_block_bytes = 1 << 30;
+  std::size_t umpire_device_memory_pool_next_block_bytes = 1 << 30;
+  std::size_t umpire_device_memory_pool_alignment_bytes = 16;
+  double umpire_device_memory_pool_coalescing_free_ratio = 1.0;
+  double umpire_device_memory_pool_coalescing_reallocation_ratio = 1.0;
   std::size_t num_gpu_blas_handles = 16;
   std::size_t num_gpu_lapack_handles = 16;
-  std::size_t umpire_host_memory_pool_initial_bytes = 1 << 30;
-  std::size_t umpire_device_memory_pool_initial_bytes = 1 << 30;
   std::string mpi_pool = "mpi";
 };

diff --git a/include/dlaf/memory/memory_chunk.h b/include/dlaf/memory/memory_chunk.h
index c837d54f49..ab5846c2f0 100644
--- a/include/dlaf/memory/memory_chunk.h
+++ b/include/dlaf/memory/memory_chunk.h
@@ -27,11 +27,15 @@ namespace memory {
 namespace internal {

 umpire::Allocator& getUmpireHostAllocator();
-void initializeUmpireHostAllocator(std::size_t initial_bytes);
+void initializeUmpireHostAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                   std::size_t alignment_bytes, double coalesce_free_ratio,
+                                   double coalesce_reallocation_ratio);
 void finalizeUmpireHostAllocator();

 #ifdef DLAF_WITH_GPU
-void initializeUmpireDeviceAllocator(std::size_t initial_bytes);
+void initializeUmpireDeviceAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                     std::size_t alignment_bytes, double coalesce_free_ratio,
+                                     double coalesce_reallocation_ratio);
 void finalizeUmpireDeviceAllocator();
 umpire::Allocator& getUmpireDeviceAllocator();
 #endif
diff --git a/src/init.cpp b/src/init.cpp
index 7e3c3072e4..dd9ce6e0c9 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -29,15 +29,23 @@ namespace dlaf {

 std::ostream& operator<<(std::ostream& os, const configuration& cfg) {
+  // clang-format off
   os << "  num_np_gpu_streams_per_thread = " << cfg.num_np_gpu_streams_per_thread << std::endl;
   os << "  num_hp_gpu_streams_per_thread = " << cfg.num_hp_gpu_streams_per_thread << std::endl;
+  os << "  umpire_host_memory_pool_initial_block_bytes = " << cfg.umpire_host_memory_pool_initial_block_bytes << std::endl;
+  os << "  umpire_host_memory_pool_next_block_bytes = " << cfg.umpire_host_memory_pool_next_block_bytes << std::endl;
+  os << "  umpire_host_memory_pool_alignment_bytes = " << cfg.umpire_host_memory_pool_alignment_bytes << std::endl;
+  os << "  umpire_host_memory_pool_coalescing_free_ratio = " << cfg.umpire_host_memory_pool_coalescing_free_ratio << std::endl;
+  os << "  umpire_host_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_host_memory_pool_coalescing_reallocation_ratio << std::endl;
+  os << "  umpire_device_memory_pool_initial_block_bytes = " << cfg.umpire_device_memory_pool_initial_block_bytes << std::endl;
+  os << "  umpire_device_memory_pool_next_block_bytes = " << cfg.umpire_device_memory_pool_next_block_bytes << std::endl;
+  os << "  umpire_device_memory_pool_alignment_bytes = " << cfg.umpire_device_memory_pool_alignment_bytes << std::endl;
+  os << "  umpire_device_memory_pool_coalescing_free_ratio = " << cfg.umpire_device_memory_pool_coalescing_free_ratio << std::endl;
+  os << "  umpire_device_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_device_memory_pool_coalescing_reallocation_ratio << std::endl;
   os << "  num_gpu_blas_handles = " << cfg.num_gpu_blas_handles << std::endl;
   os << "  num_gpu_lapack_handles = " << cfg.num_gpu_lapack_handles << std::endl;
-  os << "  umpire_host_memory_pool_initial_bytes = " << cfg.umpire_host_memory_pool_initial_bytes
-     << std::endl;
-  os << "  umpire_device_memory_pool_initial_bytes = " << cfg.umpire_device_memory_pool_initial_bytes
-     << std::endl;
   os << "  mpi_pool = " << cfg.mpi_pool << std::endl;
+  // clang-format on
   return os;
 }

@@ -58,7 +66,10 @@ struct Init {
 template <>
 struct Init<Backend::MC> {
   static void initialize(const configuration& cfg) {
-    memory::internal::initializeUmpireHostAllocator(cfg.umpire_host_memory_pool_initial_bytes);
+    memory::internal::initializeUmpireHostAllocator(
+        cfg.umpire_host_memory_pool_initial_block_bytes, cfg.umpire_host_memory_pool_next_block_bytes,
+        cfg.umpire_host_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio,
+        cfg.umpire_host_memory_pool_coalescing_reallocation_ratio);
   }

   static void finalize() {
@@ -106,7 +117,11 @@ template <>
 struct Init<Backend::GPU> {
   static void initialize(const configuration& cfg) {
     const int device = 0;
-    memory::internal::initializeUmpireDeviceAllocator(cfg.umpire_device_memory_pool_initial_bytes);
+    memory::internal::initializeUmpireDeviceAllocator(
+        cfg.umpire_device_memory_pool_initial_block_bytes,
+        cfg.umpire_device_memory_pool_next_block_bytes, cfg.umpire_device_memory_pool_alignment_bytes,
+        cfg.umpire_device_memory_pool_coalescing_free_ratio,
+        cfg.umpire_device_memory_pool_coalescing_reallocation_ratio);
     initializeGpuPool(device, cfg.num_np_gpu_streams_per_thread, cfg.num_hp_gpu_streams_per_thread,
                       cfg.num_gpu_blas_handles, cfg.num_gpu_lapack_handles);
     pika::cuda::experimental::detail::register_polling(pika::resource::get_thread_pool("default"));
@@ -140,6 +155,13 @@ struct parseFromString {
   }
 };

+template <>
+struct parseFromString<double> {
+  static std::optional<double> call(const std::string& var) {
+    return std::stod(var);
+  }
+};
+
 template <>
 struct parseFromString {
   static std::optional call(const std::string& var) {
@@ -211,25 +233,26 @@ void warnUnusedConfigurationOption(const pika::program_options::variables_map& v
 }

 void updateConfiguration(const pika::program_options::variables_map& vm, configuration& cfg) {
-  updateConfigurationValue(vm, cfg.num_np_gpu_streams_per_thread, "NUM_NP_GPU_STREAMS_PER_THREAD",
-                           "num-np-gpu-streams-per-thread");
-  updateConfigurationValue(vm, cfg.num_hp_gpu_streams_per_thread, "NUM_HP_GPU_STREAMS_PER_THREAD",
-                           "num-hp-gpu-streams-per-thread");
+  // clang-format off
+  updateConfigurationValue(vm, cfg.num_np_gpu_streams_per_thread, "NUM_NP_GPU_STREAMS_PER_THREAD", "num-np-gpu-streams-per-thread");
+  updateConfigurationValue(vm, cfg.num_hp_gpu_streams_per_thread, "NUM_HP_GPU_STREAMS_PER_THREAD", "num-hp-gpu-streams-per-thread");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_initial_block_bytes, "UMPIRE_HOST_MEMORY_POOL_INITIAL_BLOCK_BYTES", "umpire-host-memory-pool-initial-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_next_block_bytes, "UMPIRE_HOST_MEMORY_POOL_NEXT_BLOCK_BYTES", "umpire-host-memory-pool-next-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_alignment_bytes, "UMPIRE_HOST_MEMORY_POOL_ALIGNMENT_BYTES", "umpire-host-memory-pool-alignment-bytes");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_coalescing_free_ratio, "UMPIRE_HOST_MEMORY_POOL_COALESCING_FREE_RATIO", "umpire-host-memory-pool-coalescing-free-ratio");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_coalescing_reallocation_ratio, "UMPIRE_HOST_MEMORY_POOL_COALESCING_REALLOCATION_RATIO", "umpire-host-memory-pool-coalescing-reallocation-ratio");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_initial_block_bytes, "UMPIRE_DEVICE_MEMORY_POOL_INITIAL_BLOCK_BYTES", "umpire-device-memory-pool-initial-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_next_block_bytes, "UMPIRE_DEVICE_MEMORY_POOL_NEXT_BLOCK_BYTES", "umpire-device-memory-pool-next-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_alignment_bytes, "UMPIRE_DEVICE_MEMORY_POOL_ALIGNMENT_BYTES", "umpire-device-memory-pool-alignment-bytes");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_coalescing_free_ratio, "UMPIRE_DEVICE_MEMORY_POOL_COALESCING_FREE_RATIO", "umpire-device-memory-pool-coalescing-free-ratio");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_coalescing_reallocation_ratio, "UMPIRE_DEVICE_MEMORY_POOL_COALESCING_REALLOCATION_RATIO", "umpire-device-memory-pool-coalescing-reallocation-ratio");
   updateConfigurationValue(vm, cfg.num_gpu_blas_handles, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles");
-  updateConfigurationValue(vm, cfg.num_gpu_lapack_handles, "NUM_GPU_LAPACK_HANDLES",
-                           "num-gpu-lapack-handles");
+  updateConfigurationValue(vm, cfg.num_gpu_lapack_handles, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles");
 #if PIKA_VERSION_FULL < 0x001D00  // < 0.29.0
-  warnUnusedConfigurationOption(vm, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles",
-                                "only supported with pika 0.29.0 or newer");
-  warnUnusedConfigurationOption(vm, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles",
-                                "only supported with pika 0.29.0 or newer");
+  warnUnusedConfigurationOption(vm, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles", "only supported with pika 0.29.0 or newer");
+  warnUnusedConfigurationOption(vm, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles", "only supported with pika 0.29.0 or newer");
 #endif
-  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_initial_bytes,
-                           "UMPIRE_HOST_MEMORY_POOL_INITIAL_BYTES",
-                           "umpire-host-memory-pool-initial-bytes");
-  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_initial_bytes,
-                           "UMPIRE_DEVICE_MEMORY_POOL_INITIAL_BYTES",
-                           "umpire-device-memory-pool-initial-bytes");
+  // clang-format on
   cfg.mpi_pool = (pika::resource::pool_exists("mpi")) ? "mpi" : "default";

   // Warn if not using MPI pool without --dlaf:no-mpi-pool
@@ -251,44 +274,28 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu
   // NOTE: Environment variables should omit the DLAF_ prefix and command line options the dlaf: prefix.
   // These are added automatically by updateConfigurationValue.
   auto& param = getTuneParameters();
-  updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS",
-                           "red2band-panel-nworkers");
-
-  updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US",
-                           "red2band-barrier-busy-wait-us");
-
-  updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND",
-                           "eigensolver-min-band");
-
-  updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base,
-                           "BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base");
-
-  updateConfigurationValue(vm, param.debug_dump_cholesky_factorization_data,
-                           "DEBUG_DUMP_CHOLESKY_FACTORIZATION_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_generalized_eigensolver_data,
-                           "DEBUG_DUMP_GENERALIZED_EIGENSOLVER_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_generalized_to_standard_data,
-                           "DEBUG_DUMP_GENERALIZED_TO_STANDARD_DATA", "");
+  // clang-format off
+  updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS", "red2band-panel-nworkers");
+  updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US", "red2band-barrier-busy-wait-us");
+  updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND", "eigensolver-min-band");
+  updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base, "BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base");
+
+  updateConfigurationValue(vm, param.debug_dump_cholesky_factorization_data, "DEBUG_DUMP_CHOLESKY_FACTORIZATION_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_generalized_eigensolver_data, "DEBUG_DUMP_GENERALIZED_EIGENSOLVER_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_generalized_to_standard_data, "DEBUG_DUMP_GENERALIZED_TO_STANDARD_DATA", "");
   updateConfigurationValue(vm, param.debug_dump_eigensolver_data, "DEBUG_DUMP_EIGENSOLVER_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_reduction_to_band_data,
-                           "DEBUG_DUMP_REDUCTION_TO_BAND_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data,
-                           "DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA",
-                           "");
+  updateConfigurationValue(vm, param.debug_dump_reduction_to_band_data, "DEBUG_DUMP_REDUCTION_TO_BAND_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data, "DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA", "");

-  updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS",
-                           "tridiag-rank1-nworkers");
+  updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS", "tridiag-rank1-nworkers");

-  updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us,
-                           "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us");
+  updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us, "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us");

-  updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size,
-                           "BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE",
-                           "bt-band-to-tridiag-hh-apply-group-size");
+  updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size, "BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE", "bt-band-to-tridiag-hh-apply-group-size");

-  updateConfigurationValue(vm, param.communicator_grid_num_pipelines, "COMMUNICATOR_GRID_NUM_PIPELINES",
-                           "communicator-grid-num-pipelines");
+  updateConfigurationValue(vm, param.communicator_grid_num_pipelines, "COMMUNICATOR_GRID_NUM_PIPELINES", "communicator-grid-num-pipelines");
+  // clang-format on
 }

 configuration& getConfiguration() {
@@ -300,49 +307,35 @@ configuration& getConfiguration() {
 pika::program_options::options_description getOptionsDescription() {
   pika::program_options::options_description desc("DLA-Future options");

+  // clang-format off
   desc.add_options()("dlaf:help", "Print help message");
   desc.add_options()("dlaf:print-config", "Print the DLA-Future configuration");
-  desc.add_options()("dlaf:num-np-gpu-streams-per-thread", pika::program_options::value<std::size_t>(),
-                     "Number of normal priority GPU streams per worker thread");
-  desc.add_options()("dlaf:num-hp-gpu-streams-per-thread", pika::program_options::value<std::size_t>(),
-                     "Number of high priority GPU streams per worker thread");
-  desc.add_options()("dlaf:num-gpu-blas-handles", pika::program_options::value<std::size_t>(),
-                     "Number of GPU BLAS (cuBLAS/rocBLAS) handles");
-  desc.add_options()("dlaf:num-gpu-lapack-handles", pika::program_options::value<std::size_t>(),
-                     "Number of GPU LAPACK (cuSOLVER/rocSOLVER) handles");
-  desc.add_options()("dlaf:umpire-host-memory-pool-initial-bytes",
-                     pika::program_options::value<std::size_t>(),
-                     "Number of bytes to preallocate for pinned host memory pool");
-  desc.add_options()("dlaf:umpire-device-memory-pool-initial-bytes",
-                     pika::program_options::value<std::size_t>(),
-                     "Number of bytes to preallocate for device memory pool");
+  desc.add_options()("dlaf:num-np-gpu-streams-per-thread", pika::program_options::value<std::size_t>(), "Number of normal priority GPU streams per worker thread");
+  desc.add_options()("dlaf:num-hp-gpu-streams-per-thread", pika::program_options::value<std::size_t>(), "Number of high priority GPU streams per worker thread");
+  desc.add_options()("dlaf:umpire-host-memory-pool-initial-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to preallocate for pinned host memory pool");
+  desc.add_options()("dlaf:umpire-host-memory-pool-next-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to allocate in blocks after the first block for pinned host memory pool");
+  desc.add_options()("dlaf:umpire-host-memory-pool-alignment-bytes", pika::program_options::value<std::size_t>(), "Alignment of allocations in bytes in pinned host memory pool");
+  desc.add_options()("dlaf:umpire-host-memory-pool-coalescing-free-ratio", pika::program_options::value<double>(), "Required ratio of free memory in pinned host memory pool before performing coalescing of free blocks");
+  desc.add_options()("dlaf:umpire-host-memory-pool-coalescing-reallocation-ratio", pika::program_options::value<double>(), "Ratio of current used memory in pinned host memory pool to use for reallocation of new blocks when coalescing free blocks");
+  desc.add_options()("dlaf:umpire-device-memory-pool-initial-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to preallocate for device memory pool");
+  desc.add_options()("dlaf:umpire-device-memory-pool-next-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to allocate in blocks after the first block for device memory pool");
+  desc.add_options()("dlaf:umpire-device-memory-pool-alignment-bytes", pika::program_options::value<std::size_t>(), "Alignment of allocations in bytes in device memory pool");
coalescing of free blocks"); + desc.add_options()("dlaf:umpire-device-memory-pool-coalescing-reallocation-ratio", pika::program_options::value(), "Ratio of current used memory in device memory pool to use for reallocation of new blocks when coalescing free blocks"); + desc.add_options()("dlaf:num-gpu-blas-handles", pika::program_options::value(), "Number of GPU BLAS (cuBLAS/rocBLAS) handles"); + desc.add_options()("dlaf:num-gpu-lapack-handles", pika::program_options::value(), "Number of GPU LAPACK (cuSOLVER/rocSOLVER) handles"); desc.add_options()("dlaf:no-mpi-pool", pika::program_options::bool_switch(), "Disable the MPI pool."); // Tune parameters command line options - desc.add_options()( - "dlaf:red2band-panel-nworkers", pika::program_options::value(), - "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); - desc.add_options()( - "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), - "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); - desc.add_options()( - "dlaf:eigensolver-min-band", pika::program_options::value(), - "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); - desc.add_options()( - "dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), - "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. (The input matrix is distributed with a {nb x nb} block size.)"); - desc.add_options()( - "dlaf:tridiag-rank1-nworkers", pika::program_options::value(), - "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); - desc.add_options()( - "dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), - "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm."); - desc.add_options()( - "dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), - "The application of the HH reflector is splitted in smaller applications of group size reflectors."); - desc.add_options()( - "dlaf:communicator-grid-num-pipelines", pika::program_options::value(), - "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid."); + desc.add_options()( "dlaf:red2band-panel-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); + desc.add_options()( "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); + desc.add_options()( "dlaf:eigensolver-min-band", pika::program_options::value(), "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); + desc.add_options()( "dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. 
+  desc.add_options()("dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. (The input matrix is distributed with a {nb x nb} block size.)");
+  desc.add_options()("dlaf:tridiag-rank1-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm.");
+  desc.add_options()("dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm.");
+  desc.add_options()("dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), "The application of the HH reflector is splitted in smaller applications of group size reflectors.");
+  desc.add_options()("dlaf:communicator-grid-num-pipelines", pika::program_options::value(), "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid.");
+  // clang-format on

   return desc;
 }
diff --git a/src/memory/memory_chunk.cpp b/src/memory/memory_chunk.cpp
index a2b0c33885..9880c1a036 100644
--- a/src/memory/memory_chunk.cpp
+++ b/src/memory/memory_chunk.cpp
@@ -11,9 +11,11 @@
 #include <cstddef>

 #include <umpire/ResourceManager.hpp>
+#include <umpire/strategy/PoolCoalesceHeuristic.hpp>
 #include <umpire/strategy/QuickPool.hpp>
 #include <umpire/strategy/ThreadSafeAllocator.hpp>

+#include <dlaf/types.h>
 #include <dlaf/memory/memory_chunk.h>

 namespace dlaf {
@@ -36,7 +38,46 @@ umpire::Allocator& getUmpireHostAllocator() {
 }
 #endif

-void initializeUmpireHostAllocator(std::size_t initial_bytes) {
+using PoolType = umpire::strategy::QuickPool;
+using CoalesceHeuristicType = umpire::strategy::PoolCoalesceHeuristic<PoolType>;
+
+#ifdef DLAF_WITH_GPU
+// This is a modified version of the "percent_releasable" coalescing heuristic
+// from Umpire. This version allows choosing what ratio of the actual size to
+// reallocate when coalescing.
+//
+// A free ratio of 1.0 means that the pool will be coalesced only when all
+// blocks are unused. A free ratio of 0.5 means that the pool will be coalesced
+// when at least 50% of the pool's memory is unused. A ratio of 0.0 means that
+// the pool will be coalesced as soon as any two free blocks are available. A
+// ratio of more than 1.0 will make the pool never coalesce.
+//
+// A reallocation ratio of 1.0 simply coalesces all the free memory into a new
+// block. A ratio of 0.5 will attempt to shrink the pool to half its previous
+// size. A ratio of 1.5 will allocate 50% more than the previous pool size.
+//
+// A single free block is never "coalesced" to keep things simple. In theory a
+// single block could be shrunk or grown to match the reallocation ratio but
+// this can lead to strange reallocations, so we simply avoid that case. Two or
+// more blocks are always coalesced to one block, so no reallocation will
+// happen immediately after coalescing two or more blocks.
+static CoalesceHeuristicType get_coalesce_heuristic(double coalesce_free_ratio,
+                                                    double coalesce_reallocation_ratio) {
+  return [=](const PoolType& pool) {
+    std::size_t threshold = static_cast<std::size_t>(coalesce_free_ratio * pool.getActualSize());
+    if (pool.getReleasableBlocks() >= 2 && pool.getReleasableSize() >= threshold) {
+      return static_cast<std::size_t>(coalesce_reallocation_ratio * pool.getActualSize());
+    }
+    else {
+      return static_cast<std::size_t>(0);
+    }
+  };
+}
+#endif
+
+void initializeUmpireHostAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                   std::size_t alignment_bytes, double coalesce_free_ratio,
+                                   double coalesce_reallocation_ratio) {
 #ifdef DLAF_WITH_GPU
   static bool initialized = false;

@@ -45,26 +86,29 @@ void initializeUmpireHostAllocator(std::size_t initial_bytes) {
   if (!initialized) {
     auto host_allocator = umpire::ResourceManager::getInstance().getAllocator("PINNED");
     auto pooled_host_allocator =
-        umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::QuickPool>("PINNED_pool",
-                                                                                          host_allocator,
-                                                                                          initial_bytes);
+        umpire::ResourceManager::getInstance().makeAllocator<PoolType>(
+            "DLAF_PINNED_pool", host_allocator, initial_block_bytes, next_block_bytes, alignment_bytes,
+            get_coalesce_heuristic(coalesce_free_ratio, coalesce_reallocation_ratio));
     auto thread_safe_pooled_host_allocator =
         umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::ThreadSafeAllocator>(
-            "PINNED_thread_safe_pool", pooled_host_allocator);
+            "DLAF_PINNED_thread_safe_pool", pooled_host_allocator);

     memory::internal::getUmpireHostAllocator() = thread_safe_pooled_host_allocator;

     initialized = true;
   }
 #else
-  (void) initial_bytes;
+  dlaf::internal::silenceUnusedWarningFor(initial_block_bytes, next_block_bytes, alignment_bytes,
+                                          coalesce_free_ratio, coalesce_reallocation_ratio);
 #endif
 }

 void finalizeUmpireHostAllocator() {}

 #ifdef DLAF_WITH_GPU
-void initializeUmpireDeviceAllocator(std::size_t initial_bytes) {
+void initializeUmpireDeviceAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                     std::size_t alignment_bytes, double coalesce_free_ratio,
+                                     double coalesce_reallocation_ratio) {
   static bool initialized = false;

   // Umpire pools cannot be released, so we keep the pools around even when
@@ -73,10 +117,11 @@ void initializeUmpireDeviceAllocator(std::size_t initial_bytes) {
   auto device_allocator = umpire::ResourceManager::getInstance().getAllocator("DEVICE");

   auto pooled_device_allocator =
       umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::QuickPool>(
-          "DEVICE_pool", device_allocator, initial_bytes);
+          "DLAF_DEVICE_pool", device_allocator, initial_block_bytes, next_block_bytes, alignment_bytes,
+          get_coalesce_heuristic(coalesce_free_ratio, coalesce_reallocation_ratio));
   auto thread_safe_pooled_device_allocator =
       umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::ThreadSafeAllocator>(
-          "DEVICE_thread_safe_pool", pooled_device_allocator);
+          "DLAF_DEVICE_thread_safe_pool", pooled_device_allocator);

   memory::internal::getUmpireDeviceAllocator() = thread_safe_pooled_device_allocator;
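Note on the coalescing heuristic above: the two ratios can be sanity-checked in isolation. The snippet below is an illustrative sketch only and is not part of the patch; FakePool is a hypothetical stand-in that mimics the three QuickPool getters used by get_coalesce_heuristic(), and make_heuristic reproduces the same logic so the effect of the free and reallocation ratios can be printed directly.

// Illustration only: FakePool is a hypothetical stand-in for the QuickPool
// interface, reproducing just the three getters used by the heuristic above.
#include <cstddef>
#include <functional>
#include <iostream>

struct FakePool {
  std::size_t actual_size;        // total bytes currently owned by the pool
  std::size_t releasable_size;    // bytes sitting in free blocks
  std::size_t releasable_blocks;  // number of free blocks
  std::size_t getActualSize() const { return actual_size; }
  std::size_t getReleasableSize() const { return releasable_size; }
  std::size_t getReleasableBlocks() const { return releasable_blocks; }
};

// Same logic as get_coalesce_heuristic() in the patch, written against FakePool.
std::function<std::size_t(const FakePool&)> make_heuristic(double free_ratio, double realloc_ratio) {
  return [=](const FakePool& pool) {
    const auto threshold = static_cast<std::size_t>(free_ratio * pool.getActualSize());
    if (pool.getReleasableBlocks() >= 2 && pool.getReleasableSize() >= threshold)
      return static_cast<std::size_t>(realloc_ratio * pool.getActualSize());
    return std::size_t(0);
  };
}

int main() {
  constexpr std::size_t GiB = 1 << 30;
  // Defaults from the configuration struct: free ratio 1.0, reallocation ratio 1.0.
  auto h = make_heuristic(1.0, 1.0);
  // Half of a 2 GiB pool is free, spread over two blocks: below the 100% free
  // threshold, so no coalescing happens (returns 0).
  std::cout << h(FakePool{2 * GiB, 1 * GiB, 2}) << '\n';  // 0
  // Everything is free in two blocks: coalesce into a single 2 GiB block.
  std::cout << h(FakePool{2 * GiB, 2 * GiB, 2}) << '\n';  // 2147483648
  // With a reallocation ratio of 0.5 the coalesced pool would shrink to 1 GiB.
  std::cout << make_heuristic(1.0, 0.5)(FakePool{2 * GiB, 2 * GiB, 2}) << '\n';  // 1073741824
}

With the default configuration values (both ratios 1.0) the pool is therefore only coalesced once every block is free, and it is then reallocated at its current size.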