diff --git a/include/dlaf/init.h b/include/dlaf/init.h
index 0e50ad7fec..07f8b26a22 100644
--- a/include/dlaf/init.h
+++ b/include/dlaf/init.h
@@ -37,10 +37,18 @@ struct configuration {
   // - getOptionsDescription to add a corresponding command line option
   std::size_t num_np_gpu_streams_per_thread = 3;
   std::size_t num_hp_gpu_streams_per_thread = 3;
+  std::size_t umpire_host_memory_pool_initial_block_bytes = 1 << 30;
+  std::size_t umpire_host_memory_pool_next_block_bytes = 1 << 30;
+  std::size_t umpire_host_memory_pool_alignment_bytes = 16;
+  double umpire_host_memory_pool_coalescing_free_ratio = 1.0;
+  double umpire_host_memory_pool_coalescing_reallocation_ratio = 1.0;
+  std::size_t umpire_device_memory_pool_initial_block_bytes = 1 << 30;
+  std::size_t umpire_device_memory_pool_next_block_bytes = 1 << 30;
+  std::size_t umpire_device_memory_pool_alignment_bytes = 16;
+  double umpire_device_memory_pool_coalescing_free_ratio = 1.0;
+  double umpire_device_memory_pool_coalescing_reallocation_ratio = 1.0;
   std::size_t num_gpu_blas_handles = 16;
   std::size_t num_gpu_lapack_handles = 16;
-  std::size_t umpire_host_memory_pool_initial_bytes = 1 << 30;
-  std::size_t umpire_device_memory_pool_initial_bytes = 1 << 30;
   std::string mpi_pool = "mpi";
 };

diff --git a/include/dlaf/memory/memory_chunk.h b/include/dlaf/memory/memory_chunk.h
index c837d54f49..ab5846c2f0 100644
--- a/include/dlaf/memory/memory_chunk.h
+++ b/include/dlaf/memory/memory_chunk.h
@@ -27,11 +27,15 @@ namespace memory {
 namespace internal {

 umpire::Allocator& getUmpireHostAllocator();
-void initializeUmpireHostAllocator(std::size_t initial_bytes);
+void initializeUmpireHostAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                   std::size_t alignment_bytes, double coalesce_free_ratio,
+                                   double coalesce_reallocation_ratio);
 void finalizeUmpireHostAllocator();

 #ifdef DLAF_WITH_GPU
-void initializeUmpireDeviceAllocator(std::size_t initial_bytes);
+void initializeUmpireDeviceAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                     std::size_t alignment_bytes, double coalesce_free_ratio,
+                                     double coalesce_reallocation_ratio);
 void finalizeUmpireDeviceAllocator();
 umpire::Allocator& getUmpireDeviceAllocator();
 #endif
diff --git a/src/init.cpp b/src/init.cpp
index 7e3c3072e4..dd9ce6e0c9 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -29,15 +29,23 @@ namespace dlaf {

 std::ostream& operator<<(std::ostream& os, const configuration& cfg) {
+  // clang-format off
   os << "  num_np_gpu_streams_per_thread = " << cfg.num_np_gpu_streams_per_thread << std::endl;
   os << "  num_hp_gpu_streams_per_thread = " << cfg.num_hp_gpu_streams_per_thread << std::endl;
+  os << "  umpire_host_memory_pool_initial_block_bytes = " << cfg.umpire_host_memory_pool_initial_block_bytes << std::endl;
+  os << "  umpire_host_memory_pool_next_block_bytes = " << cfg.umpire_host_memory_pool_next_block_bytes << std::endl;
+  os << "  umpire_host_memory_pool_alignment_bytes = " << cfg.umpire_host_memory_pool_alignment_bytes << std::endl;
+  os << "  umpire_host_memory_pool_coalescing_free_ratio = " << cfg.umpire_host_memory_pool_coalescing_free_ratio << std::endl;
+  os << "  umpire_host_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_host_memory_pool_coalescing_reallocation_ratio << std::endl;
+  os << "  umpire_device_memory_pool_initial_block_bytes = " << cfg.umpire_device_memory_pool_initial_block_bytes << std::endl;
+  os << "  umpire_device_memory_pool_next_block_bytes = " << cfg.umpire_device_memory_pool_next_block_bytes << std::endl;
+  os << "  umpire_device_memory_pool_alignment_bytes = " << cfg.umpire_device_memory_pool_alignment_bytes << std::endl;
+  os << "  umpire_device_memory_pool_coalescing_free_ratio = " << cfg.umpire_device_memory_pool_coalescing_free_ratio << std::endl;
+  os << "  umpire_device_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_device_memory_pool_coalescing_reallocation_ratio << std::endl;
   os << "  num_gpu_blas_handles = " << cfg.num_gpu_blas_handles << std::endl;
   os << "  num_gpu_lapack_handles = " << cfg.num_gpu_lapack_handles << std::endl;
-  os << "  umpire_host_memory_pool_initial_bytes = " << cfg.umpire_host_memory_pool_initial_bytes
-     << std::endl;
-  os << "  umpire_device_memory_pool_initial_bytes = " << cfg.umpire_device_memory_pool_initial_bytes
-     << std::endl;
   os << "  mpi_pool = " << cfg.mpi_pool << std::endl;
+  // clang-format on
   return os;
 }

@@ -58,7 +66,10 @@ struct Init {
 template <>
 struct Init<Backend::MC> {
   static void initialize(const configuration& cfg) {
-    memory::internal::initializeUmpireHostAllocator(cfg.umpire_host_memory_pool_initial_bytes);
+    memory::internal::initializeUmpireHostAllocator(
+        cfg.umpire_host_memory_pool_initial_block_bytes, cfg.umpire_host_memory_pool_next_block_bytes,
+        cfg.umpire_host_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio,
+        cfg.umpire_host_memory_pool_coalescing_reallocation_ratio);
   }

   static void finalize() {
@@ -106,7 +117,11 @@ template <>
 struct Init<Backend::GPU> {
   static void initialize(const configuration& cfg) {
     const int device = 0;
-    memory::internal::initializeUmpireDeviceAllocator(cfg.umpire_device_memory_pool_initial_bytes);
+    memory::internal::initializeUmpireDeviceAllocator(
+        cfg.umpire_device_memory_pool_initial_block_bytes,
+        cfg.umpire_device_memory_pool_next_block_bytes, cfg.umpire_device_memory_pool_alignment_bytes,
+        cfg.umpire_device_memory_pool_coalescing_free_ratio,
+        cfg.umpire_device_memory_pool_coalescing_reallocation_ratio);
     initializeGpuPool(device, cfg.num_np_gpu_streams_per_thread, cfg.num_hp_gpu_streams_per_thread,
                       cfg.num_gpu_blas_handles, cfg.num_gpu_lapack_handles);
     pika::cuda::experimental::detail::register_polling(pika::resource::get_thread_pool("default"));
@@ -140,6 +155,13 @@ struct parseFromString {
   }
 };

+template <>
+struct parseFromString<double> {
+  static std::optional<double> call(const std::string& var) {
+    return std::stod(var);
+  }
+};
+
 template <>
 struct parseFromString {
   static std::optional call(const std::string& var) {
@@ -211,25 +233,26 @@ void warnUnusedConfigurationOption(const pika::program_options::variables_map& v
 }

 void updateConfiguration(const pika::program_options::variables_map& vm, configuration& cfg) {
-  updateConfigurationValue(vm, cfg.num_np_gpu_streams_per_thread, "NUM_NP_GPU_STREAMS_PER_THREAD",
-                           "num-np-gpu-streams-per-thread");
-  updateConfigurationValue(vm, cfg.num_hp_gpu_streams_per_thread, "NUM_HP_GPU_STREAMS_PER_THREAD",
-                           "num-hp-gpu-streams-per-thread");
+  // clang-format off
+  updateConfigurationValue(vm, cfg.num_np_gpu_streams_per_thread, "NUM_NP_GPU_STREAMS_PER_THREAD", "num-np-gpu-streams-per-thread");
+  updateConfigurationValue(vm, cfg.num_hp_gpu_streams_per_thread, "NUM_HP_GPU_STREAMS_PER_THREAD", "num-hp-gpu-streams-per-thread");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_initial_block_bytes, "UMPIRE_HOST_MEMORY_POOL_INITIAL_BLOCK_BYTES", "umpire-host-memory-pool-initial-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_next_block_bytes, "UMPIRE_HOST_MEMORY_POOL_NEXT_BLOCK_BYTES", "umpire-host-memory-pool-next-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_alignment_bytes, "UMPIRE_HOST_MEMORY_POOL_ALIGNMENT_BYTES", "umpire-host-memory-pool-alignment-bytes");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_coalescing_free_ratio, "UMPIRE_HOST_MEMORY_POOL_COALESCING_FREE_RATIO", "umpire-host-memory-pool-coalescing-free-ratio");
+  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_coalescing_reallocation_ratio, "UMPIRE_HOST_MEMORY_POOL_COALESCING_REALLOCATION_RATIO", "umpire-host-memory-pool-coalescing-reallocation-ratio");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_initial_block_bytes, "UMPIRE_DEVICE_MEMORY_POOL_INITIAL_BLOCK_BYTES", "umpire-device-memory-pool-initial-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_next_block_bytes, "UMPIRE_DEVICE_MEMORY_POOL_NEXT_BLOCK_BYTES", "umpire-device-memory-pool-next-block-bytes");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_alignment_bytes, "UMPIRE_DEVICE_MEMORY_POOL_ALIGNMENT_BYTES", "umpire-device-memory-pool-alignment-bytes");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_coalescing_free_ratio, "UMPIRE_DEVICE_MEMORY_POOL_COALESCING_FREE_RATIO", "umpire-device-memory-pool-coalescing-free-ratio");
+  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_coalescing_reallocation_ratio, "UMPIRE_DEVICE_MEMORY_POOL_COALESCING_REALLOCATION_RATIO", "umpire-device-memory-pool-coalescing-reallocation-ratio");
   updateConfigurationValue(vm, cfg.num_gpu_blas_handles, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles");
-  updateConfigurationValue(vm, cfg.num_gpu_lapack_handles, "NUM_GPU_LAPACK_HANDLES",
-                           "num-gpu-lapack-handles");
+  updateConfigurationValue(vm, cfg.num_gpu_lapack_handles, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles");
 #if PIKA_VERSION_FULL < 0x001D00  // < 0.29.0
-  warnUnusedConfigurationOption(vm, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles",
-                                "only supported with pika 0.29.0 or newer");
-  warnUnusedConfigurationOption(vm, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles",
-                                "only supported with pika 0.29.0 or newer");
+  warnUnusedConfigurationOption(vm, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles", "only supported with pika 0.29.0 or newer");
+  warnUnusedConfigurationOption(vm, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles", "only supported with pika 0.29.0 or newer");
 #endif
-  updateConfigurationValue(vm, cfg.umpire_host_memory_pool_initial_bytes,
-                           "UMPIRE_HOST_MEMORY_POOL_INITIAL_BYTES",
-                           "umpire-host-memory-pool-initial-bytes");
-  updateConfigurationValue(vm, cfg.umpire_device_memory_pool_initial_bytes,
-                           "UMPIRE_DEVICE_MEMORY_POOL_INITIAL_BYTES",
-                           "umpire-device-memory-pool-initial-bytes");
+  // clang-format on
   cfg.mpi_pool = (pika::resource::pool_exists("mpi")) ? "mpi" : "default";

   // Warn if not using MPI pool without --dlaf:no-mpi-pool
@@ -251,44 +274,28 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu
   // NOTE: Environment variables should omit the DLAF_ prefix and command line options the dlaf: prefix.
   // These are added automatically by updateConfigurationValue.
   auto& param = getTuneParameters();
-  updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS",
-                           "red2band-panel-nworkers");
-
-  updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US",
-                           "red2band-barrier-busy-wait-us");
-
-  updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND",
-                           "eigensolver-min-band");
-
-  updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base,
-                           "BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base");
-
-  updateConfigurationValue(vm, param.debug_dump_cholesky_factorization_data,
-                           "DEBUG_DUMP_CHOLESKY_FACTORIZATION_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_generalized_eigensolver_data,
-                           "DEBUG_DUMP_GENERALIZED_EIGENSOLVER_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_generalized_to_standard_data,
-                           "DEBUG_DUMP_GENERALIZED_TO_STANDARD_DATA", "");
+  // clang-format off
+  updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS", "red2band-panel-nworkers");
+  updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US", "red2band-barrier-busy-wait-us");
+  updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND", "eigensolver-min-band");
+  updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base, "BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base");
+
+  updateConfigurationValue(vm, param.debug_dump_cholesky_factorization_data, "DEBUG_DUMP_CHOLESKY_FACTORIZATION_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_generalized_eigensolver_data, "DEBUG_DUMP_GENERALIZED_EIGENSOLVER_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_generalized_to_standard_data, "DEBUG_DUMP_GENERALIZED_TO_STANDARD_DATA", "");
   updateConfigurationValue(vm, param.debug_dump_eigensolver_data, "DEBUG_DUMP_EIGENSOLVER_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_reduction_to_band_data,
-                           "DEBUG_DUMP_REDUCTION_TO_BAND_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data,
-                           "DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", "");
-  updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA",
-                           "");
+  updateConfigurationValue(vm, param.debug_dump_reduction_to_band_data, "DEBUG_DUMP_REDUCTION_TO_BAND_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data, "DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", "");
+  updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA", "");

-  updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS",
-                           "tridiag-rank1-nworkers");
+  updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS", "tridiag-rank1-nworkers");

-  updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us,
-                           "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us");
+  updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us, "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us");

-  updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size,
-                           "BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE",
-                           "bt-band-to-tridiag-hh-apply-group-size");
+  updateConfigurationValue(vm, param.bt_band_to_tridiag_hh_apply_group_size, "BT_BAND_TO_TRIDIAG_HH_APPLY_GROUP_SIZE", "bt-band-to-tridiag-hh-apply-group-size");

-  updateConfigurationValue(vm, param.communicator_grid_num_pipelines, "COMMUNICATOR_GRID_NUM_PIPELINES",
-                           "communicator-grid-num-pipelines");
+  updateConfigurationValue(vm, param.communicator_grid_num_pipelines, "COMMUNICATOR_GRID_NUM_PIPELINES", "communicator-grid-num-pipelines");
+  // clang-format on
 }

 configuration& getConfiguration() {
@@ -300,49 +307,35 @@ configuration& getConfiguration() {
 pika::program_options::options_description getOptionsDescription() {
   pika::program_options::options_description desc("DLA-Future options");

+  // clang-format off
   desc.add_options()("dlaf:help", "Print help message");
   desc.add_options()("dlaf:print-config", "Print the DLA-Future configuration");
-  desc.add_options()("dlaf:num-np-gpu-streams-per-thread", pika::program_options::value<std::size_t>(),
-                     "Number of normal priority GPU streams per worker thread");
-  desc.add_options()("dlaf:num-hp-gpu-streams-per-thread", pika::program_options::value<std::size_t>(),
-                     "Number of high priority GPU streams per worker thread");
-  desc.add_options()("dlaf:num-gpu-blas-handles", pika::program_options::value<std::size_t>(),
-                     "Number of GPU BLAS (cuBLAS/rocBLAS) handles");
-  desc.add_options()("dlaf:num-gpu-lapack-handles", pika::program_options::value<std::size_t>(),
-                     "Number of GPU LAPACK (cuSOLVER/rocSOLVER) handles");
-  desc.add_options()("dlaf:umpire-host-memory-pool-initial-bytes",
-                     pika::program_options::value<std::size_t>(),
-                     "Number of bytes to preallocate for pinned host memory pool");
-  desc.add_options()("dlaf:umpire-device-memory-pool-initial-bytes",
-                     pika::program_options::value<std::size_t>(),
-                     "Number of bytes to preallocate for device memory pool");
+  desc.add_options()("dlaf:num-np-gpu-streams-per-thread", pika::program_options::value<std::size_t>(), "Number of normal priority GPU streams per worker thread");
+  desc.add_options()("dlaf:num-hp-gpu-streams-per-thread", pika::program_options::value<std::size_t>(), "Number of high priority GPU streams per worker thread");
+  desc.add_options()("dlaf:umpire-host-memory-pool-initial-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to preallocate for pinned host memory pool");
+  desc.add_options()("dlaf:umpire-host-memory-pool-next-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to allocate in blocks after the first block for pinned host memory pool");
+  desc.add_options()("dlaf:umpire-host-memory-pool-alignment-bytes", pika::program_options::value<std::size_t>(), "Alignment of allocations in bytes in pinned host memory pool");
+  desc.add_options()("dlaf:umpire-host-memory-pool-coalescing-free-ratio", pika::program_options::value<double>(), "Required ratio of free memory in pinned host memory pool before performing coalescing of free blocks");
+  desc.add_options()("dlaf:umpire-host-memory-pool-coalescing-reallocation-ratio", pika::program_options::value<double>(), "Ratio of current used memory in pinned host memory pool to use for reallocation of new blocks when coalescing free blocks");
+  desc.add_options()("dlaf:umpire-device-memory-pool-initial-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to preallocate for device memory pool");
+  desc.add_options()("dlaf:umpire-device-memory-pool-next-block-bytes", pika::program_options::value<std::size_t>(), "Number of bytes to allocate in blocks after the first block for device memory pool");
+  desc.add_options()("dlaf:umpire-device-memory-pool-alignment-bytes", pika::program_options::value<std::size_t>(), "Alignment of allocations in bytes in device memory pool");
coalescing of free blocks"); + desc.add_options()("dlaf:umpire-device-memory-pool-coalescing-reallocation-ratio", pika::program_options::value(), "Ratio of current used memory in device memory pool to use for reallocation of new blocks when coalescing free blocks"); + desc.add_options()("dlaf:num-gpu-blas-handles", pika::program_options::value(), "Number of GPU BLAS (cuBLAS/rocBLAS) handles"); + desc.add_options()("dlaf:num-gpu-lapack-handles", pika::program_options::value(), "Number of GPU LAPACK (cuSOLVER/rocSOLVER) handles"); desc.add_options()("dlaf:no-mpi-pool", pika::program_options::bool_switch(), "Disable the MPI pool."); // Tune parameters command line options - desc.add_options()( - "dlaf:red2band-panel-nworkers", pika::program_options::value(), - "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); - desc.add_options()( - "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), - "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); - desc.add_options()( - "dlaf:eigensolver-min-band", pika::program_options::value(), - "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); - desc.add_options()( - "dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), - "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. (The input matrix is distributed with a {nb x nb} block size.)"); - desc.add_options()( - "dlaf:tridiag-rank1-nworkers", pika::program_options::value(), - "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); - desc.add_options()( - "dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), - "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm."); - desc.add_options()( - "dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), - "The application of the HH reflector is splitted in smaller applications of group size reflectors."); - desc.add_options()( - "dlaf:communicator-grid-num-pipelines", pika::program_options::value(), - "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid."); + desc.add_options()( "dlaf:red2band-panel-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); + desc.add_options()( "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); + desc.add_options()( "dlaf:eigensolver-min-band", pika::program_options::value(), "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); + desc.add_options()( "dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. 
+  desc.add_options()("dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. (The input matrix is distributed with a {nb x nb} block size.)");
+  desc.add_options()("dlaf:tridiag-rank1-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm.");
+  desc.add_options()("dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm.");
+  desc.add_options()("dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), "The application of the HH reflector is splitted in smaller applications of group size reflectors.");
+  desc.add_options()("dlaf:communicator-grid-num-pipelines", pika::program_options::value(), "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid.");
+  // clang-format on

   return desc;
 }
diff --git a/src/memory/memory_chunk.cpp b/src/memory/memory_chunk.cpp
index a2b0c33885..9880c1a036 100644
--- a/src/memory/memory_chunk.cpp
+++ b/src/memory/memory_chunk.cpp
@@ -11,9 +11,11 @@
 #include <cstddef>

 #include <umpire/ResourceManager.hpp>
+#include <umpire/strategy/PoolCoalesceHeuristic.hpp>
 #include <umpire/strategy/QuickPool.hpp>
 #include <umpire/strategy/ThreadSafeAllocator.hpp>

+#include <dlaf/types.h>
 #include <dlaf/memory/memory_chunk.h>

 namespace dlaf {
@@ -36,7 +38,46 @@ umpire::Allocator& getUmpireHostAllocator() {
 }
 #endif

-void initializeUmpireHostAllocator(std::size_t initial_bytes) {
+using PoolType = umpire::strategy::QuickPool;
+using CoalesceHeuristicType = umpire::strategy::PoolCoalesceHeuristic<PoolType>;
+
+#ifdef DLAF_WITH_GPU
+// This is a modified version of the "percent_releasable" coalescing heuristic
+// from Umpire. This version allows choosing what ratio of the actual size to
+// reallocate when coalescing.
+//
+// A free ratio of 1.0 means that the pool will be coalesced only when all
+// blocks are unused. A free ratio of 0.5 means that the pool will be coalesced
+// when at least 50% of the pool's memory is unused. A ratio of 0.0 means that
+// the pool will be coalesced as soon as any two free blocks are available. A
+// ratio of more than 1.0 will make the pool never coalesce.
+//
+// A reallocation ratio of 1.0 simply coalesces all the free memory into a new
+// block. A ratio of 0.5 will attempt to shrink the pool to half its previous
+// size. A ratio of 1.5 will allocate 50% more than the previous pool size.
+//
+// A single free block is never "coalesced" to keep things simple. In theory a
+// single block could be shrunk or grown to match the reallocation ratio but
+// this can lead to strange reallocations, so we simply avoid that case. Two or
+// more blocks are always coalesced to one block, so no reallocation will
+// happen immediately after coalescing two or more blocks.
+static CoalesceHeuristicType get_coalesce_heuristic(double coalesce_free_ratio,
+                                                    double coalesce_reallocation_ratio) {
+  return [=](const PoolType& pool) {
+    std::size_t threshold = static_cast<std::size_t>(coalesce_free_ratio * pool.getActualSize());
+    if (pool.getReleasableBlocks() >= 2 && pool.getReleasableSize() >= threshold) {
+      return static_cast<std::size_t>(coalesce_reallocation_ratio * pool.getActualSize());
+    }
+    else {
+      return static_cast<std::size_t>(0);
+    }
+  };
+}
+#endif
+
+void initializeUmpireHostAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                   std::size_t alignment_bytes, double coalesce_free_ratio,
+                                   double coalesce_reallocation_ratio) {
 #ifdef DLAF_WITH_GPU
   static bool initialized = false;

@@ -45,26 +86,29 @@ void initializeUmpireHostAllocator(std::size_t initial_bytes) {
   if (!initialized) {
     auto host_allocator = umpire::ResourceManager::getInstance().getAllocator("PINNED");
     auto pooled_host_allocator =
-        umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::QuickPool>("PINNED_pool",
-                                                                                          host_allocator,
-                                                                                          initial_bytes);
+        umpire::ResourceManager::getInstance().makeAllocator<PoolType>(
+            "DLAF_PINNED_pool", host_allocator, initial_block_bytes, next_block_bytes, alignment_bytes,
+            get_coalesce_heuristic(coalesce_free_ratio, coalesce_reallocation_ratio));
     auto thread_safe_pooled_host_allocator =
         umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::ThreadSafeAllocator>(
-            "PINNED_thread_safe_pool", pooled_host_allocator);
+            "DLAF_PINNED_thread_safe_pool", pooled_host_allocator);

     memory::internal::getUmpireHostAllocator() = thread_safe_pooled_host_allocator;

     initialized = true;
   }
 #else
-  (void) initial_bytes;
+  dlaf::internal::silenceUnusedWarningFor(initial_block_bytes, next_block_bytes, alignment_bytes,
+                                          coalesce_free_ratio, coalesce_reallocation_ratio);
 #endif
 }

 void finalizeUmpireHostAllocator() {}

 #ifdef DLAF_WITH_GPU
-void initializeUmpireDeviceAllocator(std::size_t initial_bytes) {
+void initializeUmpireDeviceAllocator(std::size_t initial_block_bytes, std::size_t next_block_bytes,
+                                     std::size_t alignment_bytes, double coalesce_free_ratio,
+                                     double coalesce_reallocation_ratio) {
   static bool initialized = false;

   // Umpire pools cannot be released, so we keep the pools around even when
@@ -73,10 +117,11 @@ void initializeUmpireDeviceAllocator(std::size_t initial_bytes) {
   auto device_allocator = umpire::ResourceManager::getInstance().getAllocator("DEVICE");

   auto pooled_device_allocator =
       umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::QuickPool>(
-          "DEVICE_pool", device_allocator, initial_bytes);
+          "DLAF_DEVICE_pool", device_allocator, initial_block_bytes, next_block_bytes, alignment_bytes,
+          get_coalesce_heuristic(coalesce_free_ratio, coalesce_reallocation_ratio));
   auto thread_safe_pooled_device_allocator =
       umpire::ResourceManager::getInstance().makeAllocator<umpire::strategy::ThreadSafeAllocator>(
-          "DEVICE_thread_safe_pool", pooled_device_allocator);
+          "DLAF_DEVICE_thread_safe_pool", pooled_device_allocator);

   memory::internal::getUmpireDeviceAllocator() = thread_safe_pooled_device_allocator;
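Note on the coalescing heuristic above: the two ratios can be sanity-checked in isolation. The snippet below is an illustrative sketch only and is not part of the patch; FakePool is a hypothetical stand-in that mimics the three QuickPool getters used by get_coalesce_heuristic(), and make_heuristic reproduces the same logic so the effect of the free and reallocation ratios can be printed directly.

// Illustration only: FakePool is a hypothetical stand-in for the QuickPool
// interface, reproducing just the three getters used by the heuristic above.
#include <cstddef>
#include <functional>
#include <iostream>

struct FakePool {
  std::size_t actual_size;        // total bytes currently owned by the pool
  std::size_t releasable_size;    // bytes sitting in free blocks
  std::size_t releasable_blocks;  // number of free blocks
  std::size_t getActualSize() const { return actual_size; }
  std::size_t getReleasableSize() const { return releasable_size; }
  std::size_t getReleasableBlocks() const { return releasable_blocks; }
};

// Same logic as get_coalesce_heuristic() in the patch, written against FakePool.
std::function<std::size_t(const FakePool&)> make_heuristic(double free_ratio, double realloc_ratio) {
  return [=](const FakePool& pool) {
    const auto threshold = static_cast<std::size_t>(free_ratio * pool.getActualSize());
    if (pool.getReleasableBlocks() >= 2 && pool.getReleasableSize() >= threshold)
      return static_cast<std::size_t>(realloc_ratio * pool.getActualSize());
    return std::size_t(0);
  };
}

int main() {
  constexpr std::size_t GiB = 1 << 30;
  // Defaults from the configuration struct: free ratio 1.0, reallocation ratio 1.0.
  auto h = make_heuristic(1.0, 1.0);
  // Half of a 2 GiB pool is free, spread over two blocks: below the 100% free
  // threshold, so no coalescing happens (returns 0).
  std::cout << h(FakePool{2 * GiB, 1 * GiB, 2}) << '\n';  // 0
  // Everything is free in two blocks: coalesce into a single 2 GiB block.
  std::cout << h(FakePool{2 * GiB, 2 * GiB, 2}) << '\n';  // 2147483648
  // With a reallocation ratio of 0.5 the coalesced pool would shrink to 1 GiB.
  std::cout << make_heuristic(1.0, 0.5)(FakePool{2 * GiB, 2 * GiB, 2}) << '\n';  // 1073741824
}

With the default configuration values (both ratios 1.0) the pool is therefore only coalesced once every block is free, and it is then reallocated at its current size.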