diff --git a/comms/torchcomms/ncclx/TorchCommNCCLXCCA.cpp b/comms/torchcomms/ncclx/TorchCommNCCLXCCA.cpp index d37a48c8..cddefcbe 100644 --- a/comms/torchcomms/ncclx/TorchCommNCCLXCCA.cpp +++ b/comms/torchcomms/ncclx/TorchCommNCCLXCCA.cpp @@ -1,52 +1,29 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. #include "comms/torchcomms/ncclx/TorchCommNCCLXCCA.hpp" -#include -#include +#include -// Helper to detect if c10::CachingAllocator constants exist +// Helper function to get allocation granularity for a device namespace { - -// Helper to get kLargeBuffer from c10::CachingAllocator if it exists -template -struct LargeBufferGetter { - static constexpr size_t get() { - return 20971520; // 20MB (20 * 1024 * 1024) - fallback - } -}; - -// Specialization when c10::CachingAllocator::kLargeBuffer exists -template <> -struct LargeBufferGetter< - std::void_t> { - static constexpr size_t get() { - return c10::CachingAllocator::kLargeBuffer; - } -}; - -// Helper to get kSmallBuffer from c10::CachingAllocator if it exists -template -struct SmallBufferGetter { - static constexpr size_t get() { - return 2097152; // 2MB (2 * 1024 * 1024) - fallback - } -}; - -// Specialization when c10::CachingAllocator::kSmallBuffer exists -template <> -struct SmallBufferGetter< - std::void_t> { - static constexpr size_t get() { - return c10::CachingAllocator::kSmallBuffer; +size_t getAllocationGranularity(int device) { + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + + size_t granularity; + CUresult result = cuMemGetAllocationGranularity( + &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + + if (result != CUDA_SUCCESS) { + const char* error_string; + cuGetErrorString(result, &error_string); + throw std::runtime_error( + std::string("Failed to get allocation granularity for device ") + + std::to_string(device) + ": " + error_string); } -}; -inline size_t getLargeBufferSize() { - return LargeBufferGetter<>::get(); -} - -inline size_t getSmallBufferSize() { - return SmallBufferGetter<>::get(); + return granularity; } } // namespace @@ -132,32 +109,20 @@ void CachingAllocatorHookImpl::regDeregMem( // PyTorch expandable segments can send MAP events covering one or more // chunks. We need to register each chunk separately. - // The assumption here is that chunks are either kLargeBuffer or - // kSmallBuffer sized, no other sizes are allowed. We assert on this check - // below to make sure this assumption is valid. + // Chunk size is determined by the allocation granularity for the device. size_t total_size = te.size_; - - // Define chunk sizes for expandable segments - // In older PyTorch versions, c10::CachingAllocator::kLargeBuffer and - // kSmallBuffer symbols may not exist. We detect their availability at - // compile time using SFINAE and fall back to hardcoded values if needed. - const size_t kLargeBuffer = getLargeBufferSize(); - const size_t kSmallBuffer = getSmallBufferSize(); + size_t chunk_size = getAllocationGranularity(te.device_); for (size_t offset = 0; offset < total_size;) { // Determine the chunk size at this offset size_t remaining = total_size - offset; - size_t chunk_size; - if (remaining >= kLargeBuffer) { - chunk_size = kLargeBuffer; - } else if (remaining == kSmallBuffer) { - chunk_size = kSmallBuffer; - } else { + if (remaining < chunk_size) { LOG(ERROR) << "[CCA] SEGMENT_MAP: Invalid remaining size=" << remaining - << " at offset=" << offset << " total_size=" << total_size; + << " at offset=" << offset << " total_size=" << total_size + << " chunk_size=" << chunk_size; throw std::runtime_error( - "SEGMENT_MAP: Remaining size must be kLargeBuffer or kSmallBuffer"); + "SEGMENT_MAP: Remaining size must be a multiple of allocation granularity"); } void* chunk_addr = @@ -210,8 +175,8 @@ void CachingAllocatorHookImpl::regDeregMem( c10::cuda::CUDACachingAllocator::TraceEntry::Action::SEGMENT_UNMAP) { // Memory got unmapped, deregister it with NCCL - // PyTorch expandable segments can have chunks of different sizes (e.g., - // kLargeBuffer or kSmallBuffer). UNMAP events can cover multiple chunks. + // PyTorch expandable segments have chunks sized according to the + // allocation granularity. UNMAP events can cover multiple chunks. // We iterate through registered chunks within the unmapped range and // deregister each one. size_t total_size = te.size_;