From 74d2bb607be9f9755a039fe1960088f78494e4ef Mon Sep 17 00:00:00 2001
From: Pavan Balaji
Date: Mon, 8 Dec 2025 15:39:59 -0800
Subject: [PATCH] Replace hardcoded buffer sizes with
 cuMemGetAllocationGranularity

Summary:
Previously, the code used SFINAE templates to detect and use
c10::CachingAllocator::kLargeBuffer and kSmallBuffer, with hardcoded
fallbacks of 20MB and 2MB respectively. This approach was fragile:
different PyTorch versions define these constants in different
locations, so the compile-time detection could silently fall back to
the hardcoded values.

This diff simplifies the implementation by querying the allocation
granularity directly from the CUDA driver with
cuMemGetAllocationGranularity. This ensures we use the correct,
device-specific granularity for memory registration chunks, making the
code more robust and portable across GPU architectures and PyTorch
versions.
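
For reference, the driver query at the core of this change can be
exercised with a small standalone program along the following lines (a
minimal sketch, assuming device ordinal 0 and linking against the CUDA
driver library with -lcuda; error handling trimmed):

    #include <cstddef>
    #include <cstdio>
    #include <cuda.h>

    int main() {
      // The driver API must be initialized before any other call.
      cuInit(0);

      // Describe the kind of allocation PyTorch expandable segments
      // use: pinned physical memory located on a specific device.
      CUmemAllocationProp prop = {};
      prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
      prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
      prop.location.id = 0; // device ordinal, hardcoded here

      // Query the minimum mapping granularity; the registration loop
      // in this diff relies on every expandable-segment chunk being a
      // multiple of this value.
      size_t granularity = 0;
      if (cuMemGetAllocationGranularity(
              &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
          CUDA_SUCCESS) {
        return 1;
      }
      std::printf("granularity: %zu bytes\n", granularity);
      return 0;
    }

On current GPUs this typically reports 2MB, matching the old
kSmallBuffer fallback; querying it instead of hardcoding it keeps the
chunk math correct on devices or drivers where the value differs.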

Differential Revision: D88688463
---
 comms/torchcomms/ncclx/TorchCommNCCLXCCA.cpp | 89 ++++++--------
 1 file changed, 27 insertions(+), 62 deletions(-)

diff --git a/comms/torchcomms/ncclx/TorchCommNCCLXCCA.cpp b/comms/torchcomms/ncclx/TorchCommNCCLXCCA.cpp
index d37a48c8..cddefcbe 100644
--- a/comms/torchcomms/ncclx/TorchCommNCCLXCCA.cpp
+++ b/comms/torchcomms/ncclx/TorchCommNCCLXCCA.cpp
@@ -1,52 +1,29 @@
 // Copyright (c) Meta Platforms, Inc. and affiliates.
 
 #include "comms/torchcomms/ncclx/TorchCommNCCLXCCA.hpp"
 
-#include 
-#include <type_traits>
+#include <cuda.h>
 
-// Helper to detect if c10::CachingAllocator constants exist
+// Helper function to get allocation granularity for a device
 namespace {
-
-// Helper to get kLargeBuffer from c10::CachingAllocator if it exists
-template <typename = void>
-struct LargeBufferGetter {
-  static constexpr size_t get() {
-    return 20971520; // 20MB (20 * 1024 * 1024) - fallback
-  }
-};
-
-// Specialization when c10::CachingAllocator::kLargeBuffer exists
-template <>
-struct LargeBufferGetter<
-    std::void_t<decltype(c10::CachingAllocator::kLargeBuffer)>> {
-  static constexpr size_t get() {
-    return c10::CachingAllocator::kLargeBuffer;
-  }
-};
-
-// Helper to get kSmallBuffer from c10::CachingAllocator if it exists
-template <typename = void>
-struct SmallBufferGetter {
-  static constexpr size_t get() {
-    return 2097152; // 2MB (2 * 1024 * 1024) - fallback
-  }
-};
-
-// Specialization when c10::CachingAllocator::kSmallBuffer exists
-template <>
-struct SmallBufferGetter<
-    std::void_t<decltype(c10::CachingAllocator::kSmallBuffer)>> {
-  static constexpr size_t get() {
-    return c10::CachingAllocator::kSmallBuffer;
+size_t getAllocationGranularity(int device) {
+  CUmemAllocationProp prop = {};
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  prop.location.id = device;
+
+  size_t granularity;
+  CUresult result = cuMemGetAllocationGranularity(
+      &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+  if (result != CUDA_SUCCESS) {
+    const char* error_string = nullptr;
+    cuGetErrorString(result, &error_string);
+    throw std::runtime_error(
+        std::string("Failed to get allocation granularity for device ") +
+        std::to_string(device) + ": " +
+        (error_string ? error_string : "unknown error"));
   }
-};
-
-inline size_t getLargeBufferSize() {
-  return LargeBufferGetter<>::get();
-}
-
-inline size_t getSmallBufferSize() {
-  return SmallBufferGetter<>::get();
+  return granularity;
 }
 
 } // namespace
@@ -132,32 +109,20 @@ void CachingAllocatorHookImpl::regDeregMem(
 
   // PyTorch expandable segments can send MAP events covering one or more
   // chunks. We need to register each chunk separately.
-  // The assumption here is that chunks are either kLargeBuffer or
-  // kSmallBuffer sized, no other sizes are allowed. We assert on this check
-  // below to make sure this assumption is valid.
+  // Chunk size is determined by the allocation granularity for the device.
   size_t total_size = te.size_;
-
-  // Define chunk sizes for expandable segments
-  // In older PyTorch versions, c10::CachingAllocator::kLargeBuffer and
-  // kSmallBuffer symbols may not exist. We detect their availability at
-  // compile time using SFINAE and fall back to hardcoded values if needed.
-  const size_t kLargeBuffer = getLargeBufferSize();
-  const size_t kSmallBuffer = getSmallBufferSize();
+  size_t chunk_size = getAllocationGranularity(te.device_);
 
   for (size_t offset = 0; offset < total_size;) {
     // Determine the chunk size at this offset
     size_t remaining = total_size - offset;
-    size_t chunk_size;
-    if (remaining >= kLargeBuffer) {
-      chunk_size = kLargeBuffer;
-    } else if (remaining == kSmallBuffer) {
-      chunk_size = kSmallBuffer;
-    } else {
+    if (remaining < chunk_size) {
       LOG(ERROR) << "[CCA] SEGMENT_MAP: Invalid remaining size=" << remaining
-                 << " at offset=" << offset << " total_size=" << total_size;
+                 << " at offset=" << offset << " total_size=" << total_size
+                 << " chunk_size=" << chunk_size;
       throw std::runtime_error(
-          "SEGMENT_MAP: Remaining size must be kLargeBuffer or kSmallBuffer");
+          "SEGMENT_MAP: Remaining size must be a multiple of allocation granularity");
     }
 
     void* chunk_addr =
@@ -210,8 +175,8 @@ void CachingAllocatorHookImpl::regDeregMem(
       c10::cuda::CUDACachingAllocator::TraceEntry::Action::SEGMENT_UNMAP) {
     // Memory got unmapped, deregister it with NCCL
 
-    // PyTorch expandable segments can have chunks of different sizes (e.g.,
-    // kLargeBuffer or kSmallBuffer). UNMAP events can cover multiple chunks.
+    // PyTorch expandable segments have chunks sized according to the
+    // allocation granularity. UNMAP events can cover multiple chunks.
     // We iterate through registered chunks within the unmapped range and
     // deregister each one.
     size_t total_size = te.size_;