From ac4cb3df2912c5accd13709673f9514c0f979be8 Mon Sep 17 00:00:00 2001 From: Aidyn-A Date: Fri, 19 Jul 2024 10:15:16 -0700 Subject: [PATCH 1/7] NCCLAllocator: add shareIpcHandle --- apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp | 8 ++++++++ apex/contrib/csrc/nccl_allocator/NCCLAllocator.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp index 1c4a70528..80882fe96 100644 --- a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp +++ b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp @@ -324,6 +324,14 @@ c10::cuda::CUDACachingAllocator::SnapshotInfo NCCLAllocator::snapshot() { return result; } +c10::cuda::CUDACachingAllocator::ShareableHandle CUDAPluggableAllocator:: + shareIpcHandle(void* ptr) { + TORCH_CHECK( + false, + "CUDAPluggableAllocator does not yet support shareIPcHandle. " + "If you need it, please file an issue describing your use case."); +} + std::shared_ptr NCCLAllocator::getIpcDevPtr(std::string handle) { TORCH_CHECK( false, diff --git a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h index 3a4e76b83..5100caf48 100644 --- a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h +++ b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h @@ -59,6 +59,8 @@ struct NCCLAllocator override; void releasePool(c10::DeviceIndex device, c10::cuda::MempoolId_t mempool_id) override; std::shared_ptr getIpcDevPtr(std::string handle) override; + c10::cuda::CUDACachingAllocator::ShareableHandle shareIpcHandle( + void*) override; void recordHistory( bool enabled, c10::cuda::CUDACachingAllocator::CreateContextFn context_recorder, From 4a3c75852c0609f1a8c5bef9728c47ae645a5e3f Mon Sep 17 00:00:00 2001 From: Aidyn-A Date: Fri, 19 Jul 2024 10:56:08 -0700 Subject: [PATCH 2/7] Fix namespace --- apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp index 80882fe96..1d2d5d441 100644 --- a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp +++ b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp @@ -324,7 +324,7 @@ c10::cuda::CUDACachingAllocator::SnapshotInfo NCCLAllocator::snapshot() { return result; } -c10::cuda::CUDACachingAllocator::ShareableHandle CUDAPluggableAllocator:: +c10::cuda::CUDACachingAllocator::ShareableHandle NCCLAllocator:: shareIpcHandle(void* ptr) { TORCH_CHECK( false, From 02d736617f00f5c29f98508dbeb4581bec16f204 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Sat, 5 Oct 2024 10:08:28 +0900 Subject: [PATCH 3/7] Update NCCLAllocator.cpp following https://github.com/pytorch/pytorch/pull/131114/files#diff-b9a0773c48ce53900355ab72505056fae68c7117993028c826c9cc1e3f3c502dR209-R210 --- apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp index 1d2d5d441..64d8158de 100644 --- a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp +++ b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp @@ -440,4 +440,12 @@ void NCCLAllocator::copy_data(void* dest, const void* src, std::size_t count) cudaMemcpy(dest, src, count, cudaMemcpyKind::cudaMemcpyDeviceToDevice)); } +void NCCLAllocator::enable() { + +} + +bool NCCLAllocator::isEnabled() { + return true; +} + } // namespace nccl_allocator::cuda From fa5df4378a964fc9c1add8668e17e3dca409a88a Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Sat, 5 Oct 2024 14:24:10 +0900 Subject: [PATCH 4/7] Update NCCLAllocator.cpp --- apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp index 64d8158de..4cda00776 100644 --- a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp +++ b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp @@ -440,11 +440,9 @@ void NCCLAllocator::copy_data(void* dest, const void* src, std::size_t count) cudaMemcpy(dest, src, count, cudaMemcpyKind::cudaMemcpyDeviceToDevice)); } -void NCCLAllocator::enable() { +void NCCLAllocator::enable() {} -} - -bool NCCLAllocator::isEnabled() { +bool NCCLAllocator::isEnabled() const { return true; } From f868b50d5f450a1eb37b3197fe0c5222c80bb283 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Sat, 5 Oct 2024 14:26:43 +0900 Subject: [PATCH 5/7] Update NCCLAllocator.h --- apex/contrib/csrc/nccl_allocator/NCCLAllocator.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h index 5100caf48..a69f83822 100644 --- a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h +++ b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h @@ -87,6 +87,8 @@ struct NCCLAllocator bool p2p_enabled) override; std::string name() override; void copy_data(void* dest, const void* src, std::size_t count) const; + void enable(bool value) override; + bool isEnabled() const override; protected: std::function init_fn_; From bbc8d87aac045ae7809cf64a92172420ae5cdbfc Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Sun, 6 Oct 2024 16:53:43 +0900 Subject: [PATCH 6/7] Update NCCLAllocator.cpp --- apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp index 4cda00776..14cdaff86 100644 --- a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp +++ b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp @@ -440,7 +440,7 @@ void NCCLAllocator::copy_data(void* dest, const void* src, std::size_t count) cudaMemcpy(dest, src, count, cudaMemcpyKind::cudaMemcpyDeviceToDevice)); } -void NCCLAllocator::enable() {} +void NCCLAllocator::enable(bool) {} bool NCCLAllocator::isEnabled() const { return true; From c50d1d03f8ac8b3ed6976661575d92a21be0eb3b Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Sun, 6 Oct 2024 16:54:02 +0900 Subject: [PATCH 7/7] Update NCCLAllocator.h --- apex/contrib/csrc/nccl_allocator/NCCLAllocator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h index a69f83822..5a71325d8 100644 --- a/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h +++ b/apex/contrib/csrc/nccl_allocator/NCCLAllocator.h @@ -87,7 +87,7 @@ struct NCCLAllocator bool p2p_enabled) override; std::string name() override; void copy_data(void* dest, const void* src, std::size_t count) const; - void enable(bool value) override; + void enable(bool) override; bool isEnabled() const override; protected: