-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This adds DMA-BUF support to the plugin and enables it under the following conditions: At build time, libfabric>=1.20 is required (build checks for FI_MR_DMABUF). At runtime: + The specific version of NCCL being used supports DMA-BUF and passes valid dmabuf fds to the plugin. + FI_HMEM must be supported. + For CUDA accelerators, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED is queried and FI_HMEM_CUDA is requested. + For Neuron, we assume all nrt versions are capable of dmabuf export. Libfabric as of today provides no hints at init time that allow the plugin to differentiate between a provider that merely has FI_HMEM support and one that has dmabuf support. In case libfabric rejects registration and the plugin aborts because the underlying provider and/or rdma-core cannot support it, a new environment variable is introduced to force the legacy path: OFI_NCCL_DISABLE_DMABUF Testing: Various combinations of + OFI_NCCL_DISABLE_DMABUF=0/1 + OFI_NCCL_PROTOCOL=RDMA/SENDRECV + FI_HMEM_CUDA_USE_GDRCOPY=0/1 Signed-off-by: Nicholas Sielicki <nslick@amazon.com>
- Loading branch information
1 parent
25f65c6
commit d76d4fd
Showing
10 changed files
with
156 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
/* | ||
* Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. | ||
*/ | ||
|
||
#ifndef NCCL_OFI_DMABUF_H_ | ||
#define NCCL_OFI_DMABUF_H_ | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
/* Check the preconditions for using DMA-BUF. */ | ||
/* Returns nonzero when every precondition checked by the implementation */ | ||
/* holds, and zero otherwise. A nonzero result is not a guarantee: */ | ||
/* DMA-BUF may still be disabled later for other reasons. */ | ||
int nccl_ofi_dmabuf_viable(void); | ||
|
||
#ifdef __cplusplus | ||
} // End extern "C" | ||
#endif | ||
|
||
#endif // NCCL_OFI_DMABUF_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
/* | ||
* Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. | ||
*/ | ||
|
||
#include "config.h" | ||
#include <stdbool.h> | ||
#include "nccl_ofi_dmabuf.h" | ||
#include "nccl_ofi_param.h" | ||
#if HAVE_CUDA | ||
#include "nccl_ofi_cuda.h" | ||
#endif | ||
|
||
|
||
/* Check preconditions for using DMA-BUF. Note that we may disable DMA-BUF for | ||
* other reasons, even if this function returns true. For example, if we do not | ||
* resolve a provider with FI_HMEM support */ | ||
int nccl_ofi_dmabuf_viable() { | ||
|
||
/* Disable DMA-BUF if building against older libfabric. */ | ||
if (!HAVE_DECL_FI_MR_DMABUF) { | ||
NCCL_OFI_WARN("Will not use DMA-BUF, requires Libfabric 1.20 or greater. Consider upgrading."); | ||
return false; | ||
} | ||
|
||
|
||
/* Disable DMA-BUF if explicitly disabled by user. */ | ||
if (ofi_nccl_disable_dmabuf()) { | ||
NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Will not attempt to use DMA-BUF, explicitly disabled by user."); | ||
return false; | ||
} | ||
|
||
|
||
/* Disable DMA-BUF if using CUDA and CUDA does not report DMA-BUF | ||
* support in device attributes. */ | ||
#if HAVE_CUDA | ||
if (!nccl_net_ofi_cuda_have_dma_buf_attr()) { | ||
NCCL_OFI_WARN("Will not attempt to use DMA-BUF, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED was false."); | ||
return false; | ||
} | ||
#endif | ||
|
||
/* When using an HMEM capable provider at API version 1.20 or greater, | ||
* advertise DMA-BUF support in NCCL getProperties calls. When given dmabuf | ||
* file descriptors from NCCL, forward them in fi_regattr calls and pass the | ||
* FI_MR_DMABUF flag. */ | ||
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, | ||
"Will attempt to resolve DMA-BUF capable providers. export OFI_NCCL_DMABUF_DISABLE=1 to disable"); | ||
return true; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters