Skip to content

Commit

Permalink
feat: add DMA-BUF support
Browse files Browse the repository at this point in the history
This adds DMA-BUF support to the plugin and enables it under the
following conditions:

At build time, libfabric>=1.20 is required (build checks for FI_MR_DMABUF).

At runtime:
 + The specific version of NCCL being used supports DMA-BUF and passes
   valid dmabuf fds to the plugin.
 + FI_HMEM must be supported.
  + For CUDA accelerators, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED is
   queried and FI_HMEM_CUDA is requested.
  + For Neuron, we assume all nrt versions are capable of dmabuf
    export.

Libfabric as of today provides no hints at init time that allow the
plugin to differentiate between a provider that merely has FI_HMEM
support, and one that has dmabuf support. In the case that libfabric
rejects registration and the plugin aborts because the underlying
provider and/or rdma-core cannot support it, a new environment variable
is introduced to force the legacy path: OFI_NCCL_DISABLE_DMABUF

Testing: Various combinations of
  + OFI_NCCL_DISABLE_DMABUF=0/1
  + OFI_NCCL_PROTOCOL=RDMA/SENDRECV
  + FI_HMEM_CUDA_USE_GDRCOPY=0/1

Signed-off-by: Nicholas Sielicki <nslick@amazon.com>
  • Loading branch information
aws-nslick committed Sep 22, 2024
1 parent a07fc67 commit e99cff1
Show file tree
Hide file tree
Showing 10 changed files with 156 additions and 12 deletions.
1 change: 1 addition & 0 deletions include/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ noinst_HEADERS = \
nccl_ofi_topo.h \
nccl_ofi_tuner.h \
nccl_ofi_ofiutils.h \
nccl_ofi_dmabuf.h \
nccl_ofi_tracepoint.h \
tracing_impl/lttng.h \
tracing_impl/nvtx.h \
Expand Down
18 changes: 18 additions & 0 deletions include/nccl_ofi_dmabuf.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
* Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
*/

#ifndef NCCL_OFI_DMABUF_H_
#define NCCL_OFI_DMABUF_H_

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Check the preconditions for using DMA-BUF.
 *
 * Returns non-zero when DMA-BUF may be attempted and zero otherwise. A
 * non-zero result is not a guarantee: DMA-BUF may still be disabled for other
 * reasons, for example if no provider with FI_HMEM support is resolved.
 */
int nccl_ofi_dmabuf_viable(void);

#ifdef __cplusplus
} // End extern "C"
#endif

#endif // NCCL_OFI_DMABUF_H_
17 changes: 17 additions & 0 deletions include/nccl_ofi_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,23 @@ OFI_NCCL_PARAM_INT(disable_native_rdma_check, "DISABLE_NATIVE_RDMA_CHECK", 0);
*/
OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0);

/*
* In cases where libfabric>=1.20 is available, and the provider has FI_HMEM
* support, the only further stated requirement for a user application to use
* dmabuf is to pass FI_MR_DMABUF in the flags on the call to fi_mr_regattr(3).
*
* Unfortunately, the plugin needs to signal DMABUF support or lack thereof back
* to NCCL prior to having an opportunity to make any memory registrations.
* This ultimately means that the plugin will optimistically assume DMA-BUF is
* viable on all FI_HMEM providers beyond libfabric 1.20.
*
* If dmabuf registrations fail (i.e., if ibv_reg_dmabuf_mr cannot be resolved),
* the plugin has no freedom to renegotiate DMABUF support with NCCL, and so it
* is fatal. Under those conditions, users should set this environment variable
* to force NCCL to avoid providing dmabuf file descriptors.
*/
OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0);

/*
* Maximum size of a message in bytes before message is multiplexed
*/
Expand Down
1 change: 1 addition & 0 deletions m4/check_pkg_libfabric.m4
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ AC_DEFUN([CHECK_PKG_LIBFABRIC], [
FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES,
FI_OPT_MAX_MSG_SIZE,
FI_OPT_SHARED_MEMORY_PERMITTED,
FI_MR_DMABUF,
FI_OPT_INJECT_RMA_SIZE],
[], [], [AC_INCLUDES_DEFAULT
[#include <rdma/fi_endpoint.h>
Expand Down
1 change: 1 addition & 0 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ sources = \
nccl_ofi_idpool.c \
nccl_ofi_ofiutils.c \
nccl_ofi_pthread.c \
nccl_ofi_dmabuf.c \
nccl_ofi_ep_addr_list.c \
tracepoint.c

Expand Down
49 changes: 49 additions & 0 deletions src/nccl_ofi_dmabuf.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
*/

#include "config.h"
#include <stdbool.h>
#include "nccl_ofi_dmabuf.h"
#include "nccl_ofi_param.h"
#if HAVE_CUDA
#include "nccl_ofi_cuda.h"
#endif


/*
 * Check preconditions for using DMA-BUF.
 *
 * Returns true (non-zero) when every precondition checked here holds, and
 * false (zero) as soon as one of them fails:
 *   - the plugin was built against a libfabric that declares FI_MR_DMABUF;
 *   - the user has not disabled DMA-BUF through the disable_dmabuf parameter;
 *   - on CUDA builds, nccl_net_ofi_cuda_have_dma_buf_attr() reports support.
 *
 * Note that we may disable DMA-BUF for other reasons, even if this function
 * returns true. For example, if we do not resolve a provider with FI_HMEM
 * support.
 */
int nccl_ofi_dmabuf_viable(void) {

	/* Disable DMA-BUF if building against older libfabric. */
	if (!HAVE_DECL_FI_MR_DMABUF) {
		NCCL_OFI_WARN("Will not use DMA-BUF, requires Libfabric 1.20 or greater. Consider upgrading.");
		return false;
	}

	/* Disable DMA-BUF if explicitly disabled by user. */
	if (ofi_nccl_disable_dmabuf()) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Will not attempt to use DMA-BUF, explicitly disabled by user.");
		return false;
	}

	/* Disable DMA-BUF if using CUDA and CUDA does not report DMA-BUF
	 * support in device attributes. */
#if HAVE_CUDA
	if (!nccl_net_ofi_cuda_have_dma_buf_attr()) {
		NCCL_OFI_WARN("Will not attempt to use DMA-BUF, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED was false.");
		return false;
	}
#endif

	/* When using an HMEM capable provider at API version 1.20 or greater,
	 * advertise DMA-BUF support in NCCL getProperties calls. When given dmabuf
	 * file descriptors from NCCL, forward them in fi_mr_regattr calls and pass
	 * the FI_MR_DMABUF flag.
	 *
	 * The hint below must name the variable that the disable_dmabuf
	 * parameter actually reads (OFI_NCCL_DISABLE_DMABUF); otherwise users
	 * following the message would set a variable the plugin ignores. */
	NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
		       "Will attempt to resolve DMA-BUF capable providers. export OFI_NCCL_DISABLE_DMABUF=1 to disable");
	return true;
}
10 changes: 3 additions & 7 deletions src/nccl_ofi_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "nccl_ofi_topo.h"
#include "nccl_ofi_math.h"
#include "nccl_ofi_idpool.h"
#include "nccl_ofi_dmabuf.h"
#include "nccl_ofi_platform.h"

/* Indicates if GPUDirect is supported by libfabric provider */
Expand Down Expand Up @@ -380,13 +381,8 @@ static int set_nic_props_default(int dev_id, struct fi_info *nic_prov,
*/
props->max_group_receives = NCCL_OFI_MAX_RECVS;

if (support_gdr == GDR_SUPPORTED) {
props->hmem_support = true;
} else {
props->hmem_support = false;
}

props->dmabuf_support = false;
props->hmem_support = (support_gdr != GDR_UNSUPPORTED);
props->dmabuf_support = nccl_ofi_dmabuf_viable();

/* Should be successful for ptrSupport invocation */
return 0;
Expand Down
2 changes: 1 addition & 1 deletion src/nccl_ofi_ofiutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ int nccl_ofi_ofiutils_init_connection(int api_version, struct fi_info *info, str
/* Set Libfabric endpoint option FI_OPT_CUDA_API_PERMITTED to false if
* using the Libfabric 1.18 API with HMEM support.
*/
if (api_version == FI_VERSION(1,18) && support_gdr != GDR_UNSUPPORTED) {
if (FI_VERSION_GE(api_version, FI_VERSION(1, 18)) && support_gdr != GDR_UNSUPPORTED) {
#if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED)
bool optval = false;
ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT,
Expand Down
35 changes: 34 additions & 1 deletion src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "nccl_ofi_ofiutils.h"
#include "nccl_ofi_pthread.h"
#include "nccl_ofi_platform.h"
#include "nccl_ofi_dmabuf.h"
#include "nccl_ofi_mr.h"

/* Message buffer size -- maximum span of simultaneous inflight messages */
Expand Down Expand Up @@ -518,6 +519,10 @@ static inline int get_properties(nccl_net_ofi_device_t *base_dev,
props->rma_supported = 1;
assert(is_max_write_inline_size_initialized);
props->max_write_inline_size = max_write_inline_size;
props->dmabuf_support = ((info->caps & FI_HMEM) != 0) &&
FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 20)) &&
nccl_ofi_dmabuf_viable()
;

return ret;
}
Expand Down Expand Up @@ -6374,7 +6379,8 @@ static int ep_rail_init(nccl_net_ofi_rdma_ep_t *ep,
}
#endif

ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 18), dev_rail->info, ep_rail->domain, &ep_rail->ofi_ep,
ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 20),
dev_rail->info, ep_rail->domain, &ep_rail->ofi_ep,
&ep_rail->av, &ep_rail->cq);
if (ret != 0) {
return ret;
Expand Down Expand Up @@ -7281,6 +7287,32 @@ int nccl_net_ofi_rdma_init(const char *provider_filter,
goto error;
}



hints = fi_allocinfo();
if (!nccl_ofi_dmabuf_viable())
goto no_dmabuf;

get_hints(hints);
ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 20), hints,
&provider_list, &num_providers);
if (ret == 0) {
/* The 1.18 API allows providers to use CUDA to
* support HMEM pointers, so just having HMEM doesn't
* tell us anything about the usability of CUDA
* pointers with NCCL. So leave the state unknown
* until we create an endpoint and try to disable
* CUDA
*/
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Using Libfabric 1.20 API, with DMA-BUF support");
support_gdr = GDR_SUPPORTED;
goto found;
} else {
NCCL_OFI_WARN("OFI fi_getinfo() call failed: %s", fi_strerror(ret));
goto error;
}

no_dmabuf:
get_hints(hints);
ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 18), hints,
&provider_list, &num_providers);
Expand All @@ -7301,6 +7333,7 @@ int nccl_net_ofi_rdma_init(const char *provider_filter,
}
fi_freeinfo(hints);

found:
ret = nccl_net_ofi_query_provider_capabilities(provider_list, num_providers);
if (ret != 0) {
NCCL_OFI_WARN("Querying provider capabilities failed: %d", ret);
Expand Down
34 changes: 31 additions & 3 deletions src/nccl_ofi_sendrecv.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <sys/mman.h>
#include <unistd.h>

#include <rdma/fabric.h>

#include "nccl_ofi.h"
#if HAVE_CUDA
#include "nccl_ofi_cuda.h"
Expand All @@ -23,6 +25,7 @@
#include "nccl_ofi_tracepoint.h"
#include "nccl_ofi_math.h"
#include "nccl_ofi_pthread.h"
#include "nccl_ofi_dmabuf.h"
#include "nccl_ofi_mr.h"


Expand Down Expand Up @@ -53,8 +56,11 @@ static inline int get_properties(nccl_net_ofi_device_t *base_dev,
props->max_communicators = NCCL_OFI_MIN(device->max_tag, INT_MAX);
}

props->rma_supported = 0;

props->max_write_inline_size = info->tx_attr->inject_size;
props->hmem_support = (info->caps & FI_HMEM);
props->rma_supported = (info->caps & FI_RMA);
props->dmabuf_support = props->hmem_support && FI_VERSION_GE(selected_api_version, FI_VERSION(1, 20));

/**
* TODO:
Expand Down Expand Up @@ -2488,13 +2494,13 @@ nccl_net_ofi_sendrecv_device_create(nccl_net_ofi_plugin_t *plugin,
return NULL;
}

static void get_hints(struct fi_info *hints, int req_gdr)
static void get_hints(struct fi_info *hints, int req_hmem)
{
hints->caps = FI_LOCAL_COMM | FI_REMOTE_COMM | FI_TAGGED | FI_MSG;
hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_ENDPOINT;
hints->domain_attr->mr_key_size = (size_t) ofi_nccl_mr_key_size();

if (req_gdr) {
if (req_hmem) {
hints->caps |= FI_HMEM;
if (!cuda_flush) {
hints->caps |= FI_RMA | FI_READ;
Expand Down Expand Up @@ -2632,6 +2638,28 @@ int nccl_net_ofi_sendrecv_init(const char *provider_filter,
goto error;
}

if (!nccl_ofi_dmabuf_viable())
goto no_dmabuf;

get_hints(hints, true);
selected_api_version = FI_VERSION(1, 20);
ret = nccl_ofi_ofiutils_get_providers(provider_filter, selected_api_version, hints,
&provider_list, &num_providers);
if (ret == 0) {
/* The 1.20 API allows providers to use CUDA to
* support HMEM pointers, so just having HMEM doesn't
* tell us anything about the usability of CUDA
* pointers with NCCL. So leave the state unknown
* until we create an endpoint and try to disable
* CUDA
*/
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
"Using Libfabric 1.20 API, with DMA-BUF support");
support_gdr = GDR_UNKNOWN;
goto found;
}

no_dmabuf:
get_hints(hints, true);
selected_api_version = FI_VERSION(1, 18);
ret = nccl_ofi_ofiutils_get_providers(provider_filter, selected_api_version, hints,
Expand Down

0 comments on commit e99cff1

Please sign in to comment.