feat: add DMA-BUF support

This adds DMA-BUF support to the plugin and enables it under the following conditions: At build time, libfabric>=1.20 is required (build checks for FI_MR_DMABUF). At runtime: + The specific version of NCCL being used supports DMA-BUF and passes valid dmabuf fds to the plugin. + FI_HMEM must be supported. + For CUDA accelerators, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED is queried and FI_HMEM_CUDA is requested. + For Neuron, we assume all nrt versions are capable of dmabuf export. Libfabric as of today provides no hints at init time that allow the plugin to differentiate between a provider that merely has FI_HMEM support, and one that has dmabuf support. In the case that libfabric rejects registration and the plugin aborts because the underlying provider and/or rdma-core cannot support it, a new environment variable is introduced to force the legacy path: OFI_NCCL_DISABLE_DMABUF Testing: Various combinations of + OFI_NCCL_DISABLE_DMABUF=0/1 + OFI_NCCL_PROTOCOL=RDMA/SENDRECV + FI_HMEM_CUDA_USE_GDRCOPY=0/1 Signed-off-by: Nicholas Sielicki <nslick@amazon.com>
aws · Sep 22, 2024 · e99cff1 · e99cff1
1 parent a07fc67
commit e99cff1
Show file tree

Hide file tree

Showing 10 changed files with 156 additions and 12 deletions.
diff --git a/include/Makefile.am b/include/Makefile.am
@@ -28,6 +28,7 @@ noinst_HEADERS = \
 	nccl_ofi_topo.h \
 	nccl_ofi_tuner.h \
 	nccl_ofi_ofiutils.h \
+	nccl_ofi_dmabuf.h \
 	nccl_ofi_tracepoint.h \
 	tracing_impl/lttng.h \
 	tracing_impl/nvtx.h \

diff --git a/include/nccl_ofi_dmabuf.h b/include/nccl_ofi_dmabuf.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef NCCL_OFI_DMABUF_H_
+#define NCCL_OFI_DMABUF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int nccl_ofi_dmabuf_viable(void);
+
+#ifdef __cplusplus
+}  // End extern "C"
+#endif
+
+#endif // NCCL_OFI_DMABUF_H_
diff --git a/include/nccl_ofi_param.h b/include/nccl_ofi_param.h
@@ -178,6 +178,23 @@ OFI_NCCL_PARAM_INT(disable_native_rdma_check, "DISABLE_NATIVE_RDMA_CHECK", 0);
  */
 OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0);
 
+/*
+ * In cases where libfabric>=1.20 is available, and the provider has FI_HMEM
+ * support, the only further stated requirement for a user application to use
+ * dmabuf is to pass FI_MR_DMABUF in the flags on the call to fi_regattr(3).
+ *
+ * Unfortunately, the plugin needs to signal DMABUF support or lack thereof back
+ * to NCCL prior to having an opportunity to make any memory registrations.
+ * This ultimately means that the plugin will optimistically assume DMA-BUF is
+ * viable on all FI_HMEM providers beyond libfabric 1.20.
+ *
+ * If dmabuf registrations fail (i.e., if ibv_reg_dmabuf_mr cannot be resolved),
+ * the plugin has no freedom to renegotiate DMABUF support with NCCL, and so it
+ * is fatal. Under those conditions, users should set this environment variable
+ * to force NCCL to avoid providing dmabuf file descriptors.
+ */
+OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0);
+
 /*
  * Maximum size of a message in bytes before message is multiplexed
  */

diff --git a/m4/check_pkg_libfabric.m4 b/m4/check_pkg_libfabric.m4
@@ -57,6 +57,7 @@ AC_DEFUN([CHECK_PKG_LIBFABRIC], [
                   FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES,
                   FI_OPT_MAX_MSG_SIZE,
                   FI_OPT_SHARED_MEMORY_PERMITTED,
+                  FI_MR_DMABUF,
 		  FI_OPT_INJECT_RMA_SIZE],
                   [], [], [AC_INCLUDES_DEFAULT
 [#include <rdma/fi_endpoint.h>

diff --git a/src/Makefile.am b/src/Makefile.am
@@ -23,6 +23,7 @@ sources = \
 	nccl_ofi_idpool.c \
 	nccl_ofi_ofiutils.c \
 	nccl_ofi_pthread.c \
+	nccl_ofi_dmabuf.c \
 	nccl_ofi_ep_addr_list.c \
 	tracepoint.c
 

diff --git a/src/nccl_ofi_dmabuf.c b/src/nccl_ofi_dmabuf.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include "config.h"
+#include <stdbool.h>
+#include "nccl_ofi_dmabuf.h"
+#include "nccl_ofi_param.h"
+#if HAVE_CUDA
+#include "nccl_ofi_cuda.h"
+#endif
+
+
+/* Check preconditions for using DMA-BUF: the libfabric headers must declare
+ * FI_MR_DMABUF, the disable_dmabuf parameter (OFI_NCCL_DISABLE_DMABUF) must be
+ * zero, and, in CUDA builds only, the CUDA DMA-BUF attribute check must pass.
+ * Returns nonzero if all of these hold and zero otherwise. Note that we may
+ * disable DMA-BUF for other reasons, even if this function returns true. For
+ * example, if we do not resolve a provider with FI_HMEM support */
+int nccl_ofi_dmabuf_viable(void) {
+    /* Disable DMA-BUF if building against older libfabric. */
+    if (!HAVE_DECL_FI_MR_DMABUF) {
+        NCCL_OFI_WARN("Will not use DMA-BUF, requires Libfabric 1.20 or greater. Consider upgrading.");
+        return false;
+    }
+
+    /* Disable DMA-BUF if explicitly disabled by user. */
+    if (ofi_nccl_disable_dmabuf()) {
+        NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Will not attempt to use DMA-BUF, explicitly disabled by user.");
+        return false;
+    }
+
+    /* Disable DMA-BUF if using CUDA and CUDA does not report DMA-BUF
+     * support in device attributes. */
+#if HAVE_CUDA
+    if (!nccl_net_ofi_cuda_have_dma_buf_attr()) {
+        NCCL_OFI_WARN("Will not attempt to use DMA-BUF, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED was false.");
+        return false;
+    }
+#endif
+
+    /* When using an HMEM capable provider at API version 1.20 or greater,
+     * advertise DMA-BUF support in NCCL getProperties calls. When given dmabuf
+     * file descriptors from NCCL, forward them in fi_regattr calls and pass the
+     * FI_MR_DMABUF flag. */
+    NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
+                  "Will attempt to resolve DMA-BUF capable providers. export OFI_NCCL_DISABLE_DMABUF=1 to disable");
+    return true;
+}
diff --git a/src/nccl_ofi_net.c b/src/nccl_ofi_net.c
@@ -25,6 +25,7 @@
 #include "nccl_ofi_topo.h"
 #include "nccl_ofi_math.h"
 #include "nccl_ofi_idpool.h"
+#include "nccl_ofi_dmabuf.h"
 #include "nccl_ofi_platform.h"
 
 /* Indicates if GPUDirect is supported by libfabric provider */
@@ -380,13 +381,8 @@ static int set_nic_props_default(int dev_id, struct fi_info *nic_prov,
 	 */
 	props->max_group_receives = NCCL_OFI_MAX_RECVS;
 
-	if (support_gdr == GDR_SUPPORTED) {
-		props->hmem_support = true;
-	} else {
-		props->hmem_support = false;
-	}
-
-	props->dmabuf_support = false;
+	props->hmem_support = (support_gdr != GDR_UNSUPPORTED);
+	props->dmabuf_support = nccl_ofi_dmabuf_viable();
 
 	/* Should be successful for ptrSupport invocation */
 	return 0;

diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c
@@ -338,7 +338,7 @@ int nccl_ofi_ofiutils_init_connection(int api_version, struct fi_info *info, str
 	/* Set Libfabric endpoint option FI_OPT_CUDA_API_PERMITTED to false if
 	 * using the Libfabric 1.18 API with HMEM support.
 	 */
-	if (api_version == FI_VERSION(1,18) && support_gdr != GDR_UNSUPPORTED) {
+	if (FI_VERSION_GE(api_version, FI_VERSION(1, 18)) && support_gdr != GDR_UNSUPPORTED) {
 #if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED)
 		bool optval = false;
 		ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT,

diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c
@@ -26,6 +26,7 @@
 #include "nccl_ofi_ofiutils.h"
 #include "nccl_ofi_pthread.h"
 #include "nccl_ofi_platform.h"
+#include "nccl_ofi_dmabuf.h"
 #include "nccl_ofi_mr.h"
 
 /* Message buffer size -- maximum span of simultaneous inflight messages */
@@ -518,6 +519,10 @@ static inline int get_properties(nccl_net_ofi_device_t *base_dev,
 	props->rma_supported = 1;
 	assert(is_max_write_inline_size_initialized);
 	props->max_write_inline_size = max_write_inline_size;
+	props->dmabuf_support = ((info->caps & FI_HMEM) != 0) &&
+		FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 20)) &&
+		nccl_ofi_dmabuf_viable()
+		;
 
 	return ret;
 }
@@ -6374,7 +6379,8 @@ static int ep_rail_init(nccl_net_ofi_rdma_ep_t *ep,
 	}
 #endif
 
-	ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 18), dev_rail->info, ep_rail->domain, &ep_rail->ofi_ep,
+	ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 20),
+						dev_rail->info, ep_rail->domain, &ep_rail->ofi_ep,
 						&ep_rail->av, &ep_rail->cq);
 	if (ret != 0) {
 		return ret;
@@ -7281,6 +7287,32 @@ int nccl_net_ofi_rdma_init(const char *provider_filter,
 		goto error;
 	}
 
+
+
+	hints = fi_allocinfo();
+	if (!nccl_ofi_dmabuf_viable())
+		goto no_dmabuf;
+
+	get_hints(hints);
+	ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 20), hints,
+					      &provider_list, &num_providers);
+	if (ret == 0) {
+		/* The 1.18 API allows providers to use CUDA to
+		 * support HMEM pointers, so just having HMEM doesn't
+		 * tell us anything about the usability of CUDA
+		 * pointers with NCCL.  So leave the state unknown
+		 * until we create an endpoint and try to disable
+		 * CUDA
+		 */
+		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Using Libfabric 1.20 API, with DMA-BUF support");
+		support_gdr = GDR_SUPPORTED;
+		goto found;
+	} else {
+		NCCL_OFI_WARN("OFI fi_getinfo() call failed: %s", fi_strerror(ret));
+		goto error;
+	}
+
+no_dmabuf:
 	get_hints(hints);
 	ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 18), hints,
 					      &provider_list, &num_providers);
@@ -7301,6 +7333,7 @@ int nccl_net_ofi_rdma_init(const char *provider_filter,
 	}
 	fi_freeinfo(hints);
 
+found:
 	ret = nccl_net_ofi_query_provider_capabilities(provider_list, num_providers);
 	if (ret != 0) {
 		NCCL_OFI_WARN("Querying provider capabilities failed: %d", ret);

diff --git a/src/nccl_ofi_sendrecv.c b/src/nccl_ofi_sendrecv.c
@@ -12,6 +12,8 @@
 #include <sys/mman.h>
 #include <unistd.h>
 
+#include <rdma/fabric.h>
+
 #include "nccl_ofi.h"
 #if HAVE_CUDA
 #include "nccl_ofi_cuda.h"
@@ -23,6 +25,7 @@
 #include "nccl_ofi_tracepoint.h"
 #include "nccl_ofi_math.h"
 #include "nccl_ofi_pthread.h"
+#include "nccl_ofi_dmabuf.h"
 #include "nccl_ofi_mr.h"
 
 
@@ -53,8 +56,11 @@ static inline int get_properties(nccl_net_ofi_device_t *base_dev,
 		props->max_communicators = NCCL_OFI_MIN(device->max_tag, INT_MAX);
 	}
 
-	props->rma_supported = 0;
+
 	props->max_write_inline_size = info->tx_attr->inject_size;
+	props->hmem_support = (info->caps & FI_HMEM);
+	props->rma_supported = (info->caps & FI_RMA);
+	props->dmabuf_support = props->hmem_support && FI_VERSION_GE(selected_api_version, FI_VERSION(1, 20));
 
 	/**
 	 * TODO:
@@ -2488,13 +2494,13 @@ nccl_net_ofi_sendrecv_device_create(nccl_net_ofi_plugin_t *plugin,
 	return NULL;
 }
 
-static void get_hints(struct fi_info *hints, int req_gdr)
+static void get_hints(struct fi_info *hints, int req_hmem)
 {
 	hints->caps = FI_LOCAL_COMM | FI_REMOTE_COMM | FI_TAGGED | FI_MSG;
 	hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_ENDPOINT;
 	hints->domain_attr->mr_key_size = (size_t) ofi_nccl_mr_key_size();
 
-	if (req_gdr) {
+	if (req_hmem) {
 		hints->caps |= FI_HMEM;
 		if (!cuda_flush) {
 			hints->caps |= FI_RMA | FI_READ;
@@ -2632,6 +2638,28 @@ int nccl_net_ofi_sendrecv_init(const char *provider_filter,
 		goto error;
 	}
 
+	if (!nccl_ofi_dmabuf_viable())
+		goto no_dmabuf;
+
+	get_hints(hints, true);
+	selected_api_version = FI_VERSION(1, 20);
+	ret = nccl_ofi_ofiutils_get_providers(provider_filter, selected_api_version, hints,
+					      &provider_list, &num_providers);
+	if (ret == 0) {
+		/* The 1.20 API allows providers to use CUDA to
+		 * support HMEM pointers, so just having HMEM doesn't
+		 * tell us anything about the usability of CUDA
+		 * pointers with NCCL.  So leave the state unknown
+		 * until we create an endpoint and try to disable
+		 * CUDA
+		 */
+		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
+					   "Using Libfabric 1.20 API, with DMA-BUF support");
+		support_gdr = GDR_UNKNOWN;
+		goto found;
+	}
+
+no_dmabuf:
 	get_hints(hints, true);
 	selected_api_version = FI_VERSION(1, 18);
 	ret = nccl_ofi_ofiutils_get_providers(provider_filter, selected_api_version, hints,