From e99cff18e8ae44845aa4c8550e0b6a595e72cd04 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sat, 21 Sep 2024 00:57:26 -0700 Subject: [PATCH] feat: add DMA-BUF support This adds DMA-BUF support to the plugin and enables it under the following conditions: At build time, libfabric>=1.20 is required (build checks for FI_MR_DMABUF). At runtime: + The specific version of NCCL being used supports DMA-BUF and passes valid dmabuf fds to the plugin. + FI_HMEM must be supported. + For CUDA accelerators, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED is queried and FI_HMEM_CUDA is requested. + For Neuron, we assume all nrt versions are capable of dmabuf export. Libfabric as of today provides no hints at init time that allow the plugin to differentiate between a provider that merely has FI_HMEM support, and one that has dmabuf support. In the case that libfabric rejects registration and the plugin aborts because the underlying provider and/or rdma-core cannot support it, a new environment variable is introduced to force the legacy path: OFI_NCCL_DISABLE_DMABUF Testing: Various combinations of + OFI_NCCL_DISABLE_DMABUF=0/1 + OFI_NCCL_PROTOCOL=RDMA/SENDRECV + FI_HMEM_CUDA_USE_GDRCOPY=0/1 Signed-off-by: Nicholas Sielicki --- include/Makefile.am | 1 + include/nccl_ofi_dmabuf.h | 18 ++++++++++++++ include/nccl_ofi_param.h | 17 ++++++++++++++ m4/check_pkg_libfabric.m4 | 1 + src/Makefile.am | 1 + src/nccl_ofi_dmabuf.c | 49 +++++++++++++++++++++++++++++++++++++++ src/nccl_ofi_net.c | 10 +++----- src/nccl_ofi_ofiutils.c | 2 +- src/nccl_ofi_rdma.c | 35 +++++++++++++++++++++++++++- src/nccl_ofi_sendrecv.c | 34 ++++++++++++++++++++++++--- 10 files changed, 156 insertions(+), 12 deletions(-) create mode 100644 include/nccl_ofi_dmabuf.h create mode 100644 src/nccl_ofi_dmabuf.c diff --git a/include/Makefile.am b/include/Makefile.am index df46233fd..d771184ad 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -28,6 +28,7 @@ noinst_HEADERS = \ nccl_ofi_topo.h \ nccl_ofi_tuner.h \ 
nccl_ofi_ofiutils.h \ + nccl_ofi_dmabuf.h \ nccl_ofi_tracepoint.h \ tracing_impl/lttng.h \ tracing_impl/nvtx.h \ diff --git a/include/nccl_ofi_dmabuf.h b/include/nccl_ofi_dmabuf.h new file mode 100644 index 000000000..4ee05abae --- /dev/null +++ b/include/nccl_ofi_dmabuf.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef NCCL_OFI_DMABUF_H_ +#define NCCL_OFI_DMABUF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +int nccl_ofi_dmabuf_viable(void); + +#ifdef __cplusplus +} // End extern "C" +#endif + +#endif // NCCL_OFI_DMABUF_H_ diff --git a/include/nccl_ofi_param.h b/include/nccl_ofi_param.h index 7ae619c35..d2b13d215 100644 --- a/include/nccl_ofi_param.h +++ b/include/nccl_ofi_param.h @@ -178,6 +178,23 @@ OFI_NCCL_PARAM_INT(disable_native_rdma_check, "DISABLE_NATIVE_RDMA_CHECK", 0); */ OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0); +/* + * In cases where libfabric>=1.20 is available, and the provider has FI_HMEM + * support, the only further stated requirement for a user application to use + * dmabuf is to pass FI_MR_DMABUF in the flags on the call to fi_mr_regattr(3). + * + * Unfortunately, the plugin needs to signal DMABUF support or lack thereof back + * to NCCL prior to having an opportunity to make any memory registrations. + * This ultimately means that the plugin will optimistically assume DMA-BUF is + * viable on all FI_HMEM providers with libfabric 1.20 or newer. + * + * If dmabuf registrations fail (e.g. if ibv_reg_dmabuf_mr cannot be resolved), + * the plugin has no freedom to renegotiate DMABUF support with NCCL, and so it + * is fatal. Under those conditions, users should set this environment variable + * to force NCCL to avoid providing dmabuf file descriptors. 
+ */ +OFI_NCCL_PARAM_INT(disable_dmabuf, "DISABLE_DMABUF", 0); + /* * Maximum size of a message in bytes before message is multiplexed */ diff --git a/m4/check_pkg_libfabric.m4 b/m4/check_pkg_libfabric.m4 index 401f7c059..13fcbac9c 100644 --- a/m4/check_pkg_libfabric.m4 +++ b/m4/check_pkg_libfabric.m4 @@ -57,6 +57,7 @@ AC_DEFUN([CHECK_PKG_LIBFABRIC], [ FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES, FI_OPT_MAX_MSG_SIZE, FI_OPT_SHARED_MEMORY_PERMITTED, + FI_MR_DMABUF, FI_OPT_INJECT_RMA_SIZE], [], [], [AC_INCLUDES_DEFAULT [#include diff --git a/src/Makefile.am b/src/Makefile.am index c77a517e4..c2ce05bdc 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -23,6 +23,7 @@ sources = \ nccl_ofi_idpool.c \ nccl_ofi_ofiutils.c \ nccl_ofi_pthread.c \ + nccl_ofi_dmabuf.c \ nccl_ofi_ep_addr_list.c \ tracepoint.c diff --git a/src/nccl_ofi_dmabuf.c b/src/nccl_ofi_dmabuf.c new file mode 100644 index 000000000..22e6be421 --- /dev/null +++ b/src/nccl_ofi_dmabuf.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "config.h" +#include +#include "nccl_ofi_dmabuf.h" +#include "nccl_ofi_param.h" +#if HAVE_CUDA +#include "nccl_ofi_cuda.h" +#endif + + +/* Check preconditions for using DMA-BUF. Note that we may disable DMA-BUF for + * other reasons, even if this function returns true. For example, if we do not + * resolve a provider with FI_HMEM support */ +int nccl_ofi_dmabuf_viable(void) { + + /* Disable DMA-BUF if building against older libfabric. */ + if (!HAVE_DECL_FI_MR_DMABUF) { + NCCL_OFI_WARN("Will not use DMA-BUF, requires Libfabric 1.20 or greater. Consider upgrading."); + return false; + } + + + /* Disable DMA-BUF if explicitly disabled by user. */ + if (ofi_nccl_disable_dmabuf()) { + NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Will not attempt to use DMA-BUF, explicitly disabled by user."); + return false; + } + + + /* Disable DMA-BUF if using CUDA and CUDA does not report DMA-BUF + * support in device attributes. 
+ */ +#if HAVE_CUDA + if (!nccl_net_ofi_cuda_have_dma_buf_attr()) { + NCCL_OFI_WARN("Will not attempt to use DMA-BUF, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED was false."); + return false; + } +#endif + + /* When using an HMEM capable provider at API version 1.20 or greater, + * advertise DMA-BUF support in NCCL getProperties calls. When given dmabuf + * file descriptors from NCCL, forward them in fi_mr_regattr calls and pass the + * FI_MR_DMABUF flag. */ + NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, + "Will attempt to resolve DMA-BUF capable providers. export OFI_NCCL_DISABLE_DMABUF=1 to disable"); + return true; +} diff --git a/src/nccl_ofi_net.c b/src/nccl_ofi_net.c index 8e273d877..d9cce8c50 100644 --- a/src/nccl_ofi_net.c +++ b/src/nccl_ofi_net.c @@ -25,6 +25,7 @@ #include "nccl_ofi_topo.h" #include "nccl_ofi_math.h" #include "nccl_ofi_idpool.h" +#include "nccl_ofi_dmabuf.h" #include "nccl_ofi_platform.h" /* Indicates if GPUDirect is supported by libfabric provider */ @@ -380,13 +381,8 @@ static int set_nic_props_default(int dev_id, struct fi_info *nic_prov, */ props->max_group_receives = NCCL_OFI_MAX_RECVS; - if (support_gdr == GDR_SUPPORTED) { - props->hmem_support = true; - } else { - props->hmem_support = false; - } - - props->dmabuf_support = false; + props->hmem_support = (support_gdr != GDR_UNSUPPORTED); + props->dmabuf_support = nccl_ofi_dmabuf_viable(); /* Should be successful for ptrSupport invocation */ return 0; diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c index ffcc11818..aaf1f3b13 100644 --- a/src/nccl_ofi_ofiutils.c +++ b/src/nccl_ofi_ofiutils.c @@ -338,7 +338,7 @@ int nccl_ofi_ofiutils_init_connection(int api_version, struct fi_info *info, str /* Set Libfabric endpoint option FI_OPT_CUDA_API_PERMITTED to false if * using the Libfabric 1.18 API with HMEM support. 
*/ - if (api_version == FI_VERSION(1,18) && support_gdr != GDR_UNSUPPORTED) { + if (FI_VERSION_GE(api_version, FI_VERSION(1, 18)) && support_gdr != GDR_UNSUPPORTED) { #if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED) bool optval = false; ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT, diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index ac2fc6c4b..049f06fc2 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -26,6 +26,7 @@ #include "nccl_ofi_ofiutils.h" #include "nccl_ofi_pthread.h" #include "nccl_ofi_platform.h" +#include "nccl_ofi_dmabuf.h" #include "nccl_ofi_mr.h" /* Message buffer size -- maximum span of simultaneous inflight messages */ @@ -518,6 +519,10 @@ static inline int get_properties(nccl_net_ofi_device_t *base_dev, props->rma_supported = 1; assert(is_max_write_inline_size_initialized); props->max_write_inline_size = max_write_inline_size; + props->dmabuf_support = ((info->caps & FI_HMEM) != 0) && + FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 20)) && + nccl_ofi_dmabuf_viable() + ; return ret; } @@ -6374,7 +6379,8 @@ static int ep_rail_init(nccl_net_ofi_rdma_ep_t *ep, } #endif - ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 18), dev_rail->info, ep_rail->domain, &ep_rail->ofi_ep, + ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 20), + dev_rail->info, ep_rail->domain, &ep_rail->ofi_ep, &ep_rail->av, &ep_rail->cq); if (ret != 0) { return ret; @@ -7281,6 +7287,32 @@ int nccl_net_ofi_rdma_init(const char *provider_filter, goto error; } + + + hints = fi_allocinfo(); + if (!nccl_ofi_dmabuf_viable()) + goto no_dmabuf; + + get_hints(hints); + ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 20), hints, + &provider_list, &num_providers); + if (ret == 0) { + /* The 1.18 API allows providers to use CUDA to + * support HMEM pointers, so just having HMEM doesn't + * tell us anything about the usability of CUDA + * pointers with NCCL. 
So leave the state unknown + * until we create an endpoint and try to disable + * CUDA + */ + NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Using Libfabric 1.20 API, with DMA-BUF support"); + support_gdr = GDR_SUPPORTED; + goto found; + } else { + NCCL_OFI_WARN("OFI fi_getinfo() call failed: %s", fi_strerror(ret)); + goto error; + } + +no_dmabuf: get_hints(hints); ret = nccl_ofi_ofiutils_get_providers(provider_filter, FI_VERSION(1, 18), hints, &provider_list, &num_providers); @@ -7301,6 +7333,7 @@ int nccl_net_ofi_rdma_init(const char *provider_filter, } fi_freeinfo(hints); +found: ret = nccl_net_ofi_query_provider_capabilities(provider_list, num_providers); if (ret != 0) { NCCL_OFI_WARN("Querying provider capabilities failed: %d", ret); diff --git a/src/nccl_ofi_sendrecv.c b/src/nccl_ofi_sendrecv.c index e6ef8184c..7e0871ead 100644 --- a/src/nccl_ofi_sendrecv.c +++ b/src/nccl_ofi_sendrecv.c @@ -12,6 +12,8 @@ #include #include +#include + #include "nccl_ofi.h" #if HAVE_CUDA #include "nccl_ofi_cuda.h" @@ -23,6 +25,7 @@ #include "nccl_ofi_tracepoint.h" #include "nccl_ofi_math.h" #include "nccl_ofi_pthread.h" +#include "nccl_ofi_dmabuf.h" #include "nccl_ofi_mr.h" @@ -53,8 +56,11 @@ static inline int get_properties(nccl_net_ofi_device_t *base_dev, props->max_communicators = NCCL_OFI_MIN(device->max_tag, INT_MAX); } - props->rma_supported = 0; + props->max_write_inline_size = info->tx_attr->inject_size; + props->hmem_support = (info->caps & FI_HMEM); + props->rma_supported = (info->caps & FI_RMA); + props->dmabuf_support = props->hmem_support && FI_VERSION_GE(selected_api_version, FI_VERSION(1, 20)); /** * TODO: @@ -2488,13 +2494,13 @@ nccl_net_ofi_sendrecv_device_create(nccl_net_ofi_plugin_t *plugin, return NULL; } -static void get_hints(struct fi_info *hints, int req_gdr) +static void get_hints(struct fi_info *hints, int req_hmem) { hints->caps = FI_LOCAL_COMM | FI_REMOTE_COMM | FI_TAGGED | FI_MSG; hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_ENDPOINT; 
hints->domain_attr->mr_key_size = (size_t) ofi_nccl_mr_key_size(); - if (req_gdr) { + if (req_hmem) { hints->caps |= FI_HMEM; if (!cuda_flush) { hints->caps |= FI_RMA | FI_READ; @@ -2632,6 +2638,28 @@ int nccl_net_ofi_sendrecv_init(const char *provider_filter, goto error; } + if (!nccl_ofi_dmabuf_viable()) + goto no_dmabuf; + + get_hints(hints, true); + selected_api_version = FI_VERSION(1, 20); + ret = nccl_ofi_ofiutils_get_providers(provider_filter, selected_api_version, hints, + &provider_list, &num_providers); + if (ret == 0) { + /* The 1.20 API allows providers to use CUDA to + * support HMEM pointers, so just having HMEM doesn't + * tell us anything about the usability of CUDA + * pointers with NCCL. So leave the state unknown + * until we create an endpoint and try to disable + * CUDA + */ + NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, + "Using Libfabric 1.20 API, with DMA-BUF support"); + support_gdr = GDR_UNKNOWN; + goto found; + } + +no_dmabuf: get_hints(hints, true); selected_api_version = FI_VERSION(1, 18); ret = nccl_ofi_ofiutils_get_providers(provider_filter, selected_api_version, hints,