aws · aws-nslick · Sep 25, 2024 · Sep 13, 2024 · Sep 20, 2024 · Sep 20, 2024
@@ -58,13 +58,13 @@ jobs:
             efainstallerdir: ALINUX2023
             nvidiadistro: amzn2023
             configmanager: dnf config-manager
-            cudapackages: cuda-cudart-devel-12-5 cuda-driver-devel-12-5
+            cudapackages: cuda-cudart-devel-12-6 cuda-crt-12-6
 
           - container: amazonlinux:2
             efainstallerdir: ALINUX2
             nvidiadistro: rhel7
             configmanager: yum-config-manager
-            cudapackages: cuda-cudart-devel-12-4 cuda-driver-devel-12-4
+            cudapackages: cuda-cudart-devel-12-4 cuda-crt-12-4
 
     runs-on: ubuntu-latest
     container: ${{ matrix.container }}
@@ -177,7 +177,12 @@ jobs:
       - name: Install CUDA SDK
         if: matrix.sdk == 'cuda'
         run: |
-          sudo apt-get install -y nvidia-cuda-toolkit
+          sudo apt-get update -y && sudo apt-get install -y wget lsb-release
+          repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
+          wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update -y
+          sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6
 
       - name: Install Neuron SDK
         if: matrix.sdk == 'neuron'
@@ -295,7 +300,12 @@ jobs:
       - name: Install CUDA SDK
         if: matrix.sdk == 'cuda'
         run: |
-          sudo apt-get install -y nvidia-cuda-toolkit
+          sudo apt-get update -y && sudo apt-get install -y wget lsb-release
+          repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
+          wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update -y
+          sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6
 
       - name: Install Neuron SDK
         if: matrix.sdk == 'neuron'
@@ -379,7 +389,12 @@ jobs:
       - name: Install CUDA SDK
         if: matrix.sdk == 'cuda'
         run: |
-          sudo apt-get install -y nvidia-cuda-toolkit
+          sudo apt-get update -y && sudo apt-get install -y wget lsb-release
+          repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
+          wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update -y
+          sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6
 
       - name: Install Neuron SDK
         if: matrix.sdk == 'neuron'

@@ -86,19 +86,18 @@ AC_SEARCH_LIBS([log2], [m], [], [AC_MSG_ERROR([NCCL OFI Plugin requires the log2
 # Checks for external packages
 CHECK_PKG_LIBFABRIC([], [AC_MSG_ERROR([NCCL OFI Plugin could not find a working Libfabric install.])])
 
-CHECK_PKG_NVTX()
-CHECK_PKG_LTTNG()
-
 have_device_interface=no
 CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
                         [AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
                         [want_cuda=no])
                   have_device_interface=neuron])
 CHECK_PKG_CUDA([have_device_interface=cuda])
-
 AS_IF([test "${have_device_interface}" = "no"],
       [AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
 
+CHECK_PKG_LTTNG()
+CHECK_PKG_NVTX()
+
 CHECK_PKG_HWLOC([],
 		[AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])
 

@@ -28,6 +28,7 @@ noinst_HEADERS = \
 	nccl_ofi_topo.h \
 	nccl_ofi_tuner.h \
 	nccl_ofi_ofiutils.h \
+	nccl_ofi_dmabuf.h \
 	nccl_ofi_tracepoint.h \
 	tracing_impl/lttng.h \
 	tracing_impl/nvtx.h \

@@ -396,18 +396,8 @@ struct nccl_net_ofi_send_comm {
 	 * @return	0 on success
 	 *		non-zero on error
 	 */
-	int (*regMr)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int type,
-			      void **mhandle);
-
-	/*
-	 * @brief	Register DMA memory region on send communicator (both Host and CUDA)
-	 *
-	 * This operation is not supported.
-	 *
-	 * @return	Memory handle for data send operations
-	 */
-	int (*regMrDmaBuf)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size,
-				    int type, uint64_t offset, int fd, nccl_net_ofi_mr_handle_t **handle);
+	int (*regMr)(nccl_net_ofi_send_comm_t *send_comm, nccl_ofi_mr_ckey_ref ckey, int type,
+				 void **mhandle);
 
 	/*
 	 * @brief	Deregister memory region on send communicator (both Host and CUDA)
@@ -439,18 +429,8 @@ struct nccl_net_ofi_recv_comm {
 	 * @return	0 on success
 	 *		non-zero on error
 	 */
-	int (*regMr)(nccl_net_ofi_recv_comm_t *recv_comm, void *data, size_t size, int type,
-			      void **mhandle);
-
-	/*
-	 * @brief	Register DMA memory region on recv communicator (both Host and CUDA)
-	 *
-	 * This operation is not supported.
-	 *
-	 * @return	Memory handle for data recv operations
-	 */
-	int (*regMrDmaBuf)(nccl_net_ofi_recv_comm_t *recv_comm, void *data, size_t size,
-				    int type, uint64_t offset, int fd, nccl_net_ofi_mr_handle_t **handle);
+	int (*regMr)(nccl_net_ofi_recv_comm_t *recv_comm, nccl_ofi_mr_ckey_ref ckey, int type,
+				 void **mhandle);
 
 	/*
 	 * @brief	Deregister memory region on recv communicator (both Host and CUDA)
@@ -561,23 +541,6 @@ int nccl_net_ofi_plugin_fini(nccl_net_ofi_plugin_t *plugin);
  */
 int nccl_net_ofi_info_properties(struct fi_info *nic_prov, int dev_id, int num_devices, nccl_ofi_properties_t *props);
 
-
-/*
- * @brief	Register DMA buffer for send comm. Unimplemented.
- */
-int nccl_net_ofi_reg_mr_dma_buf_recv_comm(nccl_net_ofi_recv_comm_t *recv_comm,
-					  void *data, size_t size,
-					  int type, uint64_t offset, int fd,
-					  nccl_net_ofi_mr_handle_t **handle);
-
-/*
- * @brief	Register DMA buffer for recv comm. Unimplemented.
- */
-int nccl_net_ofi_reg_mr_dma_buf_send_comm(nccl_net_ofi_send_comm_t *send_comm,
-					  void *data, size_t size,
-					  int type, uint64_t offset, int fd,
-					  nccl_net_ofi_mr_handle_t **handle);
-
 /*
  * @brief	Allocate memory region for memory registration
  *

@@ -6,13 +6,10 @@
 #ifndef NCCL_OFI_CUDA_H_
 #define NCCL_OFI_CUDA_H_
 
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include <cuda.h>
-
 int nccl_net_ofi_cuda_init(void);
 
 /*
@@ -24,26 +21,53 @@ int nccl_net_ofi_cuda_init(void);
  * @return	Valid CUDA device ID on success
  *		-1 on error
  * @return	0 on success
- *		non-zero on error
+ *		-EINVAL on error
  */
-int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);
+int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);
 
-extern CUresult (*nccl_net_ofi_cuDriverGetVersion)(int *driverVersion);
+/*
+ * @brief	wraps cudaFlushGPUDirectRDMAWrites() with default args.
 
-extern CUresult (*nccl_net_ofi_cuPointerGetAttribute)(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
+ * @return	0 on success
+ *		-1 on error
+ */
+int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void);
 
-extern CUresult (*nccl_net_ofi_cuCtxGetDevice)(CUdevice *device);
-extern CUresult (*nccl_net_ofi_cuDeviceGetCount)(int* count);
+/*
+ * @brief	wraps cudaGetDeviceCount()
 
-#if CUDA_VERSION >= 11030
-extern CUresult (*nccl_net_ofi_cuFlushGPUDirectRDMAWrites)(CUflushGPUDirectRDMAWritesTarget target,
-							   CUflushGPUDirectRDMAWritesScope scope);
-#else
-extern void *nccl_net_ofi_cuFlushGPUDirectRDMAWrites;
-#endif
+ * @return	number of CUDA devices on success
+ *		-1 on error
+ */
+int nccl_net_ofi_cuda_get_num_devices(void);
+
+/*
+ * @brief	wraps cudaGetDevice()
+
+ * @return	index of the active CUDA device on success
+ *		-1 on error
+ */
+int nccl_net_ofi_cuda_get_active_device_idx(void);
+
+
+/*
+ * @brief	query CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED
+
+ * @return	true if the attribute is fetched successfully and its value is true.
+ *		    false otherwise.
+ */
+bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
+
+/*
+ * @brief	query CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED
+
+ * @return	true if the attribute is fetched successfully and its value is true.
+ *		    false otherwise
+ */
+bool nccl_net_ofi_cuda_have_gdr_support_attr(void);
 
 #ifdef __cplusplus
-} // End extern "C"
+}  // End extern "C"
 #endif
 
-#endif // End NCCL_OFI_H_
+#endif  // End NCCL_OFI_CUDA_H_
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef NCCL_OFI_DMABUF_H_
+#define NCCL_OFI_DMABUF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int nccl_ofi_dmabuf_viable(void);
+
+#ifdef __cplusplus
+}  // End extern "C"
+#endif
+
+#endif  // NCCL_OFI_DMABUF_H_