Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add DMA-BUF support #618

Merged
merged 6 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions .github/workflows/distcheck.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ jobs:
efainstallerdir: ALINUX2023
nvidiadistro: amzn2023
configmanager: dnf config-manager
cudapackages: cuda-cudart-devel-12-5 cuda-driver-devel-12-5
cudapackages: cuda-cudart-devel-12-6 cuda-crt-12-6

- container: amazonlinux:2
efainstallerdir: ALINUX2
nvidiadistro: rhel7
configmanager: yum-config-manager
cudapackages: cuda-cudart-devel-12-4 cuda-driver-devel-12-4
cudapackages: cuda-cudart-devel-12-4 cuda-crt-12-4

runs-on: ubuntu-latest
container: ${{ matrix.container }}
Expand Down Expand Up @@ -177,7 +177,12 @@ jobs:
- name: Install CUDA SDK
if: matrix.sdk == 'cuda'
run: |
sudo apt-get install -y nvidia-cuda-toolkit
sudo apt-get update -y && sudo apt-get install -y wget lsb-release
repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update -y
sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6

- name: Install Neuron SDK
if: matrix.sdk == 'neuron'
Expand Down Expand Up @@ -295,7 +300,12 @@ jobs:
- name: Install CUDA SDK
if: matrix.sdk == 'cuda'
run: |
sudo apt-get install -y nvidia-cuda-toolkit
sudo apt-get update -y && sudo apt-get install -y wget lsb-release
repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update -y
sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6

- name: Install Neuron SDK
if: matrix.sdk == 'neuron'
Expand Down Expand Up @@ -379,7 +389,12 @@ jobs:
- name: Install CUDA SDK
if: matrix.sdk == 'cuda'
run: |
sudo apt-get install -y nvidia-cuda-toolkit
sudo apt-get update -y && sudo apt-get install -y wget lsb-release
repo="ubuntu$(lsb_release -r | cut -d':' -f2 | xargs | sed 's/[.]//g')"
wget https://developer.download.nvidia.com/compute/cuda/repos/${repo}/$(uname -m)/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update -y
sudo apt-get install -y cuda-cudart-dev-12-6 cuda-crt-12-6

- name: Install Neuron SDK
if: matrix.sdk == 'neuron'
Expand Down
7 changes: 3 additions & 4 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,18 @@ AC_SEARCH_LIBS([log2], [m], [], [AC_MSG_ERROR([NCCL OFI Plugin requires the log2
# Checks for external packages
CHECK_PKG_LIBFABRIC([], [AC_MSG_ERROR([NCCL OFI Plugin could not find a working Libfabric install.])])

CHECK_PKG_NVTX()
CHECK_PKG_LTTNG()

have_device_interface=no
CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
[AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
[want_cuda=no])
have_device_interface=neuron])
CHECK_PKG_CUDA([have_device_interface=cuda])

AS_IF([test "${have_device_interface}" = "no"],
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])

CHECK_PKG_LTTNG()
CHECK_PKG_NVTX()

CHECK_PKG_HWLOC([],
[AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])

Expand Down
1 change: 1 addition & 0 deletions include/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ noinst_HEADERS = \
nccl_ofi_topo.h \
nccl_ofi_tuner.h \
nccl_ofi_ofiutils.h \
nccl_ofi_dmabuf.h \
nccl_ofi_tracepoint.h \
tracing_impl/lttng.h \
tracing_impl/nvtx.h \
Expand Down
45 changes: 4 additions & 41 deletions include/nccl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -396,18 +396,8 @@ struct nccl_net_ofi_send_comm {
* @return 0 on success
* non-zero on error
*/
int (*regMr)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size, int type,
void **mhandle);

/*
* @brief Register DMA memory region on send communicator (both Host and CUDA)
*
* This operation is not supported.
*
* @return Memory handle for data send operations
*/
int (*regMrDmaBuf)(nccl_net_ofi_send_comm_t *send_comm, void *data, size_t size,
bwbarrett marked this conversation as resolved.
Show resolved Hide resolved
int type, uint64_t offset, int fd, nccl_net_ofi_mr_handle_t **handle);
int (*regMr)(nccl_net_ofi_send_comm_t *send_comm, nccl_ofi_mr_ckey_ref ckey, int type,
void **mhandle);

/*
* @brief Deregister memory region on send communicator (both Host and CUDA)
Expand Down Expand Up @@ -439,18 +429,8 @@ struct nccl_net_ofi_recv_comm {
* @return 0 on success
* non-zero on error
*/
int (*regMr)(nccl_net_ofi_recv_comm_t *recv_comm, void *data, size_t size, int type,
void **mhandle);

/*
* @brief Register DMA memory region on recv communicator (both Host and CUDA)
*
* This operation is not supported.
*
* @return Memory handle for data recv operations
*/
int (*regMrDmaBuf)(nccl_net_ofi_recv_comm_t *recv_comm, void *data, size_t size,
int type, uint64_t offset, int fd, nccl_net_ofi_mr_handle_t **handle);
int (*regMr)(nccl_net_ofi_recv_comm_t *recv_comm, nccl_ofi_mr_ckey_ref ckey, int type,
void **mhandle);

/*
* @brief Deregister memory region on recv communicator (both Host and CUDA)
Expand Down Expand Up @@ -561,23 +541,6 @@ int nccl_net_ofi_plugin_fini(nccl_net_ofi_plugin_t *plugin);
*/
int nccl_net_ofi_info_properties(struct fi_info *nic_prov, int dev_id, int num_devices, nccl_ofi_properties_t *props);


/*
* @brief Register DMA buffer for recv comm. Unimplemented.
*/
int nccl_net_ofi_reg_mr_dma_buf_recv_comm(nccl_net_ofi_recv_comm_t *recv_comm,
void *data, size_t size,
int type, uint64_t offset, int fd,
nccl_net_ofi_mr_handle_t **handle);

/*
* @brief Register DMA buffer for send comm. Unimplemented.
*/
int nccl_net_ofi_reg_mr_dma_buf_send_comm(nccl_net_ofi_send_comm_t *send_comm,
void *data, size_t size,
int type, uint64_t offset, int fd,
nccl_net_ofi_mr_handle_t **handle);

/*
* @brief Allocate memory region for memory registration
*
Expand Down
58 changes: 41 additions & 17 deletions include/nccl_ofi_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,10 @@
#ifndef NCCL_OFI_CUDA_H_
#define NCCL_OFI_CUDA_H_


#ifdef __cplusplus
extern "C" {
#endif

#include <cuda.h>

int nccl_net_ofi_cuda_init(void);

/*
Expand All @@ -24,26 +21,53 @@ int nccl_net_ofi_cuda_init(void);
* @return Valid CUDA device ID on success
* -1 on error
* @return 0 on success
* non-zero on error
* -EINVAL on error
*/
int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);
int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);
aws-nslick marked this conversation as resolved.
Show resolved Hide resolved

extern CUresult (*nccl_net_ofi_cuDriverGetVersion)(int *driverVersion);
/*
* @brief wraps cudaFlushGPUDirectRDMAWrites() with default args.

extern CUresult (*nccl_net_ofi_cuPointerGetAttribute)(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
* @return 0 on success
* -1 on error
*/
int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void);

extern CUresult (*nccl_net_ofi_cuCtxGetDevice)(CUdevice *device);
extern CUresult (*nccl_net_ofi_cuDeviceGetCount)(int* count);
/*
* @brief wraps cudaGetDeviceCount()

#if CUDA_VERSION >= 11030
extern CUresult (*nccl_net_ofi_cuFlushGPUDirectRDMAWrites)(CUflushGPUDirectRDMAWritesTarget target,
CUflushGPUDirectRDMAWritesScope scope);
#else
extern void *nccl_net_ofi_cuFlushGPUDirectRDMAWrites;
#endif
* @return 0 on success
* -1 on error
*/
int nccl_net_ofi_cuda_get_num_devices(void);

/*
* @brief wraps cudaGetDevice()

* @return 0 on success
* -1 on error
*/
int nccl_net_ofi_cuda_get_active_device_idx(void);


/*
* @brief query CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED

* @return true if attr is fetched successfully and true.
* false otherwise.
*/
bool nccl_net_ofi_cuda_have_dma_buf_attr(void);

/*
* @brief query CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED

* @return true if attr is fetched successfully and true.
* false otherwise
*/
bool nccl_net_ofi_cuda_have_gdr_support_attr(void);

#ifdef __cplusplus
} // End extern "C"
} // End extern "C"
#endif

#endif // End NCCL_OFI_H_
#endif // End NCCL_OFI_CUDA_H_
18 changes: 18 additions & 0 deletions include/nccl_ofi_dmabuf.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
* Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
*/

#ifndef NCCL_OFI_DMABUF_H_
#define NCCL_OFI_DMABUF_H_

#ifdef __cplusplus
extern "C" {
#endif

int nccl_ofi_dmabuf_viable(void);

#ifdef __cplusplus
} // End extern "C"
#endif

#endif // NCCL_OFI_DMABUF_H_
Loading
Loading