From bbc3435fb42c86a0bd628dd435e21391dfb2a693 Mon Sep 17 00:00:00 2001 From: Wataru Ishida Date: Thu, 22 Feb 2024 07:14:02 +0000 Subject: [PATCH 1/3] chore: plugin update to nccl-2.20 https://github.com/Mellanox/nccl-rdma-sharp-plugins/commit/728d4e88226c6e3fdd832c6395f5231baaeea742 Signed-off-by: Wataru Ishida --- nccl_plugin/include/ibvwrap.h | 4 +- nccl_plugin/include/nccl.h | 47 +- nccl_plugin/include/net.h | 5 +- nccl_plugin/include/net_device.h | 32 + nccl_plugin/include/net_v6.h | 2 - nccl_plugin/include/net_v7.h | 121 +++ nccl_plugin/include/net_v8.h | 137 ++++ nccl_plugin/include/p2p_plugin.h | 73 +- nccl_plugin/include/param.h | 9 +- nccl_plugin/include/socket.h | 2 +- nccl_plugin/include/timer.h | 6 +- nccl_plugin/include/utils.h | 9 +- nccl_plugin/src/ib_plugin.c | 1262 +++++++++++++++++++---------- nccl_plugin/src/ibvwrap.c | 38 +- nccl_plugin/src/optcast_plugin.cc | 24 +- nccl_plugin/src/p2p_plugin.c | 282 +++++-- nccl_plugin/src/param.c | 10 +- nccl_plugin/src/socket_plugin.c | 75 +- nccl_plugin/src/utils.c | 67 +- 19 files changed, 1463 insertions(+), 742 deletions(-) create mode 100644 nccl_plugin/include/net_device.h create mode 100644 nccl_plugin/include/net_v7.h create mode 100644 nccl_plugin/include/net_v8.h diff --git a/nccl_plugin/include/ibvwrap.h b/nccl_plugin/include/ibvwrap.h index c79de37..afd4ebb 100644 --- a/nccl_plugin/include/ibvwrap.h +++ b/nccl_plugin/include/ibvwrap.h @@ -4,7 +4,7 @@ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * - * Copyright (c) 2015-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2015-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -59,6 +59,8 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); +ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); +ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr); ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); diff --git a/nccl_plugin/include/nccl.h b/nccl_plugin/include/nccl.h index 67736cf..a234af9 100644 --- a/nccl_plugin/include/nccl.h +++ b/nccl_plugin/include/nccl.h @@ -14,11 +14,11 @@ #endif #define NCCL_MAJOR 2 -#define NCCL_MINOR 15 -#define NCCL_PATCH 1 +#define NCCL_MINOR 20 +#define NCCL_PATCH 3 #define NCCL_SUFFIX "" -#define NCCL_VERSION_CODE 21510 +#define NCCL_VERSION_CODE 22003 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) #ifdef __cplusplus @@ -42,15 +42,24 @@ typedef enum { ncclSuccess = 0, ncclInProgress = 7, ncclNumResults = 8 } ncclResult_t; +#define NCCL_CONFIG_UNDEF_INT INT_MIN +#define NCCL_CONFIG_UNDEF_PTR NULL +#define NCCL_SPLIT_NOCOLOR -1 + /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ -typedef struct ncclConfig_v21400 { +typedef struct ncclConfig_v21700 { /* attributes that users should never touch. 
*/ size_t size; unsigned int magic; unsigned int version; /* attributes that users are able to customize. */ int blocking; + int cgaClusterSize; + int minCTAs; + int maxCTAs; + const char *netName; + int splitShare; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -59,9 +68,23 @@ typedef struct ncclConfig_v21400 { sizeof(ncclConfig_t), /* size */ \ 0xcafebeef, /* magic */ \ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ - 1 /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ + NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ + NCCL_CONFIG_UNDEF_PTR, /* netName */ \ + NCCL_CONFIG_UNDEF_INT /* splitShare */ \ } +/* NCCL malloc and free function for all types of NCCL optimizations + * (e.g. user buffer registration). The actual allocated size might + * be larger than requested due to granularity requirement. */ +ncclResult_t ncclMemAlloc(void** ptr, size_t size); +ncclResult_t pncclMemAlloc(void** ptr, size_t size); + +ncclResult_t ncclMemFree(void *ptr); +ncclResult_t pncclMemFree(void *ptr); + /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the * NCCL library @@ -119,6 +142,10 @@ ncclResult_t pncclCommAbort(ncclComm_t comm); const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); +/* Returns a human-readable message of the last error that occurred. 
*/ + const char* ncclGetLastError(ncclComm_t comm); + const char* pncclGetLastError(ncclComm_t comm); + /* Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); @@ -135,6 +162,16 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); + +/* Register CUDA buffer for zero-copy operation */ +ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); +ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); + +/* Deregister CUDA buffer */ +ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle); +ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle); + + /* Reduction operation selector */ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; typedef enum { ncclSum = 0, diff --git a/nccl_plugin/include/net.h b/nccl_plugin/include/net.h index b3ddb70..1e60bad 100644 --- a/nccl_plugin/include/net.h +++ b/nccl_plugin/include/net.h @@ -22,10 +22,9 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); +#include "net_v8.h" +#include "net_v7.h" #include "net_v6.h" #include "net_v5.h" -#include "net_v4.h" - -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 #endif // end include guard diff --git a/nccl_plugin/include/net_device.h b/nccl_plugin/include/net_device.h new file mode 100644 index 0000000..bd1a02f --- /dev/null +++ b/nccl_plugin/include/net_device.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2023-2023, NVIDIA CORPORATION. 
All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_DEVICE_H_ +#define NET_DEVICE_H_ + +#include "net_device.h" + +#define NCCL_NET_DEVICE_INVALID_VERSION 0x0 +#define NCCL_NET_MTU_SIZE 4096 + +// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin +// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. +#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 + +typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; + +typedef struct { + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + void* handle; + size_t size; + int needsProxyProgress; +} ncclNetDeviceHandle_v7_t; + +typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; +typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t; + +#endif diff --git a/nccl_plugin/include/net_v6.h b/nccl_plugin/include/net_v6.h index 5c87ca8..8e1d134 100644 --- a/nccl_plugin/include/net_v6.h +++ b/nccl_plugin/include/net_v6.h @@ -18,8 +18,6 @@ typedef struct { int maxRecvs; // Maximum number of grouped receives. }ncclNetProperties_v6_t; -typedef ncclNetProperties_v6_t ncclNetProperties_t; - typedef struct { // Name of the network (mainly for logs) const char* name; diff --git a/nccl_plugin/include/net_v7.h b/nccl_plugin/include/net_v7.h new file mode 100644 index 0000000..eef1f2e --- /dev/null +++ b/nccl_plugin/include/net_v7.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V7_H_ +#define NCCL_NET_V7_H_ + +#include "net_device.h" + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. 
Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. 
+ // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +// v7 struct for backwards compatibility +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
+ ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v7_t; + +#endif // end include guard diff --git a/nccl_plugin/include/net_v8.h b/nccl_plugin/include/net_v8.h new file mode 100644 index 0000000..f1bd56b --- /dev/null +++ b/nccl_plugin/include/net_v8.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V8_H_ +#define NCCL_NET_V8_H_ +#include "net_device.h" + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int speed; // Port speed in Mbps. + int port; // Port number. 
+ float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v8_t; + +typedef ncclNetProperties_v8_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + // Register/Deregister memory. 
Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v8_t; + + +typedef struct { + void* mhandle; + void* address; + uint32_t size; +} ncclNetSGE_v8_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. 
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v8_t; + + +#endif // end include guard diff --git a/nccl_plugin/include/p2p_plugin.h b/nccl_plugin/include/p2p_plugin.h index 7178fff..b859777 100644 --- a/nccl_plugin/include/p2p_plugin.h +++ b/nccl_plugin/include/p2p_plugin.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2016-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,8 +9,6 @@ #include #include -#define ENABLE_TIMER 0 -#include "timer.h" #include #include "nccl.h" @@ -31,13 +29,12 @@ typedef enum nccl_p2p_plugin { NCCL_P2P_IB, NCCL_P2P_UCX, NCCL_P2P_UCX_RMA, - NCCL_P2P_SOCKET, NCCL_P2P_LAST } nccl_p2p_plugin_t; struct ncclIbMr { uintptr_t addr; - int pages; + size_t pages; int refs; struct ibv_mr *mr; }; @@ -47,76 +44,86 @@ struct ncclIbMrCache { int capacity, population; }; +#define NCCL_IB_MAX_DEVS_PER_NIC 2 +#define MAX_MERGED_DEV_NAME (MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC +struct ncclIbMergedDev { + int ndevs; + int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs + int speed; + char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+' +} __attribute__((aligned(64))); + struct ncclIbRequest { - struct ncclIbVerbs* verbs; + struct ncclIbNetCommBase* base; int type; - int events; struct ncclSocket* sock; - struct ncclIbGidInfo* gidInfo; + int events[NCCL_IB_MAX_DEVS_PER_NIC]; + struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC]; int nreqs; union { struct { int size; void* data; - uint32_t lkey; + uint32_t lkeys[NCCL_IB_MAX_DEVS_PER_NIC]; int offset; } send; struct { - int sizes[NCCL_NET_IB_MAX_RECVS]; + int* sizes; } recv; }; }; -struct ncclIbVerbs { - int dev; - struct ibv_pd* pd; // duplicate of ncclIbDevs[dev].pd +// Retain local RoCE address for error logging +struct ncclIbGidInfo { + uint8_t link_layer; + union ibv_gid localGid; +}; + +typedef struct ncclIbNetCommDevBase { + int ibDevN; + struct ibv_pd* pd; struct ibv_cq* cq; uint64_t pad[1]; - struct ncclIbRequest reqs[MAX_REQUESTS]; -}; + struct ncclIbGidInfo gidInfo; +} ncclIbNetCommDevBase; typedef struct ncclIbDev { pthread_mutex_t lock; int device; uint64_t guid; - uint8_t port; + uint8_t portNum; uint8_t link; uint8_t isSharpDev; int 
speed; struct ibv_context* context; int pdRefs; struct ibv_pd* pd; - struct ncclIbVerbs verbs; char devName[MAXNAMESIZE]; char *pciPath; int realPort; int maxQp; struct ncclIbMrCache mrCache; int ar; // ADAPTIVE_ROUTING -} __attribute__((aligned(64))) nccl_ib_dev_t; + struct ibv_port_attr portAttr; +} __attribute__((aligned(64))) ncclIbDev; -#define MAX_IB_PORT 15 -struct userIbDev { - char devName[MAXNAMESIZE]; - uint16_t port_en; -}; #define MAX_IB_DEVS 32 +extern struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_DEVS]; extern struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; -extern struct ncclIbDev userIbDevs[MAX_IB_DEVS]; /* Detect whether GDR can work on a given NIC with the current CUDA device * Returns : * ncclSuccess : GDR works * ncclSystemError : no module or module loaded but not supported by GPU */ -ncclResult_t nccl_p2p_gdr_support(int dev); +ncclResult_t nccl_p2p_gdr_support(); ncclResult_t nccl_p2p_dmabuf_support(int dev); -ncclResult_t nccl_p2p_ib_pci_path(nccl_ib_dev_t *devs, int num_devs, char* dev_name, char** path, int* real_port); +ncclResult_t nccl_p2p_ib_pci_path(ncclIbDev *devs, int num_devs, char* dev_name, char** path, int* real_port); -ncclResult_t nccl_p2p_ib_get_properties(nccl_ib_dev_t *devs, int dev, ncclNetProperties_t* props); +ncclResult_t nccl_p2p_ib_get_properties(ncclIbDev *devs, int dev, ncclNetProperties_t* props); -ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction); +ncclResult_t nccl_p2p_ib_init(int *num_devs, ncclIbDev *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction); /* Convert value returtned by ibv_query_port to actual link width */ int nccl_p2p_ib_width(int width); @@ -126,14 +133,12 @@ int nccl_p2p_ib_speed(int speed); int64_t ncclParamSharpMaxComms(); +int64_t ncclParamIbMergeVfs(); + +int64_t 
ncclParamIbMergeNics(); + int ncclIbRelaxedOrderingCapable(void); -#ifdef __cplusplus -extern "C" { -#endif nccl_p2p_plugin_t nccl_p2p_get_plugin_type(); -#ifdef __cplusplus -} -#endif #endif diff --git a/nccl_plugin/include/param.h b/nccl_plugin/include/param.h index 0b27d0c..18f5c03 100644 --- a/nccl_plugin/include/param.h +++ b/nccl_plugin/include/param.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,14 +12,9 @@ const char* userHomeDir(); void setEnvFile(const char* fileName); void initEnv(); +const char *ncclGetEnv(const char *name); -#ifdef __cplusplus -extern "C" { -#endif void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); -#ifdef __cplusplus -} -#endif #define NCCL_PARAM(name, env, deftVal) \ int64_t ncclParam##name() { \ diff --git a/nccl_plugin/include/socket.h b/nccl_plugin/include/socket.h index e2e9a20..4c04ae1 100644 --- a/nccl_plugin/include/socket.h +++ b/nccl_plugin/include/socket.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/nccl_plugin/include/timer.h b/nccl_plugin/include/timer.h index 41fd8f2..7f4eb77 100644 --- a/nccl_plugin/include/timer.h +++ b/nccl_plugin/include/timer.h @@ -52,9 +52,9 @@ static double startTimes[8]; printf("\n"); \ } while (0); #else -#define TIME_START(index) while(0); -#define TIME_STOP(index) while(0); -#define TIME_CANCEL(index) while(0); +#define TIME_START(index) do {} while(0); +#define TIME_STOP(index) do {} while(0); +#define TIME_CANCEL(index) do {} while(0); #define TIME_PRINT(name) #endif #endif diff --git a/nccl_plugin/include/utils.h b/nccl_plugin/include/utils.h index 2934ae8..96ca62e 100644 --- a/nccl_plugin/include/utils.h +++ b/nccl_plugin/include/utils.h @@ -13,13 +13,7 @@ #define NCCL_STATIC_ASSERT(_cond, _msg) \ switch(0) {case 0:case (_cond):;} -#ifdef __cplusplus -extern "C" { -#endif - ncclResult_t ncclIbMalloc(void** ptr, size_t size); -#ifdef __cplusplus -} -#endif +ncclResult_t ncclIbMalloc(void** ptr, size_t size); ncclResult_t ncclRealloc(void** ptr, size_t old_size, size_t new_size); ncclResult_t getHostName(char* hostname, int maxlen); uint64_t getHostHash(); @@ -32,7 +26,6 @@ struct netIf { int parseStringList(const char* string, struct netIf* ifList, int maxList); int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact); -int readFileNumber(long *value, const char *filename_fmt, ...); const char *get_plugin_lib_path(); #endif diff --git a/nccl_plugin/src/ib_plugin.c b/nccl_plugin/src/ib_plugin.c index eda70ad..a1e663e 100644 --- a/nccl_plugin/src/ib_plugin.c +++ b/nccl_plugin/src/ib_plugin.c @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -13,9 +13,9 @@ #include #include #include + #define ENABLE_TIMER 0 #include "timer.h" - #include "p2p_plugin.h" #include "core.h" #include "socket.h" @@ -46,25 +46,11 @@ NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); -pthread_t ncclIbAsyncThread; -static void* ncclIbAsyncThreadMain(void* args) { - struct ibv_context* context = (struct ibv_context*)args; - while (1) { - struct ibv_async_event event; - if (ncclSuccess != wrap_ibv_get_async_event(context, &event)) { break; } - char *str; - if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; } - if (event.event_type != IBV_EVENT_COMM_EST) - WARN("NET/IB : Got async event : %s", str); - if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; } - } - return NULL; -} +static pthread_t ncclIbAsyncThread; // Determine whether RELAXED_ORDERING is enabled and possible int ncclIbRelaxedOrderingCapable(void) { int roMode = ncclParamIbPciRelaxedOrdering(); - ncclResult_t r = ncclInternalError; if (roMode == 1 || roMode == 2) { if (!IBV_ACCESS_RELAXED_ORDERING) { if(roMode == 1) @@ -79,12 +65,12 @@ int ncclIbRelaxedOrderingCapable(void) { NCCL_PARAM(IbDisable, "IBEXT_DISABLE", 0); NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1); +NCCL_PARAM(IbMergeNics, "IB_MERGE_NICS", 1); extern ncclDebugLogger_t pluginLogFunction; ncclResult_t ncclIbDevices(int* ndev) { - *ndev = 1; -// *ndev = ncclNIbDevs; + *ndev = ncclNIbDevs; return ncclSuccess; } @@ -93,29 +79,81 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) return nccl_p2p_ib_get_properties(ncclIbDevs, dev, props); } -static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) { - memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); +ncclResult_t ncclIbGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) +{ + 
ncclNetProperties_t props; + ncclResult_t ret = nccl_p2p_ib_get_properties(ncclIbDevs, dev, &props); + if (ret != ncclSuccess) return ret; + props_v7->name = props.name; + props_v7->pciPath = props.pciPath; + props_v7->guid = props.guid; + props_v7->ptrSupport = props.ptrSupport; + props_v7->speed = props.speed; + props_v7->latency = props.latency; + props_v7->port = props.port; + props_v7->maxComms = props.maxComms; + props_v7->maxRecvs = props.maxRecvs; + props_v7->netDeviceType = props.netDeviceType; + props_v7->netDeviceVersion = props.netDeviceVersion; return ncclSuccess; } +ncclResult_t ncclIbGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) +{ + ncclNetProperties_t props; + ncclResult_t ret = nccl_p2p_ib_get_properties(ncclIbDevs, dev, &props); + if (ret != ncclSuccess) return ret; + props_v6->name = props.name; + props_v6->pciPath = props.pciPath; + props_v6->guid = props.guid; + props_v6->ptrSupport = props.ptrSupport; + props_v6->speed = props.speed; + props_v6->latency = props.latency; + props_v6->port = props.port; + props_v6->maxComms = props.maxComms; + props_v6->maxRecvs = props.maxRecvs; + + return ncclSuccess; +}; + #define NCCL_IB_MAX_QPS 128 -typedef struct ncclIbQpInfo { +struct ncclIbQpInfo { + uint32_t qpn; + + // Fields needed for ece (enhanced connection establishment) + struct ibv_ece ece; + int ece_supported; + int devIndex; +}; + + +// Per-Dev connection metadata +typedef struct ncclIbDevInfo { uint32_t lid; uint8_t ib_port; + enum ibv_mtu mtu; uint8_t link_layer; uint8_t is_global; - uint32_t qpn[NCCL_IB_MAX_QPS]; // For RoCE and IB GRH uint64_t spn; uint64_t iid; - enum ibv_mtu mtu; // FIFO RDMA info uint32_t fifoRkey; + union ibv_gid remoteGid; +} ncclIbDevInfo; + + +// Struct containing everything needed to establish connections +typedef struct ncclIbConnectionMetadata { + struct ncclIbQpInfo qpInfo[NCCL_IB_MAX_QPS]; + struct ncclIbDevInfo devs[NCCL_IB_MAX_DEVS_PER_NIC]; + char devName[MAX_MERGED_DEV_NAME]; uint64_t 
fifoAddr; -} ncclIbQpInfo; + int ndevs; +} ncclIbConnectionMetadata; enum ncclIbCommState { ncclIbCommStateStart = 0, @@ -142,12 +180,7 @@ struct ncclIbHandle { struct ncclIbCommStage stage; // Used by the other side when connecting }; -// Retain local and remote RoCE addresses for error logging -struct ncclIbGidInfo { - uint8_t link_layer; - union ibv_gid localGid; - union ibv_gid remoteGid; -}; + #define NCCL_NET_IB_REQ_UNUSED 0 #define NCCL_NET_IB_REQ_SEND 1 @@ -164,57 +197,97 @@ struct ncclIbListenComm { struct ncclIbSendFifo { uint64_t addr; int size; - uint32_t rkey; + uint32_t rkeys[NCCL_IB_MAX_DEVS_PER_NIC]; uint32_t nreqs; uint32_t tag; uint64_t idx; + char padding[24]; }; +typedef struct ncclIbQp { + struct ibv_qp* qp; + int devIndex; + int remDevIdx; +} ncclIbQp; + +struct ncclIbRemSizesFifo { + int elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; + uint64_t fifoTail; + uint64_t addr; + uint32_t rkeys[NCCL_IB_MAX_DEVS_PER_NIC]; + uint32_t flags; + struct ibv_mr* mrs[NCCL_IB_MAX_DEVS_PER_NIC]; + struct ibv_sge sge; +}; + +// A per-dev struct for netIbSendComm +typedef struct ncclIbSendCommDev { + struct ncclIbNetCommDevBase base; + struct ibv_mr* fifoMr; +} __attribute__((aligned(8))) ncclIbSendCommDev; + + +// Wrapper to track an MR per-device, if needed +struct ncclIbMrHandle { + struct ibv_mr* mrs[NCCL_IB_MAX_DEVS_PER_NIC]; +}; + +typedef struct ncclIbNetCommBase { + int ndevs; + bool isSend; + struct ncclIbRequest reqs[MAX_REQUESTS]; + struct ncclIbQp qps[NCCL_IB_MAX_QPS]; + int nqps; + int qpIndex; + int devIndex; + struct ncclSocket sock; + int ready; + // Track necessary remDevInfo here + int nRemDevs; + struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC]; + } __attribute__((aligned(32))) ncclIbNetCommBase; + struct ncclIbSendComm { - struct ncclIbVerbs verbs; + struct ncclIbNetCommBase base; struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; - uint64_t fifoHead; + // Each dev correlates to a mergedIbDev + struct ncclIbSendCommDev 
devs[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; - struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1]; struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS]; - struct ncclSocket sock; - - int ready; - struct ibv_qp* qps[NCCL_IB_MAX_QPS]; - int nqps; - int qpIndex; - struct ibv_mr* fifoMr; - int ar; - struct ncclIbGidInfo gidInfo; + struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1]; + struct ncclIbRemSizesFifo remSizesFifo; + uint64_t fifoHead; + int ar; // Use adaptive routing when all merged devices have it enabled }; struct ncclIbGpuFlush { - int enabled; - int hostMem; struct ibv_mr* hostMr; struct ibv_sge sge; - struct ibv_qp* qp; + struct ncclIbQp qp; }; struct ncclIbRemFifo { struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; uint64_t fifoTail; uint64_t addr; - uint32_t rkey; uint32_t flags; - struct ibv_mr* mr; - struct ibv_sge sge; }; +struct ncclIbRecvCommDev { + struct ncclIbNetCommDevBase base; + struct ncclIbGpuFlush gpuFlush; + uint32_t fifoRkey; + struct ibv_mr* fifoMr; + struct ibv_sge fifoSge; + struct ibv_mr* sizesFifoMr; +} __attribute__((aligned(16))); + struct ncclIbRecvComm { - struct ncclIbVerbs verbs; + struct ncclIbNetCommBase base; + struct ncclIbRecvCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbRemFifo remFifo; - struct ncclSocket sock; - int ready; - struct ibv_qp* qps[NCCL_IB_MAX_QPS]; - int nqps; - int qpIndex; - struct ncclIbGpuFlush gpuFlush; - struct ncclIbGidInfo gidInfo; + int sizesFifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; + int gpuFlushHostMem; + int flushEnabled; }; ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { @@ -223,55 +296,65 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { // The SendFifo needs to be 32-byte aligned and each element needs // to be a 32-byte multiple, so that an entry does not get split and // written out of order when IB Relaxed Ordering is enabled + NCCL_STATIC_ASSERT((sizeof(struct ncclIbNetCommBase) % 32) == 0, 
"ncclIbNetCommBase size must be 32-byte multiple to ensure fifo is at proper offset"); NCCL_STATIC_ASSERT((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned"); NCCL_STATIC_ASSERT((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples"); + NCCL_STATIC_ASSERT((offsetof(struct ncclIbSendComm, sges) % 32) == 0, "sges must be 32-byte aligned"); + NCCL_STATIC_ASSERT((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32-byte aligned"); NCCL_STATIC_ASSERT((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned"); + return nccl_p2p_ib_init(&ncclNIbDevs, ncclIbDevs, ncclIbIfName, &ncclIbIfAddr, &ncclIbAsyncThread, logFunction); } NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1); -ncclResult_t ncclIbInitVerbs(int dev, struct ibv_context* ctx, struct ncclIbVerbs* verbs) { - verbs->dev = dev; +static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclIbNetCommDevBase* base) { + req->events[devIndex]++; + req->devBases[devIndex] = base; +} - pthread_mutex_lock(&ncclIbDevs[dev].lock); - if (0 == ncclIbDevs[dev].pdRefs++) { +ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base) { + base->ibDevN = ibDevN; + ncclIbDev* ibDev = ncclIbDevs + ibDevN; + pthread_mutex_lock(&ibDev->lock); + if (0 == ibDev->pdRefs++) { ncclResult_t res; - NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ncclIbDevs[dev].pd, ctx), res, failure); + NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ibDev->pd, ibDev->context), res, failure); if (0) { failure: - pthread_mutex_unlock(&ncclIbDevs[dev].lock); + pthread_mutex_unlock(&ibDev->lock); return res; } } - verbs->pd = ncclIbDevs[dev].pd; - pthread_mutex_unlock(&ncclIbDevs[dev].lock); + base->pd = ibDev->pd; + pthread_mutex_unlock(&ibDev->lock); // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv). 
- NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0)); + NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0)); + return ncclSuccess; } -ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) { +ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) { ncclResult_t res; - NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq)); + NCCLCHECK(wrap_ibv_destroy_cq(base->cq)); - pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); - if (0 == --ncclIbDevs[verbs->dev].pdRefs) { - NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[verbs->dev].pd), res, returning); + pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); + if (0 == --ncclIbDevs[base->ibDevN].pdRefs) { + NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[base->ibDevN].pd), res, returning); } res = ncclSuccess; returning: - pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); + pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); return res; } -ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) { +ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, struct ncclIbQp* qp) { struct ibv_qp_init_attr qpInitAttr; memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr)); - qpInitAttr.send_cq = verbs->cq; - qpInitAttr.recv_cq = verbs->cq; + qpInitAttr.send_cq = base->cq; + qpInitAttr.recv_cq = base->cq; qpInitAttr.qp_type = IBV_QPT_RC; // We might send 2 messages per send (RDMA and RDMA_WITH_IMM) qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS; @@ -279,31 +362,26 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce qpInitAttr.cap.max_send_sge = 1; qpInitAttr.cap.max_recv_sge = 1; qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? 
sizeof(struct ncclIbSendFifo) : 0; - NCCLCHECK(wrap_ibv_create_qp(qp, verbs->pd, &qpInitAttr)); + NCCLCHECK(wrap_ibv_create_qp(&qp->qp, base->pd, &qpInitAttr)); struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_INIT; qpAttr.pkey_index = ncclParamIbPkey(); qpAttr.port_num = ib_port; qpAttr.qp_access_flags = access_flags; - NCCLCHECK(wrap_ibv_modify_qp(*qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)); + NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)); return ncclSuccess; } -ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t dest_qp_num, struct ncclIbDevInfo* info) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; qpAttr.path_mtu = info->mtu; - qpAttr.dest_qp_num = qpn; + qpAttr.dest_qp_num = dest_qp_num; qpAttr.rq_psn = 0; qpAttr.max_dest_rd_atomic = 1; qpAttr.min_rnr_timer = 12; - qpAttr.ah_attr.is_global = 0; - qpAttr.ah_attr.dlid = info->lid; - qpAttr.ah_attr.sl = ncclParamIbSl(); - qpAttr.ah_attr.src_path_bits = 0; - qpAttr.ah_attr.port_num = info->ib_port; if (info->link_layer == IBV_LINK_LAYER_ETHERNET || info->is_global) { qpAttr.ah_attr.is_global = 1; qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn; @@ -312,7 +390,13 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* i qpAttr.ah_attr.grh.sgid_index = ncclParamIbGidIndex(); qpAttr.ah_attr.grh.hop_limit = 255; qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc(); + } else { + qpAttr.ah_attr.is_global = 0; + qpAttr.ah_attr.dlid = info->lid; } + qpAttr.ah_attr.sl = ncclParamIbSl(); + qpAttr.ah_attr.src_path_bits = 0; + qpAttr.ah_attr.port_num = info->ib_port; NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | 
IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } @@ -346,12 +430,11 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { return ncclSuccess; } -ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { +ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; - enum ncclSocketState conState; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; - struct ncclIbQpInfo remQpInfo; + struct ncclIbConnectionMetadata remMeta; int ready; *sendComm = NULL; @@ -365,94 +448,188 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { } NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm))); - NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1)); + NCCLCHECK(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1)); stage->comm = comm; stage->state = ncclIbCommStateConnect; - NCCLCHECK(ncclSocketConnect(&comm->sock)); + NCCLCHECK(ncclSocketConnect(&comm->base.sock)); ib_connect_check: /* since ncclSocketConnect is async, we must check if connection is complete */ - NCCLCHECK(ncclSocketReady(&comm->sock, &ready)); + NCCLCHECK(ncclSocketReady(&comm->base.sock, &ready)); if (!ready) return ncclSuccess; // IB Setup - struct ibv_context* ctx; - ctx = ncclIbDevs[dev].context; - NCCLCHECK(ncclIbInitVerbs(dev, ctx, &comm->verbs)); - uint8_t ib_port; - ib_port = ncclIbDevs[dev].port; - comm->nqps = ncclParamIbQpsPerConn(); - for (int q=0; qnqps; q++) { - NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q)); + struct ncclIbMergedDev* mergedDev; + mergedDev = ncclIbMergedDevs + dev; + comm->base.ndevs = mergedDev->ndevs; + comm->base.nqps = ncclParamIbQpsPerConn() * 
comm->base.ndevs; // We must have at least 1 qp per-device + comm->base.isSend = true; + + // Init PD, Ctx for each IB device + comm->ar = 1; // Set to 1 for logic + for (int i = 0; i < mergedDev->ndevs; i++) { + int ibDevN = mergedDev->devs[i]; + NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base)); + comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled } - comm->ar = ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - - // Send my QP Info to receiver through the socket. Hope this won't block. - struct ibv_port_attr portAttr; - NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr)); - struct ncclIbQpInfo qpInfo; - qpInfo.ib_port = ib_port; - for (int q=0; qnqps; q++) qpInfo.qpn[q] = comm->qps[q]->qp_num; - qpInfo.mtu = portAttr.active_mtu; - - // Prepare my fifo - NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); - qpInfo.fifoRkey = comm->fifoMr->rkey; - qpInfo.fifoAddr = (uint64_t)comm->fifo; - - // RoCE support - qpInfo.lid = portAttr.lid; - qpInfo.link_layer = comm->gidInfo.link_layer = portAttr.link_layer; - qpInfo.is_global = (ncclParamIbIsGlobal() -#if HAVE_DECL_IBV_QPF_GRH_REQUIRED - || (portAttr.flags & IBV_QPF_GRH_REQUIRED) + + struct ncclIbConnectionMetadata meta; + meta.ndevs = comm->base.ndevs; + + // Alternate QPs between devices + int devIndex; + devIndex = 0; + for (int q = 0; q < comm->base.nqps; q++) { + ncclIbSendCommDev* commDev = comm->devs + devIndex; + ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; + NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, comm->base.qps+q)); + comm->base.qps[q].devIndex = devIndex; + meta.qpInfo[q].qpn = comm->base.qps[q].qp->qp_num; + meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex; + + // Query ece capabilities (enhanced connection establishment) + 
NCCLCHECK(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); + devIndex = (devIndex + 1) % comm->base.ndevs; + } + + for (int i = 0; i < comm->base.ndevs; i++) { + ncclIbSendCommDev* commDev = comm->devs + i; + ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; + // Send my QP Info to receiver through the socket. Hope this won't block. + // TODO - I thought I queried this in init? + NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr)); + + // Write to the metadata struct via this pointer + ncclIbDevInfo* devInfo = meta.devs + i; + devInfo->ib_port = ibDev->portNum; + devInfo->mtu = ibDev->portAttr.active_mtu; + devInfo->lid = ibDev->portAttr.lid; + + // Prepare my fifo + NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); + devInfo->fifoRkey = commDev->fifoMr->rkey; + + // RoCE support + devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; + devInfo->is_global = (ncclParamIbIsGlobal() + #if HAVE_DECL_IBV_QPF_GRH_REQUIRED + || (ibDev->portAttr.flags & IBV_QPF_GRH_REQUIRED) #endif ); - if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND && !qpInfo.is_global) { // IB - for (int q=0; qnqps; q++) - INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid); - } else { // RoCE - NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &comm->gidInfo.localGid)); - qpInfo.spn = comm->gidInfo.localGid.global.subnet_prefix; - qpInfo.iid = comm->gidInfo.localGid.global.interface_id; - for (int q=0; qnqps; q++) - INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); + + if (devInfo->link_layer == IBV_LINK_LAYER_ETHERNET || devInfo->is_global) { + 
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &commDev->base.gidInfo.localGid)); + devInfo->spn = commDev->base.gidInfo.localGid.global.subnet_prefix; + devInfo->iid = commDev->base.gidInfo.localGid.global.interface_id; + } + + if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB + for (int q = 0; q < comm->base.nqps; q++) { + // Print just the QPs for this dev + if (comm->base.qps[q].devIndex == i) + INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d fifoRkey=0x%x fifoLkey=0x%x", + comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", + dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, devInfo->fifoRkey, commDev->fifoMr->lkey); + } + } else { // RoCE + for (int q = 0; q < comm->base.nqps; q++) { + // Print just the QPs for this dev + if (comm->base.qps[q].devIndex == i) + INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", + comm->base.ndevs > 2 ? 
"NCCL MergedDev" : "NCCL Dev", dev, + commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, ncclParamIbGidIndex(), + devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey); + } + } } + meta.fifoAddr = (uint64_t)comm->fifo; + strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); + stage->state = ncclIbCommStateSend; stage->offset = 0; - NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(qpInfo))); - memcpy(stage->buffer, &qpInfo, sizeof(qpInfo)); + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(meta))); + + memcpy(stage->buffer, &meta, sizeof(meta)); ib_send: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset)); - if (stage->offset != sizeof(qpInfo)) return ncclSuccess; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset)); + if (stage->offset != sizeof(meta)) return ncclSuccess; + stage->state = ncclIbCommStateConnecting; stage->offset = 0; // Clear the staging buffer for re-use - memset(stage->buffer, 0, sizeof(qpInfo)); + memset(stage->buffer, 0, sizeof(meta)); ib_connect: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, stage->buffer, sizeof(ncclIbQpInfo), &stage->offset)); - if (stage->offset != sizeof(remQpInfo)) return ncclSuccess; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset)); + if (stage->offset != sizeof(remMeta)) return ncclSuccess; + + memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata)); + + comm->base.nRemDevs = remMeta.ndevs; + if (comm->base.nRemDevs != comm->base.ndevs) { + mergedDev = ncclIbMergedDevs + dev; + WARN("NET/IB : Local mergedDev=%s has a different number of devices=%d as remoteDev=%s nRemDevs=%d", + mergedDev->devName, comm->base.ndevs, 
remMeta.devName, comm->base.nRemDevs); + } + + int link_layer; + link_layer = remMeta.devs[0].link_layer; + for (int i = 1; i < remMeta.ndevs; i++) { + if (remMeta.devs[i].link_layer != link_layer) { + WARN("NET/IB : Can't merge net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d", + i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer); + return ncclInternalError; + } + } + + // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. + for (int i = 0; i < remMeta.ndevs; i++) { + comm->base.remDevs[i] = remMeta.devs[i]; + comm->base.remDevs[i].remoteGid.global.interface_id = comm->base.remDevs[i].iid; + comm->base.remDevs[i].remoteGid.global.subnet_prefix = comm->base.remDevs[i].spn; + + // Retain remote sizes fifo info and prepare RDMA ops + comm->remSizesFifo.rkeys[i] = remMeta.devs[i].fifoRkey; + comm->remSizesFifo.addr = remMeta.fifoAddr; + } + + for (int i=0; i < comm->base.ndevs; i++) { + NCCLCHECK(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); + } + comm->base.nRemDevs = remMeta.ndevs; + for (int q = 0; q < comm->base.nqps; q++) { + struct ncclIbQpInfo* remQpInfo = remMeta.qpInfo + q; + struct ncclIbDevInfo* remDevInfo = remMeta.devs + remQpInfo->devIndex; + + // Assign per-QP remDev + comm->base.qps[q].remDevIdx = remQpInfo->devIndex; - memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo)); + struct ibv_qp* qp = comm->base.qps[q].qp; + if (remQpInfo->ece_supported && remQpInfo->ece_supported) + NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported)); - comm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn; - comm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid; - for (int q=0; qnqps; q++) { - struct ibv_qp* qp = comm->qps[q]; - NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo)); + 
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo->qpn, remDevInfo)); NCCLCHECK(ncclIbRtsQp(qp)); } - comm->ready = 1; + if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE + for (int q = 0; q < comm->base.nqps; q++) { + struct ncclIbQp* qp = comm->base.qps + q; + int ibDevN = comm->devs[qp->devIndex].base.ibDevN; + struct ncclIbDev* ibDev = ncclIbDevs + ibDevN; + INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}", + ibDevN, ibDev->portNum, remMeta.qpInfo[q].qpn, remMeta.qpInfo[q].ece_supported, remMeta.qpInfo[q].ece.vendor_id, remMeta.qpInfo[q].ece.options, remMeta.qpInfo[q].ece.comp_mask); + } + } + comm->base.ready = 1; stage->state = ncclIbCommStateConnected; stage->offset = 0; ib_send_ready: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, &comm->ready, sizeof(int), &stage->offset)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset)); if (stage->offset != sizeof(int)) return ncclSuccess; free(stage->buffer); @@ -462,9 +639,14 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { return ncclSuccess; } +ncclResult_t ncclIbConnect_v6(int dev, void* opaqueHandle, void** sendComm) { + ncclNetDeviceHandle_v7_t* handle = NULL; + return ncclIbConnect(dev, opaqueHandle, sendComm, &handle); +} + NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); -ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { +ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm; struct ncclIbCommStage* stage = &lComm->stage; struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; @@ -483,116 +665,177 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm))); stage->comm = rComm; stage->state = ncclIbCommStateAccept; - 
NCCLCHECK(ncclSocketInit(&rComm->sock, NULL, NCCL_SOCKET_MAGIC, ncclSocketTypeUnknown, NULL, 0)); - NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock)); + NCCLCHECK(ncclSocketInit(&rComm->base.sock, NULL, NCCL_SOCKET_MAGIC, ncclSocketTypeUnknown, NULL, 0)); + NCCLCHECK(ncclSocketAccept(&rComm->base.sock, &lComm->sock)); ib_accept_check: - NCCLCHECK(ncclSocketReady(&rComm->sock, &ready)); + NCCLCHECK(ncclSocketReady(&rComm->base.sock, &ready)); if (!ready) return ncclSuccess; - struct ncclIbQpInfo remQpInfo; + struct ncclIbConnectionMetadata remMeta; stage->state = ncclIbCommStateRecv; stage->offset = 0; - NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo))); + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta))); ib_recv: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset)); - if (stage->offset != sizeof(remQpInfo)) return ncclSuccess; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset)); + if (stage->offset != sizeof(remMeta)) return ncclSuccess; /* copy back the received info */ - memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo)); - - rComm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn; - rComm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid; + memcpy(&remMeta, stage->buffer, sizeof(struct ncclIbConnectionMetadata)); + + // IB setup + // Pre-declare variables because of goto + struct ncclIbMergedDev* mergedDev; + struct ncclIbDev* ibDev; + int ibDevN; + struct ncclIbRecvCommDev* rCommDev; + struct ncclIbDevInfo* remDevInfo; + struct ncclIbQp* qp; + + mergedDev = ncclIbMergedDevs + lComm->dev; + rComm->base.ndevs = mergedDev->ndevs; + rComm->base.nqps = ncclParamIbQpsPerConn() * rComm->base.ndevs; // We must have at least 1 qp per-device + rComm->base.isSend = false; + + rComm->base.nRemDevs = remMeta.ndevs; + if (rComm->base.nRemDevs != rComm->base.ndevs) { + WARN("NET/IB : Local 
mergedDev %s has a different number of devices=%d as remote %s %d", + mergedDev->devName, rComm->base.ndevs, remMeta.devName, rComm->base.nRemDevs); + } - // IB setup - struct ibv_context* ctx; - uint8_t ib_port; - ctx = ncclIbDevs[lComm->dev].context; - ib_port = ncclIbDevs[lComm->dev].port; - struct ibv_port_attr portAttr; - NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr)); - NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &rComm->gidInfo.localGid)); - - // QP Creation - NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs)); - rComm->nqps = ncclParamIbQpsPerConn(); - for (int q=0; qnqps; q++) { - NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, rComm->qps+q)); + // Metadata to send back to requestor (sender) + struct ncclIbConnectionMetadata meta; + for (int i = 0; i < rComm->base.ndevs; i++) { + rCommDev = rComm->devs + i; + ibDevN = mergedDev->devs[i]; + NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base)); + ibDev = ncclIbDevs + ibDevN; + NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr)); + NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &rCommDev->base.gidInfo.localGid)); } - // Adjust the MTU - remQpInfo.mtu = (enum ibv_mtu)MIN(remQpInfo.mtu, portAttr.active_mtu); + // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. 
+ for (int i = 0; i < remMeta.ndevs; i++) { + rComm->base.remDevs[i] = remMeta.devs[i]; + rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].iid; + rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].spn; + } - // Setup QP - for (int q=0; qnqps; q++) { - struct ibv_qp* qp = rComm->qps[q]; - NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo)); - NCCLCHECK(ncclIbRtsQp(qp)); + // Stripe QP creation across merged devs + // Make sure to get correct remote peer dev and QP info + int remDevIndex; + int devIndex; + devIndex = 0; + for (int q = 0; q < rComm->base.nqps; q++) { + remDevIndex = remMeta.qpInfo[q].devIndex; + remDevInfo = remMeta.devs + remDevIndex; + qp = rComm->base.qps+q; + rCommDev = rComm->devs + devIndex; + qp->remDevIdx = remDevIndex; + + // Local ibDevN + ibDevN = rComm->devs[devIndex].base.ibDevN; + ibDev = ncclIbDevs + ibDevN; + NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, qp)); + qp->devIndex = devIndex; + devIndex = (devIndex + 1) % rComm->base.ndevs; + + // Set the ece (enhanced connection establishment) on this QP before RTR + if (remMeta.qpInfo[q].ece_supported) { + NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); + + // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) + // Store this in our own qpInfo for returning to the requestor + if (meta.qpInfo[q].ece_supported) + NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); + } + NCCLCHECK(ncclIbRtrQp(qp->qp, remMeta.qpInfo[q].qpn, remDevInfo)); + NCCLCHECK(ncclIbRtsQp(qp->qp)); } - // Retain remote fifo info and prepare my RDMA ops - rComm->remFifo.rkey = remQpInfo.fifoRkey; - rComm->remFifo.addr = remQpInfo.fifoAddr; - NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, 
IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); - rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey; - if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE; - - // Allocate Flush dummy buffer for GPU Direct RDMA - rComm->gpuFlush.enabled = ((nccl_p2p_gdr_support(lComm->dev) == ncclSuccess) || nccl_p2p_dmabuf_support(lComm->dev) == ncclSuccess) && - (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0; - if (rComm->gpuFlush.enabled) { - NCCLCHECK(wrap_ibv_reg_mr(&rComm->gpuFlush.hostMr, rComm->verbs.pd, &rComm->gpuFlush.hostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE)); - rComm->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlush.hostMem; - rComm->gpuFlush.sge.length = 1; - rComm->gpuFlush.sge.lkey = rComm->gpuFlush.hostMr->lkey; - NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp)); - struct ncclIbQpInfo localQpInfo; - localQpInfo.lid=portAttr.lid; - localQpInfo.link_layer=portAttr.link_layer; - localQpInfo.ib_port=ib_port; - localQpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix; - localQpInfo.iid=rComm->gidInfo.localGid.global.interface_id; - localQpInfo.is_global=(ncclParamIbIsGlobal() + rComm->flushEnabled = ((nccl_p2p_gdr_support() == ncclSuccess || nccl_p2p_dmabuf_support(lComm->dev) == ncclSuccess) + && (ncclParamIbGdrFlushDisable() == 0)) ? 
1 : 0; + + for (int i = 0; i < mergedDev->ndevs; i++) { + rCommDev = rComm->devs + i; + ibDevN = rCommDev->base.ibDevN; + ibDev = ncclIbDevs + ibDevN; + + // Retain remote fifo info and prepare my RDMA ops + rCommDev->fifoRkey = remMeta.devs[i].fifoRkey; + rComm->remFifo.addr = remMeta.fifoAddr; + NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); + rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey; + if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE; + + // Allocate Flush dummy buffer for GPU Direct RDMA + if (rComm->flushEnabled) { + NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE)); + rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem; + rCommDev->gpuFlush.sge.length = 1; + rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey; + NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rCommDev->gpuFlush.qp)); + struct ncclIbDevInfo devInfo; + devInfo.lid = ibDev->portAttr.lid; + devInfo.link_layer = ibDev->portAttr.link_layer; + devInfo.ib_port = ibDev->portNum; + devInfo.spn = rCommDev->base.gidInfo.localGid.global.subnet_prefix; + devInfo.iid = rCommDev->base.gidInfo.localGid.global.interface_id; + devInfo.is_global = (ncclParamIbIsGlobal() #if HAVE_DECL_IBV_QPF_GRH_REQUIRED - || (portAttr.flags & IBV_QPF_GRH_REQUIRED) + || (ibDev->portAttr.flags & IBV_QPF_GRH_REQUIRED) #endif ); - localQpInfo.mtu=portAttr.active_mtu; - NCCLCHECK(ncclIbRtrQp(rComm->gpuFlush.qp, rComm->gpuFlush.qp->qp_num, &localQpInfo)); - NCCLCHECK(ncclIbRtsQp(rComm->gpuFlush.qp)); - } + devInfo.mtu = ibDev->portAttr.active_mtu; + NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo)); + 
NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp)); + } - // Fill Handle - struct ncclIbQpInfo qpInfo; - qpInfo.lid=portAttr.lid; - qpInfo.link_layer= rComm->gidInfo.link_layer = portAttr.link_layer; - qpInfo.ib_port=ib_port; - for (int q=0; qnqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num; - qpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix; - qpInfo.iid=rComm->gidInfo.localGid.global.interface_id; - qpInfo.is_global=(ncclParamIbIsGlobal() + // Fill Handle + meta.devs[i].lid = ibDev->portAttr.lid; + meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; + meta.devs[i].ib_port = ibDev->portNum; + meta.devs[i].spn = rCommDev->base.gidInfo.localGid.global.subnet_prefix; + meta.devs[i].iid = rCommDev->base.gidInfo.localGid.global.interface_id; + meta.devs[i].is_global = (ncclParamIbIsGlobal() #if HAVE_DECL_IBV_QPF_GRH_REQUIRED - || (portAttr.flags & IBV_QPF_GRH_REQUIRED) + || (ibDev->portAttr.flags & IBV_QPF_GRH_REQUIRED) #endif ); - qpInfo.mtu=remQpInfo.mtu; + + // Adjust the MTU + remMeta.devs[i].mtu = (enum ibv_mtu)MIN(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu); + meta.devs[i].mtu = remMeta.devs[i].mtu; + + // Prepare sizes fifo + NCCLCHECK(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); + meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey; + } + meta.fifoAddr = (uint64_t)rComm->sizesFifo; + + for (int q = 0; q < rComm->base.nqps; q++) { + meta.qpInfo[q].qpn = rComm->base.qps[q].qp->qp_num; + meta.qpInfo[q].devIndex = rComm->base.qps[q].devIndex; + } + + meta.ndevs = rComm->base.ndevs; + strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); stage->state = ncclIbCommStateSend; stage->offset = 0; if (stage->buffer) free(stage->buffer); - NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo))); - memcpy(stage->buffer, &qpInfo, 
sizeof(struct ncclIbQpInfo)); - + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata))); + memcpy(stage->buffer, &meta, sizeof(struct ncclIbConnectionMetadata)); ib_send: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset)); - if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset)); + if (stage->offset < sizeof(struct ncclIbConnectionMetadata)) return ncclSuccess; stage->offset = 0; stage->state = ncclIbCommStatePendingReady; ib_recv_ready: - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, &rComm->ready, sizeof(int), &stage->offset)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset)); if (stage->offset != sizeof(int)) return ncclSuccess; free(stage->buffer); @@ -606,14 +849,20 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { return ncclSuccess; } -ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) { +ncclResult_t ncclIbAccept_v6(void* listenComm, void** recvComm) { + ncclNetDeviceHandle_v7_t* handle = NULL; + return ncclIbAccept(listenComm, recvComm, &handle); +} + +ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbRequest** req) { for (int i=0; ireqs+i; + struct ncclIbRequest* r = base->reqs+i; if (r->type == NCCL_NET_IB_REQ_UNUSED) { - r->verbs = verbs; - r->events = 1; + r->base = base; r->sock = NULL; - r->gidInfo = NULL; + r->devBases[0] = NULL; + r->devBases[1] = NULL; + r->events[0] = r->events[1] = 0; *req = r; return ncclSuccess; } @@ -622,6 +871,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** *req = NULL; return ncclInternalError; } + ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) { r->type = 
NCCL_NET_IB_REQ_UNUSED; return ncclSuccess; @@ -629,22 +879,17 @@ ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) { ncclResult_t ncclIbTest(void* request, int* done, int* size); -/* DMA-BUF support */ -ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { - NCCL_STATIC_ASSERT(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset") - assert(size > 0); - +ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, size_t size, int type, uint64_t offset, int fd, struct ibv_mr** mhandle) { static __thread uintptr_t pageSize = 0; if (pageSize == 0) pageSize = sysconf(_SC_PAGESIZE); - struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; - struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; + struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache; uintptr_t addr = (uintptr_t)data & -pageSize; size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; ncclResult_t res; - pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); + pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); for (int slot=0; /*true*/; slot++) { - if (slot == cache->population) { // didn't find in cache + if (slot == cache->population || addr < cache->slots[slot].addr) { // didn't find in cache if (cache->population == cache->capacity) { // must grow cache cache->capacity = cache->capacity < 32 ? 
32 : 2*cache->capacity; NCCLCHECKGOTO(ncclRealloc((void **)&cache->slots, sizeof(struct ncclIbMr)*cache->population, sizeof(struct ncclIbMr)*cache->capacity), res, returning); @@ -655,47 +900,75 @@ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING; if (fd != -1) { /* DMA-BUF support */ - NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, verbs->pd, offset, pages*pageSize, addr, fd, flags), res, returning); + NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, returning); } else { if (ncclIbRelaxedOrderingEnabled) { // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support - NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, addr, flags), res, returning); + NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, base->pd, (void*)addr, pages*pageSize, addr, flags), res, returning); } else { - NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning); + NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, base->pd, (void*)addr, pages*pageSize, flags), res, returning); } } - INFO(NCCL_ALL,"regAddr %llx size %lld rkey %x fd %d type %s", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey, fd, type == NCCL_PTR_HOST ? 
"host" : "gpu"); - cache->population += 1; + TRACE(NCCL_INIT|NCCL_NET,"regAddr=0x%lx size=%lld rkey=0x%x lkey=0x%x fd=%d", (unsigned long)addr, (long long)pages*pageSize, mr->rkey, mr->lkey, fd); + if (slot != cache->population) memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclIbMr)); cache->slots[slot].addr = addr; cache->slots[slot].pages = pages; cache->slots[slot].refs = 1; cache->slots[slot].mr = mr; + cache->population += 1; *mhandle = (void*)mr; res = ncclSuccess; goto returning; - } - else if (cache->slots[slot].addr == addr && cache->slots[slot].pages == pages) { + } else if ((addr >= cache->slots[slot].addr) && + ((addr-cache->slots[slot].addr)/pageSize+pages) <= cache->slots[slot].pages) { cache->slots[slot].refs += 1; - *mhandle = (void*)cache->slots[slot].mr; + *mhandle = cache->slots[slot].mr; res = ncclSuccess; goto returning; } } returning: - pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); + pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); return res; } -ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { +struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, int devIndex) { + if (base->isSend) { + struct ncclIbSendComm* sComm = (struct ncclIbSendComm*) base; + return &sComm->devs[devIndex].base; + } else { + struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*) base; + return &rComm->devs[devIndex].base; + } +} + +/* DMA-BUF support */ +ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { + assert(size > 0); + struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; + struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle)); + for (int i = 0; i < base->ndevs; i++) { + // Each ncclIbNetCommDevBase is at different offset in send and recv netComms + struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, 
i); + NCCLCHECK(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i)); + } + *mhandle = (void*) mhandleWrapper; + return ncclSuccess; +} + +ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { + return ncclIbRegMrDmaBuf(comm, data, size, type, 0ULL, -1, mhandle); +} + +ncclResult_t ncclIbRegMr_v7(void* comm, void* data, int size, int type, void** mhandle) { return ncclIbRegMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle); } -ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { - struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; - struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; +ncclResult_t ncclIbDeregMrInternal(ncclIbNetCommDevBase* base, struct ibv_mr* mhandle) { + struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache; ncclResult_t res; - pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); + pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); for (int i=0; i < cache->population; i++) { if (mhandle == cache->slots[i].mr) { if (0 == --cache->slots[i].refs) { @@ -705,7 +978,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { cache->slots = NULL; cache->capacity = 0; } - NCCLCHECKGOTO(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle), res, returning); + NCCLCHECKGOTO(wrap_ibv_dereg_mr(mhandle), res, returning); } res = ncclSuccess; goto returning; @@ -714,11 +987,23 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population); res = ncclInternalError; returning: - pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); + pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); return res; } -NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 1); +ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { + struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; + struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; + for (int i = 0; i < 
base->ndevs; i++) { + // Each ncclIbNetCommDevBase is at different offset in send and recv netComms + struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i); + NCCLCHECK(ncclIbDeregMrInternal(devComm, mhandleWrapper->mrs[i])); + } + free(mhandleWrapper); + return ncclSuccess; +} + +NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; @@ -727,21 +1012,18 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; uint64_t wr_id = 0ULL; - for (int r=0; rwrs+r; memset(wr, 0, sizeof(struct ibv_send_wr)); struct ibv_sge* sge = comm->sges+r; sge->addr=(uintptr_t)reqs[r]->send.data; - sge->lkey=reqs[r]->send.lkey; wr->opcode = IBV_WR_RDMA_WRITE; wr->send_flags = 0; wr->wr.rdma.remote_addr = slots[r].addr; - wr->wr.rdma.rkey = slots[r].rkey; - wr->next = wr+1; - wr_id += (reqs[r] - comm->verbs.reqs) << (r*8); + wr->next = wr + 1; + wr_id += (reqs[r] - comm->base.reqs) << (r*8); } // Write size as immediate data. In the case of multi-send, only write @@ -750,13 +1032,10 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { if (nreqs == 1) { immData = reqs[0]->send.size; } else { - if (nreqs > 32) { - WARN("Cannot store sizes of %d requests in a 32-bits field", nreqs); - return ncclInternalError; - } - for (int r=0; rsend.size ? 1 : 0) << r; - } + int* sizes = comm->remSizesFifo.elems[slot]; + for (int r=0; rsend.size; + comm->remSizesFifo.sge.addr = (uint64_t)sizes; + comm->remSizesFifo.sge.length = nreqs*sizeof(int); } struct ibv_send_wr* lastWr = comm->wrs+nreqs-1; @@ -766,6 +1045,12 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { // completion. 
lastWr++; memset(lastWr, 0, sizeof(struct ibv_send_wr)); + if (nreqs > 1) { + // Write remote sizes Fifo + lastWr->wr.rdma.remote_addr = comm->remSizesFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(int); + lastWr->num_sge = 1; + lastWr->sg_list = &comm->remSizesFifo.sge; + } } lastWr->wr_id = wr_id; lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; @@ -775,23 +1060,40 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work const int align = 128; - const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1; - for (int q=0; qbase.nqps : comm->base.ndevs; + for (int i = 0; i < nqps; i++) { + int qpIndex = comm->base.qpIndex; + ncclIbQp* qp = comm->base.qps + qpIndex; + int devIndex = qp->devIndex; for (int r=0; rdevs[devIndex].base); + + // Select proper rkey (needed even for 0-size send) + comm->wrs[r].wr.rdma.rkey = slots[r].rkeys[qp->remDevIdx]; + int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align; int length = MIN(reqs[r]->send.size-reqs[r]->send.offset, chunkSize); if (length <= 0) { comm->wrs[r].sg_list = NULL; comm->wrs[r].num_sge = 0; } else { + // Select proper lkey + comm->sges[r].lkey = reqs[r]->send.lkeys[devIndex]; comm->sges[r].length = length; comm->wrs[r].sg_list = comm->sges+r; comm->wrs[r].num_sge = 1; } } + + if (nreqs > 1) { + // Also make sure lastWr writes remote sizes using the right lkey + comm->remSizesFifo.sge.lkey = comm->remSizesFifo.mrs[devIndex]->lkey; + lastWr->wr.rdma.rkey = comm->remSizesFifo.rkeys[devIndex]; + } + struct ibv_send_wr* bad_wr; - NCCLCHECK(wrap_ibv_post_send(comm->qps[comm->qpIndex], comm->wrs, &bad_wr)); - comm->qpIndex = (comm->qpIndex+1)%comm->nqps; + NCCLCHECK(wrap_ibv_post_send(qp->qp, comm->wrs, &bad_wr)); for (int r=0; rsend.size, nqps), align) * align; @@ -799,6 +1101,9 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { comm->sges[r].addr += chunkSize; 
comm->wrs[r].wr.rdma.remote_addr += chunkSize; } + + // Select the next qpIndex + comm->base.qpIndex = (comm->base.qpIndex+1) % comm->base.nqps; } return ncclSuccess; @@ -806,16 +1111,16 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; - if (comm->ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->ready == 0"); return ncclInternalError; } - if (comm->ready == 0) { *request = NULL; return ncclSuccess; } + if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } + if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } - struct ibv_mr* mr = (struct ibv_mr*)mhandle; + struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; // Wait for the receiver to have posted the corresponding receive int nreqs = 0; volatile struct ncclIbSendFifo* slots; - int slot = (comm->fifoHead)%MAX_REQUESTS; + int slot = (comm->fifoHead) % MAX_REQUESTS; struct ncclIbRequest** reqs = comm->fifoReqs[slot]; slots = comm->fifo[slot]; uint64_t idx = comm->fifoHead+1; @@ -827,36 +1132,45 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh for (int r=0; r slots[r].size) { - char line[SOCKET_NAME_MAXLEN + 1]; - union ncclSocketAddress addr; - ncclSocketGetAddr(&comm->sock, &addr); - WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d", - r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line, 1), size, slots[r].size); - return ncclInvalidUsage; - } // plus any potential programming errors - else if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) { + if (size > slots[r].size) size = slots[r].size; + // Sanity checks + if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkeys[0] == 0) { char line[SOCKET_NAME_MAXLEN 
+ 1]; union ncclSocketAddress addr; - ncclSocketGetAddr(&comm->sock, &addr); - WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x", - r, nreqs, tag, ncclSocketToString(&addr, line, 1), slots[r].size, slots[r].addr, slots[r].rkey); - return ncclInternalError; + ncclSocketGetAddr(&comm->base.sock, &addr); + WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkeys[0]=%x", + r, nreqs, tag, ncclSocketToString(&addr, line, 1), slots[r].size, slots[r].addr, slots[r].rkeys[0]); } struct ncclIbRequest* req; - NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); + NCCLCHECK(ncclIbGetRequest(&comm->base, &req)); req->type = NCCL_NET_IB_REQ_SEND; - req->sock = &comm->sock; - req->verbs = &comm->verbs; + req->sock = &comm->base.sock; + req->base = &comm->base; req->nreqs = nreqs; req->send.size = size; req->send.data = data; - req->send.lkey = mr->lkey; req->send.offset = 0; - req->events = ncclParamIbSplitDataOnQps() ? comm->nqps : 1; - req->events = ncclParamIbSplitDataOnQps() ? comm->nqps : 1; - if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo; + + // Populate events + int nEvents = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs; + int qpIndex = comm->base.qpIndex; + // Count down + while (nEvents > 0) { + ncclIbQp* qp = comm->base.qps + qpIndex; + int devIndex = qp->devIndex; + ncclIbAddEvent(req, devIndex, &comm->devs[devIndex].base); + // Track the valid lkey for this RDMA_Write + req->send.lkeys[devIndex] = mhandleWrapper->mrs[devIndex]->lkey; + nEvents--; + // Don't update comm->base.qpIndex yet, we need to run through this same set of QPs inside ncclIbMultiSend() + qpIndex = (qpIndex+1)%comm->base.nqps; + } + + // Store all lkeys + for (int i = 0; i < comm->base.ndevs; i++) { + req->send.lkeys[i] = mhandleWrapper->mrs[i]->lkey; + } + *request = reqs[r] = req; // If this is a multi-recv, send only when all requests have matched. 
@@ -884,12 +1198,23 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int memset(&wr, 0, sizeof(wr)); int slot = comm->remFifo.fifoTail%MAX_REQUESTS; + req->recv.sizes = comm->sizesFifo[slot]; + for (int i=0; irecv.sizes[i] = 0; struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot]; + // Select the next devIndex (local) and QP to use for posting this CTS message + // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value + ncclIbQp* ctsQp = comm->base.qps + comm->base.devIndex; + comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.ndevs; + for (int i=0; irkey; + struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandles[i]; + + // Send all applicable rkeys + for (int j = 0; j < comm->base.ndevs; j++) + localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey; + localElem[i].nreqs = n; localElem[i].size = sizes[i]; // Sanity/Debugging localElem[i].tag = tags[i]; @@ -897,11 +1222,17 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int } wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo); - wr.wr.rdma.rkey = comm->remFifo.rkey; - comm->remFifo.sge.addr = (uint64_t)localElem; - comm->remFifo.sge.length = n*sizeof(struct ncclIbSendFifo); - wr.sg_list = &comm->remFifo.sge; + + // Lookup the correct fifoRkey + wr.wr.rdma.rkey = comm->base.remDevs[ctsQp->remDevIdx].fifoRkey; + + // Set the correct sge properties + comm->devs[ctsQp->devIndex].fifoSge.addr = (uint64_t)localElem; + comm->devs[ctsQp->devIndex].fifoSge.length = n*sizeof(struct ncclIbSendFifo); + wr.sg_list = &comm->devs[ctsQp->devIndex].fifoSge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_WRITE; wr.send_flags = comm->remFifo.flags; // IBV_SEND_INLINE @@ -926,14 +1257,17 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int // polling it will empty the Send Queue, can be posted) // - The status of all 
posted Send Request is considered unknown // - if (slot == 0) { + // slot == devIndex - When writing to fifo slot N, and this QP lives on device index N, it should send signalled. + // This works out that each fifo posting QP gets drained + if (slot == ctsQp->devIndex) { + wr.send_flags |= IBV_SEND_SIGNALED; - wr.wr_id = req - comm->verbs.reqs; - req->events++; + wr.wr_id = req - comm->base.reqs; + ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base); } struct ibv_send_wr* bad_wr; - NCCLCHECK(wrap_ibv_post_send(comm->qps[0], &wr, &bad_wr)); + NCCLCHECK(wrap_ibv_post_send(ctsQp->qp, &wr, &bad_wr)); comm->remFifo.fifoTail++; return ncclSuccess; @@ -941,42 +1275,48 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; - if (comm->ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->ready == 0"); return ncclInternalError; } - if (comm->ready == 0) { *request = NULL; return ncclSuccess; } + if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } + if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } + if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; struct ncclIbRequest* req; - NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); + NCCLCHECK(ncclIbGetRequest(&comm->base, &req)); req->type = NCCL_NET_IB_REQ_RECV; - req->sock = &comm->sock; + req->sock = &comm->base.sock; req->nreqs = n; - if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo; - for (int i=0; irecv.sizes[i] = 0; + + for (int i = 0; i < comm->base.ndevs; i++) { + req->devBases[i] = &comm->devs[i].base; + } struct ibv_recv_wr wr; memset(&wr, 0, sizeof(wr)); - wr.wr_id = req - comm->verbs.reqs; + wr.wr_id = req - comm->base.reqs; wr.sg_list = NULL; 
wr.num_sge = 0; TIME_START(1); - const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1; - for (int q=0; qqps[comm->qpIndex]; - struct ibv_recv_wr* bad_wr; - NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr)); - comm->qpIndex = (comm->qpIndex+1)%comm->nqps; + // Select either all QPs, or one qp per-device + const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs; + + // Post recvs + struct ibv_recv_wr* bad_wr; + for (int i = 0; i < nqps; i++) { + struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex; + ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base); + NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr)); + comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps; } TIME_STOP(1); - req->events = nqps; - - *request = req; // Post to FIFO to notify sender TIME_START(2); NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req)); TIME_STOP(2); + + *request = req; return ncclSuccess; } @@ -984,30 +1324,34 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; int last = -1; for (int i=0; igpuFlush.enabled == 0 || last == -1) return ncclSuccess; + if (comm->flushEnabled == 0 || last == -1) return ncclSuccess; // Only flush once using the last non-zero receive struct ncclIbRequest* req; - NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); + NCCLCHECK(ncclIbGetRequest(&comm->base, &req)); req->type = NCCL_NET_IB_REQ_FLUSH; - req->sock = &comm->sock; - struct ibv_mr* mr = (struct ibv_mr*)mhandles[last]; - - struct ibv_send_wr wr; - memset(&wr, 0, sizeof(wr)); - wr.wr_id = req - comm->verbs.reqs; - - wr.wr.rdma.remote_addr = (uint64_t)data[last]; - wr.wr.rdma.rkey = mr->rkey; - wr.sg_list = &comm->gpuFlush.sge; - wr.num_sge = 1; - wr.opcode = IBV_WR_RDMA_READ; - wr.send_flags = IBV_SEND_SIGNALED; - - TIME_START(4); - struct ibv_send_wr* bad_wr; - NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr)); - 
TIME_STOP(4); + req->sock = &comm->base.sock; + struct ncclIbMrHandle* mhandle = (struct ncclIbMrHandle*) mhandles[last]; + + // We don't know which devIndex the recv was on, so we flush on all devices + for (int i = 0; i < comm->base.ndevs; i++) { + struct ibv_send_wr wr; + memset(&wr, 0, sizeof(wr)); + wr.wr_id = req - comm->base.reqs; + + wr.wr.rdma.remote_addr = (uint64_t)data[last]; + wr.wr.rdma.rkey = mhandle->mrs[i]->rkey; + wr.sg_list = &comm->devs[i].gpuFlush.sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_READ; + wr.send_flags = IBV_SEND_SIGNALED; + + TIME_START(4); + struct ibv_send_wr* bad_wr; + NCCLCHECK(wrap_ibv_post_send(comm->devs[i].gpuFlush.qp.qp, &wr, &bad_wr)); + TIME_STOP(4); + ncclIbAddEvent(req, i, &comm->devs[i].base); + } *request = req; return ncclSuccess; @@ -1018,74 +1362,108 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { *done = 0; while (1) { - if (r->events == 0) { + if (r->events[0] == 0 && r->events[1] == 0) { + TRACE(NCCL_NET, "r=%p done", r); *done = 1; if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { for (int i=0; inreqs; i++) sizes[i] = r->recv.sizes[i]; } + if (sizes && r->type == NCCL_NET_IB_REQ_SEND) { + sizes[0] = r->send.size; + } + + if (sizes && r->type == NCCL_NET_IB_REQ_SEND) { + sizes[0] = r->send.size; + } + NCCLCHECK(ncclIbFreeRequest(r)); return ncclSuccess; } + int totalWrDone = 0; int wrDone = 0; struct ibv_wc wcs[4]; - TIME_START(3); - NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone)); - if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); } - if (wrDone == 0) return ncclSuccess; - - for (int w=0; wstatus != IBV_WC_SUCCESS) { - char line[SOCKET_NAME_MAXLEN+1]; - union ncclSocketAddress addr; - ncclSocketGetAddr(r->sock, &addr); - char localGidString[INET6_ADDRSTRLEN] = ""; - char remoteGidString[INET6_ADDRSTRLEN] = ""; - const char* localGidStr = NULL, *remoteGidStr = NULL; - if (r->gidInfo) { - localGidStr = inet_ntop(AF_INET6, &r->gidInfo->localGid, localGidString, 
sizeof(localGidString)); - remoteGidStr = inet_ntop(AF_INET6, &r->gidInfo->remoteGid, remoteGidString, sizeof(remoteGidString)); - } - WARN("NET/IB : Got completion from peer %s with error %s(%d), opcode %d, len %d, vendor err %d (%s)%s%s%s%s", - ncclSocketToString(&addr, line, 1), ibv_wc_status_str(wc->status), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type], - localGidStr ? " localGid ":"", localGidString, remoteGidStr ? " remoteGid ":"", remoteGidString); - return ncclRemoteError; - } + for (int i = 0; i < NCCL_IB_MAX_DEVS_PER_NIC; i++) { + TIME_START(3); + // If we expect any completions from this device's CQ + if (r->events[i]) { + NCCLCHECK(wrap_ibv_poll_cq(r->devBases[i]->cq, 4, wcs, &wrDone)); + totalWrDone += wrDone; + if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); } + if (wrDone == 0) continue; + for (int w=0; wstatus != IBV_WC_SUCCESS) { + union ncclSocketAddress addr; + ncclSocketGetAddr(r->sock, &addr); + char localGidString[INET6_ADDRSTRLEN] = ""; + char remoteGidString[INET6_ADDRSTRLEN] = ""; + const char* localGidStr = NULL, *remoteGidStr = NULL; + if (r->devBases[i]->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) { + localGidStr = inet_ntop(AF_INET6, &r->devBases[i]->gidInfo.localGid, localGidString, sizeof(localGidString)); + remoteGidStr = inet_ntop(AF_INET6, &r->base->remDevs[i].remoteGid, remoteGidString, sizeof(remoteGidString)); + } - struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff); - if (req->type == NCCL_NET_IB_REQ_SEND) { - for (int i=0; inreqs; i++) { - struct ncclIbRequest* sendReq = r->verbs->reqs+((wc->wr_id >> (i*8)) & 0xff); - if ((sendReq->events <= 0)) return ncclInternalError; - sendReq->events--; - } - } else { - if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { - if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError; - if (req->nreqs > 1) { - // In the case of a multi recv, we only set sizes to 0 or 1. 
- for (int i=0; inreqs; i++) { - req->recv.sizes[i] = (wc->imm_data >> i) & 0x1; + char line[SOCKET_NAME_MAXLEN+1]; + WARN("NET/IB : Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s", + ncclSocketToString(&addr, line, 1), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type], + localGidStr ? " localGid ":"", localGidString, remoteGidStr ? " remoteGids":"", remoteGidString); + return ncclRemoteError; + } + + union ncclSocketAddress addr; + ncclSocketGetAddr(r->sock, &addr); + struct ncclIbRequest* req = r->base->reqs+(wc->wr_id & 0xff); + + #ifdef ENABLE_TRACE + char line[SOCKET_NAME_MAXLEN+1]; + TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%d r=%p type=%d events={%d,%d}, i=%d", + ncclSocketToString(&addr, line, 1), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i); + #endif + if (req->type == NCCL_NET_IB_REQ_SEND) { + for (int j = 0; j < req->nreqs; j++) { + struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff); + if ((sendReq->events[i] <= 0)) { + WARN("NET/IB: sendReq(%p)->events={%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], i, j); + return ncclInternalError; + } + sendReq->events[i]--; } } else { - req->recv.sizes[0] += wc->imm_data; + if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { + if (req->type != NCCL_NET_IB_REQ_RECV) { + WARN("NET/IB: wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM and req->type=%d", req->type); + return ncclInternalError; + } + if (req->nreqs == 1) { + req->recv.sizes[0] += wc->imm_data; + } + } + req->events[i]--; } } - req->events--; } } + + // If no CQEs found on any device, return and come back later + if (totalWrDone == 0) return ncclSuccess; } } ncclResult_t ncclIbCloseSend(void* sendComm) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm) { - NCCLCHECK(ncclSocketClose(&comm->sock)); - for (int q=0; qnqps; q++) - if 
(comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q])); - if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr)); - NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); + NCCLCHECK(ncclSocketClose(&comm->base.sock)); + + for (int q = 0; q < comm->base.nqps; q++) + if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp)); + + for (int i = 0; i < comm->base.ndevs; i++) { + struct ncclIbSendCommDev* commDev = comm->devs + i; + if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr)); + if (comm->remSizesFifo.mrs[i] != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remSizesFifo.mrs[i])); + NCCLCHECK(ncclIbDestroyBase(&commDev->base)); + } free(comm); } TIME_PRINT("IB"); @@ -1095,15 +1473,21 @@ ncclResult_t ncclIbCloseSend(void* sendComm) { ncclResult_t ncclIbCloseRecv(void* recvComm) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm) { - NCCLCHECK(ncclSocketClose(&comm->sock)); - for (int q=0; qnqps; q++) - if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q])); - if (comm->gpuFlush.enabled) { - if (comm->gpuFlush.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->gpuFlush.qp)); - if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr)); + NCCLCHECK(ncclSocketClose(&comm->base.sock)); + + for (int q = 0; q < comm->base.nqps; q++) + if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp)); + + for (int i = 0; i < comm->base.ndevs; i++) { + struct ncclIbRecvCommDev* commDev = comm->devs + i; + if (comm->flushEnabled) { + if (commDev->gpuFlush.qp.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(commDev->gpuFlush.qp.qp)); + if (commDev->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->gpuFlush.hostMr)); + } + if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr)); + if (commDev->sizesFifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->sizesFifoMr)); + 
NCCLCHECK(ncclIbDestroyBase(&commDev->base)); } - if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr)); - NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); free(comm); } return ncclSuccess; @@ -1118,8 +1502,8 @@ ncclResult_t ncclIbCloseListen(void* listenComm) { return ncclSuccess; } -const ncclNet_v6_t ibPlugin_v6 = { - .name = "IBext", +const ncclNet_v8_t ibPlugin_v8 = { + .name = "IBext_v8", .init = ncclIbInit, .devices = ncclIbDevices, .getProperties = ncclIbGetProperties, @@ -1136,17 +1520,20 @@ const ncclNet_v6_t ibPlugin_v6 = { .closeSend = ncclIbCloseSend, .closeRecv = ncclIbCloseRecv, .closeListen = ncclIbCloseListen, + NULL /* getDeviceMr */, + NULL /* irecvConsumed */ }; -const ncclNet_v5_t ibPlugin_v5 = { - .name = "IBext", +const ncclNet_v7_t ibPlugin_v7 = { + .name = "IBext_v7", .init = ncclIbInit, .devices = ncclIbDevices, - .getProperties = ncclIbGetProperties, + .getProperties = ncclIbGetProperties_v7, .listen = ncclIbListen, .connect = ncclIbConnect, .accept = ncclIbAccept, - .regMr = ncclIbRegMr, + .regMr = ncclIbRegMr_v7, + .regMrDmaBuf = ncclIbRegMrDmaBuf, .deregMr = ncclIbDeregMr, .isend = ncclIbIsend, .irecv = ncclIbIrecv, @@ -1155,60 +1542,43 @@ const ncclNet_v5_t ibPlugin_v5 = { .closeSend = ncclIbCloseSend, .closeRecv = ncclIbCloseRecv, .closeListen = ncclIbCloseListen, + NULL /* getDeviceMr */, + NULL /* irecvConsumed */ }; -static ncclResult_t ncclIbGetProperties_v4(int dev, ncclNetProperties_v4_t* props) { - ncclNetProperties_v6_t props_v6; - ncclResult_t ret = ncclIbGetProperties(dev, &props_v6); - if (ret != ncclSuccess) return ret; - props->name = props_v6.name; - props->pciPath = props_v6.pciPath; - props->guid = props_v6.guid; - props->ptrSupport = props_v6.ptrSupport; - props->speed = props_v6.speed; - props->port = props_v6.port; - props->maxComms = props_v6.maxComms; - return ncclSuccess; -} -static ncclResult_t ncclIbIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) { - return 
ncclIbIsend(sendComm, data, size, 0, mhandle, request); -} -static ncclResult_t ncclIbIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { - int tag = 0; - return ncclIbIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request); -} -static ncclResult_t ncclIbIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { - return ncclIbIflush(recvComm, 1, &data, &size, &mhandle, request); -} -static ncclResult_t ncclIbConnect_v4(int dev, void* handle, void** sendComm) { - ncclResult_t ret; - do { - ret = ncclIbConnect(dev, handle, sendComm); - } while (ret == ncclSuccess && *sendComm == NULL); - return ret; -} -static ncclResult_t ncclIbAccept_v4(void* listenComm, void** recvComm) { - ncclResult_t ret; - do { - ret = ncclIbAccept(listenComm, recvComm); - } while (ret == ncclSuccess && *recvComm == NULL); - return ret; -} - +const ncclNet_v6_t ibPlugin_v6 = { + .name = "IBext_v6", + .init = ncclIbInit, + .devices = ncclIbDevices, + .getProperties = ncclIbGetProperties_v6, + .listen = ncclIbListen, + .connect = ncclIbConnect_v6, + .accept = ncclIbAccept_v6, + .regMr = ncclIbRegMr_v7, + .regMrDmaBuf = ncclIbRegMrDmaBuf, + .deregMr = ncclIbDeregMr, + .isend = ncclIbIsend, + .irecv = ncclIbIrecv, + .iflush = ncclIbIflush, + .test = ncclIbTest, + .closeSend = ncclIbCloseSend, + .closeRecv = ncclIbCloseRecv, + .closeListen = ncclIbCloseListen, +}; -const ncclNet_v4_t ibPlugin_v4 = { - .name = "IBext", +const ncclNet_v5_t ibPlugin_v5 = { + .name = "IBext_v5", .init = ncclIbInit, .devices = ncclIbDevices, - .getProperties = ncclIbGetProperties_v4, + .getProperties = ncclIbGetProperties_v6, .listen = ncclIbListen, - .connect = ncclIbConnect_v4, - .accept = ncclIbAccept_v4, - .regMr = ncclIbRegMr, + .connect = ncclIbConnect_v6, + .accept = ncclIbAccept_v6, + .regMr = ncclIbRegMr_v7, .deregMr = ncclIbDeregMr, - .isend = ncclIbIsend_v4, - .irecv = ncclIbIrecv_v4, - .iflush = ncclIbIflush_v4, + .isend = ncclIbIsend, + .irecv = 
ncclIbIrecv, + .iflush = ncclIbIflush, .test = ncclIbTest, .closeSend = ncclIbCloseSend, .closeRecv = ncclIbCloseRecv, diff --git a/nccl_plugin/src/ibvwrap.c b/nccl_plugin/src/ibvwrap.c index 9d21063..4e4c771 100644 --- a/nccl_plugin/src/ibvwrap.c +++ b/nccl_plugin/src/ibvwrap.c @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2015-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -24,10 +24,24 @@ } \ return ncclSuccess; +#define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(call, success_retval, name, supported) \ + int ret = call; \ + if (ret == ENOTSUP || ret == EOPNOTSUPP) { \ + INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + *supported = 0; \ + return ncclSuccess; \ + } else if (ret != success_retval) { \ + WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + *supported = 1; \ + return ncclSystemError; \ + } \ + *supported = 1; \ + return ncclSuccess; + #define IBV_INT_CHECK_RET_ERRNO(call, success_retval, name) \ int ret = call; \ if (ret != success_retval) { \ - WARN("Call to " name " failed with error %s", strerror(ret)); \ + WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \ return ncclSystemError; \ } \ return ncclSuccess; @@ -169,6 +183,26 @@ ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struc return ncclSuccess; } +ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ +#if HAVE_DECL_IBV_QUERY_ECE + IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_query_ece(qp, ece), 0, "ibv_query_ece", supported); +#else + INFO(NCCL_NET, "Call to ibv_query_ece is skipped, 
doesn't exist"); + *supported = 0; + return ncclSuccess; +#endif +} + +ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ +#if HAVE_DECL_IBV_SET_ECE + IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_set_ece(qp, ece), 0, "ibv_set_ece", supported); +#else + INFO(NCCL_NET, "Call to ibv_set_ece skipped, doesn't exist"); + *supported = 0; + return ncclSuccess; +#endif +} + ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) { *ret = (char *) ibv_event_type_str(event); return ncclSuccess; diff --git a/nccl_plugin/src/optcast_plugin.cc b/nccl_plugin/src/optcast_plugin.cc index 5414195..a684dd6 100644 --- a/nccl_plugin/src/optcast_plugin.cc +++ b/nccl_plugin/src/optcast_plugin.cc @@ -17,7 +17,8 @@ extern ncclNet_v6_t ncclNetPlugin_v6; extern ncclNet_v5_t ncclNetPlugin_v5; -extern ncclNet_v4_t ncclNetPlugin_v4; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 int ncclNSharpDevs = -1; @@ -389,11 +390,6 @@ static ncclResult_t ncclOptcastGetProperties_v5(int dev, ncclNetProperties_v5_t return ncclNetPlugin_v5.getProperties(dev, props); } -static ncclResult_t ncclOptcastGetProperties_v4(int dev, ncclNetProperties_v4_t *props) -{ - return ncclNetPlugin_v4.getProperties(dev, props); -} - static ncclResult_t ncclOptcastListen(int dev, void *opaqueHandle, void **listenComm) { struct optcastListenComm *lComm; @@ -639,19 +635,3 @@ ncclCollNet_v5_t ncclCollNetPlugin_v5 = { ncclOptcastTest, ncclOptcastCloseColl, ncclOptcastCloseListen}; - -ncclCollNet_v4_t ncclCollNetPlugin_v4 = { - "Optcast", - ncclOptcastInit, - ncclOptcastDevices, - ncclOptcastGetProperties_v4, - ncclOptcastListen, - ncclOptcastConnect, - ncclOptcastReduceSupport, - ncclOptcastRegMr, - ncclOptcastDeregMr, - ncclOptcastIallreduce, - ncclOptcastIflush, - ncclOptcastTest, - ncclOptcastCloseColl, - ncclOptcastCloseListen}; diff --git a/nccl_plugin/src/p2p_plugin.c 
b/nccl_plugin/src/p2p_plugin.c index 227a7be..daf90ce 100644 --- a/nccl_plugin/src/p2p_plugin.c +++ b/nccl_plugin/src/p2p_plugin.c @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2016-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -15,26 +15,26 @@ #include "p2p_plugin.h" #ifdef HAVE_UCX_PLUGIN +extern ncclNet_v8_t ucxPlugin_v8; +extern ncclNet_v7_t ucxPlugin_v7; extern ncclNet_v6_t ucxPlugin_v6; extern ncclNet_v5_t ucxPlugin_v5; -extern ncclNet_v4_t ucxPlugin_v4; +extern ncclNet_v8_t ucxRmaPlugin_v8; +extern ncclNet_v7_t ucxRmaPlugin_v7; extern ncclNet_v6_t ucxRmaPlugin_v6; extern ncclNet_v5_t ucxRmaPlugin_v5; -extern ncclNet_v4_t ucxRmaPlugin_v4; #endif +extern ncclNet_v8_t ibPlugin_v8; +extern ncclNet_v7_t ibPlugin_v7; extern ncclNet_v6_t ibPlugin_v6; extern ncclNet_v5_t ibPlugin_v5; -extern ncclNet_v4_t ibPlugin_v4; - -extern ncclNet_v6_t socketPlugin_v6; -extern ncclNet_v5_t socketPlugin_v5; -extern ncclNet_v4_t socketPlugin_v4; pthread_mutex_t nccl_p2p_lock = PTHREAD_MUTEX_INITIALIZER; ncclDebugLogger_t pluginLogFunction; +static int ncclNMergedIbDevs = -1; -#ifdef HAVE_SHARP_PLUGIN +#if 1 extern int ncclNSharpDevs; #else /* In case sharp plugin is not there just define this variable locally to make code cleaner */ @@ -44,32 +44,43 @@ extern int ncclIbRelaxedOrderingEnabled; NCCL_PARAM(SharpMaxComms, "SHARP_MAX_COMMS", 1); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); -ncclResult_t pluginInit(ncclDebugLogger_t logFunction); +ncclResult_t pluginInit_v8(ncclDebugLogger_t logFunction); +ncclResult_t pluginInit_v7(ncclDebugLogger_t logFunction); +ncclResult_t pluginInit_v6(ncclDebugLogger_t logFunction); +ncclResult_t pluginInit_v5(ncclDebugLogger_t logFunction); + +ncclNet_v8_t 
ncclNetPlugin_v8 = { + "NCCL RDMA Plugin v8", + pluginInit_v8, +}; + +ncclNet_v7_t ncclNetPlugin_v7 = { + "NCCL RDMA Plugin v7", + pluginInit_v7, +}; ncclNet_v6_t ncclNetPlugin_v6 = { "NCCL RDMA Plugin v6", - pluginInit, + pluginInit_v6, }; ncclNet_v5_t ncclNetPlugin_v5 = { "NCCL RDMA Plugin v5", - pluginInit, + pluginInit_v5, }; -ncclNet_v4_t ncclNetPlugin_v4 = { - "NCCL RDMA Plugin v4", - pluginInit, -}; - static nccl_p2p_plugin_t p2p_plugin = NCCL_P2P_LAST; -ncclResult_t pluginInit(ncclDebugLogger_t logFunction) +static void pluginSetup() { - pluginLogFunction = logFunction; - p2p_plugin = NCCL_P2P_IB; + const char *plugin_path = get_plugin_lib_path(); + if (plugin_path != NULL) { + INFO(NCCL_INIT|NCCL_NET, "Plugin Path : %s", plugin_path);; + } + const char *p2p_layer = getenv("NCCL_PLUGIN_P2P"); if (p2p_layer != NULL) { if (!strcasecmp(p2p_layer, "ib")) p2p_plugin = NCCL_P2P_IB; @@ -77,50 +88,67 @@ ncclResult_t pluginInit(ncclDebugLogger_t logFunction) else if (!strcasecmp(p2p_layer, "ucx")) p2p_plugin = NCCL_P2P_UCX; else if (!strcasecmp(p2p_layer, "ucx_rma")) p2p_plugin = NCCL_P2P_UCX_RMA; #endif - else if (!strcasecmp(p2p_layer, "socket")) p2p_plugin = NCCL_P2P_SOCKET; else { WARN("Invalid value %s for NCCL_PLUGIN_P2P, using default", p2p_layer); } } switch (p2p_plugin) { - case NCCL_P2P_IB: - ncclNetPlugin_v6 = ibPlugin_v6; - ncclNetPlugin_v5 = ibPlugin_v5; - ncclNetPlugin_v4 = ibPlugin_v4; - break; - case NCCL_P2P_SOCKET: - ncclNetPlugin_v6 = socketPlugin_v6; - ncclNetPlugin_v5 = socketPlugin_v5; - ncclNetPlugin_v4 = socketPlugin_v4; - break; #ifdef HAVE_UCX_PLUGIN case NCCL_P2P_UCX: + ncclNetPlugin_v8 = ucxPlugin_v8; + ncclNetPlugin_v7 = ucxPlugin_v7; ncclNetPlugin_v6 = ucxPlugin_v6; ncclNetPlugin_v5 = ucxPlugin_v5; - ncclNetPlugin_v4 = ucxPlugin_v4; break; case NCCL_P2P_UCX_RMA: + ncclNetPlugin_v8 = ucxRmaPlugin_v8; + ncclNetPlugin_v7 = ucxRmaPlugin_v7; ncclNetPlugin_v6 = ucxRmaPlugin_v6; ncclNetPlugin_v5 = ucxRmaPlugin_v5; - ncclNetPlugin_v4 = 
ucxRmaPlugin_v4; break; #endif + default: + ncclNetPlugin_v8 = ibPlugin_v8; + ncclNetPlugin_v7 = ibPlugin_v7; + ncclNetPlugin_v6 = ibPlugin_v6; + ncclNetPlugin_v5 = ibPlugin_v5; + break; } - INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", NCCL_PLUGIN_SYMBOL.name); - return NCCL_PLUGIN_SYMBOL.init(logFunction); } -NCCL_PARAM(GdrDisable, "GDR_DISABLE", 0); +ncclResult_t pluginInit_v8(ncclDebugLogger_t logFunction) { + pluginLogFunction = logFunction; + pluginSetup(); + INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v8.name); + return ncclNetPlugin_v8.init(logFunction); +} + +ncclResult_t pluginInit_v7(ncclDebugLogger_t logFunction) { + pluginLogFunction = logFunction; + pluginSetup(); + INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v7.name); + return ncclNetPlugin_v7.init(logFunction); +} -ncclResult_t nccl_p2p_gdr_support(int dev) +ncclResult_t pluginInit_v6(ncclDebugLogger_t logFunction) { + pluginLogFunction = logFunction; + pluginSetup(); + INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v6.name); + return ncclNetPlugin_v6.init(logFunction); +} + +ncclResult_t pluginInit_v5(ncclDebugLogger_t logFunction) { + pluginLogFunction = logFunction; + pluginSetup(); + INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v5.name); + return ncclNetPlugin_v5.init(logFunction); +} + +ncclResult_t nccl_p2p_gdr_support() { static int module_loaded = -1; - if (ncclParamGdrDisable() == 1) { - return ncclSystemError; - } - if (module_loaded == -1) { module_loaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 
0 : 1; } @@ -142,13 +170,20 @@ ncclResult_t nccl_p2p_dmabuf_support(int dev) { ncclResult_t res; struct ibv_pd* pd; struct ibv_context* ctx; - ctx = ncclIbDevs[dev].context; - NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); - // Test kernel DMA-BUF support with a dummy call (fd=-1) - (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/); - // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise) - dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0; - NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); + struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev; + + // Test each dev + for (int i = 0; i < mergedDev->ndevs; i++) { + int ibDev = mergedDev->devs[i]; + ctx = ncclIbDevs[ibDev].context; + NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); + // Test kernel DMA-BUF support with a dummy call (fd=-1) + (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/); + // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise) + dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 
1 : 0; + NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); + } + } if (dmaBufSupported == 0) return ncclSystemError; return ncclSuccess; @@ -158,39 +193,52 @@ ncclResult_t nccl_p2p_dmabuf_support(int dev) { } -ncclResult_t nccl_p2p_ib_get_properties(nccl_ib_dev_t *devs, int dev, ncclNetProperties_t* props) +ncclResult_t nccl_p2p_ib_get_properties(ncclIbDev *devs, int dev, ncclNetProperties_t* props) { - props->name = devs[dev].devName; - props->pciPath = devs[dev].pciPath; - props->guid = devs[dev].guid; + struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs+dev; + props->name = mergedDev->devName; + props->speed = mergedDev->speed; + + // Take the rest of the properties from an arbitrary sub-device (should be the same) + struct ncclIbDev* ibDev = ncclIbDevs + mergedDev->devs[0]; + props->pciPath = ibDev->pciPath; + props->guid = ibDev->guid; + props->ptrSupport = NCCL_PTR_HOST; - if (nccl_p2p_gdr_support(dev) == ncclSuccess) { + if (nccl_p2p_gdr_support() == ncclSuccess) { props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem INFO(NCCL_NET,"NET/IB : GPU Direct RDMA (nvidia-peermem) enabled for HCA %d '%s", dev, devs[dev].devName); } + props->regIsGlobal = 1; if (p2p_plugin == NCCL_P2P_IB && nccl_p2p_dmabuf_support(dev) == ncclSuccess) { props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF INFO(NCCL_NET,"NET/IB : GPU Direct RDMA (DMABUF) enabled for HCA %d '%s", dev, devs[dev].devName); } - props->speed = devs[dev].speed; props->latency = 0; // Not set - props->port = devs[dev].port + devs[dev].realPort; - props->maxComms = devs[dev].maxQp; - props->maxRecvs = (p2p_plugin == NCCL_P2P_IB) ? 
NCCL_NET_IB_MAX_RECVS : 1; + props->port = ibDev->portNum + ibDev->realPort; + props->maxComms = ibDev->maxQp; + + if (p2p_plugin == NCCL_P2P_IB || p2p_plugin == NCCL_P2P_UCX) { + props->maxRecvs = NCCL_NET_IB_MAX_RECVS; + } else { + props->maxRecvs = 1; + } + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } static void* ncclIbAsyncThreadMain(void* args) { - struct ibv_context* context = (struct ibv_context*)args; + struct ncclIbDev* dev = (struct ncclIbDev*)args; while (1) { struct ibv_async_event event; - if (ncclSuccess != wrap_ibv_get_async_event(context, &event)) { break; } + if (ncclSuccess != wrap_ibv_get_async_event(dev->context, &event)) { break; } char *str; if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; } if (event.event_type != IBV_EVENT_COMM_EST) - WARN("NET/IB : Got async event : %s", str); + WARN("NET/IB : %s:%d Got async event : %s", dev->devName, dev->portNum, str); if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; } } return NULL; @@ -206,7 +254,26 @@ int devSharpCompare(const void *a, const void *b) else { return 1; } } -ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction) +// Compare ncclIbDev[dev] to all stored mergedIbDevs +int ncclIbFindMatchingDev(int dev) { + for (int i = 0; i < ncclNMergedIbDevs; i++) { + if (ncclIbMergedDevs[i].ndevs < NCCL_IB_MAX_DEVS_PER_NIC) { + int compareDev = ncclIbMergedDevs[i].devs[0]; + if (strcmp(ncclIbDevs[dev].pciPath, ncclIbDevs[compareDev].pciPath) == 0 && + (ncclIbDevs[dev].guid == ncclIbDevs[compareDev].guid) && + (ncclIbDevs[dev].link == ncclIbDevs[compareDev].link)) { + TRACE(NCCL_NET, "NET/IB: Matched name1=%s pciPath1=%s guid1=0x%lx link1=%u name2=%s pciPath2=%s guid2=0x%lx link2=%u", + ncclIbDevs[dev].devName, ncclIbDevs[dev].pciPath, 
ncclIbDevs[dev].guid, ncclIbDevs[dev].link, + ncclIbDevs[compareDev].devName, ncclIbDevs[compareDev].pciPath, ncclIbDevs[compareDev].guid, ncclIbDevs[compareDev].link); + return i; + } + } + } + + return ncclNMergedIbDevs; +} + +ncclResult_t nccl_p2p_ib_init(int *num_devs, ncclIbDev *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction) { int ncclNIbDevs = *num_devs; @@ -216,6 +283,7 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc wrap_ibv_fork_init(); if (ncclNIbDevs == -1) { ncclNIbDevs = 0; + ncclNMergedIbDevs = 0; ncclNSharpDevs = 0; if (ncclFindInterfaces(ncclIbIfName, ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { WARN("NET/IB : No IP interface found."); @@ -249,10 +317,10 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } continue; } - for (int port = 1; port <= devAttr.phys_port_cnt; port++) { + for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) { struct ibv_port_attr portAttr; - if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) { - WARN("NET/IB : Unable to query port %d", port); + if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) { + WARN("NET/IB : Unable to query port_num %d", port_num); continue; } if (portAttr.state != IBV_PORT_ACTIVE) continue; @@ -260,15 +328,13 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; // check against user specified HCAs/ports - if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs, searchExact) ^ searchNot)) { + if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) { continue; } - TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, - portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
"IB" : "RoCE"); pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); ncclIbDevs[ncclNIbDevs].device = d; ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; - ncclIbDevs[ncclNIbDevs].port = port; + ncclIbDevs[ncclNIbDevs].portNum = port_num; ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; ncclIbDevs[ncclNIbDevs].speed = nccl_p2p_ib_speed(portAttr.active_speed) * nccl_p2p_ib_width(portAttr.active_width); ncclIbDevs[ncclNIbDevs].context = context; @@ -281,20 +347,49 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc ncclIbDevs[ncclNIbDevs].mrCache.population = 0; ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; - // Enable ADAPTIVE_ROUTING by default on IB networks + // Enable ADAPTIVE_ROUTING by default on IB networks // But allow it to be overloaded by an env parameter ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0; if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); - ncclIbDevs[ncclNIbDevs].isSharpDev = 1; - ncclIbDevs[ncclNIbDevs].maxQp = ncclParamSharpMaxComms(); - ncclNSharpDevs++; - + ncclIbDevs[ncclNIbDevs].isSharpDev = 0; + if (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) + { + ncclIbDevs[ncclNIbDevs].isSharpDev = 1; + ncclIbDevs[ncclNIbDevs].maxQp = ncclParamSharpMaxComms(); + ncclNSharpDevs++; + } + TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, + portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
"IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); if (ncclIbAsyncThread != NULL) { - pthread_create(ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); + pthread_create(ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs); ncclSetThreadName(*ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); pthread_detach(*ncclIbAsyncThread); // will not be pthread_join()'d } + + int mergedDev = ncclNMergedIbDevs; + if (ncclParamIbMergeNics()) { + mergedDev = ncclIbFindMatchingDev(ncclNIbDevs); + } + + // No matching dev found, create new mergedDev entry (it's okay if there's only one dev inside) + if (mergedDev == ncclNMergedIbDevs) { + // Set ndevs to 1, assign first ibDevN to the current IB device + ncclIbMergedDevs[mergedDev].ndevs = 1; + ncclIbMergedDevs[mergedDev].devs[0] = ncclNIbDevs; + ncclNMergedIbDevs++; + strncpy(ncclIbMergedDevs[mergedDev].devName, ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE); + // Matching dev found, edit name + } else { + // Set next device in this array to the current IB device + int ndevs = ncclIbMergedDevs[mergedDev].ndevs; + ncclIbMergedDevs[mergedDev].devs[ndevs] = ncclNIbDevs; + ncclIbMergedDevs[mergedDev].ndevs++; + snprintf(ncclIbMergedDevs[mergedDev].devName + strlen(ncclIbMergedDevs[mergedDev].devName), MAXNAMESIZE+1, "+%s", ncclIbDevs[ncclNIbDevs].devName); + } + + // Aggregate speed + ncclIbMergedDevs[mergedDev].speed += ncclIbDevs[ncclNIbDevs].speed; ncclNIbDevs++; nPorts++; } @@ -310,33 +405,48 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc qsort(ncclIbDevs, ncclNIbDevs, sizeof(struct ncclIbDev), devSharpCompare); } - char line[1024]; + char line[2048]; line[0] = '\0'; // Determine whether RELAXED_ORDERING is enabled and possible ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); - for (int d=0; d<ncclNIbDevs; d++) { + for (int d = 0; d < ncclNMergedIbDevs; d++) { + struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + d; + if (mergedDev->ndevs > 1) { + // Print out merged dev info + snprintf(line+strlen(line), 2047-strlen(line), " 
[%d]={", d); + for (int i = 0; i < mergedDev->ndevs; i++) { + int ibDev = mergedDev->devs[i]; + snprintf(line+strlen(line), 2047-strlen(line), "[%d] %s:%d/%s%s", ibDev, ncclIbDevs[ibDev].devName, + ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", + // Insert comma to delineate + i == (mergedDev->ndevs - 1) ? "" : ", "); + } + snprintf(line+strlen(line), 2047-strlen(line), "}"); + } else { + int ibDev = mergedDev->devs[0]; #ifdef HAVE_SHARP_PLUGIN - snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s%s", d, ncclIbDevs[d].devName, - ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", - ncclIbDevs[d].isSharpDev ? "/SHARP" : ""); + snprintf(line+strlen(line), 2047-strlen(line), " [%d]%s:%d/%s%s", ibDev, ncclIbDevs[ibDev].devName, + ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", + ncclIbDevs[ibDev].isSharpDev ? "/SHARP" : ""); #else - snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName, - ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + snprintf(line+strlen(line), 2047-strlen(line), " [%d]%s:%d/%s", ibDev, ncclIbDevs[ibDev].devName, + ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); #endif + } } - line[1023] = '\0'; + line[2047] = '\0'; char addrline[SOCKET_NAME_MAXLEN+1]; INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? 
"[RO]" : "", ncclIbIfName, ncclSocketToString(ncclIbIfAddr, addrline, 1)); } - *num_devs = ncclNIbDevs; + *num_devs = ncclNMergedIbDevs; pthread_mutex_unlock(&nccl_p2p_lock); } return ncclSuccess; } -ncclResult_t nccl_p2p_ib_pci_path(nccl_ib_dev_t *devs, int num_devs, char* dev_name, char** path, int* real_port) +ncclResult_t nccl_p2p_ib_pci_path(ncclIbDev *devs, int num_devs, char* dev_name, char** path, int* real_port) { char device_path[PATH_MAX]; snprintf(device_path, PATH_MAX, "/sys/class/infiniband/%s/device", dev_name); @@ -347,7 +457,7 @@ ncclResult_t nccl_p2p_ib_pci_path(nccl_ib_dev_t *devs, int num_devs, char* dev_n // Merge multi-port NICs into the same PCI device p[strlen(p)-1] = '0'; // Also merge virtual functions (VF) into the same device - p[strlen(p)-3] = '0'; + if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0'; // And keep the real port aside (the ibv port is always 1 on recent cards) *real_port = 0; for (int d=0; d 0) { errno = 0; @@ -80,3 +80,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6 } pthread_mutex_unlock(&mutex); } + +const char *ncclGetEnv(const char *name) { + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once, initEnv); + return getenv(name); +} diff --git a/nccl_plugin/src/socket_plugin.c b/nccl_plugin/src/socket_plugin.c index 2336950..4fd3712 100644 --- a/nccl_plugin/src/socket_plugin.c +++ b/nccl_plugin/src/socket_plugin.c @@ -102,6 +102,19 @@ static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) { return ncclSuccess; } +ncclResult_t ncclSockGetProperties_v6(int dev, ncclNetProperties_v6_t* props) { + props->name = ncclNetSocketDevs[dev].devName; + props->pciPath = ncclNetSocketDevs[dev].pciPath; + props->guid = dev; + props->ptrSupport = NCCL_PTR_HOST; + NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed)); + props->latency = 0; // Not set + props->port = 0; + props->maxComms = 65536; + props->maxRecvs = 1; + return ncclSuccess; +} + 
ncclResult_t ncclSockGetProperties(int dev, ncclNetProperties_t* props) { props->name = ncclNetSocketDevs[dev].devName; props->pciPath = ncclNetSocketDevs[dev].pciPath; @@ -614,7 +627,7 @@ const ncclNet_v6_t socketPlugin_v6 = { .name = "SOCKext", .init = ncclSockInit, .devices = ncclSockDevices, - .getProperties = ncclSockGetProperties, + .getProperties = ncclSockGetProperties_v6, .listen = ncclSockListen, .connect = ncclSockConnect, .accept = ncclSockAccept, @@ -634,7 +647,7 @@ const ncclNet_v5_t socketPlugin_v5 = { .name = "SOCKext", .init = ncclSockInit, .devices = ncclSockDevices, - .getProperties = ncclSockGetProperties, + .getProperties = ncclSockGetProperties_v6, .listen = ncclSockListen, .connect = ncclSockConnect, .accept = ncclSockAccept, @@ -648,61 +661,3 @@ const ncclNet_v5_t socketPlugin_v5 = { .closeRecv = ncclSockClose, .closeListen = ncclSockCloseListen, }; - -static ncclResult_t ncclSockGetProperties_v4(int dev, ncclNetProperties_v4_t* props) { - ncclNetProperties_v6_t props_v6; - ncclResult_t ret = ncclSockGetProperties(dev, &props_v6); - if (ret != ncclSuccess) return ret; - props->name = props_v6.name; - props->pciPath = props_v6.pciPath; - props->guid = props_v6.guid; - props->ptrSupport = props_v6.ptrSupport; - props->speed = props_v6.speed; - props->port = props_v6.port; - props->maxComms = props_v6.maxComms; - return ncclSuccess; -} -static ncclResult_t ncclSockIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) { - return ncclSockIsend(sendComm, data, size, 0, mhandle, request); -} -static ncclResult_t ncclSockIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { - int tag = 0; - return ncclSockIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request); -} -static ncclResult_t ncclSockIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { - return ncclSockIflush(recvComm, 1, &data, &size, &mhandle, request); -} -static ncclResult_t ncclSockConnect_v4(int dev, void* 
handle, void** sendComm) { - ncclResult_t ret; - do { - ret = ncclSockConnect(dev, handle, sendComm); - } while (ret == ncclSuccess && *sendComm == NULL); - return ret; -} -static ncclResult_t ncclSockAccept_v4(void* listenComm, void** recvComm) { - ncclResult_t ret; - do { - ret = ncclSockAccept(listenComm, recvComm); - } while (ret == ncclSuccess && *recvComm == NULL); - return ret; -} - - -const ncclNet_v4_t socketPlugin_v4 = { - .name = "SOCKext", - .init = ncclSockInit, - .devices = ncclSockDevices, - .getProperties = ncclSockGetProperties_v4, - .listen = ncclSockListen, - .connect = ncclSockConnect_v4, - .accept = ncclSockAccept_v4, - .regMr = ncclSockRegMr, - .deregMr = ncclSockDeregMr, - .isend = ncclSockIsend_v4, - .irecv = ncclSockIrecv_v4, - .iflush = ncclSockIflush_v4, - .test = ncclSockTest, - .closeSend = ncclSockClose, - .closeRecv = ncclSockClose, - .closeListen = ncclSockCloseListen, -}; diff --git a/nccl_plugin/src/utils.c b/nccl_plugin/src/utils.c index 2564892..d36beb5 100644 --- a/nccl_plugin/src/utils.c +++ b/nccl_plugin/src/utils.c @@ -4,6 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ +#define _GNU_SOURCE #include #include #include @@ -109,71 +110,17 @@ int matchIfList(const char* string, int port, struct netIf* ifList, int listSize return 0; } -static size_t readFileVarArg(char *buffer, size_t max, - const char *filename_fmt, va_list ap) +const char *get_plugin_lib_path() { - char filename[PATH_MAX]; - ssize_t read_bytes; - int fd; + Dl_info dl_info; + int ret; - vsnprintf(filename, PATH_MAX, filename_fmt, ap); + ret = dladdr((void*)&get_plugin_lib_path, &dl_info); + if (ret == 0) return NULL; - fd = open(filename, O_RDONLY); - if (fd < 0) { - return -1; - } - - read_bytes = read(fd, buffer, max - 1); - if (read_bytes < 0) { - return -1; - } - - if (read_bytes < max) { - buffer[read_bytes] = '\0'; - } - -out_close: - close(fd); + return dl_info.dli_fname; } -int 
readFileNumber(long *value, const char *filename_fmt, ...) -{ - char buffer[64], *tail; - ssize_t read_bytes; - va_list ap; - long n; - - va_start(ap, filename_fmt); - read_bytes = readFileVarArg(buffer, sizeof(buffer) - 1, - filename_fmt, ap); - va_end(ap); - - if (read_bytes < 0) { - /* read error */ - return -1; - } - - n = strtol(buffer, &tail, 0); - if ((*tail != '\0') && !isspace(*tail)) { - /* parse error */ - return -1; - } - - *value = n; - return 0; -} - -//const char *get_plugin_lib_path() -//{ -// Dl_info dl_info; -// int ret; -// -// ret = dladdr((void*)&get_plugin_lib_path, &dl_info); -// if (ret == 0) return NULL; -// -// return dl_info.dli_fname; -//} - NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); void ncclSetThreadName(pthread_t thread, const char *fmt, ...) { From 42ab4c68c2cb7eb415902b09e2aadf97d267a0e8 Mon Sep 17 00:00:00 2001 From: Wataru Ishida Date: Wed, 28 Feb 2024 13:24:48 +0000 Subject: [PATCH 2/3] fix(nccl_plugin): fix p2p_plugin type socket Signed-off-by: Wataru Ishida --- nccl_plugin/include/p2p_plugin.h | 1 + nccl_plugin/include/utils.h | 8 ++++++++ nccl_plugin/src/p2p_plugin.c | 8 ++++++++ 3 files changed, 17 insertions(+) diff --git a/nccl_plugin/include/p2p_plugin.h b/nccl_plugin/include/p2p_plugin.h index b859777..8cee85c 100644 --- a/nccl_plugin/include/p2p_plugin.h +++ b/nccl_plugin/include/p2p_plugin.h @@ -29,6 +29,7 @@ typedef enum nccl_p2p_plugin { NCCL_P2P_IB, NCCL_P2P_UCX, NCCL_P2P_UCX_RMA, + NCCL_P2P_SOCKET, NCCL_P2P_LAST } nccl_p2p_plugin_t; diff --git a/nccl_plugin/include/utils.h b/nccl_plugin/include/utils.h index 96ca62e..33936a9 100644 --- a/nccl_plugin/include/utils.h +++ b/nccl_plugin/include/utils.h @@ -13,6 +13,10 @@ #define NCCL_STATIC_ASSERT(_cond, _msg) \ switch(0) {case 0:case (_cond):;} +#ifdef __cplusplus +extern "C" { +#endif + ncclResult_t ncclIbMalloc(void** ptr, size_t size); ncclResult_t ncclRealloc(void** ptr, size_t old_size, size_t new_size); ncclResult_t getHostName(char* hostname, int maxlen); 
@@ -28,4 +32,8 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList); int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact); const char *get_plugin_lib_path(); +#ifdef __cplusplus +} // end extern "C" +#endif + #endif diff --git a/nccl_plugin/src/p2p_plugin.c b/nccl_plugin/src/p2p_plugin.c index daf90ce..d0824bf 100644 --- a/nccl_plugin/src/p2p_plugin.c +++ b/nccl_plugin/src/p2p_plugin.c @@ -25,6 +25,9 @@ extern ncclNet_v6_t ucxRmaPlugin_v6; extern ncclNet_v5_t ucxRmaPlugin_v5; #endif +extern ncclNet_v6_t socketPlugin_v6; +extern ncclNet_v5_t socketPlugin_v5; + extern ncclNet_v8_t ibPlugin_v8; extern ncclNet_v7_t ibPlugin_v7; extern ncclNet_v6_t ibPlugin_v6; @@ -88,11 +91,16 @@ static void pluginSetup() else if (!strcasecmp(p2p_layer, "ucx")) p2p_plugin = NCCL_P2P_UCX; else if (!strcasecmp(p2p_layer, "ucx_rma")) p2p_plugin = NCCL_P2P_UCX_RMA; #endif + else if (!strcasecmp(p2p_layer, "socket")) p2p_plugin = NCCL_P2P_SOCKET; else { WARN("Invalid value %s for NCCL_PLUGIN_P2P, using default", p2p_layer); } } switch (p2p_plugin) { + case NCCL_P2P_SOCKET: + ncclNetPlugin_v6 = socketPlugin_v6; + ncclNetPlugin_v5 = socketPlugin_v5; + break; #ifdef HAVE_UCX_PLUGIN case NCCL_P2P_UCX: ncclNetPlugin_v8 = ucxPlugin_v8; From 6d57ebce5cdcce9f43fb096646717735dc893c98 Mon Sep 17 00:00:00 2001 From: Wataru Ishida Date: Wed, 28 Feb 2024 13:25:30 +0000 Subject: [PATCH 3/3] chore: fix Docker build Signed-off-by: Wataru Ishida --- Dockerfile | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1c576bc..035a6bb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,27 +1,48 @@ -FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu20.04 AS nccl +FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu22.04 AS nccl -RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy python3 openmpi-bin openmpi-common +RUN apt update && DEBIAN_FRONTEND=noninteractive apt 
install -qy python3 openmpi-bin openmpi-common libibverbs-dev libopenmpi-dev autoconf libtool COPY test test + +RUN cd test && make build-nccl +RUN cd test && MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi make build-nccl-tests + COPY nccl_plugin nccl_plugin -RUN cd test && MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi make && make install +RUN cd test && make build-nccl-plugin +RUN cd test && make install -FROM ghcr.io/rust-lang/rust:nightly-bullseye-slim AS optcast +FROM ubuntu:22.04 AS optcast -COPY reduction_server reduction_server +RUN apt-get update && apt-get install -y \ + curl \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +# Rust install +ENV RUST_HOME /usr/local/lib/rust +ENV RUSTUP_HOME ${RUST_HOME}/rustup +ENV CARGO_HOME ${RUST_HOME}/cargo +RUN mkdir /usr/local/lib/rust && \ + chmod 0755 $RUST_HOME +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > ${RUST_HOME}/rustup.sh \ + && chmod +x ${RUST_HOME}/rustup.sh \ + && ${RUST_HOME}/rustup.sh -y --default-toolchain nightly --no-modify-path +ENV PATH $PATH:$CARGO_HOME/bin COPY --from=nccl /usr/local/lib /usr/local/lib RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy clang libibverbs1 +COPY reduction_server reduction_server + RUN cd reduction_server && cargo build -r FROM optcast AS unittest -RUN cd reduction_server && cargo test --all +ENV RUST_LOG=info +RUN cd reduction_server && cargo test --all -- --nocapture -FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu20.04 AS final +FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu22.04 AS final RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy --no-install-recommends openmpi-bin