From bbc3435fb42c86a0bd628dd435e21391dfb2a693 Mon Sep 17 00:00:00 2001
From: Wataru Ishida <wataru.ishid@gmail.com>
Date: Thu, 22 Feb 2024 07:14:02 +0000
Subject: [PATCH 1/3] chore: plugin update to nccl-2.20

https://github.com/Mellanox/nccl-rdma-sharp-plugins/commit/728d4e88226c6e3fdd832c6395f5231baaeea742

Signed-off-by: Wataru Ishida <wataru.ishid@gmail.com>
---
 nccl_plugin/include/ibvwrap.h     |    4 +-
 nccl_plugin/include/nccl.h        |   47 +-
 nccl_plugin/include/net.h         |    5 +-
 nccl_plugin/include/net_device.h  |   32 +
 nccl_plugin/include/net_v6.h      |    2 -
 nccl_plugin/include/net_v7.h      |  121 +++
 nccl_plugin/include/net_v8.h      |  137 ++++
 nccl_plugin/include/p2p_plugin.h  |   73 +-
 nccl_plugin/include/param.h       |    9 +-
 nccl_plugin/include/socket.h      |    2 +-
 nccl_plugin/include/timer.h       |    6 +-
 nccl_plugin/include/utils.h       |    9 +-
 nccl_plugin/src/ib_plugin.c       | 1262 +++++++++++++++++++----------
 nccl_plugin/src/ibvwrap.c         |   38 +-
 nccl_plugin/src/optcast_plugin.cc |   24 +-
 nccl_plugin/src/p2p_plugin.c      |  282 +++++--
 nccl_plugin/src/param.c           |   10 +-
 nccl_plugin/src/socket_plugin.c   |   75 +-
 nccl_plugin/src/utils.c           |   67 +-
 19 files changed, 1463 insertions(+), 742 deletions(-)
 create mode 100644 nccl_plugin/include/net_device.h
 create mode 100644 nccl_plugin/include/net_v7.h
 create mode 100644 nccl_plugin/include/net_v8.h

diff --git a/nccl_plugin/include/ibvwrap.h b/nccl_plugin/include/ibvwrap.h
index c79de37..afd4ebb 100644
--- a/nccl_plugin/include/ibvwrap.h
+++ b/nccl_plugin/include/ibvwrap.h
@@ -4,7 +4,7 @@
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
  *
- * Copyright (c) 2015-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2015-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -59,6 +59,8 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries,
 ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
 ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
 ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
+ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
+ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
 ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr);
 ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr);
 ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);
diff --git a/nccl_plugin/include/nccl.h b/nccl_plugin/include/nccl.h
index 67736cf..a234af9 100644
--- a/nccl_plugin/include/nccl.h
+++ b/nccl_plugin/include/nccl.h
@@ -14,11 +14,11 @@
 #endif
 
 #define NCCL_MAJOR 2
-#define NCCL_MINOR 15
-#define NCCL_PATCH 1
+#define NCCL_MINOR 20
+#define NCCL_PATCH 3
 #define NCCL_SUFFIX ""
 
-#define NCCL_VERSION_CODE 21510
+#define NCCL_VERSION_CODE 22003
 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
 
 #ifdef __cplusplus
@@ -42,15 +42,24 @@ typedef enum { ncclSuccess                 =  0,
                ncclInProgress              =  7,
                ncclNumResults              =  8 } ncclResult_t;
 
+#define NCCL_CONFIG_UNDEF_INT INT_MIN
+#define NCCL_CONFIG_UNDEF_PTR NULL
+#define NCCL_SPLIT_NOCOLOR -1
+
 /* Communicator configuration. Users can assign value to attributes to specify the
  * behavior of a communicator. */
-typedef struct ncclConfig_v21400 {
+typedef struct ncclConfig_v21700 {
   /* attributes that users should never touch. */
   size_t size;
   unsigned int magic;
   unsigned int version;
   /* attributes that users are able to customize. */
   int blocking;
+  int cgaClusterSize;
+  int minCTAs;
+  int maxCTAs;
+  const char *netName;
+  int splitShare;
 } ncclConfig_t;
 
 /* Config initializer must be assigned to initialize config structure when it is created.
@@ -59,9 +68,23 @@ typedef struct ncclConfig_v21400 {
   sizeof(ncclConfig_t), /* size */                                      \
   0xcafebeef,           /* magic */                                     \
   NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
-  1                     /* blocking */                                  \
+  NCCL_CONFIG_UNDEF_INT,                    /* blocking */              \
+  NCCL_CONFIG_UNDEF_INT,                    /* cgaClusterSize */        \
+  NCCL_CONFIG_UNDEF_INT,                    /* minCTAs */               \
+  NCCL_CONFIG_UNDEF_INT,                    /* maxCTAs */               \
+  NCCL_CONFIG_UNDEF_PTR,                    /* netName */               \
+  NCCL_CONFIG_UNDEF_INT                     /* splitShare */            \
 }
 
+/* NCCL malloc and free function for all types of NCCL optimizations
+ * (e.g. user buffer registration). The actual allocated size might
+ * be larger than requested due to granularity requirement. */
+ncclResult_t  ncclMemAlloc(void** ptr, size_t size);
+ncclResult_t pncclMemAlloc(void** ptr, size_t size);
+
+ncclResult_t  ncclMemFree(void *ptr);
+ncclResult_t pncclMemFree(void *ptr);
+
 /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
  * This integer is coded with the MAJOR, MINOR and PATCH level of the
  * NCCL library
@@ -119,6 +142,10 @@ ncclResult_t pncclCommAbort(ncclComm_t comm);
 const char*  ncclGetErrorString(ncclResult_t result);
 const char* pncclGetErrorString(ncclResult_t result);
 
+/* Returns a human-readable message of the last error that occurred. */
+ const char*  ncclGetLastError(ncclComm_t comm);
+ const char* pncclGetLastError(ncclComm_t comm);
+
 /* Checks whether the comm has encountered any asynchronous errors */
 ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
 ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
@@ -135,6 +162,16 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
 ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
 ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
 
+
+/* Register CUDA buffer for zero-copy operation */
+ncclResult_t  ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+
+/* Deregister CUDA buffer */
+ncclResult_t  ncclCommDeregister(const ncclComm_t comm, void* handle);
+ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
+
+
 /* Reduction operation selector */
 typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
 typedef enum { ncclSum        = 0,
diff --git a/nccl_plugin/include/net.h b/nccl_plugin/include/net.h
index b3ddb70..1e60bad 100644
--- a/nccl_plugin/include/net.h
+++ b/nccl_plugin/include/net.h
@@ -22,10 +22,9 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC
 
 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
 
+#include "net_v8.h"
+#include "net_v7.h"
 #include "net_v6.h"
 #include "net_v5.h"
-#include "net_v4.h"
-
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
 
 #endif // end include guard
diff --git a/nccl_plugin/include/net_device.h b/nccl_plugin/include/net_device.h
new file mode 100644
index 0000000..bd1a02f
--- /dev/null
+++ b/nccl_plugin/include/net_device.h
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_DEVICE_H_
+#define NET_DEVICE_H_
+// Device-offload handle type and version constants; included by net_v7.h and net_v8.h.
+#include <stddef.h> // size_t (ncclNetDeviceHandle_v7_t.size); replaces a no-op self-include
+
+#define NCCL_NET_DEVICE_INVALID_VERSION      0x0
+#define NCCL_NET_MTU_SIZE                    4096
+
+// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
+// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
+
+typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
+
+typedef struct {
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  void* handle;                    // Untyped pointer; its contents are not specified in this header
+  size_t size;                     // presumably the size in bytes of what handle points to - TODO confirm
+  int needsProxyProgress;          // NOTE(review): name suggests a boolean flag - confirm against NCCL core
+} ncclNetDeviceHandle_v7_t;
+// v8 reuses the v7 layout unchanged; the unversioned name aliases the v8 type.
+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
+typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t;
+
+#endif
diff --git a/nccl_plugin/include/net_v6.h b/nccl_plugin/include/net_v6.h
index 5c87ca8..8e1d134 100644
--- a/nccl_plugin/include/net_v6.h
+++ b/nccl_plugin/include/net_v6.h
@@ -18,8 +18,6 @@ typedef struct {
   int maxRecvs;   // Maximum number of grouped receives.
 }ncclNetProperties_v6_t;
 
-typedef ncclNetProperties_v6_t ncclNetProperties_t;
-
 typedef struct {
   // Name of the network (mainly for logs)
   const char* name;
diff --git a/nccl_plugin/include/net_v7.h b/nccl_plugin/include/net_v7.h
new file mode 100644
index 0000000..eef1f2e
--- /dev/null
+++ b/nccl_plugin/include/net_v7.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_NET_V7_H_
+#define NCCL_NET_V7_H_
+
+#include "net_device.h"
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v7_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v7_t;
+
+// v7 struct for backwards compatibility
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v7_t;
+
+#endif // end include guard
diff --git a/nccl_plugin/include/net_v8.h b/nccl_plugin/include/net_v8.h
new file mode 100644
index 0000000..f1bd56b
--- /dev/null
+++ b/nccl_plugin/include/net_v8.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_NET_V8_H_
+#define NCCL_NET_V8_H_
+#include "net_device.h"
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v8_t;
+
+typedef ncclNetProperties_v8_t ncclNetProperties_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v8_t;
+
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  uint32_t size;
+} ncclNetSGE_v8_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
+                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                             void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
+                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                                 ncclDataType_t dataType, ncclRedOp_t redOp,
+                                 void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v8_t;
+
+
+#endif // end include guard
diff --git a/nccl_plugin/include/p2p_plugin.h b/nccl_plugin/include/p2p_plugin.h
index 7178fff..b859777 100644
--- a/nccl_plugin/include/p2p_plugin.h
+++ b/nccl_plugin/include/p2p_plugin.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,8 +9,6 @@
 
 #include <stdint.h>
 #include <unistd.h>
-#define ENABLE_TIMER 0
-#include "timer.h"
 #include <assert.h>
 
 #include "nccl.h"
@@ -31,13 +29,12 @@ typedef enum nccl_p2p_plugin {
   NCCL_P2P_IB,
   NCCL_P2P_UCX,
   NCCL_P2P_UCX_RMA,
-  NCCL_P2P_SOCKET,
   NCCL_P2P_LAST
 } nccl_p2p_plugin_t;
 
 struct ncclIbMr {
   uintptr_t addr;
-  int pages;
+  size_t pages;
   int refs;
   struct ibv_mr *mr;
 };
@@ -47,76 +44,86 @@ struct ncclIbMrCache {
   int capacity, population;
 };
 
+#define NCCL_IB_MAX_DEVS_PER_NIC 2
+#define MAX_MERGED_DEV_NAME ((MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC) // outer parentheses keep the sum intact inside larger expressions
+struct ncclIbMergedDev {
+  int ndevs; // presumably the number of entries of devs[] in use - TODO confirm
+  int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs
+  int speed;
+  char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
+} __attribute__((aligned(64)));
+
 struct ncclIbRequest {
-  struct ncclIbVerbs* verbs;
+  struct ncclIbNetCommBase* base;
   int type;
-  int events;
   struct ncclSocket* sock;
-  struct ncclIbGidInfo* gidInfo;
+  int events[NCCL_IB_MAX_DEVS_PER_NIC];
+  struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC];
   int nreqs;
   union {
     struct {
       int size;
       void* data;
-      uint32_t lkey;
+      uint32_t lkeys[NCCL_IB_MAX_DEVS_PER_NIC];
       int offset;
     } send;
     struct {
-      int sizes[NCCL_NET_IB_MAX_RECVS];
+      int* sizes;
     } recv;
   };
 };
 
-struct ncclIbVerbs {
-  int    dev;
-  struct ibv_pd* pd; // duplicate of ncclIbDevs[dev].pd
+// Retain local RoCE address for error logging
+struct ncclIbGidInfo {
+  uint8_t link_layer;
+  union ibv_gid localGid;
+};
+
+typedef struct ncclIbNetCommDevBase {
+  int ibDevN;
+  struct ibv_pd* pd;
   struct ibv_cq* cq;
   uint64_t pad[1];
-  struct ncclIbRequest reqs[MAX_REQUESTS];
-};
+  struct ncclIbGidInfo gidInfo;
+} ncclIbNetCommDevBase;
 
 typedef struct ncclIbDev {
   pthread_mutex_t lock;
   int      device;
   uint64_t guid;
-  uint8_t  port;
+  uint8_t portNum;
   uint8_t  link;
   uint8_t  isSharpDev;
   int      speed;
   struct   ibv_context* context;
   int      pdRefs;
   struct ibv_pd*  pd;
-  struct   ncclIbVerbs verbs;
   char     devName[MAXNAMESIZE];
   char     *pciPath;
   int      realPort;
   int      maxQp;
   struct   ncclIbMrCache mrCache;
   int ar; // ADAPTIVE_ROUTING
-} __attribute__((aligned(64))) nccl_ib_dev_t;
+  struct ibv_port_attr portAttr;
+} __attribute__((aligned(64))) ncclIbDev;
 
-#define MAX_IB_PORT 15
-struct userIbDev {
-  char devName[MAXNAMESIZE];
-  uint16_t port_en;
-};
 
 #define MAX_IB_DEVS 32
+extern struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_DEVS];
 extern struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
-extern struct ncclIbDev userIbDevs[MAX_IB_DEVS];
 /* Detect whether GDR can work on a given NIC with the current CUDA device
  * Returns :
  * ncclSuccess : GDR works
  * ncclSystemError : no module or module loaded but not supported by GPU */
-ncclResult_t nccl_p2p_gdr_support(int dev);
+ncclResult_t nccl_p2p_gdr_support();
 
 ncclResult_t nccl_p2p_dmabuf_support(int dev);
 
-ncclResult_t nccl_p2p_ib_pci_path(nccl_ib_dev_t *devs, int num_devs, char* dev_name, char** path, int* real_port);
+ncclResult_t nccl_p2p_ib_pci_path(ncclIbDev *devs, int num_devs, char* dev_name, char** path, int* real_port);
 
-ncclResult_t nccl_p2p_ib_get_properties(nccl_ib_dev_t *devs, int dev, ncclNetProperties_t* props);
+ncclResult_t nccl_p2p_ib_get_properties(ncclIbDev *devs, int dev, ncclNetProperties_t* props);
 
-ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction);
+ncclResult_t nccl_p2p_ib_init(int *num_devs, ncclIbDev *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction);
 
 /* Convert value returtned by ibv_query_port to actual link width */
 int nccl_p2p_ib_width(int width);
@@ -126,14 +133,12 @@ int nccl_p2p_ib_speed(int speed);
 
 int64_t ncclParamSharpMaxComms();
 
+int64_t ncclParamIbMergeVfs();
+
+int64_t ncclParamIbMergeNics();
+
 int ncclIbRelaxedOrderingCapable(void);
 
-#ifdef __cplusplus
-extern "C" { 
-#endif
 nccl_p2p_plugin_t nccl_p2p_get_plugin_type();
-#ifdef __cplusplus
-}
-#endif
 
 #endif
diff --git a/nccl_plugin/include/param.h b/nccl_plugin/include/param.h
index 0b27d0c..18f5c03 100644
--- a/nccl_plugin/include/param.h
+++ b/nccl_plugin/include/param.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -12,14 +12,9 @@
 const char* userHomeDir();
 void setEnvFile(const char* fileName);
 void initEnv();
+const char *ncclGetEnv(const char *name);
 
-#ifdef __cplusplus
-extern "C" { 
-#endif
 void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
-#ifdef __cplusplus
-}
-#endif
 
 #define NCCL_PARAM(name, env, deftVal) \
   int64_t ncclParam##name() { \
diff --git a/nccl_plugin/include/socket.h b/nccl_plugin/include/socket.h
index e2e9a20..4c04ae1 100644
--- a/nccl_plugin/include/socket.h
+++ b/nccl_plugin/include/socket.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/nccl_plugin/include/timer.h b/nccl_plugin/include/timer.h
index 41fd8f2..7f4eb77 100644
--- a/nccl_plugin/include/timer.h
+++ b/nccl_plugin/include/timer.h
@@ -52,9 +52,9 @@ static double startTimes[8];
   printf("\n"); \
 } while (0);
 #else
-#define TIME_START(index) while(0);
-#define TIME_STOP(index) while(0);
-#define TIME_CANCEL(index) while(0);
+#define TIME_START(index) do {} while(0);
+#define TIME_STOP(index) do {} while(0);
+#define TIME_CANCEL(index) do {} while(0);
 #define TIME_PRINT(name)
 #endif
 #endif
diff --git a/nccl_plugin/include/utils.h b/nccl_plugin/include/utils.h
index 2934ae8..96ca62e 100644
--- a/nccl_plugin/include/utils.h
+++ b/nccl_plugin/include/utils.h
@@ -13,13 +13,7 @@
 #define NCCL_STATIC_ASSERT(_cond, _msg) \
     switch(0) {case 0:case (_cond):;}
 
-#ifdef __cplusplus
-extern "C" { 
-#endif
-  ncclResult_t ncclIbMalloc(void** ptr, size_t size);
-#ifdef __cplusplus
-}
-#endif
+ncclResult_t ncclIbMalloc(void** ptr, size_t size);
 ncclResult_t ncclRealloc(void** ptr, size_t old_size, size_t new_size);
 ncclResult_t getHostName(char* hostname, int maxlen);
 uint64_t getHostHash();
@@ -32,7 +26,6 @@ struct netIf {
 
 int parseStringList(const char* string, struct netIf* ifList, int maxList);
 int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact);
-int readFileNumber(long *value, const char *filename_fmt, ...);
 const char *get_plugin_lib_path();
 
 #endif
diff --git a/nccl_plugin/src/ib_plugin.c b/nccl_plugin/src/ib_plugin.c
index eda70ad..a1e663e 100644
--- a/nccl_plugin/src/ib_plugin.c
+++ b/nccl_plugin/src/ib_plugin.c
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -13,9 +13,9 @@
 #include <sys/types.h>
 #include <sys/time.h>
 #include <unistd.h>
+
 #define ENABLE_TIMER 0
 #include "timer.h"
-
 #include "p2p_plugin.h"
 #include "core.h"
 #include "socket.h"
@@ -46,25 +46,11 @@ NCCL_PARAM(IbTc, "IB_TC", 0);
 NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
 NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
 
-pthread_t ncclIbAsyncThread;
-static void* ncclIbAsyncThreadMain(void* args) {
-  struct ibv_context* context = (struct ibv_context*)args;
-  while (1) {
-    struct ibv_async_event event;
-    if (ncclSuccess != wrap_ibv_get_async_event(context, &event)) { break; }
-    char *str;
-    if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; }
-    if (event.event_type != IBV_EVENT_COMM_EST)
-      WARN("NET/IB : Got async event : %s", str);
-    if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; }
-  }
-  return NULL;
-}
+static pthread_t ncclIbAsyncThread;
 
 // Determine whether RELAXED_ORDERING is enabled and possible
 int ncclIbRelaxedOrderingCapable(void) {
   int roMode = ncclParamIbPciRelaxedOrdering();
-  ncclResult_t r = ncclInternalError;
   if (roMode == 1 || roMode == 2) {
     if (!IBV_ACCESS_RELAXED_ORDERING) {
       if(roMode == 1)
@@ -79,12 +65,12 @@ int ncclIbRelaxedOrderingCapable(void) {
 
 NCCL_PARAM(IbDisable, "IBEXT_DISABLE", 0);
 NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1);
+NCCL_PARAM(IbMergeNics, "IB_MERGE_NICS", 1);
 
 extern ncclDebugLogger_t pluginLogFunction;
 
 ncclResult_t ncclIbDevices(int* ndev) {
-  *ndev = 1;
-//  *ndev = ncclNIbDevs;
+  *ndev = ncclNIbDevs;
   return ncclSuccess;
 }
 
@@ -93,29 +79,81 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props)
   return nccl_p2p_ib_get_properties(ncclIbDevs, dev, props);
 }
 
-static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
-  memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
+ncclResult_t ncclIbGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7)
+{
+  ncclNetProperties_t props;
+  ncclResult_t ret = nccl_p2p_ib_get_properties(ncclIbDevs, dev, &props);
+  if (ret != ncclSuccess) return ret;
+  props_v7->name = props.name;
+  props_v7->pciPath = props.pciPath;
+  props_v7->guid = props.guid;
+  props_v7->ptrSupport = props.ptrSupport;
+  props_v7->speed = props.speed;
+  props_v7->latency = props.latency;
+  props_v7->port = props.port;
+  props_v7->maxComms = props.maxComms;
+  props_v7->maxRecvs = props.maxRecvs;
+  props_v7->netDeviceType = props.netDeviceType;
+  props_v7->netDeviceVersion = props.netDeviceVersion;
   return ncclSuccess;
 }
 
+// v6 adapter: query the current properties struct, then copy the fields assigned below (netDevice* fields are not copied).
+ncclResult_t ncclIbGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6)
+{
+  ncclNetProperties_t props;
+  ncclResult_t ret = nccl_p2p_ib_get_properties(ncclIbDevs, dev, &props);
+  if (ret != ncclSuccess) return ret;
+  props_v6->name = props.name;
+  props_v6->pciPath = props.pciPath;
+  props_v6->guid = props.guid;
+  props_v6->ptrSupport = props.ptrSupport;
+  props_v6->speed = props.speed;
+  props_v6->latency = props.latency;
+  props_v6->port = props.port;
+  props_v6->maxComms = props.maxComms;
+  props_v6->maxRecvs = props.maxRecvs;
+  return ncclSuccess;
+}
+
 #define NCCL_IB_MAX_QPS 128
 
-typedef struct ncclIbQpInfo {
+struct ncclIbQpInfo {
+  uint32_t qpn; // QP number; ncclIbConnect fills it from qp->qp_num and passes the peer's value to ncclIbRtrQp
+
+  // Fields needed for ece (enhanced connection establishment)
+  struct ibv_ece ece; // filled by wrap_ibv_query_ece, applied on the other side via wrap_ibv_set_ece
+  int ece_supported; // output flag of wrap_ibv_query_ece; gates the wrap_ibv_set_ece call
+  int devIndex; // index of the per-connection device this QP was created on (QPs alternate across devices)
+};
+
+
+// Per-Dev connection metadata
+typedef struct ncclIbDevInfo {
   uint32_t lid;
   uint8_t ib_port;
+  enum ibv_mtu mtu;
   uint8_t link_layer;
   uint8_t is_global;
-  uint32_t qpn[NCCL_IB_MAX_QPS];
 
   // For RoCE and IB GRH
   uint64_t spn;
   uint64_t iid;
-  enum ibv_mtu mtu;
 
   // FIFO RDMA info
   uint32_t fifoRkey;
+  union ibv_gid remoteGid;
+} ncclIbDevInfo;
+
+
+// Struct containing everything needed to establish connections
+typedef struct ncclIbConnectionMetadata {
+  struct ncclIbQpInfo qpInfo[NCCL_IB_MAX_QPS];
+  struct ncclIbDevInfo devs[NCCL_IB_MAX_DEVS_PER_NIC];
+  char devName[MAX_MERGED_DEV_NAME];
   uint64_t fifoAddr;
-} ncclIbQpInfo;
+  int ndevs;
+} ncclIbConnectionMetadata;
 
 enum ncclIbCommState {
   ncclIbCommStateStart = 0,
@@ -142,12 +180,7 @@ struct ncclIbHandle {
   struct ncclIbCommStage stage; // Used by the other side when connecting
 };
 
-// Retain local and remote RoCE addresses for error logging
-struct ncclIbGidInfo {
-  uint8_t link_layer;
-  union ibv_gid localGid;
-  union ibv_gid remoteGid;
-};
+
 
 #define NCCL_NET_IB_REQ_UNUSED 0
 #define NCCL_NET_IB_REQ_SEND 1
@@ -164,57 +197,97 @@ struct ncclIbListenComm {
 struct ncclIbSendFifo {
   uint64_t addr;
   int      size;
-  uint32_t rkey;
+  uint32_t rkeys[NCCL_IB_MAX_DEVS_PER_NIC];
   uint32_t nreqs;
   uint32_t tag;
   uint64_t idx;
+  char padding[24];
 };
 
+typedef struct ncclIbQp {
+  struct ibv_qp* qp;
+  int devIndex;
+  int remDevIdx;
+} ncclIbQp;
+
+struct ncclIbRemSizesFifo {
+  int elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+  uint64_t fifoTail;
+  uint64_t addr;
+  uint32_t rkeys[NCCL_IB_MAX_DEVS_PER_NIC];
+  uint32_t flags;
+  struct ibv_mr* mrs[NCCL_IB_MAX_DEVS_PER_NIC];
+  struct ibv_sge sge;
+};
+
+// A per-dev struct for netIbSendComm
+typedef struct ncclIbSendCommDev {
+  struct ncclIbNetCommDevBase base;
+  struct ibv_mr* fifoMr;
+} __attribute__((aligned(8))) ncclIbSendCommDev;
+
+
+// Wrapper to track an MR per-device, if needed
+struct ncclIbMrHandle {
+  struct ibv_mr* mrs[NCCL_IB_MAX_DEVS_PER_NIC];
+};
+
+typedef struct ncclIbNetCommBase {
+  int ndevs;
+  bool isSend;
+  struct ncclIbRequest reqs[MAX_REQUESTS];
+  struct ncclIbQp qps[NCCL_IB_MAX_QPS];
+  int nqps;
+  int qpIndex;
+  int devIndex;
+  struct ncclSocket sock;
+  int ready;
+  // Track necessary remDevInfo here
+  int nRemDevs;
+  struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC];
+ }  __attribute__((aligned(32))) ncclIbNetCommBase;
+
 struct ncclIbSendComm {
-  struct ncclIbVerbs verbs;
+  struct ncclIbNetCommBase base;
   struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
-  uint64_t fifoHead;
+  // Each dev correlates to a mergedIbDev
+  struct ncclIbSendCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC];
   struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
-  struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1];
   struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
-  struct ncclSocket sock;
-
-  int ready;
-  struct ibv_qp* qps[NCCL_IB_MAX_QPS];
-  int nqps;
-  int qpIndex;
-  struct ibv_mr* fifoMr;
-  int ar;
-  struct ncclIbGidInfo gidInfo;
+  struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1];
+  struct ncclIbRemSizesFifo remSizesFifo;
+  uint64_t fifoHead;
+  int ar; // Use adaptive routing when all merged devices have it enabled
 };
 struct ncclIbGpuFlush {
-  int enabled;
-  int hostMem;
   struct ibv_mr* hostMr;
   struct ibv_sge sge;
-  struct ibv_qp* qp;
+  struct ncclIbQp qp;
 };
 
 struct ncclIbRemFifo {
   struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
   uint64_t fifoTail;
   uint64_t addr;
-  uint32_t rkey;
   uint32_t flags;
-  struct ibv_mr* mr;
-  struct ibv_sge sge;
 };
 
+struct ncclIbRecvCommDev {
+  struct ncclIbNetCommDevBase base;
+  struct ncclIbGpuFlush gpuFlush;
+  uint32_t fifoRkey;
+  struct ibv_mr* fifoMr;
+  struct ibv_sge fifoSge;
+  struct ibv_mr* sizesFifoMr;
+} __attribute__((aligned(16)));
+
 struct ncclIbRecvComm {
-  struct ncclIbVerbs verbs;
+  struct ncclIbNetCommBase base;
+  struct ncclIbRecvCommDev    devs[NCCL_IB_MAX_DEVS_PER_NIC];
   struct ncclIbRemFifo remFifo;
-  struct ncclSocket sock;
-  int ready;
-  struct ibv_qp* qps[NCCL_IB_MAX_QPS];
-  int nqps;
-  int qpIndex;
-  struct ncclIbGpuFlush gpuFlush;
-  struct ncclIbGidInfo gidInfo;
+  int sizesFifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+  int gpuFlushHostMem;
+  int flushEnabled;
 };
 
 ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
@@ -223,55 +296,65 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
   // The SendFifo needs to be 32-byte aligned and each element needs
   // to be a 32-byte multiple, so that an entry does not get split and
   // written out of order when IB Relaxed Ordering is enabled
+  NCCL_STATIC_ASSERT((sizeof(struct ncclIbNetCommBase) % 32) == 0, "ncclIbNetCommBase size must be 32-byte multiple to ensure fifo is at proper offset");
   NCCL_STATIC_ASSERT((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
   NCCL_STATIC_ASSERT((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");
+  NCCL_STATIC_ASSERT((offsetof(struct ncclIbSendComm, sges) % 32) == 0, "sges must be 32-byte aligned");
+  NCCL_STATIC_ASSERT((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32-byte aligned");
   NCCL_STATIC_ASSERT((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
 
+
   return nccl_p2p_ib_init(&ncclNIbDevs, ncclIbDevs, ncclIbIfName, &ncclIbIfAddr, &ncclIbAsyncThread, logFunction);
 }
 
 NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1);
 
-ncclResult_t ncclIbInitVerbs(int dev, struct ibv_context* ctx, struct ncclIbVerbs* verbs) {
-  verbs->dev = dev;
+static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclIbNetCommDevBase* base) {
+  req->events[devIndex]++; // bump the request's event counter for this device slot
+  req->devBases[devIndex] = base; // record the per-device base associated with that slot
+}
 
-  pthread_mutex_lock(&ncclIbDevs[dev].lock);
-  if (0 == ncclIbDevs[dev].pdRefs++) {
+ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base) {
+  base->ibDevN = ibDevN;
+  ncclIbDev* ibDev = ncclIbDevs + ibDevN;
+  pthread_mutex_lock(&ibDev->lock);
+  if (0 == ibDev->pdRefs++) {
     ncclResult_t res;
-    NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ncclIbDevs[dev].pd, ctx), res, failure);
+    NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ibDev->pd, ibDev->context), res, failure);
     if (0) {
     failure:
-      pthread_mutex_unlock(&ncclIbDevs[dev].lock);
+      pthread_mutex_unlock(&ibDev->lock);
       return res;
     }
   }
-  verbs->pd = ncclIbDevs[dev].pd;
-  pthread_mutex_unlock(&ncclIbDevs[dev].lock);
+  base->pd = ibDev->pd;
+  pthread_mutex_unlock(&ibDev->lock);
 
   // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
-  NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
+  NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
+
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) {
+ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) {
   ncclResult_t res;
-  NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq));
+  NCCLCHECK(wrap_ibv_destroy_cq(base->cq));
 
-  pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
-  if (0 == --ncclIbDevs[verbs->dev].pdRefs) {
-    NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[verbs->dev].pd), res, returning);
+  pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock);
+  if (0 == --ncclIbDevs[base->ibDevN].pdRefs) {
+    NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[base->ibDevN].pd), res, returning);
   }
   res = ncclSuccess;
 returning:
-  pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+  pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock);
   return res;
 }
 
-ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) {
+ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, struct ncclIbQp* qp) {
   struct ibv_qp_init_attr qpInitAttr;
   memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
-  qpInitAttr.send_cq = verbs->cq;
-  qpInitAttr.recv_cq = verbs->cq;
+  qpInitAttr.send_cq = base->cq;
+  qpInitAttr.recv_cq = base->cq;
   qpInitAttr.qp_type = IBV_QPT_RC;
   // We might send 2 messages per send (RDMA and RDMA_WITH_IMM)
   qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
@@ -279,31 +362,26 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
   qpInitAttr.cap.max_send_sge = 1;
   qpInitAttr.cap.max_recv_sge = 1;
   qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? sizeof(struct ncclIbSendFifo) : 0;
-  NCCLCHECK(wrap_ibv_create_qp(qp, verbs->pd, &qpInitAttr));
+  NCCLCHECK(wrap_ibv_create_qp(&qp->qp, base->pd, &qpInitAttr));
   struct ibv_qp_attr qpAttr;
   memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
   qpAttr.qp_state = IBV_QPS_INIT;
   qpAttr.pkey_index = ncclParamIbPkey();
   qpAttr.port_num = ib_port;
   qpAttr.qp_access_flags = access_flags;
-  NCCLCHECK(wrap_ibv_modify_qp(*qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
+  NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
+ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t dest_qp_num, struct ncclIbDevInfo* info) {
   struct ibv_qp_attr qpAttr;
   memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
   qpAttr.qp_state = IBV_QPS_RTR;
   qpAttr.path_mtu = info->mtu;
-  qpAttr.dest_qp_num = qpn;
+  qpAttr.dest_qp_num = dest_qp_num;
   qpAttr.rq_psn = 0;
   qpAttr.max_dest_rd_atomic = 1;
   qpAttr.min_rnr_timer = 12;
-  qpAttr.ah_attr.is_global = 0;
-  qpAttr.ah_attr.dlid = info->lid;
-  qpAttr.ah_attr.sl = ncclParamIbSl();
-  qpAttr.ah_attr.src_path_bits = 0;
-  qpAttr.ah_attr.port_num = info->ib_port;
   if (info->link_layer == IBV_LINK_LAYER_ETHERNET || info->is_global) {
     qpAttr.ah_attr.is_global = 1;
     qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn;
@@ -312,7 +390,13 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* i
     qpAttr.ah_attr.grh.sgid_index = ncclParamIbGidIndex();
     qpAttr.ah_attr.grh.hop_limit = 255;
     qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc();
+  } else {
+    qpAttr.ah_attr.is_global = 0;
+    qpAttr.ah_attr.dlid = info->lid;
   }
+  qpAttr.ah_attr.sl = ncclParamIbSl();
+  qpAttr.ah_attr.src_path_bits = 0;
+  qpAttr.ah_attr.port_num = info->ib_port;
   NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER));
   return ncclSuccess;
 }
@@ -346,12 +430,11 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
+ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
   struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
-  enum ncclSocketState conState;
   struct ncclIbCommStage* stage = &handle->stage;
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
-  struct ncclIbQpInfo remQpInfo;
+  struct ncclIbConnectionMetadata remMeta;
   int ready;
   *sendComm = NULL;
 
@@ -365,94 +448,188 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
   }
 
   NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
-  NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
+  NCCLCHECK(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
   stage->comm = comm;
   stage->state = ncclIbCommStateConnect;
-  NCCLCHECK(ncclSocketConnect(&comm->sock));
+  NCCLCHECK(ncclSocketConnect(&comm->base.sock));
 
 ib_connect_check:
   /* since ncclSocketConnect is async, we must check if connection is complete */
-  NCCLCHECK(ncclSocketReady(&comm->sock, &ready));
+  NCCLCHECK(ncclSocketReady(&comm->base.sock, &ready));
   if (!ready) return ncclSuccess;
 
   // IB Setup
-  struct ibv_context* ctx;
-  ctx = ncclIbDevs[dev].context;
-  NCCLCHECK(ncclIbInitVerbs(dev, ctx, &comm->verbs));
-  uint8_t ib_port;
-  ib_port = ncclIbDevs[dev].port;
-  comm->nqps = ncclParamIbQpsPerConn();
-  for (int q=0; q<comm->nqps; q++) {
-    NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q));
+  struct ncclIbMergedDev* mergedDev;
+  mergedDev = ncclIbMergedDevs + dev;
+  comm->base.ndevs = mergedDev->ndevs;
+  comm->base.nqps = ncclParamIbQpsPerConn() * comm->base.ndevs; // We must have at least 1 qp per-device
+  comm->base.isSend = true;
+
+  // Init PD, Ctx for each IB device
+  comm->ar = 1; // Set to 1 for logic
+  for (int i = 0; i < mergedDev->ndevs; i++) {
+    int ibDevN = mergedDev->devs[i];
+    NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base));
+    comm->ar = comm->ar && ncclIbDevs[ibDevN].ar; // ADAPTIVE_ROUTING - only if every merged dev has it; ncclIbDevs is indexed by ibDevN, not by the merged index `dev`
+  }
-  comm->ar = ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING
-
-  // Send my QP Info to receiver through the socket. Hope this won't block.
-  struct ibv_port_attr portAttr;
-  NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
-  struct ncclIbQpInfo qpInfo;
-  qpInfo.ib_port = ib_port;
-  for (int q=0; q<comm->nqps; q++) qpInfo.qpn[q] = comm->qps[q]->qp_num;
-  qpInfo.mtu = portAttr.active_mtu;
-
-  // Prepare my fifo
-  NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
-  qpInfo.fifoRkey = comm->fifoMr->rkey;
-  qpInfo.fifoAddr = (uint64_t)comm->fifo;
-
-  // RoCE support
-  qpInfo.lid = portAttr.lid;
-  qpInfo.link_layer = comm->gidInfo.link_layer = portAttr.link_layer;
-  qpInfo.is_global = (ncclParamIbIsGlobal()
-#if HAVE_DECL_IBV_QPF_GRH_REQUIRED
-                     || (portAttr.flags & IBV_QPF_GRH_REQUIRED)
+
+  struct ncclIbConnectionMetadata meta;
+  memset(&meta, 0, sizeof(meta)); // the whole struct is copied to the staging buffer and sent; do not ship uninitialized stack bytes
+  meta.ndevs = comm->base.ndevs;
+  // Alternate QPs between devices
+  int devIndex;
+  devIndex = 0;
+  for (int q = 0; q < comm->base.nqps; q++) {
+    ncclIbSendCommDev* commDev = comm->devs + devIndex;
+    ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
+    NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, comm->base.qps+q));
+    comm->base.qps[q].devIndex = devIndex;
+    meta.qpInfo[q].qpn      = comm->base.qps[q].qp->qp_num;
+    meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
+
+    // Query ece capabilities (enhanced connection establishment)
+    NCCLCHECK(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
+    devIndex = (devIndex + 1) % comm->base.ndevs;
+  }
+
+  for (int i = 0; i < comm->base.ndevs; i++) {
+    ncclIbSendCommDev* commDev = comm->devs + i;
+    ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
+    // Send my QP Info to receiver through the socket. Hope this won't block.
+    // TODO - I thought I queried this in init?
+    NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr));
+
+    // Write to the metadata struct via this pointer
+    ncclIbDevInfo* devInfo = meta.devs + i;
+    devInfo->ib_port       = ibDev->portNum;
+    devInfo->mtu           = ibDev->portAttr.active_mtu;
+    devInfo->lid           = ibDev->portAttr.lid;
+
+    // Prepare my fifo
+    NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+    devInfo->fifoRkey = commDev->fifoMr->rkey;
+
+    // RoCE support
+    devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
+    devInfo->is_global = (ncclParamIbIsGlobal()
+    #if HAVE_DECL_IBV_QPF_GRH_REQUIRED
+                     || (ibDev->portAttr.flags & IBV_QPF_GRH_REQUIRED)
 #endif
     );
-  if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND && !qpInfo.is_global) { // IB
-    for (int q=0; q<comm->nqps; q++)
-      INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid);
-  } else { // RoCE
-    NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &comm->gidInfo.localGid));
-    qpInfo.spn = comm->gidInfo.localGid.global.subnet_prefix;
-    qpInfo.iid = comm->gidInfo.localGid.global.interface_id;
-    for (int q=0; q<comm->nqps; q++)
-      INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+
+    if (devInfo->link_layer == IBV_LINK_LAYER_ETHERNET || devInfo->is_global) {
+      NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &commDev->base.gidInfo.localGid));
+      devInfo->spn = commDev->base.gidInfo.localGid.global.subnet_prefix;
+      devInfo->iid = commDev->base.gidInfo.localGid.global.interface_id;
+    }
+
+    if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
+      for (int q = 0; q < comm->base.nqps; q++) {
+        // Print just the QPs for this dev
+        if (comm->base.qps[q].devIndex == i)
+          INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d fifoRkey=0x%x fifoLkey=0x%x",
+            comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev",
+            dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, devInfo->fifoRkey, commDev->fifoMr->lkey);
+      }
+    } else { // RoCE
+      for (int q = 0; q < comm->base.nqps; q++) {
+        // Print just the QPs for this dev
+        if (comm->base.qps[q].devIndex == i)
+          INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x",
+            comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
+            commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, ncclParamIbGidIndex(),
+            devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey);
+      }
+    }
   }
 
+  meta.fifoAddr = (uint64_t)comm->fifo;
+  strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME);
+  meta.devName[MAX_MERGED_DEV_NAME-1] = '\0'; // strncpy leaves no terminator on truncation; a received devName is logged with %s
   stage->state = ncclIbCommStateSend;
   stage->offset = 0;
-  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(qpInfo)));
-  memcpy(stage->buffer, &qpInfo, sizeof(qpInfo));
+  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)));
+
+  memcpy(stage->buffer, &meta, sizeof(meta));
 
 ib_send:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset));
-  if (stage->offset != sizeof(qpInfo)) return ncclSuccess;
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset));
+  if (stage->offset != sizeof(meta)) return ncclSuccess;
+
 
   stage->state = ncclIbCommStateConnecting;
   stage->offset = 0;
   // Clear the staging buffer for re-use
-  memset(stage->buffer, 0, sizeof(qpInfo));
+  memset(stage->buffer, 0, sizeof(meta));
 
 ib_connect:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, stage->buffer, sizeof(ncclIbQpInfo), &stage->offset));
-  if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;
+   NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset));
+  if (stage->offset != sizeof(remMeta)) return ncclSuccess;
+
+  memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata));
+
+  comm->base.nRemDevs = remMeta.ndevs;
+  if (comm->base.nRemDevs != comm->base.ndevs) {
+    mergedDev = ncclIbMergedDevs + dev;
+    WARN("NET/IB : Local mergedDev=%s has a different number of devices=%d as remoteDev=%s nRemDevs=%d",
+      mergedDev->devName, comm->base.ndevs, remMeta.devName, comm->base.nRemDevs);
+  }
+
+  int link_layer;
+  link_layer = remMeta.devs[0].link_layer;
+  for (int i = 1; i < remMeta.ndevs; i++) {
+    if (remMeta.devs[i].link_layer != link_layer) {
+      WARN("NET/IB : Can't merge net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d",
+      i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer);
+      return ncclInternalError;
+    }
+  }
+
+  // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
+  for (int i = 0; i < remMeta.ndevs; i++) {
+    comm->base.remDevs[i] = remMeta.devs[i];
+    comm->base.remDevs[i].remoteGid.global.interface_id = comm->base.remDevs[i].iid;
+    comm->base.remDevs[i].remoteGid.global.subnet_prefix = comm->base.remDevs[i].spn;
+
+    // Retain remote sizes fifo info and prepare RDMA ops
+    comm->remSizesFifo.rkeys[i] = remMeta.devs[i].fifoRkey;
+    comm->remSizesFifo.addr = remMeta.fifoAddr;
+  }
+
+  for (int i=0; i < comm->base.ndevs; i++) {
+    NCCLCHECK(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
+  }
+  comm->base.nRemDevs = remMeta.ndevs;
+  for (int q = 0; q < comm->base.nqps; q++) {
+    struct ncclIbQpInfo* remQpInfo   = remMeta.qpInfo + q;
+    struct ncclIbDevInfo* remDevInfo = remMeta.devs + remQpInfo->devIndex;
+
+    // Assign per-QP remDev
+    comm->base.qps[q].remDevIdx = remQpInfo->devIndex;
 
-  memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo));
+    struct ibv_qp* qp = comm->base.qps[q].qp;
+    if (remQpInfo->ece_supported) // apply the peer's ECE options only when the peer reported ECE support
+      NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported));
 
-  comm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn;
-  comm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid;
-  for (int q=0; q<comm->nqps; q++) {
-    struct ibv_qp* qp = comm->qps[q];
-    NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
+    NCCLCHECK(ncclIbRtrQp(qp, remQpInfo->qpn, remDevInfo));
     NCCLCHECK(ncclIbRtsQp(qp));
   }
 
-  comm->ready = 1;
+  if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE
+    for (int q = 0; q < comm->base.nqps; q++) {
+      struct ncclIbQp* qp = comm->base.qps + q;
+      int ibDevN = comm->devs[qp->devIndex].base.ibDevN;
+      struct ncclIbDev* ibDev = ncclIbDevs + ibDevN;
+      INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}",
+        ibDevN, ibDev->portNum, remMeta.qpInfo[q].qpn, remMeta.qpInfo[q].ece_supported, remMeta.qpInfo[q].ece.vendor_id, remMeta.qpInfo[q].ece.options, remMeta.qpInfo[q].ece.comp_mask);
+    }
+  }
+  comm->base.ready = 1;
   stage->state = ncclIbCommStateConnected;
   stage->offset = 0;
 
 ib_send_ready:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, &comm->ready, sizeof(int), &stage->offset));
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset));
   if (stage->offset != sizeof(int)) return ncclSuccess;
 
   free(stage->buffer);
@@ -462,9 +639,14 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
   return ncclSuccess;
 }
 
+ncclResult_t ncclIbConnect_v6(int dev, void* opaqueHandle, void** sendComm) {
+  ncclNetDeviceHandle_v7_t* handle = NULL; // the v6 signature has no device-handle out-parameter; pass a local placeholder that is discarded
+  return ncclIbConnect(dev, opaqueHandle, sendComm, &handle);
+}
+
 NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
 
-ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
+ncclResult_t ncclIbAccept(void* listenComm, void** recvComm,  ncclNetDeviceHandle_t** recvDevComm) {
   struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
   struct ncclIbCommStage* stage = &lComm->stage;
   struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
@@ -483,116 +665,177 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
   NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
   stage->comm = rComm;
   stage->state = ncclIbCommStateAccept;
-  NCCLCHECK(ncclSocketInit(&rComm->sock, NULL, NCCL_SOCKET_MAGIC, ncclSocketTypeUnknown, NULL, 0));
-  NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
+  NCCLCHECK(ncclSocketInit(&rComm->base.sock, NULL, NCCL_SOCKET_MAGIC, ncclSocketTypeUnknown, NULL, 0));
+  NCCLCHECK(ncclSocketAccept(&rComm->base.sock, &lComm->sock));
 
 ib_accept_check:
-  NCCLCHECK(ncclSocketReady(&rComm->sock, &ready));
+  NCCLCHECK(ncclSocketReady(&rComm->base.sock, &ready));
   if (!ready) return ncclSuccess;
 
-  struct ncclIbQpInfo remQpInfo;
+  struct ncclIbConnectionMetadata remMeta;
   stage->state = ncclIbCommStateRecv;
   stage->offset = 0;
-  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo)));
+  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)));
 
 ib_recv:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
-  if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset));
+  if (stage->offset != sizeof(remMeta)) return ncclSuccess;
 
   /* copy back the received info */
-  memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
-
-  rComm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn;
-  rComm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid;
+  memcpy(&remMeta, stage->buffer, sizeof(struct ncclIbConnectionMetadata));
+
+    // IB setup
+  // Pre-declare variables because of goto
+  struct ncclIbMergedDev* mergedDev;
+  struct ncclIbDev* ibDev;
+  int ibDevN;
+  struct ncclIbRecvCommDev* rCommDev;
+  struct ncclIbDevInfo* remDevInfo;
+  struct ncclIbQp* qp;
+
+  mergedDev = ncclIbMergedDevs + lComm->dev;
+  rComm->base.ndevs = mergedDev->ndevs;
+  rComm->base.nqps  = ncclParamIbQpsPerConn() * rComm->base.ndevs; // We must have at least 1 qp per-device
+  rComm->base.isSend = false;
+
+  rComm->base.nRemDevs = remMeta.ndevs;
+  if (rComm->base.nRemDevs != rComm->base.ndevs) {
+    WARN("NET/IB : Local mergedDev %s has a different number of devices=%d as remote %s %d",
+      mergedDev->devName, rComm->base.ndevs, remMeta.devName, rComm->base.nRemDevs);
+  }
 
-  // IB setup
-  struct ibv_context* ctx;
-  uint8_t ib_port;
-  ctx = ncclIbDevs[lComm->dev].context;
-  ib_port = ncclIbDevs[lComm->dev].port;
-  struct ibv_port_attr portAttr;
-  NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
-  NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &rComm->gidInfo.localGid));
-
-  // QP Creation
-  NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs));
-  rComm->nqps = ncclParamIbQpsPerConn();
-  for (int q=0; q<rComm->nqps; q++) {
-    NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, rComm->qps+q));
+  // Metadata to send back to requestor (sender)
+  struct ncclIbConnectionMetadata meta;
+  for (int i = 0; i < rComm->base.ndevs; i++) {
+    rCommDev = rComm->devs + i;
+    ibDevN = mergedDev->devs[i];
+    NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base));
+    ibDev = ncclIbDevs + ibDevN;
+    NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr));
+    NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &rCommDev->base.gidInfo.localGid));
   }
 
-  // Adjust the MTU
-  remQpInfo.mtu = (enum ibv_mtu)MIN(remQpInfo.mtu, portAttr.active_mtu);
+  // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
+  for (int i = 0; i < remMeta.ndevs; i++) {
+    rComm->base.remDevs[i] = remMeta.devs[i];
+    rComm->base.remDevs[i].remoteGid.global.interface_id  = rComm->base.remDevs[i].iid;
+    rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].spn;
+  }
 
-  // Setup QP
-  for (int q=0; q<rComm->nqps; q++) {
-    struct ibv_qp* qp = rComm->qps[q];
-    NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
-    NCCLCHECK(ncclIbRtsQp(qp));
+  // Stripe QP creation across merged devs
+  // Make sure to get correct remote peer dev and QP info
+  int remDevIndex;
+  int devIndex;
+  devIndex = 0;
+  for (int q = 0; q < rComm->base.nqps; q++) {
+    remDevIndex = remMeta.qpInfo[q].devIndex;
+    remDevInfo = remMeta.devs + remDevIndex;
+    qp = rComm->base.qps+q;
+    rCommDev = rComm->devs + devIndex;
+    qp->remDevIdx = remDevIndex;
+
+    // Local ibDevN
+    ibDevN = rComm->devs[devIndex].base.ibDevN;
+    ibDev = ncclIbDevs + ibDevN;
+    NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, qp));
+    qp->devIndex = devIndex;
+    devIndex = (devIndex + 1) % rComm->base.ndevs;
+
+    // Set the ece (enhanced connection establishment) on this QP before RTR
+    if (remMeta.qpInfo[q].ece_supported) {
+      NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
+
+      // Query the reduced ece for this QP (matching enhancements between the requestor and the responder)
+      // Store this in our own qpInfo for returning to the requestor
+      if (meta.qpInfo[q].ece_supported)
+        NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
+    }
+    NCCLCHECK(ncclIbRtrQp(qp->qp, remMeta.qpInfo[q].qpn, remDevInfo));
+    NCCLCHECK(ncclIbRtsQp(qp->qp));
   }
 
-  // Retain remote fifo info and prepare my RDMA ops
-  rComm->remFifo.rkey = remQpInfo.fifoRkey;
-  rComm->remFifo.addr = remQpInfo.fifoAddr;
-  NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
-  rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
-  if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
-
-  // Allocate Flush dummy buffer for GPU Direct RDMA
-  rComm->gpuFlush.enabled = ((nccl_p2p_gdr_support(lComm->dev) == ncclSuccess) || nccl_p2p_dmabuf_support(lComm->dev) == ncclSuccess) &&
-                                                 (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0;
-  if (rComm->gpuFlush.enabled) {
-    NCCLCHECK(wrap_ibv_reg_mr(&rComm->gpuFlush.hostMr, rComm->verbs.pd, &rComm->gpuFlush.hostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
-    rComm->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlush.hostMem;
-    rComm->gpuFlush.sge.length = 1;
-    rComm->gpuFlush.sge.lkey = rComm->gpuFlush.hostMr->lkey;
-    NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp));
-    struct ncclIbQpInfo localQpInfo;
-    localQpInfo.lid=portAttr.lid;
-    localQpInfo.link_layer=portAttr.link_layer;
-    localQpInfo.ib_port=ib_port;
-    localQpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix;
-    localQpInfo.iid=rComm->gidInfo.localGid.global.interface_id;
-    localQpInfo.is_global=(ncclParamIbIsGlobal()
+  rComm->flushEnabled = ((nccl_p2p_gdr_support() == ncclSuccess || nccl_p2p_dmabuf_support(lComm->dev) == ncclSuccess)
+                            && (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0;
+
+  for (int i = 0; i < mergedDev->ndevs; i++) {
+    rCommDev = rComm->devs + i;
+    ibDevN = rCommDev->base.ibDevN;
+    ibDev = ncclIbDevs + ibDevN;
+
+    // Retain remote fifo info and prepare my RDMA ops
+    rCommDev->fifoRkey = remMeta.devs[i].fifoRkey;
+    rComm->remFifo.addr = remMeta.fifoAddr;
+    NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
+    rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey;
+    if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
+
+    // Allocate Flush dummy buffer for GPU Direct RDMA
+    if (rComm->flushEnabled) {
+      NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
+      rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
+      rCommDev->gpuFlush.sge.length = 1;
+      rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
+      NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rCommDev->gpuFlush.qp));
+      struct ncclIbDevInfo devInfo;
+      devInfo.lid         = ibDev->portAttr.lid;
+      devInfo.link_layer  = ibDev->portAttr.link_layer;
+      devInfo.ib_port     = ibDev->portNum;
+      devInfo.spn         = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
+      devInfo.iid         = rCommDev->base.gidInfo.localGid.global.interface_id;
+      devInfo.is_global   = (ncclParamIbIsGlobal()
 #if HAVE_DECL_IBV_QPF_GRH_REQUIRED
-                     || (portAttr.flags & IBV_QPF_GRH_REQUIRED)
+                     || (ibDev->portAttr.flags & IBV_QPF_GRH_REQUIRED)
 #endif
       );
-    localQpInfo.mtu=portAttr.active_mtu;
-    NCCLCHECK(ncclIbRtrQp(rComm->gpuFlush.qp, rComm->gpuFlush.qp->qp_num, &localQpInfo));
-    NCCLCHECK(ncclIbRtsQp(rComm->gpuFlush.qp));
-  }
+      devInfo.mtu         = ibDev->portAttr.active_mtu;
+      NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo));
+      NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp));
+    }
 
-  // Fill Handle
-  struct ncclIbQpInfo qpInfo;
-  qpInfo.lid=portAttr.lid;
-  qpInfo.link_layer= rComm->gidInfo.link_layer = portAttr.link_layer;
-  qpInfo.ib_port=ib_port;
-  for (int q=0; q<rComm->nqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num;
-  qpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix;
-  qpInfo.iid=rComm->gidInfo.localGid.global.interface_id;
-  qpInfo.is_global=(ncclParamIbIsGlobal()
+    // Fill Handle
+    meta.devs[i].lid        = ibDev->portAttr.lid;
+    meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
+    meta.devs[i].ib_port    = ibDev->portNum;
+    meta.devs[i].spn        = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
+    meta.devs[i].iid        = rCommDev->base.gidInfo.localGid.global.interface_id;
+    meta.devs[i].is_global  = (ncclParamIbIsGlobal()
 #if HAVE_DECL_IBV_QPF_GRH_REQUIRED
-                     || (portAttr.flags & IBV_QPF_GRH_REQUIRED)
+                     || (ibDev->portAttr.flags & IBV_QPF_GRH_REQUIRED)
 #endif
       );
-  qpInfo.mtu=remQpInfo.mtu;
+
+    // Adjust the MTU
+    remMeta.devs[i].mtu    = (enum ibv_mtu)MIN(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu);
+    meta.devs[i].mtu      = remMeta.devs[i].mtu;
+
+    // Prepare sizes fifo
+    NCCLCHECK(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+    meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey;
+  }
+  meta.fifoAddr = (uint64_t)rComm->sizesFifo;
+
+  for (int q = 0; q < rComm->base.nqps; q++) {
+    meta.qpInfo[q].qpn      = rComm->base.qps[q].qp->qp_num;
+    meta.qpInfo[q].devIndex = rComm->base.qps[q].devIndex;
+  }
+
+  meta.ndevs = rComm->base.ndevs;
+  strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME);
 
   stage->state = ncclIbCommStateSend;
   stage->offset = 0;
   if (stage->buffer) free(stage->buffer);
-  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo)));
-  memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo));
-
+  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)));
+  memcpy(stage->buffer, &meta, sizeof(struct ncclIbConnectionMetadata));
 ib_send:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset));
-  if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess;
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset));
+  if (stage->offset < sizeof(struct ncclIbConnectionMetadata)) return ncclSuccess;
 
   stage->offset = 0;
   stage->state = ncclIbCommStatePendingReady;
 
 ib_recv_ready:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV,  &rComm->sock, &rComm->ready, sizeof(int), &stage->offset));
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV,  &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset));
   if (stage->offset != sizeof(int)) return ncclSuccess;
 
   free(stage->buffer);
@@ -606,14 +849,20 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) {
+ncclResult_t ncclIbAccept_v6(void* listenComm, void** recvComm) { // 2-argument wrapper: forwards to ncclIbAccept and returns its result unchanged
+  ncclNetDeviceHandle_v7_t* handle = NULL; // local placeholder for the recvDevComm out-parameter; not returned to the caller
+  return ncclIbAccept(listenComm, recvComm, &handle);
+}
+
+ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbRequest** req) {
   for (int i=0; i<MAX_REQUESTS; i++) {
-    struct ncclIbRequest* r = verbs->reqs+i;
+    struct ncclIbRequest* r = base->reqs+i;
     if (r->type == NCCL_NET_IB_REQ_UNUSED) {
-      r->verbs = verbs;
-      r->events = 1;
+      r->base = base;
       r->sock = NULL;
-      r->gidInfo = NULL;
+      r->devBases[0] = NULL;
+      r->devBases[1] = NULL;
+      r->events[0] = r->events[1] = 0;
       *req = r;
       return ncclSuccess;
     }
@@ -622,6 +871,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
   *req = NULL;
   return ncclInternalError;
 }
+
 ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
   r->type = NCCL_NET_IB_REQ_UNUSED;
   return ncclSuccess;
@@ -629,22 +879,17 @@ ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
 
 ncclResult_t ncclIbTest(void* request, int* done, int* size);
 
-/* DMA-BUF support */
-ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
-  NCCL_STATIC_ASSERT(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset")
-  assert(size > 0);
-
+ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, size_t size, int type, uint64_t offset, int fd, struct ibv_mr** mhandle) {
   static __thread uintptr_t pageSize = 0;
   if (pageSize == 0) pageSize = sysconf(_SC_PAGESIZE);
 
-  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
-  struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
+  struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache;
   uintptr_t addr = (uintptr_t)data & -pageSize;
   size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
   ncclResult_t res;
-  pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+  pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock);
   for (int slot=0; /*true*/; slot++) {
-    if (slot == cache->population) { // didn't find in cache
+    if (slot == cache->population || addr < cache->slots[slot].addr) { // didn't find in cache
       if (cache->population == cache->capacity) { // must grow cache
         cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
         NCCLCHECKGOTO(ncclRealloc((void **)&cache->slots, sizeof(struct ncclIbMr)*cache->population, sizeof(struct ncclIbMr)*cache->capacity), res, returning);
@@ -655,47 +900,75 @@ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui
       if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING;
       if (fd != -1) {
         /* DMA-BUF support */
-        NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, verbs->pd, offset, pages*pageSize, addr, fd, flags), res, returning);
+        NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, returning);
       } else {
         if (ncclIbRelaxedOrderingEnabled) {
           // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
-          NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, addr, flags), res, returning);
+          NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, base->pd, (void*)addr, pages*pageSize, addr, flags), res, returning);
         }
         else {
-          NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
+          NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, base->pd, (void*)addr, pages*pageSize, flags), res, returning);
         }
       }
-      INFO(NCCL_ALL,"regAddr %llx size %lld rkey %x fd %d type %s", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey, fd, type == NCCL_PTR_HOST ? "host" : "gpu");
-      cache->population += 1;
+      TRACE(NCCL_INIT|NCCL_NET,"regAddr=0x%lx size=%lld rkey=0x%x lkey=0x%x fd=%d", (unsigned long)addr, (long long)pages*pageSize, mr->rkey, mr->lkey, fd);
+      if (slot != cache->population) memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclIbMr));
       cache->slots[slot].addr = addr;
       cache->slots[slot].pages = pages;
       cache->slots[slot].refs = 1;
       cache->slots[slot].mr = mr;
+      cache->population += 1;
       *mhandle = (void*)mr;
       res = ncclSuccess;
       goto returning;
-    }
-    else if (cache->slots[slot].addr == addr && cache->slots[slot].pages == pages) {
+     } else if ((addr >= cache->slots[slot].addr) &&
+        ((addr-cache->slots[slot].addr)/pageSize+pages) <= cache->slots[slot].pages) {
       cache->slots[slot].refs += 1;
-      *mhandle = (void*)cache->slots[slot].mr;
+      *mhandle = cache->slots[slot].mr;
       res = ncclSuccess;
       goto returning;
     }
   }
 returning:
-  pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+  pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock);
   return res;
 }
 
-ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, int devIndex) { // returns &devs[devIndex].base of the send or recv comm that `base` is cast to
+  if (base->isSend) { // isSend decides which concrete comm struct `base` is reinterpreted as
+    struct ncclIbSendComm* sComm = (struct ncclIbSendComm*) base; // NOTE(review): presumably valid because `base` is the first member of ncclIbSendComm - confirm
+    return &sComm->devs[devIndex].base; // devIndex is not range-checked here
+  } else {
+    struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*) base; // NOTE(review): same first-member assumption for ncclIbRecvComm - confirm
+    return &rComm->devs[devIndex].base; // devIndex is not range-checked here
+  }
+}
+
+/* DMA-BUF support: registers `data` once per device of the comm and returns a malloc'ed ncclIbMrHandle through *mhandle */
+ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
+  assert(size > 0); // zero-length registrations are rejected by assertion only
+  struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; // comm is a send or recv comm viewed through its common base
+  struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle)); // NOTE(review): malloc result is not checked for NULL before use
+  for (int i = 0; i < base->ndevs; i++) {
+    // Each ncclIbNetCommDevBase is at a different offset in send and recv netComms
+    struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
+    NCCLCHECK(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i)); // NOTE(review): if NCCLCHECK returns early (presumably on error), mhandleWrapper and earlier registrations are not released - confirm
+  }
+  *mhandle = (void*) mhandleWrapper; // the wrapper is freed by ncclIbDeregMr
+  return ncclSuccess;
+}
+
+ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** mhandle) { // registration without a DMA-BUF: delegates with offset 0 and fd -1
+  return ncclIbRegMrDmaBuf(comm, data, size, type, 0ULL, -1, mhandle); // fd == -1 takes the non-DMA-BUF branch in ncclIbRegMrDmaBufInternal
+}
+
+ncclResult_t ncclIbRegMr_v7(void* comm, void* data, int size, int type, void** mhandle) { // variant taking an int size; the size is cast to size_t before delegating
   return ncclIbRegMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle);
 }
 
-ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
-  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
-  struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
+ncclResult_t ncclIbDeregMrInternal(ncclIbNetCommDevBase* base, struct ibv_mr* mhandle) {
+  struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache;
   ncclResult_t res;
-  pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+  pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock);
   for (int i=0; i < cache->population; i++) {
     if (mhandle == cache->slots[i].mr) {
       if (0 == --cache->slots[i].refs) {
@@ -705,7 +978,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
           cache->slots = NULL;
           cache->capacity = 0;
         }
-        NCCLCHECKGOTO(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle), res, returning);
+         NCCLCHECKGOTO(wrap_ibv_dereg_mr(mhandle), res, returning);
       }
       res = ncclSuccess;
       goto returning;
@@ -714,11 +987,23 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
   WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population);
   res = ncclInternalError;
 returning:
-  pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+  pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock);
   return res;
 }
 
-NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 1);
+ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { // releases every per-device MR held by the ncclIbMrHandle wrapper, then frees the wrapper
+  struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; // mhandle is the wrapper produced by ncclIbRegMrDmaBuf
+  struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
+  for (int i = 0; i < base->ndevs; i++) {
+    // Each ncclIbNetCommDevBase is at a different offset in send and recv netComms
+    struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
+    NCCLCHECK(ncclIbDeregMrInternal(devComm, mhandleWrapper->mrs[i])); // NOTE(review): if NCCLCHECK returns early (presumably on error), the wrapper below is never freed - confirm
+  }
+  free(mhandleWrapper);
+  return ncclSuccess;
+}
+
+NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); // presumably read via ncclParamIbSplitDataOnQps(); when 0, ncclIbMultiSend loops over base.ndevs QPs per send instead of base.nqps
 
 ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
   struct ncclIbRequest** reqs = comm->fifoReqs[slot];
@@ -727,21 +1012,18 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
   if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
 
   uint64_t wr_id = 0ULL;
-
   for (int r=0; r<nreqs; r++) {
     struct ibv_send_wr* wr = comm->wrs+r;
     memset(wr, 0, sizeof(struct ibv_send_wr));
 
     struct ibv_sge* sge = comm->sges+r;
     sge->addr=(uintptr_t)reqs[r]->send.data;
-    sge->lkey=reqs[r]->send.lkey;
 
     wr->opcode = IBV_WR_RDMA_WRITE;
     wr->send_flags = 0;
     wr->wr.rdma.remote_addr = slots[r].addr;
-    wr->wr.rdma.rkey = slots[r].rkey;
-    wr->next = wr+1;
-    wr_id += (reqs[r] - comm->verbs.reqs) << (r*8);
+    wr->next = wr + 1;
+    wr_id += (reqs[r] - comm->base.reqs) << (r*8);
   }
 
   // Write size as immediate data. In the case of multi-send, only write
@@ -750,13 +1032,10 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
   if (nreqs == 1) {
     immData = reqs[0]->send.size;
   } else {
-    if (nreqs > 32) {
-      WARN("Cannot store sizes of %d requests in a 32-bits field", nreqs);
-      return ncclInternalError;
-    }
-    for (int r=0; r<nreqs; r++) {
-      immData |= (reqs[r]->send.size ? 1 : 0) << r;
-    }
+    int* sizes = comm->remSizesFifo.elems[slot];
+    for (int r=0; r<nreqs; r++) sizes[r] = reqs[r]->send.size;
+    comm->remSizesFifo.sge.addr = (uint64_t)sizes;
+    comm->remSizesFifo.sge.length = nreqs*sizeof(int);
   }
 
   struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
@@ -766,6 +1045,12 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
     // completion.
     lastWr++;
     memset(lastWr, 0, sizeof(struct ibv_send_wr));
+    if (nreqs > 1) {
+      // Write remote sizes Fifo
+      lastWr->wr.rdma.remote_addr = comm->remSizesFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(int);
+      lastWr->num_sge = 1;
+      lastWr->sg_list = &comm->remSizesFifo.sge;
+    }
   }
   lastWr->wr_id = wr_id;
   lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
@@ -775,23 +1060,40 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
 
   // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work
   const int align = 128;
-  const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
-  for (int q=0; q<nqps; q++) {
+  int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs;
+  for (int i = 0; i < nqps; i++) {
+    int qpIndex = comm->base.qpIndex;
+    ncclIbQp* qp = comm->base.qps + qpIndex;
+    int devIndex = qp->devIndex;
     for (int r=0; r<nreqs; r++) {
+      // Track this event for completion
+      //ncclIbAddEvent(reqs[r], devIndex, &comm->devs[devIndex].base);
+
+      // Select proper rkey (needed even for 0-size send)
+      comm->wrs[r].wr.rdma.rkey = slots[r].rkeys[qp->remDevIdx];
+
       int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
       int length = MIN(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
       if (length <= 0) {
         comm->wrs[r].sg_list = NULL;
         comm->wrs[r].num_sge = 0;
       } else {
+        // Select proper lkey
+        comm->sges[r].lkey = reqs[r]->send.lkeys[devIndex];
         comm->sges[r].length = length;
         comm->wrs[r].sg_list = comm->sges+r;
         comm->wrs[r].num_sge = 1;
       }
     }
+
+    if (nreqs > 1) {
+      // Also make sure lastWr writes remote sizes using the right lkey
+      comm->remSizesFifo.sge.lkey = comm->remSizesFifo.mrs[devIndex]->lkey;
+      lastWr->wr.rdma.rkey = comm->remSizesFifo.rkeys[devIndex];
+    }
+
     struct ibv_send_wr* bad_wr;
-    NCCLCHECK(wrap_ibv_post_send(comm->qps[comm->qpIndex], comm->wrs, &bad_wr));
-    comm->qpIndex = (comm->qpIndex+1)%comm->nqps;
+    NCCLCHECK(wrap_ibv_post_send(qp->qp, comm->wrs, &bad_wr));
 
     for (int r=0; r<nreqs; r++) {
       int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
@@ -799,6 +1101,9 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
       comm->sges[r].addr += chunkSize;
       comm->wrs[r].wr.rdma.remote_addr += chunkSize;
     }
+
+    // Select the next qpIndex
+    comm->base.qpIndex = (comm->base.qpIndex+1) % comm->base.nqps;
   }
 
   return ncclSuccess;
@@ -806,16 +1111,16 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
 
 ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
-  if (comm->ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->ready == 0"); return ncclInternalError; }
-  if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
+  if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; }
+  if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
 
-  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+  struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
 
   // Wait for the receiver to have posted the corresponding receive
   int nreqs = 0;
   volatile struct ncclIbSendFifo* slots;
 
-  int slot = (comm->fifoHead)%MAX_REQUESTS;
+  int slot = (comm->fifoHead) % MAX_REQUESTS;
   struct ncclIbRequest** reqs = comm->fifoReqs[slot];
   slots = comm->fifo[slot];
   uint64_t idx = comm->fifoHead+1;
@@ -827,36 +1132,45 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
   for (int r=0; r<nreqs; r++) {
     if (reqs[r] != NULL || slots[r].tag != tag) continue;
 
-    // Sanity checks to catch user collective call count/size mismatches
-    if (size > slots[r].size) {
-      char line[SOCKET_NAME_MAXLEN + 1];
-      union ncclSocketAddress addr;
-      ncclSocketGetAddr(&comm->sock, &addr);
-      WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d",
-           r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line, 1), size, slots[r].size);
-      return ncclInvalidUsage;
-    } // plus any potential programming errors
-    else if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
+    if (size > slots[r].size) size = slots[r].size;
+    // Sanity checks
+    if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkeys[0] == 0) {
       char line[SOCKET_NAME_MAXLEN + 1];
       union ncclSocketAddress addr;
-      ncclSocketGetAddr(&comm->sock, &addr);
-      WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x",
-        r, nreqs, tag, ncclSocketToString(&addr, line, 1), slots[r].size, slots[r].addr, slots[r].rkey);
-      return ncclInternalError;
+      ncclSocketGetAddr(&comm->base.sock, &addr);
+      WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkeys[0]=%x",
+        r, nreqs, tag, ncclSocketToString(&addr, line, 1), slots[r].size, slots[r].addr, slots[r].rkeys[0]);
     }
     struct ncclIbRequest* req;
-    NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
+    NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
     req->type = NCCL_NET_IB_REQ_SEND;
-    req->sock = &comm->sock;
-    req->verbs = &comm->verbs;
+    req->sock = &comm->base.sock;
+    req->base = &comm->base;
     req->nreqs = nreqs;
     req->send.size = size;
     req->send.data = data;
-    req->send.lkey = mr->lkey;
     req->send.offset = 0;
-    req->events = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
-    req->events = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
-    if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo;
+
+    // Populate events
+    int nEvents = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs;
+    int qpIndex = comm->base.qpIndex;
+    // Count down
+    while (nEvents > 0) {
+      ncclIbQp* qp = comm->base.qps + qpIndex;
+      int devIndex = qp->devIndex;
+      ncclIbAddEvent(req, devIndex, &comm->devs[devIndex].base);
+      // Track the valid lkey for this RDMA_Write
+      req->send.lkeys[devIndex] = mhandleWrapper->mrs[devIndex]->lkey;
+      nEvents--;
+      // Don't update comm->base.qpIndex yet, we need to run through this same set of QPs inside ncclIbMultiSend()
+      qpIndex = (qpIndex+1)%comm->base.nqps;
+    }
+
+    // Store all lkeys
+    for (int i = 0; i < comm->base.ndevs; i++) {
+      req->send.lkeys[i] = mhandleWrapper->mrs[i]->lkey;
+    }
+
     *request = reqs[r] = req;
 
     // If this is a multi-recv, send only when all requests have matched.
@@ -884,12 +1198,23 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
   memset(&wr, 0, sizeof(wr));
 
   int slot = comm->remFifo.fifoTail%MAX_REQUESTS;
+  req->recv.sizes = comm->sizesFifo[slot];
+  for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
   struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot];
 
+  // Select the next devIndex (local) and QP to use for posting this CTS message
+  // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value
+  ncclIbQp* ctsQp = comm->base.qps + comm->base.devIndex;
+  comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.ndevs;
+
   for (int i=0; i<n; i++) {
     localElem[i].addr = (uint64_t)data[i];
-    struct ibv_mr* mr = (struct ibv_mr*)mhandles[i];
-    localElem[i].rkey = mr->rkey;
+    struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandles[i];
+
+    // Send all applicable rkeys
+    for (int j = 0; j < comm->base.ndevs; j++)
+      localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+
     localElem[i].nreqs = n;
     localElem[i].size = sizes[i]; // Sanity/Debugging
     localElem[i].tag = tags[i];
@@ -897,11 +1222,17 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
   }
 
   wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo);
-  wr.wr.rdma.rkey = comm->remFifo.rkey;
-  comm->remFifo.sge.addr = (uint64_t)localElem;
-  comm->remFifo.sge.length = n*sizeof(struct ncclIbSendFifo);
-  wr.sg_list = &comm->remFifo.sge;
+
+  // Lookup the correct fifoRkey
+  wr.wr.rdma.rkey = comm->base.remDevs[ctsQp->remDevIdx].fifoRkey;
+
+  // Set the correct sge properties
+  comm->devs[ctsQp->devIndex].fifoSge.addr   = (uint64_t)localElem;
+  comm->devs[ctsQp->devIndex].fifoSge.length = n*sizeof(struct ncclIbSendFifo);
+  wr.sg_list = &comm->devs[ctsQp->devIndex].fifoSge;
+
   wr.num_sge = 1;
+
   wr.opcode = IBV_WR_RDMA_WRITE;
   wr.send_flags = comm->remFifo.flags; // IBV_SEND_INLINE
 
@@ -926,14 +1257,17 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
   //    polling it will empty the Send Queue, can be posted)
   //  - The status of all posted Send Request is considered unknown
   //
-  if (slot == 0) {
+  // slot == devIndex: when writing to fifo slot N with a QP that lives on device index N, post the write signaled.
+  // The intent is that every fifo-posting QP gets its send queue drained.
+  if (slot == ctsQp->devIndex) {
+
     wr.send_flags |= IBV_SEND_SIGNALED;
-    wr.wr_id = req - comm->verbs.reqs;
-    req->events++;
+    wr.wr_id = req - comm->base.reqs;
+    ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base);
   }
 
   struct ibv_send_wr* bad_wr;
-  NCCLCHECK(wrap_ibv_post_send(comm->qps[0], &wr, &bad_wr));
+  NCCLCHECK(wrap_ibv_post_send(ctsQp->qp, &wr, &bad_wr));
   comm->remFifo.fifoTail++;
 
   return ncclSuccess;
@@ -941,42 +1275,48 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
 
 ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
-  if (comm->ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->ready == 0"); return ncclInternalError; }
-  if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
+  if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; }
+  if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
+
   if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
 
   struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
+  NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
   req->type = NCCL_NET_IB_REQ_RECV;
-  req->sock = &comm->sock;
+  req->sock = &comm->base.sock;
   req->nreqs = n;
-  if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo;
-  for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
+
+  for (int i = 0; i < comm->base.ndevs; i++) {
+    req->devBases[i] = &comm->devs[i].base;
+  }
 
   struct ibv_recv_wr wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = req - comm->verbs.reqs;
 
+  wr.wr_id = req - comm->base.reqs;
   wr.sg_list = NULL;
   wr.num_sge = 0;
 
   TIME_START(1);
-  const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1;
-  for (int q=0; q<nqps; q++) {
-    struct ibv_qp* qp = comm->qps[comm->qpIndex];
-    struct ibv_recv_wr* bad_wr;
-    NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr));
-    comm->qpIndex = (comm->qpIndex+1)%comm->nqps;
+  // Select either all QPs, or one qp per-device
+  const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs;
+
+  // Post recvs
+  struct ibv_recv_wr* bad_wr;
+  for (int i = 0; i < nqps; i++) {
+    struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex;
+    ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
+    NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr));
+    comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps;
   }
   TIME_STOP(1);
-  req->events = nqps;
-
-  *request = req;
 
   // Post to FIFO to notify sender
   TIME_START(2);
   NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req));
   TIME_STOP(2);
+
+  *request = req;
   return ncclSuccess;
 }
 
@@ -984,30 +1324,34 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   int last = -1;
   for (int i=0; i<n; i++) if (sizes[i]) last = i;
-  if (comm->gpuFlush.enabled == 0 || last == -1) return ncclSuccess;
+  if (comm->flushEnabled == 0 || last == -1) return ncclSuccess;
 
   // Only flush once using the last non-zero receive
   struct ncclIbRequest* req;
-  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
+  NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
   req->type = NCCL_NET_IB_REQ_FLUSH;
-  req->sock = &comm->sock;
-  struct ibv_mr* mr = (struct ibv_mr*)mhandles[last];
-
-  struct ibv_send_wr wr;
-  memset(&wr, 0, sizeof(wr));
-  wr.wr_id = req - comm->verbs.reqs;
-
-  wr.wr.rdma.remote_addr = (uint64_t)data[last];
-  wr.wr.rdma.rkey = mr->rkey;
-  wr.sg_list = &comm->gpuFlush.sge;
-  wr.num_sge = 1;
-  wr.opcode = IBV_WR_RDMA_READ;
-  wr.send_flags = IBV_SEND_SIGNALED;
-
-  TIME_START(4);
-  struct ibv_send_wr* bad_wr;
-  NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr));
-  TIME_STOP(4);
+  req->sock = &comm->base.sock;
+  struct ncclIbMrHandle* mhandle = (struct ncclIbMrHandle*) mhandles[last];
+
+  // We don't know which devIndex the recv was on, so we flush on all devices
+  for (int i = 0; i < comm->base.ndevs; i++) {
+    struct ibv_send_wr wr;
+    memset(&wr, 0, sizeof(wr));
+    wr.wr_id = req - comm->base.reqs;
+
+    wr.wr.rdma.remote_addr = (uint64_t)data[last];
+    wr.wr.rdma.rkey = mhandle->mrs[i]->rkey;
+    wr.sg_list = &comm->devs[i].gpuFlush.sge;
+    wr.num_sge = 1;
+    wr.opcode = IBV_WR_RDMA_READ;
+    wr.send_flags = IBV_SEND_SIGNALED;
+
+    TIME_START(4);
+    struct ibv_send_wr* bad_wr;
+    NCCLCHECK(wrap_ibv_post_send(comm->devs[i].gpuFlush.qp.qp, &wr, &bad_wr));
+    TIME_STOP(4);
+    ncclIbAddEvent(req, i, &comm->devs[i].base);
+  }
 
   *request = req;
   return ncclSuccess;
@@ -1018,74 +1362,108 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
   *done = 0;
 
   while (1) {
-    if (r->events == 0) {
+    if (r->events[0] == 0 && r->events[1] == 0) {
+      TRACE(NCCL_NET, "r=%p done", r);
       *done = 1;
       if (sizes && r->type == NCCL_NET_IB_REQ_RECV) {
         for (int i=0; i<r->nreqs; i++) sizes[i] = r->recv.sizes[i];
       }
+      if (sizes && r->type == NCCL_NET_IB_REQ_SEND) {
+        sizes[0] = r->send.size;
+      }
+
+      if (sizes && r->type == NCCL_NET_IB_REQ_SEND) {
+        sizes[0] = r->send.size;
+      }
+
       NCCLCHECK(ncclIbFreeRequest(r));
       return ncclSuccess;
     }
 
+    int totalWrDone = 0;
     int wrDone = 0;
     struct ibv_wc wcs[4];
-    TIME_START(3);
-    NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone));
-    if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
-    if (wrDone == 0) return ncclSuccess;
-
-    for (int w=0; w<wrDone; w++) {
-      struct ibv_wc *wc = wcs+w;
-      if (wc->status != IBV_WC_SUCCESS) {
-        char line[SOCKET_NAME_MAXLEN+1];
-        union ncclSocketAddress addr;
-        ncclSocketGetAddr(r->sock, &addr);
-        char localGidString[INET6_ADDRSTRLEN] = "";
-        char remoteGidString[INET6_ADDRSTRLEN] = "";
-        const char* localGidStr = NULL, *remoteGidStr = NULL;
-        if (r->gidInfo) {
-            localGidStr = inet_ntop(AF_INET6, &r->gidInfo->localGid, localGidString, sizeof(localGidString));
-            remoteGidStr = inet_ntop(AF_INET6, &r->gidInfo->remoteGid, remoteGidString, sizeof(remoteGidString));
-        }
-        WARN("NET/IB : Got completion from peer %s with error %s(%d), opcode %d, len %d, vendor err %d (%s)%s%s%s%s",
-            ncclSocketToString(&addr, line, 1), ibv_wc_status_str(wc->status), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type],
-            localGidStr ?  " localGid ":"", localGidString, remoteGidStr ? " remoteGid ":"", remoteGidString);
-        return ncclRemoteError;
-      }
+    for (int i = 0; i < NCCL_IB_MAX_DEVS_PER_NIC; i++) {
+      TIME_START(3);
+      // If we expect any completions from this device's CQ
+      if (r->events[i]) {
+        NCCLCHECK(wrap_ibv_poll_cq(r->devBases[i]->cq, 4, wcs, &wrDone));
+        totalWrDone += wrDone;
+        if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
+        if (wrDone == 0) continue;
+        for (int w=0; w<wrDone; w++) {
+          struct ibv_wc *wc = wcs+w;
+          if (wc->status != IBV_WC_SUCCESS) {
+            union ncclSocketAddress addr;
+            ncclSocketGetAddr(r->sock, &addr);
+            char localGidString[INET6_ADDRSTRLEN] = "";
+            char remoteGidString[INET6_ADDRSTRLEN] = "";
+            const char* localGidStr = NULL, *remoteGidStr = NULL;
+            if (r->devBases[i]->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) {
+              localGidStr = inet_ntop(AF_INET6, &r->devBases[i]->gidInfo.localGid, localGidString, sizeof(localGidString));
+              remoteGidStr = inet_ntop(AF_INET6, &r->base->remDevs[i].remoteGid, remoteGidString, sizeof(remoteGidString));
+            }
 
-      struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff);
-      if (req->type == NCCL_NET_IB_REQ_SEND) {
-        for (int i=0; i<req->nreqs; i++) {
-          struct ncclIbRequest* sendReq = r->verbs->reqs+((wc->wr_id >> (i*8)) & 0xff);
-          if ((sendReq->events <= 0)) return ncclInternalError;
-          sendReq->events--;
-        }
-      } else {
-        if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
-          if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError;
-          if (req->nreqs > 1) {
-            // In the case of a multi recv, we only set sizes to 0 or 1.
-            for (int i=0; i<req->nreqs; i++) {
-              req->recv.sizes[i] = (wc->imm_data >> i) & 0x1;
+            char line[SOCKET_NAME_MAXLEN+1];
+            WARN("NET/IB : Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s",
+                ncclSocketToString(&addr, line, 1), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type],
+                localGidStr ?  " localGid ":"", localGidString, remoteGidStr ? " remoteGid ":"", remoteGidString);
+            return ncclRemoteError;
+          }
+
+          union ncclSocketAddress addr;
+          ncclSocketGetAddr(r->sock, &addr);
+          struct ncclIbRequest* req = r->base->reqs+(wc->wr_id & 0xff);
+
+          #ifdef ENABLE_TRACE
+          char line[SOCKET_NAME_MAXLEN+1];
+          TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%llu r=%p type=%d events={%d,%d}, i=%d",
+              ncclSocketToString(&addr, line, 1), wc->status, wc->opcode,wc->byte_len, (unsigned long long)wc->wr_id, req, req->type, req->events[0], req->events[1], i);
+          #endif
+          if (req->type == NCCL_NET_IB_REQ_SEND) {
+            for (int j = 0; j < req->nreqs; j++) {
+              struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff);
+              if ((sendReq->events[i] <= 0)) {
+                WARN("NET/IB: sendReq(%p)->events={%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], i, j);
+                return ncclInternalError;
+              }
+              sendReq->events[i]--;
             }
           } else {
-            req->recv.sizes[0] += wc->imm_data;
+            if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+              if (req->type != NCCL_NET_IB_REQ_RECV) {
+                WARN("NET/IB: wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM and req->type=%d", req->type);
+                return ncclInternalError;
+              }
+              if (req->nreqs == 1) {
+                req->recv.sizes[0] += wc->imm_data;
+              }
+            }
+            req->events[i]--;
           }
         }
-        req->events--;
       }
     }
+
+    // If no CQEs found on any device, return and come back later
+    if (totalWrDone == 0) return ncclSuccess;
   }
 }
 
 ncclResult_t ncclIbCloseSend(void* sendComm) {
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
   if (comm) {
-    NCCLCHECK(ncclSocketClose(&comm->sock));
-    for (int q=0; q<comm->nqps; q++)
-      if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
-    if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
-    NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
+    NCCLCHECK(ncclSocketClose(&comm->base.sock));
+
+    for (int q = 0; q < comm->base.nqps; q++)
+      if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp));
+
+    for (int i = 0; i < comm->base.ndevs; i++) {
+      struct ncclIbSendCommDev* commDev = comm->devs + i;
+      if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr));
+      if (comm->remSizesFifo.mrs[i] != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remSizesFifo.mrs[i]));
+      NCCLCHECK(ncclIbDestroyBase(&commDev->base));
+    }
     free(comm);
   }
   TIME_PRINT("IB");
@@ -1095,15 +1473,21 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
 ncclResult_t ncclIbCloseRecv(void* recvComm) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm) {
-    NCCLCHECK(ncclSocketClose(&comm->sock));
-    for (int q=0; q<comm->nqps; q++)
-      if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
-    if (comm->gpuFlush.enabled) {
-      if (comm->gpuFlush.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->gpuFlush.qp));
-      if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr));
+    NCCLCHECK(ncclSocketClose(&comm->base.sock));
+
+    for (int q = 0; q < comm->base.nqps; q++)
+      if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp));
+
+    for (int i = 0; i < comm->base.ndevs; i++) {
+      struct ncclIbRecvCommDev* commDev = comm->devs + i;
+      if (comm->flushEnabled) {
+        if (commDev->gpuFlush.qp.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(commDev->gpuFlush.qp.qp));
+        if (commDev->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->gpuFlush.hostMr));
+      }
+      if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr));
+      if (commDev->sizesFifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->sizesFifoMr));
+      NCCLCHECK(ncclIbDestroyBase(&commDev->base));
     }
-    if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr));
-    NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
     free(comm);
   }
   return ncclSuccess;
@@ -1118,8 +1502,8 @@ ncclResult_t ncclIbCloseListen(void* listenComm) {
   return ncclSuccess;
 }
 
-const ncclNet_v6_t ibPlugin_v6 = {
-  .name = "IBext",
+const ncclNet_v8_t ibPlugin_v8 = {
+  .name = "IBext_v8",
   .init = ncclIbInit,
   .devices = ncclIbDevices,
   .getProperties = ncclIbGetProperties,
@@ -1136,17 +1520,20 @@ const ncclNet_v6_t ibPlugin_v6 = {
   .closeSend = ncclIbCloseSend,
   .closeRecv = ncclIbCloseRecv,
   .closeListen = ncclIbCloseListen,
+  NULL /* getDeviceMr */,
+  NULL /* irecvConsumed */
 };
 
-const ncclNet_v5_t ibPlugin_v5 = {
-  .name = "IBext",
+const ncclNet_v7_t ibPlugin_v7 = {
+  .name = "IBext_v7",
   .init = ncclIbInit,
   .devices = ncclIbDevices,
-  .getProperties = ncclIbGetProperties,
+  .getProperties = ncclIbGetProperties_v7,
   .listen = ncclIbListen,
   .connect = ncclIbConnect,
   .accept = ncclIbAccept,
-  .regMr = ncclIbRegMr,
+  .regMr = ncclIbRegMr_v7,
+  .regMrDmaBuf = ncclIbRegMrDmaBuf,
   .deregMr = ncclIbDeregMr,
   .isend = ncclIbIsend,
   .irecv = ncclIbIrecv,
@@ -1155,60 +1542,43 @@ const ncclNet_v5_t ibPlugin_v5 = {
   .closeSend = ncclIbCloseSend,
   .closeRecv = ncclIbCloseRecv,
   .closeListen = ncclIbCloseListen,
+  NULL /* getDeviceMr */,
+  NULL /* irecvConsumed */
 };
 
-static ncclResult_t ncclIbGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
-  ncclNetProperties_v6_t props_v6;
-  ncclResult_t ret = ncclIbGetProperties(dev, &props_v6);
-  if (ret != ncclSuccess) return ret;
-  props->name = props_v6.name;
-  props->pciPath = props_v6.pciPath;
-  props->guid = props_v6.guid;
-  props->ptrSupport = props_v6.ptrSupport;
-  props->speed = props_v6.speed;
-  props->port = props_v6.port;
-  props->maxComms = props_v6.maxComms;
-  return ncclSuccess;
-}
-static ncclResult_t ncclIbIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
-  return ncclIbIsend(sendComm, data, size, 0, mhandle, request);
-}
-static ncclResult_t ncclIbIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
-  int tag = 0;
-  return ncclIbIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request);
-}
-static ncclResult_t ncclIbIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
-  return ncclIbIflush(recvComm, 1, &data, &size, &mhandle, request);
-}
-static ncclResult_t ncclIbConnect_v4(int dev, void* handle, void** sendComm) {
-  ncclResult_t ret;
-  do {
-    ret = ncclIbConnect(dev, handle, sendComm);
-  } while (ret == ncclSuccess && *sendComm == NULL);
-  return ret;
-}
-static ncclResult_t ncclIbAccept_v4(void* listenComm, void** recvComm) {
-  ncclResult_t ret;
-  do {
-    ret = ncclIbAccept(listenComm, recvComm);
-  } while (ret == ncclSuccess && *recvComm == NULL);
-  return ret;
-}
-
+const ncclNet_v6_t ibPlugin_v6 = {
+  .name = "IBext_v6",
+  .init = ncclIbInit,
+  .devices = ncclIbDevices,
+  .getProperties = ncclIbGetProperties_v6,
+  .listen = ncclIbListen,
+  .connect = ncclIbConnect_v6,
+  .accept = ncclIbAccept_v6,
+  .regMr = ncclIbRegMr_v7,
+  .regMrDmaBuf = ncclIbRegMrDmaBuf,
+  .deregMr = ncclIbDeregMr,
+  .isend = ncclIbIsend,
+  .irecv = ncclIbIrecv,
+  .iflush = ncclIbIflush,
+  .test = ncclIbTest,
+  .closeSend = ncclIbCloseSend,
+  .closeRecv = ncclIbCloseRecv,
+  .closeListen = ncclIbCloseListen,
+};
 
-const ncclNet_v4_t ibPlugin_v4 = {
-  .name = "IBext",
+const ncclNet_v5_t ibPlugin_v5 = {
+  .name = "IBext_v5",
   .init = ncclIbInit,
   .devices = ncclIbDevices,
-  .getProperties = ncclIbGetProperties_v4,
+  .getProperties = ncclIbGetProperties_v6,
   .listen = ncclIbListen,
-  .connect = ncclIbConnect_v4,
-  .accept = ncclIbAccept_v4,
-  .regMr = ncclIbRegMr,
+  .connect = ncclIbConnect_v6,
+  .accept = ncclIbAccept_v6,
+  .regMr = ncclIbRegMr_v7,
   .deregMr = ncclIbDeregMr,
-  .isend = ncclIbIsend_v4,
-  .irecv = ncclIbIrecv_v4,
-  .iflush = ncclIbIflush_v4,
+  .isend = ncclIbIsend,
+  .irecv = ncclIbIrecv,
+  .iflush = ncclIbIflush,
   .test = ncclIbTest,
   .closeSend = ncclIbCloseSend,
   .closeRecv = ncclIbCloseRecv,
diff --git a/nccl_plugin/src/ibvwrap.c b/nccl_plugin/src/ibvwrap.c
index 9d21063..4e4c771 100644
--- a/nccl_plugin/src/ibvwrap.c
+++ b/nccl_plugin/src/ibvwrap.c
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2015-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -24,10 +24,24 @@
   } \
   return ncclSuccess;
 
+#define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(call, success_retval, name, supported) \
+  int ret = call; \
+  if (ret == ENOTSUP || ret == EOPNOTSUPP) { \
+    INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \
+    *supported = 0; \
+    return ncclSuccess; \
+  } else if (ret != success_retval) { \
+    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
+    *supported = 1; \
+    return ncclSystemError; \
+  } \
+  *supported = 1; \
+  return ncclSuccess;
+
 #define IBV_INT_CHECK_RET_ERRNO(call, success_retval, name) \
   int ret = call; \
   if (ret != success_retval) { \
-    WARN("Call to " name " failed with error %s", strerror(ret)); \
+    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
     return ncclSystemError; \
   } \
   return ncclSuccess;
@@ -169,6 +183,26 @@ ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struc
   return ncclSuccess;
 }
 
+ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+#if HAVE_DECL_IBV_QUERY_ECE
+  IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_query_ece(qp, ece), 0, "ibv_query_ece", supported);
+#else
+    INFO(NCCL_NET, "Call to ibv_query_ece is skipped, doesn't exist");
+    *supported = 0;
+    return ncclSuccess;
+#endif
+}
+
+ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+#if HAVE_DECL_IBV_SET_ECE
+  IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibv_set_ece(qp, ece), 0, "ibv_set_ece", supported);
+#else
+    INFO(NCCL_NET, "Call to ibv_set_ece skipped, doesn't exist");
+    *supported = 0;
+    return ncclSuccess;
+#endif
+}
+
 ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
   *ret = (char *) ibv_event_type_str(event);
   return ncclSuccess;
diff --git a/nccl_plugin/src/optcast_plugin.cc b/nccl_plugin/src/optcast_plugin.cc
index 5414195..a684dd6 100644
--- a/nccl_plugin/src/optcast_plugin.cc
+++ b/nccl_plugin/src/optcast_plugin.cc
@@ -17,7 +17,8 @@
 
 extern ncclNet_v6_t ncclNetPlugin_v6;
 extern ncclNet_v5_t ncclNetPlugin_v5;
-extern ncclNet_v4_t ncclNetPlugin_v4;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
 
 int ncclNSharpDevs = -1;
 
@@ -389,11 +390,6 @@ static ncclResult_t ncclOptcastGetProperties_v5(int dev, ncclNetProperties_v5_t
   return ncclNetPlugin_v5.getProperties(dev, props);
 }
 
-static ncclResult_t ncclOptcastGetProperties_v4(int dev, ncclNetProperties_v4_t *props)
-{
-  return ncclNetPlugin_v4.getProperties(dev, props);
-}
-
 static ncclResult_t ncclOptcastListen(int dev, void *opaqueHandle, void **listenComm)
 {
   struct optcastListenComm *lComm;
@@ -639,19 +635,3 @@ ncclCollNet_v5_t ncclCollNetPlugin_v5 = {
     ncclOptcastTest,
     ncclOptcastCloseColl,
     ncclOptcastCloseListen};
-
-ncclCollNet_v4_t ncclCollNetPlugin_v4 = {
-    "Optcast",
-    ncclOptcastInit,
-    ncclOptcastDevices,
-    ncclOptcastGetProperties_v4,
-    ncclOptcastListen,
-    ncclOptcastConnect,
-    ncclOptcastReduceSupport,
-    ncclOptcastRegMr,
-    ncclOptcastDeregMr,
-    ncclOptcastIallreduce,
-    ncclOptcastIflush,
-    ncclOptcastTest,
-    ncclOptcastCloseColl,
-    ncclOptcastCloseListen};
diff --git a/nccl_plugin/src/p2p_plugin.c b/nccl_plugin/src/p2p_plugin.c
index 227a7be..daf90ce 100644
--- a/nccl_plugin/src/p2p_plugin.c
+++ b/nccl_plugin/src/p2p_plugin.c
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -15,26 +15,26 @@
 #include "p2p_plugin.h"
 
 #ifdef HAVE_UCX_PLUGIN
+extern ncclNet_v8_t ucxPlugin_v8;
+extern ncclNet_v7_t ucxPlugin_v7;
 extern ncclNet_v6_t ucxPlugin_v6;
 extern ncclNet_v5_t ucxPlugin_v5;
-extern ncclNet_v4_t ucxPlugin_v4;
+extern ncclNet_v8_t ucxRmaPlugin_v8;
+extern ncclNet_v7_t ucxRmaPlugin_v7;
 extern ncclNet_v6_t ucxRmaPlugin_v6;
 extern ncclNet_v5_t ucxRmaPlugin_v5;
-extern ncclNet_v4_t ucxRmaPlugin_v4;
 #endif
 
+extern ncclNet_v8_t ibPlugin_v8;
+extern ncclNet_v7_t ibPlugin_v7;
 extern ncclNet_v6_t ibPlugin_v6;
 extern ncclNet_v5_t ibPlugin_v5;
-extern ncclNet_v4_t ibPlugin_v4;
-
-extern ncclNet_v6_t socketPlugin_v6;
-extern ncclNet_v5_t socketPlugin_v5;
-extern ncclNet_v4_t socketPlugin_v4;
 pthread_mutex_t nccl_p2p_lock = PTHREAD_MUTEX_INITIALIZER;
 
 ncclDebugLogger_t pluginLogFunction;
+static int ncclNMergedIbDevs = -1;
 
-#ifdef HAVE_SHARP_PLUGIN
+#if 1
 extern int ncclNSharpDevs;
 #else
 /* In case sharp plugin is not there just define this variable locally to make code cleaner */
@@ -44,32 +44,43 @@ extern int ncclIbRelaxedOrderingEnabled;
 NCCL_PARAM(SharpMaxComms, "SHARP_MAX_COMMS", 1);
 NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
 
-ncclResult_t pluginInit(ncclDebugLogger_t logFunction);
+ncclResult_t pluginInit_v8(ncclDebugLogger_t logFunction);
+ncclResult_t pluginInit_v7(ncclDebugLogger_t logFunction);
+ncclResult_t pluginInit_v6(ncclDebugLogger_t logFunction);
+ncclResult_t pluginInit_v5(ncclDebugLogger_t logFunction);
+
+ncclNet_v8_t ncclNetPlugin_v8 = {
+  "NCCL RDMA Plugin v8",
+  pluginInit_v8,
+};
+
+ncclNet_v7_t ncclNetPlugin_v7 = {
+  "NCCL RDMA Plugin v7",
+  pluginInit_v7,
+};
 
 ncclNet_v6_t ncclNetPlugin_v6 = {
   "NCCL RDMA Plugin v6",
-  pluginInit,
+  pluginInit_v6,
 };
 
 
 ncclNet_v5_t ncclNetPlugin_v5 = {
   "NCCL RDMA Plugin v5",
-  pluginInit,
+  pluginInit_v5,
 };
 
 
-ncclNet_v4_t ncclNetPlugin_v4 = {
-  "NCCL RDMA Plugin v4",
-  pluginInit,
-};
-
 static nccl_p2p_plugin_t p2p_plugin = NCCL_P2P_LAST;
 
-ncclResult_t pluginInit(ncclDebugLogger_t logFunction)
+static void pluginSetup()
 {
-  pluginLogFunction = logFunction;
-
   p2p_plugin = NCCL_P2P_IB;
+  const char *plugin_path = get_plugin_lib_path();
+  if (plugin_path != NULL) {
+    INFO(NCCL_INIT|NCCL_NET, "Plugin Path : %s", plugin_path);
+  }
+
   const char *p2p_layer = getenv("NCCL_PLUGIN_P2P");
   if (p2p_layer != NULL) {
     if (!strcasecmp(p2p_layer, "ib")) p2p_plugin = NCCL_P2P_IB;
@@ -77,50 +88,67 @@ ncclResult_t pluginInit(ncclDebugLogger_t logFunction)
     else if (!strcasecmp(p2p_layer, "ucx")) p2p_plugin = NCCL_P2P_UCX;
     else if (!strcasecmp(p2p_layer, "ucx_rma")) p2p_plugin = NCCL_P2P_UCX_RMA;
 #endif
-    else if (!strcasecmp(p2p_layer, "socket")) p2p_plugin = NCCL_P2P_SOCKET;
     else {
       WARN("Invalid value %s for NCCL_PLUGIN_P2P, using default", p2p_layer);
     }
   }
   switch (p2p_plugin) {
-    case NCCL_P2P_IB:
-      ncclNetPlugin_v6 = ibPlugin_v6;
-      ncclNetPlugin_v5 = ibPlugin_v5;
-      ncclNetPlugin_v4 = ibPlugin_v4;
-      break;
-    case NCCL_P2P_SOCKET:
-      ncclNetPlugin_v6 = socketPlugin_v6;
-      ncclNetPlugin_v5 = socketPlugin_v5;
-      ncclNetPlugin_v4 = socketPlugin_v4;
-      break;
 #ifdef HAVE_UCX_PLUGIN
     case NCCL_P2P_UCX:
+      ncclNetPlugin_v8 = ucxPlugin_v8;
+      ncclNetPlugin_v7 = ucxPlugin_v7;
       ncclNetPlugin_v6 = ucxPlugin_v6;
       ncclNetPlugin_v5 = ucxPlugin_v5;
-      ncclNetPlugin_v4 = ucxPlugin_v4;
       break;
     case NCCL_P2P_UCX_RMA:
+      ncclNetPlugin_v8 = ucxRmaPlugin_v8;
+      ncclNetPlugin_v7 = ucxRmaPlugin_v7;
       ncclNetPlugin_v6 = ucxRmaPlugin_v6;
       ncclNetPlugin_v5 = ucxRmaPlugin_v5;
-      ncclNetPlugin_v4 = ucxRmaPlugin_v4;
       break;
 #endif
+    default:
+      ncclNetPlugin_v8 = ibPlugin_v8;
+      ncclNetPlugin_v7 = ibPlugin_v7;
+      ncclNetPlugin_v6 = ibPlugin_v6;
+      ncclNetPlugin_v5 = ibPlugin_v5;
+      break;
   }
 
-  INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", NCCL_PLUGIN_SYMBOL.name);
-  return NCCL_PLUGIN_SYMBOL.init(logFunction);
 }
 
-NCCL_PARAM(GdrDisable, "GDR_DISABLE", 0);
+ncclResult_t pluginInit_v8(ncclDebugLogger_t logFunction) {
+  pluginLogFunction = logFunction;
+  pluginSetup();
+  INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v8.name);
+  return ncclNetPlugin_v8.init(logFunction);
+}
+
+ncclResult_t pluginInit_v7(ncclDebugLogger_t logFunction) {
+  pluginLogFunction = logFunction;
+  pluginSetup();
+  INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v7.name);
+  return ncclNetPlugin_v7.init(logFunction);
+}
 
-ncclResult_t nccl_p2p_gdr_support(int dev)
+ncclResult_t pluginInit_v6(ncclDebugLogger_t logFunction) {
+  pluginLogFunction = logFunction;
+  pluginSetup();
+  INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v6.name);
+  return ncclNetPlugin_v6.init(logFunction);
+}
+
+ncclResult_t pluginInit_v5(ncclDebugLogger_t logFunction) {
+  pluginLogFunction = logFunction;
+  pluginSetup();
+  INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v5.name);
+  return ncclNetPlugin_v5.init(logFunction);
+}
+
+ncclResult_t nccl_p2p_gdr_support()
 {
   static int module_loaded = -1;
 
-  if (ncclParamGdrDisable() == 1) {
-    return ncclSystemError;
-  }
-
   if (module_loaded == -1) {
     module_loaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
   }
@@ -142,13 +170,20 @@ ncclResult_t nccl_p2p_dmabuf_support(int dev) {
     ncclResult_t res;
     struct ibv_pd* pd;
     struct ibv_context* ctx;
-    ctx = ncclIbDevs[dev].context;
-    NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
-    // Test kernel DMA-BUF support with a dummy call (fd=-1)
-    (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
-    // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
-    dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0;
-    NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
+    struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev;
+
+    // Test each dev
+    for (int i = 0; i < mergedDev->ndevs; i++) {
+      int ibDev = mergedDev->devs[i];
+      ctx = ncclIbDevs[ibDev].context;
+      NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
+      // Test kernel DMA-BUF support with a dummy call (fd=-1)
+      (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
+      // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
+      dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0;
+      NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
+    }
+
   }
   if (dmaBufSupported == 0) return ncclSystemError;
   return ncclSuccess;
@@ -158,39 +193,52 @@ ncclResult_t nccl_p2p_dmabuf_support(int dev) {
 }
 
 
-ncclResult_t nccl_p2p_ib_get_properties(nccl_ib_dev_t *devs, int dev, ncclNetProperties_t* props)
+ncclResult_t nccl_p2p_ib_get_properties(ncclIbDev *devs, int dev, ncclNetProperties_t* props)
 {
-  props->name         = devs[dev].devName;
-  props->pciPath      = devs[dev].pciPath;
-  props->guid         = devs[dev].guid;
+  struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs+dev;
+  props->name = mergedDev->devName;
+  props->speed = mergedDev->speed;
+
+  // Take the rest of the properties from an arbitrary sub-device (should be the same)
+  struct ncclIbDev* ibDev = ncclIbDevs + mergedDev->devs[0];
+  props->pciPath = ibDev->pciPath;
+  props->guid = ibDev->guid;
+
   props->ptrSupport   = NCCL_PTR_HOST;
-  if (nccl_p2p_gdr_support(dev) == ncclSuccess) {
+  if (nccl_p2p_gdr_support() == ncclSuccess) {
     props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem
     INFO(NCCL_NET,"NET/IB : GPU Direct RDMA (nvidia-peermem) enabled for HCA %d '%s", dev, devs[dev].devName);
   }
+  props->regIsGlobal = 1;
   if (p2p_plugin == NCCL_P2P_IB && nccl_p2p_dmabuf_support(dev) == ncclSuccess) {
     props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF
     INFO(NCCL_NET,"NET/IB : GPU Direct RDMA (DMABUF) enabled for HCA %d '%s", dev, devs[dev].devName);
   }
 
-  props->speed        = devs[dev].speed;
   props->latency      = 0; // Not set
-  props->port         = devs[dev].port + devs[dev].realPort;
-  props->maxComms     = devs[dev].maxQp;
-  props->maxRecvs     = (p2p_plugin == NCCL_P2P_IB) ? NCCL_NET_IB_MAX_RECVS : 1;
+  props->port = ibDev->portNum + ibDev->realPort;
+  props->maxComms = ibDev->maxQp;
+
+  if (p2p_plugin == NCCL_P2P_IB || p2p_plugin == NCCL_P2P_UCX) {
+    props->maxRecvs = NCCL_NET_IB_MAX_RECVS;
+  } else {
+    props->maxRecvs = 1;
+  }
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
 
   return ncclSuccess;
 }
 
 static void* ncclIbAsyncThreadMain(void* args) {
-  struct ibv_context* context = (struct ibv_context*)args;
+  struct ncclIbDev* dev = (struct ncclIbDev*)args;
   while (1) {
     struct ibv_async_event event;
-    if (ncclSuccess != wrap_ibv_get_async_event(context, &event)) { break; }
+    if (ncclSuccess != wrap_ibv_get_async_event(dev->context, &event)) { break; }
     char *str;
     if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; }
     if (event.event_type != IBV_EVENT_COMM_EST)
-      WARN("NET/IB : Got async event : %s", str);
+      WARN("NET/IB : %s:%d Got async event : %s", dev->devName, dev->portNum, str);
     if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; }
   }
   return NULL;
@@ -206,7 +254,26 @@ int devSharpCompare(const void *a, const void *b)
   else { return 1; }
 }
 
-ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction)
+// Return the index of the merged device that ncclIbDevs[dev] matches, or ncclNMergedIbDevs when none does
+int ncclIbFindMatchingDev(int dev) {
+  const struct ncclIbDev* cand = ncclIbDevs + dev;
+  for (int m = 0; m < ncclNMergedIbDevs; m++) {
+    // Skip merged devices that already hold NCCL_IB_MAX_DEVS_PER_NIC entries
+    if (ncclIbMergedDevs[m].ndevs >= NCCL_IB_MAX_DEVS_PER_NIC) continue;
+    // Candidates are compared against the first device of each merged entry only
+    const struct ncclIbDev* head = ncclIbDevs + ncclIbMergedDevs[m].devs[0];
+    if (strcmp(cand->pciPath, head->pciPath) != 0) continue;
+    if (cand->guid != head->guid || cand->link != head->link) continue;
+    TRACE(NCCL_NET, "NET/IB: Matched name1=%s pciPath1=%s guid1=0x%lx link1=%u name2=%s pciPath2=%s guid2=0x%lx link2=%u",
+      cand->devName, cand->pciPath, cand->guid, cand->link,
+      head->devName, head->pciPath, head->guid, head->link);
+    return m;
+  }
+
+  return ncclNMergedIbDevs;
+}
+
+ncclResult_t nccl_p2p_ib_init(int *num_devs, ncclIbDev *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction)
 {
   int ncclNIbDevs = *num_devs;
 
@@ -216,6 +283,7 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc
     wrap_ibv_fork_init();
     if (ncclNIbDevs == -1) {
       ncclNIbDevs = 0;
+      ncclNMergedIbDevs = 0;
       ncclNSharpDevs = 0;
       if (ncclFindInterfaces(ncclIbIfName, ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
         WARN("NET/IB : No IP interface found.");
@@ -249,10 +317,10 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc
           if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
           continue;
         }
-        for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
+        for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) {
           struct ibv_port_attr portAttr;
-          if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
-            WARN("NET/IB : Unable to query port %d", port);
+          if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) {
+            WARN("NET/IB : Unable to query port_num %d", port_num);
             continue;
           }
           if (portAttr.state != IBV_PORT_ACTIVE) continue;
@@ -260,15 +328,13 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc
               && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
 
           // check against user specified HCAs/ports
-          if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs, searchExact) ^ searchNot)) {
+          if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) {
             continue;
           }
-          TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
-              portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
           pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
           ncclIbDevs[ncclNIbDevs].device = d;
           ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
-          ncclIbDevs[ncclNIbDevs].port = port;
+          ncclIbDevs[ncclNIbDevs].portNum = port_num;
           ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
           ncclIbDevs[ncclNIbDevs].speed = nccl_p2p_ib_speed(portAttr.active_speed) * nccl_p2p_ib_width(portAttr.active_width);
           ncclIbDevs[ncclNIbDevs].context = context;
@@ -281,20 +347,49 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc
           ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
           ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
 
           // Enable ADAPTIVE_ROUTING by default on IB networks
           // But allow it to be overloaded by an env parameter
           ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0;
           if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting();
 
-          ncclIbDevs[ncclNIbDevs].isSharpDev = 1;
-          ncclIbDevs[ncclNIbDevs].maxQp = ncclParamSharpMaxComms();
-          ncclNSharpDevs++;
-
+          ncclIbDevs[ncclNIbDevs].isSharpDev = 0;
+          if (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND)
+          {
+            ncclIbDevs[ncclNIbDevs].isSharpDev = 1;
+            ncclIbDevs[ncclNIbDevs].maxQp = ncclParamSharpMaxComms();
+            ncclNSharpDevs++;
+          }
+          TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum,
+            portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar);
           if (ncclIbAsyncThread != NULL) {
-            pthread_create(ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
+            pthread_create(ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs);
             ncclSetThreadName(*ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
             pthread_detach(*ncclIbAsyncThread); // will not be pthread_join()'d
           }
+
+          int mergedDev = ncclNMergedIbDevs;
+          if (ncclParamIbMergeNics()) {
+            mergedDev = ncclIbFindMatchingDev(ncclNIbDevs);
+          }
+
+          // No matching dev found, create new mergedDev entry (it's okay if there's only one dev inside)
+          if (mergedDev == ncclNMergedIbDevs) {
+            // Set ndevs to 1, assign first ibDevN to the current IB device
+            ncclIbMergedDevs[mergedDev].ndevs = 1;
+            ncclIbMergedDevs[mergedDev].devs[0] = ncclNIbDevs;
+            ncclNMergedIbDevs++;
+            strncpy(ncclIbMergedDevs[mergedDev].devName, ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE);
+          // Matching dev found, edit name
+          } else {
+            // Set next device in this array to the current IB device
+            int ndevs = ncclIbMergedDevs[mergedDev].ndevs;
+            ncclIbMergedDevs[mergedDev].devs[ndevs] = ncclNIbDevs;
+            ncclIbMergedDevs[mergedDev].ndevs++;
+            snprintf(ncclIbMergedDevs[mergedDev].devName + strlen(ncclIbMergedDevs[mergedDev].devName), MAXNAMESIZE+1, "+%s", ncclIbDevs[ncclNIbDevs].devName);
+          }
+
+          // Aggregate speed
+          ncclIbMergedDevs[mergedDev].speed += ncclIbDevs[ncclNIbDevs].speed;
           ncclNIbDevs++;
           nPorts++;
         }
@@ -310,33 +405,48 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *nc
         qsort(ncclIbDevs, ncclNIbDevs, sizeof(struct ncclIbDev), devSharpCompare);
       }
 
-      char line[1024];
+      char line[2048];
       line[0] = '\0';
       // Determine whether RELAXED_ORDERING is enabled and possible
       ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable();
-      for (int d=0; d<ncclNIbDevs; d++) {
+      for (int d = 0; d < ncclNMergedIbDevs; d++) {
+        struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + d;
+        if (mergedDev->ndevs > 1) {
+          // Print out merged dev info
+          snprintf(line+strlen(line), 2047-strlen(line), " [%d]={", d);
+          for (int i = 0; i < mergedDev->ndevs; i++) {
+            int ibDev = mergedDev->devs[i];
+            snprintf(line+strlen(line), 2047-strlen(line), "[%d] %s:%d/%s%s", ibDev, ncclIbDevs[ibDev].devName,
+              ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE",
+              // Insert comma to delineate
+              i == (mergedDev->ndevs - 1) ? "" : ", ");
+          }
+          snprintf(line+strlen(line), 2047-strlen(line), "}");
+        } else {
+          int ibDev = mergedDev->devs[0];
 #ifdef HAVE_SHARP_PLUGIN
-        snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s%s", d, ncclIbDevs[d].devName,
-            ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE",
-            ncclIbDevs[d].isSharpDev ? "/SHARP" : "");
+          snprintf(line+strlen(line), 2047-strlen(line), " [%d]%s:%d/%s%s", ibDev, ncclIbDevs[ibDev].devName,
+            ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE",
+            ncclIbDevs[ibDev].isSharpDev ? "/SHARP" : "");
 #else
-        snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
-            ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+          snprintf(line+strlen(line), 2047-strlen(line), " [%d]%s:%d/%s", ibDev, ncclIbDevs[ibDev].devName,
+            ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
 #endif
+        }
       }
-      line[1023] = '\0';
+      line[2047] = '\0';
       char addrline[SOCKET_NAME_MAXLEN+1];
       INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
            ncclIbIfName, ncclSocketToString(ncclIbIfAddr, addrline, 1));
     }
-    *num_devs = ncclNIbDevs;
+    *num_devs = ncclNMergedIbDevs;
     pthread_mutex_unlock(&nccl_p2p_lock);
   }
   return ncclSuccess;
 
 }
 
-ncclResult_t nccl_p2p_ib_pci_path(nccl_ib_dev_t *devs, int num_devs, char* dev_name, char** path, int* real_port)
+ncclResult_t nccl_p2p_ib_pci_path(ncclIbDev *devs, int num_devs, char* dev_name, char** path, int* real_port)
 {
   char device_path[PATH_MAX];
   snprintf(device_path, PATH_MAX, "/sys/class/infiniband/%s/device", dev_name);
@@ -347,7 +457,7 @@ ncclResult_t nccl_p2p_ib_pci_path(nccl_ib_dev_t *devs, int num_devs, char* dev_n
     // Merge multi-port NICs into the same PCI device
     p[strlen(p)-1] = '0';
     // Also merge virtual functions (VF) into the same device
-    p[strlen(p)-3] = '0';
+    if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0';
     // And keep the real port aside (the ibv port is always 1 on recent cards)
     *real_port = 0;
     for (int d=0; d<num_devs; d++) {
@@ -394,4 +504,4 @@ nccl_p2p_plugin_t nccl_p2p_get_plugin_type()
 
 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
 struct ncclIbDev userIbDevs[MAX_IB_DEVS];
-
+struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_DEVS];
diff --git a/nccl_plugin/src/param.c b/nccl_plugin/src/param.c
index c965331..32082b7 100644
--- a/nccl_plugin/src/param.c
+++ b/nccl_plugin/src/param.c
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -64,7 +64,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
   static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
   pthread_mutex_lock(&mutex);
   if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
-    char* str = getenv(env);
+    const char* str = ncclGetEnv(env);
     int64_t value = deftVal;
     if (str && strlen(str) > 0) {
       errno = 0;
@@ -80,3 +80,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
   }
   pthread_mutex_unlock(&mutex);
 }
+
+const char *ncclGetEnv(const char *name) {
+  static pthread_once_t envInitOnce = PTHREAD_ONCE_INIT; // one guard shared by every caller
+  pthread_once(&envInitOnce, initEnv); // initEnv runs on the first call only
+  return getenv(name); // NULL when the variable is not set
+}
diff --git a/nccl_plugin/src/socket_plugin.c b/nccl_plugin/src/socket_plugin.c
index 2336950..4fd3712 100644
--- a/nccl_plugin/src/socket_plugin.c
+++ b/nccl_plugin/src/socket_plugin.c
@@ -102,6 +102,19 @@ static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
   return ncclSuccess;
 }
 
+ncclResult_t ncclSockGetProperties_v6(int dev, ncclNetProperties_v6_t* props) { // fills *props for socket device `dev`; `dev` indexes ncclNetSocketDevs unchecked
+  props->name = ncclNetSocketDevs[dev].devName;
+  props->pciPath = ncclNetSocketDevs[dev].pciPath;
+  props->guid = dev; // the device index doubles as the guid
+  props->ptrSupport = NCCL_PTR_HOST; // only the host-pointer bit is advertised
+  NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed)); // NOTE(review): NCCLCHECK presumably returns early on error, leaving the fields below unset - confirm
+  props->latency = 0; // Not set
+  props->port = 0;
+  props->maxComms = 65536;
+  props->maxRecvs = 1;
+  return ncclSuccess;
+}
+
 ncclResult_t ncclSockGetProperties(int dev, ncclNetProperties_t* props) {
   props->name = ncclNetSocketDevs[dev].devName;
   props->pciPath = ncclNetSocketDevs[dev].pciPath;
@@ -614,7 +627,7 @@ const ncclNet_v6_t socketPlugin_v6 = {
   .name = "SOCKext",
   .init = ncclSockInit,
   .devices = ncclSockDevices,
-  .getProperties = ncclSockGetProperties,
+  .getProperties = ncclSockGetProperties_v6,
   .listen = ncclSockListen,
   .connect = ncclSockConnect,
   .accept = ncclSockAccept,
@@ -634,7 +647,7 @@ const ncclNet_v5_t socketPlugin_v5 = {
   .name = "SOCKext",
   .init = ncclSockInit,
   .devices = ncclSockDevices,
-  .getProperties = ncclSockGetProperties,
+  .getProperties = ncclSockGetProperties_v6,
   .listen = ncclSockListen,
   .connect = ncclSockConnect,
   .accept = ncclSockAccept,
@@ -648,61 +661,3 @@ const ncclNet_v5_t socketPlugin_v5 = {
   .closeRecv = ncclSockClose,
   .closeListen = ncclSockCloseListen,
 };
-
-static ncclResult_t ncclSockGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
-  ncclNetProperties_v6_t props_v6;
-  ncclResult_t ret = ncclSockGetProperties(dev, &props_v6);
-  if (ret != ncclSuccess) return ret;
-  props->name = props_v6.name;
-  props->pciPath = props_v6.pciPath;
-  props->guid = props_v6.guid;
-  props->ptrSupport = props_v6.ptrSupport;
-  props->speed = props_v6.speed;
-  props->port = props_v6.port;
-  props->maxComms = props_v6.maxComms;
-  return ncclSuccess;
-}
-static ncclResult_t ncclSockIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
-  return ncclSockIsend(sendComm, data, size, 0, mhandle, request);
-}
-static ncclResult_t ncclSockIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
-  int tag = 0;
-  return ncclSockIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request);
-}
-static ncclResult_t ncclSockIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
-  return ncclSockIflush(recvComm, 1, &data, &size, &mhandle, request);
-}
-static ncclResult_t ncclSockConnect_v4(int dev, void* handle, void** sendComm) {
-  ncclResult_t ret;
-  do {
-    ret = ncclSockConnect(dev, handle, sendComm);
-  } while (ret == ncclSuccess && *sendComm == NULL);
-  return ret;
-}
-static ncclResult_t ncclSockAccept_v4(void* listenComm, void** recvComm) {
-  ncclResult_t ret;
-  do {
-    ret = ncclSockAccept(listenComm, recvComm);
-  } while (ret == ncclSuccess && *recvComm == NULL);
-  return ret;
-}
-
-
-const ncclNet_v4_t socketPlugin_v4 = {
-  .name = "SOCKext",
-  .init = ncclSockInit,
-  .devices = ncclSockDevices,
-  .getProperties = ncclSockGetProperties_v4,
-  .listen = ncclSockListen,
-  .connect = ncclSockConnect_v4,
-  .accept = ncclSockAccept_v4,
-  .regMr = ncclSockRegMr,
-  .deregMr = ncclSockDeregMr,
-  .isend = ncclSockIsend_v4,
-  .irecv = ncclSockIrecv_v4,
-  .iflush = ncclSockIflush_v4,
-  .test = ncclSockTest,
-  .closeSend = ncclSockClose,
-  .closeRecv = ncclSockClose,
-  .closeListen = ncclSockCloseListen,
-};
diff --git a/nccl_plugin/src/utils.c b/nccl_plugin/src/utils.c
index 2564892..d36beb5 100644
--- a/nccl_plugin/src/utils.c
+++ b/nccl_plugin/src/utils.c
@@ -4,6 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
+#define _GNU_SOURCE
 #include <unistd.h>
 #include <string.h>
 #include <stdio.h>
@@ -109,71 +110,17 @@ int matchIfList(const char* string, int port, struct netIf* ifList, int listSize
   return 0;
 }
 
-static size_t readFileVarArg(char *buffer, size_t max,
-    const char *filename_fmt, va_list ap)
+const char *get_plugin_lib_path() // returns dladdr()'s dli_fname for this function's own address, or NULL
 {
-  char filename[PATH_MAX];
-  ssize_t read_bytes;
-  int fd;
+  Dl_info dl_info;
+  int ret;
 
-  vsnprintf(filename, PATH_MAX, filename_fmt, ap);
+  ret = dladdr((void*)&get_plugin_lib_path, &dl_info); // look up the loaded object containing this function
+  if (ret == 0) return NULL; // dladdr() returns 0 when the address cannot be matched
 
-  fd = open(filename, O_RDONLY);
-  if (fd < 0) {
-    return -1;
-  }
-
-  read_bytes = read(fd, buffer, max - 1);
-  if (read_bytes < 0) {
-    return -1;
-  }
-
-  if (read_bytes < max) {
-    buffer[read_bytes] = '\0';
-  }
-
-out_close:
-  close(fd);
+  return dl_info.dli_fname; // pathname of that object as reported by dladdr()
 }
 
-int readFileNumber(long *value, const char *filename_fmt, ...)
-{
-  char buffer[64], *tail;
-  ssize_t read_bytes;
-  va_list ap;
-  long n;
-
-  va_start(ap, filename_fmt);
-  read_bytes = readFileVarArg(buffer, sizeof(buffer) - 1,
-      filename_fmt, ap);
-  va_end(ap);
-
-  if (read_bytes < 0) {
-    /* read error */
-    return -1;
-  }
-
-  n = strtol(buffer, &tail, 0);
-  if ((*tail != '\0') && !isspace(*tail)) {
-    /* parse error */
-    return -1;
-  }
-
-  *value = n;
-  return 0;
-}
-
-//const char *get_plugin_lib_path()
-//{
-//  Dl_info dl_info;
-//  int ret;
-//
-//  ret = dladdr((void*)&get_plugin_lib_path, &dl_info);
-//  if (ret == 0) return NULL;
-//
-//  return dl_info.dli_fname;
-//}
-
 NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
 
 void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {

From 42ab4c68c2cb7eb415902b09e2aadf97d267a0e8 Mon Sep 17 00:00:00 2001
From: Wataru Ishida <wataru.ishid@gmail.com>
Date: Wed, 28 Feb 2024 13:24:48 +0000
Subject: [PATCH 2/3] fix(nccl_plugin): fix p2p_plugin type socket

Signed-off-by: Wataru Ishida <wataru.ishid@gmail.com>
---
 nccl_plugin/include/p2p_plugin.h | 1 +
 nccl_plugin/include/utils.h      | 8 ++++++++
 nccl_plugin/src/p2p_plugin.c     | 8 ++++++++
 3 files changed, 17 insertions(+)

diff --git a/nccl_plugin/include/p2p_plugin.h b/nccl_plugin/include/p2p_plugin.h
index b859777..8cee85c 100644
--- a/nccl_plugin/include/p2p_plugin.h
+++ b/nccl_plugin/include/p2p_plugin.h
@@ -29,6 +29,7 @@ typedef enum nccl_p2p_plugin {
   NCCL_P2P_IB,
   NCCL_P2P_UCX,
   NCCL_P2P_UCX_RMA,
+  NCCL_P2P_SOCKET,
   NCCL_P2P_LAST
 } nccl_p2p_plugin_t;
 
diff --git a/nccl_plugin/include/utils.h b/nccl_plugin/include/utils.h
index 96ca62e..33936a9 100644
--- a/nccl_plugin/include/utils.h
+++ b/nccl_plugin/include/utils.h
@@ -13,6 +13,10 @@
 #define NCCL_STATIC_ASSERT(_cond, _msg) \
     switch(0) {case 0:case (_cond):;}
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 ncclResult_t ncclIbMalloc(void** ptr, size_t size);
 ncclResult_t ncclRealloc(void** ptr, size_t old_size, size_t new_size);
 ncclResult_t getHostName(char* hostname, int maxlen);
@@ -28,4 +32,8 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList);
 int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact);
 const char *get_plugin_lib_path();
 
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
 #endif
diff --git a/nccl_plugin/src/p2p_plugin.c b/nccl_plugin/src/p2p_plugin.c
index daf90ce..d0824bf 100644
--- a/nccl_plugin/src/p2p_plugin.c
+++ b/nccl_plugin/src/p2p_plugin.c
@@ -25,6 +25,9 @@ extern ncclNet_v6_t ucxRmaPlugin_v6;
 extern ncclNet_v5_t ucxRmaPlugin_v5;
 #endif
 
+extern ncclNet_v6_t socketPlugin_v6;
+extern ncclNet_v5_t socketPlugin_v5;
+
 extern ncclNet_v8_t ibPlugin_v8;
 extern ncclNet_v7_t ibPlugin_v7;
 extern ncclNet_v6_t ibPlugin_v6;
@@ -88,11 +91,16 @@ static void pluginSetup()
     else if (!strcasecmp(p2p_layer, "ucx")) p2p_plugin = NCCL_P2P_UCX;
     else if (!strcasecmp(p2p_layer, "ucx_rma")) p2p_plugin = NCCL_P2P_UCX_RMA;
 #endif
+    else if (!strcasecmp(p2p_layer, "socket")) p2p_plugin = NCCL_P2P_SOCKET;
     else {
       WARN("Invalid value %s for NCCL_PLUGIN_P2P, using default", p2p_layer);
     }
   }
   switch (p2p_plugin) {
+    case NCCL_P2P_SOCKET:
+      ncclNetPlugin_v6 = socketPlugin_v6;
+      ncclNetPlugin_v5 = socketPlugin_v5;
+      break;
 #ifdef HAVE_UCX_PLUGIN
     case NCCL_P2P_UCX:
       ncclNetPlugin_v8 = ucxPlugin_v8;

From 6d57ebce5cdcce9f43fb096646717735dc893c98 Mon Sep 17 00:00:00 2001
From: Wataru Ishida <wataru.ishid@gmail.com>
Date: Wed, 28 Feb 2024 13:25:30 +0000
Subject: [PATCH 3/3] chore: fix Docker build

Signed-off-by: Wataru Ishida <wataru.ishid@gmail.com>
---
 Dockerfile | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 1c576bc..035a6bb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,27 +1,48 @@
-FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu20.04 AS nccl
+FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu22.04 AS nccl
 
-RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy python3 openmpi-bin openmpi-common
+RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy python3 openmpi-bin openmpi-common libibverbs-dev libopenmpi-dev autoconf libtool
 
 COPY test test
+
+RUN cd test && make build-nccl
+RUN cd test && MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi make build-nccl-tests
+
 COPY nccl_plugin nccl_plugin
 
-RUN cd test && MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi make && make install
+RUN cd test && make build-nccl-plugin
+RUN cd test && make install
 
-FROM ghcr.io/rust-lang/rust:nightly-bullseye-slim AS optcast
+FROM ubuntu:22.04 AS optcast
 
-COPY reduction_server reduction_server
+RUN apt-get update && apt-get install -y \
+    curl \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Rust install
+ENV RUST_HOME /usr/local/lib/rust
+ENV RUSTUP_HOME ${RUST_HOME}/rustup
+ENV CARGO_HOME ${RUST_HOME}/cargo
+RUN mkdir /usr/local/lib/rust && \
+    chmod 0755 $RUST_HOME
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > ${RUST_HOME}/rustup.sh \
+    && chmod +x ${RUST_HOME}/rustup.sh \
+    && ${RUST_HOME}/rustup.sh -y --default-toolchain nightly --no-modify-path
+ENV PATH $PATH:$CARGO_HOME/bin
 
 COPY --from=nccl /usr/local/lib /usr/local/lib
 
 RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy clang libibverbs1
 
+COPY reduction_server reduction_server
+
 RUN cd reduction_server && cargo build -r
 
 FROM optcast AS unittest
 
-RUN cd reduction_server && cargo test --all
+ENV RUST_LOG=info
+RUN cd reduction_server && cargo test --all -- --nocapture
 
-FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu20.04 AS final
+FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu22.04 AS final
 
 RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy --no-install-recommends openmpi-bin