Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: plugin update to nccl-2.20 #3

Merged
merged 3 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 28 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,27 +1,48 @@
FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu20.04 AS nccl
FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu22.04 AS nccl

RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy python3 openmpi-bin openmpi-common
RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy python3 openmpi-bin openmpi-common libibverbs-dev libopenmpi-dev autoconf libtool

COPY test test

RUN cd test && make build-nccl
RUN cd test && MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi make build-nccl-tests

COPY nccl_plugin nccl_plugin

RUN cd test && MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi make && make install
RUN cd test && make build-nccl-plugin
RUN cd test && make install

FROM ghcr.io/rust-lang/rust:nightly-bullseye-slim AS optcast
FROM ubuntu:22.04 AS optcast

COPY reduction_server reduction_server
RUN apt-get update && apt-get install -y \
curl \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# Rust install
ENV RUST_HOME /usr/local/lib/rust
ENV RUSTUP_HOME ${RUST_HOME}/rustup
ENV CARGO_HOME ${RUST_HOME}/cargo
RUN mkdir /usr/local/lib/rust && \
chmod 0755 $RUST_HOME
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > ${RUST_HOME}/rustup.sh \
&& chmod +x ${RUST_HOME}/rustup.sh \
&& ${RUST_HOME}/rustup.sh -y --default-toolchain nightly --no-modify-path
ENV PATH $PATH:$CARGO_HOME/bin

COPY --from=nccl /usr/local/lib /usr/local/lib

RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy clang libibverbs1

COPY reduction_server reduction_server

RUN cd reduction_server && cargo build -r

FROM optcast AS unittest

RUN cd reduction_server && cargo test --all
ENV RUST_LOG=info
RUN cd reduction_server && cargo test --all -- --nocapture

FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu20.04 AS final
FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu22.04 AS final

RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -qy --no-install-recommends openmpi-bin

Expand Down
4 changes: 3 additions & 1 deletion nccl_plugin/include/ibvwrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
* Copyright (c) 2015-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2015-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
Expand Down Expand Up @@ -59,6 +59,8 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries,
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr);
ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr);
ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);
Expand Down
47 changes: 42 additions & 5 deletions nccl_plugin/include/nccl.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
#endif

#define NCCL_MAJOR 2
#define NCCL_MINOR 15
#define NCCL_PATCH 1
#define NCCL_MINOR 20
#define NCCL_PATCH 3
#define NCCL_SUFFIX ""

#define NCCL_VERSION_CODE 21510
#define NCCL_VERSION_CODE 22003
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#ifdef __cplusplus
Expand All @@ -42,15 +42,24 @@ typedef enum { ncclSuccess = 0,
ncclInProgress = 7,
ncclNumResults = 8 } ncclResult_t;

#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1

/* Communicator configuration. Users can assign value to attributes to specify the
* behavior of a communicator. */
typedef struct ncclConfig_v21400 {
typedef struct ncclConfig_v21700 {
/* attributes that users should never touch. */
size_t size;
unsigned int magic;
unsigned int version;
/* attributes that users are able to customize. */
int blocking;
int cgaClusterSize;
int minCTAs;
int maxCTAs;
const char *netName;
int splitShare;
} ncclConfig_t;

/* Config initializer must be assigned to initialize config structure when it is created.
Expand All @@ -59,9 +68,23 @@ typedef struct ncclConfig_v21400 {
sizeof(ncclConfig_t), /* size */ \
0xcafebeef, /* magic */ \
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
1 /* blocking */ \
NCCL_CONFIG_UNDEF_INT, /* blocking */ \
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
}

/* NCCL malloc and free function for all types of NCCL optimizations
* (e.g. user buffer registration). The actual allocated size might
* be larger than requested due to granularity requirement. */
ncclResult_t ncclMemAlloc(void** ptr, size_t size);
ncclResult_t pncclMemAlloc(void** ptr, size_t size);

ncclResult_t ncclMemFree(void *ptr);
ncclResult_t pncclMemFree(void *ptr);

/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
* This integer is coded with the MAJOR, MINOR and PATCH level of the
* NCCL library
Expand Down Expand Up @@ -119,6 +142,10 @@ ncclResult_t pncclCommAbort(ncclComm_t comm);
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);

/* Returns a human-readable message of the last error that occurred. */
const char* ncclGetLastError(ncclComm_t comm);
const char* pncclGetLastError(ncclComm_t comm);

/* Checks whether the comm has encountered any asynchronous errors */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
Expand All @@ -135,6 +162,16 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);


/* Register CUDA buffer for zero-copy operation */
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);

/* Deregister CUDA buffer */
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);


/* Reduction operation selector */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum { ncclSum = 0,
Expand Down
5 changes: 2 additions & 3 deletions nccl_plugin/include/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,9 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#include "net_v8.h"
#include "net_v7.h"
#include "net_v6.h"
#include "net_v5.h"
#include "net_v4.h"

#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6

#endif // end include guard
32 changes: 32 additions & 0 deletions nccl_plugin/include/net_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

#ifndef NET_DEVICE_H_
#define NET_DEVICE_H_

#include "net_device.h"

#define NCCL_NET_DEVICE_INVALID_VERSION 0x0
#define NCCL_NET_MTU_SIZE 4096

// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7

typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;

typedef struct {
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
void* handle;
size_t size;
int needsProxyProgress;
} ncclNetDeviceHandle_v7_t;

typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t;

#endif
2 changes: 0 additions & 2 deletions nccl_plugin/include/net_v6.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ typedef struct {
int maxRecvs; // Maximum number of grouped receives.
}ncclNetProperties_v6_t;

typedef ncclNetProperties_v6_t ncclNetProperties_t;

typedef struct {
// Name of the network (mainly for logs)
const char* name;
Expand Down
121 changes: 121 additions & 0 deletions nccl_plugin/include/net_v7.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
* Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_NET_V7_H_
#define NCCL_NET_V7_H_

#include "net_device.h"

typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v7_t;

typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);

// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);

// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v7_t;

// v7 struct for backwards compatibility
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
// Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters capable of doing collective operations.
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Create a group for collective operations. handles have been created
// using listen() above. rank indicates caller's rank in the collective network.
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
// Returns whether a reduction operation on a data type is supported.
// 1 for supported, 0 otherwise.
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
// Performs an asynchronous allreduce operation on the collective group.
// May return request == NULL if the call cannot be performed (or would block).
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v7_t;

#endif // end include guard
Loading