Skip to content

Commit

Permalink
param: Temporarily hack around ODR problem
Browse files Browse the repository at this point in the history
The param implementation stamped out accessor functions in each
compilation unit, which created obvious problems with C++'s naming
rules.  We know we want to rewrite this in the future, so for now
just stamp out one implementation in nccl_ofi_param.c that everyone
can call, and do some ugly macro hacking to make that a minimum
required change.

Signed-off-by: Brian Barrett <bbarrett@amazon.com>
  • Loading branch information
bwbarrett committed Oct 21, 2024
1 parent f1594d8 commit 061d8ee
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 4 deletions.
43 changes: 39 additions & 4 deletions include/nccl_ofi_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,40 @@ extern "C" {
#include "nccl_ofi_log.h"
#include "nccl_ofi_pthread.h"

/*
* This is an ugly hack. The original implementation of
* nccl_ofi_param created inline functions to access each environment
* variable, using the macros found in nccl_ofi_param.h. However,
* this creates something of an ODR problem, as multiple complication
* units can call the same param lookup function, and that results in
* naming conflicts. So instead, we have the header file act like a
* normal header file most of the time, and when included from
* nccl_ofi_param.c with OFI_NCCL_PARAM_DEFINE set to 1, stamps out
* the original implementations of the functions. So now we have one
* copy of each function that everyone can call.
*
* This is intended to be a transient state. We want to rewrite the
* entire param system once we've finished moving to C++, but need to
* solve the ODR problem before we move to C++. So here lies one of
* the more terrible pieces of code I've ever written.
*/
#ifndef OFI_NCCL_PARAM_DEFINE

#define OFI_NCCL_PARAM_UINT(name, env, default_value) \
uint64_t ofi_nccl_##name(void)

#define OFI_NCCL_PARAM_INT(name, env, default_value) \
int64_t ofi_nccl_##name(void)

#define OFI_NCCL_PARAM_STR(name, env, default_value) \
const char *ofi_nccl_##name(void)

#else

#define OFI_NCCL_PARAM_UINT(name, env, default_value) \
uint64_t ofi_nccl_##name(void); \
static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \
static inline uint64_t ofi_nccl_##name() \
uint64_t ofi_nccl_##name(void) \
{ \
static bool initialized = false; \
static uint64_t value = default_value; \
Expand Down Expand Up @@ -57,8 +88,9 @@ extern "C" {
}

#define OFI_NCCL_PARAM_INT(name, env, default_value) \
int64_t ofi_nccl_##name(); \
static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \
static inline int64_t ofi_nccl_##name() { \
int64_t ofi_nccl_##name() { \
static bool initialized = false; \
static int64_t value = default_value; \
if (initialized) { \
Expand Down Expand Up @@ -89,8 +121,9 @@ static inline int64_t ofi_nccl_##name() { \
}

#define OFI_NCCL_PARAM_STR(name, env, default_value) \
const char *ofi_nccl_##name(); \
static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \
static inline const char *ofi_nccl_##name() { \
const char *ofi_nccl_##name() { \
static bool initialized = false; \
static const char *value = default_value; \
if (initialized) { \
Expand Down Expand Up @@ -118,6 +151,8 @@ static inline const char *ofi_nccl_##name() { \
return value; \
}

#endif

/*
* Enable using endpoints with IPv6 addressing format for TCP provider.
* By default, we disable using endpoints having IPv6 addressing format.
Expand Down Expand Up @@ -288,7 +323,7 @@ OFI_NCCL_PARAM_UINT(eager_max_size, "EAGER_MAX_SIZE", 8192);
#define OFI_NCCL_PARAM_ERRORCHECK_MUTEX_DEFAULT 1
#endif
OFI_NCCL_PARAM_INT(errorcheck_mutex, "ERRORCHECK_MUTEX",
OFI_NCCL_PARAM_ERRORCHECK_MUTEX_DEFAULT)
OFI_NCCL_PARAM_ERRORCHECK_MUTEX_DEFAULT);

/*
* If 0, create a Libfabric endpoint per domain, shared across all
Expand Down
1 change: 1 addition & 0 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ sources = \
nccl_ofi_pthread.c \
nccl_ofi_dmabuf.c \
nccl_ofi_ep_addr_list.c \
nccl_ofi_param.c \
tracepoint.c

if WANT_PLATFORM_AWS
Expand Down
25 changes: 25 additions & 0 deletions src/nccl_ofi_param.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
*/

#include "config.h"

/*
* This is an ugly hack. The original implementation of
* nccl_ofi_param created inline functions to access each environment
* variable, using the macros found in nccl_ofi_param.h. However,
* this creates something of an ODR problem, as multiple complication
* units can call the same param lookup function, and that results in
* naming conflicts. So instead, we have the header file act like a
* normal header file most of the time, and when included from
* nccl_ofi_param.c with OFI_NCCL_PARAM_DEFINE set to 1, stamps out
* the original implementations of the functions. So now we have one
* copy of each function that everyone can call.
*
* This is intended to be a transient state. We want to rewrite the
* entire param system once we've finished moving to C++, but need to
* solve the ODR problem before we move to C++. So here lies one of
* the more terrible pieces of code I've ever written.
*/
#define OFI_NCCL_PARAM_DEFINE 1
#include "nccl_ofi_param.h"

0 comments on commit 061d8ee

Please sign in to comment.