From 061d8eefe4024584660b9da5d1a6e496f0380c60 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 18 Oct 2024 17:13:24 +0000 Subject: [PATCH] param: Temporarily hack around ODR problem The param implementation stamped out accessor functions in each compilation unit, which created obvious problems with C++'s naming rules. We know we want to rewrite this in the future, so for now just stamp out one implementation in nccl_ofi_param.c that everyone can call, and do some ugly macro hacking to make that a minimum required change. Signed-off-by: Brian Barrett --- include/nccl_ofi_param.h | 43 ++++++++++++++++++++++++++++++++++++---- src/Makefile.am | 1 + src/nccl_ofi_param.c | 25 +++++++++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 src/nccl_ofi_param.c diff --git a/include/nccl_ofi_param.h b/include/nccl_ofi_param.h index ac4ddcac0..0321acf16 100644 --- a/include/nccl_ofi_param.h +++ b/include/nccl_ofi_param.h @@ -18,9 +18,40 @@ extern "C" { #include "nccl_ofi_log.h" #include "nccl_ofi_pthread.h" +/* + * This is an ugly hack. The original implementation of + * nccl_ofi_param created inline functions to access each environment + * variable, using the macros found in nccl_ofi_param.h. However, + * this creates something of an ODR problem, as multiple complication + * units can call the same param lookup function, and that results in + * naming conflicts. So instead, we have the header file act like a + * normal header file most of the time, and when included from + * nccl_ofi_param.c with OFI_NCCL_PARAM_DEFINE set to 1, stamps out + * the original implementations of the functions. So now we have one + * copy of each function that everyone can call. + * + * This is intended to be a transient state. We want to rewrite the + * entire param system once we've finished moving to C++, but need to + * solve the ODR problem before we move to C++. So here lies one of + * the more terrible pieces of code I've ever written. + */ +#ifndef OFI_NCCL_PARAM_DEFINE + +#define OFI_NCCL_PARAM_UINT(name, env, default_value) \ +uint64_t ofi_nccl_##name(void) + +#define OFI_NCCL_PARAM_INT(name, env, default_value) \ +int64_t ofi_nccl_##name(void) + +#define OFI_NCCL_PARAM_STR(name, env, default_value) \ +const char *ofi_nccl_##name(void) + +#else + #define OFI_NCCL_PARAM_UINT(name, env, default_value) \ + uint64_t ofi_nccl_##name(void); \ static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \ - static inline uint64_t ofi_nccl_##name() \ + uint64_t ofi_nccl_##name(void) \ { \ static bool initialized = false; \ static uint64_t value = default_value; \ @@ -57,8 +88,9 @@ extern "C" { } #define OFI_NCCL_PARAM_INT(name, env, default_value) \ +int64_t ofi_nccl_##name(); \ static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \ -static inline int64_t ofi_nccl_##name() { \ +int64_t ofi_nccl_##name() { \ static bool initialized = false; \ static int64_t value = default_value; \ if (initialized) { \ @@ -89,8 +121,9 @@ static inline int64_t ofi_nccl_##name() { \ } #define OFI_NCCL_PARAM_STR(name, env, default_value) \ +const char *ofi_nccl_##name(); \ static pthread_mutex_t ofi_nccl_param_lock_##name = PTHREAD_MUTEX_INITIALIZER; \ -static inline const char *ofi_nccl_##name() { \ +const char *ofi_nccl_##name() { \ static bool initialized = false; \ static const char *value = default_value; \ if (initialized) { \ @@ -118,6 +151,8 @@ static inline const char *ofi_nccl_##name() { \ return value; \ } +#endif + /* * Enable using endpoints with IPv6 addressing format for TCP provider. * By default, we disable using endpoints having IPv6 addressing format. @@ -288,7 +323,7 @@ OFI_NCCL_PARAM_UINT(eager_max_size, "EAGER_MAX_SIZE", 8192); #define OFI_NCCL_PARAM_ERRORCHECK_MUTEX_DEFAULT 1 #endif OFI_NCCL_PARAM_INT(errorcheck_mutex, "ERRORCHECK_MUTEX", - OFI_NCCL_PARAM_ERRORCHECK_MUTEX_DEFAULT) + OFI_NCCL_PARAM_ERRORCHECK_MUTEX_DEFAULT); /* * If 0, create a Libfabric endpoint per domain, shared across all diff --git a/src/Makefile.am b/src/Makefile.am index ed5003db5..a9252d102 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -25,6 +25,7 @@ sources = \ nccl_ofi_pthread.c \ nccl_ofi_dmabuf.c \ nccl_ofi_ep_addr_list.c \ + nccl_ofi_param.c \ tracepoint.c if WANT_PLATFORM_AWS diff --git a/src/nccl_ofi_param.c b/src/nccl_ofi_param.c new file mode 100644 index 000000000..9d6748b3f --- /dev/null +++ b/src/nccl_ofi_param.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "config.h" + +/* + * This is an ugly hack. The original implementation of + * nccl_ofi_param created inline functions to access each environment + * variable, using the macros found in nccl_ofi_param.h. However, + * this creates something of an ODR problem, as multiple complication + * units can call the same param lookup function, and that results in + * naming conflicts. So instead, we have the header file act like a + * normal header file most of the time, and when included from + * nccl_ofi_param.c with OFI_NCCL_PARAM_DEFINE set to 1, stamps out + * the original implementations of the functions. So now we have one + * copy of each function that everyone can call. + * + * This is intended to be a transient state. We want to rewrite the + * entire param system once we've finished moving to C++, but need to + * solve the ODR problem before we move to C++. So here lies one of + * the more terrible pieces of code I've ever written. + */ +#define OFI_NCCL_PARAM_DEFINE 1 +#include "nccl_ofi_param.h"