GPU: Fix atomics on the host
davidrohr committed Feb 16, 2024
1 parent 2ed0a63 commit 32a8350
Showing 2 changed files with 38 additions and 73 deletions.
110 changes: 38 additions & 72 deletions GPU/Common/GPUCommonMath.h
@@ -24,6 +24,7 @@
#if !defined(GPUCA_GPUCODE_DEVICE)
#include <cmath>
#include <algorithm>
+ #include <atomic>
#endif

#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
@@ -85,69 +86,49 @@ class GPUCommonMath
template <class T>
GPUdi() static T AtomicExch(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
{
- return GPUCommonMath::AtomicExchInt(addr, val);
+ return GPUCommonMath::AtomicExchInternal(addr, val);
}

template <class T>
- GPUdi() static T AtomicCAS(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T cmp, T val)
+ GPUdi() static bool AtomicCAS(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T cmp, T val)
{
- return GPUCommonMath::AtomicCASInt(addr, cmp, val);
+ return GPUCommonMath::AtomicCASInternal(addr, cmp, val);
}
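The wrapper's contract changes here: AtomicCAS now reports success as a bool instead of returning the previous value. A minimal host-side sketch of the new contract, using plain std::atomic rather than the project's wrappers (tryClaim and owner are invented names for illustration):

#include <atomic>
#include <cstdio>

// Stand-in for the bool-returning AtomicCAS: claim a slot only if it is
// still empty (0). Under the old contract one compared the returned old
// value against `cmp`; the bool now encodes that comparison directly.
static std::atomic<unsigned int> owner{0};

bool tryClaim(unsigned int threadId)
{
  unsigned int expected = 0;
  // true iff the stored value equaled `expected` and was replaced.
  return owner.compare_exchange_strong(expected, threadId);
}

int main()
{
  std::printf("first claim: %d\n", tryClaim(1));  // 1 (success)
  std::printf("second claim: %d\n", tryClaim(2)); // 0 (already owned)
}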

template <class T>
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
{
- return GPUCommonMath::AtomicAddInt(addr, val);
+ return GPUCommonMath::AtomicAddInternal(addr, val);
}
template <class T>
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
{
- GPUCommonMath::AtomicMaxInt(addr, val);
+ GPUCommonMath::AtomicMaxInternal(addr, val);
}
template <class T>
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
{
- GPUCommonMath::AtomicMinInt(addr, val);
+ GPUCommonMath::AtomicMinInternal(addr, val);
}
template <class T>
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
{
- #ifdef GPUCA_GPUCODE_DEVICE
- return GPUCommonMath::AtomicExchInt(addr, val);
- #else
- T retVal = *addr;
- *addr = val;
- return retVal;
- #endif
+ return GPUCommonMath::AtomicExchInternal(addr, val);
}
template <class T>
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
{
- #ifdef GPUCA_GPUCODE_DEVICE
- return GPUCommonMath::AtomicAddInt(addr, val);
- #else
- T retVal = *addr;
- *addr += val;
- return retVal;
- #endif
+ return GPUCommonMath::AtomicAddInternal(addr, val);
}
template <class T>
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
{
- #ifdef GPUCA_GPUCODE_DEVICE
- GPUCommonMath::AtomicMaxInt(addr, val);
- #else
- *addr = std::max(*addr, val);
- #endif
+ GPUCommonMath::AtomicMaxInternal(addr, val);
}
template <class T>
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
{
- #ifdef GPUCA_GPUCODE_DEVICE
- GPUCommonMath::AtomicMinInt(addr, val);
- #else
- *addr = std::min(*addr, val);
- #endif
+ GPUCommonMath::AtomicMinInternal(addr, val);
}
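This block is the core of the host fix: the removed #else branches emulated the shared-memory atomics with plain loads and stores, which is a data race once the CPU backend runs a kernel on several host threads. Routing everything through the *Internal helpers makes the host path genuinely atomic. A standalone sketch of the failure mode, assuming nothing beyond the standard library:

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main()
{
  constexpr int kThreads = 8, kIters = 100000;

  // Racy variant, analogous to the removed host fallback "*addr += val;":
  unsigned int racy = 0;
  // Atomic variant, analogous to routing through AtomicAddInternal:
  std::atomic<unsigned int> safe{0};

  std::vector<std::thread> pool;
  for (int t = 0; t < kThreads; t++) {
    pool.emplace_back([&] {
      for (int i = 0; i < kIters; i++) {
        racy += 1;                                    // lost updates likely (UB: data race)
        safe.fetch_add(1, std::memory_order_relaxed); // always correct
      }
    });
  }
  for (auto& th : pool) {
    th.join();
  }
  // `safe` is exactly kThreads * kIters; `racy` typically falls short.
  std::printf("racy=%u safe=%u expected=%d\n", racy, safe.load(), kThreads * kIters);
}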
GPUd() static int Mul24(int a, int b);
GPUd() static float FMulRZ(float a, float b);
@@ -175,15 +156,15 @@ class GPUCommonMath

private:
template <class S, class T>
- GPUd() static unsigned int AtomicExchInt(S* addr, T val);
+ GPUd() static unsigned int AtomicExchInternal(S* addr, T val);
template <class S, class T>
- GPUd() static T AtomicCASInt(S* addr, T cmp, T val);
+ GPUd() static bool AtomicCASInternal(S* addr, T cmp, T val);
template <class S, class T>
- GPUd() static unsigned int AtomicAddInt(S* addr, T val);
+ GPUd() static unsigned int AtomicAddInternal(S* addr, T val);
template <class S, class T>
- GPUd() static void AtomicMaxInt(S* addr, T val);
+ GPUd() static void AtomicMaxInternal(S* addr, T val);
template <class S, class T>
- GPUd() static void AtomicMinInt(S* addr, T val);
+ GPUd() static void AtomicMinInternal(S* addr, T val);
};

typedef GPUCommonMath CAMath;
@@ -441,7 +422,7 @@ GPUhdi() float GPUCommonMath::Copysign(float x, float y)
}

template <class S, class T>
- GPUdi() unsigned int GPUCommonMath::AtomicExchInt(S* addr, T val)
+ GPUdi() unsigned int GPUCommonMath::AtomicExchInternal(S* addr, T val)
{
#if defined(GPUCA_GPUCODE) && defined(__OPENCLCPP__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CPP_CLANG_C11_ATOMICS))
return ::atomic_exchange(addr, val);
@@ -454,33 +435,28 @@ GPUdi() unsigned int GPUCommonMath::AtomicExchInt(S* addr, T val)
__atomic_exchange(addr, &val, &old, __ATOMIC_SEQ_CST);
return old;
#else
- unsigned int old = *addr;
- *addr = val;
- return old;
+ return reinterpret_cast<std::atomic<T>*>(addr)->exchange(val);
#endif
}
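The new host fallback reinterprets a plain T* as std::atomic<T>*. That is not formally sanctioned by the standard, though it works on common ABIs where std::atomic<T> is layout-compatible with T and lock-free; C++20's std::atomic_ref expresses the same intent portably. A hedged sketch of that alternative (assuming a C++20 toolchain, which this codebase may not target; hostAtomicExch is an invented name):

#include <atomic>

// Same effect as the reinterpret_cast in AtomicExchInternal's host branch,
// but without aliasing a non-atomic object as std::atomic<T>.
template <class T>
T hostAtomicExch(T* addr, T val)
{
  static_assert(std::atomic_ref<T>::required_alignment <= alignof(T),
                "object must be suitably aligned for atomic access");
  return std::atomic_ref<T>(*addr).exchange(val);
}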

template <class S, class T>
- GPUdi() T GPUCommonMath::AtomicCASInt(S* addr, T cmp, T val)
+ GPUdi() bool GPUCommonMath::AtomicCASInternal(S* addr, T cmp, T val)
{
#if defined(GPUCA_GPUCODE) && defined(__OPENCLCPP__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CPP_CLANG_C11_ATOMICS))
- return ::atomic_compare_exchange(addr, cmp, val);
+ return ::atomic_compare_exchange(addr, cmp, val) == cmp;
#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
- return ::atomic_cmpxchg(addr, cmp, val);
+ return ::atomic_cmpxchg(addr, cmp, val) == cmp;
#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
- return ::atomicCAS(addr, cmp, val);
+ return ::atomicCAS(addr, cmp, val) == cmp;
#elif defined(WITH_OPENMP)
- __atomic_compare_exchange(addr, &cmp, &val, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
- return cmp;
+ return __atomic_compare_exchange(addr, &cmp, &val, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
#else
- T old = *addr;
- *addr = (old == cmp) ? val : old;
- return old;
+ return reinterpret_cast<std::atomic<T>*>(addr)->compare_exchange_strong(cmp, val);
#endif
}

template <class S, class T>
- GPUdi() unsigned int GPUCommonMath::AtomicAddInt(S* addr, T val)
+ GPUdi() unsigned int GPUCommonMath::AtomicAddInternal(S* addr, T val)
{
#if defined(GPUCA_GPUCODE) && defined(__OPENCLCPP__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CPP_CLANG_C11_ATOMICS))
return ::atomic_fetch_add(addr, val);
@@ -491,76 +467,66 @@ GPUdi() unsigned int GPUCommonMath::AtomicAddInt(S* addr, T val)
#elif defined(WITH_OPENMP)
return __atomic_add_fetch(addr, val, __ATOMIC_SEQ_CST) - val;
#else
- unsigned int old = *addr;
- *addr += val;
- return old;
+ return reinterpret_cast<std::atomic<T>*>(addr)->fetch_add(val);
#endif
}
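A subtlety worth noting: every branch must return the value from before the addition. GCC's __atomic_add_fetch yields the new value, hence the "- val" in the OpenMP branch, while std::atomic's fetch_add yields the old value directly. A tiny check of the equivalence (assumes a GCC-compatible compiler for the builtin):

#include <atomic>
#include <cassert>

int main()
{
  unsigned int raw = 10;
  // GCC builtin: returns the NEW value, so subtract val to recover the old one.
  unsigned int oldA = __atomic_add_fetch(&raw, 5u, __ATOMIC_SEQ_CST) - 5u;
  std::atomic<unsigned int> a{10};
  // std::atomic: fetch_add returns the OLD value directly.
  unsigned int oldB = a.fetch_add(5u);
  assert(oldA == 10 && oldB == 10);
}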

template <class S, class T>
- GPUdi() void GPUCommonMath::AtomicMaxInt(S* addr, T val)
+ GPUdi() void GPUCommonMath::AtomicMaxInternal(S* addr, T val)
{
#if defined(GPUCA_GPUCODE) && defined(__OPENCLCPP__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CPP_CLANG_C11_ATOMICS))
::atomic_fetch_max(addr, val);
#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
::atomic_max(addr, val);
#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
::atomicMax(addr, val);
- #elif defined(WITH_OPENMP)
- while (*addr < val) {
- AtomicExch(addr, val);
- }
#else
- if (*addr < val) {
- *addr = val;
+ S current;
+ while ((current = *(volatile S*)addr) < val && !AtomicCASInternal(addr, current, val)) {
}
#endif // GPUCA_GPUCODE
}
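The replaced OpenMP fallback was subtly wrong: between the "*addr < val" check and the unconditional exchange, another thread could publish a larger value that the exchange would then overwrite. The new CAS loop closes that window: re-read the current value, stop if it is already at least val, otherwise try to install val and retry only when someone intervened; the loop terminates because every failed CAS means the stored value advanced. The same pattern on std::atomic, as a self-contained sketch:

#include <atomic>

// Atomic max via compare-exchange, mirroring the CAS loop in
// AtomicMaxInternal's host fallback.
template <class T>
void atomicMax(std::atomic<T>& a, T val)
{
  T current = a.load();
  // On failure, compare_exchange_weak reloads `current` with the value
  // another thread just wrote, so the condition is re-tested each round.
  while (current < val && !a.compare_exchange_weak(current, val)) {
  }
}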

template <class S, class T>
- GPUdi() void GPUCommonMath::AtomicMinInt(S* addr, T val)
+ GPUdi() void GPUCommonMath::AtomicMinInternal(S* addr, T val)
{
#if defined(GPUCA_GPUCODE) && defined(__OPENCLCPP__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CPP_CLANG_C11_ATOMICS))
::atomic_fetch_min(addr, val);
#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
::atomic_min(addr, val);
#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
::atomicMin(addr, val);
- #elif defined(WITH_OPENMP)
- while (*addr > val) {
- AtomicExch(addr, val);
- }
#else
- if (*addr > val) {
- *addr = val;
+ S current;
+ while ((current = *(volatile S*)addr) > val && !AtomicCASInternal(addr, current, val)) {
}
#endif // GPUCA_GPUCODE
}

#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__ROOTCINT__) && !defined(G__ROOT)
#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
template <>
- GPUdii() void GPUCommonMath::AtomicMaxInt(GPUglobalref() GPUgeneric() GPUAtomic(float) * addr, float val)
+ GPUdii() void GPUCommonMath::AtomicMaxInternal(GPUglobalref() GPUgeneric() GPUAtomic(float) * addr, float val)
{
if (val == -0.f) {
val = 0.f;
}
if (val >= 0) {
- AtomicMaxInt((GPUAtomic(int)*)addr, __float_as_int(val));
+ AtomicMaxInternal((GPUAtomic(int)*)addr, __float_as_int(val));
} else {
- AtomicMinInt((GPUAtomic(unsigned int)*)addr, __float_as_uint(val));
+ AtomicMinInternal((GPUAtomic(unsigned int)*)addr, __float_as_uint(val));
}
}
template <>
- GPUdii() void GPUCommonMath::AtomicMinInt(GPUglobalref() GPUgeneric() GPUAtomic(float) * addr, float val)
+ GPUdii() void GPUCommonMath::AtomicMinInternal(GPUglobalref() GPUgeneric() GPUAtomic(float) * addr, float val)
{
if (val == -0.f) {
val = 0.f;
}
if (val >= 0) {
- AtomicMinInt((GPUAtomic(int)*)addr, __float_as_int(val));
+ AtomicMinInternal((GPUAtomic(int)*)addr, __float_as_int(val));
} else {
- AtomicMaxInt((GPUAtomic(unsigned int)*)addr, __float_as_uint(val));
+ AtomicMaxInternal((GPUAtomic(unsigned int)*)addr, __float_as_uint(val));
}
}
#endif
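These specializations rest on the IEEE-754 ordering trick: non-negative floats order like their bit patterns read as signed ints, while negative floats order in reverse when read as unsigned ints, so a float max over negatives becomes an unsigned-int min (the -0.f normalization avoids the distinct bit patterns of 0.f and -0.f). A host-side check of the ordering claim, with C++20 std::bit_cast standing in for the device intrinsics (an assumption on my part; the device code uses __float_as_int/__float_as_uint):

#include <bit>
#include <cassert>
#include <cstdint>

int main()
{
  // Non-negative floats: order matches the signed-int view of the bits.
  assert((1.5f < 2.5f) == (std::bit_cast<int32_t>(1.5f) < std::bit_cast<int32_t>(2.5f)));
  // Negative floats: order is REVERSED in the unsigned view, so taking the
  // unsigned-int min of the bits yields the float max.
  assert(-1.5f > -2.5f);
  assert(std::bit_cast<uint32_t>(-1.5f) < std::bit_cast<uint32_t>(-2.5f));
}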
1 change: 0 additions & 1 deletion GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -325,7 +325,6 @@ static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class
GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(unsigned int id, std::string&& name, int J, int num, int type, RecoStep step)
{
while (timerFlag.test_and_set()) {
- ;
}
if (mTimers.size() <= id) {
mTimers.resize(id + 1);
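The deleted ";" was a redundant empty statement; the braces alone form the busy-wait. As the TODO above suggests, this hand-rolled lock is a natural fit for a small RAII class. One possible shape, sketched with invented names (not part of the codebase):

#include <atomic>

// Minimal RAII spinlock over std::atomic_flag, the shape the TODO hints at.
class SpinLockGuard
{
 public:
  explicit SpinLockGuard(std::atomic_flag& f) : mFlag(f)
  {
    while (mFlag.test_and_set(std::memory_order_acquire)) {
      // busy-wait; could yield or pause here
    }
  }
  ~SpinLockGuard() { mFlag.clear(std::memory_order_release); }
  SpinLockGuard(const SpinLockGuard&) = delete;
  SpinLockGuard& operator=(const SpinLockGuard&) = delete;

 private:
  std::atomic_flag& mFlag;
};

// Usage: static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT;
//        { SpinLockGuard lock(timerFlag); /* touch mTimers */ }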
