diff --git a/csrc/atomics.cuh b/csrc/atomics.cuh
index f5b5de0..b9179f7 100644
--- a/csrc/atomics.cuh
+++ b/csrc/atomics.cuh
@@ -172,7 +172,10 @@ static inline __device__ void atomAdd(int64_t *address, int64_t val) {
 static inline __device__ void atomAdd(float *address, float val) {
   atomicAdd(address, val);
 }
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
+// Fall back to the CAS-based implementation only on architectures without a
+// native double atomicAdd (SM < 6.0). CUDA 13.x only supports SM 70+, so the
+// fallback is never selected there; the old CUDA_VERSION < 8000 test is gone.
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) && (CUDA_VERSION < 13000)
 static inline __device__ void atomAdd(double *address, double val) {
   AtomicAddDecimalImpl()(address, val);
 }
diff --git a/setup_ops.py b/setup_ops.py
index 7b74644..c5c6167 100644
--- a/setup_ops.py
+++ b/setup_ops.py
@@ -71,6 +71,14 @@ def get_extensions():
     nvcc_flags = [] if nvcc_flags == "" else nvcc_flags.split(" ")
     nvcc_flags += ["-O3"]
     nvcc_flags += ["--expt-relaxed-constexpr"]
+
+    # CUDA 13.x may reject newer or not-yet-certified host compilers, so pass
+    # --allow-unsupported-compiler there. Only the leading component is parsed:
+    # cuda_version may be non-numeric (e.g. "False" on CPU-only builds).
+    cuda_major = str(paddle.version.cuda_version).split(".")[0]
+    if cuda_major.isdigit() and int(cuda_major) >= 13:
+        nvcc_flags += ["--allow-unsupported-compiler"]
+
     extra_compile_args["nvcc"] = nvcc_flags
 
     src = get_sources()