diff --git a/csrc/atomics.cuh b/csrc/atomics.cuh
index f5b5de0..b9179f7 100644
--- a/csrc/atomics.cuh
+++ b/csrc/atomics.cuh
@@ -172,7 +172,10 @@ static inline __device__ void atomAdd(int64_t *address, int64_t val) {
 static inline __device__ void atomAdd(float *address, float val) {
   atomicAdd(address, val);
 }
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
+// Native atomicAdd(double*) needs SM 6.0+. CUDA 13 toolkits cannot target
+// architectures older than that, so the CAS-based fallback below is compiled
+// only for SM < 6.0 device passes built with a toolkit older than CUDA 13.
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) && (CUDA_VERSION < 13000)
 static inline __device__ void atomAdd(double *address, double val) {
   AtomicAddDecimalImpl<double, sizeof(double)>()(address, val);
 }
diff --git a/setup_ops.py b/setup_ops.py
index 7b74644..c5c6167 100644
--- a/setup_ops.py
+++ b/setup_ops.py
@@ -71,6 +71,14 @@ def get_extensions():
         nvcc_flags = [] if nvcc_flags == "" else nvcc_flags.split(" ")
         nvcc_flags += ["-O3"]
         nvcc_flags += ["--expt-relaxed-constexpr"]
+
+        # CUDA 13+ nvcc may reject newer, not-yet-certified host compilers;
+        # --allow-unsupported-compiler bypasses that check. Only the leading
+        # version component is read, so any number of dotted parts is accepted.
+        cuda_major = paddle.version.cuda_version.split(".")[0]
+        if cuda_major.isdigit() and int(cuda_major) >= 13:
+            nvcc_flags += ["--allow-unsupported-compiler"]
+
         extra_compile_args["nvcc"] = nvcc_flags
 
     src = get_sources()