duplicated gaussquad use in cuda version (pending unifying such utilities src code)

ahbarnett · ahbarnett · commit 1aa03dcb225f · 2025-06-08T15:32:58.000-04:00
diff --git a/CHANGELOG b/CHANGELOG
@@ -3,7 +3,7 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
 
 * replaced LGPL-licensed Gauss-Legendre quadrature code by Apache2-licensed
-  code adapted from Jason Kaye's cppdlr. CPU only for now. PR #692 (Barnett).
+  code adapted from Jason Kaye's cppdlr. CPU and GPU. PR #692 (Barnett).
 
 V 2.4.0 (5/27/25)
 
diff --git a/include/cufinufft/contrib/legendre_rule_fast.h b/include/cufinufft/contrib/legendre_rule_fast.h
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
@@ -87,8 +87,10 @@ class WithCudaDevice {
   }
 };
 
-// ahb math helpers
+// math helpers whose source is in src/cuda/utils.cpp
 CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b);
+void gaussquad(int n, double *xgl, double *wgl);
+std::tuple<double, double> leg_eval(int n, double x);
 
 template<typename T> T infnorm(int n, std::complex<T> *a) {
   T nrm = 0.0;
@@ -107,8 +109,8 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {
  */
 
 template<typename T>
-static __forceinline__ __device__ void atomicAddComplexShared(
-    cuda_complex<T> *address, cuda_complex<T> res) {
+static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex<T> *address,
+                                                              cuda_complex<T> res) {
   const auto raw_address = reinterpret_cast<T *>(address);
   atomicAdd(raw_address, res.x);
   atomicAdd(raw_address + 1, res.y);
@@ -120,8 +122,8 @@ static __forceinline__ __device__ void atomicAddComplexShared(
  * on shared memory are supported so we leverage them
  */
 template<typename T>
-static __forceinline__ __device__ void atomicAddComplexGlobal(
-    cuda_complex<T> *address, cuda_complex<T> res) {
+static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex<T> *address,
+                                                              cuda_complex<T> res) {
   if constexpr (
       std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
     atomicAdd(address, res);
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp)
+set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp)
 
 set(PRECISION_DEPENDENT_SRC
     spreadinterp.cpp
diff --git a/src/cuda/common.cu b/src/cuda/common.cu
@@ -14,8 +14,6 @@
 #include <cufinufft/spreadinterp.h>
 #include <cufinufft/utils.h>
 
-#include <legendre_rule_fast.h>
-
 namespace cufinufft {
 namespace common {
 using namespace cufinufft::spreadinterp;
@@ -205,10 +203,9 @@ void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *phase,
   const auto q = (int)(2 + 3.0 * J2); // matches CPU code
   double z[2 * MAX_NQUAD];
   double w[2 * MAX_NQUAD];
-  finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used,
-  // eg on (0,1)
-  for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n
-    z[n] *= J2;                 // rescale nodes
+  cufinufft::utils::gaussquad(2 * q, z, w); // only half the nodes used, for (0,1)
+  for (int n = 0; n < q; ++n) {             // set up nodes z_n and vals f_n
+    z[n] *= J2;                             // rescale nodes
     f[n]     = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei
     phase[n] = T(2.0 * M_PI * z[n] / T(nf));               // phase winding rates
   }
@@ -222,9 +219,7 @@ void onedim_nuft_kernel_precomp(T *f, T *z, finufft_spread_opts opts) {
   int q = (int)(2 + 2.0 * J2); // matches CPU code
   double z_local[2 * MAX_NQUAD];
   double w_local[2 * MAX_NQUAD];
-  finufft::quadrature::legendre_compute_glr(2 * q, z_local, w_local); // only half the
-                                                                      // nodes used, eg on
-                                                                      // (0,1)
+  cufinufft::utils::gaussquad(2 * q, z_local, w_local);   // half the nodes, (0,1)
   for (int n = 0; n < q; ++n) {                           // set up nodes z_n and vals f_n
     z[n] = J2 * T(z_local[n]);                            // rescale nodes
     f[n] = J2 * w_local[n] * evaluate_kernel(z[n], opts); // vals & quadr wei
diff --git a/src/cuda/utils.cpp b/src/cuda/utils.cpp
@@ -2,12 +2,13 @@
 
 namespace cufinufft {
 namespace utils {
+
 CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b)
 // finds even integer not less than n, with prime factors no larger than 5
 // (ie, "smooth") and is a multiple of b (b is a number whose only prime
 // factors are 2,3,5). Adapted from fortran in hellskitchen. Barnett 2/9/17
 // changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
-// added condition about b Melody 05/31/20
+// added condition about b, Melody Shih 05/31/20
 {
   if (n <= 2) return 2;
   if (n % 2 == 1) n += 1;                // even
@@ -23,5 +24,61 @@ CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b)
   return nplus;
 }
 
+void gaussquad(int n, double *xgl, double *wgl) {
+  // n-node Gauss-Legendre rule: writes the nodes into xgl[0..n-1] and the weights
+  // into wgl[0..n-1], so the caller must provide at least n doubles in each array.
+  // copied from FINUFFT/src/finufft_utils.cpp; see that for explanation.
+  // Two robustness changes relative to that copy: n <= 0 is a no-op (it used to
+  // write xgl[0], wgl[0] and wgl[-1]), and the Newton loop has an iteration cap so
+  // that a NaN or non-converging step cannot spin forever.
+
+  if (n <= 0) return; // no nodes requested, and no valid index to write to
+
+  constexpr int max_newton_iters = 100; // safety cap only; see note above
+
+  double x = 0, dx = 0;
+  int convcount = 0;
+
+  // Get Gauss-Legendre nodes
+  xgl[n / 2] = 0;                   // If odd number of nodes, middle node is 0
+  for (int i = 0; i < n / 2; i++) { // Loop through nodes
+    convcount = 0;
+    x         = cos((2 * i + 1) * PI / (2 * n)); // Initial guess: Chebyshev node
+    for (int iter = 0; iter < max_newton_iters; ++iter) { // Newton iteration
+      auto [p, dp] = leg_eval(n, x);
+      dx           = -p / dp;
+      x += dx; // Newton step
+      if (std::abs(dx) < 1e-14) {
+        convcount++;
+      }
+      if (convcount == 3) {
+        break;
+      } // If convergence tol hit 3 times, stop
+    }
+    xgl[i]         = -x;
+    xgl[n - i - 1] = x; // Symmetric nodes
+  }
+
+  // Get Gauss-Legendre weights from formula
+  // w_i = -2 / ((n+1)*P_n'(x_i)*P_{n+1}(x_i)) (Atkinson '89, pg. 276)
+  // For even n the last pass (i = n/2) recomputes the already-filled middle pair.
+  for (int i = 0; i < n / 2 + 1; i++) {
+    auto [junk1, dp] = leg_eval(n, xgl[i]);
+    auto [p, junk2]  = leg_eval(n + 1, xgl[i]); // This is a bit inefficient, but who
+                                                // cares...
+    wgl[i]         = -2 / ((n + 1) * dp * p);
+    wgl[n - i - 1] = wgl[i]; // weights are mirrored like the nodes
+  }
+}
+
+std::tuple<double, double> leg_eval(int n, double x) {
+  // Returns {P_n(x), P_n'(x)}: the degree-n Legendre polynomial and its derivative
+  // evaluated at x. Intended for n >= 0.
+  // copied from FINUFFT/src/finufft_utils.cpp; see that for explanation.
+  // Unlike that copy, x = +-1 is handled explicitly: the derivative formula used
+  // below divides by x^2 - 1 and would otherwise give 0/0 = NaN there.
+
+  if (n == 0) {
+    return {1.0, 0.0};
+  }
+  if (n == 1) {
+    return {x, 1.0};
+  }
+  // Three-term recurrence and formula for derivative
+  double p0 = 0.0, p1 = 1.0, p2 = x;
+  for (int i = 1; i < n; i++) {
+    p0 = p1;
+    p1 = p2;
+    p2 = ((2 * i + 1) * x * p1 - i * p0) / (i + 1);
+  }
+  // Here p2 holds P_n(x) and p1 holds P_{n-1}(x).
+  const double denom = x * x - 1;
+  if (denom == 0) {
+    // Endpoint values: P_n'(1) = n(n+1)/2 and P_n'(-1) = (-1)^(n+1) n(n+1)/2.
+    const double dp_end = 0.5 * n * (n + 1.0);
+    return {p2, (x < 0 && n % 2 == 0) ? -dp_end : dp_end};
+  }
+  return {p2, n * (x * p2 - p1) / denom};
+}
+
 } // namespace utils
 } // namespace cufinufft

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp)`
	`1`	`+set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp)`
`2`	`2`
`3`	`3`	`set(PRECISION_DEPENDENT_SRC`
`4`	`4`	`spreadinterp.cpp`