Skip to content

Commit 1aa03dc

Browse files
committed
duplicated gaussquad use in cuda version (pending unifying such utilities src code)
1 parent 277b26a commit 1aa03dc

File tree

6 files changed

+71
-23
lines changed

6 files changed

+71
-23
lines changed

CHANGELOG

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
33

44

55
* replaced LGPL-licensed Gauss-Legendre quadrature code by Apache2-licensed
6-
code adapted from Jason Kaye's cppdlr. CPU only for now. PR #692 (Barnett).
6+
code adapted from Jason Kaye's cppdlr. CPU and GPU. PR #692 (Barnett).
77

88
V 2.4.0 (5/27/25)
99

include/cufinufft/contrib/legendre_rule_fast.h

Lines changed: 0 additions & 6 deletions
This file was deleted.

include/cufinufft/utils.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,10 @@ class WithCudaDevice {
8787
}
8888
};
8989

90-
// ahb math helpers
90+
// math helpers whose source is in src/cuda/utils.cpp
9191
CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b);
92+
void gaussquad(int n, double *xgl, double *wgl);
93+
std::tuple<double, double> leg_eval(int n, double x);
9294

9395
template<typename T> T infnorm(int n, std::complex<T> *a) {
9496
T nrm = 0.0;
@@ -107,8 +109,8 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {
107109
*/
108110

109111
template<typename T>
110-
static __forceinline__ __device__ void atomicAddComplexShared(
111-
cuda_complex<T> *address, cuda_complex<T> res) {
112+
static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex<T> *address,
113+
cuda_complex<T> res) {
112114
const auto raw_address = reinterpret_cast<T *>(address);
113115
atomicAdd(raw_address, res.x);
114116
atomicAdd(raw_address + 1, res.y);
@@ -120,8 +122,8 @@ static __forceinline__ __device__ void atomicAddComplexShared(
120122
* on shared memory are supported so we leverage them
121123
*/
122124
template<typename T>
123-
static __forceinline__ __device__ void atomicAddComplexGlobal(
124-
cuda_complex<T> *address, cuda_complex<T> res) {
125+
static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex<T> *address,
126+
cuda_complex<T> res) {
125127
if constexpr (
126128
std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
127129
atomicAdd(address, res);

src/cuda/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp)
1+
set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp)
22

33
set(PRECISION_DEPENDENT_SRC
44
spreadinterp.cpp

src/cuda/common.cu

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414
#include <cufinufft/spreadinterp.h>
1515
#include <cufinufft/utils.h>
1616

17-
#include <legendre_rule_fast.h>
18-
1917
namespace cufinufft {
2018
namespace common {
2119
using namespace cufinufft::spreadinterp;
@@ -205,10 +203,9 @@ void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *phase,
205203
const auto q = (int)(2 + 3.0 * J2); // matches CPU code
206204
double z[2 * MAX_NQUAD];
207205
double w[2 * MAX_NQUAD];
208-
finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used,
209-
// eg on (0,1)
210-
for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n
211-
z[n] *= J2; // rescale nodes
206+
cufinufft::utils::gaussquad(2 * q, z, w); // only half the nodes used, for (0,1)
207+
for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n
208+
z[n] *= J2; // rescale nodes
212209
f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei
213210
phase[n] = T(2.0 * M_PI * z[n] / T(nf)); // phase winding rates
214211
}
@@ -222,9 +219,7 @@ void onedim_nuft_kernel_precomp(T *f, T *z, finufft_spread_opts opts) {
222219
int q = (int)(2 + 2.0 * J2); // matches CPU code
223220
double z_local[2 * MAX_NQUAD];
224221
double w_local[2 * MAX_NQUAD];
225-
finufft::quadrature::legendre_compute_glr(2 * q, z_local, w_local); // only half the
226-
// nodes used, eg on
227-
// (0,1)
222+
cufinufft::utils::gaussquad(2 * q, z_local, w_local); // half the nodes, (0,1)
228223
for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n
229224
z[n] = J2 * T(z_local[n]); // rescale nodes
230225
f[n] = J2 * w_local[n] * evaluate_kernel(z[n], opts); // vals & quadr wei

src/cuda/utils.cpp

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22

33
namespace cufinufft {
44
namespace utils {
5+
56
CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b)
67
// finds even integer not less than n, with prime factors no larger than 5
78
// (ie, "smooth") and is a multiple of b (b is a number that the only prime
89
// factors are 2,3,5). Adapted from fortran in hellskitchen. Barnett 2/9/17
910
// changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
10-
// added condition about b Melody 05/31/20
11+
// added condition about b, Melody Shih 05/31/20
1112
{
1213
if (n <= 2) return 2;
1314
if (n % 2 == 1) n += 1; // even
@@ -23,5 +24,61 @@ CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b)
2324
return nplus;
2425
}
2526

27+
void gaussquad(int n, double *xgl, double *wgl) {
28+
// copied from FINUFFT/src/finufft_utils.cpp; see that for explanation.
29+
30+
double x = 0, dx = 0;
31+
int convcount = 0;
32+
33+
// Get Gauss-Legendre nodes
34+
xgl[n / 2] = 0; // If odd number of nodes, middle node is 0
35+
for (int i = 0; i < n / 2; i++) { // Loop through nodes
36+
convcount = 0;
37+
x = cos((2 * i + 1) * PI / (2 * n)); // Initial guess: Chebyshev node
38+
while (true) { // Newton iteration
39+
auto [p, dp] = leg_eval(n, x);
40+
dx = -p / dp;
41+
x += dx; // Newton step
42+
if (std::abs(dx) < 1e-14) {
43+
convcount++;
44+
}
45+
if (convcount == 3) {
46+
break;
47+
} // If convergence tol hit 3 times, stop
48+
}
49+
xgl[i] = -x;
50+
xgl[n - i - 1] = x; // Symmetric nodes
51+
}
52+
53+
// Get Gauss-Legendre weights from formula
54+
// w_i = -2 / ((n+1)*P_n'(x_i)*P_{n+1}(x_i)) (Atkinson '89, pg. 276)
55+
for (int i = 0; i < n / 2 + 1; i++) {
56+
auto [junk1, dp] = leg_eval(n, xgl[i]);
57+
auto [p, junk2] = leg_eval(n + 1, xgl[i]); // This is a bit inefficient, but who
58+
// cares...
59+
wgl[i] = -2 / ((n + 1) * dp * p);
60+
wgl[n - i - 1] = wgl[i];
61+
}
62+
}
63+
64+
std::tuple<double, double> leg_eval(int n, double x) {
65+
// copied from FINUFFT/src/finufft_utils.cpp; see that for explanation.
66+
67+
if (n == 0) {
68+
return {1.0, 0.0};
69+
}
70+
if (n == 1) {
71+
return {x, 1.0};
72+
}
73+
// Three-term recurrence and formula for derivative
74+
double p0 = 0.0, p1 = 1.0, p2 = x;
75+
for (int i = 1; i < n; i++) {
76+
p0 = p1;
77+
p1 = p2;
78+
p2 = ((2 * i + 1) * x * p1 - i * p0) / (i + 1);
79+
}
80+
return {p2, n * (x * p2 - p1) / (x * x - 1)};
81+
}
82+
2683
} // namespace utils
2784
} // namespace cufinufft

0 commit comments

Comments
 (0)