From 02356c67578493255b54ab8176fd03022cd95efa Mon Sep 17 00:00:00 2001 From: Sofie Martins Date: Sat, 9 Nov 2024 10:44:37 +0200 Subject: [PATCH 1/4] Force inline functions for exponential clover calculations --- Include/Utils/clover_exp.h | 481 ++++++++++++++++++++++++- Include/Utils/factorial.h | 109 +++++- LibHR/Update/force_fermion_core_gpu.cu | 6 +- LibHR/Utils/clover_exp.c | 3 + LibHR/Utils/factorial.c | 124 ------- 5 files changed, 581 insertions(+), 142 deletions(-) delete mode 100644 LibHR/Utils/factorial.c diff --git a/Include/Utils/clover_exp.h b/Include/Utils/clover_exp.h index 810230819..1bd8db367 100644 --- a/Include/Utils/clover_exp.h +++ b/Include/Utils/clover_exp.h @@ -4,29 +4,482 @@ #ifdef WITH_EXPCLOVER #include "libhr_core.h" +#include "Utils/factorial.h" #ifdef __cplusplus extern "C" { #endif -visible void _su2Nfc_times_su2Nfc_herm(suNfc *C, suNfc *B, suNfc *A); -visible void _su2Nfc_times_su2Nf(suNfc *C, suNfc *B, suNfc *A); -visible void _su2Nfc_times_su2Nfc_assign(suNfc *C, suNfc *B, suNfc *A); -visible void _su2Nfc_times_su2Nfc_assign_herm(suNfc *C, suNfc *B, suNfc *A); -visible void _su2Nfc_times_su2Nfc_trace(hr_complex *trace, suNfc *B, suNfc *A); -visible void _su2Nfc_times_su2Nfc_trace_herm_sq(hr_complex *trace, suNfc *B); -visible void _su2Nfc_unit(suNfc *A); -visible void _su2Nfc_trace(hr_complex *p, suNfc *A); -visible void factorialCoef(double *C, int NNexp); - -visible void clover_exp(suNfc *Aplus, suNfc *expAplus, int NN); -visible void clover_exp_taylor(suNfc *Aplus, suNfc *expAplus); -visible void doublehorner(double *C, suNfc *A, int NNexp); - void evaluate_sw_order(double *mass); int get_NNexp(); int get_NN(); +//C = B*A when C hermitian! +visible static __forceinline__ void _su2Nfc_times_su2Nfc_herm(suNfc *C, suNfc *B, suNfc *A) { + // new zero component + _suNfc_times_suNfc(C[0], B[0], A[0]); + _suNfc_times_suNfc_assign(C[0], B[1], A[2]); + + // new one component + _suNfc_times_suNfc(C[1], B[0], A[1]); + _suNfc_times_suNfc_assign(C[1], B[1], A[3]); + + _suNfc_dagger(C[2], C[1]); + + // new three component + _suNfc_times_suNfc(C[3], B[2], A[1]); + _suNfc_times_suNfc_assign(C[3], B[3], A[3]); +} + +//C = B*A +visible static __forceinline__ void _su2Nfc_times_su2Nfc(suNfc *C, suNfc *B, suNfc *A) { + // new zero component + _suNfc_times_suNfc(C[0], B[0], A[0]); + _suNfc_times_suNfc_assign(C[0], B[1], A[2]); + + // new one component + _suNfc_times_suNfc(C[1], B[0], A[1]); + _suNfc_times_suNfc_assign(C[1], B[1], A[3]); + + // new two component + _suNfc_times_suNfc(C[2], B[2], A[0]); + _suNfc_times_suNfc_assign(C[2], B[3], A[2]); + + // new three component + _suNfc_times_suNfc(C[3], B[2], A[1]); + _suNfc_times_suNfc_assign(C[3], B[3], A[3]); +} + +// C += A*B +visible static __forceinline__ void _su2Nfc_times_su2Nfc_assign(suNfc *C, suNfc *B, suNfc *A) { + // new zero component + _suNfc_times_suNfc_assign(C[0], B[0], A[0]); + _suNfc_times_suNfc_assign(C[0], B[1], A[2]); + + // new one component + _suNfc_times_suNfc_assign(C[1], B[0], A[1]); + _suNfc_times_suNfc_assign(C[1], B[1], A[3]); + + // new two component + _suNfc_times_suNfc_assign(C[2], B[2], A[0]); + _suNfc_times_suNfc_assign(C[2], B[3], A[2]); + + // new three component + _suNfc_times_suNfc_assign(C[3], B[2], A[1]); + _suNfc_times_suNfc_assign(C[3], B[3], A[3]); +} + +//C += B*A when C hermitian! +visible static __forceinline__ void _su2Nfc_times_su2Nfc_assign_herm(suNfc *C, suNfc *B, suNfc *A) { + // new zero component + _suNfc_times_suNfc_assign(C[0], B[0], A[0]); + _suNfc_times_suNfc_assign(C[0], B[1], A[2]); + + // new one component + _suNfc_times_suNfc_assign(C[1], B[0], A[1]); + _suNfc_times_suNfc_assign(C[1], B[1], A[3]); + + _suNfc_dagger(C[2], C[1]); + + // new three component + _suNfc_times_suNfc_assign(C[3], B[2], A[1]); + _suNfc_times_suNfc_assign(C[3], B[3], A[3]); +} + +//trace of B*A +visible static __forceinline__ void _su2Nfc_times_su2Nfc_trace(hr_complex *trace, suNfc *B, suNfc *A) { + suNfc aux; + + _suNfc_times_suNfc(aux, B[0], A[0]); + _suNfc_times_suNfc_assign(aux, B[1], A[2]); + _suNfc_times_suNfc_assign(aux, B[2], A[1]); + _suNfc_times_suNfc_assign(aux, B[3], A[3]); + + _suNfc_trace(*trace, aux); +} + +//trace of the square of a 2NF hermitian matrix +visible static __forceinline__ void _su2Nfc_times_su2Nfc_trace_herm_sq(hr_complex *trace, suNfc *B) { + suNfc aux; + hr_complex auxtrace; + + _suNfc_times_suNfc(aux, B[1], B[2]); + _suNfc_trace(auxtrace, aux); + _suNfc_times_suNfc(aux, B[0], B[0]); + _suNfc_times_suNfc_assign(aux, B[3], B[3]); + + _suNfc_trace(*trace, aux); + + *trace = *trace + 2 * auxtrace; +} + +visible static __forceinline__ void _su2Nfc_unit(suNfc *A) { + _suNfc_unit(A[0]); + _suNfc_unit(A[3]); + _suNfc_zero(A[1]); + _suNfc_zero(A[2]); +} + +visible static __forceinline__ void _su2Nfc_trace(hr_complex *p, suNfc *A) { + hr_complex aux = 0.; + _suNfc_trace(aux, A[0]); + _suNfc_trace(*p, A[3]); + *p = *p + aux; +} + +#if (NF == 3) + +visible static __forceinline__ void clover_exp_NF3(suNfc *Aplus, suNfc *expAplus, int NN) { + suNfc A0[3], A2[4], A3[4], tmp1[4], tmp2[3]; + + int i = 0, j = 0; + hr_complex p[2 * NF - 1]; + _su2Nfc_times_su2Nfc_herm(A2, Aplus, Aplus); + _su2Nfc_times_su2Nfc_herm(A3, A2, Aplus); + _suNfc_unit(A0[0]); + _suNfc_unit(A0[2]); + _suNfc_zero(A0[1]); + + _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A3); + _su2Nfc_times_su2Nfc_trace(&p[1], A3, A2); + _su2Nfc_times_su2Nfc_trace_herm_sq(&p[2], A2); + _su2Nfc_trace(&p[3], A3); + _su2Nfc_trace(&p[4], A2); + + /* + p[0] = -p[0]/6 + p[4]*p[2]/8 + p[3]*p[3]/18 -p[4]*p[4]*p[4]/48; + p[1] = -p[1]/5 + p[4]*p[3]/6; + p[2] = -p[2]/4 + p[4]*p[4]/8; + p[3] = -p[3]/3; + p[4] = -p[4]/2; + */ + + p[4] = -p[4] / 2; + p[3] = -p[3] / 3; + p[2] = -p[2] / 4; + + p[0] = p[3] * p[3] / 2 + p[4] * p[2] + (p[4] * p[4] * p[4] - p[0]) / 6; + p[1] = -p[1] / 5 + p[4] * p[3]; + + p[2] += p[4] * p[4] / 2; + + double q[2 * NF]; + for (i = 0; i < 2 * NF; i++) { + q[i] = 0.; + } + + double qlast; + q[0] = inverse_fact(NN); + + for (i = NN - 1; i >= 0; i--) { + qlast = q[2 * NF - 1]; + q[2 * NF - 1] = q[2 * NF - 2]; + for (j = 2 * NF - 2; j > 0; j--) { + q[j] = q[j - 1] - creal(p[j]) * qlast; + } + q[0] = inverse_fact(i) - creal(p[0]) * qlast; + } + + //Optimized to reduce operations! + + _suNfc_mul_add(expAplus[0], q[0], A0[0], q[1], Aplus[0]); + _suNfc_mul(tmp1[0], q[2], A2[0]); + _suNfc_add_assign(expAplus[0], tmp1[0]); + + _suNfc_mul_add(tmp1[0], q[3], A0[0], q[4], Aplus[0]); + _suNfc_mul(tmp2[0], q[5], A2[0]); + _suNfc_add_assign(tmp1[0], tmp2[0]); + + _suNfc_mul_add(expAplus[1], q[0], A0[1], q[1], Aplus[1]); + _suNfc_mul(tmp1[1], q[2], A2[1]); + _suNfc_add_assign(expAplus[1], tmp1[1]); + + _suNfc_mul_add(tmp1[1], q[3], A0[1], q[4], Aplus[1]); + _suNfc_mul(tmp2[1], q[5], A2[1]); + _suNfc_add_assign(tmp1[1], tmp2[1]); + + _suNfc_dagger(tmp1[2], tmp1[1]); + + _suNfc_mul_add(expAplus[3], q[0], A0[2], q[1], Aplus[3]); + _suNfc_mul(tmp1[3], q[2], A2[3]); + _suNfc_add_assign(expAplus[3], tmp1[3]); + + _suNfc_mul_add(tmp1[3], q[3], A0[2], q[4], Aplus[3]); + _suNfc_mul(tmp2[2], q[5], A2[3]); + _suNfc_add_assign(tmp1[3], tmp2[2]); + + _su2Nfc_times_su2Nfc_assign_herm(expAplus, A3, tmp1); +} +#endif + +#if (NF == 2) + +visible static __forceinline__ void clover_exp_NF2(suNfc *Aplus, suNfc *expAplus, int lNN) { + suNfc A0[3], A2[4], tmp1[4]; + + int i = 0, j = 0; + hr_complex p[2 * NF - 1]; + _su2Nfc_times_su2Nfc_herm(A2, Aplus, Aplus); + _suNfc_unit(A0[0]); + _suNfc_unit(A0[2]); + _suNfc_zero(A0[1]); + + _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A2); + _su2Nfc_times_su2Nfc_trace(&p[1], Aplus, A2); + _su2Nfc_trace(&p[2], A2); + + p[0] = -p[0] / 4 + p[2] * p[2] / 8; + p[1] = -p[1] / 3; + p[2] = -p[2] / 2; + + double q[2 * NF]; + for (i = 0; i < 2 * NF; i++) { + q[i] = 0.; + } + double qlast; + q[0] = inverse_fact(lNN); + for (i = lNN - 1; i >= 0; i--) { + qlast = q[2 * NF - 1]; + q[2 * NF - 1] = q[2 * NF - 2]; + for (j = 2 * NF - 2; j > 0; j--) { + q[j] = q[j - 1] - creal(p[j]) * qlast; + } + q[0] = inverse_fact(i) - creal(p[0]) * qlast; + } + + //Optimized to reduce operations! + + _suNfc_mul_add(expAplus[0], q[0], A0[0], q[1], Aplus[0]); + _suNfc_mul_add(tmp1[0], q[2], A0[0], q[3], Aplus[0]); + + _suNfc_mul_add(expAplus[1], q[0], A0[1], q[1], Aplus[1]); + _suNfc_mul_add(tmp1[1], q[2], A0[1], q[3], Aplus[1]); + + _suNfc_dagger(tmp1[2], tmp1[1]); + + _suNfc_mul_add(expAplus[3], q[0], A0[2], q[1], Aplus[3]); + _suNfc_mul_add(tmp1[3], q[2], A0[2], q[3], Aplus[3]); + + _su2Nfc_times_su2Nfc_assign_herm(expAplus, A2, tmp1); +} + +#endif + +visible static __forceinline__ void clover_exp_taylor(suNfc *Xin, suNfc *u) { + suNfc Xk[4], tmp[4]; + _su2Nfc_unit(u); + _su2Nfc_unit(Xk); + + int k = 1; + int i = 0; + double error = 0., erroraux; + while (1) { + _su2Nfc_times_su2Nfc(tmp, Xk, Xin); + + for (i = 0; i < 4; i++) { + _suNfc_mul(Xk[i], 1. / k, tmp[i]); + _suNfc_add_assign(u[i], Xk[i]); + } + + k++; + + error = 0.; + + for (i = 0; i < 4; i++) { + _suNfc_sqnorm(erroraux, Xk[i]); + error += erroraux; + } + + if (sqrt(error) < 1e-28) { break; } + } +} + +visible __forceinline__ void clover_exp(suNfc *Aplus, suNfc *expAplus, int lNN) { +#if (NF == 2) + clover_exp_NF2(Aplus, expAplus, lNN); +#elif (NF == 3) + clover_exp_NF3(Aplus, expAplus, lNN); +#else + clover_exp_taylor(Aplus, expAplus); +#endif +} + +#if (NF == 3) + +visible static __forceinline__ void doublehornerNF3(double *C, suNfc *A, int lNNexp) { + suNfc A2[4], A3[4]; + hr_complex p[2 * NF - 1]; + + _su2Nfc_times_su2Nfc_herm(A2, A, A); + _su2Nfc_times_su2Nfc_herm(A3, A2, A); + + _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A3); + _su2Nfc_times_su2Nfc_trace(&p[1], A3, A2); + _su2Nfc_times_su2Nfc_trace_herm_sq(&p[2], A2); + _su2Nfc_trace(&p[3], A3); + _su2Nfc_trace(&p[4], A2); + + _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A3); + _su2Nfc_times_su2Nfc_trace(&p[1], A3, A2); + _su2Nfc_times_su2Nfc_trace_herm_sq(&p[2], A2); + _su2Nfc_trace(&p[3], A3); + _su2Nfc_trace(&p[4], A2); + + p[0] = -p[0] / 6 + p[4] * p[2] / 8 + p[3] * p[3] / 18 - p[4] * p[4] * p[4] / 48; + p[1] = -p[1] / 5 + p[4] * p[3] / 6; + p[2] = -p[2] / 4 + p[4] * p[4] / 8; + p[3] = -p[3] / 3; + p[4] = -p[4] / 2; + + int i, j, k; + double q[2 * NF], qlast; + + double **q2 = (double **)malloc((lNNexp + 1) * sizeof(double *)); + for (int l = 0; l < lNNexp + 1; ++l) { + q2[l] = (double *)malloc((2 * NF) * sizeof(double)); + } + + for (j = 0; j <= lNNexp; j++) { + q[0] = inverse_fact(lNNexp + 2); + for (k = 1; k < 2 * NF; k++) { + q[k] = 0.; + } + + for (i = lNNexp - j; i > -1; i--) { + qlast = q[2 * NF - 1]; + q[2 * NF - 1] = q[2 * NF - 2]; + for (k = 2 * NF - 2; k > 0; k--) { + q[k] = q[k - 1] - creal(p[k]) * qlast; + } + q[0] = -creal(p[0]) * qlast + inverse_fact(i + j + 1); + } + + for (i = 0; i < 2 * NF; i++) { + q2[j][i] = q[i]; + } + } + + for (i = 0; i < 2 * NF; i++) { + q[0] = q2[lNNexp][i]; + for (k = 1; k < 2 * NF; k++) { + q[k] = 0.; + } + + for (j = lNNexp - 1; j > -1; j--) { + qlast = q[2 * NF - 1]; + q[2 * NF - 1] = q[2 * NF - 2]; + for (k = 2 * NF - 2; k > 0; k--) { + q[k] = q[k - 1] - creal(p[k]) * qlast; + } + q[0] = -creal(p[0]) * qlast + q2[j][i]; + } + + for (j = 0; j < 2 * NF; j++) { + // INDX = 2*NF*j + i + C[2 * NF * j + i] = q[j]; + } + } + + for (int l = 0; l < lNNexp + 1; ++l) { + free(q2[l]); + } + free(q2); +} + +#endif + +#if (NF == 2) + +visible static __forceinline__ void doublehornerNF2(double *C, suNfc *A, int lNNexp) { + suNfc A2[4]; + hr_complex p[2 * NF - 1]; + _su2Nfc_times_su2Nfc_herm(A2, A, A); + + _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A2); + _su2Nfc_times_su2Nfc_trace(&p[1], A, A2); + _su2Nfc_trace(&p[2], A2); + + p[0] = -p[0] / 4 + p[2] * p[2] / 8; + p[1] = -p[1] / 3; + p[2] = -p[2] / 2; + + int i, j, k; + double q[2 * NF], qlast; + + double *q2 = (double *)malloc((lNNexp + 1) * 2 * NF * sizeof(double *)); + + for (j = 0; j <= lNNexp; j++) { + q[0] = inverse_fact(lNNexp + 2); + for (k = 1; k < 2 * NF; k++) { + q[k] = 0.; + } + + for (i = lNNexp - j; i > -1; i--) { + qlast = q[2 * NF - 1]; + q[2 * NF - 1] = q[2 * NF - 2]; + for (k = 2 * NF - 2; k > 0; k--) { + q[k] = q[k - 1] - creal(p[k]) * qlast; + } + q[0] = -creal(p[0]) * qlast + inverse_fact(i + j + 1); + } + + for (i = 0; i < 2 * NF; i++) { + q2[2 * NF * j + i] = q[i]; + } + } + + for (i = 0; i < 2 * NF; i++) { + q[0] = q2[2 * NF * lNNexp + i]; + for (k = 1; k < 2 * NF; k++) { + q[k] = 0.; + } + + for (j = lNNexp - 1; j > -1; j--) { + qlast = q[2 * NF - 1]; + q[2 * NF - 1] = q[2 * NF - 2]; + for (k = 2 * NF - 2; k > 0; k--) { + q[k] = q[k - 1] - creal(p[k]) * qlast; + } + q[0] = -creal(p[0]) * qlast + q2[2 * NF * j + i]; + } + + for (j = 0; j < 2 * NF; j++) { + // INDX = 2*NF*j + i + C[2 * NF * j + i] = q[j]; + } + } + + free(q2); +} +#endif + +visible __forceinline__ void doublehorner(double *C, suNfc *A, int lNNexp) { +#if (NF == 3) + doublehornerNF3(C, A, lNNexp); +#elif (NF == 2) + doublehornerNF2(C, A, lNNexp); +#else + // TODO: this does not work because error is not a host + // device function. There is now a compile time + // check that forbids NF > 3 to compile WITH_EXPCLOVER + //error(0, 1, "doublehorner " __FILE__, "Force only implemented for NF=2 and NF=3"); +#endif +} + +visible __forceinline__ void factorialCoef(double *C, int lNNexp) { + int i, j; + + for (j = 0; j < lNNexp; j++) { + for (i = 0; i < lNNexp; i++) { + if (i + j <= lNNexp) { + C[(lNNexp)*i + j] = inverse_fact(i + j + 1); + } else { + C[(lNNexp)*i + j] = 0.; + } + } + } +} + #ifdef __cplusplus } #endif diff --git a/Include/Utils/factorial.h b/Include/Utils/factorial.h index 65493945a..ab9509250 100644 --- a/Include/Utils/factorial.h +++ b/Include/Utils/factorial.h @@ -14,7 +14,114 @@ extern "C" { #endif -visible double inverse_fact(int); +#if MAX_FACTORIAL > 100 +#error "MAX_FACTORIAL cannot be larger than 100. There is probably no reason, to calculate inverse factorials this high." +#endif + +visible __forceinline__ double inverse_fact(int i) { + static const double inv_fact[] = { 1.0, + 1.0, + 0.5, + 0.16666666666666666, + 0.041666666666666664, + 0.008333333333333333, + 0.001388888888888889, + 0.0001984126984126984, + 2.48015873015873e-05, + 2.7557319223985893e-06, + 2.755731922398589e-07, + 2.505210838544172e-08, + 2.08767569878681e-09, + 1.6059043836821613e-10, + 1.1470745597729725e-11, + 7.647163731819816e-13, + 4.779477332387385e-14, + 2.8114572543455206e-15, + 1.5619206968586225e-16, + 8.22063524662433e-18, + 4.110317623312165e-19, + 1.9572941063391263e-20, + 8.896791392450574e-22, + 3.8681701706306835e-23, + 1.6117375710961184e-24, + 6.446950284384474e-26, + 2.4795962632247972e-27, + 9.183689863795546e-29, + 3.279889237069838e-30, + 1.1309962886447718e-31, + 3.7699876288159054e-33, + 1.2161250415535181e-34, + 3.800390754854744e-36, + 1.151633562077195e-37, + 3.387157535521162e-39, + 9.67759295863189e-41, + 2.688220266286636e-42, + 7.265460179153071e-44, + 1.911963205040282e-45, + 4.902469756513544e-47, + 1.225617439128386e-48, + 2.9893108271424046e-50, + 7.117406731291439e-52, + 1.6552108677421951e-53, + 3.7618428812322616e-55, + 8.359650847182804e-57, + 1.817315401561479e-58, + 3.866628513960594e-60, + 8.055476070751238e-62, + 1.643974708316579e-63, + 3.2879494166331584e-65, + 6.446959640457174e-67, + 1.2397999308571486e-68, + 2.3392451525606576e-70, + 4.331935467704922e-72, + 7.876246304918039e-74, + 1.4064725544496498e-75, + 2.4674957095607893e-77, + 4.254302947518602e-79, + 7.210682961895936e-81, + 1.2017804936493227e-82, + 1.9701319568021682e-84, + 3.177632188390594e-86, + 5.043860616493007e-88, + 7.881032213270323e-90, + 1.2124664943492803e-91, + 1.8370704459837581e-93, + 2.74189618803546e-95, + 4.0322002765227353e-97, + 5.843768516699617e-99, + 8.348240738142308e-101, + 1.1758085546679308e-102, + 1.633067437038793e-104, + 2.2370786808750587e-106, + 3.023079298479809e-108, + 4.030772397973078e-110, + 5.30364789206984e-112, + 6.887854405285506e-114, + 8.830582570878855e-116, + 1.117795262136564e-117, + 1.397244077670705e-119, + 1.724992688482352e-121, + 2.103649620100429e-123, + 2.53451761457883e-125, + 3.0172828744986073e-127, + 3.549744558233656e-129, + 4.127609951434483e-131, + 4.7443792545223946e-133, + 5.3913400619572666e-135, + 6.057685462873333e-137, + 6.730761625414815e-139, + 7.396441346609687e-141, + 8.039610159358354e-143, + 8.64474210683694e-145, + 9.196534156209511e-147, + 9.680562269694223e-149, + 1.0083919030931482e-150, + 1.039579281539328e-152, + 1.0607951852442122e-154, + 1.0715102881254669e-156, + 1.071510288125467e-158 }; + return inv_fact[i]; +} #ifdef __cplusplus } diff --git a/LibHR/Update/force_fermion_core_gpu.cu b/LibHR/Update/force_fermion_core_gpu.cu index 106bc0001..5883fede5 100644 --- a/LibHR/Update/force_fermion_core_gpu.cu +++ b/LibHR/Update/force_fermion_core_gpu.cu @@ -85,7 +85,7 @@ static suNg_av_field *force_sum = NULL; #if defined(WITH_CLOVER) || defined(WITH_EXPCLOVER) -__device__ static void g5_sigma(suNf_spinor *s, suNf_spinor *u, int mu, int nu) { +__device__ static __forceinline__ void g5_sigma(suNf_spinor *s, suNf_spinor *u, int mu, int nu) { if (mu == 0 && nu == 1) { for (int i = 0; i < NF; i++) { s->c[0].c[i] = -I * u->c[1].c[i]; @@ -173,7 +173,7 @@ __device__ static void g5_sigma(suNf_spinor *s, suNf_spinor *u, int mu, int nu) } } -__device__ static suNf fmat_create(suNf_spinor *a_lhs, suNf_spinor *a_rhs, suNf_spinor *b_lhs, suNf_spinor *b_rhs) { +__device__ static __forceinline__ suNf fmat_create(suNf_spinor *a_lhs, suNf_spinor *a_rhs, suNf_spinor *b_lhs, suNf_spinor *b_rhs) { suNf fmat; _suNf_zero(fmat); for (int i = 0; i < NF; i++) { @@ -425,7 +425,7 @@ __global__ static void _force_clover_fermion(suNf *cl_force, suNf_spinor *Xs, su #ifdef WITH_EXPCLOVER -visible static void A_times_spinor(suNf_spinor *out, suNfc *Aplus, suNfc *Aminus, suNf_spinor *in) { +visible static __forceinline__ void A_times_spinor(suNf_spinor *out, suNfc *Aplus, suNfc *Aminus, suNf_spinor *in) { suNf_vector aux; // Comp 0 1 diff --git a/LibHR/Utils/clover_exp.c b/LibHR/Utils/clover_exp.c index db26f2db5..1307b51bb 100644 --- a/LibHR/Utils/clover_exp.c +++ b/LibHR/Utils/clover_exp.c @@ -48,6 +48,7 @@ void evaluate_sw_order(double *mass) { error(0 == 0, 1, "set_sw_order" __FILE__, "SW parameters are out of range"); } } +<<<<<<< HEAD //C = B*A when C hermitian! visible void _su2Nfc_times_su2Nfc_herm(suNfc *C, suNfc *B, suNfc *A) { // new zero component @@ -503,5 +504,7 @@ visible void factorialCoef(double *C, int lNNexp) { } } } +======= +>>>>>>> 6bb9b21d (Force inline functions for exponential clover calculations) #endif diff --git a/LibHR/Utils/factorial.c b/LibHR/Utils/factorial.c deleted file mode 100644 index 50214756e..000000000 --- a/LibHR/Utils/factorial.c +++ /dev/null @@ -1,124 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2023, Sofie Martins * - * All rights reserved. * - ***************************************************************************/ - -#include "libhr_core.h" -#include "utils.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if MAX_FACTORIAL > 100 -#error "MAX_FACTORIAL cannot be larger than 100. There is probably no reason, to calculate inverse factorials this high." -#endif - -visible double inverse_fact(int i) { - static const double inv_fact[] = { 1.0, - 1.0, - 0.5, - 0.16666666666666666, - 0.041666666666666664, - 0.008333333333333333, - 0.001388888888888889, - 0.0001984126984126984, - 2.48015873015873e-05, - 2.7557319223985893e-06, - 2.755731922398589e-07, - 2.505210838544172e-08, - 2.08767569878681e-09, - 1.6059043836821613e-10, - 1.1470745597729725e-11, - 7.647163731819816e-13, - 4.779477332387385e-14, - 2.8114572543455206e-15, - 1.5619206968586225e-16, - 8.22063524662433e-18, - 4.110317623312165e-19, - 1.9572941063391263e-20, - 8.896791392450574e-22, - 3.8681701706306835e-23, - 1.6117375710961184e-24, - 6.446950284384474e-26, - 2.4795962632247972e-27, - 9.183689863795546e-29, - 3.279889237069838e-30, - 1.1309962886447718e-31, - 3.7699876288159054e-33, - 1.2161250415535181e-34, - 3.800390754854744e-36, - 1.151633562077195e-37, - 3.387157535521162e-39, - 9.67759295863189e-41, - 2.688220266286636e-42, - 7.265460179153071e-44, - 1.911963205040282e-45, - 4.902469756513544e-47, - 1.225617439128386e-48, - 2.9893108271424046e-50, - 7.117406731291439e-52, - 1.6552108677421951e-53, - 3.7618428812322616e-55, - 8.359650847182804e-57, - 1.817315401561479e-58, - 3.866628513960594e-60, - 8.055476070751238e-62, - 1.643974708316579e-63, - 3.2879494166331584e-65, - 6.446959640457174e-67, - 1.2397999308571486e-68, - 2.3392451525606576e-70, - 4.331935467704922e-72, - 7.876246304918039e-74, - 1.4064725544496498e-75, - 2.4674957095607893e-77, - 4.254302947518602e-79, - 7.210682961895936e-81, - 1.2017804936493227e-82, - 1.9701319568021682e-84, - 3.177632188390594e-86, - 5.043860616493007e-88, - 7.881032213270323e-90, - 1.2124664943492803e-91, - 1.8370704459837581e-93, - 2.74189618803546e-95, - 4.0322002765227353e-97, - 5.843768516699617e-99, - 8.348240738142308e-101, - 1.1758085546679308e-102, - 1.633067437038793e-104, - 2.2370786808750587e-106, - 3.023079298479809e-108, - 4.030772397973078e-110, - 5.30364789206984e-112, - 6.887854405285506e-114, - 8.830582570878855e-116, - 1.117795262136564e-117, - 1.397244077670705e-119, - 1.724992688482352e-121, - 2.103649620100429e-123, - 2.53451761457883e-125, - 3.0172828744986073e-127, - 3.549744558233656e-129, - 4.127609951434483e-131, - 4.7443792545223946e-133, - 5.3913400619572666e-135, - 6.057685462873333e-137, - 6.730761625414815e-139, - 7.396441346609687e-141, - 8.039610159358354e-143, - 8.64474210683694e-145, - 9.196534156209511e-147, - 9.680562269694223e-149, - 1.0083919030931482e-150, - 1.039579281539328e-152, - 1.0607951852442122e-154, - 1.0715102881254669e-156, - 1.071510288125467e-158 }; - return inv_fact[i]; -} - -#ifdef __cplusplus -} -#endif From 500a7f2d6ca1d1daf4aad43296e53788a3102879 Mon Sep 17 00:00:00 2001 From: Sofie Martins Date: Sat, 9 Nov 2024 11:01:09 +0200 Subject: [PATCH 2/4] Added factorial.c file again --- LibHR/Utils/factorial.c | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 LibHR/Utils/factorial.c diff --git a/LibHR/Utils/factorial.c b/LibHR/Utils/factorial.c new file mode 100644 index 000000000..e69de29bb From e6c1bfe69d85b59e635f0f9ef0fb4fd27f814824 Mon Sep 17 00:00:00 2001 From: Sofie Martins Date: Sun, 10 Nov 2024 15:33:40 +0100 Subject: [PATCH 3/4] Removed issue from rebasing --- Include/Core/gpu.h | 6 + Include/Utils/clover_exp.h | 32 +-- Include/Utils/factorial.h | 2 +- LibHR/Utils/clover_exp.c | 458 ------------------------------------- 4 files changed, 23 insertions(+), 475 deletions(-) diff --git a/Include/Core/gpu.h b/Include/Core/gpu.h index d7b6d30e5..840ae01b1 100644 --- a/Include/Core/gpu.h +++ b/Include/Core/gpu.h @@ -14,6 +14,12 @@ #define deviceonly #endif +#ifdef WITH_GPU +#define forceinline __forceinline__ +#else +#define forceinline __attribute__((always_inline)) inline +#endif + #ifdef WITH_GPU #include diff --git a/Include/Utils/clover_exp.h b/Include/Utils/clover_exp.h index 1bd8db367..c23ae17b7 100644 --- a/Include/Utils/clover_exp.h +++ b/Include/Utils/clover_exp.h @@ -15,7 +15,7 @@ int get_NNexp(); int get_NN(); //C = B*A when C hermitian! -visible static __forceinline__ void _su2Nfc_times_su2Nfc_herm(suNfc *C, suNfc *B, suNfc *A) { +visible void forceinline _su2Nfc_times_su2Nfc_herm(suNfc *C, suNfc *B, suNfc *A) { // new zero component _suNfc_times_suNfc(C[0], B[0], A[0]); _suNfc_times_suNfc_assign(C[0], B[1], A[2]); @@ -32,7 +32,7 @@ visible static __forceinline__ void _su2Nfc_times_su2Nfc_herm(suNfc *C, suNfc *B } //C = B*A -visible static __forceinline__ void _su2Nfc_times_su2Nfc(suNfc *C, suNfc *B, suNfc *A) { +visible void forceinline _su2Nfc_times_su2Nfc(suNfc *C, suNfc *B, suNfc *A) { // new zero component _suNfc_times_suNfc(C[0], B[0], A[0]); _suNfc_times_suNfc_assign(C[0], B[1], A[2]); @@ -51,7 +51,7 @@ visible static __forceinline__ void _su2Nfc_times_su2Nfc(suNfc *C, suNfc *B, suN } // C += A*B -visible static __forceinline__ void _su2Nfc_times_su2Nfc_assign(suNfc *C, suNfc *B, suNfc *A) { +visible void forceinline _su2Nfc_times_su2Nfc_assign(suNfc *C, suNfc *B, suNfc *A) { // new zero component _suNfc_times_suNfc_assign(C[0], B[0], A[0]); _suNfc_times_suNfc_assign(C[0], B[1], A[2]); @@ -70,7 +70,7 @@ visible static __forceinline__ void _su2Nfc_times_su2Nfc_assign(suNfc *C, suNfc } //C += B*A when C hermitian! -visible static __forceinline__ void _su2Nfc_times_su2Nfc_assign_herm(suNfc *C, suNfc *B, suNfc *A) { +visible void forceinline _su2Nfc_times_su2Nfc_assign_herm(suNfc *C, suNfc *B, suNfc *A) { // new zero component _suNfc_times_suNfc_assign(C[0], B[0], A[0]); _suNfc_times_suNfc_assign(C[0], B[1], A[2]); @@ -87,7 +87,7 @@ visible static __forceinline__ void _su2Nfc_times_su2Nfc_assign_herm(suNfc *C, s } //trace of B*A -visible static __forceinline__ void _su2Nfc_times_su2Nfc_trace(hr_complex *trace, suNfc *B, suNfc *A) { +visible void forceinline _su2Nfc_times_su2Nfc_trace(hr_complex *trace, suNfc *B, suNfc *A) { suNfc aux; _suNfc_times_suNfc(aux, B[0], A[0]); @@ -99,7 +99,7 @@ visible static __forceinline__ void _su2Nfc_times_su2Nfc_trace(hr_complex *trace } //trace of the square of a 2NF hermitian matrix -visible static __forceinline__ void _su2Nfc_times_su2Nfc_trace_herm_sq(hr_complex *trace, suNfc *B) { +visible void forceinline _su2Nfc_times_su2Nfc_trace_herm_sq(hr_complex *trace, suNfc *B) { suNfc aux; hr_complex auxtrace; @@ -113,14 +113,14 @@ visible static __forceinline__ void _su2Nfc_times_su2Nfc_trace_herm_sq(hr_comple *trace = *trace + 2 * auxtrace; } -visible static __forceinline__ void _su2Nfc_unit(suNfc *A) { +visible void forceinline _su2Nfc_unit(suNfc *A) { _suNfc_unit(A[0]); _suNfc_unit(A[3]); _suNfc_zero(A[1]); _suNfc_zero(A[2]); } -visible static __forceinline__ void _su2Nfc_trace(hr_complex *p, suNfc *A) { +visible void forceinline _su2Nfc_trace(hr_complex *p, suNfc *A) { hr_complex aux = 0.; _suNfc_trace(aux, A[0]); _suNfc_trace(*p, A[3]); @@ -129,7 +129,7 @@ visible static __forceinline__ void _su2Nfc_trace(hr_complex *p, suNfc *A) { #if (NF == 3) -visible static __forceinline__ void clover_exp_NF3(suNfc *Aplus, suNfc *expAplus, int NN) { +visible void forceinline clover_exp_NF3(suNfc *Aplus, suNfc *expAplus, int NN) { suNfc A0[3], A2[4], A3[4], tmp1[4], tmp2[3]; int i = 0, j = 0; @@ -214,7 +214,7 @@ visible static __forceinline__ void clover_exp_NF3(suNfc *Aplus, suNfc *expAplus #if (NF == 2) -visible static __forceinline__ void clover_exp_NF2(suNfc *Aplus, suNfc *expAplus, int lNN) { +visible void forceinline clover_exp_NF2(suNfc *Aplus, suNfc *expAplus, int lNN) { suNfc A0[3], A2[4], tmp1[4]; int i = 0, j = 0; @@ -265,7 +265,7 @@ visible static __forceinline__ void clover_exp_NF2(suNfc *Aplus, suNfc *expAplus #endif -visible static __forceinline__ void clover_exp_taylor(suNfc *Xin, suNfc *u) { +visible void forceinline clover_exp_taylor(suNfc *Xin, suNfc *u) { suNfc Xk[4], tmp[4]; _su2Nfc_unit(u); _su2Nfc_unit(Xk); @@ -294,7 +294,7 @@ visible static __forceinline__ void clover_exp_taylor(suNfc *Xin, suNfc *u) { } } -visible __forceinline__ void clover_exp(suNfc *Aplus, suNfc *expAplus, int lNN) { +visible void forceinline clover_exp(suNfc *Aplus, suNfc *expAplus, int lNN) { #if (NF == 2) clover_exp_NF2(Aplus, expAplus, lNN); #elif (NF == 3) @@ -306,7 +306,7 @@ visible __forceinline__ void clover_exp(suNfc *Aplus, suNfc *expAplus, int lNN) #if (NF == 3) -visible static __forceinline__ void doublehornerNF3(double *C, suNfc *A, int lNNexp) { +visible void forceinline doublehornerNF3(double *C, suNfc *A, int lNNexp) { suNfc A2[4], A3[4]; hr_complex p[2 * NF - 1]; @@ -390,7 +390,7 @@ visible static __forceinline__ void doublehornerNF3(double *C, suNfc *A, int lNN #if (NF == 2) -visible static __forceinline__ void doublehornerNF2(double *C, suNfc *A, int lNNexp) { +visible void forceinline doublehornerNF2(double *C, suNfc *A, int lNNexp) { suNfc A2[4]; hr_complex p[2 * NF - 1]; _su2Nfc_times_su2Nfc_herm(A2, A, A); @@ -453,7 +453,7 @@ visible static __forceinline__ void doublehornerNF2(double *C, suNfc *A, int lNN } #endif -visible __forceinline__ void doublehorner(double *C, suNfc *A, int lNNexp) { +visible void forceinline doublehorner(double *C, suNfc *A, int lNNexp) { #if (NF == 3) doublehornerNF3(C, A, lNNexp); #elif (NF == 2) @@ -466,7 +466,7 @@ visible __forceinline__ void doublehorner(double *C, suNfc *A, int lNNexp) { #endif } -visible __forceinline__ void factorialCoef(double *C, int lNNexp) { +visible void forceinline factorialCoef(double *C, int lNNexp) { int i, j; for (j = 0; j < lNNexp; j++) { diff --git a/Include/Utils/factorial.h b/Include/Utils/factorial.h index ab9509250..411177d1e 100644 --- a/Include/Utils/factorial.h +++ b/Include/Utils/factorial.h @@ -18,7 +18,7 @@ extern "C" { #error "MAX_FACTORIAL cannot be larger than 100. There is probably no reason, to calculate inverse factorials this high." #endif -visible __forceinline__ double inverse_fact(int i) { +visible double forceinline inverse_fact(int i) { static const double inv_fact[] = { 1.0, 1.0, 0.5, diff --git a/LibHR/Utils/clover_exp.c b/LibHR/Utils/clover_exp.c index 1307b51bb..6c5ddf055 100644 --- a/LibHR/Utils/clover_exp.c +++ b/LibHR/Utils/clover_exp.c @@ -48,463 +48,5 @@ void evaluate_sw_order(double *mass) { error(0 == 0, 1, "set_sw_order" __FILE__, "SW parameters are out of range"); } } -<<<<<<< HEAD -//C = B*A when C hermitian! -visible void _su2Nfc_times_su2Nfc_herm(suNfc *C, suNfc *B, suNfc *A) { - // new zero component - _suNfc_times_suNfc(C[0], B[0], A[0]); - _suNfc_times_suNfc_assign(C[0], B[1], A[2]); - - // new one component - _suNfc_times_suNfc(C[1], B[0], A[1]); - _suNfc_times_suNfc_assign(C[1], B[1], A[3]); - - _suNfc_dagger(C[2], C[1]); - - // new three component - _suNfc_times_suNfc(C[3], B[2], A[1]); - _suNfc_times_suNfc_assign(C[3], B[3], A[3]); -} - -//C = B*A -visible void _su2Nfc_times_su2Nfc(suNfc *C, suNfc *B, suNfc *A) { - // new zero component - _suNfc_times_suNfc(C[0], B[0], A[0]); - _suNfc_times_suNfc_assign(C[0], B[1], A[2]); - - // new one component - _suNfc_times_suNfc(C[1], B[0], A[1]); - _suNfc_times_suNfc_assign(C[1], B[1], A[3]); - - // new two component - _suNfc_times_suNfc(C[2], B[2], A[0]); - _suNfc_times_suNfc_assign(C[2], B[3], A[2]); - - // new three component - _suNfc_times_suNfc(C[3], B[2], A[1]); - _suNfc_times_suNfc_assign(C[3], B[3], A[3]); -} - -// C += A*B -visible void _su2Nfc_times_su2Nfc_assign(suNfc *C, suNfc *B, suNfc *A) { - // new zero component - _suNfc_times_suNfc_assign(C[0], B[0], A[0]); - _suNfc_times_suNfc_assign(C[0], B[1], A[2]); - - // new one component - _suNfc_times_suNfc_assign(C[1], B[0], A[1]); - _suNfc_times_suNfc_assign(C[1], B[1], A[3]); - - // new two component - _suNfc_times_suNfc_assign(C[2], B[2], A[0]); - _suNfc_times_suNfc_assign(C[2], B[3], A[2]); - - // new three component - _suNfc_times_suNfc_assign(C[3], B[2], A[1]); - _suNfc_times_suNfc_assign(C[3], B[3], A[3]); -} - -//C += B*A when C hermitian! -visible void _su2Nfc_times_su2Nfc_assign_herm(suNfc *C, suNfc *B, suNfc *A) { - // new zero component - _suNfc_times_suNfc_assign(C[0], B[0], A[0]); - _suNfc_times_suNfc_assign(C[0], B[1], A[2]); - - // new one component - _suNfc_times_suNfc_assign(C[1], B[0], A[1]); - _suNfc_times_suNfc_assign(C[1], B[1], A[3]); - - _suNfc_dagger(C[2], C[1]); - - // new three component - _suNfc_times_suNfc_assign(C[3], B[2], A[1]); - _suNfc_times_suNfc_assign(C[3], B[3], A[3]); -} - -//trace of B*A -visible void _su2Nfc_times_su2Nfc_trace(hr_complex *trace, suNfc *B, suNfc *A) { - suNfc aux; - - _suNfc_times_suNfc(aux, B[0], A[0]); - _suNfc_times_suNfc_assign(aux, B[1], A[2]); - _suNfc_times_suNfc_assign(aux, B[2], A[1]); - _suNfc_times_suNfc_assign(aux, B[3], A[3]); - - _suNfc_trace(*trace, aux); -} - -//trace of the square of a 2NF hermitian matrix -visible void _su2Nfc_times_su2Nfc_trace_herm_sq(hr_complex *trace, suNfc *B) { - suNfc aux; - hr_complex auxtrace; - - _suNfc_times_suNfc(aux, B[1], B[2]); - _suNfc_trace(auxtrace, aux); - _suNfc_times_suNfc(aux, B[0], B[0]); - _suNfc_times_suNfc_assign(aux, B[3], B[3]); - - _suNfc_trace(*trace, aux); - - *trace = *trace + 2 * auxtrace; -} - -visible void _su2Nfc_unit(suNfc *A) { - _suNfc_unit(A[0]); - _suNfc_unit(A[3]); - _suNfc_zero(A[1]); - _suNfc_zero(A[2]); -} - -visible void _su2Nfc_trace(hr_complex *p, suNfc *A) { - hr_complex aux = 0.; - _suNfc_trace(aux, A[0]); - _suNfc_trace(*p, A[3]); - *p = *p + aux; -} - -#if (NF == 3) - -visible static void clover_exp_NF3(suNfc *Aplus, suNfc *expAplus, int NN) { - suNfc A0[3], A2[4], A3[4], tmp1[4], tmp2[3]; - - int i = 0, j = 0; - hr_complex p[2 * NF - 1]; - _su2Nfc_times_su2Nfc_herm(A2, Aplus, Aplus); - _su2Nfc_times_su2Nfc_herm(A3, A2, Aplus); - _suNfc_unit(A0[0]); - _suNfc_unit(A0[2]); - _suNfc_zero(A0[1]); - - _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A3); - _su2Nfc_times_su2Nfc_trace(&p[1], A3, A2); - _su2Nfc_times_su2Nfc_trace_herm_sq(&p[2], A2); - _su2Nfc_trace(&p[3], A3); - _su2Nfc_trace(&p[4], A2); - - /* - p[0] = -p[0]/6 + p[4]*p[2]/8 + p[3]*p[3]/18 -p[4]*p[4]*p[4]/48; - p[1] = -p[1]/5 + p[4]*p[3]/6; - p[2] = -p[2]/4 + p[4]*p[4]/8; - p[3] = -p[3]/3; - p[4] = -p[4]/2; - */ - - p[4] = -p[4] / 2; - p[3] = -p[3] / 3; - p[2] = -p[2] / 4; - - p[0] = p[3] * p[3] / 2 + p[4] * p[2] + (p[4] * p[4] * p[4] - p[0]) / 6; - p[1] = -p[1] / 5 + p[4] * p[3]; - - p[2] += p[4] * p[4] / 2; - - double q[2 * NF]; - for (i = 0; i < 2 * NF; i++) { - q[i] = 0.; - } - - double qlast; - q[0] = inverse_fact(NN); - - for (i = NN - 1; i >= 0; i--) { - qlast = q[2 * NF - 1]; - q[2 * NF - 1] = q[2 * NF - 2]; - for (j = 2 * NF - 2; j > 0; j--) { - q[j] = q[j - 1] - creal(p[j]) * qlast; - } - q[0] = inverse_fact(i) - creal(p[0]) * qlast; - } - - //Optimized to reduce operations! - - _suNfc_mul_add(expAplus[0], q[0], A0[0], q[1], Aplus[0]); - _suNfc_mul(tmp1[0], q[2], A2[0]); - _suNfc_add_assign(expAplus[0], tmp1[0]); - - _suNfc_mul_add(tmp1[0], q[3], A0[0], q[4], Aplus[0]); - _suNfc_mul(tmp2[0], q[5], A2[0]); - _suNfc_add_assign(tmp1[0], tmp2[0]); - - _suNfc_mul_add(expAplus[1], q[0], A0[1], q[1], Aplus[1]); - _suNfc_mul(tmp1[1], q[2], A2[1]); - _suNfc_add_assign(expAplus[1], tmp1[1]); - - _suNfc_mul_add(tmp1[1], q[3], A0[1], q[4], Aplus[1]); - _suNfc_mul(tmp2[1], q[5], A2[1]); - _suNfc_add_assign(tmp1[1], tmp2[1]); - - _suNfc_dagger(tmp1[2], tmp1[1]); - - _suNfc_mul_add(expAplus[3], q[0], A0[2], q[1], Aplus[3]); - _suNfc_mul(tmp1[3], q[2], A2[3]); - _suNfc_add_assign(expAplus[3], tmp1[3]); - - _suNfc_mul_add(tmp1[3], q[3], A0[2], q[4], Aplus[3]); - _suNfc_mul(tmp2[2], q[5], A2[3]); - _suNfc_add_assign(tmp1[3], tmp2[2]); - - _su2Nfc_times_su2Nfc_assign_herm(expAplus, A3, tmp1); -} -#endif - -#if (NF == 2) - -visible static void clover_exp_NF2(suNfc *Aplus, suNfc *expAplus, int lNN) { - suNfc A0[3], A2[4], tmp1[4]; - - int i = 0, j = 0; - hr_complex p[2 * NF - 1]; - _su2Nfc_times_su2Nfc_herm(A2, Aplus, Aplus); - _suNfc_unit(A0[0]); - _suNfc_unit(A0[2]); - _suNfc_zero(A0[1]); - - _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A2); - _su2Nfc_times_su2Nfc_trace(&p[1], Aplus, A2); - _su2Nfc_trace(&p[2], A2); - - p[0] = -p[0] / 4 + p[2] * p[2] / 8; - p[1] = -p[1] / 3; - p[2] = -p[2] / 2; - - double q[2 * NF]; - for (i = 0; i < 2 * NF; i++) { - q[i] = 0.; - } - double qlast; - q[0] = inverse_fact(lNN); - for (i = lNN - 1; i >= 0; i--) { - qlast = q[2 * NF - 1]; - q[2 * NF - 1] = q[2 * NF - 2]; - for (j = 2 * NF - 2; j > 0; j--) { - q[j] = q[j - 1] - creal(p[j]) * qlast; - } - q[0] = inverse_fact(i) - creal(p[0]) * qlast; - } - - //Optimized to reduce operations! - - _suNfc_mul_add(expAplus[0], q[0], A0[0], q[1], Aplus[0]); - _suNfc_mul_add(tmp1[0], q[2], A0[0], q[3], Aplus[0]); - - _suNfc_mul_add(expAplus[1], q[0], A0[1], q[1], Aplus[1]); - _suNfc_mul_add(tmp1[1], q[2], A0[1], q[3], Aplus[1]); - - _suNfc_dagger(tmp1[2], tmp1[1]); - - _suNfc_mul_add(expAplus[3], q[0], A0[2], q[1], Aplus[3]); - _suNfc_mul_add(tmp1[3], q[2], A0[2], q[3], Aplus[3]); - - _su2Nfc_times_su2Nfc_assign_herm(expAplus, A2, tmp1); -} - -#endif - -visible void clover_exp_taylor(suNfc *Xin, suNfc *u) { - suNfc Xk[4], tmp[4]; - _su2Nfc_unit(u); - _su2Nfc_unit(Xk); - - int k = 1; - int i = 0; - double error = 0., erroraux; - while (1) { - _su2Nfc_times_su2Nfc(tmp, Xk, Xin); - - for (i = 0; i < 4; i++) { - _suNfc_mul(Xk[i], 1. / k, tmp[i]); - _suNfc_add_assign(u[i], Xk[i]); - } - - k++; - - error = 0.; - - for (i = 0; i < 4; i++) { - _suNfc_sqnorm(erroraux, Xk[i]); - error += erroraux; - } - - if (sqrt(error) < 1e-28) { break; } - } -} - -visible void clover_exp(suNfc *Aplus, suNfc *expAplus, int lNN) { -#if (NF == 2) - clover_exp_NF2(Aplus, expAplus, lNN); -#elif (NF == 3) - clover_exp_NF3(Aplus, expAplus, lNN); -#else - clover_exp_taylor(Aplus, expAplus); -#endif -} - -#if (NF == 3) - -visible static void doublehornerNF3(double *C, suNfc *A, int lNNexp) { - suNfc A2[4], A3[4]; - hr_complex p[2 * NF - 1]; - - _su2Nfc_times_su2Nfc_herm(A2, A, A); - _su2Nfc_times_su2Nfc_herm(A3, A2, A); - - _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A3); - _su2Nfc_times_su2Nfc_trace(&p[1], A3, A2); - _su2Nfc_times_su2Nfc_trace_herm_sq(&p[2], A2); - _su2Nfc_trace(&p[3], A3); - _su2Nfc_trace(&p[4], A2); - - _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A3); - _su2Nfc_times_su2Nfc_trace(&p[1], A3, A2); - _su2Nfc_times_su2Nfc_trace_herm_sq(&p[2], A2); - _su2Nfc_trace(&p[3], A3); - _su2Nfc_trace(&p[4], A2); - - p[0] = -p[0] / 6 + p[4] * p[2] / 8 + p[3] * p[3] / 18 - p[4] * p[4] * p[4] / 48; - p[1] = -p[1] / 5 + p[4] * p[3] / 6; - p[2] = -p[2] / 4 + p[4] * p[4] / 8; - p[3] = -p[3] / 3; - p[4] = -p[4] / 2; - - int i, j, k; - double q[2 * NF], qlast; - double q2[MAX_FACTORIAL + 1][2 * NF]; - - for (j = 0; j <= lNNexp; j++) { - q[0] = inverse_fact(lNNexp + 2); - for (k = 1; k < 2 * NF; k++) { - q[k] = 0.; - } - - for (i = lNNexp - j; i > -1; i--) { - qlast = q[2 * NF - 1]; - q[2 * NF - 1] = q[2 * NF - 2]; - for (k = 2 * NF - 2; k > 0; k--) { - q[k] = q[k - 1] - creal(p[k]) * qlast; - } - q[0] = -creal(p[0]) * qlast + inverse_fact(i + j + 1); - } - - for (i = 0; i < 2 * NF; i++) { - q2[j][i] = q[i]; - } - } - - for (i = 0; i < 2 * NF; i++) { - q[0] = q2[lNNexp][i]; - for (k = 1; k < 2 * NF; k++) { - q[k] = 0.; - } - - for (j = lNNexp - 1; j > -1; j--) { - qlast = q[2 * NF - 1]; - q[2 * NF - 1] = q[2 * NF - 2]; - for (k = 2 * NF - 2; k > 0; k--) { - q[k] = q[k - 1] - creal(p[k]) * qlast; - } - q[0] = -creal(p[0]) * qlast + q2[j][i]; - } - - for (j = 0; j < 2 * NF; j++) { - // INDX = 2*NF*j + i - C[2 * NF * j + i] = q[j]; - } - } -} - -#endif - -#if (NF == 2) - -visible static void doublehornerNF2(double *C, suNfc *A, int lNNexp) { - suNfc A2[4]; - hr_complex p[2 * NF - 1]; - _su2Nfc_times_su2Nfc_herm(A2, A, A); - - _su2Nfc_times_su2Nfc_trace_herm_sq(&p[0], A2); - _su2Nfc_times_su2Nfc_trace(&p[1], A, A2); - _su2Nfc_trace(&p[2], A2); - - p[0] = -p[0] / 4 + p[2] * p[2] / 8; - p[1] = -p[1] / 3; - p[2] = -p[2] / 2; - - int i, j, k; - double q[2 * NF], qlast; - double q2[MAX_FACTORIAL + 1][2 * NF]; - - for (j = 0; j <= lNNexp; j++) { - q[0] = inverse_fact(lNNexp + 2); - for (k = 1; k < 2 * NF; k++) { - q[k] = 0.; - } - - for (i = lNNexp - j; i > -1; i--) { - qlast = q[2 * NF - 1]; - q[2 * NF - 1] = q[2 * NF - 2]; - for (k = 2 * NF - 2; k > 0; k--) { - q[k] = q[k - 1] - creal(p[k]) * qlast; - } - q[0] = -creal(p[0]) * qlast + inverse_fact(i + j + 1); - } - - for (i = 0; i < 2 * NF; i++) { - q2[j][i] = q[i]; - } - } - - for (i = 0; i < 2 * NF; i++) { - q[0] = q2[lNNexp][i]; - for (k = 1; k < 2 * NF; k++) { - q[k] = 0.; - } - - for (j = lNNexp - 1; j > -1; j--) { - qlast = q[2 * NF - 1]; - q[2 * NF - 1] = q[2 * NF - 2]; - for (k = 2 * NF - 2; k > 0; k--) { - q[k] = q[k - 1] - creal(p[k]) * qlast; - } - q[0] = -creal(p[0]) * qlast + q2[j][i]; - } - - for (j = 0; j < 2 * NF; j++) { - // INDX = 2*NF*j + i - C[2 * NF * j + i] = q[j]; - } - } -} -#endif - -visible void doublehorner(double *C, suNfc *A, int lNNexp) { -#if (NF == 3) - doublehornerNF3(C, A, lNNexp); -#elif (NF == 2) - doublehornerNF2(C, A, lNNexp); -#else - // TODO: this does not work because error is not a host - // device function. There is now a compile time - // check that forbids NF > 3 to compile WITH_EXPCLOVER -#ifndef __CUDA_ARCH__ - error(0, 1, "doublehorner " __FILE__, "Force only implemented for NF=2 and NF=3"); -#endif -#endif -} - -visible void factorialCoef(double *C, int lNNexp) { - int i, j; - - for (j = 0; j < lNNexp; j++) { - for (i = 0; i < lNNexp; i++) { - if (i + j <= lNNexp) { - C[(lNNexp)*i + j] = inverse_fact(i + j + 1); - } else { - C[(lNNexp)*i + j] = 0.; - } - } - } -} -======= ->>>>>>> 6bb9b21d (Force inline functions for exponential clover calculations) #endif From a11e10ee7e332d9475bebdf260f55d38665773dd Mon Sep 17 00:00:00 2001 From: Sofie Martins Date: Sun, 10 Nov 2024 15:38:59 +0100 Subject: [PATCH 4/4] formatting --- Include/Core/hr_complex.h | 4 ++-- LibHR/Update/Dphi_gpu_kernels.hpp | 2 +- LibHR/Update/force_fermion_core_gpu.cu | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Include/Core/hr_complex.h b/Include/Core/hr_complex.h index c60c16d3b..816de5034 100644 --- a/Include/Core/hr_complex.h +++ b/Include/Core/hr_complex.h @@ -32,7 +32,7 @@ #define _complex_add(a, b, c) (a) = (b) + (c) #define _complex_sub(a, b, c) (a) = (b) - (c) #define _complex_add_star(a, b, c) (a) = (b) + conj(c) -#define _complex_sub_star(a, b, c) (a) = (b) - conj(c) +#define _complex_sub_star(a, b, c) (a) = (b)-conj(c) #define _complex_div(a, b, c) (a) = (b) / (c) #define _complex_inv(a, b) (a) = 1 / (b) #define _complex_prod(a, b) conj(a) * b @@ -45,7 +45,7 @@ #define _complex_i_minus(a, b) (a) = -I * (b) #define _complex_i_plus(a, b) (a) = I * (b) #define _complex_i_add(a, b, c) (a) = (b) + I * (c) -#define _complex_i_sub(a, b, c) (a) = (b) - I * (c) +#define _complex_i_sub(a, b, c) (a) = (b)-I * (c) #define _complex_add_assign(a, b) (a) += (b) #define _complex_sub_assign(a, b) (a) -= (b) #define _complex_add_star_assign(a, b, c) (a) += (b) + conj(c) diff --git a/LibHR/Update/Dphi_gpu_kernels.hpp b/LibHR/Update/Dphi_gpu_kernels.hpp index b8df8c76e..e060cbe14 100644 --- a/LibHR/Update/Dphi_gpu_kernels.hpp +++ b/LibHR/Update/Dphi_gpu_kernels.hpp @@ -343,7 +343,7 @@ do { \ const int block_offset = base_in; \ const HSPINOR_TYPE *in_offset = (HSPINOR_TYPE *)((in) + block_offset); \ - const int iy_loc = (iy) - block_offset; \ + const int iy_loc = (iy)-block_offset; \ read_gpu(0, &(sn), in_offset, iy_loc, 0, 1); \ } while (0) diff --git a/LibHR/Update/force_fermion_core_gpu.cu b/LibHR/Update/force_fermion_core_gpu.cu index 5883fede5..79f38ddac 100644 --- a/LibHR/Update/force_fermion_core_gpu.cu +++ b/LibHR/Update/force_fermion_core_gpu.cu @@ -173,7 +173,8 @@ __device__ static __forceinline__ void g5_sigma(suNf_spinor *s, suNf_spinor *u, } } -__device__ static __forceinline__ suNf fmat_create(suNf_spinor *a_lhs, suNf_spinor *a_rhs, suNf_spinor *b_lhs, suNf_spinor *b_rhs) { +__device__ static __forceinline__ suNf fmat_create(suNf_spinor *a_lhs, suNf_spinor *a_rhs, suNf_spinor *b_lhs, + suNf_spinor *b_rhs) { suNf fmat; _suNf_zero(fmat); for (int i = 0; i < NF; i++) {